[#12] Allow duplicate resource crawling

pull/24/head
Aloïs Micard 4 years ago
parent e61dc42d3c
commit 62b54bf385
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -21,7 +21,7 @@ type ResourceDto struct {
// Client is the interface to interact with the API process
type Client interface {
SearchResources(url, keyword string) ([]ResourceDto, error)
SearchResources(url, keyword string, startDate, endDate time.Time) ([]ResourceDto, error)
AddResource(res ResourceDto) (ResourceDto, error)
ScheduleURL(url string) error
}
@ -31,7 +31,7 @@ type client struct {
baseURL string
}
func (c *client) SearchResources(url, keyword string) ([]ResourceDto, error) {
func (c *client) SearchResources(url, keyword string, startDate, endDate time.Time) ([]ResourceDto, error) {
targetEndpoint := fmt.Sprintf("%s/v1/resources?", c.baseURL)
if url != "" {
@ -42,6 +42,14 @@ func (c *client) SearchResources(url, keyword string) ([]ResourceDto, error) {
targetEndpoint += fmt.Sprintf("keyword=%s&", keyword)
}
// Date bounds are optional: a zero time means "no bound" and is not sent.
// The values are formatted in UTC so the RFC 3339 offset is rendered as "Z".
// A local offset such as "+02:00" contains a '+', which is decoded as a space
// when it appears unescaped in a query string and makes the value unparseable
// on the receiving side.
if !startDate.IsZero() {
	targetEndpoint += fmt.Sprintf("start-date=%s&", startDate.UTC().Format(time.RFC3339))
}
if !endDate.IsZero() {
	targetEndpoint += fmt.Sprintf("end-date=%s&", endDate.UTC().Format(time.RFC3339))
}
var resources []ResourceDto
_, err := jsonGet(c.httpClient, targetEndpoint, &resources)
return resources, err

@ -14,5 +14,6 @@ require (
github.com/rs/zerolog v1.20.0
github.com/urfave/cli/v2 v2.2.0
github.com/valyala/fasthttp v1.9.0
github.com/xhit/go-str2duration/v2 v2.0.0
mvdan.cc/xurls/v2 v2.1.0
)

@ -99,6 +99,9 @@ github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPU
github.com/valyala/fasttemplate v1.1.0 h1:RZqt0yGBsps8NGvLSGW804QQqCUYYLsaOjTVHy1Ocw4=
github.com/valyala/fasttemplate v1.1.0/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
github.com/xhit/go-str2duration v1.2.0 h1:BcV5u025cITWxEQKGWr1URRzrcXtu7uk8+luz3Yuhwc=
github.com/xhit/go-str2duration/v2 v2.0.0 h1:uFtk6FWB375bP7ewQl+/1wBcn840GPhnySOdcz/okPE=
github.com/xhit/go-str2duration/v2 v2.0.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=

@ -122,6 +122,22 @@ func searchResources(es *elastic.Client) echo.HandlerFunc {
withBody = true
}
// Optional lower bound on the resource time, read from the "start-date"
// query parameter in RFC 3339 format. The parsed value is used only when
// parsing succeeds; a missing or malformed value leaves the bound at its
// zero value.
startDate := time.Time{}
if val := c.QueryParam("start-date"); val != "" {
	if d, err := time.Parse(time.RFC3339, val); err == nil {
		startDate = d
	}
}

// Optional upper bound on the resource time, read from the "end-date"
// query parameter with the same format and fallback as the lower bound.
endDate := time.Time{}
if val := c.QueryParam("end-date"); val != "" {
	if d, err := time.Parse(time.RFC3339, val); err == nil {
		endDate = d
	}
}
// First of all base64decode the URL
b64URL := c.QueryParam("url")
b, err := base64.URLEncoding.DecodeString(b64URL)
@ -135,7 +151,7 @@ func searchResources(es *elastic.Client) echo.HandlerFunc {
from := (p.page - 1) * p.size
// Build up search query
query := buildSearchQuery(string(b), c.QueryParam("keyword"))
query := buildSearchQuery(string(b), c.QueryParam("keyword"), startDate, endDate)
// Get total count
totalCount, err := es.Count(resourcesIndex).Query(query).Do(context.Background())
@ -212,7 +228,7 @@ func addResource(es *elastic.Client) echo.HandlerFunc {
}
}
func buildSearchQuery(url, keyword string) elastic.Query {
func buildSearchQuery(url, keyword string, startDate, endDate time.Time) elastic.Query {
var queries []elastic.Query
if url != "" {
queries = append(queries, elastic.NewTermQuery("url", url))
@ -220,6 +236,17 @@ func buildSearchQuery(url, keyword string) elastic.Query {
if keyword != "" {
queries = append(queries, elastic.NewTermQuery("body", keyword))
}
if !startDate.IsZero() || !endDate.IsZero() {
timeQuery := elastic.NewRangeQuery("time")
if !startDate.IsZero() {
timeQuery.Gte(startDate)
}
if !endDate.IsZero() {
timeQuery.Lte(endDate)
}
queries = append(queries, timeQuery)
}
// Handle specific case
if len(queries) == 0 {

@ -10,8 +10,10 @@ import (
"github.com/nats-io/nats.go"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"github.com/xhit/go-str2duration/v2"
"net/url"
"strings"
"time"
)
// GetApp return the scheduler app
@ -32,6 +34,10 @@ func GetApp() *cli.App {
Usage: "URI to the API server",
Required: true,
},
&cli.StringFlag{
Name: "refresh-delay",
Usage: "Duration before allowing crawl of existing resource (none = never)",
},
},
Action: execute,
}
@ -58,14 +64,14 @@ func execute(ctx *cli.Context) error {
log.Info().Msg("Successfully initialized tdsh-scheduler. Waiting for URLs")
if err := sub.QueueSubscribe(messaging.URLFoundSubject, "schedulers",
handleMessage(apiClient)); err != nil {
handleMessage(apiClient, parseRefreshDelay(ctx.String("refresh-delay")))); err != nil {
return err
}
return nil
}
func handleMessage(apiClient api.Client) natsutil.MsgHandler {
func handleMessage(apiClient api.Client, refreshDelay time.Duration) natsutil.MsgHandler {
return func(nc *nats.Conn, msg *nats.Msg) error {
var urlMsg messaging.URLFoundMsg
if err := natsutil.ReadJSON(msg, &urlMsg); err != nil {
@ -86,8 +92,18 @@ func handleMessage(apiClient api.Client) natsutil.MsgHandler {
return err
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources
// that are newer than now-refreshDelay.
//
// A refreshDelay of -1 is the sentinel returned by parseRefreshDelay when no usable delay was
// configured; in that case endDate stays at its zero value and no date bound is sent.
//
// NOTE(review): the comment above says "newer than", but the value computed here is passed to
// SearchResources as endDate, which buildSearchQuery applies as an upper bound (Lte) on "time".
// That selects resources at or before now-refreshDelay, i.e. the older ones. Confirm whether a
// lower bound (startDate) was intended instead.
endDate := time.Time{}
if refreshDelay != -1 {
log.Debug().Stringer("delay", refreshDelay).Msg("Existing resources will be crawled again")
endDate = time.Now().Add(-refreshDelay)
} else {
log.Debug().Msg("Existing resources will NOT be crawled again")
}
b64URI := base64.URLEncoding.EncodeToString([]byte(u.String()))
urls, err := apiClient.SearchResources(b64URI, "")
urls, err := apiClient.SearchResources(b64URI, "", time.Time{}, endDate)
if err != nil {
log.Err(err).Msg("Error while searching URL")
return err
@ -106,3 +122,16 @@ func handleMessage(apiClient api.Client) natsutil.MsgHandler {
return nil
}
}
// parseRefreshDelay converts the textual refresh delay into a time.Duration
// using str2duration.ParseDuration.
//
// It returns -1 when delay is empty or cannot be parsed. The parse error is
// not reported to the caller.
func parseRefreshDelay(delay string) time.Duration {
	// Sentinel returned when no usable delay is available.
	const never time.Duration = -1

	if delay != "" {
		if parsed, err := str2duration.ParseDuration(delay); err == nil {
			return parsed
		}
	}
	return never
}

@ -1 +1,24 @@
package scheduler
import (
"testing"
"time"
)
// TestParseRefreshDelay checks that parseRefreshDelay returns -1 for empty or
// unparsable input and the matching duration for second, minute, hour and day
// suffixes.
func TestParseRefreshDelay(t *testing.T) {
	cases := []struct {
		delay string
		want  time.Duration
	}{
		{delay: "", want: -1},
		{delay: "not-a-duration", want: -1},
		{delay: "50s", want: 50 * time.Second},
		{delay: "50m", want: 50 * time.Minute},
		{delay: "50h", want: 50 * time.Hour},
		{delay: "50d", want: 50 * 24 * time.Hour},
	}

	for _, tc := range cases {
		// Report every failing case with its input so a regression is
		// identifiable from the test output alone.
		if got := parseRefreshDelay(tc.delay); got != tc.want {
			t.Errorf("parseRefreshDelay(%q) = %v, want %v", tc.delay, got, tc.want)
		}
	}
}

@ -6,6 +6,7 @@ import (
"github.com/creekorful/trandoshan/internal/util/logging"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"time"
)
// GetApp returns the Trandoshan CLI app
@ -67,7 +68,7 @@ func search(c *cli.Context) error {
keyword := c.Args().First()
apiClient := api.NewClient(c.String("api-uri"))
res, err := apiClient.SearchResources("", keyword)
res, err := apiClient.SearchResources("", keyword, time.Time{}, time.Time{})
if err != nil {
log.Err(err).Str("keyword", keyword).Msg("Unable to search resources")
return err

Loading…
Cancel
Save