Allow skipping scheduling for URLs with forbidden extensions

Closes: #42
pull/46/head
Aloïs Micard 4 years ago
parent 23cc872054
commit e1c0320a7b
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -27,7 +27,7 @@ services:
- torproxy
scheduler:
image: creekorful/tdsh-scheduler:latest
command: --log-level debug --nats-uri nats --api-uri http://api:8080 --api-login scheduler:ZjDXeaLGj4EEUGu6
command: --log-level debug --nats-uri nats --api-uri http://api:8080 --api-login scheduler:ZjDXeaLGj4EEUGu6 --forbidden-extensions jpg --forbidden-extensions png
restart: always
depends_on:
- nats

@ -30,6 +30,10 @@ func GetApp() *cli.App {
Name: "refresh-delay",
Usage: "Duration before allowing crawl of existing resource (none = never)",
},
&cli.StringSliceFlag{
Name: "forbidden-extensions",
Usage: "Extensions to disable scheduling for (i.e png, exe, css, ...) (the dot will be added automatically)",
},
},
Action: execute,
}
@ -65,14 +69,15 @@ func execute(ctx *cli.Context) error {
log.Info().Msg("Successfully initialized tdsh-scheduler. Waiting for URLs")
if err := sub.QueueSubscribe(messaging.URLFoundSubject, "schedulers", handleMessage(apiClient, refreshDelay)); err != nil {
callback := handleMessage(apiClient, refreshDelay, ctx.StringSlice("forbidden-extensions"))
if err := sub.QueueSubscribe(messaging.URLFoundSubject, "schedulers", callback); err != nil {
return err
}
return nil
}
func handleMessage(apiClient api.Client, refreshDelay time.Duration) messaging.MsgHandler {
func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenExtensions []string) messaging.MsgHandler {
return func(sub messaging.Subscriber, msg *nats.Msg) error {
var urlMsg messaging.URLFoundMsg
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
@ -93,6 +98,17 @@ func handleMessage(apiClient api.Client, refreshDelay time.Duration) messaging.M
return fmt.Errorf("%s is not a valid .onion", u.Host)
}
// Make sure extension is not forbidden
for _, ext := range forbiddenExtensions {
if strings.HasSuffix(u.Path, "."+ext) {
log.Trace().
Stringer("url", u).
Str("ext", ext).
Msg("Skipping URL with forbidden extension")
return fmt.Errorf("%s contains invalid extension .%s", u, ext)
}
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources
// that are newer than `now - refreshDelay`.
endDate := time.Time{}

@ -42,12 +42,12 @@ func TestHandleMessageNotOnion(t *testing.T) {
SetArg(1, messaging.URLFoundMsg{URL: "https://example.org"}).
Return(nil)
if err := handleMessage(apiClientMock, -1)(subscriberMock, &msg); err == nil {
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, &msg); err == nil {
t.FailNow()
}
}
func TestHandleMessageNoSchedule(t *testing.T) {
func TestHandleMessageAlreadyCrawled(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
@ -64,7 +64,25 @@ func TestHandleMessageNoSchedule(t *testing.T) {
SearchResources("https://example.onion", "", time.Time{}, time.Time{}, 1, 1).
Return([]api.ResourceDto{}, int64(1), nil)
if err := handleMessage(apiClientMock, -1)(subscriberMock, &msg); err != nil {
if err := handleMessage(apiClientMock, -1, []string{"png"})(subscriberMock, &msg); err != nil {
t.FailNow()
}
}
// TestHandleMessageForbiddenExtensions checks that the message handler
// returns an error for a URL whose path ends with a forbidden extension.
//
// The subscriber mock delivers a URLFoundMsg for
// "https://example.onion/image.png" while "png" is passed as the only
// forbidden extension. No expectations are registered on the API client mock,
// and the only expectation on the subscriber mock is the ReadMsg call.
func TestHandleMessageForbiddenExtensions(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	apiClientMock := api_mock.NewMockClient(mockCtrl)
	subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)

	msg := nats.Msg{}
	subscriberMock.EXPECT().
		ReadMsg(&msg, &messaging.URLFoundMsg{}).
		SetArg(1, messaging.URLFoundMsg{URL: "https://example.onion/image.png"}).
		Return(nil)

	// A nil error means the forbidden-extension check did not reject the URL;
	// fail with a message so the cause is visible in the test output.
	if err := handleMessage(apiClientMock, -1, []string{"png"})(subscriberMock, &msg); err == nil {
		t.Fatal("expected an error for a URL with a forbidden extension, got nil")
	}
}
@ -90,7 +108,7 @@ func TestHandleMessage(t *testing.T) {
PublishMsg(&messaging.URLTodoMsg{URL: "https://example.onion"}).
Return(nil)
if err := handleMessage(apiClientMock, -1)(subscriberMock, &msg); err != nil {
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, &msg); err != nil {
t.FailNow()
}
}

Loading…
Cancel
Save