Allow adding forbidden hostnames

pull/82/head
Aloïs Micard 3 years ago
parent 56c5fc8c9c
commit 1cb038727e
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -44,6 +44,8 @@ services:
--forbidden-extensions jpg
--forbidden-extensions jpeg
--forbidden-extensions bmp
--forbidden-extensions css
--forbidden-hostnames facebookcorewwwi.onion
restart: always
depends_on:
- rabbitmq

@ -24,6 +24,7 @@ var (
errProtocolNotAllowed = errors.New("protocol is not allowed")
errExtensionNotAllowed = errors.New("extension is not allowed")
errShouldNotSchedule = errors.New("should not be scheduled")
errHostnameNotAllowed = errors.New("hostname is not allowed")
)
// GetApp returns the scheduler app
@ -45,6 +46,10 @@ func GetApp() *cli.App {
Name: "forbidden-extensions",
Usage: "Extensions to disable scheduling for (i.e png, exe, css, ...) (the dot will be added automatically)",
},
&cli.StringSliceFlag{
Name: "forbidden-hostnames",
Usage: "Hostnames to disable scheduling for",
},
},
Action: execute,
}
@ -60,6 +65,7 @@ func execute(ctx *cli.Context) error {
Str("hub-uri", ctx.String("hub-uri")).
Str("api-uri", ctx.String("api-uri")).
Strs("forbidden-exts", ctx.StringSlice("forbidden-extensions")).
Strs("forbidden-hostnames", ctx.StringSlice("forbidden-hostnames")).
Dur("refresh-delay", refreshDelay).
Msg("Starting tdsh-scheduler")
@ -77,6 +83,7 @@ func execute(ctx *cli.Context) error {
apiClient: apiClient,
refreshDelay: refreshDelay,
forbiddenExtensions: ctx.StringSlice("forbidden-extensions"),
forbiddenHostnames: ctx.StringSlice("forbidden-hostnames"),
}
if err := sub.SubscribeAsync(event.FoundURLExchange, "schedulingQueue", state.handleURLFoundEvent); err != nil {
@ -103,6 +110,7 @@ type state struct {
apiClient api.API
refreshDelay time.Duration
forbiddenExtensions []string
forbiddenHostnames []string
}
func (state *state) handleURLFoundEvent(subscriber event.Subscriber, body io.Reader) error {
@ -123,7 +131,7 @@ func (state *state) handleURLFoundEvent(subscriber event.Subscriber, body io.Rea
return fmt.Errorf("%s %w", u.Host, errNotOnionHostname)
}
// Make sure protocol is allowed
// Make sure protocol is not forbidden
if !strings.HasPrefix(u.Scheme, "http") {
return fmt.Errorf("%s %w", u, errProtocolNotAllowed)
}
@ -135,6 +143,13 @@ func (state *state) handleURLFoundEvent(subscriber event.Subscriber, body io.Rea
}
}
// Make sure hostname is not forbidden
for _, hostname := range state.forbiddenHostnames {
if u.Hostname() == hostname {
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources
// that are newer than `now - refreshDelay`.
endDate := time.Time{}

@ -120,6 +120,53 @@ func TestHandleMessageForbiddenExtensions(t *testing.T) {
}
}
// TestHandleMessageHostnameForbidden checks that handleURLFoundEvent refuses
// to schedule a URL whose hostname appears in state.forbiddenHostnames, and
// that the returned error wraps errHostnameNotAllowed.
//
// No expectations are registered on the API mock; only the subscriber's Read
// call is expected for each case.
func TestHandleMessageHostnameForbidden(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	apiClientMock := api_mock.NewMockAPI(mockCtrl)
	subscriberMock := event_mock.NewMockSubscriber(mockCtrl)

	tests := []struct {
		url                string
		forbiddenHostnames []string
	}{
		{
			// Path and query string must not affect the hostname match.
			url:                "https://facebookcorewwwi.onion/image.png?id=12&test=2",
			forbiddenHostnames: []string{"facebookcorewwwi.onion"},
		},
		{
			// Explicit port: the forbidden list holds the bare hostname only.
			url:                "https://google.onion:9099",
			forbiddenHostnames: []string{"google.onion"},
		},
		{
			// Explicit port combined with a path and a query string.
			url:                "http://facebook.onion:443/news/test.php?id=12&username=test",
			forbiddenHostnames: []string{"facebook.onion"},
		},
	}

	for _, tc := range tests {
		msg := bytes.NewReader(nil)

		// The handler reads the event from the subscriber; feed it the URL under test.
		subscriberMock.EXPECT().
			Read(msg, &event.FoundURLEvent{}).
			SetArg(1, event.FoundURLEvent{URL: tc.url}).
			Return(nil)

		s := state{
			apiClient:           apiClientMock,
			refreshDelay:        -1,
			forbiddenExtensions: []string{},
			forbiddenHostnames:  tc.forbiddenHostnames,
		}

		// Report the failing case instead of aborting silently, so that a
		// regression points directly at the offending URL and error.
		if err := s.handleURLFoundEvent(subscriberMock, msg); !errors.Is(err, errHostnameNotAllowed) {
			t.Errorf("handleURLFoundEvent(%q) with forbidden hostnames %q: got error %v, want one wrapping errHostnameNotAllowed",
				tc.url, tc.forbiddenHostnames, err)
		}
	}
}
func TestHandleMessage(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()

Loading…
Cancel
Save