Merge pull request #99 from creekorful/replicated-checks

Replicated checks
pull/102/head
Aloïs Micard 3 years ago committed by GitHub
commit 1efc3d5263
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,28 @@
package constraint
import (
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"net/url"
"strings"
)
// CheckHostnameAllowed reports whether the hostname of rawurl is allowed for crawling.
//
// The URL is parsed with url.Parse and its hostname is compared against every
// entry returned by configClient.GetForbiddenHostnames. A URL is rejected when
// its hostname contains a forbidden entry as a substring, so forbidding
// "example.onion" also forbids "sub.example.onion".
//
// The comparison is case-insensitive because hostnames are case-insensitive
// and url.Parse preserves the case found in rawurl. Empty forbidden entries are
// ignored: the empty string is a substring of every hostname and would
// otherwise forbid every URL.
//
// It returns (false, err) when rawurl cannot be parsed or when the forbidden
// hostnames cannot be retrieved.
func CheckHostnameAllowed(configClient configapi.Client, rawurl string) (bool, error) {
	u, err := url.Parse(rawurl)
	if err != nil {
		return false, err
	}

	forbiddenHostnames, err := configClient.GetForbiddenHostnames()
	if err != nil {
		return false, err
	}

	// Normalise the candidate hostname once, outside the loop.
	host := strings.ToLower(u.Hostname())

	for _, forbidden := range forbiddenHostnames {
		pattern := strings.ToLower(forbidden.Hostname)
		if pattern == "" {
			// A blank entry would match everything; treat it as misconfiguration.
			continue
		}
		if strings.Contains(host, pattern) {
			return false, nil
		}
	}

	return true, nil
}

@ -0,0 +1,29 @@
package constraint
import (
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/golang/mock/gomock"
"testing"
)
// TestCheckHostnameAllowed exercises CheckHostnameAllowed with a single
// forbidden hostname ("google.onion"): a URL on that exact host must be
// rejected, while a URL on "google2.onion" must be accepted. Neither call is
// expected to return an error.
func TestCheckHostnameAllowed(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	cfg := client_mock.NewMockClient(ctrl)

	cases := []struct {
		rawurl      string
		wantAllowed bool
	}{
		{rawurl: "https://google.onion", wantAllowed: false},
		{rawurl: "https://google2.onion", wantAllowed: true},
	}

	for _, tc := range cases {
		// Each check triggers exactly one lookup of the forbidden hostnames.
		cfg.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{
			{Hostname: "google.onion"},
		}, nil)

		allowed, err := CheckHostnameAllowed(cfg, tc.rawurl)
		if err != nil || allowed != tc.wantAllowed {
			t.Fail()
		}
	}
}

@ -5,6 +5,7 @@ import (
"fmt"
"github.com/creekorful/trandoshan/internal/clock"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
chttp "github.com/creekorful/trandoshan/internal/crawler/http"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
@ -14,7 +15,6 @@ import (
"github.com/valyala/fasthttp/fasthttpproxy"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
@ -104,20 +104,11 @@ func (state *State) handleNewURLEvent(subscriber event.Subscriber, msg event.Raw
log.Debug().Str("url", evt.URL).Msg("Processing URL")
u, err := url.Parse(evt.URL)
if err != nil {
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
}
forbiddenHostnames, err := state.configClient.GetForbiddenHostnames()
if err != nil {
return err
}
for _, hostname := range forbiddenHostnames {
if strings.Contains(u.Hostname(), hostname.Hostname) {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
r, err := state.httpClient.Get(evt.URL)

@ -5,6 +5,8 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/api"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/rs/zerolog/log"
@ -14,9 +16,12 @@ import (
"strings"
)
var errHostnameNotAllowed = fmt.Errorf("hostname is not allowed")
// State represents the application state
type State struct {
apiClient api.API
apiClient api.API
configClient configapi.Client
}
// Name returns the process name
@ -26,7 +31,7 @@ func (state *State) Name() string {
// CommonFlags returns the process common flags
func (state *State) CommonFlags() []string {
return []string{process.HubURIFlag, process.APIURIFlag, process.APITokenFlag}
return []string{process.HubURIFlag, process.APIURIFlag, process.APITokenFlag, process.ConfigAPIURIFlag}
}
// CustomFlags returns the process custom flags
@ -42,6 +47,12 @@ func (state *State) Initialize(provider process.Provider) error {
}
state.apiClient = apiClient
configClient, err := provider.ConfigClient([]string{configapi.ForbiddenHostnamesKey})
if err != nil {
return err
}
state.configClient = configClient
return nil
}
@ -65,6 +76,13 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
log.Debug().Str("url", evt.URL).Msg("Processing new resource")
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
// Extract & process resource
resDto, urls, err := extractResource(evt)
if err != nil {

@ -3,6 +3,8 @@ package extractor
import (
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/api_mock"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
@ -92,6 +94,7 @@ This is sparta (hosted on https://example.org)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
configClientMock := client_mock.NewMockClient(mockCtrl)
tn := time.Now()
@ -105,6 +108,8 @@ This is sparta (hosted on https://example.org)
Time: tn,
}).Return(nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{{Hostname: "example2.onion"}}, nil)
// make sure we are creating the resource
apiClientMock.EXPECT().AddResource(api.ResourceDto{
URL: "https://example.onion",
@ -126,8 +131,46 @@ This is sparta (hosted on https://example.org)
PublishEvent(&event.FoundURLEvent{URL: "https://google.com/test?test=test"}).
Return(nil)
s := State{apiClient: apiClientMock}
s := State{apiClient: apiClientMock, configClient: configClientMock}
if err := s.handleNewResourceEvent(subscriberMock, msg); err != nil {
t.FailNow()
}
}
// TestHandleMessageForbiddenHostname checks that handleNewResourceEvent
// returns errHostnameNotAllowed when the hostname of the incoming resource URL
// ("example.onion") is listed as forbidden. No expectation is registered on
// the API mock, and none on the subscriber beyond Read, so any other call on
// either mock fails the test.
func TestHandleMessageForbiddenHostname(t *testing.T) {
	html := `
<title>Creekorful Inc</title>
This is sparta (hosted on https://example.org)
<a href="https://google.com/test?test=test#12">
<meta name="DescriptIon" content="Zhello world">
<meta property="og:url" content="https://example.org">`

	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	apiMock := api_mock.NewMockAPI(ctrl)
	subMock := event_mock.NewMockSubscriber(ctrl)
	cfgMock := client_mock.NewMockClient(ctrl)

	// Event handed back to the handler when it reads the raw message.
	incoming := event.NewResourceEvent{
		URL:     "https://example.onion",
		Body:    html,
		Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
		Time:    time.Now(),
	}

	msg := event.RawMessage{}
	subMock.EXPECT().
		Read(&msg, &event.NewResourceEvent{}).
		SetArg(1, incoming).
		Return(nil)

	// The resource hostname is the forbidden one.
	cfgMock.EXPECT().
		GetForbiddenHostnames().
		Return([]client.ForbiddenHostname{{Hostname: "example.onion"}}, nil)

	state := State{apiClient: apiMock, configClient: cfgMock}
	if got := state.handleNewResourceEvent(subMock, msg); got != errHostnameNotAllowed {
		t.FailNow()
	}
}

@ -5,6 +5,7 @@ import (
"fmt"
"github.com/creekorful/trandoshan/api"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/rs/zerolog/log"
@ -109,12 +110,11 @@ func (state *State) handleURLFoundEvent(subscriber event.Subscriber, msg event.R
}
// Make sure hostname is not forbidden
if hostnames, err := state.configClient.GetForbiddenHostnames(); err == nil {
for _, hostname := range hostnames {
if strings.Contains(u.Hostname(), hostname.Hostname) {
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
}
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources

Loading…
Cancel
Save