Create constraint package

pull/99/head
Aloïs Micard 3 years ago
parent d62af0889e
commit a70a958ee4
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -0,0 +1,28 @@
package constraint
import (
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"net/url"
"strings"
)
// CheckHostnameAllowed reports whether the hostname of rawurl may be crawled.
//
// The forbidden hostnames are fetched from configClient on every call. The URL
// is rejected (false, nil) as soon as its hostname contains one of the
// forbidden entries as a substring; otherwise it is accepted (true, nil).
// When rawurl cannot be parsed, or the forbidden hostnames cannot be
// retrieved, the underlying error is returned together with false.
func CheckHostnameAllowed(configClient configapi.Client, rawurl string) (bool, error) {
	parsed, err := url.Parse(rawurl)
	if err != nil {
		return false, err
	}

	forbidden, err := configClient.GetForbiddenHostnames()
	if err != nil {
		return false, err
	}

	// The hostname is the same for every comparison, so extract it once.
	host := parsed.Hostname()
	for i := range forbidden {
		if strings.Contains(host, forbidden[i].Hostname) {
			return false, nil
		}
	}

	return true, nil
}

@ -0,0 +1,29 @@
package constraint
import (
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/golang/mock/gomock"
"testing"
)
// TestCheckHostnameAllowed verifies that a URL whose hostname matches a
// forbidden entry is rejected, while a URL with a different hostname is
// accepted, and that neither case produces an error.
func TestCheckHostnameAllowed(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	configClientMock := client_mock.NewMockClient(mockCtrl)

	forbidden := []client.ForbiddenHostname{
		{Hostname: "google.onion"},
	}

	cases := []struct {
		rawurl  string
		allowed bool
	}{
		{rawurl: "https://google.onion", allowed: false},
		{rawurl: "https://google2.onion", allowed: true},
	}

	for _, tc := range cases {
		// Each call to CheckHostnameAllowed fetches the list exactly once,
		// so register one expectation per case.
		configClientMock.EXPECT().GetForbiddenHostnames().Return(forbidden, nil)

		allowed, err := CheckHostnameAllowed(configClientMock, tc.rawurl)
		if err != nil || allowed != tc.allowed {
			t.Fail()
		}
	}
}

@ -5,6 +5,7 @@ import (
"fmt"
"github.com/creekorful/trandoshan/internal/clock"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
chttp "github.com/creekorful/trandoshan/internal/crawler/http"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
@ -14,7 +15,6 @@ import (
"github.com/valyala/fasthttp/fasthttpproxy"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
@ -104,20 +104,11 @@ func (state *State) handleNewURLEvent(subscriber event.Subscriber, msg event.Raw
log.Debug().Str("url", evt.URL).Msg("Processing URL")
u, err := url.Parse(evt.URL)
if err != nil {
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
}
forbiddenHostnames, err := state.configClient.GetForbiddenHostnames()
if err != nil {
return err
}
for _, hostname := range forbiddenHostnames {
if strings.Contains(u.Hostname(), hostname.Hostname) {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
r, err := state.httpClient.Get(evt.URL)

@ -6,13 +6,13 @@ import (
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/api"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"mvdan.cc/xurls/v2"
"net/http"
"net/url"
"strings"
)
@ -76,20 +76,11 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
log.Debug().Str("url", evt.URL).Msg("Processing new resource")
u, err := url.Parse(evt.URL)
if err != nil {
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
}
forbiddenHostnames, err := state.configClient.GetForbiddenHostnames()
if err != nil {
return err
}
for _, hostname := range forbiddenHostnames {
if strings.Contains(u.Hostname(), hostname.Hostname) {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return errHostnameNotAllowed
}
// Extract & process resource

@ -5,6 +5,7 @@ import (
"fmt"
"github.com/creekorful/trandoshan/api"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/rs/zerolog/log"
@ -109,12 +110,11 @@ func (state *State) handleURLFoundEvent(subscriber event.Subscriber, msg event.R
}
// Make sure hostname is not forbidden
if hostnames, err := state.configClient.GetForbiddenHostnames(); err == nil {
for _, hostname := range hostnames {
if strings.Contains(u.Hostname(), hostname.Hostname) {
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
}
if allowed, err := constraint.CheckHostnameAllowed(state.configClient, evt.URL); err != nil {
return err
} else if !allowed {
log.Debug().Str("url", evt.URL).Msg("Skipping forbidden hostname")
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources

Loading…
Cancel
Save