Merge remote-tracking branch 'origin/develop' into rabbitmq-refactoring

pull/53/head
Aloïs Micard 4 years ago
commit db983c584b
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -31,6 +31,7 @@ type ResourceDto struct {
Title string `json:"title"`
Meta map[string]string `json:"meta"`
Description string `json:"description"`
Headers map[string]string `json:"headers"`
}
// CredentialsDto represent the credential when logging in the API

@ -29,6 +29,7 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc=
github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=

@ -20,6 +20,7 @@ type ResourceIdx struct {
Title string `json:"title"`
Meta map[string]string `json:"meta"`
Description string `json:"description"`
Headers map[string]string `json:"headers"`
}
// ResSearchParams is the search params used
@ -32,6 +33,7 @@ type ResSearchParams struct {
PageSize int
PageNumber int
// TODO allow searching by meta
// TODO allow searching by headers
}
// Database is the interface used to abstract communication

@ -78,6 +78,7 @@ func (s *svc) addResource(res api.ResourceDto) (api.ResourceDto, error) {
Title: res.Title,
Meta: res.Meta,
Description: res.Description,
Headers: res.Headers,
}
if err := s.db.AddResource(doc); err != nil {

@ -62,6 +62,7 @@ func TestAddResource(t *testing.T) {
Time: time.Time{},
Meta: map[string]string{"content": "content-meta"},
Description: "the description",
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
})
s := svc{db: dbMock}
@ -73,6 +74,7 @@ func TestAddResource(t *testing.T) {
Time: time.Time{},
Meta: map[string]string{"content": "content-meta"},
Description: "the description",
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
})
if err != nil {
t.FailNow()
@ -96,6 +98,12 @@ func TestAddResource(t *testing.T) {
if res.Description != "the description" {
t.FailNow()
}
if res.Headers["Content-Type"] != "application/html" {
t.FailNow()
}
if res.Headers["Server"] != "Traefik" {
t.FailNow()
}
}
func TestScheduleURL(t *testing.T) {

@ -3,6 +3,7 @@ package crawler
import (
"crypto/tls"
"fmt"
"github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util"
@ -11,6 +12,7 @@ import (
"github.com/valyala/fasthttp"
"github.com/valyala/fasthttp/fasthttpproxy"
"io"
"io/ioutil"
"strings"
"time"
)
@ -57,7 +59,7 @@ func execute(ctx *cli.Context) error {
Msg("Starting tdsh-crawler")
// Create the HTTP client
httpClient := &fasthttp.Client{
httpClient := http.NewFastHTTPClient(&fasthttp.Client{
// Use given TOR proxy to reach the hidden services
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
// Disable SSL verification since we do not really care about this
@ -65,7 +67,7 @@ func execute(ctx *cli.Context) error {
ReadTimeout: time.Second * 5,
WriteTimeout: time.Second * 5,
Name: ctx.String("user-agent"),
}
})
// Create the subscriber
sub, err := messaging.NewSubscriber(ctx.String("event-srv-uri"))
@ -84,22 +86,23 @@ func execute(ctx *cli.Context) error {
return nil
}
func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) messaging.MsgHandler {
func handleMessage(httpClient http.Client, allowedContentTypes []string) messaging.MsgHandler {
return func(sub messaging.Subscriber, msg io.Reader) error {
var urlMsg messaging.URLTodoMsg
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
return err
}
body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
body, headers, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
if err != nil {
return fmt.Errorf("error while crawling URL: %s", err)
}
// Publish resource body
res := messaging.NewResourceMsg{
URL: urlMsg.URL,
Body: body,
URL: urlMsg.URL,
Body: body,
Headers: headers,
}
if err := sub.PublishMsg(&res); err != nil {
return fmt.Errorf("error while publishing resource: %s", err)
@ -109,34 +112,17 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) me
}
}
func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) {
func crawURL(httpClient http.Client, url string, allowedContentTypes []string) (string, map[string]string, error) {
log.Debug().Str("url", url).Msg("Processing URL")
// Query the website
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
req.SetRequestURI(url)
if err := httpClient.Do(req, resp); err != nil {
return "", err
}
switch code := resp.StatusCode(); {
case code > 302:
return "", fmt.Errorf("non-managed error code %d", code)
// follow redirect
case code == 301 || code == 302:
if location := string(resp.Header.Peek("Location")); location != "" {
return crawURL(httpClient, location, allowedContentTypes)
}
r, err := httpClient.Get(url)
if err != nil {
return "", nil, err
}
// Determinate if content type is allowed
allowed := false
contentType := string(resp.Header.Peek("Content-Type"))
contentType := r.Headers()["Content-Type"]
for _, allowedContentType := range allowedContentTypes {
if strings.Contains(contentType, allowedContentType) {
allowed = true
@ -146,8 +132,13 @@ func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []stri
if !allowed {
err := fmt.Errorf("forbidden content type : %s", contentType)
return "", err
return "", nil, err
}
return string(resp.Body()), nil
// Ready body
b, err := ioutil.ReadAll(r.Body())
if err != nil {
return "", nil, err
}
return string(b), r.Headers(), nil
}

@ -1 +1,118 @@
package crawler
import (
"bytes"
"github.com/creekorful/trandoshan/internal/http_mock"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/messaging_mock"
"github.com/golang/mock/gomock"
"strings"
"testing"
)
func TestCrawlURLForbiddenContentType(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
httpClientMock := http_mock.NewMockClient(mockCtrl)
url := "https://example.onion"
allowedContentTypes := []string{"text/plain"}
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
httpResponseMock.EXPECT().Headers().Return(map[string]string{"Content-Type": "image/png"})
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
if body != "" || headers != nil || err == nil {
t.Fail()
}
}
func TestCrawlURLSameContentType(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
httpClientMock := http_mock.NewMockClient(mockCtrl)
url := "https://example.onion"
allowedContentTypes := []string{"text/plain"}
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
if err != nil {
t.Fail()
}
if body != "Hello" {
t.Fail()
}
if len(headers) != 1 {
t.Fail()
}
if headers["Content-Type"] != "text/plain" {
t.Fail()
}
}
func TestCrawlURLNoContentTypeFiltering(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
httpClientMock := http_mock.NewMockClient(mockCtrl)
url := "https://example.onion"
allowedContentTypes := []string{""}
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
if err != nil {
t.Fail()
}
if body != "Hello" {
t.Fail()
}
if len(headers) != 1 {
t.Fail()
}
if headers["Content-Type"] != "text/plain" {
t.Fail()
}
}
func TestHandleMessage(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
httpClientMock := http_mock.NewMockClient(mockCtrl)
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(msg, &messaging.URLTodoMsg{}).
SetArg(1, messaging.URLTodoMsg{URL: "https://example.onion/image.png?id=12&test=2"}).
Return(nil)
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain", "Server": "Debian"})
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
httpClientMock.EXPECT().Get("https://example.onion/image.png?id=12&test=2").Return(httpResponseMock, nil)
subscriberMock.EXPECT().PublishMsg(&messaging.NewResourceMsg{
URL: "https://example.onion/image.png?id=12&test=2",
Body: "Hello",
Headers: map[string]string{"Content-Type": "text/plain", "Server": "Debian"},
}).Return(nil)
if err := handleMessage(httpClientMock, []string{"text/plain", "text/css"})(subscriberMock, msg); err != nil {
t.Fail()
}
}

@ -79,6 +79,7 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
if err != nil {
return fmt.Errorf("error while extracting resource: %s", err)
}
resDto.Headers = resMsg.Headers
// Submit to the API
_, err = apiClient.AddResource(resDto)
@ -87,7 +88,15 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
}
// Finally push found URLs
publishedURLS := map[string]string{}
for _, url := range urls {
if _, exist := publishedURLS[url]; exist {
log.Trace().
Str("url", url).
Msg("Skipping duplicate URL")
continue
}
log.Trace().
Str("url", url).
Msg("Publishing found URL")
@ -98,6 +107,8 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
Str("err", err.Error()).
Msg("Error while publishing URL")
}
publishedURLS[url] = url
}
return nil

@ -42,12 +42,15 @@ This is sparta
t.Fail()
}
if len(urls) == 0 {
if len(urls) != 2 {
t.FailNow()
}
if urls[0] != "https://google.com/test?test=test" {
t.Fail()
}
if urls[1] != "https://example.org" {
t.Fail()
}
if resDto.Description != "Zhello world" {
t.Fail()
@ -77,7 +80,7 @@ func TestHandleMessage(t *testing.T) {
body := `
<title>Creekorful Inc</title>
This is sparta
This is sparta (hosted on https://example.org)
<a href="https://google.com/test?test=test#12">
@ -93,8 +96,11 @@ This is sparta
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(msg, &messaging.NewResourceMsg{}).
SetArg(1, messaging.NewResourceMsg{URL: "https://example.onion", Body: body}).
Return(nil)
SetArg(1, messaging.NewResourceMsg{
URL: "https://example.onion",
Body: body,
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
}).Return(nil)
// make sure we are creating the resource
apiClientMock.EXPECT().AddResource(&resMatcher{target: api.ResourceDto{
@ -103,9 +109,12 @@ This is sparta
Title: "Creekorful Inc",
Meta: map[string]string{"description": "Zhello world", "og:url": "https://example.org"},
Description: "Zhello world",
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
}}).Return(api.ResourceDto{}, nil)
// make sure we are pushing found URLs
// should be called only one time
subscriberMock.EXPECT().
PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}).
Return(nil)
@ -118,7 +127,7 @@ This is sparta
}
}
// custom matcher to ignore time field when doing comparison
// custom matcher to ignore time field when doing comparison ;(
// todo: do less crappy?
type resMatcher struct {
target api.ResourceDto
@ -131,7 +140,9 @@ func (rm *resMatcher) Matches(x interface{}) bool {
arg.URL == rm.target.URL &&
arg.Body == rm.target.Body &&
arg.Description == rm.target.Description &&
exactMatch(arg.Meta, rm.target.Meta)
exactMatch(arg.Meta, rm.target.Meta) &&
arg.Headers["Server"][0] == rm.target.Headers["Server"][0] &&
arg.Headers["Content-Type"] == rm.target.Headers["Content-Type"] // TODO allow other headers comparison
}
func (rm *resMatcher) String() string {

@ -0,0 +1,52 @@
package http
//go:generate mockgen -destination=../http_mock/client_mock.go -package=http_mock . Client
import (
"fmt"
"github.com/valyala/fasthttp"
)
// Client is an HTTP client
type Client interface {
// Get the corresponding URL
// this methods follows redirections
Get(URL string) (Response, error)
}
type client struct {
c *fasthttp.Client
}
// NewFastHTTPClient create a new Client using fasthttp.Client as backend
func NewFastHTTPClient(c *fasthttp.Client) Client {
return &client{c: c}
}
func (c *client) Get(URL string) (Response, error) {
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
req.SetRequestURI(URL)
if err := c.c.Do(req, resp); err != nil {
return nil, err
}
switch code := resp.StatusCode(); {
case code > 302:
return nil, fmt.Errorf("non-managed error code %d", code)
// follow redirect
case code == 301 || code == 302:
if location := string(resp.Header.Peek("Location")); location != "" {
return c.Get(location)
}
}
r := &response{}
resp.CopyTo(&r.raw)
return r, nil
}

@ -0,0 +1,33 @@
package http
//go:generate mockgen -destination=../http_mock/response_mock.go -package=http_mock . Response
import (
"bytes"
"github.com/valyala/fasthttp"
"io"
)
// Response is an HTTP response
type Response interface {
// Headers returns the response headers
Headers() map[string]string
// Body return the response body
Body() io.Reader
}
type response struct {
raw fasthttp.Response
}
func (r *response) Headers() map[string]string {
headers := map[string]string{}
r.raw.Header.VisitAll(func(key, value []byte) {
headers[string(key)] = string(value) // TODO manage multiple values?
})
return headers
}
func (r *response) Body() io.Reader {
return bytes.NewReader(r.raw.Body())
}

@ -39,8 +39,9 @@ func (msg *URLFoundMsg) Subject() string {
// NewResourceMsg represent a crawled resource
type NewResourceMsg struct {
URL string `json:"url"`
Body string `json:"body"`
URL string `json:"url"`
Body string `json:"body"`
Headers map[string]string `json:"headers"`
}
// Subject returns the subject where message should be push

@ -92,6 +92,12 @@ func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenEx
return nil // Technically not an error
}
// Make sure protocol is allowed
if !strings.HasPrefix(u.Scheme, "http") {
log.Trace().Stringer("url", u).Msg("URL has invalid scheme")
return nil // Technically not an error
}
// Make sure extension is not forbidden
for _, ext := range forbiddenExtensions {
if strings.HasSuffix(u.Path, "."+ext) {

@ -2,6 +2,7 @@ package scheduler
import (
"bytes"
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/api_mock"
"github.com/creekorful/trandoshan/internal/messaging"
@ -47,6 +48,27 @@ func TestHandleMessageNotOnion(t *testing.T) {
}
}
func TestHandleMessageWrongProtocol(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
msg := bytes.NewReader(nil)
for _, protocol := range []string{"irc", "ftp"} {
subscriberMock.EXPECT().
ReadMsg(msg, &messaging.URLFoundMsg{}).
SetArg(1, messaging.URLFoundMsg{URL: fmt.Sprintf("%s://example.onion", protocol)}).
Return(nil)
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, msg); err != nil {
t.FailNow()
}
}
}
func TestHandleMessageAlreadyCrawled(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()

Loading…
Cancel
Save