Merge pull request #95 from creekorful/91-dead-hostname-event

crawler: publish event in case hostname is down
pull/97/head
Aloïs Micard 3 years ago committed by GitHub
commit 4e849830f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -102,6 +102,11 @@ func (state *State) handleNewURLEvent(subscriber event.Subscriber, msg event.Raw
r, err := state.httpClient.Get(evt.URL)
if err != nil {
if err == chttp.ErrTimeout {
// indicate that crawling has failed
_ = subscriber.PublishEvent(&event.TimeoutURLEvent{URL: evt.URL})
}
return err
}

@ -5,6 +5,7 @@ import (
"github.com/creekorful/trandoshan/internal/clock_mock"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/crawler/http"
"github.com/creekorful/trandoshan/internal/crawler/http_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
@ -39,8 +40,8 @@ func TestHandleNewURLEvent(t *testing.T) {
responseBody string
// internal state: allowed mime types
allowedMimeTypes []client.MimeType
// is the test expected to pass?
pass bool
// The expected error
err error
}
tests := []test{
@ -52,14 +53,12 @@ func TestHandleNewURLEvent(t *testing.T) {
{ContentType: "text/plain", Extensions: nil},
{ContentType: "text/css", Extensions: nil},
},
pass: true,
},
{
url: "https://example.onion",
responseHeaders: map[string]string{"Content-Type": "text/plain"},
responseBody: "Hello",
allowedMimeTypes: []client.MimeType{},
pass: true,
},
{
url: "https://example.onion",
@ -71,7 +70,6 @@ func TestHandleNewURLEvent(t *testing.T) {
Extensions: nil,
},
},
pass: true,
},
{
url: "https://example.onion/image.png",
@ -83,7 +81,19 @@ func TestHandleNewURLEvent(t *testing.T) {
Extensions: nil,
},
},
pass: false,
err: errContentTypeNotAllowed,
},
{
url: "https://downhostname.onion",
responseHeaders: map[string]string{"Content-Type": "text/plain"},
responseBody: "Hello",
allowedMimeTypes: []client.MimeType{
{
ContentType: "text/plain",
Extensions: nil,
},
},
err: http.ErrTimeout,
},
}
@ -95,13 +105,21 @@ func TestHandleNewURLEvent(t *testing.T) {
Return(nil)
// mock crawling
httpResponseMock.EXPECT().Headers().Return(test.responseHeaders)
httpClientMock.EXPECT().Get(test.url).Return(httpResponseMock, nil)
switch test.err {
case http.ErrTimeout:
httpClientMock.EXPECT().Get(test.url).Return(httpResponseMock, http.ErrTimeout)
subscriberMock.EXPECT().PublishEvent(&event.TimeoutURLEvent{URL: test.url}).Return(nil)
break
default:
httpResponseMock.EXPECT().Headers().Return(test.responseHeaders)
httpClientMock.EXPECT().Get(test.url).Return(httpResponseMock, nil)
// mock config retrieval
configClientMock.EXPECT().GetAllowedMimeTypes().Return(test.allowedMimeTypes, nil)
// mock config retrieval
configClientMock.EXPECT().GetAllowedMimeTypes().Return(test.allowedMimeTypes, nil)
break
}
if test.pass {
if test.err == nil {
httpResponseMock.EXPECT().Headers().Return(test.responseHeaders)
httpResponseMock.EXPECT().Body().Return(strings.NewReader(test.responseBody))
@ -118,10 +136,10 @@ func TestHandleNewURLEvent(t *testing.T) {
}
err := s.handleNewURLEvent(subscriberMock, msg)
if test.pass && err != nil {
if test.err == nil && err != nil {
t.Errorf("test should have passed but has failed with: %s", err)
}
if !test.pass && !errors.Is(err, errContentTypeNotAllowed) {
if !errors.Is(err, test.err) {
t.Errorf("test shouldn't have passed but hasn't returned expected error: %s", err)
}
}

@ -3,10 +3,15 @@ package http
//go:generate mockgen -destination=../http_mock/client_mock.go -package=http_mock . Client
import (
"errors"
"fmt"
"github.com/valyala/fasthttp"
"strings"
)
// ErrTimeout is returned when crawling fails because of a timeout.
var ErrTimeout = errors.New("timeout has occurred")
// Client is an HTTP client
type Client interface {
// Get the corresponding URL
@ -32,6 +37,11 @@ func (c *client) Get(URL string) (Response, error) {
req.SetRequestURI(URL)
if err := c.c.Do(req, resp); err != nil {
// TODO: detect timeouts without matching on the error message text
if strings.Contains(err.Error(), "unknown error TTL expired") {
return nil, ErrTimeout
}
return nil, err
}

@ -5,11 +5,13 @@ import "time"
//go:generate mockgen -destination=../event_mock/event_mock.go -package=event_mock . Publisher,Subscriber
const (
// NewURLExchange is the subject used when an URL is schedule for crawling
// NewURLExchange is the exchange used when an URL is schedule for crawling
NewURLExchange = "url.new"
// FoundURLExchange is the subject used when an URL is extracted from resource
// FoundURLExchange is the exchange used when an URL is extracted from resource
FoundURLExchange = "url.found"
// NewResourceExchange is the subject used when a new resource has been crawled
// TimeoutURLExchange is the exchange used when a crawl fails because of a timeout
TimeoutURLExchange = "url.timeout"
// NewResourceExchange is the exchange used when a new resource has been crawled
NewResourceExchange = "resource.new"
// ConfigExchange is the exchange used to dispatch new configuration
ConfigExchange = "config"
@ -41,6 +43,16 @@ func (msg *FoundURLEvent) Exchange() string {
return FoundURLExchange
}
// TimeoutURLEvent represents a crawl that failed because of a timeout.
type TimeoutURLEvent struct {
// URL is the URL whose crawl timed out; serialized as "url" in JSON.
URL string `json:"url"`
}
// Exchange returns the exchange the event should be pushed to
// (TimeoutURLExchange).
func (msg *TimeoutURLEvent) Exchange() string {
return TimeoutURLExchange
}
// NewResourceEvent represent a crawled resource
type NewResourceEvent struct {
URL string `json:"url"`

Loading…
Cancel
Save