extractor: prevent publishing duplicate URLs

pull/55/head
Aloïs Micard 4 years ago
parent b365954e31
commit 82868521ab
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -87,7 +87,15 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
}
// Finally push found URLs
publishedURLS := map[string]string{}
for _, url := range urls {
if _, exist := publishedURLS[url]; exist {
log.Trace().
Str("url", url).
Msg("Skipping duplicate URL")
continue
}
log.Trace().
Str("url", url).
Msg("Publishing found URL")
@ -98,6 +106,8 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
Str("err", err.Error()).
Msg("Error while publishing URL")
}
publishedURLS[url] = url
}
return nil

@ -42,12 +42,15 @@ This is sparta
t.Fail()
}
if len(urls) == 0 {
if len(urls) != 2 {
t.FailNow()
}
if urls[0] != "https://google.com/test?test=test" {
t.Fail()
}
if urls[1] != "https://example.org" {
t.Fail()
}
if resDto.Description != "Zhello world" {
t.Fail()
@ -77,7 +80,7 @@ func TestHandleMessage(t *testing.T) {
body := `
<title>Creekorful Inc</title>
This is sparta
This is sparta (hosted on https://example.org)
<a href="https://google.com/test?test=test#12">
@ -106,6 +109,8 @@ This is sparta
}}).Return(api.ResourceDto{}, nil)
// make sure we are pushing found URLs
// should be called only one time
subscriberMock.EXPECT().
PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}).
Return(nil)

Loading…
Cancel
Save