Implement cache logic

pull/115/head
Aloïs Micard 3 years ago
parent 55ae36f3b9
commit 477092316b
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -0,0 +1,14 @@
package cache
import "time"
//go:generate mockgen -destination=../cache_mock/cache_mock.go -package=cache_mock . Cache
// Cache represents a key/value store whose entries are written together with
// a time-to-live. It offers typed accessors for raw byte slices and for int64
// values.
//
// NOTE(review): this declaration does not specify what a getter returns for a
// missing or expired key, nor how a zero or negative ttl is handled; confirm
// both against the concrete implementation before relying on either.
type Cache interface {
	// GetBytes returns the byte slice stored under key.
	GetBytes(key string) ([]byte, error)
	// SetBytes stores value under key with the given time-to-live.
	SetBytes(key string, value []byte, ttl time.Duration) error
	// GetInt64 returns the int64 stored under key.
	GetInt64(key string) (int64, error)
	// SetInt64 stores value under key with the given time-to-live.
	SetInt64(key string, value int64, ttl time.Duration) error
}

@ -6,6 +6,7 @@ import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/internal/cache"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/indexer/auth"
@ -36,6 +37,7 @@ type State struct {
index index.Index
pub event.Publisher
configClient configapi.Client
urlCache cache.Cache
}
// Name returns the process name
@ -182,8 +184,14 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
resDto.Headers[strings.ToLower(key)] = value
}
// Get current refresh delay
refreshDelay := time.Duration(-1)
if val, err := state.configClient.GetRefreshDelay(); err == nil {
refreshDelay = val.Delay
}
// Save resource
if _, err := state.addResource(resDto); err != nil {
if _, err := state.tryAddResource(resDto, refreshDelay); err != nil {
return err
}
@ -196,15 +204,41 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
continue
}
// make sure url should be published
count, err := state.countResource(u)
// make sure url has not been published (yet)
count, err := state.urlCache.GetInt64(fmt.Sprintf("urls:%s", u))
if err != nil {
log.Err(err).
Str("url", u).
Msg("error while checking URL cache")
continue
}
if count > 0 {
log.Trace().
Str("url", u).
Msg("skipping already published URL")
continue
}
// make sure url should be published (not already crawled)
count, err = state.countResource(u, refreshDelay)
if err != nil {
log.Err(err).
Str("url", evt.URL).
Msg("error while checking ES")
continue
}
if count > 0 {
log.Trace().
Str("url", evt.URL).
Msg("skipping already crawled URL")
continue
}
// Update cache
if err := state.urlCache.SetInt64(fmt.Sprintf("urls:%s", u), count+1, refreshDelay); err != nil {
log.Err(err).Msg("error while updating URL cache")
}
log.Trace().
Str("url", u).
Msg("Publishing found URL")
@ -222,7 +256,7 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
return nil
}
func (state *State) addResource(res client.ResourceDto) (client.ResourceDto, error) {
func (state *State) tryAddResource(res client.ResourceDto, refreshDelay time.Duration) (client.ResourceDto, error) {
forbiddenHostnames, err := state.configClient.GetForbiddenHostnames()
if err != nil {
return client.ResourceDto{}, err
@ -239,7 +273,13 @@ func (state *State) addResource(res client.ResourceDto) (client.ResourceDto, err
}
}
count, err := state.countResource(res.URL)
// Workaround to prevent adding a 'duplicate resource'.
// The thing is: even with the scheduler preventing the crawling of 'duplicate' URLs by applying a refresh period
// and checking that the resource is not already indexed, this implementation may not work if the URL was published
// before the resource was saved. And this happens a LOT.
// Therefore the best thing to do is to make the API check whether the resource should **really** be added, by
// checking that it isn't already present in the database. This may sound hacky, but it's the best solution I've
// come up with so far.
count, err := state.countResource(res.URL, refreshDelay)
if err != nil {
return client.ResourceDto{}, err
}
@ -278,18 +318,10 @@ func (state *State) addResource(res client.ResourceDto) (client.ResourceDto, err
return res, nil
}
// Hacky stuff to prevent from adding 'duplicate resource'
// the thing is: even with the scheduler preventing from crawling 'duplicates' URL by adding a refresh period
// and checking if the resource is not already indexed, this implementation may not work if the URLs was published
// before the resource is saved. And this happen a LOT of time.
// therefore the best thing to do is to make the API check if the resource should **really** be added by checking if
// it isn't present on the database. This may sounds hacky, but it's the best solution i've come up at this time.
func (state *State) countResource(URL string) (int64, error) {
func (state *State) countResource(URL string, refreshDelay time.Duration) (int64, error) {
endDate := time.Time{}
if refreshDelay, err := state.configClient.GetRefreshDelay(); err == nil {
if refreshDelay.Delay != -1 {
endDate = time.Now().Add(-refreshDelay.Delay)
}
if refreshDelay != -1 {
endDate = time.Now().Add(-refreshDelay)
}
count, err := state.index.CountResources(&client.ResSearchParams{

@ -4,6 +4,7 @@ import (
"encoding/json"
"errors"
"fmt"
"github.com/creekorful/trandoshan/internal/cache_mock"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
@ -180,11 +181,10 @@ func TestAddResource(t *testing.T) {
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
})
configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: 5 * time.Hour}, nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil)
s := State{index: indexMock, configClient: configClientMock, pub: pubMock}
res, err := s.addResource(body)
res, err := s.tryAddResource(body, 5*time.Hour)
if err != nil {
t.FailNow()
}
@ -238,12 +238,11 @@ func TestAddResourceDuplicateNotAllowed(t *testing.T) {
PageNumber: 1,
}, endDateZero: true}).Return(int64(1), nil)
configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: -1}, nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil)
s := State{index: indexMock, configClient: configClientMock}
if _, err := s.addResource(body); !errors.Is(err, errAlreadyIndexed) {
if _, err := s.tryAddResource(body, -1); !errors.Is(err, errAlreadyIndexed) {
t.FailNow()
}
}
@ -272,12 +271,11 @@ func TestAddResourceTooYoung(t *testing.T) {
PageNumber: 1,
}}).Return(int64(1), nil)
configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: 10 * time.Minute}, nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil)
s := State{index: indexMock, configClient: configClientMock}
if _, err := s.addResource(body); !errors.Is(err, errAlreadyIndexed) {
if _, err := s.tryAddResource(body, 10*time.Minute); !errors.Is(err, errAlreadyIndexed) {
t.FailNow()
}
}
@ -302,7 +300,7 @@ func TestAddResourceForbiddenHostname(t *testing.T) {
s := State{configClient: configClientMock}
if _, err := s.addResource(body); err != errHostnameNotAllowed {
if _, err := s.tryAddResource(body, -1); err != errHostnameNotAllowed {
t.FailNow()
}
}
@ -429,6 +427,8 @@ This is sparta (hosted on https://example.org)
<a href="https://google.com/test?test=test#12">
Thanks to https://help.facebook.onion/ for the hosting :D
<meta name="DescriptIon" content="Zhello world">
<meta property="og:url" content="https://example.org">`
@ -439,6 +439,7 @@ This is sparta (hosted on https://example.org)
configClientMock := client_mock.NewMockClient(mockCtrl)
indexMock := index_mock.NewMockIndex(mockCtrl)
pubMock := event_mock.NewMockPublisher(mockCtrl)
urlCacheMock := cache_mock.NewMockCache(mockCtrl)
tn := time.Now()
@ -453,7 +454,7 @@ This is sparta (hosted on https://example.org)
}).Return(nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{{Hostname: "example2.onion"}}, nil)
configClientMock.EXPECT().GetRefreshDelay().Times(3).Return(client.RefreshDelay{Delay: -1}, nil)
configClientMock.EXPECT().GetRefreshDelay().Times(1).Return(client.RefreshDelay{Delay: -1}, nil)
indexMock.EXPECT().CountResources(&client2.ResSearchParams{
URL: "https://example.onion",
@ -483,23 +484,28 @@ This is sparta (hosted on https://example.org)
}).Return(nil)
// make sure we are pushing found URLs (but only if refresh delay elapsed)
urlCacheMock.EXPECT().GetInt64("urls:https://example.org").Return(int64(0), nil)
indexMock.EXPECT().CountResources(&client2.ResSearchParams{
URL: "https://example.org",
PageSize: 1,
PageNumber: 1,
}).Return(int64(0), nil)
urlCacheMock.EXPECT().GetInt64("urls:https://help.facebook.onion").Return(int64(1), nil)
urlCacheMock.EXPECT().GetInt64("urls:https://google.com/test?test=test").Return(int64(0), nil)
indexMock.EXPECT().CountResources(&client2.ResSearchParams{
URL: "https://google.com/test?test=test",
PageSize: 1,
PageNumber: 1,
}).Return(int64(1), nil)
// should be called only one time
subscriberMock.EXPECT().
PublishEvent(&event.FoundURLEvent{URL: "https://example.org"}).
Return(nil)
urlCacheMock.EXPECT().SetInt64("urls:https://example.org", int64(1), time.Duration(-1)).Return(nil)
s := State{index: indexMock, configClient: configClientMock, pub: pubMock}
s := State{index: indexMock, configClient: configClientMock, pub: pubMock, urlCache: urlCacheMock}
if err := s.handleNewResourceEvent(subscriberMock, msg); err != nil {
t.FailNow()
}
@ -534,6 +540,7 @@ This is sparta (hosted on https://example.org)
Time: tn,
}).Return(nil)
configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: -1}, nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{{Hostname: "example.onion"}}, nil)
s := State{configClient: configClientMock}

Loading…
Cancel
Save