From 871daf1bcc5641b919d1ef3c6be154270d1de760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alo=C3=AFs=20Micard?= Date: Fri, 8 Jan 2021 13:00:39 +0100 Subject: [PATCH] scheduler: hash url before caching it --- internal/scheduler/scheduler.go | 27 ++++++++++++++++++++++++--- internal/scheduler/scheduler_test.go | 21 +++++++++++++++------ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 9c99ff1..b81e64b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -11,9 +11,11 @@ import ( "github.com/creekorful/trandoshan/internal/process" "github.com/rs/zerolog/log" "github.com/urfave/cli/v2" + "hash/fnv" "mvdan.cc/xurls/v2" "net/http" "net/url" + "strconv" "strings" ) @@ -89,8 +91,20 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even return fmt.Errorf("error while extracting URLs") } + // We are working using URL hash to reduce memory consumption. + // See: https://github.com/creekorful/trandoshan/issues/130 + var urlHashes []string + for _, u := range urls { + c := fnv.New64() + if _, err := c.Write([]byte(u)); err != nil { + return fmt.Errorf("error while computing url hash: %s", err) + } + + urlHashes = append(urlHashes, strconv.FormatUint(c.Sum64(), 10)) + } + // Load values in batch - urlCache, err := state.urlCache.GetManyInt64(urls) + urlCache, err := state.urlCache.GetManyInt64(urlHashes) if err != nil { return err } @@ -175,14 +189,21 @@ func (state *State) processURL(rawURL string, pub event.Publisher, urlCache map[ return fmt.Errorf("%s %w", u, errHostnameNotAllowed) } + // Compute url hash + c := fnv.New64() + if _, err := c.Write([]byte(rawURL)); err != nil { + return fmt.Errorf("error while computing url hash: %s", err) + } + urlHash := strconv.FormatUint(c.Sum64(), 10) + // Check if URL should be scheduled - if urlCache[rawURL] > 0 { + if urlCache[urlHash] > 0 { return fmt.Errorf("%s %w", u, errAlreadyScheduled) } log.Debug().Stringer("url", u).Msg("URL should be scheduled") - urlCache[rawURL]++ + urlCache[urlHash]++ if err := pub.PublishEvent(&event.NewURLEvent{URL: rawURL}); err != nil { return fmt.Errorf("error while publishing URL: %s", err) diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index dbbff27..7e7807b 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -12,6 +12,8 @@ import ( "github.com/creekorful/trandoshan/internal/process_mock" "github.com/creekorful/trandoshan/internal/test" "github.com/golang/mock/gomock" + "hash/fnv" + "strconv" "testing" ) @@ -153,7 +155,7 @@ func TestProcessURL_AlreadyScheduled(t *testing.T) { configClientMock.EXPECT().GetAllowedMimeTypes().Return([]client.MimeType{{Extensions: []string{"html", "php"}}}, nil) configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil) - urlCache := map[string]int64{"https://facebookcorewwi.onion/test.php?id=12": 1} + urlCache := map[string]int64{"3056224523184958": 1} state := State{configClient: configClientMock} if err := state.processURL("https://facebookcorewwi.onion/test.php?id=12", nil, urlCache); !errors.Is(err, errAlreadyScheduled) { t.Fail() @@ -183,7 +185,14 @@ func TestProcessURL(t *testing.T) { t.Fail() } - if val, exist := urlCache[url]; !exist || val != 1 { + // Compute url hash + c := fnv.New64() + if _, err := c.Write([]byte(url)); err != nil { + t.Error(err) + } + urlHash := strconv.FormatUint(c.Sum64(), 10) + + if val, exist := urlCache[urlHash]; !exist || val != 1 { t.Fail() } } @@ -211,9 +220,9 @@ This domain is blacklisted: https://m.fbi.onion/test.php Return(nil) urlCacheMock.EXPECT(). - GetManyInt64([]string{"https://facebook.onion/test.php?id=1", "https://google.onion", "https://example.onion/test.png", "https://m.fbi.onion/test.php"}). + GetManyInt64([]string{"15038381360563270096", "17173291053643777680", "14332094874591870497", "5985629257333875968"}). Return(map[string]int64{ - "https://google.onion": 1, + "17173291053643777680": 1, }, nil) configClientMock.EXPECT().GetAllowedMimeTypes(). @@ -231,8 +240,8 @@ This domain is blacklisted: https://m.fbi.onion/test.php }) urlCacheMock.EXPECT().SetManyInt64(map[string]int64{ - "https://google.onion": 1, - "https://facebook.onion/test.php?id=1": 1, + "17173291053643777680": 1, + "15038381360563270096": 1, }, cache.NoTTL).Return(nil) s := State{urlCache: urlCacheMock, configClient: configClientMock}