Merge pull request #131 from creekorful/130-optimize-redis-memory

scheduler: hash url before caching it
pull/132/head^2
Aloïs Micard 3 years ago committed by GitHub
commit b84e6d28ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -11,9 +11,11 @@ import (
"github.com/creekorful/trandoshan/internal/process" "github.com/creekorful/trandoshan/internal/process"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"github.com/urfave/cli/v2" "github.com/urfave/cli/v2"
"hash/fnv"
"mvdan.cc/xurls/v2" "mvdan.cc/xurls/v2"
"net/http" "net/http"
"net/url" "net/url"
"strconv"
"strings" "strings"
) )
@ -89,8 +91,20 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
return fmt.Errorf("error while extracting URLs") return fmt.Errorf("error while extracting URLs")
} }
// We use URL hashes instead of raw URLs as cache keys to reduce memory consumption.
// See: https://github.com/creekorful/trandoshan/issues/130
var urlHashes []string
for _, u := range urls {
c := fnv.New64()
if _, err := c.Write([]byte(u)); err != nil {
return fmt.Errorf("error while computing url hash: %s", err)
}
urlHashes = append(urlHashes, strconv.FormatUint(c.Sum64(), 10))
}
// Load values in batch // Load values in batch
urlCache, err := state.urlCache.GetManyInt64(urls) urlCache, err := state.urlCache.GetManyInt64(urlHashes)
if err != nil { if err != nil {
return err return err
} }
@ -175,14 +189,21 @@ func (state *State) processURL(rawURL string, pub event.Publisher, urlCache map[
return fmt.Errorf("%s %w", u, errHostnameNotAllowed) return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
} }
// Compute url hash
c := fnv.New64()
if _, err := c.Write([]byte(rawURL)); err != nil {
return fmt.Errorf("error while computing url hash: %s", err)
}
urlHash := strconv.FormatUint(c.Sum64(), 10)
// Check if URL should be scheduled // Check if URL should be scheduled
if urlCache[rawURL] > 0 { if urlCache[urlHash] > 0 {
return fmt.Errorf("%s %w", u, errAlreadyScheduled) return fmt.Errorf("%s %w", u, errAlreadyScheduled)
} }
log.Debug().Stringer("url", u).Msg("URL should be scheduled") log.Debug().Stringer("url", u).Msg("URL should be scheduled")
urlCache[rawURL]++ urlCache[urlHash]++
if err := pub.PublishEvent(&event.NewURLEvent{URL: rawURL}); err != nil { if err := pub.PublishEvent(&event.NewURLEvent{URL: rawURL}); err != nil {
return fmt.Errorf("error while publishing URL: %s", err) return fmt.Errorf("error while publishing URL: %s", err)

@ -12,6 +12,8 @@ import (
"github.com/creekorful/trandoshan/internal/process_mock" "github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test" "github.com/creekorful/trandoshan/internal/test"
"github.com/golang/mock/gomock" "github.com/golang/mock/gomock"
"hash/fnv"
"strconv"
"testing" "testing"
) )
@ -153,7 +155,7 @@ func TestProcessURL_AlreadyScheduled(t *testing.T) {
configClientMock.EXPECT().GetAllowedMimeTypes().Return([]client.MimeType{{Extensions: []string{"html", "php"}}}, nil) configClientMock.EXPECT().GetAllowedMimeTypes().Return([]client.MimeType{{Extensions: []string{"html", "php"}}}, nil)
configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil) configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil)
urlCache := map[string]int64{"https://facebookcorewwi.onion/test.php?id=12": 1} urlCache := map[string]int64{"3056224523184958": 1}
state := State{configClient: configClientMock} state := State{configClient: configClientMock}
if err := state.processURL("https://facebookcorewwi.onion/test.php?id=12", nil, urlCache); !errors.Is(err, errAlreadyScheduled) { if err := state.processURL("https://facebookcorewwi.onion/test.php?id=12", nil, urlCache); !errors.Is(err, errAlreadyScheduled) {
t.Fail() t.Fail()
@ -183,7 +185,14 @@ func TestProcessURL(t *testing.T) {
t.Fail() t.Fail()
} }
if val, exist := urlCache[url]; !exist || val != 1 { // Compute url hash
c := fnv.New64()
if _, err := c.Write([]byte(url)); err != nil {
t.Error(err)
}
urlHash := strconv.FormatUint(c.Sum64(), 10)
if val, exist := urlCache[urlHash]; !exist || val != 1 {
t.Fail() t.Fail()
} }
} }
@ -211,9 +220,9 @@ This domain is blacklisted: https://m.fbi.onion/test.php
Return(nil) Return(nil)
urlCacheMock.EXPECT(). urlCacheMock.EXPECT().
GetManyInt64([]string{"https://facebook.onion/test.php?id=1", "https://google.onion", "https://example.onion/test.png", "https://m.fbi.onion/test.php"}). GetManyInt64([]string{"15038381360563270096", "17173291053643777680", "14332094874591870497", "5985629257333875968"}).
Return(map[string]int64{ Return(map[string]int64{
"https://google.onion": 1, "17173291053643777680": 1,
}, nil) }, nil)
configClientMock.EXPECT().GetAllowedMimeTypes(). configClientMock.EXPECT().GetAllowedMimeTypes().
@ -231,8 +240,8 @@ This domain is blacklisted: https://m.fbi.onion/test.php
}) })
urlCacheMock.EXPECT().SetManyInt64(map[string]int64{ urlCacheMock.EXPECT().SetManyInt64(map[string]int64{
"https://google.onion": 1, "17173291053643777680": 1,
"https://facebook.onion/test.php?id=1": 1, "15038381360563270096": 1,
}, cache.NoTTL).Return(nil) }, cache.NoTTL).Return(nil)
s := State{urlCache: urlCacheMock, configClient: configClientMock} s := State{urlCache: urlCacheMock, configClient: configClientMock}

Loading…
Cancel
Save