From 69352f7237750d8cd3f49aecb42e14a8a944eda7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alo=C3=AFs=20Micard?= Date: Thu, 7 Jan 2021 20:58:33 +0100 Subject: [PATCH 1/3] indexer: sort headers to have deterministic output --- internal/indexer/index/local.go | 12 ++++++++++-- internal/indexer/index/local_test.go | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/indexer/index/local.go b/internal/indexer/index/local.go index 9b84f92..8d4c8af 100644 --- a/internal/indexer/index/local.go +++ b/internal/indexer/index/local.go @@ -7,6 +7,7 @@ import ( "net/url" "os" "path/filepath" + "sort" "strconv" "strings" "time" @@ -63,9 +64,16 @@ func formatResource(url string, body string, headers map[string]string) ([]byte, // First URL builder.WriteString(fmt.Sprintf("%s\n\n", url)) + // Sort headers to have deterministic output + var headerNames []string + for headerName := range headers { + headerNames = append(headerNames, headerName) + } + sort.Strings(headerNames) + // Then headers - for key, value := range headers { - builder.WriteString(fmt.Sprintf("%s: %s\n", key, value)) + for _, name := range headerNames { + builder.WriteString(fmt.Sprintf("%s: %s\n", name, headers[name])) } builder.WriteString("\n") diff --git a/internal/indexer/index/local_test.go b/internal/indexer/index/local_test.go index 00bca29..b349e8e 100644 --- a/internal/indexer/index/local_test.go +++ b/internal/indexer/index/local_test.go @@ -134,7 +134,7 @@ func TestFormatResource(t *testing.T) { t.FailNow() } - if string(res) != "https://google.com\n\nServer: Traefik\nContent-Type: text/html\n\nHello, world" { + if string(res) != "https://google.com\n\nContent-Type: text/html\nServer: Traefik\n\nHello, world" { t.Errorf("got %s want %s", string(res), "https://google.com\n\nServer: Traefik\nContent-Type: text/html\n\nHello, world") } } From 871daf1bcc5641b919d1ef3c6be154270d1de760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alo=C3=AFs=20Micard?= Date: Fri, 8 Jan 2021 13:00:39 +0100 Subject: [PATCH 2/3] scheduler: hash url before caching it --- internal/scheduler/scheduler.go | 27 ++++++++++++++++++++++++--- internal/scheduler/scheduler_test.go | 21 +++++++++++++++------ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 9c99ff1..b81e64b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -11,9 +11,11 @@ import ( "github.com/creekorful/trandoshan/internal/process" "github.com/rs/zerolog/log" "github.com/urfave/cli/v2" + "hash/fnv" "mvdan.cc/xurls/v2" "net/http" "net/url" + "strconv" "strings" ) @@ -89,8 +91,20 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even return fmt.Errorf("error while extracting URLs") } + // We are working using URL hash to reduce memory consumption. + // See: https://github.com/creekorful/trandoshan/issues/130 + var urlHashes []string + for _, u := range urls { + c := fnv.New64() + if _, err := c.Write([]byte(u)); err != nil { + return fmt.Errorf("error while computing url hash: %s", err) + } + + urlHashes = append(urlHashes, strconv.FormatUint(c.Sum64(), 10)) + } + // Load values in batch - urlCache, err := state.urlCache.GetManyInt64(urls) + urlCache, err := state.urlCache.GetManyInt64(urlHashes) if err != nil { return err } @@ -175,14 +189,21 @@ func (state *State) processURL(rawURL string, pub event.Publisher, urlCache map[ return fmt.Errorf("%s %w", u, errHostnameNotAllowed) } + // Compute url hash + c := fnv.New64() + if _, err := c.Write([]byte(rawURL)); err != nil { + return fmt.Errorf("error while computing url hash: %s", err) + } + urlHash := strconv.FormatUint(c.Sum64(), 10) + // Check if URL should be scheduled - if urlCache[rawURL] > 0 { + if urlCache[urlHash] > 0 { return fmt.Errorf("%s %w", u, errAlreadyScheduled) } log.Debug().Stringer("url", u).Msg("URL should be scheduled") - urlCache[rawURL]++ + urlCache[urlHash]++ if err := pub.PublishEvent(&event.NewURLEvent{URL: rawURL}); err != nil { return fmt.Errorf("error while publishing URL: %s", err) diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index dbbff27..7e7807b 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -12,6 +12,8 @@ import ( "github.com/creekorful/trandoshan/internal/process_mock" "github.com/creekorful/trandoshan/internal/test" "github.com/golang/mock/gomock" + "hash/fnv" + "strconv" "testing" ) @@ -153,7 +155,7 @@ func TestProcessURL_AlreadyScheduled(t *testing.T) { configClientMock.EXPECT().GetAllowedMimeTypes().Return([]client.MimeType{{Extensions: []string{"html", "php"}}}, nil) configClientMock.EXPECT().GetForbiddenHostnames().Return([]client.ForbiddenHostname{}, nil) - urlCache := map[string]int64{"https://facebookcorewwi.onion/test.php?id=12": 1} + urlCache := map[string]int64{"3056224523184958": 1} state := State{configClient: configClientMock} if err := state.processURL("https://facebookcorewwi.onion/test.php?id=12", nil, urlCache); !errors.Is(err, errAlreadyScheduled) { t.Fail() @@ -183,7 +185,14 @@ func TestProcessURL(t *testing.T) { t.Fail() } - if val, exist := urlCache[url]; !exist || val != 1 { + // Compute url hash + c := fnv.New64() + if _, err := c.Write([]byte(url)); err != nil { + t.Error(err) + } + urlHash := strconv.FormatUint(c.Sum64(), 10) + + if val, exist := urlCache[urlHash]; !exist || val != 1 { t.Fail() } } @@ -211,9 +220,9 @@ This domain is blacklisted: https://m.fbi.onion/test.php Return(nil) urlCacheMock.EXPECT(). - GetManyInt64([]string{"https://facebook.onion/test.php?id=1", "https://google.onion", "https://example.onion/test.png", "https://m.fbi.onion/test.php"}). + GetManyInt64([]string{"15038381360563270096", "17173291053643777680", "14332094874591870497", "5985629257333875968"}). Return(map[string]int64{ - "https://google.onion": 1, + "17173291053643777680": 1, }, nil) configClientMock.EXPECT().GetAllowedMimeTypes(). @@ -231,8 +240,8 @@ This domain is blacklisted: https://m.fbi.onion/test.php }) urlCacheMock.EXPECT().SetManyInt64(map[string]int64{ - "https://google.onion": 1, - "https://facebook.onion/test.php?id=1": 1, + "17173291053643777680": 1, + "15038381360563270096": 1, }, cache.NoTTL).Return(nil) s := State{urlCache: urlCacheMock, configClient: configClientMock} From 6c678478a19ff49abdcaec285eacd677bc3d1185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alo=C3=AFs=20Micard?= Date: Fri, 8 Jan 2021 20:28:35 +0100 Subject: [PATCH 3/3] Implement better blacklist config --- deployments/docker/docker-compose.yml | 4 +-- internal/blacklister/blacklister.go | 26 +++++++++++---- internal/blacklister/blacklister_test.go | 22 +++++++++---- internal/cache/cache.go | 2 ++ internal/cache/redis.go | 4 +++ internal/configapi/client/client.go | 40 +++++++++++++----------- internal/scheduler/scheduler.go | 7 +---- internal/scheduler/scheduler_test.go | 2 +- 8 files changed, 68 insertions(+), 39 deletions(-) diff --git a/deployments/docker/docker-compose.yml b/deployments/docker/docker-compose.yml index 88d9029..3ceb156 100644 --- a/deployments/docker/docker-compose.yml +++ b/deployments/docker/docker-compose.yml @@ -89,8 +89,8 @@ services: --redis redis:6379 --default-value forbidden-hostnames="[]" --default-value allowed-mime-types="[{\"content-type\":\"text/\",\"extensions\":[\"html\",\"php\",\"aspx\", \"htm\"]}]" - --default-value refresh-delay="{\"delay\": -1}" - --default-value blacklist-threshold="{\"threshold\": 5}" + --default-value refresh-delay="{\"delay\": 0}" + --default-value blacklist-config="{\"threshold\": 5, \"ttl\": 1200}" restart: always depends_on: - rabbitmq diff --git a/internal/blacklister/blacklister.go b/internal/blacklister/blacklister.go index a69140b..c9147e6 100644 --- a/internal/blacklister/blacklister.go +++ b/internal/blacklister/blacklister.go @@ -45,7 +45,7 @@ func (state *State) Initialize(provider process.Provider) error { } state.hostnameCache = hostnameCache - configClient, err := provider.ConfigClient([]string{configapi.ForbiddenHostnamesKey, configapi.BlackListThresholdKey}) + configClient, err := provider.ConfigClient([]string{configapi.ForbiddenHostnamesKey, configapi.BlackListConfigKey}) if err != nil { return err } @@ -104,7 +104,22 @@ func (state *State) handleTimeoutURLEvent(subscriber event.Subscriber, msg event // Check by ourselves if the hostname doesn't respond _, err = state.httpClient.Get(fmt.Sprintf("%s://%s", u.Scheme, u.Host)) - if err == nil || err != chttp.ErrTimeout { + if err != nil && err != chttp.ErrTimeout { + return err + } + + cacheKey := u.Hostname() + + if err == nil { + log.Debug(). + Str("hostname", u.Hostname()). + Msg("Response received.") + + // Host is not down, remove it from cache + if err := state.hostnameCache.Remove(cacheKey); err != nil { + return err + } + return nil } @@ -112,19 +127,18 @@ func (state *State) handleTimeoutURLEvent(subscriber event.Subscriber, msg event Str("hostname", u.Hostname()). Msg("Timeout confirmed") - threshold, err := state.configClient.GetBlackListThreshold() + blackListConfig, err := state.configClient.GetBlackListConfig() if err != nil { return err } - cacheKey := u.Hostname() count, err := state.hostnameCache.GetInt64(cacheKey) if err != nil { return err } count++ - if count >= threshold.Threshold { + if count >= blackListConfig.Threshold { forbiddenHostnames, err := state.configClient.GetForbiddenHostnames() if err != nil { return err @@ -155,7 +169,7 @@ func (state *State) handleTimeoutURLEvent(subscriber event.Subscriber, msg event } // Update count - if err := state.hostnameCache.SetInt64(cacheKey, count, cache.NoTTL); err != nil { + if err := state.hostnameCache.SetInt64(cacheKey, count, blackListConfig.TTL); err != nil { return err } diff --git a/internal/blacklister/blacklister_test.go b/internal/blacklister/blacklister_test.go index a2e26df..64f13f1 100644 --- a/internal/blacklister/blacklister_test.go +++ b/internal/blacklister/blacklister_test.go @@ -2,7 +2,6 @@ package blacklister import ( "errors" - "github.com/creekorful/trandoshan/internal/cache" "github.com/creekorful/trandoshan/internal/cache_mock" configapi "github.com/creekorful/trandoshan/internal/configapi/client" "github.com/creekorful/trandoshan/internal/configapi/client_mock" @@ -15,6 +14,7 @@ import ( "github.com/creekorful/trandoshan/internal/test" "github.com/golang/mock/gomock" "testing" + "time" ) func TestState_Name(t *testing.T) { @@ -37,7 +37,7 @@ func TestState_CustomFlags(t *testing.T) { func TestState_Initialize(t *testing.T) { test.CheckInitialize(t, &State{}, func(p *process_mock.MockProviderMockRecorder) { p.Cache("down-hostname") - p.ConfigClient([]string{configapi.ForbiddenHostnamesKey, configapi.BlackListThresholdKey}) + p.ConfigClient([]string{configapi.ForbiddenHostnamesKey, configapi.BlackListConfigKey}) p.HTTPClient() }) } @@ -69,6 +69,8 @@ func TestHandleTimeoutURLEventNoTimeout(t *testing.T) { httpClientMock.EXPECT().Get("https://down-example.onion:8080").Return(httpResponseMock, nil) configClientMock.EXPECT().GetForbiddenHostnames().Return([]configapi.ForbiddenHostname{}, nil) + hostnameCacheMock.EXPECT().Remove("down-example.onion") + s := State{configClient: configClientMock, hostnameCache: hostnameCacheMock, httpClient: httpClientMock} if err := s.handleTimeoutURLEvent(subscriberMock, msg); err != nil { t.Fail() @@ -94,10 +96,13 @@ func TestHandleTimeoutURLEventNoDispatch(t *testing.T) { httpClientMock.EXPECT().Get("https://down-example.onion").Return(httpResponseMock, http.ErrTimeout) configClientMock.EXPECT().GetForbiddenHostnames().Return([]configapi.ForbiddenHostname{}, nil) - configClientMock.EXPECT().GetBlackListThreshold().Return(configapi.BlackListThreshold{Threshold: 10}, nil) + configClientMock.EXPECT().GetBlackListConfig().Return(configapi.BlackListConfig{ + Threshold: 10, + TTL: 5, + }, nil) hostnameCacheMock.EXPECT().GetInt64("down-example.onion").Return(int64(0), nil) - hostnameCacheMock.EXPECT().SetInt64("down-example.onion", int64(1), cache.NoTTL).Return(nil) + hostnameCacheMock.EXPECT().SetInt64("down-example.onion", int64(1), time.Duration(5)).Return(nil) s := State{configClient: configClientMock, hostnameCache: hostnameCacheMock, httpClient: httpClientMock} if err := s.handleTimeoutURLEvent(subscriberMock, msg); err != nil { @@ -124,7 +129,10 @@ func TestHandleTimeoutURLEvent(t *testing.T) { httpClientMock.EXPECT().Get("https://down-example.onion").Return(httpResponseMock, http.ErrTimeout) configClientMock.EXPECT().GetForbiddenHostnames().Return([]configapi.ForbiddenHostname{}, nil) - configClientMock.EXPECT().GetBlackListThreshold().Return(configapi.BlackListThreshold{Threshold: 10}, nil) + configClientMock.EXPECT().GetBlackListConfig().Return(configapi.BlackListConfig{ + Threshold: 10, + TTL: 5, + }, nil) hostnameCacheMock.EXPECT().GetInt64("down-example.onion").Return(int64(9), nil) @@ -138,7 +146,9 @@ func TestHandleTimeoutURLEvent(t *testing.T) { }). Return(nil) - hostnameCacheMock.EXPECT().SetInt64("down-example.onion", int64(10), cache.NoTTL).Return(nil) + hostnameCacheMock.EXPECT(). + SetInt64("down-example.onion", int64(10), time.Duration(5)). + Return(nil) s := State{configClient: configClientMock, hostnameCache: hostnameCacheMock, httpClient: httpClientMock} if err := s.handleTimeoutURLEvent(subscriberMock, msg); err != nil { diff --git a/internal/cache/cache.go b/internal/cache/cache.go index a2e8285..0a550fb 100644 --- a/internal/cache/cache.go +++ b/internal/cache/cache.go @@ -21,4 +21,6 @@ type Cache interface { GetManyInt64(keys []string) (map[string]int64, error) SetManyInt64(values map[string]int64, TTL time.Duration) error + + Remove(key string) error } diff --git a/internal/cache/redis.go b/internal/cache/redis.go index 41d58cb..1b78608 100644 --- a/internal/cache/redis.go +++ b/internal/cache/redis.go @@ -92,6 +92,10 @@ func (rc *redisCache) SetManyInt64(values map[string]int64, TTL time.Duration) e return err } +func (rc *redisCache) Remove(key string) error { + return rc.client.Del(context.Background(), rc.getKey(key)).Err() +} + func (rc *redisCache) getKey(key string) string { if rc.keyPrefix == "" { return key diff --git a/internal/configapi/client/client.go b/internal/configapi/client/client.go index 268dc9a..77106a0 100644 --- a/internal/configapi/client/client.go +++ b/internal/configapi/client/client.go @@ -21,8 +21,8 @@ const ( ForbiddenHostnamesKey = "forbidden-hostnames" // RefreshDelayKey is the key to access the refresh delay config RefreshDelayKey = "refresh-delay" - // BlackListThresholdKey is the key to access the blacklist threshold config - BlackListThresholdKey = "blacklist-threshold" + // BlackListConfigKey is the key to access the blacklist configuration + BlackListConfigKey = "blacklist-config" ) // MimeType is the mime type as represented in the config @@ -43,9 +43,10 @@ type RefreshDelay struct { Delay time.Duration `json:"delay"` } -// BlackListThreshold is the threshold to reach before blacklisting domain -type BlackListThreshold struct { - Threshold int64 `json:"threshold"` +// BlackListConfig is the config used for hostname blacklisting +type BlackListConfig struct { + Threshold int64 `json:"threshold"` + TTL time.Duration `json:"ttl"` } // Client is a nice client interface for the ConfigAPI @@ -53,7 +54,7 @@ type Client interface { GetAllowedMimeTypes() ([]MimeType, error) GetForbiddenHostnames() ([]ForbiddenHostname, error) GetRefreshDelay() (RefreshDelay, error) - GetBlackListThreshold() (BlackListThreshold, error) + GetBlackListConfig() (BlackListConfig, error) Set(key string, value interface{}) error } @@ -68,7 +69,7 @@ type client struct { allowedMimeTypes []MimeType forbiddenHostnames []ForbiddenHostname refreshDelay RefreshDelay - blackListThreshold BlackListThreshold + blackListConfig BlackListConfig } // NewConfigClient create a new client for the ConfigAPI. @@ -150,18 +151,21 @@ func (c *client) setRefreshDelay(value RefreshDelay) error { return nil } -func (c *client) GetBlackListThreshold() (BlackListThreshold, error) { - c.mutexes[BlackListThresholdKey].RLock() - defer c.mutexes[BlackListThresholdKey].RUnlock() +func (c *client) GetBlackListConfig() (BlackListConfig, error) { + c.mutexes[BlackListConfigKey].RLock() + defer c.mutexes[BlackListConfigKey].RUnlock() - return c.blackListThreshold, nil + return c.blackListConfig, nil } -func (c *client) setBlackListThreshold(value BlackListThreshold) error { - c.mutexes[BlackListThresholdKey].Lock() - defer c.mutexes[BlackListThresholdKey].Unlock() +func (c *client) setBlackListConfig(value BlackListConfig) error { + c.mutexes[BlackListConfigKey].Lock() + defer c.mutexes[BlackListConfigKey].Unlock() - c.blackListThreshold = value + c.blackListConfig = BlackListConfig{ + Threshold: value.Threshold, + TTL: value.TTL * time.Second, // TTL is in seconds + } return nil } @@ -232,12 +236,12 @@ func (c *client) setValue(key string, value []byte) error { return err } break - case BlackListThresholdKey: - var val BlackListThreshold + case BlackListConfigKey: + var val BlackListConfig if err := json.Unmarshal(value, &val); err != nil { return err } - if err := c.setBlackListThreshold(val); err != nil { + if err := c.setBlackListConfig(val); err != nil { return err } break diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index b81e64b..10efc1b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -121,13 +121,8 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even return err } - ttl := delay.Delay - if ttl == -1 { - ttl = cache.NoTTL - } - // Update values in batch - if err := state.urlCache.SetManyInt64(urlCache, ttl); err != nil { + if err := state.urlCache.SetManyInt64(urlCache, delay.Delay); err != nil { return err } diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index 7e7807b..b4e54eb 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -233,7 +233,7 @@ This domain is blacklisted: https://m.fbi.onion/test.php Return([]client.ForbiddenHostname{ {Hostname: "fbi.onion"}, }, nil) - configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: -1}, nil) + configClientMock.EXPECT().GetRefreshDelay().Return(client.RefreshDelay{Delay: 0}, nil) subscriberMock.EXPECT().PublishEvent(&event.NewURLEvent{ URL: "https://facebook.onion/test.php?id=1",