Merge pull request #86 from creekorful/develop

Release 0.7.0
pull/134/head v0.7.0
Aloïs Micard 4 years ago committed by GitHub
commit 7cbfbb7794
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -7,11 +7,11 @@ Git repository to ease maintenance.
## Why a rewrite?
The first version of Trandoshan [(available here)](https://github.com/trandoshan-io) is working great but
not really professional, the code start to be a mess, hard to manage since split in multiple repositories, etc.
The first version of Trandoshan [(available here)](https://github.com/trandoshan-io) works great but is not really
professional: the code is starting to become a mess and is hard to manage since it is split across multiple repositories, etc.
I have therefore decided to create & maintain the project in this specific repository,
where all components code will be available (as a Go module).
I have therefore decided to create & maintain the project in this specific repository, where the code of all components
will be available (as a Go module).
# How to start the crawler
@ -35,23 +35,49 @@ Since the API is exposed on localhost:15005, one can use it to start crawling:
using trandoshanctl executable:
```sh
$ trandoshanctl schedule https://www.facebookcorewwwi.onion
$ trandoshanctl --api-token <token> schedule https://www.facebookcorewwwi.onion
```
or using the docker image:
```sh
$ docker run creekorful/trandoshanctl --api-uri <uri> schedule https://www.facebookcorewwwi.onion
$ docker run creekorful/trandoshanctl --api-token <token> --api-uri <uri> schedule https://www.facebookcorewwwi.onion
```
(you'll need to specify the api uri if you use the docker container)
(you'll need to specify the api uri if you use the docker container)
This will schedule the given URL for crawling.
## Example token
Here's a working API token that you can use with trandoshanctl if you haven't changed the API signing key:
```
eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6InRyYW5kb3NoYW5jdGwiLCJyaWdodHMiOnsiUE9TVCI6WyIvdjEvdXJscyJdLCJHRVQiOlsiL3YxL3Jlc291cmNlcyJdfX0.jGA8WODYKtKy7ZijngoV8C3iWi1eTvMitA8Z1Is2GUg
```
This token is the representation of the following payload:
```
{
"username": "trandoshanctl",
"rights": {
"POST": [
"/v1/urls"
],
"GET": [
"/v1/resources"
]
}
}
```
You may create your own tokens with the rights needed. In the future, a CLI tool will make token generation easy.
## How to speed up crawling
If one want to speed up the crawling, he can scale the instance of crawling component in order
to increase performances. This may be done by issuing the following command after the crawler is started:
If one wants to speed up crawling, one can scale the number of crawler component instances in order to increase performance.
This may be done by issuing the following command after the crawler is started:
```sh
$ ./scripts/scale.sh crawler=5
@ -69,20 +95,20 @@ $ trandoshanctl search <term>
## Using kibana
You can use the Kibana dashboard available at http://localhost:15004.
You will need to create an index pattern named 'resources', and when it asks for the time field, choose 'time'.
You can use the Kibana dashboard available at http://localhost:15004. You will need to create an index pattern named
'resources', and when it asks for the time field, choose 'time'.
# How to hack the crawler
If you've made a change to one of the crawler component and wish to use the updated version when
running start.sh you just need to issue the following command:
If you've made a change to one of the crawler components and wish to use the updated version when running start.sh, you
just need to issue the following command:
```sh
$ ./scripts/build.sh
```
this will rebuild all crawler images using local changes.
After that just run start.sh again to have the updated version running.
This will rebuild all crawler images using local changes. After that, just run start.sh again to have the updated version
running.
# Architecture

@ -8,7 +8,7 @@ import (
"time"
)
//go:generate mockgen -destination=../api_mock/api_mock.go -package=api_mock . Client
//go:generate mockgen -destination=../api_mock/api_mock.go -package=api_mock . API
const (
// PaginationPageHeader is the header to determinate current page in paginated endpoint
@ -31,6 +31,7 @@ type ResourceDto struct {
Title string `json:"title"`
Meta map[string]string `json:"meta"`
Description string `json:"description"`
Headers map[string]string `json:"headers"`
}
// CredentialsDto represent the credential when logging in the API
@ -39,10 +40,22 @@ type CredentialsDto struct {
Password string `json:"password"`
}
// Client is the interface to interact with the API component
type Client interface {
SearchResources(url, keyword string, startDate, endDate time.Time,
paginationPage, paginationSize int) ([]ResourceDto, int64, error)
// ResSearchParams is the set of search params used when searching for resources.
// The client's SearchResources leaves out of the request every field that
// still holds its zero value (empty string, zero time, 0).
type ResSearchParams struct {
// URL filters on the resource URL. The client sends it base64 (URL-safe) encoded
// in the "url" query parameter.
URL string
// Keyword is sent by the client as the "keyword" query parameter.
Keyword string
// StartDate is sent by the client as "start-date", formatted as RFC 3339.
StartDate time.Time
// EndDate is sent by the client as "end-date", formatted as RFC 3339.
EndDate time.Time
// NOTE(review): presumably controls whether the resource body is part of the
// result; its use is not visible in this chunk — confirm.
WithBody bool
// PageSize is sent by the client in the PaginationSizeHeader header.
PageSize int
// PageNumber is sent by the client in the PaginationPageHeader header.
PageNumber int
// TODO allow searching by meta
// TODO allow searching by headers
}
// API is the interface to interact with the API component
type API interface {
// SearchResources searches the resources matching the given params.
// NOTE(review): the int64 result is presumably the total number of matching
// resources (beyond the returned page) — confirm against the implementation.
SearchResources(params *ResSearchParams) ([]ResourceDto, int64, error)
// AddResource submits the given resource and returns a ResourceDto.
AddResource(res ResourceDto) (ResourceDto, error)
// ScheduleURL submits the given URL for scheduling.
ScheduleURL(url string) error
}
@ -52,34 +65,33 @@ type client struct {
baseURL string
}
func (c *client) SearchResources(url, keyword string,
startDate, endDate time.Time, paginationPage, paginationSize int) ([]ResourceDto, int64, error) {
func (c *client) SearchResources(params *ResSearchParams) ([]ResourceDto, int64, error) {
targetEndpoint := fmt.Sprintf("%s/v1/resources?", c.baseURL)
req := c.httpClient.R()
if url != "" {
b64URL := base64.URLEncoding.EncodeToString([]byte(url))
if params.URL != "" {
b64URL := base64.URLEncoding.EncodeToString([]byte(params.URL))
req.SetQueryParam("url", b64URL)
}
if keyword != "" {
req.SetQueryParam("keyword", keyword)
if params.Keyword != "" {
req.SetQueryParam("keyword", params.Keyword)
}
if !startDate.IsZero() {
req.SetQueryParam("start-date", startDate.Format(time.RFC3339))
if !params.StartDate.IsZero() {
req.SetQueryParam("start-date", params.StartDate.Format(time.RFC3339))
}
if !endDate.IsZero() {
req.SetQueryParam("end-date", endDate.Format(time.RFC3339))
if !params.EndDate.IsZero() {
req.SetQueryParam("end-date", params.EndDate.Format(time.RFC3339))
}
if paginationPage != 0 {
req.Header.Set(PaginationPageHeader, strconv.Itoa(paginationPage))
if params.PageNumber != 0 {
req.Header.Set(PaginationPageHeader, strconv.Itoa(params.PageNumber))
}
if paginationSize != 0 {
req.Header.Set(PaginationSizeHeader, strconv.Itoa(paginationSize))
if params.PageSize != 0 {
req.Header.Set(PaginationSizeHeader, strconv.Itoa(params.PageSize))
}
var resources []ResourceDto
@ -123,7 +135,7 @@ func (c *client) ScheduleURL(url string) error {
}
// NewClient create a new API client using given details
func NewClient(baseURL, token string) Client {
func NewClient(baseURL, token string) API {
httpClient := resty.New()
httpClient.SetAuthScheme("Bearer")
httpClient.SetAuthToken(token)

@ -0,0 +1,24 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-archiver
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-archiver /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-archiver"]

@ -0,0 +1,13 @@
package main
import (
"github.com/creekorful/trandoshan/internal/archiver"
"os"
)
// main runs the archiver CLI application with the process arguments and
// exits with status code 1 when the application returns an error.
func main() {
	err := archiver.GetApp().Run(os.Args)
	if err != nil {
		os.Exit(1)
	}
}

@ -1,15 +1,19 @@
version: '3'
services:
nats:
image: nats:2.1.9-alpine3.12
rabbitmq:
image: rabbitmq:3.8.9-management-alpine
ports:
- 15003:15672
volumes:
- rabbitdata:/var/lib/rabbitmq
torproxy:
image: dperson/torproxy:latest
elasticsearch:
image: elasticsearch:7.10.1
environment:
- discovery.type=single-node
- ES_JAVA_OPTS=-Xms2g -Xmx2g
- ES_JAVA_OPTS=-Xms2g -Xmx4g
volumes:
- esdata:/usr/share/elasticsearch/data
kibana:
@ -22,17 +26,17 @@ services:
image: creekorful/tdsh-crawler:latest
command: >
--log-level debug
--nats-uri nats
--hub-uri amqp://guest:guest@rabbitmq:5672
--tor-uri torproxy:9050
restart: always
depends_on:
- nats
- rabbitmq
- torproxy
scheduler:
image: creekorful/tdsh-scheduler:latest
command: >
--log-level debug
--nats-uri nats
--hub-uri amqp://guest:guest@rabbitmq:5672
--api-uri http://api:8080
--api-token eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6InNjaGVkdWxlciIsInJpZ2h0cyI6eyJHRVQiOlsiL3YxL3Jlc291cmNlcyJdfX0.dBR6KLQp2h2srY-By3zikEznhQplLCtDrvOkcXP6USY
--forbidden-extensions png
@ -40,26 +44,40 @@ services:
--forbidden-extensions jpg
--forbidden-extensions jpeg
--forbidden-extensions bmp
--forbidden-extensions css
--forbidden-extensions js
--forbidden-hostnames facebookcorewwwi.onion
restart: always
depends_on:
- nats
- rabbitmq
- api
extractor:
image: creekorful/tdsh-extractor:latest
command: >
--log-level debug
--nats-uri nats
--hub-uri amqp://guest:guest@rabbitmq:5672
--api-uri http://api:8080
--api-token eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6ImV4dHJhY3RvciIsInJpZ2h0cyI6eyJQT1NUIjpbIi92MS9yZXNvdXJjZXMiXX19.mytGd_9zyK8y_T3fsWAmH8FnaBNr6qWefwCPDOx4in0
restart: always
depends_on:
- nats
- rabbitmq
- api
archiver:
image: creekorful/tdsh-archiver:latest
command: >
--log-level debug
--hub-uri amqp://guest:guest@rabbitmq:5672
--storage-dir /archive
restart: always
volumes:
- archiverdata:/archive
depends_on:
- rabbitmq
api:
image: creekorful/tdsh-api:latest
command: >
--log-level debug
--nats-uri nats
--hub-uri amqp://guest:guest@rabbitmq:5672
--elasticsearch-uri http://elasticsearch:9200
--signing-key K==M5RsU_DQa4_XSbkX?L27s^xWmde25
restart: always
@ -71,3 +89,7 @@ services:
volumes:
esdata:
driver: local
rabbitdata:
driver: local
archiverdata:
driver: local

@ -9,16 +9,14 @@ require (
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/go-resty/resty/v2 v2.3.0
github.com/golang/mock v1.4.4
github.com/golang/protobuf v1.4.2 // indirect
github.com/labstack/echo/v4 v4.1.16
github.com/nats-io/nats-server/v2 v2.1.8 // indirect
github.com/nats-io/nats.go v1.10.0
github.com/olekukonko/tablewriter v0.0.4
github.com/olivere/elastic/v7 v7.0.20
github.com/rs/zerolog v1.20.0
github.com/streadway/amqp v1.0.0
github.com/urfave/cli/v2 v2.2.0
github.com/valyala/fasthttp v1.9.0
github.com/xhit/go-str2duration/v2 v2.0.0
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 // indirect
mvdan.cc/xurls/v2 v2.1.0
)

@ -29,18 +29,9 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc=
github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik=
@ -65,17 +56,6 @@ github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHX
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54=
github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/nats-io/jwt v0.3.2 h1:+RB5hMpXUUA2dfxuhBTEkMOrYmM+gKIZYS1KjSostMI=
github.com/nats-io/jwt v0.3.2/go.mod h1:/euKqTS1ZD+zzjYrY7pseZrTtWQSjujC7xjPc8wL6eU=
github.com/nats-io/nats-server/v2 v2.1.8 h1:d5GoJA6W7vQkmt99Nfdeie3pEFFUEjIwt1YZp50DkIQ=
github.com/nats-io/nats-server/v2 v2.1.8/go.mod h1:rbRrRE/Iv93O/rUvZ9dh4NfT0Cm9HWjW/BqOWLGgYiE=
github.com/nats-io/nats.go v1.10.0 h1:L8qnKaofSfNFbXg0C5F71LdjPRnmQwSsA4ukmkt1TvY=
github.com/nats-io/nats.go v1.10.0/go.mod h1:AjGArbfyR50+afOUotNX2Xs5SYHf+CoOa5HH1eEl2HE=
github.com/nats-io/nkeys v0.1.3/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxziKVo7w=
github.com/nats-io/nkeys v0.1.4 h1:aEsHIssIk6ETN5m2/MD8Y4B2X7FfXrBAUdkyRvbVYzA=
github.com/nats-io/nkeys v0.1.4/go.mod h1:XdZpAbhgyyODYqjTawOnIOI7VlbKSarI9Gfy1tqEu/s=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8=
github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA=
github.com/olivere/elastic/v7 v7.0.20 h1:5FFpGPVJlBSlWBOdict406Y3yNTIpVpAiUvdFZeSbAo=
@ -96,6 +76,8 @@ github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeV
github.com/smartystreets/assertions v1.1.1/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM=
github.com/smartystreets/gunit v1.4.2/go.mod h1:ZjM1ozSIMJlAz/ay4SG8PeKF00ckUp+zMHZXV9/bvak=
github.com/streadway/amqp v1.0.0 h1:kuuDrUJFZL1QYL9hUNuCxNObNzB0bV/ZG5jV3RWAQgo=
github.com/streadway/amqp v1.0.0/go.mod h1:AZpEONHx3DKn8O/DFsRAY58/XVQiIPMTMB1SddzLXVw=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
@ -116,7 +98,6 @@ github.com/xhit/go-str2duration/v2 v2.0.0 h1:uFtk6FWB375bP7ewQl+/1wBcn840GPhnySO
github.com/xhit/go-str2duration/v2 v2.0.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 h1:3zb4D3T4G8jdExgVU/95+vQXfpEPiMdCaZgmGVxjNHM=
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
@ -148,7 +129,6 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae h1:/WDfKMnPU+m5M4xB+6x4kaepxRw6jWvR5iDRdvjHgy8=
@ -175,14 +155,6 @@ google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoA
google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=

@ -2,6 +2,8 @@ package api
import (
"github.com/creekorful/trandoshan/internal/api/auth"
"github.com/creekorful/trandoshan/internal/api/rest"
"github.com/creekorful/trandoshan/internal/api/service"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/labstack/echo/v4"
@ -9,20 +11,15 @@ import (
"github.com/urfave/cli/v2"
)
var (
defaultPaginationSize = 50
maxPaginationSize = 100
)
// GetApp return the api app
func GetApp() *cli.App {
return &cli.App{
Name: "tdsh-api",
Version: "0.6.0",
Version: "0.7.0",
Usage: "Trandoshan API component",
Flags: []cli.Flag{
logging.GetLogFlag(),
util.GetNATSURIFlag(),
util.GetHubURI(),
&cli.StringFlag{
Name: "elasticsearch-uri",
Usage: "URI to the Elasticsearch server",
@ -38,6 +35,10 @@ func GetApp() *cli.App {
Usage: "List of API users. (Format user:password)",
Required: false,
},
&cli.StringFlag{
Name: "refresh-delay",
Usage: "Duration before allowing indexation of existing resource (none = never)",
},
},
Action: execute,
}
@ -55,15 +56,15 @@ func execute(c *cli.Context) error {
log.Info().Str("ver", c.App.Version).
Str("elasticsearch-uri", c.String("elasticsearch-uri")).
Str("nats-uri", c.String("nats-uri")).
Str("hub-uri", c.String("hub-uri")).
Msg("Starting tdsh-api")
signingKey := []byte(c.String("signing-key"))
// Create the service
svc, err := newService(c)
svc, err := service.New(c)
if err != nil {
log.Err(err).Msg("Unable to start API")
log.Err(err).Msg("error while creating API service")
return err
}
@ -72,9 +73,9 @@ func execute(c *cli.Context) error {
e.Use(authMiddleware.Middleware())
// Add endpoints
e.GET("/v1/resources", searchResourcesEndpoint(svc))
e.POST("/v1/resources", addResourceEndpoint(svc))
e.POST("/v1/urls", scheduleURLEndpoint(svc))
e.GET("/v1/resources", rest.SearchResources(svc))
e.POST("/v1/resources", rest.AddResource(svc))
e.POST("/v1/urls", rest.ScheduleURL(svc))
log.Info().Msg("Successfully initialized tdsh-api. Waiting for requests")

@ -3,6 +3,7 @@ package database
import (
"context"
"encoding/json"
"github.com/creekorful/trandoshan/api"
"github.com/olivere/elastic/v7"
"github.com/rs/zerolog/log"
"time"
@ -20,25 +21,14 @@ type ResourceIdx struct {
Title string `json:"title"`
Meta map[string]string `json:"meta"`
Description string `json:"description"`
}
// ResSearchParams is the search params used
type ResSearchParams struct {
URL string
Keyword string
StartDate time.Time
EndDate time.Time
WithBody bool
PageSize int
PageNumber int
// TODO allow searching by meta
Headers map[string]string `json:"headers"`
}
// Database is the interface used to abstract communication
// with the persistence unit
type Database interface {
SearchResources(params *ResSearchParams) ([]ResourceIdx, error)
CountResources(params *ResSearchParams) (int64, error)
SearchResources(params *api.ResSearchParams) ([]ResourceIdx, error)
CountResources(params *api.ResSearchParams) (int64, error)
AddResource(res ResourceIdx) error
}
@ -70,7 +60,7 @@ func NewElasticDB(uri string) (Database, error) {
}, nil
}
func (e *elasticSearchDB) SearchResources(params *ResSearchParams) ([]ResourceIdx, error) {
func (e *elasticSearchDB) SearchResources(params *api.ResSearchParams) ([]ResourceIdx, error) {
q := buildSearchQuery(params)
from := (params.PageNumber - 1) * params.PageSize
@ -103,7 +93,7 @@ func (e *elasticSearchDB) SearchResources(params *ResSearchParams) ([]ResourceId
return resources, nil
}
func (e *elasticSearchDB) CountResources(params *ResSearchParams) (int64, error) {
func (e *elasticSearchDB) CountResources(params *api.ResSearchParams) (int64, error) {
q := buildSearchQuery(params)
count, err := e.client.Count(resourcesIndex).Query(q).Do(context.Background())
@ -122,11 +112,11 @@ func (e *elasticSearchDB) AddResource(res ResourceIdx) error {
return err
}
func buildSearchQuery(params *ResSearchParams) elastic.Query {
func buildSearchQuery(params *api.ResSearchParams) elastic.Query {
var queries []elastic.Query
if params.URL != "" {
log.Trace().Str("url", params.URL).Msg("SearchQuery: Setting url")
queries = append(queries, elastic.NewTermQuery("url", params.URL))
queries = append(queries, elastic.NewTermQuery("url.keyword", params.URL))
}
if params.Keyword != "" {
log.Trace().Str("body", params.Keyword).Msg("SearchQuery: Setting body")

@ -1,9 +1,9 @@
package api
package rest
import (
"encoding/base64"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/api/service"
"github.com/labstack/echo/v4"
"net/http"
"strconv"
@ -11,14 +11,20 @@ import (
"time"
)
func searchResourcesEndpoint(s service) echo.HandlerFunc {
var (
defaultPaginationSize = 50
maxPaginationSize = 100
)
// SearchResources allows to search resources
func SearchResources(s *service.Service) echo.HandlerFunc {
return func(c echo.Context) error {
searchParams, err := newSearchParams(c)
if err != nil {
return err
}
resources, total, err := s.searchResources(searchParams)
resources, total, err := s.SearchResources(searchParams)
if err != nil {
return err
}
@ -29,14 +35,15 @@ func searchResourcesEndpoint(s service) echo.HandlerFunc {
}
}
func addResourceEndpoint(s service) echo.HandlerFunc {
// AddResource persist a new resource
func AddResource(s *service.Service) echo.HandlerFunc {
return func(c echo.Context) error {
var res api.ResourceDto
if err := c.Bind(&res); err != nil {
return err
}
res, err := s.addResource(res)
res, err := s.AddResource(res)
if err != nil {
return err
}
@ -45,14 +52,15 @@ func addResourceEndpoint(s service) echo.HandlerFunc {
}
}
func scheduleURLEndpoint(s service) echo.HandlerFunc {
// ScheduleURL schedule given URL for crawling
func ScheduleURL(s *service.Service) echo.HandlerFunc {
return func(c echo.Context) error {
var url string
if err := c.Bind(&url); err != nil {
return err
}
return s.scheduleURL(url)
return s.ScheduleURL(url)
}
}
@ -73,14 +81,14 @@ func readPagination(c echo.Context) (int, int) {
return paginationPage, paginationSize
}
func writePagination(c echo.Context, s *database.ResSearchParams, totalCount int64) {
func writePagination(c echo.Context, s *api.ResSearchParams, totalCount int64) {
c.Response().Header().Set(api.PaginationPageHeader, strconv.Itoa(s.PageNumber))
c.Response().Header().Set(api.PaginationSizeHeader, strconv.Itoa(s.PageSize))
c.Response().Header().Set(api.PaginationCountHeader, strconv.FormatInt(totalCount, 10))
}
func newSearchParams(c echo.Context) (*database.ResSearchParams, error) {
params := &database.ResSearchParams{}
func newSearchParams(c echo.Context) (*api.ResSearchParams, error) {
params := &api.ResSearchParams{}
params.Keyword = c.QueryParam("keyword")

@ -1,4 +1,4 @@
package api
package rest
import (
"fmt"

@ -1,105 +0,0 @@
package api
import (
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
)
type service interface {
searchResources(params *database.ResSearchParams) ([]api.ResourceDto, int64, error)
addResource(res api.ResourceDto) (api.ResourceDto, error)
scheduleURL(url string) error
close()
}
type svc struct {
db database.Database
pub messaging.Publisher
}
func newService(c *cli.Context) (service, error) {
// Connect to the NATS server
pub, err := messaging.NewPublisher(c.String("nats-uri"))
if err != nil {
log.Err(err).Str("uri", c.String("nats-uri")).Msg("Error while connecting to NATS server")
return nil, err
}
// Create Elasticsearch client
db, err := database.NewElasticDB(c.String("elasticsearch-uri"))
if err != nil {
log.Err(err).Msg("Error while connecting to the database")
return nil, err
}
return &svc{
db: db,
pub: pub,
}, nil
}
func (s *svc) searchResources(params *database.ResSearchParams) ([]api.ResourceDto, int64, error) {
totalCount, err := s.db.CountResources(params)
if err != nil {
log.Err(err).Msg("Error while counting on ES")
return nil, 0, err
}
res, err := s.db.SearchResources(params)
if err != nil {
log.Err(err).Msg("Error while searching on ES")
return nil, 0, err
}
var resources []api.ResourceDto
for _, r := range res {
resources = append(resources, api.ResourceDto{
URL: r.URL,
Body: r.Body,
Title: r.Title,
Time: r.Time,
})
}
return resources, totalCount, nil
}
func (s *svc) addResource(res api.ResourceDto) (api.ResourceDto, error) {
log.Debug().Str("url", res.URL).Msg("Saving resource")
// Create Elasticsearch document
doc := database.ResourceIdx{
URL: res.URL,
Body: res.Body,
Time: res.Time,
Title: res.Title,
Meta: res.Meta,
Description: res.Description,
}
if err := s.db.AddResource(doc); err != nil {
log.Err(err).Msg("Error while adding resource")
return api.ResourceDto{}, err
}
log.Debug().Str("url", res.URL).Msg("Successfully saved resource")
return res, nil
}
func (s *svc) scheduleURL(url string) error {
// Publish the URL
if err := s.pub.PublishMsg(&messaging.URLFoundMsg{URL: url}); err != nil {
log.Err(err).Msg("Unable to publish URL")
return err
}
log.Debug().Str("url", url).Msg("Successfully published URL")
return nil
}
func (s *svc) close() {
s.pub.Close()
}

@ -0,0 +1,138 @@
package service
import (
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/duration"
"github.com/creekorful/trandoshan/internal/event"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"time"
)
// Service represents the functionality the API exposes.
type Service struct {
// db is the persistence layer used to count, search and add resources.
db database.Database
// pub is the event publisher used to publish URLs to crawl.
pub event.Publisher
// refreshDelay is used by AddResource when checking for duplicates: unless it
// is -1, only resources dated up to now minus refreshDelay are counted;
// with -1 no end date bound is applied.
refreshDelay time.Duration
}
// New creates a new Service instance configured from the given CLI context.
//
// It reads the "hub-uri" flag to connect the event publisher, the
// "elasticsearch-uri" flag to create the database client and the
// "refresh-delay" flag to configure duplicate detection in AddResource.
//
// An error is returned (wrapping the underlying cause) when either the
// publisher or the database client cannot be created. In the latter case the
// already opened publisher is closed before returning.
func New(c *cli.Context) (*Service, error) {
	// Connect to the messaging server
	pub, err := event.NewPublisher(c.String("hub-uri"))
	if err != nil {
		return nil, fmt.Errorf("error while connecting to hub server: %w", err)
	}

	// Create Elasticsearch client
	db, err := database.NewElasticDB(c.String("elasticsearch-uri"))
	if err != nil {
		// The publisher was opened successfully: release it so the
		// connection is not leaked when we bail out.
		pub.Close()
		return nil, fmt.Errorf("error while connecting to the database: %w", err)
	}

	refreshDelay := duration.ParseDuration(c.String("refresh-delay"))

	return &Service{
		db:           db,
		pub:          pub,
		refreshDelay: refreshDelay,
	}, nil
}
// SearchResources looks up the resources matching params.
//
// It first asks the database for the number of matching resources, then for
// the matching resources themselves, and returns the latter converted to DTOs
// together with the count. Any database error is logged and returned as is.
func (s *Service) SearchResources(params *api.ResSearchParams) ([]api.ResourceDto, int64, error) {
	total, err := s.db.CountResources(params)
	if err != nil {
		log.Err(err).Msg("Error while counting on ES")
		return nil, 0, err
	}

	hits, err := s.db.SearchResources(params)
	if err != nil {
		log.Err(err).Msg("Error while searching on ES")
		return nil, 0, err
	}

	// NOTE(review): Meta, Description and Headers are not copied into the
	// DTO — confirm this is intentional.
	var dtos []api.ResourceDto
	for i := range hits {
		dto := api.ResourceDto{}
		dto.URL = hits[i].URL
		dto.Body = hits[i].Body
		dto.Title = hits[i].Title
		dto.Time = hits[i].Time
		dtos = append(dtos, dto)
	}

	return dtos, total, nil
}
// AddResource persists the given resource unless it is considered a duplicate.
//
// Before saving, the database is asked how many resources already exist for
// the same URL. When refreshDelay is -1 no date bound is applied to that
// lookup; otherwise only resources dated up to (now - refreshDelay) are
// counted. If at least one is found the resource is skipped: this is not an
// error and the input resource is returned unchanged.
//
// It returns the saved (or skipped) resource, or an empty ResourceDto and the
// underlying error when the duplicate lookup or the save fails.
func (s *Service) AddResource(res api.ResourceDto) (api.ResourceDto, error) {
	log.Debug().Str("url", res.URL).Msg("Saving resource")

	// Hacky stuff to prevent from adding 'duplicate resource'
	// the thing is: even with the scheduler preventing from crawling 'duplicates' URL by adding a refresh period
	// and checking if the resource is not already indexed, this implementation may not work if the URLs was published
	// before the resource is saved. And this happen a LOT of time.
	// therefore the best thing to do is to make the API check if the resource should **really** be added by checking if
	// it isn't present on the database. This may sounds hacky, but it's the best solution i've come up at this time.
	endDate := time.Time{}
	if s.refreshDelay != -1 {
		endDate = time.Now().Add(-s.refreshDelay)
	}

	count, err := s.db.CountResources(&api.ResSearchParams{
		URL:        res.URL,
		EndDate:    endDate,
		PageSize:   1,
		PageNumber: 1,
	})
	if err != nil {
		log.Err(err).Msg("error while searching for resource")
		// Propagate the failure: returning a nil error here would make the
		// caller believe the resource was handled although nothing was saved.
		return api.ResourceDto{}, err
	}

	if count > 0 {
		// Not an error
		log.Debug().Str("url", res.URL).Msg("Skipping duplicate resource")
		return res, nil
	}

	// Create Elasticsearch document
	doc := database.ResourceIdx{
		URL:         res.URL,
		Body:        res.Body,
		Time:        res.Time,
		Title:       res.Title,
		Meta:        res.Meta,
		Description: res.Description,
		Headers:     res.Headers,
	}

	if err := s.db.AddResource(doc); err != nil {
		log.Err(err).Msg("Error while adding resource")
		return api.ResourceDto{}, err
	}

	log.Debug().Str("url", res.URL).Msg("Successfully saved resource")
	return res, nil
}
// ScheduleURL publishes a FoundURLEvent carrying the given URL through the
// service's event publisher. A publishing failure is logged and returned.
func (s *Service) ScheduleURL(url string) error {
	evt := &event.FoundURLEvent{URL: url}

	err := s.pub.Publish(evt)
	if err == nil {
		log.Debug().Str("url", url).Msg("Successfully published URL")
		return nil
	}

	log.Err(err).Msg("Unable to publish URL")
	return err
}
// Close closes the service's event publisher.
func (s *Service) Close() {
s.pub.Close()
}

@ -0,0 +1,202 @@
package service
import (
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/api/database_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
"testing"
"time"
)
// TestSearchResources checks that Service.SearchResources forwards the search
// params to the database for both the count and the search, and returns the
// count along with one DTO per database hit.
func TestSearchResources(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	params := &api.ResSearchParams{Keyword: "example"}

	dbMock := database_mock.NewMockDatabase(mockCtrl)

	dbMock.EXPECT().CountResources(params).Return(int64(150), nil)
	dbMock.EXPECT().SearchResources(params).Return([]database.ResourceIdx{
		{
			URL:   "example-1.onion",
			Body:  "Example 1",
			Title: "Example 1",
			Time:  time.Time{},
		},
		{
			URL:   "example-2.onion",
			Body:  "Example 2",
			Title: "Example 2",
			Time:  time.Time{},
		},
	}, nil)

	s := Service{db: dbMock}

	res, count, err := s.SearchResources(params)
	if err != nil {
		t.Fatalf("SearchResources() returned unexpected error: %v", err)
	}
	if count != 150 {
		t.Errorf("SearchResources() count = %d, want 150", count)
	}
	if len(res) != 2 {
		t.Errorf("SearchResources() returned %d resources, want 2", len(res))
	}
}
// TestAddResource checks that a resource for which the database reports no
// existing match is indexed and returned unchanged.
func TestAddResource(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(&searchParamsMatcher{target: api.ResSearchParams{
		URL:        "https://example.onion",
		PageSize:   1,
		PageNumber: 1,
	}}).Return(int64(0), nil)
	dbMock.EXPECT().AddResource(database.ResourceIdx{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})

	s := Service{db: dbMock, refreshDelay: 5 * time.Hour}

	res, err := s.AddResource(api.ResourceDto{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})
	if err != nil {
		t.Fatalf("AddResource() returned unexpected error: %v", err)
	}

	if res.URL != "https://example.onion" {
		t.Fatalf("URL = %q, want %q", res.URL, "https://example.onion")
	}
	if res.Body != "TheBody" {
		t.Fatalf("Body = %q, want %q", res.Body, "TheBody")
	}
	if res.Title != "Example" {
		t.Fatalf("Title = %q, want %q", res.Title, "Example")
	}
	if !res.Time.IsZero() {
		t.Fatalf("Time = %v, want zero time", res.Time)
	}
	if res.Meta["content"] != "content-meta" {
		t.Fatalf("Meta[content] = %q, want %q", res.Meta["content"], "content-meta")
	}
	if res.Description != "the description" {
		t.Fatalf("Description = %q, want %q", res.Description, "the description")
	}
	if res.Headers["Content-Type"] != "application/html" {
		t.Fatalf("Headers[Content-Type] = %q, want %q", res.Headers["Content-Type"], "application/html")
	}
	if res.Headers["Server"] != "Traefik" {
		t.Fatalf("Headers[Server] = %q, want %q", res.Headers["Server"], "Traefik")
	}
}
// TestAddResourceDuplicateNotAllowed checks that, with a refresh delay of -1
// and an existing match reported by the database, AddResource returns
// without error. No AddResource call is expected on the database mock, so
// the mock controller fails the test if the resource is saved again.
func TestAddResourceDuplicateNotAllowed(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(&searchParamsMatcher{target: api.ResSearchParams{
		URL:        "https://example.onion",
		PageSize:   1,
		PageNumber: 1,
	}, endDateZero: true}).Return(int64(1), nil)

	s := Service{db: dbMock, refreshDelay: -1}

	_, err := s.AddResource(api.ResourceDto{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})
	if err != nil {
		t.Fatalf("AddResource() returned unexpected error for duplicate resource: %v", err)
	}
}
// TestAddResourceTooYoung checks that, when the database reports an existing
// match for the refresh window, AddResource returns without error. No
// AddResource call is expected on the database mock, so the mock controller
// fails the test if the resource is saved again.
func TestAddResourceTooYoung(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(&searchParamsMatcher{target: api.ResSearchParams{
		URL:        "https://example.onion",
		EndDate:    time.Now().Add(-10 * time.Minute),
		PageSize:   1,
		PageNumber: 1,
	}}).Return(int64(1), nil)

	s := Service{db: dbMock, refreshDelay: -10 * time.Minute}

	_, err := s.AddResource(api.ResourceDto{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})
	if err != nil {
		t.Fatalf("AddResource() returned unexpected error for too recent resource: %v", err)
	}
}
// TestScheduleURL checks that ScheduleURL publishes a FoundURLEvent carrying
// the scheduled URL.
func TestScheduleURL(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	pubMock := event_mock.NewMockPublisher(mockCtrl)

	s := Service{pub: pubMock}

	pubMock.EXPECT().Publish(&event.FoundURLEvent{URL: "https://example.onion"})

	if err := s.ScheduleURL("https://example.onion"); err != nil {
		t.Fatalf("ScheduleURL() returned unexpected error: %v", err)
	}
}
// searchParamsMatcher is a custom mock argument matcher for
// *api.ResSearchParams that ignores the exact EndDate value: it compares
// URL, PageSize and PageNumber against target and only checks whether
// EndDate is zero or not.
// TODO: find a cleaner way to compare time-dependent search params.
type searchParamsMatcher struct {
// target holds the expected URL, PageSize and PageNumber
target api.ResSearchParams
// endDateZero tells whether the matched params must have a zero EndDate
endDateZero bool
}
// Matches reports whether x is a *api.ResSearchParams whose URL, PageSize
// and PageNumber equal those of the target, and whose EndDate zero-ness
// equals endDateZero.
//
// Any other argument (wrong type or nil pointer) is reported as a mismatch
// instead of panicking, so an unexpected call produces a regular mock
// failure.
func (sm *searchParamsMatcher) Matches(x interface{}) bool {
	arg, ok := x.(*api.ResSearchParams)
	if !ok || arg == nil {
		return false
	}

	return arg.URL == sm.target.URL &&
		arg.PageSize == sm.target.PageSize &&
		arg.PageNumber == sm.target.PageNumber &&
		sm.endDateZero == arg.EndDate.IsZero()
}
// String returns a short, fixed description of the matcher.
func (sm *searchParamsMatcher) String() string {
return "is valid search params"
}

@ -1,114 +0,0 @@
package api
import (
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/api/database_mock"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/messaging_mock"
"github.com/golang/mock/gomock"
"testing"
"time"
)
// TestSearchResources checks that searchResources forwards the search
// parameters to the database and returns both the matching resources and
// the total count.
func TestSearchResources(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	params := &database.ResSearchParams{Keyword: "example"}

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(params).Return(int64(150), nil)
	dbMock.EXPECT().SearchResources(params).Return([]database.ResourceIdx{
		{
			URL:   "example-1.onion",
			Body:  "Example 1",
			Title: "Example 1",
			Time:  time.Time{},
		},
		{
			URL:   "example-2.onion",
			Body:  "Example 2",
			Title: "Example 2",
			Time:  time.Time{},
		},
	}, nil)

	s := svc{db: dbMock}

	res, count, err := s.searchResources(params)
	if err != nil {
		t.Fatalf("searchResources() returned unexpected error: %v", err)
	}
	if count != 150 {
		t.Errorf("searchResources() count = %d, want 150", count)
	}
	if len(res) != 2 {
		t.Errorf("searchResources() returned %d resources, want 2", len(res))
	}
}
// TestAddResource checks that addResource indexes the given resource and
// returns it unchanged.
func TestAddResource(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().AddResource(database.ResourceIdx{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
	})

	s := svc{db: dbMock}

	res, err := s.addResource(api.ResourceDto{
		URL:         "https://example.onion",
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
	})
	if err != nil {
		t.Fatalf("addResource() returned unexpected error: %v", err)
	}

	if res.URL != "https://example.onion" {
		t.Fatalf("URL = %q, want %q", res.URL, "https://example.onion")
	}
	if res.Body != "TheBody" {
		t.Fatalf("Body = %q, want %q", res.Body, "TheBody")
	}
	if res.Title != "Example" {
		t.Fatalf("Title = %q, want %q", res.Title, "Example")
	}
	if !res.Time.IsZero() {
		t.Fatalf("Time = %v, want zero time", res.Time)
	}
	if res.Meta["content"] != "content-meta" {
		t.Fatalf("Meta[content] = %q, want %q", res.Meta["content"], "content-meta")
	}
	if res.Description != "the description" {
		t.Fatalf("Description = %q, want %q", res.Description, "the description")
	}
}
// TestScheduleURL checks that scheduleURL publishes a URLFoundMsg carrying
// the scheduled URL.
func TestScheduleURL(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	pubMock := messaging_mock.NewMockPublisher(mockCtrl)

	s := svc{pub: pubMock}

	pubMock.EXPECT().PublishMsg(&messaging.URLFoundMsg{URL: "https://example.onion"})

	if err := s.scheduleURL("https://example.onion"); err != nil {
		t.Fatalf("scheduleURL() returned unexpected error: %v", err)
	}
}

@ -0,0 +1,122 @@
package archiver
import (
"fmt"
"github.com/creekorful/trandoshan/internal/archiver/storage"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"io"
"os"
"os/signal"
"strings"
"syscall"
)
// GetApp returns the archiver app: the tdsh-archiver command line
// application with its logging, hub URI and storage directory flags.
func GetApp() *cli.App {
	flags := []cli.Flag{
		logging.GetLogFlag(),
		util.GetHubURI(),
		&cli.StringFlag{
			Name:     "storage-dir",
			Usage:    "Path to the storage directory",
			Required: true,
		},
	}

	app := &cli.App{
		Name:    "tdsh-archiver",
		Version: "0.7.0",
		Usage:   "Trandoshan archiver component",
		Flags:   flags,
		Action:  execute,
	}

	return app
}
// execute is the action of the archiver app: it configures logging, connects
// the event subscriber, creates the local storage, subscribes to new
// resource events and then blocks until SIGINT or SIGTERM is received.
func execute(ctx *cli.Context) error {
	logging.ConfigureLogger(ctx)

	hubURI := ctx.String("hub-uri")
	storageDir := ctx.String("storage-dir")

	log.Info().
		Str("ver", ctx.App.Version).
		Str("hub-uri", hubURI).
		Str("storage-dir", storageDir).
		Msg("Starting tdsh-archiver")

	// Connect the subscriber to the event hub
	sub, err := event.NewSubscriber(hubURI)
	if err != nil {
		return err
	}
	// Safety net for the early returns below (also runs after the explicit
	// Close at the end, where its result is ignored)
	defer sub.Close()

	// Resources are archived on the local file system
	store, err := storage.NewLocalStorage(storageDir)
	if err != nil {
		return err
	}

	handlers := state{storage: store}

	err = sub.SubscribeAsync(event.NewResourceExchange, "archivingQueue", handlers.handleNewResourceEvent)
	if err != nil {
		return err
	}

	log.Info().Msg("Successfully initialized tdsh-archiver. Waiting for resources")

	// Wait for a termination signal before shutting down
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
	<-signals

	return sub.Close()
}
// state holds the dependencies used by the archiver event handlers.
type state struct {
// storage is where formatted resources are stored
storage storage.Storage
}
// handleNewResourceEvent decodes a NewResourceEvent from body, formats the
// resource (headers followed by body) and stores it under the event URL and
// time.
//
// Decoding errors are returned as-is; formatting and storage errors are
// wrapped with %w so callers can still inspect the underlying cause.
func (s *state) handleNewResourceEvent(subscriber event.Subscriber, body io.Reader) error {
	var evt event.NewResourceEvent
	if err := subscriber.Read(body, &evt); err != nil {
		return err
	}

	log.Debug().Str("url", evt.URL).Msg("Processing new resource")

	res, err := formatResource(&evt)
	if err != nil {
		return fmt.Errorf("error while formatting resource: %w", err)
	}

	if err := s.storage.Store(evt.URL, evt.Time, res); err != nil {
		return fmt.Errorf("error while storing resource: %w", err)
	}

	return nil
}
// formatResource serializes the resource carried by evt: one
// "Key: Value\r\n" line per header, an empty "\r\n" separator line, then the
// raw body. Headers are written in map iteration order, which Go does not
// specify. The returned error is always nil.
func formatResource(evt *event.NewResourceEvent) ([]byte, error) {
	var out strings.Builder

	// Header section
	for name, val := range evt.Headers {
		fmt.Fprintf(&out, "%s: %s\r\n", name, val)
	}

	// Blank line separating headers from body, followed by the body itself
	out.WriteString("\r\n")
	out.WriteString(evt.Body)

	return []byte(out.String()), nil
}

@ -0,0 +1,56 @@
package archiver
import (
"bytes"
"github.com/creekorful/trandoshan/internal/archiver/storage_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
"testing"
"time"
)
// TestHandleNewResourceEvent checks that a decoded NewResourceEvent is
// formatted and handed to the storage with its URL and time.
//
// The event carries a single header on purpose: formatResource writes
// headers in map iteration order, so expecting exact bytes for several
// headers would make this test fail randomly.
func TestHandleNewResourceEvent(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
	storageMock := storage_mock.NewMockStorage(mockCtrl)

	tn := time.Now()

	msg := bytes.NewReader(nil)
	subscriberMock.EXPECT().
		Read(msg, &event.NewResourceEvent{}).
		SetArg(1, event.NewResourceEvent{
			URL:     "https://example.onion",
			Body:    "Hello, world",
			Headers: map[string]string{"Server": "Traefik"},
			Time:    tn,
		}).Return(nil)

	storageMock.EXPECT().Store("https://example.onion", tn, []byte("Server: Traefik\r\n\r\nHello, world")).Return(nil)

	s := state{storage: storageMock}
	if err := s.handleNewResourceEvent(subscriberMock, msg); err != nil {
		t.Errorf("handleNewResourceEvent() returned unexpected error: %v", err)
	}
}
// TestFormatResource checks the serialized form of a resource: header lines,
// a blank separator line, then the body.
//
// formatResource iterates over a map, so both header orderings are accepted;
// insisting on one of them would make the test fail randomly.
func TestFormatResource(t *testing.T) {
	evt := &event.NewResourceEvent{
		URL:     "https://google.com",
		Body:    "Hello, world",
		Headers: map[string]string{"Server": "Traefik", "Content-Type": "text/html"},
		Time:    time.Now(),
	}

	res, err := formatResource(evt)
	if err != nil {
		t.Fatalf("formatResource() returned unexpected error: %v", err)
	}

	const tail = "\r\nHello, world"
	serverFirst := "Server: Traefik\r\nContent-Type: text/html\r\n" + tail
	contentTypeFirst := "Content-Type: text/html\r\nServer: Traefik\r\n" + tail

	if got := string(res); got != serverFirst && got != contentTypeFirst {
		t.Errorf("formatResource() = %q, want %q (headers in any order)", got, serverFirst)
	}
}

@ -0,0 +1,68 @@
package storage
import (
"fmt"
"io/ioutil"
"net/url"
"os"
"path/filepath"
"strings"
"time"
)
// localStorage is a Storage implementation that writes resources on the
// local file system.
type localStorage struct {
// baseDir is the directory under which every resource is stored
baseDir string
}
// NewLocalStorage returns a new Storage that uses the local file system,
// storing resources under the given root directory.
// The returned error is always nil.
func NewLocalStorage(root string) (Storage, error) {
return &localStorage{baseDir: root}, nil
}
// Store writes body to a file located under the storage base directory, at
// the relative path computed by formatPath from the resource URL and time.
// Missing parent directories are created with mode 0750 and the file is
// written with mode 0640.
//
// URLs come from crawled content and are therefore untrusted: a request URI
// containing ".." segments could resolve outside of the base directory once
// joined. Such URLs are rejected with an error instead of being written.
func (s *localStorage) Store(url string, time time.Time, body []byte) error {
	path, err := formatPath(url, time)
	if err != nil {
		return err
	}

	// filepath.Join cleans the result, resolving any ".." segment
	fullPath := filepath.Join(s.baseDir, path)

	// Make sure the resolved path is still located under the base directory
	rel, err := filepath.Rel(s.baseDir, fullPath)
	if err != nil {
		return err
	}
	if rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
		return fmt.Errorf("refusing to store %s: path escapes the storage directory", url)
	}

	if err := os.MkdirAll(filepath.Dir(fullPath), 0750); err != nil {
		return err
	}

	return ioutil.WriteFile(fullPath, body, 0640)
}
func formatPath(rawURL string, time time.Time) (string, error) {
b := strings.Builder{}
u, err := url.Parse(rawURL)
if err != nil {
return "", err
}
// Protocol
b.WriteString(u.Scheme)
b.WriteRune(os.PathSeparator)
// Hostname
b.WriteString(u.Host)
b.WriteRune(os.PathSeparator)
// Write path
if uri := u.RequestURI(); uri != "/" {
b.WriteString(strings.TrimPrefix(u.RequestURI(), "/"))
b.WriteRune(os.PathSeparator)
}
// Write unix time
b.WriteString(fmt.Sprintf("%d", time.Unix()))
return b.String(), nil
}

@ -0,0 +1,82 @@
package storage
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"time"
)
// TestFormatPath checks the storage path computed for several URLs, with and
// without path and query string.
func TestFormatPath(t *testing.T) {
	type test struct {
		url  string
		time time.Time
		path string
	}

	ti := time.Date(2020, time.October, 29, 12, 4, 9, 0, time.UTC)

	tests := []test{
		{
			url:  "https://google.com",
			time: ti,
			path: "https/google.com/1603973049",
		},
		{
			url:  "http://facebook.com/admin/login.php?username=admin",
			time: ti,
			path: "http/facebook.com/admin/login.php?username=admin/1603973049",
		},
		{
			url:  "http://thisisalonghostname.onion/admin/tools/list-accounts.php?token=123223453&username=test",
			time: ti,
			path: "http/thisisalonghostname.onion/admin/tools/list-accounts.php?token=123223453&username=test/1603973049",
		},
	}

	for _, test := range tests {
		res, err := formatPath(test.url, test.time)
		if err != nil {
			// The result is meaningless on error: report and move on
			t.Errorf("formatPath(%q) returned unexpected error: %v", test.url, err)
			continue
		}

		if res != test.path {
			t.Errorf("got: %s, want: %s", res, test.path)
		}
	}
}
// TestLocalStorage_Store stores a resource in a temporary directory and
// checks the location, permissions and content of the resulting file.
//
// Each step depends on the previous one, so failures abort the test
// immediately; in particular the FileInfo returned by a failed os.Stat is
// nil and must not be dereferenced.
func TestLocalStorage_Store(t *testing.T) {
	d, err := ioutil.TempDir("", "")
	if err != nil {
		t.Fatalf("unable to create temporary directory: %v", err)
	}
	defer os.RemoveAll(d)

	s := localStorage{baseDir: d}

	ti := time.Date(2020, time.October, 29, 12, 4, 9, 0, time.UTC)

	if err := s.Store("https://google.com", ti, []byte("Hello, world")); err != nil {
		t.Fatalf("Store() returned unexpected error: %v", err)
	}

	p := filepath.Join(d, "https", "google.com", "1603973049")

	inf, err := os.Stat(p)
	if err != nil {
		t.Fatalf("stored file not found at %s: %v", p, err)
	}
	if perm := inf.Mode().Perm(); perm != 0640 {
		t.Errorf("file permissions = %o, want %o", perm, 0640)
	}

	b, err := ioutil.ReadFile(p)
	if err != nil {
		t.Fatalf("unable to read stored file: %v", err)
	}
	if string(b) != "Hello, world" {
		t.Errorf("file content = %q, want %q", string(b), "Hello, world")
	}
}

@ -0,0 +1,11 @@
package storage
import "time"
//go:generate mockgen -destination=../storage_mock/storage_mock.go -package=storage_mock . Storage
// Storage is an abstraction layer where resources are stored
type Storage interface {
// Store stores the resource body, identified by its URL and time.
Store(url string, time time.Time, body []byte) error
}

@ -0,0 +1,20 @@
package clock
//go:generate mockgen -destination=../clock_mock/client_mock.go -package=clock_mock . Clock
import "time"
// Clock is an abstraction over the current time, introduced to ease unit
// testing: code depending on it can be given a mock implementation.
type Clock interface {
// Now returns the current time
Now() time.Time
}
// SystemClock is a Clock that uses the system time
type SystemClock struct {
}
// Now returns the current time as reported by time.Now
func (clock *SystemClock) Now() time.Time {
return time.Now()
}

@ -3,15 +3,21 @@ package crawler
import (
"crypto/tls"
"fmt"
"github.com/creekorful/trandoshan/internal/clock"
"github.com/creekorful/trandoshan/internal/crawler/http"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/nats-io/nats.go"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"github.com/valyala/fasthttp"
"github.com/valyala/fasthttp/fasthttpproxy"
"io"
"io/ioutil"
"os"
"os/signal"
"strings"
"syscall"
"time"
)
@ -21,11 +27,11 @@ const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101
func GetApp() *cli.App {
return &cli.App{
Name: "tdsh-crawler",
Version: "0.6.0",
Version: "0.7.0",
Usage: "Trandoshan crawler component",
Flags: []cli.Flag{
logging.GetLogFlag(),
util.GetNATSURIFlag(),
util.GetHubURI(),
&cli.StringFlag{
Name: "tor-uri",
Usage: "URI to the TOR SOCKS proxy",
@ -51,13 +57,13 @@ func execute(ctx *cli.Context) error {
log.Info().
Str("ver", ctx.App.Version).
Str("nats-uri", ctx.String("nats-uri")).
Str("hub-uri", ctx.String("hub-uri")).
Str("tor-uri", ctx.String("tor-uri")).
Strs("allowed-content-types", ctx.StringSlice("allowed-ct")).
Msg("Starting tdsh-crawler")
// Create the HTTP client
httpClient := &fasthttp.Client{
httpClient := http.NewFastHTTPClient(&fasthttp.Client{
// Use given TOR proxy to reach the hidden services
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
// Disable SSL verification since we do not really care about this
@ -65,78 +71,83 @@ func execute(ctx *cli.Context) error {
ReadTimeout: time.Second * 5,
WriteTimeout: time.Second * 5,
Name: ctx.String("user-agent"),
}
})
// Create the NATS subscriber
sub, err := messaging.NewSubscriber(ctx.String("nats-uri"))
// Create the subscriber
sub, err := event.NewSubscriber(ctx.String("hub-uri"))
if err != nil {
return err
}
defer sub.Close()
state := state{
httpClient: httpClient,
allowedContentTypes: ctx.StringSlice("allowed-ct"),
clock: &clock.SystemClock{},
}
if err := sub.SubscribeAsync(event.NewURLExchange, "crawlingQueue", state.handleNewURLEvent); err != nil {
return err
}
log.Info().Msg("Successfully initialized tdsh-crawler. Waiting for URLs")
handler := handleMessage(httpClient, ctx.StringSlice("allowed-ct"))
if err := sub.QueueSubscribe(messaging.URLTodoSubject, "crawlers", handler); err != nil {
// Handle graceful shutdown
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
// Block until we receive our signal.
<-c
if err := sub.Close(); err != nil {
return err
}
return nil
}
func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) messaging.MsgHandler {
return func(sub messaging.Subscriber, msg *nats.Msg) error {
var urlMsg messaging.URLTodoMsg
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
return err
}
body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
if err != nil {
return fmt.Errorf("error while crawling URL: %s", err)
}
type state struct {
httpClient http.Client
allowedContentTypes []string
clock clock.Clock
}
// Publish resource body
res := messaging.NewResourceMsg{
URL: urlMsg.URL,
Body: body,
}
if err := sub.PublishMsg(&res); err != nil {
return fmt.Errorf("error while publishing resource: %s", err)
}
func (state *state) handleNewURLEvent(subscriber event.Subscriber, body io.Reader) error {
var evt event.NewURLEvent
if err := subscriber.Read(body, &evt); err != nil {
return err
}
return nil
b, headers, err := crawURL(state.httpClient, evt.URL, state.allowedContentTypes)
if err != nil {
return err
}
}
func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) {
log.Debug().Str("url", url).Msg("Processing URL")
res := event.NewResourceEvent{
URL: evt.URL,
Body: b,
Headers: headers,
Time: state.clock.Now(),
}
// Query the website
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
if err := subscriber.Publish(&res); err != nil {
return err
}
req.SetRequestURI(url)
return nil
}
if err := httpClient.Do(req, resp); err != nil {
return "", err
}
func crawURL(httpClient http.Client, url string, allowedContentTypes []string) (string, map[string]string, error) {
log.Debug().Str("url", url).Msg("Processing URL")
switch code := resp.StatusCode(); {
case code > 302:
return "", fmt.Errorf("non-managed error code %d", code)
// follow redirect
case code == 301 || code == 302:
if location := string(resp.Header.Peek("Location")); location != "" {
return crawURL(httpClient, location, allowedContentTypes)
}
r, err := httpClient.Get(url)
if err != nil {
return "", nil, err
}
// Determinate if content type is allowed
allowed := false
contentType := string(resp.Header.Peek("Content-Type"))
contentType := r.Headers()["Content-Type"]
for _, allowedContentType := range allowedContentTypes {
if strings.Contains(contentType, allowedContentType) {
allowed = true
@ -146,8 +157,13 @@ func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []stri
if !allowed {
err := fmt.Errorf("forbidden content type : %s", contentType)
return "", err
return "", nil, err
}
return string(resp.Body()), nil
// Ready body
b, err := ioutil.ReadAll(r.Body())
if err != nil {
return "", nil, err
}
return string(b), r.Headers(), nil
}

@ -1 +1,130 @@
package crawler
import (
"bytes"
"github.com/creekorful/trandoshan/internal/clock_mock"
"github.com/creekorful/trandoshan/internal/crawler/http_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
"strings"
"testing"
"time"
)
// TestCrawlURLForbiddenContentType checks that crawURL rejects a response
// whose content type is not in the allowed list: no body, no headers, and an
// error.
func TestCrawlURLForbiddenContentType(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	httpClientMock := http_mock.NewMockClient(mockCtrl)
	url := "https://example.onion"
	allowedContentTypes := []string{"text/plain"}

	httpResponseMock := http_mock.NewMockResponse(mockCtrl)
	httpResponseMock.EXPECT().Headers().Return(map[string]string{"Content-Type": "image/png"})

	httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)

	body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
	if body != "" {
		t.Errorf("body = %q, want empty body", body)
	}
	if headers != nil {
		t.Errorf("headers = %v, want nil", headers)
	}
	if err == nil {
		t.Error("expected an error for forbidden content type, got nil")
	}
}
// TestCrawlURLSameContentType checks that crawURL returns body and headers
// when the response content type is one of the allowed content types.
func TestCrawlURLSameContentType(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	httpClientMock := http_mock.NewMockClient(mockCtrl)
	url := "https://example.onion"
	allowedContentTypes := []string{"text/plain"}

	httpResponseMock := http_mock.NewMockResponse(mockCtrl)
	httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
	httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))

	httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)

	body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
	if err != nil {
		t.Errorf("crawURL() returned unexpected error: %v", err)
	}
	if body != "Hello" {
		t.Errorf("body = %q, want %q", body, "Hello")
	}
	if len(headers) != 1 {
		t.Errorf("got %d headers, want 1", len(headers))
	}
	if headers["Content-Type"] != "text/plain" {
		t.Errorf("Content-Type = %q, want %q", headers["Content-Type"], "text/plain")
	}
}
// TestCrawlURLNoContentTypeFiltering checks that an empty allowed content
// type lets any response through.
func TestCrawlURLNoContentTypeFiltering(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	httpClientMock := http_mock.NewMockClient(mockCtrl)
	url := "https://example.onion"
	allowedContentTypes := []string{""}

	httpResponseMock := http_mock.NewMockResponse(mockCtrl)
	httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
	httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))

	httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)

	body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
	if err != nil {
		t.Errorf("crawURL() returned unexpected error: %v", err)
	}
	if body != "Hello" {
		t.Errorf("body = %q, want %q", body, "Hello")
	}
	if len(headers) != 1 {
		t.Errorf("got %d headers, want 1", len(headers))
	}
	if headers["Content-Type"] != "text/plain" {
		t.Errorf("Content-Type = %q, want %q", headers["Content-Type"], "text/plain")
	}
}
// TestHandleNewURLEvent checks that a NewURLEvent is crawled and that the
// result is published as a NewResourceEvent carrying the URL, body, headers
// and the time given by the clock.
func TestHandleNewURLEvent(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
	httpClientMock := http_mock.NewMockClient(mockCtrl)
	httpResponseMock := http_mock.NewMockResponse(mockCtrl)
	clockMock := clock_mock.NewMockClock(mockCtrl)

	msg := bytes.NewReader(nil)
	subscriberMock.EXPECT().
		Read(msg, &event.NewURLEvent{}).
		SetArg(1, event.NewURLEvent{URL: "https://example.onion/image.png?id=12&test=2"}).
		Return(nil)

	httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain", "Server": "Debian"})
	httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))

	httpClientMock.EXPECT().Get("https://example.onion/image.png?id=12&test=2").Return(httpResponseMock, nil)

	tn := time.Now()
	clockMock.EXPECT().Now().Return(tn)

	subscriberMock.EXPECT().Publish(&event.NewResourceEvent{
		URL:     "https://example.onion/image.png?id=12&test=2",
		Body:    "Hello",
		Headers: map[string]string{"Content-Type": "text/plain", "Server": "Debian"},
		Time:    tn,
	}).Return(nil)

	s := state{
		httpClient:          httpClientMock,
		allowedContentTypes: []string{"text/plain", "text/css"},
		clock:               clockMock,
	}
	if err := s.handleNewURLEvent(subscriberMock, msg); err != nil {
		t.Errorf("handleNewURLEvent() returned unexpected error: %v", err)
	}
}

@ -0,0 +1,52 @@
package http
//go:generate mockgen -destination=../http_mock/client_mock.go -package=http_mock . Client
import (
"fmt"
"github.com/valyala/fasthttp"
)
// Client is an HTTP client
type Client interface {
// Get fetches the given URL.
// This method follows redirections.
Get(URL string) (Response, error)
}
// client is the Client implementation backed by fasthttp
type client struct {
// c is the underlying fasthttp client used to perform requests
c *fasthttp.Client
}
// NewFastHTTPClient creates a new Client using the given fasthttp.Client as
// backend
func NewFastHTTPClient(c *fasthttp.Client) Client {
return &client{c: c}
}
// Get fetches the given URL and returns a copy of the response.
//
// 301 and 302 responses carrying a Location header are followed, up to a
// fixed number of chained redirections so that a redirect loop on a crawled
// site cannot recurse forever. Any status code above 302 is reported as an
// error.
func (c *client) Get(URL string) (Response, error) {
	// Upper bound on chained redirections for a single Get
	const maxRedirects = 10

	return c.get(URL, maxRedirects)
}

// get performs the request for URL, following at most redirectsLeft further
// redirections.
func (c *client) get(URL string, redirectsLeft int) (Response, error) {
	req := fasthttp.AcquireRequest()
	resp := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseRequest(req)
	defer fasthttp.ReleaseResponse(resp)

	req.SetRequestURI(URL)

	if err := c.c.Do(req, resp); err != nil {
		return nil, err
	}

	switch code := resp.StatusCode(); {
	case code > 302:
		return nil, fmt.Errorf("non-managed error code %d", code)
	// follow redirect
	case code == 301 || code == 302:
		if location := string(resp.Header.Peek("Location")); location != "" {
			if redirectsLeft <= 0 {
				return nil, fmt.Errorf("too many redirections while fetching %s", URL)
			}
			return c.get(location, redirectsLeft-1)
		}
	}

	// resp goes back to the pool on return: hand out a copy
	r := &response{}
	resp.CopyTo(&r.raw)

	return r, nil
}

@ -0,0 +1,33 @@
package http
//go:generate mockgen -destination=../http_mock/response_mock.go -package=http_mock . Response
import (
"bytes"
"github.com/valyala/fasthttp"
"io"
)
// Response is an HTTP response
type Response interface {
// Headers returns the response headers
Headers() map[string]string
// Body returns a reader over the response body
Body() io.Reader
}
// response is the Response implementation backed by a fasthttp.Response
type response struct {
// raw is the underlying fasthttp response
raw fasthttp.Response
}
// Headers returns the response headers as a newly built map. When a header
// is visited several times, only the last visited value is kept.
func (r *response) Headers() map[string]string {
	collected := make(map[string]string)

	collect := func(name, val []byte) {
		collected[string(name)] = string(val) // TODO manage multiple values?
	}
	r.raw.Header.VisitAll(collect)

	return collected
}
// Body returns a new reader over the body of the underlying response
func (r *response) Body() io.Reader {
return bytes.NewReader(r.raw.Body())
}

@ -0,0 +1,21 @@
package duration
import (
"github.com/xhit/go-str2duration/v2"
"time"
)
// ParseDuration parses the given duration string into a time.Duration.
// It returns -1 when the string is empty or cannot be parsed.
func ParseDuration(duration string) time.Duration {
	if duration != "" {
		if parsed, err := str2duration.ParseDuration(duration); err == nil {
			return parsed
		}
	}

	return -1
}

@ -0,0 +1,24 @@
package duration
import (
"testing"
"time"
)
// TestParseDuration checks the parsing of an empty value and of durations
// expressed in seconds, minutes, hours and days.
func TestParseDuration(t *testing.T) {
	tests := []struct {
		input string
		want  time.Duration
	}{
		{input: "", want: -1},
		{input: "50s", want: time.Second * 50},
		{input: "50m", want: time.Minute * 50},
		{input: "50h", want: time.Hour * 50},
		{input: "50d", want: time.Hour * 24 * 50},
	}

	for _, test := range tests {
		if got := ParseDuration(test.input); got != test.want {
			t.Errorf("ParseDuration(%q) = %v, want %v", test.input, got, test.want)
		}
	}
}

@ -0,0 +1,53 @@
package event
import "time"
//go:generate mockgen -destination=../event_mock/event_mock.go -package=event_mock . Publisher,Subscriber
const (
// NewURLExchange is the exchange used when a URL is scheduled for crawling
NewURLExchange = "url.new"
// FoundURLExchange is the exchange used when a URL is extracted from a resource
FoundURLExchange = "url.found"
// NewResourceExchange is the exchange used when a new resource has been crawled
NewResourceExchange = "resource.new"
)
// Event represents an event
type Event interface {
// Exchange returns the exchange where the event should be pushed
Exchange() string
}
// NewURLEvent represents a URL to crawl
type NewURLEvent struct {
	URL string `json:"url"`
}

// Exchange returns the exchange where the event should be pushed
func (evt *NewURLEvent) Exchange() string {
	return NewURLExchange
}
// FoundURLEvent represents a found URL
type FoundURLEvent struct {
	URL string `json:"url"`
}

// Exchange returns the exchange where the event should be pushed
func (evt *FoundURLEvent) Exchange() string {
	return FoundURLExchange
}
// NewResourceEvent represents a crawled resource: its URL, body, response
// headers and time.
type NewResourceEvent struct {
	URL     string            `json:"url"`
	Body    string            `json:"body"`
	Headers map[string]string `json:"headers"`
	Time    time.Time         `json:"time"`
}

// Exchange returns the exchange where the event should be pushed
func (evt *NewResourceEvent) Exchange() string {
	return NewResourceExchange
}

@ -0,0 +1,55 @@
package event
import (
"encoding/json"
"fmt"
"github.com/streadway/amqp"
)
// Publisher is something that pushes events
type Publisher interface {
// Publish pushes the given event
Publish(event Event) error
// Close closes the publisher
Close() error
}
// publisher is the Publisher implementation backed by an AMQP channel
type publisher struct {
// channel is the AMQP channel events are published on
channel *amqp.Channel
}
// NewPublisher creates a new Publisher instance: it connects to the AMQP
// server at the given URI and opens a channel on that connection.
//
// An error is returned when either the connection or the channel cannot be
// opened; in the latter case the connection is closed before returning so
// that it does not leak.
func NewPublisher(amqpURI string) (Publisher, error) {
	conn, err := amqp.Dial(amqpURI)
	if err != nil {
		return nil, err
	}

	c, err := conn.Channel()
	if err != nil {
		// The channel error is the one worth reporting: the result of the
		// best-effort connection cleanup is deliberately ignored.
		_ = conn.Close()
		return nil, err
	}

	return &publisher{
		channel: c,
	}, nil
}
// Publish encodes the event as JSON and pushes it to the exchange returned
// by event.Exchange()
func (p *publisher) Publish(event Event) error {
return publishJSON(p.channel, event.Exchange(), event)
}
// Close closes the underlying AMQP channel
func (p *publisher) Close() error {
return p.channel.Close()
}
// publishJSON encodes event as JSON and publishes it on the given exchange
// (with an empty routing key) as a persistent application/json message.
//
// Encoding errors are wrapped with %w so callers can inspect the underlying
// cause; publishing errors are returned as-is.
func publishJSON(rc *amqp.Channel, exchange string, event interface{}) error {
	evtBytes, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("error while encoding event: %w", err)
	}

	return rc.Publish(exchange, "", false, false, amqp.Publishing{
		ContentType:  "application/json",
		Body:         evtBytes,
		DeliveryMode: amqp.Persistent,
	})
}

@ -0,0 +1,105 @@
package event
import (
"bytes"
"encoding/json"
"fmt"
"github.com/rs/zerolog/log"
"github.com/streadway/amqp"
"io"
)
// Handler represents an event handler: it receives the subscriber the event
// was consumed from and a reader over the raw event body
type Handler func(Subscriber, io.Reader) error
// Subscriber is something that reads messages from an event queue.
// It is also a Publisher, so handlers can push new events.
type Subscriber interface {
Publisher
// Read decodes the event contained in body into event
Read(body io.Reader, event Event) error
// SubscribeAsync binds queue to exchange and calls handler, in the
// background, for each event consumed from the queue
SubscribeAsync(exchange, queue string, handler Handler) error
}
// subscriber is the Subscriber implementation backed by an AMQP channel
type subscriber struct {
// channel is the AMQP channel used to both consume and publish events
channel *amqp.Channel
}
// NewSubscriber creates a new subscriber and connects it to the given AMQP
// server. A channel is opened on the connection and its prefetch count is
// set to 1.
//
// An error is returned when the connection, the channel or the QoS setup
// fails; once the connection is established it is closed on every error
// path so that it does not leak.
func NewSubscriber(amqpURI string) (Subscriber, error) {
	conn, err := amqp.Dial(amqpURI)
	if err != nil {
		return nil, err
	}

	c, err := conn.Channel()
	if err != nil {
		// Best-effort cleanup: the channel error is the one reported
		_ = conn.Close()
		return nil, err
	}

	if err := c.Qos(1, 0, false); err != nil {
		// Best-effort cleanup: the QoS error is the one reported
		_ = conn.Close()
		return nil, err
	}

	return &subscriber{
		channel: c,
	}, nil
}
// Publish encodes the event as JSON and pushes it to the exchange returned
// by event.Exchange()
func (s *subscriber) Publish(event Event) error {
return publishJSON(s.channel, event.Exchange(), event)
}
// Close closes the underlying AMQP channel
func (s *subscriber) Close() error {
return s.channel.Close()
}
// Read decodes the JSON document read from body into event
func (s *subscriber) Read(body io.Reader, event Event) error {
return readJSON(body, event)
}
// SubscribeAsync declares the fanout exchange and the queue, binds them
// together and starts consuming the queue in a background goroutine, calling
// handler for each delivery.
//
// Handler errors are only logged: every delivery is acknowledged whatever
// the outcome. An error is returned when the exchange, queue, binding or
// consumer cannot be set up.
func (s *subscriber) SubscribeAsync(exchange, queue string, handler Handler) error {
	ch := s.channel

	// The exchange comes first
	if err := ch.ExchangeDeclare(exchange, amqp.ExchangeFanout, true, false, false, false, nil); err != nil {
		return err
	}

	// Then the queue
	declared, err := ch.QueueDeclare(queue, true, false, false, false, nil)
	if err != nil {
		return err
	}

	// Attach the queue to the exchange
	if err := ch.QueueBind(declared.Name, "", exchange, false, nil); err != nil {
		return err
	}

	// Open the consumer
	msgs, err := ch.Consume(declared.Name, "", false, false, false, false, nil)
	if err != nil {
		return err
	}

	consume := func() {
		for msg := range msgs {
			if err := handler(s, bytes.NewReader(msg.Body)); err != nil {
				log.Err(err).Msg("error while processing event")
			}

			// Failed events are not retried (yet?): acknowledge in any case
			if err := msg.Ack(false); err != nil {
				log.Err(err).Msg("error while acknowledging event")
			}
		}
	}
	go consume()

	return nil
}
// readJSON decodes the JSON document read from body into event, which must
// be a pointer to the destination value.
//
// Decoding errors are wrapped with %w so callers can inspect the underlying
// cause with errors.Is / errors.As.
func readJSON(body io.Reader, event interface{}) error {
	if err := json.NewDecoder(body).Decode(event); err != nil {
		return fmt.Errorf("error while decoding event: %w", err)
	}

	return nil
}

@ -5,31 +5,28 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/nats-io/nats.go"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"io"
"mvdan.cc/xurls/v2"
"regexp"
"os"
"os/signal"
"strings"
"time"
)
var (
protocolRegex = regexp.MustCompile("https?://")
"syscall"
)
// GetApp return the extractor app
func GetApp() *cli.App {
return &cli.App{
Name: "tdsh-extractor",
Version: "0.6.0",
Version: "0.7.0",
Usage: "Trandoshan extractor component",
Flags: []cli.Flag{
logging.GetLogFlag(),
util.GetNATSURIFlag(),
util.GetHubURI(),
util.GetAPIURIFlag(),
util.GetAPITokenFlag(),
},
@ -42,69 +39,99 @@ func execute(ctx *cli.Context) error {
log.Info().
Str("ver", ctx.App.Version).
Str("nats-uri", ctx.String("nats-uri")).
Str("hub-uri", ctx.String("hub-uri")).
Str("api-uri", ctx.String("api-uri")).
Msg("Starting tdsh-extractor")
apiClient := util.GetAPIClient(ctx)
// Create the NATS subscriber
sub, err := messaging.NewSubscriber(ctx.String("nats-uri"))
// Create the event subscriber
sub, err := event.NewSubscriber(ctx.String("hub-uri"))
if err != nil {
return err
}
defer sub.Close()
state := state{apiClient: apiClient}
if err := sub.SubscribeAsync(event.NewResourceExchange, "extractingQueue", state.handleNewResourceEvent); err != nil {
return err
}
log.Info().Msg("Successfully initialized tdsh-extractor. Waiting for resources")
handler := handleMessage(apiClient)
if err := sub.QueueSubscribe(messaging.NewResourceSubject, "extractors", handler); err != nil {
// Handle graceful shutdown
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
// Block until we receive our signal.
<-c
if err := sub.Close(); err != nil {
return err
}
return nil
}
func handleMessage(apiClient api.Client) messaging.MsgHandler {
return func(sub messaging.Subscriber, msg *nats.Msg) error {
var resMsg messaging.NewResourceMsg
if err := sub.ReadMsg(msg, &resMsg); err != nil {
return err
}
type state struct {
apiClient api.API
}
log.Debug().Str("url", resMsg.URL).Msg("Processing new resource")
func (state *state) handleNewResourceEvent(subscriber event.Subscriber, body io.Reader) error {
var evt event.NewResourceEvent
if err := subscriber.Read(body, &evt); err != nil {
return err
}
// Extract & process resource
resDto, urls, err := extractResource(resMsg)
if err != nil {
return fmt.Errorf("error while extracting resource: %s", err)
}
log.Debug().Str("url", evt.URL).Msg("Processing new resource")
// Submit to the API
_, err = apiClient.AddResource(resDto)
if err != nil {
return fmt.Errorf("error while adding resource (%s): %s", resDto.URL, err)
}
// Extract & process resource
resDto, urls, err := extractResource(evt)
if err != nil {
return fmt.Errorf("error while extracting resource: %s", err)
}
// Lowercase headers
resDto.Headers = map[string]string{}
for key, value := range evt.Headers {
resDto.Headers[strings.ToLower(key)] = value
}
// Finally push found URLs
for _, url := range urls {
// Submit to the API
_, err = state.apiClient.AddResource(resDto)
if err != nil {
return fmt.Errorf("error while adding resource (%s): %s", resDto.URL, err)
}
// Finally push found URLs
publishedURLS := map[string]string{}
for _, url := range urls {
if _, exist := publishedURLS[url]; exist {
log.Trace().
Str("url", url).
Msg("Publishing found URL")
Msg("Skipping duplicate URL")
continue
}
if err := sub.PublishMsg(&messaging.URLFoundMsg{URL: url}); err != nil {
log.Warn().
Str("url", url).
Str("err", err.Error()).
Msg("Error while publishing URL")
}
log.Trace().
Str("url", url).
Msg("Publishing found URL")
if err := subscriber.Publish(&event.FoundURLEvent{URL: url}); err != nil {
log.Warn().
Str("url", url).
Str("err", err.Error()).
Msg("Error while publishing URL")
}
return nil
publishedURLS[url] = url
}
return nil
}
func extractResource(msg messaging.NewResourceMsg) (api.ResourceDto, []string, error) {
func extractResource(msg event.NewResourceEvent) (api.ResourceDto, []string, error) {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(msg.Body))
if err != nil {
return api.ResourceDto{}, nil, err
@ -127,7 +154,7 @@ func extractResource(msg messaging.NewResourceMsg) (api.ResourceDto, []string, e
}
}
meta[name] = value
meta[strings.ToLower(name)] = value
})
// Extract & normalize URLs
@ -148,7 +175,7 @@ func extractResource(msg messaging.NewResourceMsg) (api.ResourceDto, []string, e
return api.ResourceDto{
URL: msg.URL,
Body: msg.Body,
Time: time.Now(),
Time: msg.Time,
Title: title,
Meta: meta,
Description: meta["description"],

@ -1,13 +1,14 @@
package extractor
import (
"bytes"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/api_mock"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/messaging_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
"github.com/nats-io/nats.go"
"testing"
"time"
)
func TestExtractResource(t *testing.T) {
@ -18,11 +19,11 @@ This is sparta
<a href="https://google.com/test?test=test#12">
<meta name="description" content="Zhello world">
<meta name="Description" content="Zhello world">
<meta property="og:url" content="https://example.org">
`
msg := messaging.NewResourceMsg{
msg := event.NewResourceEvent{
URL: "https://example.org/300",
Body: body,
}
@ -42,12 +43,15 @@ This is sparta
t.Fail()
}
if len(urls) == 0 {
if len(urls) != 2 {
t.FailNow()
}
if urls[0] != "https://google.com/test?test=test" {
t.Fail()
}
if urls[1] != "https://example.org" {
t.Fail()
}
if resDto.Description != "Zhello world" {
t.Fail()
@ -77,73 +81,54 @@ func TestHandleMessage(t *testing.T) {
body := `
<title>Creekorful Inc</title>
This is sparta
This is sparta (hosted on https://example.org)
<a href="https://google.com/test?test=test#12">
<meta name="description" content="Zhello world">
<meta name="DescriptIon" content="Zhello world">
<meta property="og:url" content="https://example.org">`
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
tn := time.Now()
msg := nats.Msg{}
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(&msg, &messaging.NewResourceMsg{}).
SetArg(1, messaging.NewResourceMsg{URL: "https://example.onion", Body: body}).
Return(nil)
Read(msg, &event.NewResourceEvent{}).
SetArg(1, event.NewResourceEvent{
URL: "https://example.onion",
Body: body,
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
Time: tn,
}).Return(nil)
// make sure we are creating the resource
apiClientMock.EXPECT().AddResource(&resMatcher{target: api.ResourceDto{
apiClientMock.EXPECT().AddResource(api.ResourceDto{
URL: "https://example.onion",
Body: body,
Title: "Creekorful Inc",
Meta: map[string]string{"description": "Zhello world", "og:url": "https://example.org"},
Description: "Zhello world",
}}).Return(api.ResourceDto{}, nil)
Headers: map[string]string{"server": "Traefik", "content-type": "application/html"},
Time: tn,
}).Return(api.ResourceDto{}, nil)
// make sure we are pushing found URLs
// should be called only one time
subscriberMock.EXPECT().
PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}).
Publish(&event.FoundURLEvent{URL: "https://example.org"}).
Return(nil)
subscriberMock.EXPECT().
PublishMsg(&messaging.URLFoundMsg{URL: "https://google.com/test?test=test"}).
Publish(&event.FoundURLEvent{URL: "https://google.com/test?test=test"}).
Return(nil)
if err := handleMessage(apiClientMock)(subscriberMock, &msg); err != nil {
s := state{apiClient: apiClientMock}
if err := s.handleNewResourceEvent(subscriberMock, msg); err != nil {
t.FailNow()
}
}
// resMatcher is a custom matcher that ignores the time field when comparing
// resources.
// TODO: find a cleaner way to do this.
type resMatcher struct {
// target is the resource the matched argument is compared against.
target api.ResourceDto
}
// Matches reports whether x is an api.ResourceDto whose Title, URL, Body,
// Description and Meta agree with the matcher's target. Any other fields are
// ignored. A value of another type never matches.
func (rm *resMatcher) Matches(x interface{}) bool {
	// Use the comma-ok form: a matcher must answer "no" for an unexpected
	// argument type instead of panicking in the middle of the test.
	arg, ok := x.(api.ResourceDto)
	if !ok {
		return false
	}

	return arg.Title == rm.target.Title &&
		arg.URL == rm.target.URL &&
		arg.Body == rm.target.Body &&
		arg.Description == rm.target.Description &&
		exactMatch(arg.Meta, rm.target.Meta)
}
// String returns the fixed description of this matcher.
func (rm *resMatcher) String() string {
return "is valid resource"
}
// exactMatch reports whether left and right contain exactly the same
// key/value pairs. A nil map and an empty map are considered equal.
func exactMatch(left, right map[string]string) bool {
	// Without the size check a right-hand map with extra keys would still be
	// accepted, because the loop below only proves left is a subset of right.
	if len(left) != len(right) {
		return false
	}

	for key, want := range left {
		if got, exist := right[key]; !exist || got != want {
			return false
		}
	}

	return true
}

@ -1,7 +0,0 @@
package messaging
// Msg represents a message that can be sent through NATS.
type Msg interface {
// Subject returns the subject the message should be pushed to.
Subject() string
}

@ -1,43 +0,0 @@
package messaging
//go:generate mockgen -destination=../messaging_mock/publisher_mock.go -package=messaging_mock . Publisher,Subscriber
const (
// URLTodoSubject is the subject used when a URL is scheduled for crawling
URLTodoSubject = "url.todo"
// URLFoundSubject is the subject used when a URL is extracted from a resource
URLFoundSubject = "url.found"
// NewResourceSubject is the subject used when a new resource has been crawled
NewResourceSubject = "resource.new"
)
// URLTodoMsg represents a URL to crawl.
type URLTodoMsg struct {
URL string `json:"url"`
}
// Subject returns the subject the message should be pushed to (URLTodoSubject).
func (msg *URLTodoMsg) Subject() string {
return URLTodoSubject
}
// URLFoundMsg represents a found URL.
type URLFoundMsg struct {
URL string `json:"url"`
}
// Subject returns the subject the message should be pushed to (URLFoundSubject).
func (msg *URLFoundMsg) Subject() string {
return URLFoundSubject
}
// NewResourceMsg represents a crawled resource: its URL and its body.
type NewResourceMsg struct {
URL string `json:"url"`
Body string `json:"body"`
}
// Subject returns the subject the message should be pushed to (NewResourceSubject).
func (msg *NewResourceMsg) Subject() string {
return NewResourceSubject
}

@ -1,46 +0,0 @@
package messaging
import (
"encoding/json"
"fmt"
"github.com/nats-io/nats.go"
)
// Publisher is something that pushes messages to an event queue.
type Publisher interface {
// PublishMsg pushes msg to the queue.
PublishMsg(msg Msg) error
// Close shuts the publisher down.
Close()
}
// publisher is the Publisher implementation backed by a NATS connection.
type publisher struct {
// nc is the NATS connection messages are published on.
nc *nats.Conn
}
// NewPublisher connects to the NATS server at natsURI and returns a Publisher
// using that connection. A connection failure is returned unchanged.
func NewPublisher(natsURI string) (Publisher, error) {
	conn, connErr := nats.Connect(natsURI)
	if connErr != nil {
		return nil, connErr
	}

	pub := new(publisher)
	pub.nc = conn
	return pub, nil
}
// PublishMsg JSON-encodes msg and publishes it on the subject reported by
// msg.Subject().
func (p *publisher) PublishMsg(msg Msg) error {
	subject := msg.Subject()
	return publishJSON(p.nc, subject, msg)
}
// Close closes the underlying NATS connection.
func (p *publisher) Close() {
p.nc.Close()
}
// publishJSON marshals msg to JSON and publishes the resulting bytes on
// subject using nc.
//
// It returns a wrapped encoding error when msg cannot be marshaled (the cause
// stays reachable through errors.Is / errors.As), or the error reported by
// nc.Publish otherwise.
func publishJSON(nc *nats.Conn, subject string, msg interface{}) error {
	msgBytes, err := json.Marshal(msg)
	if err != nil {
		// %w instead of %s: same message text, but the cause is preserved.
		return fmt.Errorf("error while encoding message: %w", err)
	}

	return nc.Publish(subject, msgBytes)
}

@ -1,81 +0,0 @@
package messaging
import (
"context"
"encoding/json"
"fmt"
"github.com/nats-io/nats.go"
"github.com/rs/zerolog/log"
)
// MsgHandler represents a handler for a NATS subscriber. It is called with the
// subscriber itself and the raw NATS message; QueueSubscribe logs any error it
// returns and moves on to the next message.
type MsgHandler func(s Subscriber, msg *nats.Msg) error
// Subscriber is something that reads messages from an event queue.
// It embeds Publisher, so a Subscriber can also publish messages.
type Subscriber interface {
Publisher
// ReadMsg decodes the payload of natsMsg into msg.
ReadMsg(natsMsg *nats.Msg, msg Msg) error
// QueueSubscribe consumes messages from subject as part of the given queue
// and passes each of them to handler.
QueueSubscribe(subject, queue string, handler MsgHandler) error
// Close is already part of the embedded Publisher; it is repeated here.
Close()
}
// subscriber is the Subscriber implementation backed by a NATS connection.
type subscriber struct {
// nc is the NATS connection used to both receive and publish messages.
nc *nats.Conn
}
// NewSubscriber connects to the NATS server at address and returns a
// Subscriber using that connection. A connection failure is returned unchanged.
func NewSubscriber(address string) (Subscriber, error) {
	conn, connErr := nats.Connect(address)
	if connErr != nil {
		return nil, connErr
	}

	sub := new(subscriber)
	sub.nc = conn
	return sub, nil
}
// ReadMsg decodes the JSON payload of natsMsg into msg.
func (s *subscriber) ReadMsg(natsMsg *nats.Msg, msg Msg) error {
return readJSON(natsMsg, msg)
}
// QueueSubscribe creates a synchronous queue subscription on subject for the
// given queue and passes every incoming message to handler. It blocks the
// calling goroutine.
//
// Handler errors and transient read errors are logged and the loop carries on
// with the next message. The function returns an error when the subscription
// cannot be created, or when the connection or the subscription has been
// closed: no message can arrive in that state, and retrying would spin in a
// tight loop flooding the log.
func (s *subscriber) QueueSubscribe(subject, queue string, handler MsgHandler) error {
	// Create the subscriber
	sub, err := s.nc.QueueSubscribeSync(subject, queue)
	if err != nil {
		return err
	}

	for {
		// Read incoming message
		msg, err := sub.NextMsgWithContext(context.Background())
		if err != nil {
			// Terminal conditions: every later read would fail immediately.
			if err == nats.ErrConnectionClosed || err == nats.ErrBadSubscription {
				return err
			}

			log.Warn().Str("err", err.Error()).Msg("error while reading incoming message, skipping it")
			continue
		}

		// ... And process it
		if err := handler(s, msg); err != nil {
			log.Err(err).Msg("error while processing message")
		}
	}
}
// PublishMsg JSON-encodes msg and publishes it on the subject reported by
// msg.Subject().
func (s *subscriber) PublishMsg(msg Msg) error {
	subject := msg.Subject()
	return publishJSON(s.nc, subject, msg)
}
// Close closes the underlying NATS connection.
func (s *subscriber) Close() {
s.nc.Close()
}
// readJSON decodes the JSON payload carried by msg (msg.Data) into body, which
// must be a pointer the decoder can write to.
//
// A decoding failure is returned wrapped, so the underlying cause stays
// reachable through errors.Is / errors.As.
func readJSON(msg *nats.Msg, body interface{}) error {
	if err := json.Unmarshal(msg.Data, body); err != nil {
		// %w instead of %s: same message text, but the cause is preserved.
		return fmt.Errorf("error while decoding message: %w", err)
	}

	return nil
}

@ -1,29 +1,41 @@
package scheduler
import (
"errors"
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/duration"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/nats-io/nats.go"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"github.com/xhit/go-str2duration/v2"
"io"
"net/url"
"os"
"os/signal"
"strings"
"syscall"
"time"
)
// Sentinel errors returned (wrapped with %w) by the found-URL handler when a
// URL is rejected; the tests compare against them with errors.Is.
var (
// errNotOnionHostname: the URL host does not contain ".onion".
errNotOnionHostname = errors.New("hostname is not .onion")
// errProtocolNotAllowed: the URL scheme does not start with "http".
errProtocolNotAllowed = errors.New("protocol is not allowed")
// errExtensionNotAllowed: the URL path ends with a forbidden extension.
errExtensionNotAllowed = errors.New("extension is not allowed")
// errShouldNotSchedule: the resource search returned at least one match.
errShouldNotSchedule = errors.New("should not be scheduled")
// errHostnameNotAllowed: the URL hostname contains a forbidden hostname.
errHostnameNotAllowed = errors.New("hostname is not allowed")
)
// GetApp return the scheduler app
func GetApp() *cli.App {
return &cli.App{
Name: "tdsh-scheduler",
Version: "0.6.0",
Version: "0.7.0",
Usage: "Trandoshan scheduler component",
Flags: []cli.Flag{
logging.GetLogFlag(),
util.GetNATSURIFlag(),
util.GetHubURI(),
util.GetAPIURIFlag(),
util.GetAPITokenFlag(),
&cli.StringFlag{
@ -34,6 +46,10 @@ func GetApp() *cli.App {
Name: "forbidden-extensions",
Usage: "Extensions to disable scheduling for (i.e png, exe, css, ...) (the dot will be added automatically)",
},
&cli.StringSliceFlag{
Name: "forbidden-hostnames",
Usage: "Hostnames to disable scheduling for",
},
},
Action: execute,
}
@ -42,102 +58,127 @@ func GetApp() *cli.App {
func execute(ctx *cli.Context) error {
logging.ConfigureLogger(ctx)
refreshDelay := parseRefreshDelay(ctx.String("refresh-delay"))
refreshDelay := duration.ParseDuration(ctx.String("refresh-delay"))
log.Info().
Str("ver", ctx.App.Version).
Str("nats-uri", ctx.String("nats-uri")).
Str("hub-uri", ctx.String("hub-uri")).
Str("api-uri", ctx.String("api-uri")).
Strs("forbidden-exts", ctx.StringSlice("forbidden-extensions")).
Strs("forbidden-hostnames", ctx.StringSlice("forbidden-hostnames")).
Dur("refresh-delay", refreshDelay).
Msg("Starting tdsh-scheduler")
// Create the API client
apiClient := util.GetAPIClient(ctx)
// Create the NATS subscriber
sub, err := messaging.NewSubscriber(ctx.String("nats-uri"))
// Create the subscriber
sub, err := event.NewSubscriber(ctx.String("hub-uri"))
if err != nil {
return err
}
defer sub.Close()
state := state{
apiClient: apiClient,
refreshDelay: refreshDelay,
forbiddenExtensions: ctx.StringSlice("forbidden-extensions"),
forbiddenHostnames: ctx.StringSlice("forbidden-hostnames"),
}
if err := sub.SubscribeAsync(event.FoundURLExchange, "schedulingQueue", state.handleURLFoundEvent); err != nil {
return err
}
log.Info().Msg("Successfully initialized tdsh-scheduler. Waiting for URLs")
handler := handleMessage(apiClient, refreshDelay, ctx.StringSlice("forbidden-extensions"))
if err := sub.QueueSubscribe(messaging.URLFoundSubject, "schedulers", handler); err != nil {
// Handle graceful shutdown
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
// Block until we receive our signal.
<-c
if err := sub.Close(); err != nil {
return err
}
return nil
}
func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenExtensions []string) messaging.MsgHandler {
return func(sub messaging.Subscriber, msg *nats.Msg) error {
var urlMsg messaging.URLFoundMsg
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
return err
}
// state holds the dependencies and configuration used by the scheduler event
// handlers.
type state struct {
// apiClient is the API client used to search for already crawled resources.
apiClient api.API
// refreshDelay is the delay after which a crawled resource may be scheduled
// again; -1 disables the refresh.
refreshDelay time.Duration
// forbiddenExtensions lists path extensions (without the leading dot) that
// must not be scheduled.
forbiddenExtensions []string
// forbiddenHostnames lists hostnames that must not be scheduled.
forbiddenHostnames []string
}
log.Trace().Str("url", urlMsg.URL).Msg("Processing URL")
func (state *state) handleURLFoundEvent(subscriber event.Subscriber, body io.Reader) error {
var evt event.FoundURLEvent
if err := subscriber.Read(body, &evt); err != nil {
return err
}
u, err := url.Parse(urlMsg.URL)
if err != nil {
return fmt.Errorf("error while parsing URL: %s", err)
}
log.Trace().Str("url", evt.URL).Msg("Processing URL")
// Make sure URL is valid .onion
if !strings.Contains(u.Host, ".onion") {
log.Trace().Stringer("url", u).Msg("URL is not a valid hidden service")
return nil // Technically not an error
}
u, err := url.Parse(evt.URL)
if err != nil {
return fmt.Errorf("error while parsing URL: %s", err)
}
// Make sure extension is not forbidden
for _, ext := range forbiddenExtensions {
if strings.HasSuffix(u.Path, "."+ext) {
log.Trace().
Stringer("url", u).
Str("ext", ext).
Msg("Skipping URL with forbidden extension")
return nil // Technically not an error
}
}
// Make sure URL is valid .onion
if !strings.Contains(u.Host, ".onion") {
return fmt.Errorf("%s %w", u.Host, errNotOnionHostname)
}
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources
// that are newer than `now - refreshDelay`.
endDate := time.Time{}
if refreshDelay != -1 {
endDate = time.Now().Add(-refreshDelay)
}
// Make sure protocol is not forbidden
if !strings.HasPrefix(u.Scheme, "http") {
return fmt.Errorf("%s %w", u, errProtocolNotAllowed)
}
_, count, err := apiClient.SearchResources(u.String(), "", time.Time{}, endDate, 1, 1)
if err != nil {
return fmt.Errorf("error while searching resource (%s): %s", u, err)
// Make sure extension is not forbidden
for _, ext := range state.forbiddenExtensions {
if strings.HasSuffix(u.Path, "."+ext) {
return fmt.Errorf("%s (.%s) %w", u, ext, errExtensionNotAllowed)
}
}
// No matches: schedule!
if count == 0 {
log.Debug().Stringer("url", u).Msg("URL should be scheduled")
if err := sub.PublishMsg(&messaging.URLTodoMsg{URL: urlMsg.URL}); err != nil {
return fmt.Errorf("error while publishing URL: %s", err)
}
} else {
log.Trace().Stringer("url", u).Msg("URL should not be scheduled")
// Make sure hostname is not forbidden
for _, hostname := range state.forbiddenHostnames {
if strings.Contains(u.Hostname(), hostname) {
return fmt.Errorf("%s %w", u, errHostnameNotAllowed)
}
return nil
}
}
func parseRefreshDelay(delay string) time.Duration {
if delay == "" {
return -1
// If we want to allow re-schedule of existing crawled resources we need to retrieve only resources
// that are newer than `now - refreshDelay`.
endDate := time.Time{}
if state.refreshDelay != -1 {
endDate = time.Now().Add(-state.refreshDelay)
}
val, err := str2duration.ParseDuration(delay)
params := api.ResSearchParams{
URL: u.String(),
EndDate: endDate,
WithBody: false,
PageSize: 1,
PageNumber: 1,
}
_, count, err := state.apiClient.SearchResources(&params)
if err != nil {
return -1
return fmt.Errorf("error while searching resource (%s): %s", u, err)
}
if count > 0 {
return fmt.Errorf("%s %w", u, errShouldNotSchedule)
}
return val
// No matches: schedule!
log.Debug().Stringer("url", u).Msg("URL should be scheduled")
if err := subscriber.Publish(&event.NewURLEvent{URL: evt.URL}); err != nil {
return fmt.Errorf("error while publishing URL: %s", err)
}
return nil
}

@ -1,70 +1,97 @@
package scheduler
import (
"bytes"
"errors"
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/api_mock"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/messaging_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/golang/mock/gomock"
"github.com/nats-io/nats.go"
"testing"
"time"
)
// TestParseRefreshDelay checks that parseRefreshDelay yields -1 for an empty
// string and understands the s, m, h and d unit suffixes.
func TestParseRefreshDelay(t *testing.T) {
	cases := []struct {
		input string
		want  time.Duration
	}{
		{input: "", want: -1},
		{input: "50s", want: time.Second * 50},
		{input: "50m", want: time.Minute * 50},
		{input: "50h", want: time.Hour * 50},
		{input: "50d", want: time.Hour * 24 * 50},
	}

	for _, tc := range cases {
		if got := parseRefreshDelay(tc.input); got != tc.want {
			t.Fail()
		}
	}
}
func TestHandleMessageNotOnion(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
msg := nats.Msg{}
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(&msg, &messaging.URLFoundMsg{}).
SetArg(1, messaging.URLFoundMsg{URL: "https://example.org"}).
Read(msg, &event.FoundURLEvent{}).
SetArg(1, event.FoundURLEvent{URL: "https://example.org"}).
Return(nil)
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, &msg); err != nil {
s := state{
apiClient: apiClientMock,
refreshDelay: -1,
forbiddenExtensions: []string{},
}
if err := s.handleURLFoundEvent(subscriberMock, msg); !errors.Is(err, errNotOnionHostname) {
t.FailNow()
}
}
// TestHandleMessageWrongProtocol makes sure .onion URLs using a non-HTTP
// protocol (irc, ftp) are rejected with errProtocolNotAllowed.
func TestHandleMessageWrongProtocol(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	apiMock := api_mock.NewMockAPI(ctrl)
	subMock := event_mock.NewMockSubscriber(ctrl)

	body := bytes.NewReader(nil)
	sched := state{
		apiClient:           apiMock,
		refreshDelay:        -1,
		forbiddenExtensions: []string{},
	}

	protocols := []string{"irc", "ftp"}
	for i := range protocols {
		found := event.FoundURLEvent{URL: fmt.Sprintf("%s://example.onion", protocols[i])}
		subMock.EXPECT().
			Read(body, &event.FoundURLEvent{}).
			SetArg(1, found).
			Return(nil)

		err := sched.handleURLFoundEvent(subMock, body)
		if !errors.Is(err, errProtocolNotAllowed) {
			t.FailNow()
		}
	}
}
func TestHandleMessageAlreadyCrawled(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
msg := nats.Msg{}
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(&msg, &messaging.URLFoundMsg{}).
SetArg(1, messaging.URLFoundMsg{URL: "https://example.onion"}).
Read(msg, &event.FoundURLEvent{}).
SetArg(1, event.FoundURLEvent{URL: "https://example.onion"}).
Return(nil)
params := api.ResSearchParams{
URL: "https://example.onion",
PageSize: 1,
PageNumber: 1,
}
apiClientMock.EXPECT().
SearchResources("https://example.onion", "", time.Time{}, time.Time{}, 1, 1).
SearchResources(&params).
Return([]api.ResourceDto{}, int64(1), nil)
if err := handleMessage(apiClientMock, -1, []string{"png"})(subscriberMock, &msg); err != nil {
s := state{
apiClient: apiClientMock,
refreshDelay: -1,
forbiddenExtensions: []string{"png"},
}
if err := s.handleURLFoundEvent(subscriberMock, msg); !errors.Is(err, errShouldNotSchedule) {
t.FailNow()
}
}
@ -73,42 +100,110 @@ func TestHandleMessageForbiddenExtensions(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
msg := nats.Msg{}
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(&msg, &messaging.URLFoundMsg{}).
SetArg(1, messaging.URLFoundMsg{URL: "https://example.onion/image.png?id=12&test=2"}).
Read(msg, &event.FoundURLEvent{}).
SetArg(1, event.FoundURLEvent{URL: "https://example.onion/image.png?id=12&test=2"}).
Return(nil)
if err := handleMessage(apiClientMock, -1, []string{"png"})(subscriberMock, &msg); err != nil {
s := state{
apiClient: apiClientMock,
refreshDelay: -1,
forbiddenExtensions: []string{"png"},
}
if err := s.handleURLFoundEvent(subscriberMock, msg); !errors.Is(err, errExtensionNotAllowed) {
t.FailNow()
}
}
// TestHandleMessageHostnameForbidden makes sure URLs whose hostname is listed
// as forbidden are rejected with errHostnameNotAllowed. The cases include URLs
// with a port, a path and query string, and a "www." subdomain.
func TestHandleMessageHostnameForbidden(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	apiMock := api_mock.NewMockAPI(ctrl)
	subMock := event_mock.NewMockSubscriber(ctrl)

	cases := []struct {
		url                string
		forbiddenHostnames []string
	}{
		{
			url:                "https://facebookcorewwwi.onion/image.png?id=12&test=2",
			forbiddenHostnames: []string{"facebookcorewwwi.onion"},
		},
		{
			url:                "https://google.onion:9099",
			forbiddenHostnames: []string{"google.onion"},
		},
		{
			url:                "http://facebook.onion:443/news/test.php?id=12&username=test",
			forbiddenHostnames: []string{"facebook.onion"},
		},
		{
			url:                "https://www.facebookcorewwwi.onion/recover/initiate?ars=facebook_login",
			forbiddenHostnames: []string{"facebookcorewwwi.onion"},
		},
	}

	for i := range cases {
		tc := cases[i]

		body := bytes.NewReader(nil)
		subMock.EXPECT().
			Read(body, &event.FoundURLEvent{}).
			SetArg(1, event.FoundURLEvent{URL: tc.url}).
			Return(nil)

		sched := state{
			apiClient:           apiMock,
			refreshDelay:        -1,
			forbiddenExtensions: []string{},
			forbiddenHostnames:  tc.forbiddenHostnames,
		}

		err := sched.handleURLFoundEvent(subMock, body)
		if !errors.Is(err, errHostnameNotAllowed) {
			t.Errorf("%s has not returned errHostnameNotAllowed", tc.url)
		}
	}
}
func TestHandleMessage(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
apiClientMock := api_mock.NewMockClient(mockCtrl)
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
apiClientMock := api_mock.NewMockAPI(mockCtrl)
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
msg := nats.Msg{}
msg := bytes.NewReader(nil)
subscriberMock.EXPECT().
ReadMsg(&msg, &messaging.URLFoundMsg{}).
SetArg(1, messaging.URLFoundMsg{URL: "https://example.onion"}).
Read(msg, &event.FoundURLEvent{}).
SetArg(1, event.FoundURLEvent{URL: "https://example.onion"}).
Return(nil)
params := api.ResSearchParams{
URL: "https://example.onion",
PageSize: 1,
PageNumber: 1,
}
apiClientMock.EXPECT().
SearchResources("https://example.onion", "", time.Time{}, time.Time{}, 1, 1).
SearchResources(&params).
Return([]api.ResourceDto{}, int64(0), nil)
subscriberMock.EXPECT().
PublishMsg(&messaging.URLTodoMsg{URL: "https://example.onion"}).
Publish(&event.NewURLEvent{URL: "https://example.onion"}).
Return(nil)
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, &msg); err != nil {
s := state{
apiClient: apiClientMock,
refreshDelay: -1,
forbiddenExtensions: []string{},
}
if err := s.handleURLFoundEvent(subscriberMock, msg); err != nil {
t.FailNow()
}
}

@ -2,6 +2,7 @@ package trandoshanctl
import (
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/olekukonko/tablewriter"
@ -19,7 +20,7 @@ func GetApp() *cli.App {
return &cli.App{
Name: "trandoshanctl",
Version: "0.6.0",
Version: "0.7.0",
Usage: "Trandoshan CLI",
Flags: []cli.Flag{
logging.GetLogFlag(),
@ -75,7 +76,13 @@ func search(c *cli.Context) error {
// Create the API client
apiClient := util.GetAPIClient(c)
res, count, err := apiClient.SearchResources("", keyword, time.Time{}, time.Time{}, 1, 10)
// Show the first page of (at most) 10 results.
// NOTE(review): the positional call this replaces passed (…, 1, 10); this
// assumes that meant page 1 with a page size of 10, so the two fields were
// swapped. Confirm against the previous SearchResources signature.
params := api.ResSearchParams{
	Keyword:    keyword,
	WithBody:   false,
	PageSize:   10,
	PageNumber: 1,
}
res, count, err := apiClient.SearchResources(&params)
if err != nil {
log.Err(err).Str("keyword", keyword).Msg("Unable to search resources")
return err

@ -0,0 +1,12 @@
package util
import "github.com/urfave/cli/v2"
// GetHubURI builds the required "hub-uri" CLI flag, which carries the URI of
// the hub (event) server.
func GetHubURI() *cli.StringFlag {
	flag := cli.StringFlag{
		Name:     "hub-uri",
		Usage:    "URI to the hub (event) server",
		Required: true,
	}

	return &flag
}

@ -24,6 +24,6 @@ func GetAPIURIFlag() *cli.StringFlag {
}
// GetAPIClient return a new configured API client
func GetAPIClient(c *cli.Context) api.Client {
func GetAPIClient(c *cli.Context) api.API {
return api.NewClient(c.String("api-uri"), c.String("api-token"))
}

@ -1,12 +0,0 @@
package util
import "github.com/urfave/cli/v2"
// GetNATSURIFlag builds the required "nats-uri" CLI flag, which carries the
// URI of the NATS server.
func GetNATSURIFlag() *cli.StringFlag {
	flag := cli.StringFlag{
		Name:     "nats-uri",
		Usage:    "URI to the NATS server",
		Required: true,
	}

	return &flag
}
Loading…
Cancel
Save