Implement crawler process

- Also change module URL.
- Create natsutil package
pull/3/head
Aloïs Micard 4 years ago
parent 2f17ee088a
commit 06f31f8d9c
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -1 +1,26 @@
# Trandoshan dark web crawler
# Trandoshan dark web crawler
This repository is a complete rewrite of the Trandoshan dark web crawler. Everything has been written inside a single
Git repository to ease maintenance.
## Why a rewrite?
The first version of Trandoshan [(available here)](https://github.com/trandoshan-io) works well but is
not really professional: the code is starting to become a mess, it is hard to manage since it is split across multiple repositories, etc.
I have therefore decided to create & maintain the project in this specific directory, where all process code will be available
(as a Go module).
## How to start the crawler
Since the Docker images are not available yet, you must run the following script to fully build the crawler.
```shell script
./scripts/build.sh
```
The crawler can be started using the start script:
```shell script
./scripts/start.sh
```

@ -8,7 +8,7 @@ WORKDIR /app
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan-crawler/cmd/crawler
RUN go build -v github.com/creekorful/trandoshan/cmd/crawler
# runtime image
FROM alpine:latest

@ -8,7 +8,7 @@ WORKDIR /app
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan-crawler/cmd/feeder
RUN go build -v github.com/creekorful/trandoshan/cmd/feeder
# runtime image
FROM alpine:latest

@ -1,7 +1,7 @@
package main
import (
"github.com/creekorful/trandoshan-crawler/internal/crawler"
"github.com/creekorful/trandoshan/internal/crawler"
"os"
)

@ -1,7 +1,7 @@
package main
import (
"github.com/creekorful/trandoshan-crawler/internal/feeder"
"github.com/creekorful/trandoshan/internal/feeder"
"os"
)

@ -11,13 +11,8 @@ services:
driver: none
crawler:
image: trandoshan.io/crawler:latest
command: --nats-uri nats --tor-uri torproxy
command: --log-level debug --nats-uri nats --tor-uri torproxy
restart: always
depends_on:
- nats
- proxy
feeder:
image: trandoshan.io/feeder:latest
command: --nats-uri nats --url https://google.com
depends_on:
- nats
- proxy

@ -1,4 +1,4 @@
module github.com/creekorful/trandoshan-crawler
module github.com/creekorful/trandoshan
go 1.14
@ -6,4 +6,6 @@ require (
github.com/nats-io/nats.go v1.9.2
github.com/sirupsen/logrus v1.5.0
github.com/urfave/cli/v2 v2.2.0
github.com/valyala/fasthttp v1.9.0
mvdan.cc/xurls/v2 v2.1.0 // indirect
)

@ -2,6 +2,10 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/klauspost/compress v1.8.2 h1:Bx0qjetmNjdFXASH02NSAREKpiaDwkO1DRZ3dV2KCcs=
github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/cpuid v1.2.1 h1:vJi+O/nMdFt0vqm8NZBI6wzALWdA2X+egi0ogNyrC/w=
github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/nats-io/jwt v0.3.2 h1:+RB5hMpXUUA2dfxuhBTEkMOrYmM+gKIZYS1KjSostMI=
@ -24,11 +28,17 @@ github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4=
github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.9.0 h1:hNpmUdy/+ZXYpGy0OBfm7K0UQTzb73W0T0U4iJIVrMw=
github.com/valyala/fasthttp v1.9.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 h1:3zb4D3T4G8jdExgVU/95+vQXfpEPiMdCaZgmGVxjNHM=
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc=
@ -36,3 +46,6 @@ golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
mvdan.cc/xurls v1.1.0 h1:kj0j2lonKseISJCiq1Tfk+iTv65dDGCl0rTbanXJGGc=
mvdan.cc/xurls/v2 v2.1.0 h1:KaMb5GLhlcSX+e+qhbRJODnUUBvlw01jt4yrjFIHAuA=
mvdan.cc/xurls/v2 v2.1.0/go.mod h1:5GrSd9rOnKOpZaji1OZLYL/yeAAtGDlo/cFe+8K5n8E=

@ -4,15 +4,20 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/creekorful/trandoshan/internal/natsutil"
"github.com/nats-io/nats.go"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
"github.com/valyala/fasthttp"
"mvdan.cc/xurls/v2"
)
// TodoSubject is the NATS subject on which the feeder publishes URLs to crawl.
const TodoSubject = "todo"

// DoneSubject is the NATS subject on which handleMessage publishes every URL
// it extracts from a fetched page.
const DoneSubject = "done"
// UrlMessage is the JSON payload carrying a single URL. handleMessage decodes
// it from incoming NATS messages and publishes one for every URL it finds.
type UrlMessage struct {
// Url is the URL string, serialized under the JSON key "url".
Url string `json:"url"`
}
@ -54,7 +59,7 @@ func execute(ctx *cli.Context) error {
logrus.Infof("Starting trandoshan-crawler v%s", ctx.App.Version)
logrus.Debugf("Using NATS server at: %s", ctx.String("nats-uri"))
logrus.Debugf("Using tor proxy at: %s", ctx.String("tor-uri"))
logrus.Debugf("Using TOR proxy at: %s", ctx.String("tor-uri"))
// Connect to the NATS server
nc, err := nats.Connect(ctx.String("nats-uri"))
@ -82,7 +87,7 @@ func execute(ctx *cli.Context) error {
}
// ... And process it
if err := handleMessage(msg); err != nil {
if err := handleMessage(nc, msg); err != nil {
logrus.Warnf("Skipping current message because of error: %s", err)
continue
}
@ -91,13 +96,32 @@ func execute(ctx *cli.Context) error {
return nil
}
// handleMessage decodes msg.Data as a JSON UrlMessage, fetches that URL,
// extracts every URL found in the response body and publishes each of them
// on DoneSubject. It returns an error when decoding or fetching fails;
// publish failures are only logged.
// NOTE(review): this span shows both the previous and the updated signature
// and log line back to back, as in a diff view — confirm which lines are live.
func handleMessage(msg *nats.Msg) error {
func handleMessage(nc *nats.Conn, msg *nats.Msg) error {
var urlMsg UrlMessage
if err := json.Unmarshal(msg.Data, &urlMsg); err != nil {
return fmt.Errorf("error while decoding message: %s", err)
}
logrus.Infof("Processing url: %s", urlMsg.Url)
logrus.Debugf("Processing URL: %s", urlMsg.Url)
// Fetch the page body; the status code returned by Get is discarded.
// NOTE(review): the client is built with default settings, so the tor-uri
// option logged in execute is not applied here — confirm whether requests
// are meant to go through the proxy.
httpClient := fasthttp.Client{}
_, body, err := httpClient.Get(nil, urlMsg.Url)
if err != nil {
return err
}
// Extract URLs
// -1 asks FindAllString for every match rather than a bounded number.
xu := xurls.Strict()
urls := xu.FindAllString(string(body), -1)
// Publish found URLs
// A failed publish is logged as a warning and the loop moves on to the next URL.
for _, url := range urls {
logrus.Debugf("Found URL: %s", url)
if err := natsutil.PublishJson(nc, DoneSubject, &UrlMessage{Url: url}); err != nil {
logrus.Warnf("Error while publishing URL: %s", err)
}
}
return nil // TODO
}
}

@ -1,8 +1,8 @@
package feeder
import (
"encoding/json"
"github.com/creekorful/trandoshan-crawler/internal/crawler"
"github.com/creekorful/trandoshan/internal/crawler"
"github.com/creekorful/trandoshan/internal/natsutil"
"github.com/nats-io/nats.go"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
@ -54,20 +54,13 @@ func execute(ctx *cli.Context) error {
}
defer nc.Close()
// Marshal the message
msg, err := json.Marshal(&crawler.UrlMessage{Url: ctx.String("url")})
if err != nil {
logrus.Errorf("Unable to marshal message: %s", err)
return err
}
// Publish the message
if err := nc.Publish(crawler.TodoSubject, msg); err != nil {
logrus.Errorf("Unable to publish message: %s", err)
if err := natsutil.PublishJson(nc, crawler.TodoSubject, &crawler.UrlMessage{Url: ctx.String("url")}); err != nil {
logrus.Errorf("Unable to publish URL: %s", err)
return err
}
logrus.Infof("Url %s successfully sent to the crawler", ctx.String("url"))
logrus.Infof("URL %s successfully sent to the crawler", ctx.String("url"))
return nil
}

@ -0,0 +1,16 @@
package natsutil
import (
"encoding/json"
"github.com/nats-io/nats.go"
)
// PublishJson encodes msg as JSON and publishes the resulting bytes on the
// given subject through nc. If msg cannot be marshalled the encoding error is
// returned and nothing is published; otherwise the result of nc.Publish is
// returned.
func PublishJson(nc *nats.Conn, subject string, msg interface{}) error {
	var (
		payload []byte
		err     error
	)

	if payload, err = json.Marshal(msg); err != nil {
		return err
	}

	return nc.Publish(subject, payload)
}
Loading…
Cancel
Save