You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

140 lines
3.6 KiB

package crawler
import (
natsutil ""
// defaultUserAgent is the default value of the "user-agent" flag.
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
// GetApp return the crawler app
func GetApp() *cli.App {
return &cli.App{
Name: "tdsh-crawler",
Version: "0.3.0",
Usage: "Trandoshan crawler process",
Flags: []cli.Flag{
Name: "nats-uri",
Usage: "URI to the NATS server",
Required: true,
Name: "tor-uri",
Usage: "URI to the TOR SOCKS proxy",
Required: true,
Name: "user-agent",
Usage: "User agent to use",
Value: defaultUserAgent,
Name: "allowed-ct",
Usage: "Content types allowed to crawl",
Value: cli.NewStringSlice("text/"),
Action: execute,
func execute(ctx *cli.Context) error {
log.Info().Str("ver", ctx.App.Version).Msg("Starting tdsh-crawler")
log.Debug().Str("uri", ctx.String("nats-uri")).Msg("Using NATS server")
log.Debug().Str("uri", ctx.String("tor-uri")).Msg("Using TOR proxy")
log.Debug().Strs("content-types", ctx.StringSlice("allowed-ct")).Msg("Allowed content types")
// Create the HTTP client
httpClient := &fasthttp.Client{
// Use given TOR proxy to reach the hidden services
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
// Disable SSL verification since we do not really care about this
TLSConfig: &tls.Config{InsecureSkipVerify: true},
ReadTimeout: time.Second * 5,
WriteTimeout: time.Second * 5,
Name: ctx.String("user-agent"),
// Create the NATS subscriber
sub, err := natsutil.NewSubscriber(ctx.String("nats-uri"))
if err != nil {
return err
defer sub.Close()
log.Info().Msg("Successfully initialized tdsh-crawler. Waiting for URLs")
if err := sub.QueueSubscribe(proto.URLTodoSubject, "crawlers",
handleMessage(httpClient, ctx.StringSlice("allowed-ct"))); err != nil {
return err
return nil
func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) natsutil.MsgHandler {
return func(nc *nats.Conn, msg *nats.Msg) error {
var urlMsg proto.URLTodoMsg
if err := natsutil.ReadMsg(msg, &urlMsg); err != nil {
return err
log.Debug().Str("url", urlMsg.URL).Msg("Processing URL")
// Query the website
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
if err := httpClient.Do(req, resp); err != nil {
log.Err(err).Msg("Error while crawling website")
return err
// Determinate if content type is allowed
allowed := false
contentType := string(resp.Header.Peek("Content-Type"))
for _, allowedContentType := range allowedContentTypes {
if strings.Contains(contentType, allowedContentType) {
allowed = true
if !allowed {
log.Debug().Str("content-type", contentType).Msg("Discarding forbidden content type")
return nil
body := string(resp.Body())
// Publish resource body
res := proto.NewResourceMsg{
URL: urlMsg.URL,
Body: body,
if err := natsutil.PublishMsg(nc, &res); err != nil {
log.Err(err).Msg("Error while publishing resource body")
return nil