You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
140 lines
3.6 KiB
Go
140 lines
3.6 KiB
Go
package crawler
|
|
|
|
import (
|
|
"crypto/tls"
|
|
"github.com/creekorful/trandoshan/internal/util/logging"
|
|
natsutil "github.com/creekorful/trandoshan/internal/util/nats"
|
|
"github.com/creekorful/trandoshan/pkg/proto"
|
|
"github.com/nats-io/nats.go"
|
|
"github.com/rs/zerolog/log"
|
|
"github.com/urfave/cli/v2"
|
|
"github.com/valyala/fasthttp"
|
|
"github.com/valyala/fasthttp/fasthttpproxy"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// defaultUserAgent is the default value of the "user-agent" flag, used when
// no other user agent is supplied on the command line.
const defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0"
|
|
|
|
// GetApp return the crawler app
|
|
func GetApp() *cli.App {
|
|
return &cli.App{
|
|
Name: "tdsh-crawler",
|
|
Version: "0.3.0",
|
|
Usage: "Trandoshan crawler process",
|
|
Flags: []cli.Flag{
|
|
logging.GetLogFlag(),
|
|
&cli.StringFlag{
|
|
Name: "nats-uri",
|
|
Usage: "URI to the NATS server",
|
|
Required: true,
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "tor-uri",
|
|
Usage: "URI to the TOR SOCKS proxy",
|
|
Required: true,
|
|
},
|
|
&cli.StringFlag{
|
|
Name: "user-agent",
|
|
Usage: "User agent to use",
|
|
Value: defaultUserAgent,
|
|
},
|
|
&cli.StringSliceFlag{
|
|
Name: "allowed-ct",
|
|
Usage: "Content types allowed to crawl",
|
|
Value: cli.NewStringSlice("text/"),
|
|
},
|
|
},
|
|
Action: execute,
|
|
}
|
|
}
|
|
|
|
func execute(ctx *cli.Context) error {
|
|
logging.ConfigureLogger(ctx)
|
|
|
|
log.Info().Str("ver", ctx.App.Version).Msg("Starting tdsh-crawler")
|
|
|
|
log.Debug().Str("uri", ctx.String("nats-uri")).Msg("Using NATS server")
|
|
log.Debug().Str("uri", ctx.String("tor-uri")).Msg("Using TOR proxy")
|
|
log.Debug().Strs("content-types", ctx.StringSlice("allowed-ct")).Msg("Allowed content types")
|
|
|
|
// Create the HTTP client
|
|
httpClient := &fasthttp.Client{
|
|
// Use given TOR proxy to reach the hidden services
|
|
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
|
|
// Disable SSL verification since we do not really care about this
|
|
TLSConfig: &tls.Config{InsecureSkipVerify: true},
|
|
ReadTimeout: time.Second * 5,
|
|
WriteTimeout: time.Second * 5,
|
|
Name: ctx.String("user-agent"),
|
|
}
|
|
|
|
// Create the NATS subscriber
|
|
sub, err := natsutil.NewSubscriber(ctx.String("nats-uri"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer sub.Close()
|
|
|
|
log.Info().Msg("Successfully initialized tdsh-crawler. Waiting for URLs")
|
|
|
|
if err := sub.QueueSubscribe(proto.URLTodoSubject, "crawlers",
|
|
handleMessage(httpClient, ctx.StringSlice("allowed-ct"))); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) natsutil.MsgHandler {
|
|
return func(nc *nats.Conn, msg *nats.Msg) error {
|
|
var urlMsg proto.URLTodoMsg
|
|
if err := natsutil.ReadMsg(msg, &urlMsg); err != nil {
|
|
return err
|
|
}
|
|
|
|
log.Debug().Str("url", urlMsg.URL).Msg("Processing URL")
|
|
|
|
// Query the website
|
|
req := fasthttp.AcquireRequest()
|
|
resp := fasthttp.AcquireResponse()
|
|
defer fasthttp.ReleaseRequest(req)
|
|
defer fasthttp.ReleaseResponse(resp)
|
|
|
|
req.SetRequestURI(urlMsg.URL)
|
|
|
|
if err := httpClient.Do(req, resp); err != nil {
|
|
log.Err(err).Msg("Error while crawling website")
|
|
return err
|
|
}
|
|
|
|
// Determinate if content type is allowed
|
|
allowed := false
|
|
contentType := string(resp.Header.Peek("Content-Type"))
|
|
for _, allowedContentType := range allowedContentTypes {
|
|
if strings.Contains(contentType, allowedContentType) {
|
|
allowed = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if !allowed {
|
|
log.Debug().Str("content-type", contentType).Msg("Discarding forbidden content type")
|
|
return nil
|
|
}
|
|
|
|
body := string(resp.Body())
|
|
|
|
// Publish resource body
|
|
res := proto.NewResourceMsg{
|
|
URL: urlMsg.URL,
|
|
Body: body,
|
|
}
|
|
if err := natsutil.PublishMsg(nc, &res); err != nil {
|
|
log.Err(err).Msg("Error while publishing resource body")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
}
|