Big improvements
- Reduce debug noise
- Create scripts to blacklist 'famous' legit hostnames
- Make blacklister more resilient
- Merge archiver & indexer together
- Better prefix for cache key
- Rework scheduling process
- Update architecture.png
- Remove trandoshanctl
- Improve testing

pull/121/head
parent
2d7499f7e2
commit
ec3357be5d
@ -1,24 +0,0 @@
|
||||
# build image
|
||||
FROM golang:1.15.0-alpine as builder
|
||||
|
||||
RUN apk update && apk upgrade && \
|
||||
apk add --no-cache bash git openssh
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy the module files and download dependencies first so they are cached and builds are faster
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
|
||||
COPY . .
|
||||
|
||||
# Build app (no tests are run in this image)
|
||||
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-archiver
|
||||
|
||||
# runtime image
|
||||
FROM alpine:latest
|
||||
COPY --from=builder /app/tdsh-archiver /app/
|
||||
|
||||
WORKDIR /app/
|
||||
|
||||
ENTRYPOINT ["./tdsh-archiver"]
|
@ -1,24 +0,0 @@
|
||||
# build image
|
||||
FROM golang:1.15.0-alpine as builder
|
||||
|
||||
RUN apk update && apk upgrade && \
|
||||
apk add --no-cache bash git openssh
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy the module files and download dependencies first so they are cached and builds are faster
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
|
||||
COPY . .
|
||||
|
||||
# Build app (no tests are run in this image)
|
||||
RUN go build -v github.com/creekorful/trandoshan/cmd/trandoshanctl
|
||||
|
||||
# runtime image
|
||||
FROM alpine:latest
|
||||
COPY --from=builder /app/trandoshanctl /app/
|
||||
|
||||
WORKDIR /app/
|
||||
|
||||
ENTRYPOINT ["./trandoshanctl"]
|
@ -1,14 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/archiver"
|
||||
"github.com/creekorful/trandoshan/internal/process"
|
||||
"os"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app := process.MakeApp(&archiver.State{})
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
@ -1,13 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/trandoshanctl"
|
||||
"os"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app := trandoshanctl.GetApp()
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
Binary file not shown.
Before Width: | Height: | Size: 54 KiB After Width: | Height: | Size: 59 KiB |
@ -1,99 +0,0 @@
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/creekorful/trandoshan/internal/archiver/storage"
|
||||
"github.com/creekorful/trandoshan/internal/event"
|
||||
"github.com/creekorful/trandoshan/internal/process"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/urfave/cli/v2"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// State represent the application state
|
||||
type State struct {
|
||||
storage storage.Storage
|
||||
}
|
||||
|
||||
// Name return the process name
|
||||
func (state *State) Name() string {
|
||||
return "archiver"
|
||||
}
|
||||
|
||||
// CommonFlags return process common flags
|
||||
func (state *State) CommonFlags() []string {
|
||||
return []string{process.HubURIFlag}
|
||||
}
|
||||
|
||||
// CustomFlags return process custom flags
|
||||
func (state *State) CustomFlags() []cli.Flag {
|
||||
return []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "storage-dir",
|
||||
Usage: "Path to the storage directory",
|
||||
Required: true,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the process
|
||||
func (state *State) Initialize(provider process.Provider) error {
|
||||
st, err := storage.NewLocalStorage(provider.GetValue("storage-dir"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
state.storage = st
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Subscribers return the process subscribers
|
||||
func (state *State) Subscribers() []process.SubscriberDef {
|
||||
return []process.SubscriberDef{
|
||||
{Exchange: event.NewIndexExchange, Queue: "archivingQueue", Handler: state.handleNewIndexEvent},
|
||||
}
|
||||
}
|
||||
|
||||
// HTTPHandler returns the HTTP API the process expose
|
||||
func (state *State) HTTPHandler(provider process.Provider) http.Handler {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (state *State) handleNewIndexEvent(subscriber event.Subscriber, msg event.RawMessage) error {
|
||||
var evt event.NewIndexEvent
|
||||
if err := subscriber.Read(&msg, &evt); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
res, err := formatResource(&evt)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error while formatting resource: %s", err)
|
||||
}
|
||||
|
||||
if err := state.storage.Store(evt.URL, evt.Time, res); err != nil {
|
||||
return fmt.Errorf("error while storing resource: %s", err)
|
||||
}
|
||||
|
||||
log.Debug().Str("url", evt.URL).Msg("Successfully archived resource")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func formatResource(evt *event.NewIndexEvent) ([]byte, error) {
|
||||
builder := strings.Builder{}
|
||||
|
||||
// First URL
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", evt.URL))
|
||||
|
||||
// Then headers
|
||||
for key, value := range evt.Headers {
|
||||
builder.WriteString(fmt.Sprintf("%s: %s\n", key, value))
|
||||
}
|
||||
builder.WriteString("\n")
|
||||
|
||||
// Then body
|
||||
builder.WriteString(evt.Body)
|
||||
|
||||
return []byte(builder.String()), nil
|
||||
}
|
@ -1,55 +0,0 @@
|
||||
package archiver
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/archiver/storage_mock"
|
||||
"github.com/creekorful/trandoshan/internal/event"
|
||||
"github.com/creekorful/trandoshan/internal/event_mock"
|
||||
"github.com/golang/mock/gomock"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestHandleNewResourceEvent(t *testing.T) {
|
||||
mockCtrl := gomock.NewController(t)
|
||||
defer mockCtrl.Finish()
|
||||
|
||||
subscriberMock := event_mock.NewMockSubscriber(mockCtrl)
|
||||
storageMock := storage_mock.NewMockStorage(mockCtrl)
|
||||
|
||||
tn := time.Now()
|
||||
|
||||
msg := event.RawMessage{}
|
||||
subscriberMock.EXPECT().
|
||||
Read(&msg, &event.NewIndexEvent{}).
|
||||
SetArg(1, event.NewIndexEvent{
|
||||
URL: "https://example.onion",
|
||||
Body: "Hello, world",
|
||||
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
|
||||
Time: tn,
|
||||
}).Return(nil)
|
||||
|
||||
storageMock.EXPECT().Store("https://example.onion", tn, []byte("https://example.onion\n\nServer: Traefik\nContent-Type: application/html\n\nHello, world")).Return(nil)
|
||||
|
||||
s := State{storage: storageMock}
|
||||
if err := s.handleNewIndexEvent(subscriberMock, msg); err != nil {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatResource(t *testing.T) {
|
||||
evt := &event.NewIndexEvent{
|
||||
URL: "https://google.com",
|
||||
Body: "Hello, world",
|
||||
Headers: map[string]string{"Server": "Traefik", "Content-Type": "text/html"},
|
||||
Time: time.Now(),
|
||||
}
|
||||
|
||||
res, err := formatResource(evt)
|
||||
if err != nil {
|
||||
t.FailNow()
|
||||
}
|
||||
|
||||
if string(res) != "https://google.com\n\nServer: Traefik\nContent-Type: text/html\n\nHello, world" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
@ -1,11 +0,0 @@
|
||||
package storage
|
||||
|
||||
import "time"
|
||||
|
||||
//go:generate mockgen -destination=../storage_mock/storage_mock.go -package=storage_mock . Storage
|
||||
|
||||
// Storage is an abstraction layer where resources are stored
|
||||
type Storage interface {
|
||||
// Store the resource
|
||||
Store(url string, time time.Time, body []byte) error
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
package cache
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestRedisCache_GetKey(t *testing.T) {
|
||||
rc := redisCache{}
|
||||
if got := rc.getKey("user"); got != "user" {
|
||||
t.Errorf("got %s want %s", got, "user")
|
||||
}
|
||||
|
||||
rc.keyPrefix = "config"
|
||||
if got := rc.getKey("user"); got != "config:user" {
|
||||
t.Errorf("got %s want %s", got, "config:user")
|
||||
}
|
||||
}
|
@ -1,124 +0,0 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/dgrijalva/jwt-go"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/rs/zerolog/log"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type key int
|
||||
|
||||
const (
|
||||
usernameKey key = iota
|
||||
)
|
||||
|
||||
// Token is the authentication token used by processes when dialing with the API
|
||||
type Token struct {
|
||||
// Username used for logging purposes
|
||||
Username string `json:"username"`
|
||||
|
||||
// Rights that the token provides
|
||||
// Format is: METHOD - list of paths
|
||||
Rights map[string][]string `json:"rights"`
|
||||
}
|
||||
|
||||
// Middleware is the authentication middleware
|
||||
type Middleware struct {
|
||||
signingKey []byte
|
||||
}
|
||||
|
||||
// NewMiddleware create a new Middleware instance with given secret token signing key
|
||||
func NewMiddleware(signingKey []byte) *Middleware {
|
||||
return &Middleware{signingKey: signingKey}
|
||||
}
|
||||
|
||||
// Middleware return an net/http compatible middleware func to use
|
||||
func (m *Middleware) Middleware() mux.MiddlewareFunc {
|
||||
return func(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Extract authorization header
|
||||
tokenStr := r.Header.Get("Authorization")
|
||||
if tokenStr == "" {
|
||||
log.Warn().Msg("missing token")
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
tokenStr = strings.TrimPrefix(tokenStr, "Bearer ")
|
||||
|
||||
// Decode the JWT token
|
||||
token, err := jwt.Parse(tokenStr, func(t *jwt.Token) (interface{}, error) {
|
||||
// Validate expected alg
|
||||
if v, ok := t.Method.(*jwt.SigningMethodHMAC); !ok || v.Name != "HS256" {
|
||||
return nil, fmt.Errorf("unexpected signing method: %s", t.Header["alg"])
|
||||
}
|
||||
|
||||
// Return signing secret
|
||||
return m.signingKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Err(err).Msg("error while decoding JWT token")
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
// From here we have a valid JWT token, extract claims
|
||||
claims, ok := token.Claims.(jwt.MapClaims)
|
||||
if !ok {
|
||||
log.Err(err).Msg("error while decoding token claims")
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
rights := map[string][]string{}
|
||||
for method, paths := range claims["rights"].(map[string]interface{}) {
|
||||
for _, path := range paths.([]interface{}) {
|
||||
rights[method] = append(rights[method], path.(string))
|
||||
}
|
||||
}
|
||||
|
||||
t := Token{
|
||||
Username: claims["username"].(string),
|
||||
Rights: rights,
|
||||
}
|
||||
|
||||
// Validate rights
|
||||
paths, contains := t.Rights[r.Method]
|
||||
if !contains {
|
||||
log.Warn().
|
||||
Str("username", t.Username).
|
||||
Str("method", r.Method).
|
||||
Str("resource", r.URL.Path).
|
||||
Msg("Access to resources is unauthorized")
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
authorized := false
|
||||
for _, path := range paths {
|
||||
if path == r.URL.Path {
|
||||
authorized = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !authorized {
|
||||
log.Warn().
|
||||
Str("username", t.Username).
|
||||
Str("method", r.Method).
|
||||
Str("resource", r.URL.Path).
|
||||
Msg("Access to resources is unauthorized")
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
// Everything's fine, call next handler ;D
|
||||
ctx := context.WithValue(r.Context(), usernameKey, t.Username)
|
||||
next.ServeHTTP(w, r.WithContext(ctx))
|
||||
})
|
||||
}
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMiddleware_NoTokenShouldReturnUnauthorized(t *testing.T) {
|
||||
m := (&Middleware{signingKey: []byte("test")}).Middleware()(okHandler())
|
||||
|
||||
// no token shouldn't be able to access
|
||||
req := httptest.NewRequest(http.MethodGet, "/users", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
m.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusUnauthorized {
|
||||
t.Errorf("StatusUnauthorized was expected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMiddleware_InvalidTokenShouldReturnUnauthorized(t *testing.T) {
|
||||
m := (&Middleware{signingKey: []byte("test")}).Middleware()(okHandler())
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/users", nil)
|
||||
req.Header.Add("Authorization", "zarBR")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
m.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusUnauthorized {
|
||||
t.Errorf("StatusUnauthorized was expected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMiddleware_BadRightsShouldReturnUnauthorized(t *testing.T) {
|
||||
m := (&Middleware{signingKey: []byte("test")}).Middleware()(okHandler())
|
||||
|
||||
req := httptest.NewRequest(http.MethodPost, "/users", nil)
|
||||
req.Header.Add("Authorization", "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6IkpvaG4gRG9lIiwicmlnaHRzIjp7IkdFVCI6WyIvdXNlcnMiXSwiUE9TVCI6WyIvc2VhcmNoIl19fQ.fRx0Q66ZgnY_rKCf-9Vaz6gzGKH_tKSgkVHhoQMtKfM")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
m.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusUnauthorized {
|
||||
t.Errorf("StatusUnauthorized was expected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMiddleware(t *testing.T) {
|
||||
m := (&Middleware{signingKey: []byte("test")}).Middleware()(okHandler())
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/users?id=10", nil)
|
||||
req.Header.Add("Authorization", "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VybmFtZSI6IkpvaG4gRG9lIiwicmlnaHRzIjp7IkdFVCI6WyIvdXNlcnMiXSwiUE9TVCI6WyIvc2VhcmNoIl19fQ.fRx0Q66ZgnY_rKCf-9Vaz6gzGKH_tKSgkVHhoQMtKfM")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
m.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Errorf("StatusUnauthorized was expected")
|
||||
}
|
||||
|
||||
b, err := ioutil.ReadAll(rec.Body)
|
||||
if err != nil {
|
||||
t.Fail()
|
||||
}
|
||||
if string(b) != "Hello, John Doe" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
func okHandler() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if username := r.Context().Value(usernameKey).(string); username != "" {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(fmt.Sprintf("Hello, %s", username)))
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
}
|
@ -1,141 +0,0 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"github.com/go-resty/resty/v2"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
//go:generate mockgen -destination=../client_mock/client_mock.go -package=client_mock . Client
|
||||
|
||||
const (
|
||||
// PaginationPageHeader is the header to determinate current page in paginated endpoint
|
||||
PaginationPageHeader = "X-Pagination-Page"
|
||||
// PaginationSizeHeader is the header to determinate page size in paginated endpoint
|
||||
PaginationSizeHeader = "X-Pagination-Size"
|
||||
// PaginationCountHeader is the header to determinate total count of element in paginated endpoint
|
||||
PaginationCountHeader = "X-Pagination-Count"
|
||||
// PaginationPageQueryParam is the query parameter used to set current page in paginated endpoint
|
||||
PaginationPageQueryParam = "pagination-page"
|
||||
// PaginationSizeQueryParam is the query parameter used to set page size in paginated endpoint
|
||||
PaginationSizeQueryParam = "pagination-size"
|
||||
)
|
||||
|
||||
// ResourceDto represent a resource as given by the API
|
||||
type ResourceDto struct {
|
||||
URL string `json:"url"`
|
||||
Body string `json:"body"`
|
||||
Time time.Time `json:"time"`
|
||||
Title string `json:"title"`
|
||||
Meta map[string]string `json:"meta"`
|
||||
Description string `json:"description"`
|
||||
Headers map[string]string `json:"headers"`
|
||||
}
|
||||
|
||||
// CredentialsDto represent the credential when logging in the API
|
||||
type CredentialsDto struct {
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
}
|
||||
|
||||
// ResSearchParams is the search params used
|
||||
type ResSearchParams struct {
|
||||
URL string
|
||||
Keyword string
|
||||
StartDate time.Time
|
||||
EndDate time.Time
|
||||
WithBody bool
|
||||
PageSize int
|
||||
PageNumber int
|
||||
// TODO allow searching by meta
|
||||
// TODO allow searching by headers
|
||||
}
|
||||
|
||||
// Client is the interface to interact with the indexer API
|
||||
type Client interface {
|
||||
SearchResources(params *ResSearchParams) ([]ResourceDto, int64, error)
|
||||
ScheduleURL(url string) error
|
||||
}
|
||||
|
||||
type client struct {
|
||||
httpClient *resty.Client
|
||||
baseURL string
|
||||
}
|
||||
|
||||
func (c *client) SearchResources(params *ResSearchParams) ([]ResourceDto, int64, error) {
|
||||
targetEndpoint := fmt.Sprintf("%s/v1/resources?", c.baseURL)
|
||||
|
||||
req := c.httpClient.R()
|
||||
|
||||
if params.URL != "" {
|
||||
b64URL := base64.URLEncoding.EncodeToString([]byte(params.URL))
|
||||
req.SetQueryParam("url", b64URL)
|
||||
}
|
||||
|
||||
if params.Keyword != "" {
|
||||
req.SetQueryParam("keyword", params.Keyword)
|
||||
}
|
||||
|
||||
if !params.StartDate.IsZero() {
|
||||
req.SetQueryParam("start-date", params.StartDate.Format(time.RFC3339))
|
||||
}
|
||||
|
||||
if !params.EndDate.IsZero() {
|
||||
req.SetQueryParam("end-date", params.EndDate.Format(time.RFC3339))
|
||||
}
|
||||
|
||||
if params.PageNumber != 0 {
|
||||
req.Header.Set(PaginationPageHeader, strconv.Itoa(params.PageNumber))
|
||||
}
|
||||
if params.PageSize != 0 {
|
||||
req.Header.Set(PaginationSizeHeader, strconv.Itoa(params.PageSize))
|
||||
}
|
||||
|
||||
var resources []ResourceDto
|
||||
req.SetResult(&resources)
|
||||
|
||||
res, err := req.Get(targetEndpoint)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
count, err := strconv.ParseInt(res.Header().Get(PaginationCountHeader), 10, 64)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return resources, count, nil
|
||||
}
|
||||
|
||||
func (c *client) ScheduleURL(url string) error {
|
||||
targetEndpoint := fmt.Sprintf("%s/v1/urls", c.baseURL)
|
||||
|
||||
req := c.httpClient.R()
|
||||
req.SetHeader("Content-Type", "application/json")
|
||||
req.SetBody(fmt.Sprintf("\"%s\"", url))
|
||||
|
||||
_, err := req.Post(targetEndpoint)
|
||||
return err
|
||||
}
|
||||
|
||||
// NewClient create a new API client using given details
|
||||
func NewClient(baseURL, token string) Client {
|
||||
httpClient := resty.New()
|
||||
httpClient.SetAuthScheme("Bearer")
|
||||
httpClient.SetAuthToken(token)
|
||||
httpClient.OnAfterResponse(func(c *resty.Client, r *resty.Response) error {
|
||||
if r.StatusCode() > 302 {
|
||||
return fmt.Errorf("error when making HTTP request: %s", r.Status())
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
client := &client{
|
||||
httpClient: httpClient,
|
||||
baseURL: baseURL,
|
||||
}
|
||||
|
||||
return client
|
||||
}
|
@ -0,0 +1,56 @@
|
||||
package index
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/event"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestExtractResource(t *testing.T) {
|
||||
body := `
|
||||
<title>Creekorful Inc</title>
|
||||
|
||||
This is sparta
|
||||
|
||||
<a href="https://google.com/test?test=test#12">
|
||||
|
||||
<meta name="Description" content="Zhello world">
|
||||
<meta property="og:url" content="https://example.org">
|
||||
`
|
||||
|
||||
msg := event.NewResourceEvent{
|
||||
URL: "https://example.org/300",
|
||||
Body: body,
|
||||
}
|
||||
|
||||
resDto, err := extractResource("https://example.org/300", time.Time{}, body, map[string]string{"Content-Type": "application/json"})
|
||||
if err != nil {
|
||||
t.FailNow()
|
||||
}
|
||||
|
||||
if resDto.URL != "https://example.org/300" {
|
||||
t.Fail()
|
||||
}
|
||||
if resDto.Title != "Creekorful Inc" {
|
||||
t.Fail()
|
||||
}
|
||||
if resDto.Body != msg.Body {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if resDto.Description != "Zhello world" {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if resDto.Meta["description"] != "Zhello world" {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if resDto.Meta["og:url"] != "https://example.org" {
|
||||
t.Fail()
|
||||
}
|
||||
|
||||
if resDto.Headers["content-type"] != "application/json" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
@ -1,27 +1,33 @@
|
||||
package index
|
||||
|
||||
//go:generate mockgen -destination=../index_mock/index_mock.go -package=index_mock . Index
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/indexer/client"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
//go:generate mockgen -destination=../index_mock/index_mock.go -package=index_mock . Index
|
||||
|
||||
// ResourceIdx represent a resource as stored in elasticsearch
|
||||
type ResourceIdx struct {
|
||||
URL string `json:"url"`
|
||||
Body string `json:"body"`
|
||||
Time time.Time `json:"time"`
|
||||
Title string `json:"title"`
|
||||
Meta map[string]string `json:"meta"`
|
||||
Description string `json:"description"`
|
||||
Headers map[string]string `json:"headers"`
|
||||
}
|
||||
const (
|
||||
// Elastic is an Index backed by ES instance
|
||||
Elastic = "elastic"
|
||||
// Local is an Index backed by local FS instance
|
||||
Local = "local"
|
||||
)
|
||||
|
||||
// Index is the interface used to abstract communication
|
||||
// with the persistence unit
|
||||
type Index interface {
|
||||
SearchResources(params *client.ResSearchParams) ([]ResourceIdx, error)
|
||||
CountResources(params *client.ResSearchParams) (int64, error)
|
||||
AddResource(res ResourceIdx) error
|
||||
IndexResource(url string, time time.Time, body string, headers map[string]string) error
|
||||
}
|
||||
|
||||
// NewIndex create a new index using given driver, destination
|
||||
func NewIndex(driver string, dest string) (Index, error) {
|
||||
switch driver {
|
||||
case Elastic:
|
||||
return newElasticIndex(dest)
|
||||
case Local:
|
||||
return newLocalIndex(dest)
|
||||
default:
|
||||
return nil, fmt.Errorf("no driver named %s found", driver)
|
||||
}
|
||||
}
|
||||
|
@ -1,31 +0,0 @@
|
||||
package logging
|
||||
|
||||
import (
|
||||
"github.com/rs/zerolog"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/urfave/cli/v2"
|
||||
"os"
|
||||
)
|
||||
|
||||
// GetLogFlag return the CLI flag parameter used to setup application log level
|
||||
func GetLogFlag() *cli.StringFlag {
|
||||
return &cli.StringFlag{
|
||||
Name: "log-level",
|
||||
Usage: "Set the application log level",
|
||||
Value: "info",
|
||||
}
|
||||
}
|
||||
|
||||
// ConfigureLogger configure the logger using given log level (read from cli context)
|
||||
func ConfigureLogger(ctx *cli.Context) {
|
||||
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
|
||||
|
||||
// Set application log level
|
||||
if lvl, err := zerolog.ParseLevel(ctx.String("log-level")); err == nil {
|
||||
zerolog.SetGlobalLevel(lvl)
|
||||
} else {
|
||||
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||
}
|
||||
|
||||
log.Debug().Stringer("lvl", zerolog.GlobalLevel()).Msg("Setting log level")
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package logging
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetLogFlag(t *testing.T) {
|
||||
flag := GetLogFlag()
|
||||
if flag.Name != "log-level" {
|
||||
t.Fail()
|
||||
}
|
||||
if flag.Usage != "Set the application log level" {
|
||||
t.Fail()
|
||||
}
|
||||
if flag.Value != "info" {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
package test
|
||||
|
||||
import (
|
||||
"github.com/creekorful/trandoshan/internal/process"
|
||||
"github.com/creekorful/trandoshan/internal/process_mock"
|
||||
"github.com/golang/mock/gomock"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// SubscriberDef is used to test subscriber definitions
|
||||
type SubscriberDef struct {
|
||||
Queue string
|
||||
Exchange string
|
||||
}
|
||||
|
||||
// CheckProcessCommonFlags check process defined common flags
|
||||
func CheckProcessCommonFlags(t *testing.T, p process.Process, wantFlags []string) {
|
||||
if !checkListEquals(p.CommonFlags(), wantFlags) {
|
||||
t.Errorf("Differents flags: %v %v", p.CommonFlags(), wantFlags)
|
||||
}
|
||||
}
|
||||
|
||||
// CheckProcessCustomFlags check process defined custom flags
|
||||
func CheckProcessCustomFlags(t *testing.T, p process.Process, wantFlags []string) {
|
||||
var names []string
|
||||
for _, customFlag := range p.CustomFlags() {
|
||||
names = append(names, customFlag.Names()[0])
|
||||
}
|
||||
|
||||
if !checkListEquals(names, wantFlags) {
|
||||
t.Errorf("Differents flags: %v %v", names, wantFlags)
|
||||
}
|
||||
}
|
||||
|
||||
// CheckInitialize check process initialization phase
|
||||
func CheckInitialize(t *testing.T, p process.Process, callback func(provider *process_mock.MockProviderMockRecorder)) {
|
||||
mockCtrl := gomock.NewController(t)
|
||||
defer mockCtrl.Finish()
|
||||
|
||||
providerMock := process_mock.NewMockProvider(mockCtrl)
|
||||
callback(providerMock.EXPECT())
|
||||
|
||||
if err := p.Initialize(providerMock); err != nil {
|
||||
t.Errorf("Error while Initializing process: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// CheckProcessSubscribers check process defined subscribers
|
||||
func CheckProcessSubscribers(t *testing.T, p process.Process, subscribers []SubscriberDef) {
|
||||
var defs []SubscriberDef
|
||||
for _, sub := range p.Subscribers() {
|
||||
defs = append(defs, SubscriberDef{
|
||||
Queue: sub.Queue,
|
||||
Exchange: sub.Exchange,
|
||||
})
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(defs, subscribers) {
|
||||
t.Errorf("Differents subscribers: %v %v", defs, subscribers)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO HTTPHandler
|
||||
|
||||
// checkListEquals reports whether a and b hold the same strings in the same
// order. A nil slice and an empty slice are considered equal, so a process
// that declares no flags matches an empty expectation however it is spelled.
func checkListEquals(a []string, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}

	return true
}
|
@ -1,119 +0,0 @@
|
||||
package trandoshanctl
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/creekorful/trandoshan/internal/indexer/client"
|
||||
"github.com/creekorful/trandoshan/internal/logging"
|
||||
"github.com/olekukonko/tablewriter"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/urfave/cli/v2"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GetApp returns the Trandoshan CLI app
|
||||
func GetApp() *cli.App {
|
||||
return &cli.App{
|
||||
Name: "trandoshanctl",
|
||||
Version: "0.9.0",
|
||||
Usage: "Trandoshan CLI",
|
||||
Flags: []cli.Flag{
|
||||
logging.GetLogFlag(),
|
||||
&cli.StringFlag{
|
||||
Name: "api-uri",
|
||||
Usage: "URI to the API server",
|
||||
Value: "http://localhost:15005",
|
||||
Required: false,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "api-token",
|
||||
Usage: "Token to use to authenticate against the API",
|
||||
Required: true,
|
||||
},
|
||||
},
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
Name: "schedule",
|
||||
Usage: "Schedule crawling for given URL",
|
||||
Action: schedule,
|
||||
ArgsUsage: "URL",
|
||||
},
|
||||
{
|
||||
Name: "search",
|
||||
Usage: "Search for specific resources",
|
||||
ArgsUsage: "keyword",
|
||||
Action: search,
|
||||
},
|
||||
},
|
||||
Before: before,
|
||||
}
|
||||
}
|
||||
|
||||
func before(ctx *cli.Context) error {
|
||||
logging.ConfigureLogger(ctx)
|
||||
return nil
|
||||
}
|
||||
|
||||
func schedule(c *cli.Context) error {
|
||||
if c.NArg() == 0 {
|
||||
return fmt.Errorf("missing argument URL")
|
||||
}
|
||||
|
||||
url := c.Args().First()
|
||||
|
||||
// Create the API client
|
||||
apiClient := client.NewClient(c.String("api-uri"), c.String("api-token"))
|
||||
|
||||
if err := apiClient.ScheduleURL(url); err != nil {
|
||||
log.Err(err).Str("url", url).Msg("Unable to schedule crawling for URL")
|
||||
return err
|
||||
}
|
||||
|
||||
log.Info().Str("url", url).Msg("Successfully schedule crawling")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func search(c *cli.Context) error {
|
||||
keyword := c.Args().First()
|
||||
|
||||
// Create the API client
|
||||
apiClient := client.NewClient(c.String("api-uri"), c.String("api-token"))
|
||||
|
||||
params := client.ResSearchParams{
|
||||
Keyword: keyword,
|
||||
WithBody: false,
|
||||
PageSize: 1,
|
||||
PageNumber: 10,
|
||||
}
|
||||
res, count, err := apiClient.SearchResources(¶ms)
|
||||
if err != nil {
|
||||
log.Err(err).Str("keyword", keyword).Msg("Unable to search resources")
|
||||
return err
|
||||
}
|
||||
|
||||
if len(res) == 0 {
|
||||
fmt.Println("No resources crawled (yet).")
|
||||
}
|
||||
|
||||
table := tablewriter.NewWriter(os.Stdout)
|
||||
table.SetHeader([]string{"Time", "URL", "Title"})
|
||||
|
||||
for _, v := range res {
|
||||
table.Append([]string{v.Time.Format(time.RFC822), shortenURL(v.URL), v.Title})
|
||||
}
|
||||
table.Render()
|
||||
|
||||
fmt.Printf("Total: %d\n", count)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// shortenURL returns url unchanged when it is at most 125 bytes long.
// Longer values are truncated to at most 125 bytes and suffixed with "...".
// The cut is moved back to the start of a character when byte 125 falls in
// the middle of a multi-byte UTF-8 sequence, so the result never ends with a
// broken character.
func shortenURL(url string) string {
	const maxLen = 125

	if len(url) <= maxLen {
		return url
	}

	// Ranging over a string yields the byte offset of each character start;
	// keep the last one that does not exceed maxLen
	cut := 0
	for i := range url {
		if i > maxLen {
			break
		}
		cut = i
	}

	return url[:cut] + "..."
}
|
@ -0,0 +1,56 @@
|
||||
import json
import sys
from typing import List

import requests

# This script is used to import a list of hostnames to 'blacklist'.
# It pulls hostnames from the CT log source (see url variable), adds the custom
# defined ones, merges them with the hostnames already forbidden in ConfigAPI
# and pushes the merged list back to ConfigAPI to prevent useless crawling.
#
# Usage: <script> CONFIG_API_URI

url = "https://raw.githubusercontent.com/alecmuffett/real-world-onion-sites/master/ct-log.txt"
custom_hostnames = [
    'gamebombfak3pwnh.onion',  # gaming forum, lot of noise
    'metagerv65pwclop2rsfzg4jwowpavpwd6grhhlvdgsswvo6ii4akgyd.onion'  # search engine, lot of noise
]

if len(sys.argv) < 2:
    sys.exit("usage: {} CONFIG_API_URI".format(sys.argv[0]))
config_api_uri = sys.argv[1]

# Seconds to wait for each HTTP call before giving up
request_timeout = 30


def add_if_not_exist(a: List[dict], b: str) -> bool:
    """Append {'hostname': b} to a unless an entry with that hostname exists.

    Returns True when the entry was appended, False when it was already there.
    """
    if any(entry['hostname'] == b for entry in a):
        return False

    a.append({'hostname': b})
    return True


# Get up-to-date list of real-world / legit .onion
r = requests.get(url, timeout=request_timeout)
# Fail loudly instead of importing an error page as a list of hostnames
r.raise_for_status()
new_hostnames = []
for hostname in r.text.splitlines():
    hostname = hostname.strip()
    if hostname:  # skip blank lines
        add_if_not_exist(new_hostnames, hostname)
print("pulled {} real world hostnames from ct-log.txt".format(len(new_hostnames)))

# Append custom hostnames ignore list
added_custom = 0
for custom_hostname in custom_hostnames:
    if add_if_not_exist(new_hostnames, custom_hostname):
        added_custom += 1
print("added {} custom hostnames".format(added_custom))

# Query existing blacklisted hostnames from ConfigAPI
r = requests.get(config_api_uri + "/config/forbidden-hostnames", timeout=request_timeout)
r.raise_for_status()
# Treat a null/empty answer as "nothing forbidden yet"
forbidden_hostnames = r.json() or []
print("there is {} forbidden hostnames defined in ConfigAPI".format(len(forbidden_hostnames)))

# Merge the lists while preventing duplicates
for forbidden_hostname in forbidden_hostnames:
    add_if_not_exist(new_hostnames, forbidden_hostname['hostname'])
print("there is {} forbidden hostnames now".format(len(new_hostnames)))

# Update ConfigAPI with the merged list (not the list we just read from it)
headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
r = requests.put(config_api_uri + "/config/forbidden-hostnames", json.dumps(new_hostnames), headers=headers,
                 timeout=request_timeout)

if r.ok:
    print("successfully updated forbidden hostnames")
else:
    print("unable to update forbidden hostnames: HTTP {}".format(r.status_code), file=sys.stderr)
    sys.exit(1)
|
Loading…
Reference in New Issue