master
Carlo Strub 7 years ago
parent 0afac8f4c8
commit 6613f768da

@ -18,12 +18,12 @@ func classificationPriors(db *bolt.DB) (g, j float64) {
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Wordlists"))
good := b.Bucket([]byte("Good"))
gN := good.Stats().KeyN
gN := float64(good.Stats().KeyN)
junk := b.Bucket([]byte("Junk"))
jN := junk.Stats().KeyN
jN := float64(junk.Stats().KeyN)
g = float64(gN) / (float64(gN) + float64(jN))
j = float64(jN) / (float64(gN) + float64(jN))
g = gN / (gN + jN)
j = jN / (gN + jN)
return nil
})

@ -8,8 +8,10 @@ import (
"mime/quotedprintable"
"os"
"regexp"
"strconv"
"strings"
"github.com/boltdb/bolt"
"github.com/kennygrant/sanitize"
"github.com/luksen/maildir"
)
@ -143,7 +145,7 @@ func (m *Mail) Clean() error {
// wordlist takes a string of space separated text and returns a list of unique
// words as a slice of strings
func wordlist(s string) (l []string, err error) {
func wordlist(s string) (l []string) {
list := make(map[string]int)
raw := strings.Split(s, " ")
@ -181,30 +183,41 @@ func wordlist(s string) (l []string, err error) {
l = append(l, word)
}
return l, nil
return l
}
// NOTE(review): this span looks like a rendered diff without +/- markers: the
// Wordlists lines (separate subject and body lists plus an error) appear to be
// the removed version and the Wordlist lines their replacement — confirm
// before treating the span as compilable Go.
//
// Wordlists prepares the mail's subject and body for training
func (m *Mail) Wordlists() (subject, body []string, err error) {
// Wordlist prepares the mail for training. It concatenates the subject and
// the body, each prefixed with a single space and skipped when nil, and
// returns the result of running wordlist on the combined text.
func (m *Mail) Wordlist() (w []string) {
// s accumulates the text handed to wordlist; it stays empty when both
// Subject and Body are nil.
var s string
if m.Subject != nil {
// Wordlists: word list of the subject alone; an error aborts early.
subject, err = wordlist(*m.Subject)
if err != nil {
return subject, body, err
}
// Wordlist: append the subject to the combined text instead.
s = s + " " + *m.Subject
}
if m.Body != nil {
// Wordlists: word list of the body alone; an error aborts early.
body, err = wordlist(*m.Body)
if err != nil {
return subject, body, err
}
// Wordlist: append the body to the combined text instead.
s = s + " " + *m.Body
}
return subject, body, nil
// Wordlist: a single word list over subject and body together.
w = wordlist(s)
return w
}
// Classify analyses the mail and decides whether it is Junk or Good
//
// It runs Clean, builds the word list with Wordlist, passes that list and db
// to LogScores, stores the returned verdict in m.Junk and logs the mail key,
// the verdict and both scores. The only error it returns is the one from
// Clean; otherwise it returns nil.
//
// NOTE(review): two signature lines follow because this span looks like a
// rendered diff without +/- markers; the one without the db parameter appears
// to be the removed version — confirm before treating this as compilable Go.
func (m *Mail) Classify() error {
func (m *Mail) Classify(db *bolt.DB) error {
err := m.Clean()
if err != nil {
return err
}
list := m.Wordlist()
// LogScores yields a score for Good, a score for Junk and the junk verdict.
scoreG, scoreJ, junk := LogScores(db, list)
m.Junk = junk
// Scores are logged in fixed-point notation with four decimals.
log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(m.Junk) +
" (good: " + strconv.FormatFloat(scoreG, 'f', 4, 64) +
", junk: " + strconv.FormatFloat(scoreJ, 'f', 4, 64) + ")")
return nil
}

@ -311,15 +311,11 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"confirm", "remittance"}))
Ω(body).Should(Equal(
[]string{"accuracy", "addressed", "admin", "alliance", "alone", "bank", "been", "belong", "best", "boltas", "cobantur", "computer", "confirm", "contained", "copy", "copying", "date", "deleted", "detail", "director", "entity", "excludes", "expressed", "files", "forwarding", "hereby", "individual", "intended", "kind", "known", "liability", "makes", "message", "notified", "opinions", "payment", "prohibited", "reception", "recipient", "reflect", "regards", "scanned", "sender", "should", "solely", "storage", "strictly", "such", "thanks", "that", "therein", "they", "this", "value", "viruses", "warranty", "whatsoever", "whom", "with"}))
Ω(list).Should(Equal(
[]string{"accuracy", "addressed", "admin", "alliance", "alone", "bank", "been", "belong", "best", "boltas", "cobantur", "computer", "confirm", "contained", "copy", "copying", "date", "deleted", "detail", "director", "entity", "excludes", "expressed", "files", "forwarding", "hereby", "individual", "intended", "kind", "known", "liability", "makes", "message", "notified", "opinions", "payment", "prohibited", "reception", "recipient", "reflect", "regards", "remittance", "scanned", "sender", "should", "solely", "storage", "strictly", "such", "thanks", "that", "therein", "they", "this", "value", "viruses", "warranty", "whatsoever", "whom", "with"}))
})
It("Wordlist 2", func() {
@ -336,15 +332,11 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"hello"}))
Ω(body).Should(Equal(
[]string{"best", "company", "dear", "distance", "employees", "from", "home", "interested", "kari", "large", "looking", "manager", "most", "name", "offer", "personnel", "please", "regards", "remotely", "salary", "site", "that", "this", "visit", "work", "working"}))
Ω(list).Should(Equal(
[]string{"best", "company", "dear", "distance", "employees", "from", "hello", "home", "interested", "kari", "large", "looking", "manager", "most", "name", "offer", "personnel", "please", "regards", "remotely", "salary", "site", "that", "this", "visit", "work", "working"}))
})
It("Wordlist 3", func() {
@ -361,15 +353,11 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"herpes", "medical", "shocks", "world"}))
Ω(body).Should(Equal(
[]string{"alongside", "anxiety", "appointed", "authority", "awarded", "bacteria", "beard", "been", "came", "capital", "causes", "city", "civilian", "club", "combated", "creams", "crown", "cure", "cured", "dark", "devalued", "domed", "doukas", "doux", "dreamstime", "drug", "drugs", "earlier", "emperor", "erly", "exclusive", "extracts", "fast", "february", "finally", "forked", "from", "full", "genital", "girl", "give", "golden", "governors", "guard", "have", "held", "herpes", "history", "image", "influence", "instituted", "john", "largesse", "little", "local", "manuscript", "many", "members", "mental", "mice", "military", "mostly", "nicaea", "notables", "only", "other", "people", "portrait", "prevent", "provincial", "rachael", "relief", "remove", "rettner", "sebastos", "secure", "senior", "size", "starting", "studies", "such", "suggest", "that", "theodore", "there", "these", "this", "times", "title", "titles", "today", "topical", "treatment", "treatments", "tzakones", "under", "unlike", "used", "vatatzes", "view", "virus", "wearing", "were", "will", "with", "writer", "your", "zonaras"}))
Ω(list).Should(Equal(
[]string{"alongside", "anxiety", "appointed", "authority", "awarded", "bacteria", "beard", "been", "came", "capital", "causes", "city", "civilian", "club", "combated", "creams", "crown", "cure", "cured", "dark", "devalued", "domed", "doukas", "doux", "dreamstime", "drug", "drugs", "earlier", "emperor", "erly", "exclusive", "extracts", "fast", "february", "finally", "forked", "from", "full", "genital", "girl", "give", "golden", "governors", "guard", "have", "held", "herpes", "history", "image", "influence", "instituted", "john", "largesse", "little", "local", "manuscript", "many", "medical", "members", "mental", "mice", "military", "mostly", "nicaea", "notables", "only", "other", "people", "portrait", "prevent", "provincial", "rachael", "relief", "remove", "rettner", "sebastos", "secure", "senior", "shocks", "size", "starting", "studies", "such", "suggest", "that", "theodore", "there", "these", "this", "times", "title", "titles", "today", "topical", "treatment", "treatments", "tzakones", "under", "unlike", "used", "vatatzes", "view", "virus", "wearing", "were", "will", "with", "world", "writer", "your", "zonaras"}))
})
It("Wordlist 4", func() {
@ -386,14 +374,10 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"cosan", "friday", "march", "york"}))
Ω(body).Should(Equal(
Ω(list).Should(Equal(
[]string{"ampudia", "avenue", "below", "between", "briget", "call", "cannot", "closing", "cosan", "download", "email", "friday", "here", "hyatt", "image", "invitation", "level", "limited", "listed", "lunch", "march", "mercado", "novo", "nyse", "online", "onyx", "park", "please", "program", "rafferty", "register", "room", "rsvp", "rumo", "second", "street", "taylor", "view", "west", "york"}))
})
@ -411,15 +395,11 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"eyes", "glasses", "headed", "serious", "trouble", "wear", "your"}))
Ω(body).Should(Equal(
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "rothschild", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "your"}))
Ω(list).Should(Equal(
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "eyes", "ferdinand", "floor", "from", "full", "glasses", "goodness", "headed", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "rothschild", "running", "self", "serious", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "trouble", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "your"}))
})
It("Wordlist 6", func() {
@ -436,15 +416,11 @@ var _ = Describe("Mail", func() {
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
list := m.Wordlist()
sort.Strings(list)
Ω(subject).Should(Equal(
[]string{"always", "form", "good", "super", "viagra", "with"}))
Ω(body).Should(Equal(
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
Ω(list).Should(Equal(
[]string{"always", "amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "form", "good", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "super", "table", "terry", "these", "this", "time", "trademark", "united", "various", "viagra", "view", "when", "wish", "with", "your"}))
})
})
})

@ -136,7 +136,7 @@ func main() {
b := tx.Bucket([]byte("Processed"))
v := b.Get([]byte(mails[i].Key))
if len(v) == 0 {
err = mails[i].Classify()
err = mails[i].Classify(db)
if err != nil {
log.Print(err)
}
@ -179,7 +179,7 @@ func main() {
Key: mailName[len(mailName)-1],
}
err = m.Classify()
err = m.Classify(db)
if err != nil {
log.Print(err)
}

Loading…
Cancel
Save