Skip to content

Commit

Permalink
Move most logic into corpus package
Browse files Browse the repository at this point in the history
  • Loading branch information
hrs committed May 28, 2023
1 parent 4d61ccd commit c7656f9
Show file tree
Hide file tree
Showing 14 changed files with 111 additions and 110 deletions.
1 change: 0 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ jobs:
uses: goreleaser/goreleaser-action@v4
with:
version: latest
workdir: lib
args: release --clean
env:
GITHUB_TOKEN: ${{ secrets.RELEASER_GITHUB_TOKEN }}
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ MANPAGE=$(BINARY).1
.PHONY: build
build: $(BINARY)

$(BINARY): $(shell find . -iname *.go)
$(GOBUILD) -o $(BINARY) -v ./...
$(BINARY): $(shell find * -iname *.go)
$(GOBUILD) -o $(BINARY) -v

.PHONY: install
install: $(BINARY)
Expand Down
2 changes: 1 addition & 1 deletion lib/config.go → corpus/config.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package corpus

type Config struct {
BestFirst bool
Expand Down
16 changes: 8 additions & 8 deletions lib/corpus.go → corpus/corpus.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package corpus

import (
"fmt"
Expand All @@ -9,17 +9,17 @@ import (
)

type Corpus struct {
Documents []*Document
InvDocFreq TermMap
documents []*Document
invDocFreq termMap
}

func NewCorpus(documents []*Document) *Corpus {
var docFreq = make(TermMap)
var invDocFreq = make(TermMap)
var docFreq = make(termMap)
var invDocFreq = make(termMap)

// For each term, in how many documents does it occur?
for _, doc := range documents {
for term := range doc.TermFreq {
for term := range doc.termFreq {
docFreq[term]++
}
}
Expand All @@ -32,7 +32,7 @@ func NewCorpus(documents []*Document) *Corpus {

// Assign TF-IDF weights to every document in the corpus
for _, doc := range documents {
doc.NormalizeTfIdf(invDocFreq)
doc.normalizeTfIdf(invDocFreq)
}

return &Corpus{documents, invDocFreq}
Expand All @@ -48,7 +48,7 @@ func ParseCorpus(query *Document, paths []string, config *Config) *Corpus {
}

// Don't parse directories or symlinks (or the queried file, if so configured)
if isParsableFile(xinfo, config) && !(config.OmitQuery && sameFile(query.Path, xpath)) {
if isParsableFile(xinfo, config) && !(config.OmitQuery && sameFile(query.path, xpath)) {
doc, err := NewDocument(xpath, config)

if err != nil {
Expand Down
34 changes: 17 additions & 17 deletions lib/document.go → corpus/document.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package corpus

import (
"bufio"
Expand All @@ -13,13 +13,13 @@ import (
"golang.org/x/tools/godoc/vfs"
)

type TermMap map[string]float64
type termMap map[string]float64

type Document struct {
Path string
TermFreq TermMap
TfIdf TermMap
Norm float64
path string
termFreq termMap
tfIdf termMap
norm float64
}

func NewDocument(path string, config *Config) (*Document, error) {
Expand All @@ -42,7 +42,7 @@ func NewDocument(path string, config *Config) (*Document, error) {
scanner.Split(bufio.ScanWords)

// Initialize the term-count map
termCount := make(TermMap)
termCount := make(termMap)
totalWordCount := 0.0

// Loop over the words and tally each resulting term in termCount
Expand All @@ -60,7 +60,7 @@ func NewDocument(path string, config *Config) (*Document, error) {
word = strings.TrimSuffix(word, "'s")

if word != "" {
if config.NoStoplist || !config.Stoplist.Include(word) {
if config.NoStoplist || !config.Stoplist.include(word) {
if config.NoStemming {
termCount[word]++
} else {
Expand All @@ -79,36 +79,36 @@ func NewDocument(path string, config *Config) (*Document, error) {
}

// Build the term frequency map
termFreq := make(TermMap)
termFreq := make(termMap)
for term, count := range termCount {
termFreq[term] = count / totalWordCount
}

return &Document{Path: path, TermFreq: termFreq}, nil
return &Document{path: path, termFreq: termFreq}, nil
}

// splitToken reports whether r is a separator rune, i.e. anything other than
// a lowercase ASCII letter ('a'-'z'), an ASCII digit ('0'-'9'), a space, or an
// apostrophe. Uppercase letters and all non-ASCII runes count as separators.
func splitToken(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		// Lowercase ASCII letters are part of a token.
		return false
	case '0' <= r && r <= '9':
		// ASCII digits are part of a token.
		return false
	case r == ' ', r == '\'':
		// Spaces and apostrophes are deliberately not treated as separators.
		return false
	}

	return true
}

func (doc *Document) NormalizeTfIdf(invDocFreq TermMap) {
func (doc *Document) normalizeTfIdf(invDocFreq termMap) {
// Set the TF-IDF weights
doc.TfIdf = make(TermMap)
for term, weight := range doc.TermFreq {
doc.TfIdf[term] = weight * invDocFreq[term]
doc.tfIdf = make(termMap)
for term, weight := range doc.termFreq {
doc.tfIdf[term] = weight * invDocFreq[term]
}

// Now that we've set TF-IDF weights, we can save memory by removing the
// original weights
doc.TermFreq = nil
doc.termFreq = nil

// Calculate and store the document's norm
doc.Norm = doc.calcNorm()
doc.norm = doc.calcNorm()
}

func (doc *Document) calcNorm() float64 {
norm := 0.0
for _, weight := range doc.TfIdf {
for _, weight := range doc.tfIdf {
norm += weight * weight
}

Expand Down
36 changes: 18 additions & 18 deletions lib/document_test.go → corpus/document_test.go
Original file line number Diff line number Diff line change
@@ -1,65 +1,65 @@
package main
package corpus

import (
"reflect"
"testing"
)

func TestNormalizeTfIdf(t *testing.T) {
tm := TermMap{
tm := termMap{
"foo": 2.0,
"bar": 3.0,
"baz": 4.0,
}

tests := []struct {
doc Document
idf TermMap
idf termMap
expDoc Document
}{
{
Document{
TermFreq: TermMap{},
termFreq: termMap{},
},
tm,
Document{
TfIdf: TermMap{},
Norm: 0.0,
tfIdf: termMap{},
norm: 0.0,
},
},
{
Document{
TermFreq: TermMap{
termFreq: termMap{
"foo": 3.0,
"bar": 4.0,
"baz": 5.0,
},
},
tm,
Document{
TfIdf: TermMap{
tfIdf: termMap{
"foo": 6.0,
"bar": 12.0,
"baz": 20.0,
},
Norm: 24.0832,
norm: 24.0832,
},
},
}

for _, tc := range tests {
tc.doc.NormalizeTfIdf(tc.idf)
tc.doc.normalizeTfIdf(tc.idf)

if !reflect.DeepEqual(tc.doc.TfIdf, tc.expDoc.TfIdf) {
t.Errorf("got %v, wanted %v", tc.doc.TfIdf, tc.expDoc.TfIdf)
if !reflect.DeepEqual(tc.doc.tfIdf, tc.expDoc.tfIdf) {
t.Errorf("got %v, wanted %v", tc.doc.tfIdf, tc.expDoc.tfIdf)
}

if !approxEq(tc.doc.Norm, tc.expDoc.Norm) {
t.Errorf("got %.4f, wanted %.4f", tc.doc.Norm, tc.expDoc.Norm)
if !approxEq(tc.doc.norm, tc.expDoc.norm) {
t.Errorf("got %.4f, wanted %.4f", tc.doc.norm, tc.expDoc.norm)
}

if tc.doc.TermFreq != nil {
t.Errorf("got %v, wanted nil", tc.doc.TermFreq)
if tc.doc.termFreq != nil {
t.Errorf("got %v, wanted nil", tc.doc.termFreq)
}
}
}
Expand All @@ -70,13 +70,13 @@ func TestCalcNorm(t *testing.T) {
}{
{
Document{
TfIdf: TermMap{},
tfIdf: termMap{},
},
0.0,
},
{
Document{
TfIdf: TermMap{
tfIdf: termMap{
"foo": 2.0,
"bar": 3.0,
"baz": 4.0,
Expand Down
20 changes: 10 additions & 10 deletions lib/output.go → corpus/output.go
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
package main
package corpus

import (
"fmt"
"sort"
)

type Score struct {
Query *Document
Document *Document
Score float64
type score struct {
query *Document
document *Document
score float64
}

func printResults(scores []Score, config Config) {
func PrintResults(scores []score, config Config) {
// Sort results by score, best matches first
sort.Slice(scores, func(i, j int) bool {
return scores[i].Score > scores[j].Score
return scores[i].score > scores[j].score
})

if config.Limit > 0 && len(scores) > config.Limit {
scores = scores[0:config.Limit]
}

if !config.BestFirst {
tmp := make([]Score, len(scores))
tmp := make([]score, len(scores))
for i, score := range scores {
tmp[len(scores)-i-1] = score
}
Expand All @@ -31,9 +31,9 @@ func printResults(scores []Score, config Config) {

for _, score := range scores {
if config.ShowScores {
fmt.Printf("%.4f\t%s\n", score.Score, score.Document.Path)
fmt.Printf("%.4f\t%s\n", score.score, score.document.path)
} else {
fmt.Println(score.Document.Path)
fmt.Println(score.document.path)
}
}
}
29 changes: 29 additions & 0 deletions corpus/similarity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package corpus

// SimilarDocuments compares query against every document in the corpus and
// returns one score per document, in the same order as the corpus's documents.
//
// As a side effect, query is modified: its TF-IDF weights are (re)computed
// from this corpus's inverse document frequencies before any comparison.
func (c *Corpus) SimilarDocuments(query *Document) []score {
	// The query must be weighted with this corpus's IDF values so that its
	// vector is comparable with the corpus's documents.
	query.normalizeTfIdf(c.invDocFreq)

	results := make([]score, 0, len(c.documents))
	for _, doc := range c.documents {
		results = append(results, score{
			query:    query,
			document: doc,
			score:    doc.cosineSimilarity(query),
		})
	}

	return results
}

// cosineSimilarity returns the cosine similarity between the TF-IDF vectors of
// target and other: the dot product of their tfIdf weights divided by the
// product of their stored norms.
//
// If the product of the norms is zero (for example, when either document has
// no weighted terms), 0 is returned instead of dividing by zero, so callers
// never receive NaN or an infinity from a degenerate document.
func (target *Document) cosineSimilarity(other *Document) float64 {
	denom := target.norm * other.norm
	if denom == 0 {
		// A zero-norm vector has no direction; treat it as dissimilar to
		// everything rather than producing an unorderable NaN score.
		return 0
	}

	dotProd := 0.0
	for term, weight := range target.tfIdf {
		// Terms absent from other.tfIdf read as 0 and contribute nothing.
		dotProd += weight * other.tfIdf[term]
	}

	return dotProd / denom
}
14 changes: 7 additions & 7 deletions lib/similarity_test.go → corpus/similarity_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package corpus

import (
"math"
Expand All @@ -18,14 +18,14 @@ func TestCosineSimilarity(t *testing.T) {
sim float64
}

docA := Document{TfIdf: TermMap{"foo": 0.3013, "bar": 0.2628}}
docA.Norm = docA.calcNorm()
docA := Document{tfIdf: termMap{"foo": 0.3013, "bar": 0.2628}}
docA.norm = docA.calcNorm()

docB := Document{TfIdf: TermMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
docB.Norm = docB.calcNorm()
docB := Document{tfIdf: termMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
docB.norm = docB.calcNorm()

docC := Document{TfIdf: TermMap{"foo": 0.2260, "quux": 0.6496}}
docC.Norm = docC.calcNorm()
docC := Document{tfIdf: termMap{"foo": 0.2260, "quux": 0.6496}}
docC.norm = docC.calcNorm()

cosTests := []cosTest{
{&docA, &docA, 1.0},
Expand Down
2 changes: 1 addition & 1 deletion lib/stem.go → corpus/stem.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package corpus

import "github.com/reiver/go-porterstemmer"

Expand Down
Loading

0 comments on commit c7656f9

Please sign in to comment.