Move most logic into corpus package

hrs · May 28, 2023 · c7656f9 · c7656f9
1 parent 4d61ccd
commit c7656f9
Show file tree

Hide file tree

Showing 14 changed files with 111 additions and 110 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -27,7 +27,6 @@ jobs:
         uses: goreleaser/goreleaser-action@v4
         with:
           version: latest
-          workdir: lib
           args: release --clean
         env:
           GITHUB_TOKEN: ${{ secrets.RELEASER_GITHUB_TOKEN }}
diff --git a/Makefile b/Makefile
@@ -12,8 +12,8 @@ MANPAGE=$(BINARY).1
 .PHONY: build
 build: $(BINARY)
 
-$(BINARY): $(shell find . -iname *.go)
-	$(GOBUILD) -o $(BINARY) -v ./...
+$(BINARY): $(shell find * -iname *.go)
+	$(GOBUILD) -o $(BINARY) -v
 
 .PHONY: install
 install: $(BINARY)

diff --git a/lib/config.go → corpus/config.go b/lib/config.go → corpus/config.go
@@ -1,4 +1,4 @@
-package main
+package corpus
 
 type Config struct {
 	BestFirst      bool

diff --git a/lib/corpus.go → corpus/corpus.go b/lib/corpus.go → corpus/corpus.go
@@ -1,4 +1,4 @@
-package main
+package corpus
 
 import (
 	"fmt"
@@ -9,17 +9,17 @@ import (
 )
 
 type Corpus struct {
-	Documents  []*Document
-	InvDocFreq TermMap
+	documents  []*Document
+	invDocFreq termMap
 }
 
 func NewCorpus(documents []*Document) *Corpus {
-	var docFreq = make(TermMap)
-	var invDocFreq = make(TermMap)
+	var docFreq = make(termMap)
+	var invDocFreq = make(termMap)
 
 	// For each term, in how many documents does it occur?
 	for _, doc := range documents {
-		for term := range doc.TermFreq {
+		for term := range doc.termFreq {
 			docFreq[term]++
 		}
 	}
@@ -32,7 +32,7 @@ func NewCorpus(documents []*Document) *Corpus {
 
 	// Assign TF-IDF weights to every document in the corpus
 	for _, doc := range documents {
-		doc.NormalizeTfIdf(invDocFreq)
+		doc.normalizeTfIdf(invDocFreq)
 	}
 
 	return &Corpus{documents, invDocFreq}
@@ -48,7 +48,7 @@ func ParseCorpus(query *Document, paths []string, config *Config) *Corpus {
 			}
 
 			// Don't parse directories or symlinks (or the queried file, if so configured)
-			if isParsableFile(xinfo, config) && !(config.OmitQuery && sameFile(query.Path, xpath)) {
+			if isParsableFile(xinfo, config) && !(config.OmitQuery && sameFile(query.path, xpath)) {
 				doc, err := NewDocument(xpath, config)
 
 				if err != nil {

diff --git a/lib/document.go → corpus/document.go b/lib/document.go → corpus/document.go
@@ -1,4 +1,4 @@
-package main
+package corpus
 
 import (
 	"bufio"
@@ -13,13 +13,13 @@ import (
 	"golang.org/x/tools/godoc/vfs"
 )
 
-type TermMap map[string]float64
+type termMap map[string]float64
 
 type Document struct {
-	Path     string
-	TermFreq TermMap
-	TfIdf    TermMap
-	Norm     float64
+	path     string
+	termFreq termMap
+	tfIdf    termMap
+	norm     float64
 }
 
 func NewDocument(path string, config *Config) (*Document, error) {
@@ -42,7 +42,7 @@ func NewDocument(path string, config *Config) (*Document, error) {
 	scanner.Split(bufio.ScanWords)
 
 	// Initialize the term-count map and the running word total
-	termCount := make(TermMap)
+	termCount := make(termMap)
 	totalWordCount := 0.0
 
 	// Loop over the words and tally each one in the term-count map
@@ -60,7 +60,7 @@ func NewDocument(path string, config *Config) (*Document, error) {
 			word = strings.TrimSuffix(word, "'s")
 
 			if word != "" {
-				if config.NoStoplist || !config.Stoplist.Include(word) {
+				if config.NoStoplist || !config.Stoplist.include(word) {
 					if config.NoStemming {
 						termCount[word]++
 					} else {
@@ -79,36 +79,36 @@ func NewDocument(path string, config *Config) (*Document, error) {
 	}
 
 	// Build the term frequency map
-	termFreq := make(TermMap)
+	termFreq := make(termMap)
 	for term, count := range termCount {
 		termFreq[term] = count / totalWordCount
 	}
 
-	return &Document{Path: path, TermFreq: termFreq}, nil
+	return &Document{path: path, termFreq: termFreq}, nil
 }
 
 func splitToken(r rune) bool {
 	return !(r >= 'a' && r <= 'z') && !(r >= '0' && r <= '9') && r != ' ' && r != '\''
 }
 
-func (doc *Document) NormalizeTfIdf(invDocFreq TermMap) {
+func (doc *Document) normalizeTfIdf(invDocFreq termMap) {
 	// Set the TF-IDF weights
-	doc.TfIdf = make(TermMap)
-	for term, weight := range doc.TermFreq {
-		doc.TfIdf[term] = weight * invDocFreq[term]
+	doc.tfIdf = make(termMap)
+	for term, weight := range doc.termFreq {
+		doc.tfIdf[term] = weight * invDocFreq[term]
 	}
 
 	// Now that we've set TF-IDF weights, we can save memory by removing the
 	// original weights
-	doc.TermFreq = nil
+	doc.termFreq = nil
 
 	// Calculate and store the document's norm
-	doc.Norm = doc.calcNorm()
+	doc.norm = doc.calcNorm()
 }
 
 func (doc *Document) calcNorm() float64 {
 	norm := 0.0
-	for _, weight := range doc.TfIdf {
+	for _, weight := range doc.tfIdf {
 		norm += weight * weight
 	}
 

diff --git a/lib/document_test.go → corpus/document_test.go b/lib/document_test.go → corpus/document_test.go
@@ -1,65 +1,65 @@
-package main
+package corpus
 
 import (
 	"reflect"
 	"testing"
 )
 
 func TestNormalizeTfIdf(t *testing.T) {
-	tm := TermMap{
+	tm := termMap{
 		"foo": 2.0,
 		"bar": 3.0,
 		"baz": 4.0,
 	}
 
 	tests := []struct {
 		doc    Document
-		idf    TermMap
+		idf    termMap
 		expDoc Document
 	}{
 		{
 			Document{
-				TermFreq: TermMap{},
+				termFreq: termMap{},
 			},
 			tm,
 			Document{
-				TfIdf: TermMap{},
-				Norm:  0.0,
+				tfIdf: termMap{},
+				norm:  0.0,
 			},
 		},
 		{
 			Document{
-				TermFreq: TermMap{
+				termFreq: termMap{
 					"foo": 3.0,
 					"bar": 4.0,
 					"baz": 5.0,
 				},
 			},
 			tm,
 			Document{
-				TfIdf: TermMap{
+				tfIdf: termMap{
 					"foo": 6.0,
 					"bar": 12.0,
 					"baz": 20.0,
 				},
-				Norm: 24.0832,
+				norm: 24.0832,
 			},
 		},
 	}
 
 	for _, tc := range tests {
-		tc.doc.NormalizeTfIdf(tc.idf)
+		tc.doc.normalizeTfIdf(tc.idf)
 
-		if !reflect.DeepEqual(tc.doc.TfIdf, tc.expDoc.TfIdf) {
-			t.Errorf("got %v, wanted %v", tc.doc.TfIdf, tc.expDoc.TfIdf)
+		if !reflect.DeepEqual(tc.doc.tfIdf, tc.expDoc.tfIdf) {
+			t.Errorf("got %v, wanted %v", tc.doc.tfIdf, tc.expDoc.tfIdf)
 		}
 
-		if !approxEq(tc.doc.Norm, tc.expDoc.Norm) {
-			t.Errorf("got %.4f, wanted %.4f", tc.doc.Norm, tc.expDoc.Norm)
+		if !approxEq(tc.doc.norm, tc.expDoc.norm) {
+			t.Errorf("got %.4f, wanted %.4f", tc.doc.norm, tc.expDoc.norm)
 		}
 
-		if tc.doc.TermFreq != nil {
-			t.Errorf("got %v, wanted nil", tc.doc.TermFreq)
+		if tc.doc.termFreq != nil {
+			t.Errorf("got %v, wanted nil", tc.doc.termFreq)
 		}
 	}
 }
@@ -70,13 +70,13 @@ func TestCalcNorm(t *testing.T) {
 	}{
 		{
 			Document{
-				TfIdf: TermMap{},
+				tfIdf: termMap{},
 			},
 			0.0,
 		},
 		{
 			Document{
-				TfIdf: TermMap{
+				tfIdf: termMap{
 					"foo": 2.0,
 					"bar": 3.0,
 					"baz": 4.0,

diff --git a/lib/output.go → corpus/output.go b/lib/output.go → corpus/output.go
@@ -1,28 +1,28 @@
-package main
+package corpus
 
 import (
 	"fmt"
 	"sort"
 )
 
-type Score struct {
-	Query    *Document
-	Document *Document
-	Score    float64
+type score struct {
+	query    *Document
+	document *Document
+	score    float64
 }
 
-func printResults(scores []Score, config Config) {
+func PrintResults(scores []score, config Config) {
 	// Sort results by score, best matches first
 	sort.Slice(scores, func(i, j int) bool {
-		return scores[i].Score > scores[j].Score
+		return scores[i].score > scores[j].score
 	})
 
 	if config.Limit > 0 && len(scores) > config.Limit {
 		scores = scores[0:config.Limit]
 	}
 
 	if !config.BestFirst {
-		tmp := make([]Score, len(scores))
+		tmp := make([]score, len(scores))
 		for i, score := range scores {
 			tmp[len(scores)-i-1] = score
 		}
@@ -31,9 +31,9 @@ func printResults(scores []Score, config Config) {
 
 	for _, score := range scores {
 		if config.ShowScores {
-			fmt.Printf("%.4f\t%s\n", score.Score, score.Document.Path)
+			fmt.Printf("%.4f\t%s\n", score.score, score.document.path)
 		} else {
-			fmt.Println(score.Document.Path)
+			fmt.Println(score.document.path)
 		}
 	}
 }
diff --git a/corpus/similarity.go b/corpus/similarity.go
@@ -0,0 +1,29 @@
+package corpus
+
+func (corpus *Corpus) SimilarDocuments(query *Document) []score {
+	// Normalize query document to set TF-IDF weights per the corpus
+	query.normalizeTfIdf(corpus.invDocFreq)
+
+	scores := make([]score, len(corpus.documents))
+	for i, doc := range corpus.documents {
+		score := score{
+			query:    query,
+			document: doc,
+			score:    doc.cosineSimilarity(query),
+		}
+
+		scores[i] = score
+	}
+
+	return scores
+}
+
+func (target *Document) cosineSimilarity(other *Document) float64 {
+	dotProd := 0.0
+
+	for term, weight := range target.tfIdf {
+		dotProd += (weight * other.tfIdf[term])
+	}
+
+	return dotProd / (target.norm * other.norm)
+}
diff --git a/lib/similarity_test.go → corpus/similarity_test.go b/lib/similarity_test.go → corpus/similarity_test.go
@@ -1,4 +1,4 @@
-package main
+package corpus
 
 import (
 	"math"
@@ -18,14 +18,14 @@ func TestCosineSimilarity(t *testing.T) {
 		sim    float64
 	}
 
-	docA := Document{TfIdf: TermMap{"foo": 0.3013, "bar": 0.2628}}
-	docA.Norm = docA.calcNorm()
+	docA := Document{tfIdf: termMap{"foo": 0.3013, "bar": 0.2628}}
+	docA.norm = docA.calcNorm()
 
-	docB := Document{TfIdf: TermMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
-	docB.Norm = docB.calcNorm()
+	docB := Document{tfIdf: termMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
+	docB.norm = docB.calcNorm()
 
-	docC := Document{TfIdf: TermMap{"foo": 0.2260, "quux": 0.6496}}
-	docC.Norm = docC.calcNorm()
+	docC := Document{tfIdf: termMap{"foo": 0.2260, "quux": 0.6496}}
+	docC.norm = docC.calcNorm()
 
 	cosTests := []cosTest{
 		{&docA, &docA, 1.0},

diff --git a/lib/stem.go → corpus/stem.go b/lib/stem.go → corpus/stem.go
@@ -1,4 +1,4 @@
-package main
+package corpus
 
 import "github.com/reiver/go-porterstemmer"