From 609ebe7ccd7d0d40fd45f07602cdf252bce0a0a7 Mon Sep 17 00:00:00 2001
From: "Harry R. Schwartz"
Date: Tue, 23 May 2023 15:29:13 -0700
Subject: [PATCH] Add a --stoplist flag for custom stoplists

---
 lib/main.go     | 18 +++++++++++++++++-
 lib/stoplist.go | 33 +++++++++++++++++++++++++++++++++
 man/docsim.1    | 14 ++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/lib/main.go b/lib/main.go
index ec703ff..8fbf5bc 100644
--- a/lib/main.go
+++ b/lib/main.go
@@ -8,6 +8,21 @@ import (
 	"os"
 )
 
+// stoplist resolves the value of the --stoplist flag to a Stoplist: the
+// default stoplist when path is empty, otherwise the one parsed from the file
+// at path. It logs the error and exits if that file can't be parsed.
+func stoplist(path string) *Stoplist {
+	if path == "" {
+		return &DefaultStoplist
+	}
+
+	custom, err := ParseStoplist(path)
+	if err != nil {
+		log.Fatalf("Error reading custom stoplist: %v", err)
+	}
+	return custom
+}
+
 func main() {
 	bestFirstFlag := flag.Bool("best-first", false, "print best matches first")
 	followSymlinksFlag := flag.Bool("follow-symlinks", false, "included symlinked files in results")
@@ -17,6 +32,7 @@ func main() {
 	omitQueryFlag := flag.Bool("omit-query", false, "don't include the query file itself in search results")
 	queryFlag := flag.String("query", "", "path to the file that results should match")
 	showScoresFlag := flag.Bool("show-scores", false, "print scores next to file paths")
+	stoplistFlag := flag.String("stoplist", "", "path to a file of words to be ignored")
 	verboseFlag := flag.Bool("verbose", false, "include debugging information and errors")
 
 	flag.Parse()
@@ -28,7 +44,7 @@ func main() {
 		NoStoplist:     *noStoplistFlag,
 		OmitQuery:      *omitQueryFlag,
 		ShowScores:     *showScoresFlag,
-		Stoplist:       &DefaultStoplist,
+		Stoplist:       stoplist(*stoplistFlag),
 		Verbose:        *verboseFlag,
 	}
 
diff --git a/lib/stoplist.go b/lib/stoplist.go
index 3a3cdac..21fb98a 100644
--- a/lib/stoplist.go
+++ b/lib/stoplist.go
@@ -1,5 +1,11 @@
 package main
 
+import (
+	"bufio"
+	"os"
+	"strings"
+)
+
 type Stoplist map[string]bool
 
 func (stoplist *Stoplist) Include(term string) bool {
@@ -17,6 +23,33 @@ func NewStoplist(terms []string) *Stoplist {
 	return &stoplist
 }
 
+// ParseStoplist reads the file at path and builds a Stoplist from the
+// whitespace-separated words it contains, lowercasing each one. It returns an
+// error if the file can't be opened or can't be read to the end.
+func ParseStoplist(path string) (*Stoplist, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	scanner.Split(bufio.ScanWords)
+
+	terms := []string{}
+	for scanner.Scan() {
+		terms = append(terms, strings.ToLower(scanner.Text()))
+	}
+
+	// Scan reports false for a failure as well as for end of input, so check
+	// Err to avoid returning a silently truncated stoplist.
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return NewStoplist(terms), nil
+}
+
 var DefaultStoplist = Stoplist{
 	"'ll": true,
 	"'ve": true,
diff --git a/man/docsim.1 b/man/docsim.1
index 0ee1939..669e2de 100644
--- a/man/docsim.1
+++ b/man/docsim.1
@@ -65,9 +65,23 @@ Don't filter out common words, like "the" and "because".
 Don't include the query document itself in the results, even if it's in a
 directory being searched.
 .TP
+.BR \-\-query " " \fIFILE\fR
+Use \fIFILE\fR as the query against which all other files will be compared.
+.TP
 .BR \-\-show\-scores
 Include the cosine similarity between each document and the query in the results.
 .TP
+.BR \-\-stoplist " " \fISTOPLIST\fR
+Provide a custom stoplist to use instead of the default English stoplist.
+\fISTOPLIST\fR should be a text file of whitespace-separated words (for example,
+one word per line). Those words will be excluded from textual analysis.
+.PP
+.RS
+Generally stoplists are filled with common words (like "the" and "because" in
+English, or "char" and "struct" in C) that don't carry significant semantic
+value.
+.RE
+.TP
 .BR \-\-verbose
 Print extra debugging information.
 .LP