Skip to content

Commit

Permalink
Add a --stoplist flag for custom stoplists
Browse files Browse the repository at this point in the history
  • Loading branch information
hrs committed May 23, 2023
1 parent 6d73861 commit 609ebe7
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 1 deletion.
16 changes: 15 additions & 1 deletion lib/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,19 @@ import (
"os"
)

// stoplist resolves the value of the --stoplist flag to the Stoplist that
// should be used. An empty value selects DefaultStoplist; any other value is
// passed to ParseStoplist as a file path. If the custom stoplist cannot be
// read, the error is logged and the program exits via log.Fatalf.
func stoplist(flag string) *Stoplist {
	if flag == "" {
		return &DefaultStoplist
	}

	custom, err := ParseStoplist(flag)
	if err != nil {
		// Fatalf rather than Fatal: Print-style formatting adds no space
		// between a string operand and the error, which ran the message
		// and the error text together.
		log.Fatalf("Error reading custom stoplist: %v", err)
	}
	return custom
}

func main() {
bestFirstFlag := flag.Bool("best-first", false, "print best matches first")
followSymlinksFlag := flag.Bool("follow-symlinks", false, "included symlinked files in results")
Expand All @@ -17,6 +30,7 @@ func main() {
omitQueryFlag := flag.Bool("omit-query", false, "don't include the query file itself in search results")
queryFlag := flag.String("query", "", "path to the file that results should match")
showScoresFlag := flag.Bool("show-scores", false, "print scores next to file paths")
stoplistFlag := flag.String("stoplist", "", "path to a file of words to be ignored")
verboseFlag := flag.Bool("verbose", false, "include debugging information and errors")
flag.Parse()

Expand All @@ -28,7 +42,7 @@ func main() {
NoStoplist: *noStoplistFlag,
OmitQuery: *omitQueryFlag,
ShowScores: *showScoresFlag,
Stoplist: &DefaultStoplist,
Stoplist: stoplist(*stoplistFlag),
Verbose: *verboseFlag,
}

Expand Down
25 changes: 25 additions & 0 deletions lib/stoplist.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package main

import (
"bufio"
"os"
"strings"
)

type Stoplist map[string]bool

func (stoplist *Stoplist) Include(term string) bool {
Expand All @@ -17,6 +23,25 @@ func NewStoplist(terms []string) *Stoplist {
return &stoplist
}

// ParseStoplist reads the file at path and builds a Stoplist from its
// whitespace-separated words, lowercasing each one.
//
// It returns an error if the file cannot be opened or if scanning the file
// fails part-way through.
func ParseStoplist(path string) (*Stoplist, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	scanner.Split(bufio.ScanWords)

	var terms []string
	for scanner.Scan() {
		terms = append(terms, strings.ToLower(scanner.Text()))
	}
	// Scan returns false on failure as well as at EOF; without this check a
	// read error or an over-long token would silently yield a truncated
	// stoplist.
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	return NewStoplist(terms), nil
}

var DefaultStoplist = Stoplist{
"'ll": true,
"'ve": true,
Expand Down
14 changes: 14 additions & 0 deletions man/docsim.1
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,23 @@ Don't filter out common words, like "the" and "because".
Don't include the query document itself in the results, even if it's in a
directory being searched.
.TP
.BR \-\-query " " \fIFILE\fR
Use \fIFILE\fR as the query against which all other files will be compared.
.TP
.BR \-\-show\-scores
Include the cosine similarity between each document and the query in the results.
.TP
.BR \-\-stoplist " " \fISTOPLIST\fR
Provide a custom stoplist to use instead of the default English stoplist.
\fISTOPLIST\fR should be a text file of whitespace-separated words, conventionally
one word per line. Words are lowercased when read. Those words will be
excluded from textual analysis.
.PP
.RS
Generally, stoplists are filled with common words (like "the" and "because" in
English, or "char" and "struct" in C) that don't carry significant semantic
value.
.RE
.TP
.BR \-\-verbose
Print extra debugging information.
.LP
Expand Down

0 comments on commit 609ebe7

Please sign in to comment.