Ensure all Unicode characters are parsed correctly
The previous implementation only handled ASCII characters! There's no reason to
do that---Go supports Unicode just fine, and it's actually *easier* to do the
right thing---so this adds Unicode letter and number support in parsing.

It also extracts token parsing out into a separate function, which makes the
tests more readable.

Since it's now easier to test, this also backfills test coverage for tokens that
include numbers.
hrs committed Jun 22, 2023
1 parent 68956e7 commit 073bed8
Showing 2 changed files with 73 additions and 78 deletions.
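
Before the diff, the gist of the fix: the old tokenizer decided "is this rune part of a word?" with a hard-coded ASCII range check, while Go's `unicode` package answers the same question for every Unicode letter and number category. A minimal standalone sketch of the difference (illustrative only; `isASCIIAlnum` is a stand-in for the old check, not code from this repository):

```go
package main

import (
	"fmt"
	"unicode"
)

func main() {
	// The old-style predicate only recognizes ASCII lower-case letters and
	// digits, so a rune like 'ï' is wrongly treated as a word boundary.
	isASCIIAlnum := func(r rune) bool {
		return (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
	}

	for _, r := range "naïve" {
		fmt.Printf("%c: ascii=%v unicode=%v\n",
			r, isASCIIAlnum(r), unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	// Output:
	//   n: ascii=true unicode=true
	//   a: ascii=true unicode=true
	//   ï: ascii=false unicode=true
	//   v: ascii=true unicode=true
	//   e: ascii=true unicode=true
}
```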
80 changes: 42 additions & 38 deletions corpus/document.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"unicode"
 )
 
 type termMap map[string]float64
@@ -55,75 +56,78 @@ func ParseDocument(path string, config *Config) (*Document, error) {
 	return doc, nil
 }
 
-func NewDocument(rd io.Reader, config *Config) (*Document, error) {
-	// Create a scanner from the file
+func parseTokens(rd io.Reader) ([]string, error) {
 	scanner := bufio.NewScanner(rd)
 
 	// Set the split function for the scanning operation
 	scanner.Split(bufio.ScanWords)
 
-	termCount := make(termMap)
-	totalWordCount := 0.0
+	tokens := []string{}
 
-	// Loop over the words, stem them if configured, pass them through the
-	// stoplist if configured, and, for each that "should" "count", increment it
-	// in the term map.
 	for scanner.Scan() {
-		token := strings.ToLower(scanner.Text())
+		// Lower-case each word and replace curly apostrophes with single quotes.
+		token := strings.Map(
+			normalizeApostrophe,
+			strings.ToLower(scanner.Text()),
+		)
 
 		// Split each token on non-alphanumeric characters (except single quotes, to
 		// handle contractions)
 		for _, word := range strings.FieldsFunc(token, splitToken) {
 			// Since we didn't split on single quotes, we need to trim them off now.
-			// We'd like "don't" to stay "don't", but "'hello" to become "hello". We
-			// also now need to replace any "’" characters with regular single
-			// quotes to match the stoplist's expectations.
-			word = strings.Map(
-				normalizeApostrophe,
-				strings.Trim(word, apostropheRunes),
-			)
+			// We'd like "don't" to stay "don't", but "'hello" to become "hello".
+			word = strings.Trim(word, apostropheRunes)
 
 			// Similarly, we need to remove the common "'s" possessive case
 			word = strings.TrimSuffix(word, "'s")
 
-			if word != "" && (config.NoStoplist || !config.Stoplist.include(word)) {
-				if config.NoStemming {
-					termCount[word]++
-				} else {
-					termCount[stem(word)]++
-				}
-
-				totalWordCount++
+			if word != "" {
+				tokens = append(tokens, word)
 			}
 		}
 
 	}
 
 	// Check for errors in scanning
 	if err := scanner.Err(); err != nil {
 		return nil, err
 	}
 
+	return tokens, nil
+}
+
+func NewDocument(rd io.Reader, config *Config) (*Document, error) {
+	termCount := make(termMap)
+	totalTermCount := 0.0
+
+	tokens, err := parseTokens(rd)
+	if err != nil {
+		return nil, err
+	}
+
+	// Loop over the tokens, stem them if configured, pass them through the
+	// stoplist if configured, and, for each that "should" "count", increment it
+	// in the term map.
+	for _, token := range tokens {
+		if config.NoStoplist || !config.Stoplist.include(token) {
+			if config.NoStemming {
+				termCount[token]++
+			} else {
+				termCount[stem(token)]++
+			}
+
+			totalTermCount++
+		}
+	}
+
 	// Scale the term frequency map according to the total number of terms in the document.
 	termFreq := make(termMap)
 	for term, count := range termCount {
-		termFreq[term] = count / totalWordCount
+		termFreq[term] = count / totalTermCount
 	}
 
 	return &Document{termFreq: termFreq}, nil
 }
 
 func splitToken(r rune) bool {
-	if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
-		return false
-	}
-
-	for _, apostrophe := range apostropheRunes {
-		if r == apostrophe {
-			return false
-		}
-	}
-
-	return true
+	return !(unicode.IsLetter(r) || unicode.IsNumber(r) || r == '\'')
 }
 
 func normalizeApostrophe(r rune) rune {
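
To see the effect end to end, here is a small standalone sketch that drives `strings.FieldsFunc` with the new `splitToken` predicate from the diff above (the `normalizeApostrophe` and possessive-trimming steps are omitted for brevity):

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// splitToken reports whether r separates tokens: anything that is not a
// letter, a number, or an apostrophe is a boundary.
func splitToken(r rune) bool {
	return !(unicode.IsLetter(r) || unicode.IsNumber(r) || r == '\'')
}

func main() {
	text := strings.ToLower("Naïve née señor, 1337 hAx0r!")
	fmt.Println(strings.FieldsFunc(text, splitToken))
	// Output: [naïve née señor 1337 hax0r]
}
```

Note that `strings.ToLower` is also Unicode-aware, so "Naïve" lower-cases to "naïve" with no extra work.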
71 changes: 31 additions & 40 deletions corpus/document_test.go
@@ -6,6 +6,37 @@ import (
 	"testing"
 )
 
+func TestParseTokens(t *testing.T) {
+	tests := []struct {
+		text string
+		exp  []string
+	}{
+		{
+			"naïve née señor",
+			[]string{"naïve", "née", "señor"},
+		},
+		{
+			"1337 hAx0r",
+			[]string{"1337", "hax0r"},
+		},
+		{
+			"examples: isn't 'isn't' wasn’t 'wasn’t' ‘won't’ ‘won't’ ‘shan’t’ ‘shan’t’",
+			[]string{"examples", "isn't", "isn't", "wasn't", "wasn't", "won't", "won't", "shan't", "shan't"},
+		},
+	}
+
+	for _, tc := range tests {
+		got, err := parseTokens(strings.NewReader(tc.text))
+		if err != nil {
+			t.Errorf("got unexpected error %v", err)
+		}
+
+		if !reflect.DeepEqual(got, tc.exp) {
+			t.Errorf("got %#v, wanted %#v", got, tc.exp)
+		}
+	}
+}
+
 func TestNewDocument(t *testing.T) {
 	sampleText := "It had two positions, and scrawled in pencil on the metal switch body were the words 'magic' and 'more magic'."
 
@@ -119,46 +150,6 @@ func TestNewDocument(t *testing.T) {
 	}
 }
 
-func TestParsingApostrophes(t *testing.T) {
-	sampleText := "examples: isn't 'isn't' wasn’t 'wasn’t' ‘won't’ ‘won't’ ‘shan’t’ ‘shan’t’"
-
-	config := &Config{
-		NoStemming: true,
-		NoStoplist: true,
-	}
-
-	expected := termMap{
-		"examples": 0.1111,
-		"isn't":    0.2222,
-		"wasn't":   0.2222,
-		"won't":    0.2222,
-		"shan't":   0.2222,
-	}
-
-	got, err := NewDocument(strings.NewReader(sampleText), config)
-	if err != nil {
-		t.Errorf("got unexpected error %v", err)
-	}
-
-	for gotTerm := range got.termFreq {
-		_, expKey := expected[gotTerm]
-		if !expKey {
-			t.Errorf("parsed unexpected term '%s'", gotTerm)
-		}
-	}
-
-	for expTerm, expFreq := range expected {
-		gotFreq, ok := got.termFreq[expTerm]
-		if !ok {
-			t.Errorf("found unexpected term '%s' in termFreq", expTerm)
-		}
-
-		if !approxEq(gotFreq, expFreq) {
-			t.Errorf("for term '%s' got %.4f, wanted %.4f", expTerm, gotFreq, expFreq)
-		}
-	}
-}
-
 func TestNormalizeTfIdf(t *testing.T) {
 	tm := termMap{
 		"foo": 2.0,
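
Assuming the repository follows a standard Go module layout, the new table-driven test can be run on its own with `go test ./corpus -run TestParseTokens`.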
