Ensure all Unicode characters are parsed correctly
The previous implementation only handled ASCII characters! There's no reason to
do that---Go supports Unicode just fine, and it's actually *easier* to do the
right thing---so this adds Unicode letter and number support in parsing.

It also extracts token parsing out into a separate function, which makes the
tests more readable.

Since it's now easier to test, this also backfills test coverage for tokens that
include numbers.
hrs committed Jun 22, 2023
1 parent 68956e7 commit 073bed8
Showing 2 changed files with 73 additions and 78 deletions.
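
Before the diff, the gist of the fix: the old tokenizer decided "is this rune part of a word?" with a hard-coded ASCII range check, while Go's `unicode` package answers the same question for every Unicode letter and number category. A minimal standalone sketch of the difference (illustrative only; `isASCIIAlnum` is a stand-in for the old check, not code from this repository):

```go
package main

import (
	"fmt"
	"unicode"
)

func main() {
	// The old-style predicate only recognizes ASCII lower-case letters and
	// digits, so a rune like 'ï' is wrongly treated as a word boundary.
	isASCIIAlnum := func(r rune) bool {
		return (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
	}

	for _, r := range "naïve" {
		fmt.Printf("%c: ascii=%v unicode=%v\n",
			r, isASCIIAlnum(r), unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	// Output:
	//   n: ascii=true unicode=true
	//   a: ascii=true unicode=true
	//   ï: ascii=false unicode=true
	//   v: ascii=true unicode=true
	//   e: ascii=true unicode=true
}
```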
80 changes: 42 additions & 38 deletions corpus/document.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"unicode"
 )
 
 type termMap map[string]float64
@@ -55,75 +56,78 @@ func ParseDocument(path string, config *Config) (*Document, error) {
 	return doc, nil
 }
 
-func NewDocument(rd io.Reader, config *Config) (*Document, error) {
-	// Create a scanner from the file
+func parseTokens(rd io.Reader) ([]string, error) {
 	scanner := bufio.NewScanner(rd)
 
 	// Set the split function for the scanning operation
 	scanner.Split(bufio.ScanWords)
 
-	termCount := make(termMap)
-	totalWordCount := 0.0
+	tokens := []string{}
 
-	// Loop over the words, stem them if configured, pass them through the
-	// stoplist if configured, and, for each that "should" "count", increment it
-	// in the term map.
 	for scanner.Scan() {
-		token := strings.ToLower(scanner.Text())
+		// Lower-case each word and replace curly apostrophes with single quotes.
+		token := strings.Map(
+			normalizeApostrophe,
+			strings.ToLower(scanner.Text()),
+		)
 
 		// Split each token on non-alphanumeric characters (except single quotes, to
 		// handle contractions)
 		for _, word := range strings.FieldsFunc(token, splitToken) {
 			// Since we didn't split on single quotes, we need to trim them off now.
-			// We'd like "don't" to stay "don't", but "'hello" to become "hello". We
-			// also now need to replace any "’" characters with regular single
-			// quotes to match the stoplist's expectations.
-			word = strings.Map(
-				normalizeApostrophe,
-				strings.Trim(word, apostropheRunes),
-			)
+			// We'd like "don't" to stay "don't", but "'hello" to become "hello".
+			word = strings.Trim(word, apostropheRunes)
 
 			// Similarly, we need to remove the common "'s" possessive case
 			word = strings.TrimSuffix(word, "'s")
 
-			if word != "" && (config.NoStoplist || !config.Stoplist.include(word)) {
-				if config.NoStemming {
-					termCount[word]++
-				} else {
-					termCount[stem(word)]++
-				}
-
-				totalWordCount++
+			if word != "" {
+				tokens = append(tokens, word)
 			}
 		}
 
 	}
 
 	// Check for errors in scanning
 	if err := scanner.Err(); err != nil {
 		return nil, err
 	}
 
+	return tokens, nil
+}
+
+func NewDocument(rd io.Reader, config *Config) (*Document, error) {
+	termCount := make(termMap)
+	totalTermCount := 0.0
+
+	tokens, err := parseTokens(rd)
+	if err != nil {
+		return nil, err
+	}
+
+	// Loop over the tokens, stem them if configured, pass them through the
+	// stoplist if configured, and, for each that "should" "count", increment it
+	// in the term map.
+	for _, token := range tokens {
+		if config.NoStoplist || !config.Stoplist.include(token) {
+			if config.NoStemming {
+				termCount[token]++
+			} else {
+				termCount[stem(token)]++
+			}
+
+			totalTermCount++
+		}
+	}
+
 	// Scale the term frequency map according to the total number of terms in the document.
 	termFreq := make(termMap)
 	for term, count := range termCount {
-		termFreq[term] = count / totalWordCount
+		termFreq[term] = count / totalTermCount
 	}
 
 	return &Document{termFreq: termFreq}, nil
 }
 
 func splitToken(r rune) bool {
-	if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
-		return false
-	}
-
-	for _, apostrophe := range apostropheRunes {
-		if r == apostrophe {
-			return false
-		}
-	}
-
-	return true
+	return !(unicode.IsLetter(r) || unicode.IsNumber(r) || r == '\'')
 }
 
 func normalizeApostrophe(r rune) rune {
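
To see the effect end to end, here is a small standalone sketch that drives `strings.FieldsFunc` with the new `splitToken` predicate from the diff above (the `normalizeApostrophe` and possessive-trimming steps are omitted for brevity):

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// splitToken reports whether r separates tokens: anything that is not a
// letter, a number, or an apostrophe is a boundary.
func splitToken(r rune) bool {
	return !(unicode.IsLetter(r) || unicode.IsNumber(r) || r == '\'')
}

func main() {
	text := strings.ToLower("Naïve née señor, 1337 hAx0r!")
	fmt.Println(strings.FieldsFunc(text, splitToken))
	// Output: [naïve née señor 1337 hax0r]
}
```

Note that `strings.ToLower` is also Unicode-aware, so "Naïve" lower-cases to "naïve" with no extra work.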
71 changes: 31 additions & 40 deletions corpus/document_test.go
@@ -6,6 +6,37 @@ import (
 	"testing"
 )
 
+func TestParseTokens(t *testing.T) {
+	tests := []struct {
+		text string
+		exp  []string
+	}{
+		{
+			"naïve née señor",
+			[]string{"naïve", "née", "señor"},
+		},
+		{
+			"1337 hAx0r",
+			[]string{"1337", "hax0r"},
+		},
+		{
+			"examples: isn't 'isn't' wasn’t 'wasn’t' ‘won't’ ‘won't’ ‘shan’t’ ‘shan’t’",
+			[]string{"examples", "isn't", "isn't", "wasn't", "wasn't", "won't", "won't", "shan't", "shan't"},
+		},
+	}
+
+	for _, tc := range tests {
+		got, err := parseTokens(strings.NewReader(tc.text))
+		if err != nil {
+			t.Errorf("got unexpected error %v", err)
+		}
+
+		if !reflect.DeepEqual(got, tc.exp) {
+			t.Errorf("got %#v, wanted %#v", got, tc.exp)
+		}
+	}
+}
+
 func TestNewDocument(t *testing.T) {
 	sampleText := "It had two positions, and scrawled in pencil on the metal switch body were the words 'magic' and 'more magic'."
 
@@ -119,46 +150,6 @@ func TestNewDocument(t *testing.T) {
 	}
 }
 
-func TestParsingApostrophes(t *testing.T) {
-	sampleText := "examples: isn't 'isn't' wasn’t 'wasn’t' ‘won't’ ‘won't’ ‘shan’t’ ‘shan’t’"
-
-	config := &Config{
-		NoStemming: true,
-		NoStoplist: true,
-	}
-
-	expected := termMap{
-		"examples": 0.1111,
-		"isn't":    0.2222,
-		"wasn't":   0.2222,
-		"won't":    0.2222,
-		"shan't":   0.2222,
-	}
-
-	got, err := NewDocument(strings.NewReader(sampleText), config)
-	if err != nil {
-		t.Errorf("got unexpected error %v", err)
-	}
-
-	for gotTerm := range got.termFreq {
-		_, expKey := expected[gotTerm]
-		if !expKey {
-			t.Errorf("parsed unexpected term '%s'", gotTerm)
-		}
-	}
-
-	for expTerm, expFreq := range expected {
-		gotFreq, ok := got.termFreq[expTerm]
-		if !ok {
-			t.Errorf("found unexpected term '%s' in termFreq", expTerm)
-		}
-
-		if !approxEq(gotFreq, expFreq) {
-			t.Errorf("for term '%s' got %.4f, wanted %.4f", expTerm, gotFreq, expFreq)
-		}
-	}
-}
-
 func TestNormalizeTfIdf(t *testing.T) {
 	tm := termMap{
 		"foo": 2.0,
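
Assuming the repository follows a standard Go module layout, the new table-driven test can be run on its own with `go test ./corpus -run TestParseTokens`.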
