-
Notifications
You must be signed in to change notification settings - Fork 0
/
cosineSimilarity.ts
110 lines (93 loc) · 3.41 KB
/
cosineSimilarity.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// reproduced from https://github.com/sumn2u/string-comparison/blob/master/jscosine.js
// https://sumn2u.medium.com/string-similarity-comparision-in-js-with-examples-4bae35f13968
import {ComparisonStrategy, ComparisonStrategyResultObject} from "../atomic.js";
/** Dictionary from a token (word) to a numeric value such as its occurrence count. */
interface StrMap {
  [term: string]: number;
}
/** Dictionary from a token (word) to a boolean flag; used as a set of known tokens. */
interface BoolMap {
  [term: string]: boolean;
}
/**
 * Counts how often each space-separated token occurs in `str`.
 *
 * The string is split on single space characters only, so consecutive spaces
 * produce empty-string tokens and an empty input yields a single `""` token.
 *
 * @param str - text to tokenize
 * @returns a prototype-less map from token to its occurrence count
 */
function termFreqMap(str: string) {
  // A null-prototype object is used so that tokens such as "constructor",
  // "toString" or "__proto__" are counted like any other word instead of
  // colliding with members inherited from Object.prototype.
  const termFreq: StrMap = Object.create(null);
  for (const word of str.split(' ')) {
    termFreq[word] = (termFreq[word] ?? 0) + 1;
  }
  return termFreq;
}
/**
 * Marks every own key of `map` as present in `dict` (mutates `dict` in place).
 *
 * @param map - token map whose keys should be registered
 * @param dict - accumulator that receives `true` for each key of `map`
 */
function addKeysToDict(map: StrMap, dict: BoolMap) {
  // Object.keys only yields own enumerable keys, so properties inherited
  // through the prototype chain never end up in the dictionary.
  Object.keys(map).forEach((term) => {
    dict[term] = true;
  });
}
/**
 * Projects a term-frequency map onto the vocabulary given by `dict`.
 *
 * @param map - token counts for one string
 * @param dict - vocabulary; only its own keys (and their order) are used
 * @returns one entry per vocabulary term: the term's count in `map`, or 0 when
 *          `map` has no own entry for it
 */
function termFreqMapToVector(map: StrMap, dict: StrMap | BoolMap): number[] {
  const hasOwn = Object.prototype.hasOwnProperty;
  return Object.keys(dict).map((term) => {
    // Only own entries count: a plain-object map would otherwise answer
    // lookups like "constructor" with functions inherited from Object.prototype.
    const count = hasOwn.call(map, term) ? map[term] : undefined;
    return count || 0;
  });
}
/**
 * Dot product of two numeric vectors.
 *
 * Walks the positions of `vecA` and multiplies each entry with the entry at
 * the same position in `vecB`, summing the results.
 */
function vecDotProduct(vecA: number[], vecB: number[]) {
  return vecA.reduce((total, a, idx) => total + a * vecB[idx], 0);
}
/**
 * Euclidean length of a vector: the square root of the sum of its squared
 * entries. An empty vector has magnitude 0.
 */
function vecMagnitude(vec: number[]) {
  const sumOfSquares = vec.reduce((total, x) => total + x * x, 0);
  return Math.sqrt(sumOfSquares);
}
/**
 * Cosine of the angle between two vectors: their dot product divided by the
 * product of their magnitudes.
 *
 * If either vector has zero magnitude the division is 0 / 0 and the result
 * is NaN.
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
  const dot = vecDotProduct(vecA, vecB);
  const magnitudeProduct = vecMagnitude(vecA) * vecMagnitude(vecB);
  return dot / magnitudeProduct;
}
/**
 * Cosine similarity of two strings based on whole-token (word) frequencies.
 *
 * Both strings are turned into term-frequency maps, a shared vocabulary of
 * every token seen in either string is built, each map is projected onto that
 * vocabulary as a vector, and the cosine of the two vectors is returned.
 * Token order within the strings does not affect the result.
 *
 * @param strA - first string
 * @param strB - second string
 * @returns the cosine similarity of the two term-frequency vectors
 */
export const calculateCosineSimilarity = (strA: string, strB: string): number => {
  const termFreqA = termFreqMap(strA);
  const termFreqB = termFreqMap(strB);

  // The vocabulary has no prototype so every token is stored as an ordinary
  // key; on a plain object, assigning `true` to "__proto__" is silently ignored.
  const dict = Object.create(null);
  addKeysToDict(termFreqA, dict);
  addKeysToDict(termFreqB, dict);

  const termFreqVecA = termFreqMapToVector(termFreqA, dict);
  const termFreqVecB = termFreqMapToVector(termFreqB, dict);

  return cosineSimilarity(termFreqVecA, termFreqVecB);
};
/**
 * Shared definition of the cosine strategy: a name plus a scoring function.
 *
 * The scoring function reports the cosine similarity as `rawScore` and the
 * same value multiplied by 100 as `score`.
 */
const cosineBaseStrategy: ComparisonStrategy<ComparisonStrategyResultObject> = {
  name: 'cosine',
  strategy: (valA: string, valB: string) => {
    const similarity = calculateCosineSimilarity(valA, valB);
    // Anything above 0.99999 is snapped to exactly 1
    // (presumably to absorb floating-point rounding for equal inputs).
    const rawScore = similarity > 0.99999 ? 1 : similarity;
    return {
      score: rawScore * 100,
      rawScore,
    };
  },
};
/**
 * Compares whole tokens (words) within a string, independent of their order.
 *
 * This strategy reports itself as not valid when either string has fewer than
 * 4 space-separated tokens: it does not compare characters, so scores for
 * short sentences or for single words with typos are inaccurate.
 *
 * If you'd like to use it even in these scenarios, build your own strategy
 * array using cosineStrategyAggressive instead of this one.
 */
export const cosineStrategy: ComparisonStrategy<ComparisonStrategyResultObject> = {
  ...cosineBaseStrategy,
  isValid: (valA: string, valB: string) => {
    // Cosine only compares full tokens (words), not characters, which makes its
    // score very inaccurate for low token-count strings. The strategy is
    // therefore only valid when BOTH strings have at least 4 tokens.
    const minTokens = 4;
    const hasEnoughTokens = (val: string) => val.split(' ').length >= minTokens;
    return hasEnoughTokens(valA) && hasEnoughTokens(valB);
  },
};
/**
 * Cosine strategy without a token-count gate.
 *
 * It is a plain copy of cosineBaseStrategy and therefore defines no `isValid`
 * check of its own. It is intended to always run (strings are always treated
 * as valid), which may lead to inaccurate scores for low token-count strings.
 * */
export const cosineStrategyAggressive: ComparisonStrategy<ComparisonStrategyResultObject> = {
...cosineBaseStrategy
}