From 1f5b52dfd38a96e74628c91efaf3d6a25b02b050 Mon Sep 17 00:00:00 2001 From: 3y3 <3y3@ya.ru> Date: Wed, 25 Sep 2024 17:32:45 +0300 Subject: [PATCH] fix: Improve phrase scoring --- src/worker/score.ts | 61 ++++++++++++++++++++------------------------ src/worker/search.ts | 22 ++++++++++++---- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/src/worker/score.ts b/src/worker/score.ts index faf6435..e7ee50b 100644 --- a/src/worker/score.ts +++ b/src/worker/score.ts @@ -13,7 +13,7 @@ type ScoreState = { score: number; prev: ResultToken | null | undefined; curr: ResultToken; - phrase: string; + phrase: string[]; position: Position; }; @@ -89,7 +89,7 @@ export function phrased(result: Index.Result, terms: string[]) { prev: null, curr: token, position: token.position.slice() as Position, - phrase: token.text, + phrase: [token.text], }; return match; @@ -101,7 +101,7 @@ export function phrased(result: Index.Result, terms: string[]) { state.score = 0; state.position = state.curr.position.slice() as Position; - state.phrase = state.curr.text; + state.phrase = [state.curr.text]; if (!tokens.length) { return end; @@ -117,20 +117,12 @@ export function phrased(result: Index.Result, terms: string[]) { state.prev = state.curr; state.curr = tokens.shift() as ResultToken; - state.phrase += ' ' + state.curr.text; + state.phrase.push(state.curr.text); return match; } function match() { - if (terms.includes(state.curr.text as string)) { - return scoreToken; - } else { - return scoreWildcard; - } - } - - function scoreToken() { const {prev, curr} = state; state.score += 2; @@ -139,12 +131,8 @@ export function phrased(result: Index.Result, terms: string[]) { return nextToken; } - // This is partially buggy, if phrase has more that one similar token - if (distance(prev.position, curr.position) <= MERGE_TOLERANCE) { - if (phrase.includes(state.phrase)) { - state.score += 10; - } - + if (isPhrase(phrase, state.phrase, distance(prev.position, curr.position))) { + state.score += 10; state.position[1] = curr.position[1]; return nextToken; @@ -153,22 +141,6 @@ export function phrased(result: Index.Result, terms: string[]) { return nextScore; } - function scoreWildcard() { - const {prev, curr} = state; - - state.score += 0.5; - - if (prev && distance(prev.position, curr.position) <= MERGE_TOLERANCE) { - if (phrase.includes(state.phrase)) { - state.score += 0.5; - } - - state.position[1] = state.curr.position[1]; - } - - return nextScore; - } - function end() { results = dedupe(results); return null; @@ -234,6 +206,27 @@ function dedupe(tokens: ScoreResult[]) { return result; } +function isPhrase(phrase: string, tokens: string[], distance: number) { + if (distance > MERGE_TOLERANCE) { + return false; + } + + tokens = tokens.slice(); + + let index = 0; + while (tokens.length && index > -1) { + const token = tokens.shift() as string; + + index = phrase.indexOf(token, index); + + if (index > -1) { + index += token.length; + } + } + + return index > -1; +} + function isIntersection(a: Position, b: Position) { return (a[1] >= b[0] && a[1] <= b[1]) || (a[1] >= b[0] && a[1] <= b[1]); } diff --git a/src/worker/search.ts b/src/worker/search.ts index 05082f9..b4420c8 100644 --- a/src/worker/search.ts +++ b/src/worker/search.ts @@ -30,7 +30,7 @@ interface FixedClause extends Query.Clause { const makeStrategies = (tolerance: number, index: Index, clauses: FixedClause[], sealed: boolean) => [ - tolerance > -1 && + tolerance >= 0 && function precise(query: Query) { query.clauses = clauses.slice(); @@ -42,7 +42,7 @@ const makeStrategies = (tolerance: number, index: Index, clauses: FixedClause[], } } }, - tolerance > 0 && + tolerance >= 1 && function trailingWildcard(query: Query) { query.clauses = clauses.map((clause) => { if (clause.presence !== Query.presence.PROHIBITED) { @@ -51,7 +51,7 @@ const makeStrategies = (tolerance: number, index: Index, clauses: FixedClause[], return clause; }); }, - tolerance > 1 && + tolerance >= 2 && function bothWildcard(query: Query) { query.clauses = clauses.map((clause) => { if (clause.presence !== Query.presence.PROHIBITED) { @@ -102,14 +102,26 @@ export function search( } function wildcard(clause: FixedClause, mode: Query.wildcard) { + const requiredLength = + [ + // eslint-disable-next-line no-bitwise + mode & Query.wildcard.TRAILING ? 2 : 0, + // eslint-disable-next-line no-bitwise + mode & Query.wildcard.LEADING ? 2 : 0, + ].reduce((a, b) => a + b, 0) + 1; + + if (clause.term.length < requiredLength) { + return; + } + // eslint-disable-next-line no-bitwise if (mode & Query.wildcard.TRAILING) { - clause.term = clause.term + '*'; + clause.term = clause.term.slice(0, -1) + '*'; } // eslint-disable-next-line no-bitwise if (mode & Query.wildcard.LEADING) { - clause.term = '*' + clause.term; + clause.term = '*' + clause.term.slice(1); } clause.wildcard = mode;