From 8efb23882f313e2088b1784c8f936a4be53e853e Mon Sep 17 00:00:00 2001 From: 3y3 <3y3@ya.ru> Date: Wed, 25 Sep 2024 18:11:28 +0300 Subject: [PATCH] feat: Add language specific scoring --- .eslintignore | 3 + .gitignore | 14 +-- .prettierignore | 5 +- .stylelintignore | 5 +- esbuild/build.mjs | 13 +++ esbuild/langs.mjs | 165 +++++++++++++++++++++++++++++++++++ package-lock.json | 28 ++++-- package.json | 15 +++- src/indexer/index.ts | 7 +- src/indexer/langs/index.d.ts | 5 ++ src/types.ts | 1 + src/worker/index.ts | 12 ++- src/worker/langs/index.d.ts | 3 + 13 files changed, 256 insertions(+), 20 deletions(-) create mode 100644 esbuild/langs.mjs create mode 100644 src/indexer/langs/index.d.ts create mode 100644 src/worker/langs/index.d.ts diff --git a/.eslintignore b/.eslintignore index 8a0682f..f3742a7 100644 --- a/.eslintignore +++ b/.eslintignore @@ -12,3 +12,6 @@ node_modules /src/__tests__ /jest.snapshots.js + +/src/indexer/langs/* +/src/worker/langs/* diff --git a/.gitignore b/.gitignore index 0846942..bd2b6f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,4 @@ node_modules -/plugin -/includer -/runtime -/index.d.ts -/types.d.ts -/types.js .vscode .idea @@ -16,4 +10,10 @@ node_modules /dist /build /cache -/coverage \ No newline at end of file +/coverage + +/src/indexer/langs/* +!/src/indexer/langs/index.d.ts + +/src/worker/langs/* +!/src/worker/langs/index.d.ts diff --git a/.prettierignore b/.prettierignore index 6afb065..b12547d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -8,4 +8,7 @@ node_modules /dist /build /cache -/coverage \ No newline at end of file +/coverage + +/src/indexer/langs/* +/src/worker/langs/* diff --git a/.stylelintignore b/.stylelintignore index 6afb065..b12547d 100644 --- a/.stylelintignore +++ b/.stylelintignore @@ -8,4 +8,7 @@ node_modules /dist /build /cache -/coverage \ No newline at end of file +/coverage + +/src/indexer/langs/* +/src/worker/langs/* diff --git a/esbuild/build.mjs b/esbuild/build.mjs index ced7e9d..4ef4ac1 100644 --- a/esbuild/build.mjs +++ b/esbuild/build.mjs @@ -1,11 +1,15 @@ import esbuild from 'esbuild'; import {TsconfigPathsPlugin} from '@esbuild-plugins/tsconfig-paths'; +import {indexer, worker} from './langs.mjs'; + const common = { tsconfig: './tsconfig.json', bundle: true, }; +await indexer('src/indexer/langs'); + esbuild.build({ ...common, target: 'node18', @@ -26,3 +30,12 @@ esbuild.build({ outdir: 'lib/worker', entryPoints: ['src/worker/index.ts'], }); + +esbuild.build({ + ...common, + target: 'ES6', + format: 'cjs', + platform: 'browser', + outdir: 'lib/worker/langs', + entryPoints: await worker('src/worker/langs'), +}); diff --git a/esbuild/langs.mjs b/esbuild/langs.mjs new file mode 100644 index 0000000..4c6a4cb --- /dev/null +++ b/esbuild/langs.mjs @@ -0,0 +1,165 @@ +import {resolve} from 'node:path'; +import {writeFile} from 'node:fs/promises'; +import {dedent} from 'ts-dedent'; + +const LANGS = [ + 'ar', + 'da', + 'de', + 'du', + 'el', + 'es', + 'fi', + 'fr', + 'he', + 'hu', + 'hy', + 'it', + 'ko', + 'nl', + 'no', + 'pt', + 'ro', + 'ru', + 'sv', + 'tr', + 'vi', + + // 'zh', + 'ja', + 'jp', + 'th', + 'hi', + 'ta', + 'sa', + 'kn', + 'te', +]; + +export async function indexer(outdir) { + for (const lang of LANGS) { + const exports = dedent` + export function ${lang}(lunr: any) { + ${attach(lang)} + + return (lunr as unknown as {[lang: string]: Builder.Plugin}).${lang} as Builder.Plugin; + } + `; + + const template = resolve(outdir, lang + '.ts'); + + await writeFile( + template, + dedent` + ${imports(lang)} + ${exports} + `, + 'utf8', + ); + } + + const template = resolve(outdir, 'index.ts'); + + await writeFile( + template, + dedent` + import type {Builder} from 'lunr'; + + ${LANGS.map((lang) => `import {${lang}} from './${lang}.js';`).join('\n')} + + type Langs = Record; + + export const langs: Langs = {${LANGS.join(', ')}}; + `, + 'utf8', + ); +} + +export async function worker(outdir) { + const entries = []; + + for (const lang of LANGS) { + const exports = dedent` + /// + /// + /// + + // Default type of \`self\` is \`WorkerGlobalScope & typeof globalThis\` + // https://github.com/microsoft/TypeScript/issues/14877 + declare const self: ServiceWorkerGlobalScope & { + language?: (lunr: any) => Builder.Plugin; + }; + + self.language = function(lunr: any) { + ${attach(lang)} + + return (lunr as unknown as {[lang: string]: Builder.Plugin}).${lang} as Builder.Plugin; + }; + `; + + const template = resolve(outdir, lang + '.ts'); + + await writeFile( + template, + dedent` + ${imports(lang)} + ${exports} + `, + 'utf8', + ); + + entries.push(template); + } + + const template = resolve(outdir, 'index.ts'); + + await writeFile( + template, + dedent` + type Langs = string[]; + + export const langs: Langs = [${LANGS.map((lang) => `'${lang}'`).join(', ')}]; + `, + 'utf8', + ); + + entries.push(template); + + return entries; +} + +function imports(lang) { + return dedent` + import type {Builder} from 'lunr'; + + // @ts-ignore + import stemmer from 'lunr-languages/lunr.stemmer.support'; + // @ts-ignore + import lang from 'lunr-languages/lunr.${lang}'; + ${ + ['ja', 'jp'].includes(lang) + ? ` + // @ts-ignore + import tinyseg from 'lunr-languages/tinyseg'; + ` + : '' + } + ${ + ['th', 'hi', 'ta', 'sa', 'kn', 'te'].includes(lang) + ? ` + // @ts-ignore + import wordcut from 'lunr-languages/wordcut'; + ` + : '' + } + `; +} + +function attach(lang) { + return dedent` + stemmer(lunr); + lang(lunr); + ${['ja', 'jp'].includes(lang) ? `tinyseg(lunr);` : ''} + ${['th', 'hi', 'ta', 'sa', 'kn', 'te'].includes(lang) ? `wordcut(lunr);` : ''} + `; +} diff --git a/package-lock.json b/package-lock.json index e7b9800..e069ab9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,20 +10,22 @@ "license": "MIT", "dependencies": { "lunr": "^2.3.9", + "lunr-languages": "^1.14.0", "node-html-parser": "^6.1.13" }, "devDependencies": { - "@diplodoc/client": "^3.0.0-beta-1", + "@diplodoc/client": "^3.0.2", "@diplodoc/components": "^4.13.0", "@diplodoc/lint": "^1.1.3", "@diplodoc/tsconfig": "^1.0.2", "@esbuild-plugins/tsconfig-paths": "^0.1.2", "@types/lunr": "^2.3.7", "esbuild": "^0.23.1", + "ts-dedent": "^2.2.0", "typescript": "^5.6.2" }, "peerDependencies": { - "@diplodoc/client": "^3.0.0-beta-1", + "@diplodoc/client": "^3.0.2", "@diplodoc/components": "^4.11.2" } }, @@ -576,9 +578,9 @@ } }, "node_modules/@diplodoc/client": { - "version": "3.0.0-beta-1", - "resolved": "https://registry.npmjs.org/@diplodoc/client/-/client-3.0.0-beta-1.tgz", - "integrity": "sha512-qKg7XpNLlwN3dVyd/mVofxdbVv+APW+E0K5ydrhbFtpVeCoaLM8zK4BHMivQS6CRKIaccJ6IVgOWU+TapDNo9g==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@diplodoc/client/-/client-3.0.2.tgz", + "integrity": "sha512-7QB7S8bK18b4NWV75fGRnp+Xn86vVpDrTUPbT0qCIzjfHdGJbdXwWpTI9GiTANWKz7o9myF/WkrHb7TSey340A==", "dev": true, "license": "ISC", "dependencies": { @@ -6542,6 +6544,12 @@ "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", "license": "MIT" }, + "node_modules/lunr-languages": { + "version": "1.14.0", + "resolved": "https://registry.npmjs.org/lunr-languages/-/lunr-languages-1.14.0.tgz", + "integrity": "sha512-hWUAb2KqM3L7J5bcrngszzISY4BxrXn/Xhbb9TTCJYEGqlR1nG67/M14sp09+PTIRklobrn57IAxcdcO/ZFyNA==", + "license": "MPL-1.1" + }, "node_modules/map-obj": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-4.3.0.tgz", @@ -9423,6 +9431,16 @@ "typescript": ">=4.2.0" } }, + "node_modules/ts-dedent": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/ts-dedent/-/ts-dedent-2.2.0.tgz", + "integrity": "sha512-q5W7tVM71e2xjHZTlgfTDoPF/SmqKG5hddq9SzR49CH2hayqRKJtQ4mtRlSxKaJlR/+9rEM+mnBHf7I2/BQcpQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.10" + } + }, "node_modules/tsconfig-paths": { "version": "3.15.0", "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz", diff --git a/package.json b/package.json index acf1d47..d64fd3b 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ "types": "./lib/indexer/index.d.ts", "scripts": { "build": "npm run build:clean && npm run build:code", - "build:code": "tsc --emitDeclarationOnly && node esbuild/build.mjs", + "build:code": "node esbuild/build.mjs && tsc --emitDeclarationOnly", "build:clean": "rm -rf lib", "prepublishOnly": "npm run build", "test": "exit 0", @@ -21,26 +21,33 @@ "types": "./lib/indexer/index.d.ts", "default": "./lib/indexer/index.js" }, - "./worker": "./lib/worker/index.js" + "./worker": "./lib/worker/index.js", + "./worker/langs": { + "types": "./lib/worker/langs/index.d.ts", + "default": "./lib/worker/langs/index.js" + }, + "./worker/langs/*": "./lib/worker/langs/*.js" }, "author": "", "license": "MIT", "devDependencies": { - "@diplodoc/client": "^3.0.0-beta-1", + "@diplodoc/client": "^3.0.2", "@diplodoc/components": "^4.13.0", "@diplodoc/lint": "^1.1.3", "@diplodoc/tsconfig": "^1.0.2", "@esbuild-plugins/tsconfig-paths": "^0.1.2", "@types/lunr": "^2.3.7", "esbuild": "^0.23.1", + "ts-dedent": "^2.2.0", "typescript": "^5.6.2" }, "dependencies": { "lunr": "^2.3.9", + "lunr-languages": "^1.14.0", "node-html-parser": "^6.1.13" }, "peerDependencies": { - "@diplodoc/client": "^3.0.0-beta-1", + "@diplodoc/client": "^3.0.2", "@diplodoc/components": "^4.11.2" } } diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 51adb16..48ffaa1 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -1,9 +1,10 @@ import type {DocPageData} from '@diplodoc/components'; -import {Builder} from 'lunr'; +import lunr, {Builder} from 'lunr'; import {INDEX_FIELDS} from '../constants'; +import {langs} from './langs'; import {html2text} from './html'; type DocumentInfo = { @@ -73,6 +74,10 @@ export class Indexer { private init(lang: string) { const index = new Builder(); + if (langs[lang]) { + index.use(langs[lang](lunr)); + } + index.ref('url'); for (const [field, boost] of Object.entries(INDEX_FIELDS)) { diff --git a/src/indexer/langs/index.d.ts b/src/indexer/langs/index.d.ts new file mode 100644 index 0000000..fd9960a --- /dev/null +++ b/src/indexer/langs/index.d.ts @@ -0,0 +1,5 @@ +import {Builder} from 'lunr'; + +type Langs = Record; + +export const langs: Langs; diff --git a/src/types.ts b/src/types.ts index 3ff8f09..589edd3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -11,6 +11,7 @@ export interface WorkerConfig extends ISearchWorkerConfig { resources: { index: string; registry: string; + language?: string; }; } diff --git a/src/worker/index.ts b/src/worker/index.ts index 842582e..fb72261 100644 --- a/src/worker/index.ts +++ b/src/worker/index.ts @@ -6,7 +6,7 @@ import type {Registry, WorkerConfig} from '../types'; import type {ISearchWorkerApi} from '@diplodoc/client'; -import {Index} from 'lunr'; +import lunr, {Builder, Index} from 'lunr'; import {search} from './search'; import {format, long, short} from './format'; @@ -16,6 +16,7 @@ import {format, long, short} from './format'; declare const self: ServiceWorkerGlobalScope & { config?: WorkerConfig; api?: ISearchWorkerApi; + language?: (lunr: unknown) => Builder.Plugin; }; const NOT_INITIALIZED = { @@ -73,6 +74,15 @@ async function load(): Promise<[Index, Registry]> { request(`${config.base}/${config.resources.index}`), request(`${config.base}/${config.resources.registry}`), ]); + + if (config.resources.language) { + importScripts(`${config.base}/${config.resources.language}`); + } + + if (self.language) { + self.language(lunr); + } + const index = Index.load(indexData); return [index, registry]; diff --git a/src/worker/langs/index.d.ts b/src/worker/langs/index.d.ts new file mode 100644 index 0000000..8567213 --- /dev/null +++ b/src/worker/langs/index.d.ts @@ -0,0 +1,3 @@ +type Langs = string[]; + +export const langs: Langs;