From 32a02a878255d810f85faf7c5a98d8fc62fde4ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Massot?=
Date: Tue, 18 Jul 2023 01:52:26 +0900
Subject: [PATCH] Add multilang tokenizer (#3608)

* Add multilang tokenizer.
* Bump lindera and tantivy version.
* Fix cargo and fmt.
* Add tokenizers bench.
* Take into account comments from review. Bump lindera version to 2.6.0.
* Increase postgresql timeout for tests.
* Put multilang in custom tokenizer, avoid copying lindera dictionaries.
* Remove lindera tantivy dep, clean.
* Fix test.
* Fix build.
* Add multilang in tokenizer manager.
* Fix lindera features.
---
 .../wikipedia/multilang-index-config.yaml      |  34 ++
 quickwit/Cargo.lock                            | 434 +++++++++++++++++-
 quickwit/Cargo.toml                            |   8 +-
 quickwit/quickwit-cli/Cargo.toml               |   4 +
 quickwit/quickwit-doc-mapper/Cargo.toml        |   7 +-
 .../src/default_doc_mapper/tokenizer_entry.rs  |  12 +-
 .../quickwit-doc-mapper/src/doc_mapper.rs      |  40 ++
 .../src/metastore/postgresql_metastore.rs      |   7 +-
 quickwit/quickwit-query/Cargo.toml             |  24 +-
 .../benches/multilang_tokenizers_bench.rs      | 170 +++++++
 quickwit/quickwit-query/src/lib.rs             |   4 +-
 .../src/tokenizers/chinese_compatible.rs       | 279 +++++++++++
 .../code_tokenizer.rs}                         | 368 +--------------
 quickwit/quickwit-query/src/tokenizers/mod.rs  | 133 ++++++
 .../src/tokenizers/multilang.rs                | 342 ++++++++++++++
 15 files changed, 1488 insertions(+), 378 deletions(-)
 create mode 100644 config/tutorials/wikipedia/multilang-index-config.yaml
 create mode 100644 quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs
 create mode 100644 quickwit/quickwit-query/src/tokenizers/chinese_compatible.rs
 rename quickwit/quickwit-query/src/{tokenizers.rs => tokenizers/code_tokenizer.rs} (52%)
 create mode 100644 quickwit/quickwit-query/src/tokenizers/mod.rs
 create mode 100644 quickwit/quickwit-query/src/tokenizers/multilang.rs

diff --git a/config/tutorials/wikipedia/multilang-index-config.yaml b/config/tutorials/wikipedia/multilang-index-config.yaml
new file mode 100644
index 00000000000..002a9cda9e6
--- /dev/null
+++ b/config/tutorials/wikipedia/multilang-index-config.yaml
@@ -0,0 +1,34 @@
+#
+# Index config file for multilang wikipedia datasets.
+# + +version: 0.6 + +index_id: multilang-wikipedia + +doc_mapping: + tokenizers: + - name: multilang + type: multilang + field_mappings: + - name: title + type: text + tokenizer: multilang + record: position + stored: true + fieldnorms: true + - name: body + type: text + tokenizer: multilang + record: position + stored: true + fieldnorms: true + - name: url + type: text + tokenizer: raw + +search_settings: + default_search_fields: [title, body] + +indexing_settings: + commit_timeout_secs: 10 diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 1a7f9f17557..50b327fae1e 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -922,6 +922,15 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -1105,6 +1114,27 @@ dependencies = [ "either", ] +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cast" version = "0.3.0" @@ -1421,6 +1451,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "795bc6e66a8e340f075fcf6227e417a2dc976b92b91f3cdc778bb858778b6747" +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "convert_case" version = "0.4.0" @@ -1941,15 +1977,88 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" 
+version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" -version = "0.8.29" +version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a74ea89a0a1b98f6332de42c95baff457ada66d1cb4030f9ff151b2041a1c746" +checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "enum-iterator" version = "1.4.1" @@ -2074,6 +2183,18 @@ dependencies = [ "instant", ] +[[package]] +name = "filetime" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.2.16", + "windows-sys 0.48.0", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -2972,6 +3093,233 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "lindera-cc-cedict" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69c983c7c5068266e882449172a9583b04745045180d5118a52ea0e69743476b" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-decompress", + "once_cell", + "zip", +] + +[[package]] +name = "lindera-cc-cedict-builder" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2e8f2ca97ddf952fe340642511b9c14b373cb2eef711d526bb8ef2ca0969b8" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-compress" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f72b460559bcbe8a9cee85ea4a5056133ed3abf373031191589236e656d65b59" +dependencies = [ + "anyhow", + "flate2", + "lindera-decompress", +] + +[[package]] +name = "lindera-core" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f586eb8a9393c32d5525e0e9336a3727bd1329674740097126f3b0bff8a1a1ea" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "encoding_rs", + "log", + "once_cell", + "serde", + "thiserror", + "yada", +] + +[[package]] +name = "lindera-decompress" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fb1facd8da698072fcc7338bd757730db53d59f313f44dd583fa03681dcc0e1" +dependencies = [ + "anyhow", + "flate2", + "serde", +] + +[[package]] +name = "lindera-dictionary" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ec7be7410b1da7017a8948986b87af67082f605e9a716f0989790d795d677f0c" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "lindera-cc-cedict", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-ipadic", + "lindera-ipadic-builder", + "lindera-ipadic-neologd-builder", + "lindera-ko-dic", + "lindera-ko-dic-builder", + "lindera-unidic-builder", + "serde", +] + +[[package]] +name = "lindera-ipadic" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3db2c39d5b69cb7b69df6edb44863d38991e0eb0037d38396604c1e65106a5db" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "flate2", + "lindera-core", + "lindera-decompress", + "lindera-ipadic-builder", + "once_cell", + "tar", +] + +[[package]] +name = "lindera-ipadic-builder" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "705d07f8a45d04fd95149f7ad41a26d1f9e56c9c00402be6f9dd05e3d88b99c6" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding_rs", + "encoding_rs_io", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "serde", + "yada", +] + +[[package]] +name = "lindera-ipadic-neologd-builder" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633a93983ba13fba42328311a501091bd4a7aff0c94ae9eaa9d4733dd2b0468a" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding_rs", + "encoding_rs_io", + "env_logger", + "glob", + "lindera-core", + "lindera-decompress", + "log", + "serde", + "yada", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a428e0d316b6c86f51bd919479692bc41ad840dba266ebc044663970f431ea18" +dependencies = [ + "bincode", + "byteorder", + "encoding", + "flate2", + "lindera-core", + "lindera-decompress", + "lindera-ko-dic-builder", + "once_cell", + "tar", +] + +[[package]] +name = "lindera-ko-dic-builder" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a5288704c6b8a069c0a1705c38758e836497698b50453373ab3d56c6f9a7ef8" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-tokenizer" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106ba439b2e87529d9bbedbb88d69f635baba1195c26502b308f55a85885fc81" +dependencies = [ + "bincode", + "byteorder", + "lindera-core", + "lindera-dictionary", + "once_cell", + "serde", + "serde_json", +] + +[[package]] +name = "lindera-unidic-builder" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b698227fdaeac32289173ab389b990d4eb00a40cbc9912020f69a0c491dabf55" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "encoding", + "env_logger", + "glob", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -3809,6 +4157,17 @@ dependencies = [ "regex", ] +[[package]] +name = "password-hash" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" +dependencies = [ + "base64ct", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "paste" version = "1.0.14" 
@@ -3826,6 +4185,18 @@ dependencies = [ "snafu", ] +[[package]] +name = "pbkdf2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + [[package]] name = "pem" version = "1.1.1" @@ -4781,6 +5152,7 @@ dependencies = [ "once_cell", "proptest", "quickwit-datetime", + "quickwit-proto", "quickwit-query", "regex", "serde", @@ -5133,6 +5505,9 @@ dependencies = [ "base64 0.21.2", "criterion", "hex", + "lindera-core", + "lindera-dictionary", + "lindera-tokenizer", "once_cell", "proptest", "quickwit-datetime", @@ -5142,6 +5517,8 @@ dependencies = [ "tantivy", "thiserror", "time 0.3.23", + "tracing", + "whichlang", ] [[package]] @@ -6869,7 +7246,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.1.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#3c300666ad448386136d2595b613b3236b123ff9" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=3c30066#7575f9bf1ca61bca6b5fee472a46880973003ee8" dependencies = [ "serde", ] @@ -6880,6 +7257,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tar" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec96d2ffad078296368d46ff1cb309be1c23c513b4ab0e22a45de0185275ac96" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.6.0" @@ -8127,6 +8515,11 @@ dependencies = [ "once_cell", ] +[[package]] +name = "whichlang" +version = "0.1.0" +source = "git+https://github.com/quickwit-oss/whichlang?rev=fe406416#fe406416cbad9849f790a274d25b2a53caccd2fa" + [[package]] name = "whoami" version = "1.4.1" @@ -8376,12 +8769,27 @@ dependencies = [ "tap", ] +[[package]] +name = "xattr" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" +dependencies = [ + "libc", +] + [[package]] name = "xmlparser" version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" +[[package]] +name = "yada" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" + [[package]] name = "yaml-rust" version = "0.4.5" @@ -8403,6 +8811,26 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "aes", + "byteorder", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time 0.3.23", + "zstd 0.11.2+zstd.1.5.2", +] + [[package]] name = "zstd" version = "0.11.2+zstd.1.5.2" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 41724a60041..69b6f3ef266 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -89,6 +89,9 @@ itertools = "0.11" json_comments = "0.2" libz-sys = "1.1.8" lru = "0.11" +lindera-core = "0.27.0" +lindera-dictionary = "0.27.0" 
+lindera-tokenizer = { version = "0.27.0", features = ["ipadic", "ipadic-compress", "cc-cedict", "cc-cedict-compress", "ko-dic", "ko-dic-compress"] } matches = "0.1.9" md5 = "0.7" mime_guess = "2.0.4" @@ -178,6 +181,7 @@ username = "0.2" utoipa = "3.3.0" uuid = { version = "1.4", features = ["v4", "serde"] } warp = "0.3" +whichlang = { git = "https://github.com/quickwit-oss/whichlang", rev = "fe406416" } wiremock = "0.5" aws-config = "0.55.0" @@ -232,8 +236,8 @@ tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3c30066", d ] } # This is actually not used directly the goal is to fix the version -# used by reqwest. 0.8.30 has an unclear license. -encoding_rs = "=0.8.29" +# used by reqwest. +encoding_rs = "=0.8.32" # vrl deps, at the end because the feature list for vrl-stdlib is long vrl = { git = "https://github.com/vectordotdev/vrl", rev = "v0.3.0", default-features = false, features = [ diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index ecd58504015..70ca0dc2fb8 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -80,6 +80,7 @@ quickwit-actors = { workspace = true, features = ["testsuite"] } quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-config = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } +quickwit-storage = { workspace = true, features = ["testsuite"] } [features] jemalloc = ["dep:tikv-jemalloc-ctl", "dep:tikv-jemallocator"] @@ -95,6 +96,7 @@ release-feature-set = [ "quickwit-indexing/vrl", "quickwit-metastore/azure", "quickwit-metastore/postgres", + "quickwit-doc-mapper/multilang", ] release-feature-vendored-set = [ "jemalloc", @@ -104,6 +106,7 @@ release-feature-vendored-set = [ "quickwit-indexing/vendored-kafka", "quickwit-metastore/azure", "quickwit-metastore/postgres", + "quickwit-doc-mapper/multilang", ] release-macos-feature-vendored-set = [ "jemalloc", @@ -113,4 +116,5 @@ release-macos-feature-vendored-set = [ "quickwit-indexing/vendored-kafka-macos", "quickwit-metastore/azure", "quickwit-metastore/postgres", + "quickwit-doc-mapper/multilang", ] diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index fd93e1356f6..89344f06c8f 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -31,17 +31,20 @@ typetag = { workspace = true } utoipa = { workspace = true } quickwit-datetime = { workspace = true } -quickwit-query = { workspace = true } +quickwit-query = { workspace = true, features = ["multilang"] } [dev-dependencies] criterion = { workspace = true } matches = { workspace = true } proptest = { workspace = true } +quickwit-proto = { workspace = true } +quickwit-query = { workspace = true, features = ["testsuite"] } serde_yaml = { workspace = true } time = { workspace = true } [features] -testsuite = [] +multilang = ["quickwit-query/multilang"] +testsuite = ["multilang"] [[bench]] name = "doc_to_json_bench" diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs index 14b5aa7eea4..79788d3492b 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs @@ -18,6 +18,8 @@ // along with this program. If not, see . 
use anyhow::Context; +#[cfg(feature = "multilang")] +use quickwit_query::MultiLangTokenizer; use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH; use serde::{Deserialize, Serialize}; use tantivy::tokenizer::{ @@ -39,9 +41,9 @@ pub struct TokenizerEntry { #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, utoipa::ToSchema)] pub struct TokenizerConfig { #[serde(flatten)] - tokenizer_type: TokenizerType, + pub(crate) tokenizer_type: TokenizerType, #[serde(default)] - filters: Vec, + pub(crate) filters: Vec, } impl TokenizerConfig { @@ -49,6 +51,10 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), + #[cfg(feature = "multilang")] + TokenizerType::Multilang => { + TextAnalyzer::builder(MultiLangTokenizer::default()).dynamic() + } TokenizerType::Ngram(options) => { let tokenizer = NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only) @@ -121,6 +127,8 @@ impl TokenFilterType { #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { Simple, + #[cfg(feature = "multilang")] + Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index 45215b9eb8f..b867d1fdab5 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -525,4 +525,44 @@ mod tests { wi_cloned.merge(wi_2); assert_eq!(wi_cloned, wi_base); } + + #[test] + #[cfg(feature = "testsuite")] + fn test_doc_mapper_query_with_multilang_field() { + use quickwit_query::query_ast::TermQuery; + + use crate::default_doc_mapper::{ + QuickwitTextOptions, QuickwitTextTokenizer, TokenizerType, + }; + use crate::{TokenizerConfig, TokenizerEntry}; + let mut doc_mapper_builder = DefaultDocMapperBuilder::default(); + doc_mapper_builder.field_mappings.push(FieldMappingEntry { + name: "multilang".to_string(), + mapping_type: FieldMappingType::Text( + QuickwitTextOptions { + tokenizer: Some(QuickwitTextTokenizer::from_static("multilang")), + ..Default::default() + }, + Cardinality::SingleValue, + ), + }); + doc_mapper_builder.tokenizers.push(TokenizerEntry { + name: "multilang".to_string(), + config: TokenizerConfig { + tokenizer_type: TokenizerType::Multilang, + filters: vec![], + }, + }); + let doc_mapper = doc_mapper_builder.try_build().unwrap(); + let schema = doc_mapper.schema(); + let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { + field: "multilang".to_string(), + value: "JPN:す".to_string(), + }); + let (query, _) = doc_mapper.query(schema, &query_ast, false).unwrap(); + assert_eq!( + format!("{query:?}"), + r#"TermQuery(Term(field=0, type=Str, "JPN:す"))"# + ); + } } diff --git a/quickwit/quickwit-metastore/src/metastore/postgresql_metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgresql_metastore.rs index b80061f3538..7a6ae58379d 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgresql_metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgresql_metastore.rs @@ -119,11 +119,16 @@ impl PostgresqlMetastore { postgres_metastore_config: &PostgresMetastoreConfig, connection_uri: &Uri, ) -> MetastoreResult { + let acquire_timeout = if cfg!(any(test, feature = "testsuite")) { + Duration::from_secs(20) + } else { + Duration::from_secs(2) + }; let connection_pool = establish_connection( connection_uri, 1, 
postgres_metastore_config.max_num_connections.get(), - Duration::from_secs(2), + acquire_timeout, Some(Duration::from_secs(1)), None, ) diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index c4a80ea51eb..37a4fd0f942 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -13,19 +13,41 @@ documentation = "https://quickwit.io/docs/" anyhow = { workspace = true } base64 = { workspace = true } hex = { workspace = true } +lindera-core = { workspace = true, optional = true} +lindera-dictionary = { workspace = true, optional = true } +lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } -quickwit-datetime = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } thiserror = { workspace = true } +tracing = { workspace = true } +whichlang = { workspace = true, optional = true } + +quickwit-datetime = { workspace = true } [dev-dependencies] criterion = { workspace = true } proptest = { workspace = true } time = { workspace = true } +[features] +multilang = [ + "lindera-core", + "lindera-dictionary", + "lindera-tokenizer", + "whichlang", +] + +testsuite = [ + "multilang", +] + [[bench]] name = "tokenizers_bench" harness = false + +[[bench]] +name = "multilang_tokenizers_bench" +harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs new file mode 100644 index 00000000000..0f4a5b7496f --- /dev/null +++ b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs @@ -0,0 +1,170 @@ +// Copyright (C) 2023 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use quickwit_query::create_default_quickwit_tokenizer_manager; +use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; + +// A random ascii string of length 100 chars. +const ASCII_SHORT: &str = "It is a long established fact"; +static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a + page when looking at its layout. The point of using Lorem Ipsum is that it has a + more-or-less normal distribution of letters, as opposed to using 'Content here, content + here', making it look like readable English. Many desktop publishing packages and web page + editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will + uncover many web sites still in their infancy. 
Various versions have evolved over the years, + sometimes by accident, sometimes on purpose (injected humour and the like)."#; +const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; +const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 + 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 + ただしこの動きは生長に伴うものであるため、 + 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 + 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 + 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 + 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 + つぼみが大きくなり花が開く素敵な言葉ですね."#; +const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; +const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 + 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 + 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, + 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; +const KOR_SHORT: &str = "안녕하세요. 반갑습니다."; +const KOR_LONG: &str = r#" +포근히 내려오는 눈밭속에서는 +낯이 붉은 處女아이들도 깃들이어 오는 소리… +울고 +웃고 +수구리고 +새파라니 얼어서 +運命들이 모두다 안끼어 드는 소리… +큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 +큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 +끊임없이 내리는 눈발 속에서는 +山도 山도 靑山도 안끼어 드는 소리 +"#; + +fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { + let mut token_stream = analyzer.token_stream(text); + let mut tokens: Vec = vec![]; + token_stream.process(&mut |token: &Token| tokens.push(token.clone())); + tokens +} + +pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("multilang"); + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); + let mut default_tokenizer = tokenizer_manager.get("default").unwrap(); + let mut multilang_tokenizer = tokenizer_manager.get("multilang").unwrap(); + let mut chinese_tokenizer = tokenizer_manager.get("chinese_compatible").unwrap(); + + group + .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) + .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { + b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) + .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { + b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) + .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) + .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; + group + .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) + .bench_with_input( + "multilang-tokenize-short-with-prefix", + &short_with_prefix, + |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }, + ); + let long_with_prefix = "ENG:".to_string() + ASCII_LONG; + group + .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) + .bench_with_input( + "multilang-tokenize-long-with-prefix", + &long_with_prefix, + |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }, + ); + group + .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) + .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) + .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + 
.throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) + .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) + .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) + .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) + .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { + b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); + }); + group + .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) + .bench_with_input( + "chinese-compatible-tokenize-cmn-short", + CMN_SHORT, + |b, text| { + b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); + }, + ); + group + .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) + .bench_with_input( + "chinese-compatible-tokenize-cmn-long", + CMN_LONG, + |b, text| { + b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); + }, + ); +} + +criterion_group!( + tokenizers_throughput_benches, + tokenizers_throughput_benchmark +); +criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index ae92071ef2a..ba5905f9e88 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -33,7 +33,7 @@ mod error; mod json_literal; mod not_nan_f32; pub mod query_ast; -mod tokenizers; +pub mod tokenizers; pub use elastic_query_dsl::{ElasticQueryDsl, OneFieldMap}; pub use error::InvalidQuery; @@ -42,6 +42,8 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; +#[cfg(feature = "multilang")] +pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, diff --git a/quickwit/quickwit-query/src/tokenizers/chinese_compatible.rs b/quickwit/quickwit-query/src/tokenizers/chinese_compatible.rs new file mode 100644 index 00000000000..1f326081e11 --- /dev/null +++ b/quickwit/quickwit-query/src/tokenizers/chinese_compatible.rs @@ -0,0 +1,279 @@ +// Copyright (C) 2023 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +use std::str::CharIndices; + +use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; + +#[derive(Clone)] +pub(crate) struct ChineseTokenizer; + +impl Tokenizer for ChineseTokenizer { + type TokenStream<'a> = ChineseTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + ChineseTokenStream { + text, + last_char: None, + chars: text.char_indices(), + token: Token::default(), + } + } +} + +pub(crate) struct ChineseTokenStream<'a> { + text: &'a str, + last_char: Option<(usize, char)>, + chars: CharIndices<'a>, + token: Token, +} + +fn char_is_cjk(c: char) -> bool { + // Block Range Comment + // CJK Unified Ideographs 4E00-9FFF Common + // CJK Unified Ideographs Extension A 3400-4DBF Rare + // CJK Unified Ideographs Extension B 20000-2A6DF Rare, historic + // CJK Unified Ideographs Extension C 2A700–2B73F Rare, historic + // CJK Unified Ideographs Extension D 2B740–2B81F Uncommon, some in current use + // CJK Unified Ideographs Extension E 2B820–2CEAF Rare, historic + matches!(c, + '\u{4500}'..='\u{9FFF}' | + '\u{3400}'..='\u{4DBF}' | + '\u{20000}'..='\u{2A6DF}' | + '\u{2A700}'..='\u{2CEAF}' // merge of extension C,D and E. + ) +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum Grouping { + Keep, + SplitKeep, + SplitIgnore, +} + +fn char_grouping(c: char) -> Grouping { + if c.is_alphanumeric() { + if char_is_cjk(c) { + Grouping::SplitKeep + } else { + Grouping::Keep + } + } else { + Grouping::SplitIgnore + } +} + +impl<'a> TokenStream for ChineseTokenStream<'a> { + fn advance(&mut self) -> bool { + self.token.text.clear(); + self.token.position = self.token.position.wrapping_add(1); + + let mut iter = self.last_char.take().into_iter().chain(&mut self.chars); + + while let Some((offset_from, c)) = iter.next() { + match char_grouping(c) { + Grouping::Keep => { + let offset_to = if let Some((next_index, next_char)) = + iter.find(|&(_, c)| char_grouping(c) != Grouping::Keep) + { + self.last_char = Some((next_index, next_char)); + next_index + } else { + self.text.len() + }; + + self.token.offset_from = offset_from; + self.token.offset_to = offset_to; + self.token.text.push_str(&self.text[offset_from..offset_to]); + return true; + } + Grouping::SplitKeep => { + let num_bytes_in_char = c.len_utf8(); + self.token.offset_from = offset_from; + self.token.offset_to = offset_from + num_bytes_in_char; + self.token + .text + .push_str(&self.text[offset_from..(self.token.offset_to)]); + return true; + } + Grouping::SplitIgnore => (), + } + } + false + } + + fn token(&self) -> &Token { + &self.token + } + + fn token_mut(&mut self) -> &mut Token { + &mut self.token + } +} + +#[cfg(test)] +mod tests { + use tantivy::tokenizer::{Token, TokenStream}; + + #[test] + fn test_chinese_tokenizer() { + let text = "Hello world, 你好世界, bonjour monde"; + let tokenizer_manager = crate::create_default_quickwit_tokenizer_manager(); + let mut tokenizer = tokenizer_manager.get("chinese_compatible").unwrap(); + let mut text_stream = tokenizer.token_stream(text); + + let mut res = Vec::new(); + while let Some(tok) = text_stream.next() { + res.push(tok.clone()); + } + + // latin alphabet splited on white spaces, Han split on each char + let expected = [ + Token { + offset_from: 0, + offset_to: 5, + position: 0, + text: "hello".to_owned(), + position_length: 1, + }, + Token { + offset_from: 6, + offset_to: 11, + position: 1, + text: "world".to_owned(), + position_length: 1, + }, + Token { + offset_from: 13, + offset_to: 16, + position: 2, + text: "你".to_owned(), + position_length: 1, + 
}, + Token { + offset_from: 16, + offset_to: 19, + position: 3, + text: "好".to_owned(), + position_length: 1, + }, + Token { + offset_from: 19, + offset_to: 22, + position: 4, + text: "世".to_owned(), + position_length: 1, + }, + Token { + offset_from: 22, + offset_to: 25, + position: 5, + text: "界".to_owned(), + position_length: 1, + }, + Token { + offset_from: 27, + offset_to: 34, + position: 6, + text: "bonjour".to_owned(), + position_length: 1, + }, + Token { + offset_from: 35, + offset_to: 40, + position: 7, + text: "monde".to_owned(), + position_length: 1, + }, + ]; + + assert_eq!(dbg!(res), dbg!(expected)); + } + + #[test] + fn test_chinese_tokenizer_no_space() { + let text = "Hello你好bonjour"; + let tokenizer_manager = crate::create_default_quickwit_tokenizer_manager(); + let mut tokenizer = tokenizer_manager.get("chinese_compatible").unwrap(); + let mut text_stream = tokenizer.token_stream(text); + + let mut res = Vec::new(); + while let Some(tok) = text_stream.next() { + res.push(tok.clone()); + } + + let expected = [ + Token { + offset_from: 0, + offset_to: 5, + position: 0, + text: "hello".to_owned(), + position_length: 1, + }, + Token { + offset_from: 5, + offset_to: 8, + position: 1, + text: "你".to_owned(), + position_length: 1, + }, + Token { + offset_from: 8, + offset_to: 11, + position: 2, + text: "好".to_owned(), + position_length: 1, + }, + Token { + offset_from: 11, + offset_to: 18, + position: 3, + text: "bonjour".to_owned(), + position_length: 1, + }, + ]; + + assert_eq!(res, expected); + } + + proptest::proptest! { + #[test] + fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") { + let tokenizer_manager = crate::create_default_quickwit_tokenizer_manager(); + let mut cn_tok = tokenizer_manager.get("chinese_compatible").unwrap(); + let mut default_tok = tokenizer_manager.get("default").unwrap(); + + let mut text_stream = cn_tok.token_stream(&text); + + let mut cn_res = Vec::new(); + while let Some(tok) = text_stream.next() { + cn_res.push(tok.clone()); + } + + let mut text_stream = default_tok.token_stream(&text); + + let mut default_res = Vec::new(); + while let Some(tok) = text_stream.next() { + default_res.push(tok.clone()); + } + + assert_eq!(cn_res, default_res); + } + } +} diff --git a/quickwit/quickwit-query/src/tokenizers.rs b/quickwit/quickwit-query/src/tokenizers/code_tokenizer.rs similarity index 52% rename from quickwit/quickwit-query/src/tokenizers.rs rename to quickwit/quickwit-query/src/tokenizers/code_tokenizer.rs index 19b0b25ed74..a96ffc7f216 100644 --- a/quickwit/quickwit-query/src/tokenizers.rs +++ b/quickwit/quickwit-query/src/tokenizers/code_tokenizer.rs @@ -20,71 +20,7 @@ use std::ops::Range; use std::str::CharIndices; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{ - LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer, - TokenizerManager, -}; - -pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; - -pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { - let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .build(); - - let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(); - - let tokenizer_manager = TokenizerManager::new(); - tokenizer_manager.register("raw", raw_tokenizer); - tokenizer_manager.register("chinese_compatible", chinese_tokenizer); - - tokenizer_manager.register( - 
"default", - TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - ); - tokenizer_manager.register( - "en_stem", - TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(), - ); - tokenizer_manager.register( - "source_code", - TextAnalyzer::builder(CodeTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - ); - - tokenizer_manager -} - -fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager { - let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .build(); - - let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) - .filter(LowerCaser) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .build(); - - let tokenizer_manager = TokenizerManager::new(); - tokenizer_manager.register("raw", raw_tokenizer); - tokenizer_manager.register("lowercase", lower_case_tokenizer); - tokenizer_manager -} +use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; /// TODO: add docs. #[derive(Clone, Default)] @@ -300,311 +236,11 @@ enum CharType { Delimiter, } -#[derive(Clone)] -struct ChineseTokenizer; - -impl Tokenizer for ChineseTokenizer { - type TokenStream<'a> = ChineseTokenStream<'a>; - - fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { - ChineseTokenStream { - text, - last_char: None, - chars: text.char_indices(), - token: Token::default(), - } - } -} - -struct ChineseTokenStream<'a> { - text: &'a str, - last_char: Option<(usize, char)>, - chars: CharIndices<'a>, - token: Token, -} - -fn char_is_cjk(c: char) -> bool { - // Block Range Comment - // CJK Unified Ideographs 4E00-9FFF Common - // CJK Unified Ideographs Extension A 3400-4DBF Rare - // CJK Unified Ideographs Extension B 20000-2A6DF Rare, historic - // CJK Unified Ideographs Extension C 2A700–2B73F Rare, historic - // CJK Unified Ideographs Extension D 2B740–2B81F Uncommon, some in current use - // CJK Unified Ideographs Extension E 2B820–2CEAF Rare, historic - matches!(c, - '\u{4500}'..='\u{9FFF}' | - '\u{3400}'..='\u{4DBF}' | - '\u{20000}'..='\u{2A6DF}' | - '\u{2A700}'..='\u{2CEAF}' // merge of extension C,D and E. 
- ) -} - -#[derive(Clone, Debug, Eq, PartialEq)] -enum Grouping { - Keep, - SplitKeep, - SplitIgnore, -} - -fn char_grouping(c: char) -> Grouping { - if c.is_alphanumeric() { - if char_is_cjk(c) { - Grouping::SplitKeep - } else { - Grouping::Keep - } - } else { - Grouping::SplitIgnore - } -} - -impl<'a> TokenStream for ChineseTokenStream<'a> { - fn advance(&mut self) -> bool { - self.token.text.clear(); - self.token.position = self.token.position.wrapping_add(1); - - let mut iter = self.last_char.take().into_iter().chain(&mut self.chars); - - while let Some((offset_from, c)) = iter.next() { - match char_grouping(c) { - Grouping::Keep => { - let offset_to = if let Some((next_index, next_char)) = - iter.find(|&(_, c)| char_grouping(c) != Grouping::Keep) - { - self.last_char = Some((next_index, next_char)); - next_index - } else { - self.text.len() - }; - - self.token.offset_from = offset_from; - self.token.offset_to = offset_to; - self.token.text.push_str(&self.text[offset_from..offset_to]); - return true; - } - Grouping::SplitKeep => { - let num_bytes_in_char = c.len_utf8(); - self.token.offset_from = offset_from; - self.token.offset_to = offset_from + num_bytes_in_char; - self.token - .text - .push_str(&self.text[offset_from..(self.token.offset_to)]); - return true; - } - Grouping::SplitIgnore => (), - } - } - false - } - - fn token(&self) -> &Token { - &self.token - } - - fn token_mut(&mut self) -> &mut Token { - &mut self.token - } -} - -pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager { - static QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER: Lazy = - Lazy::new(create_quickwit_fastfield_normalizer_manager); - &QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER -} - #[cfg(test)] mod tests { use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - use super::{create_default_quickwit_tokenizer_manager, CodeTokenizer}; - - #[test] - fn test_raw_tokenizer() { - let my_haiku = r#" - white sandy beach - a strong wind is coming - sand in my face - "#; - - let mut tokenizer = create_default_quickwit_tokenizer_manager() - .get("raw") - .unwrap(); - { - let mut haiku_stream = tokenizer.token_stream(my_haiku); - assert!(haiku_stream.advance()); - assert!(!haiku_stream.advance()); - } - { - let my_too_long_text = vec!["a".repeat(255)].join(""); - assert!(!tokenizer.token_stream(&my_too_long_text).advance()); - } - { - let my_long_text = vec!["a".repeat(254)].join(""); - assert!(tokenizer.token_stream(&my_long_text).advance()); - } - } - - #[test] - fn test_chinese_tokenizer() { - let text = "Hello world, 你好世界, bonjour monde"; - - let mut tokenizer = create_default_quickwit_tokenizer_manager() - .get("chinese_compatible") - .unwrap(); - let mut text_stream = tokenizer.token_stream(text); - - let mut res = Vec::new(); - while let Some(tok) = text_stream.next() { - res.push(tok.clone()); - } - - // latin alphabet splited on white spaces, Han split on each char - let expected = [ - Token { - offset_from: 0, - offset_to: 5, - position: 0, - text: "hello".to_owned(), - position_length: 1, - }, - Token { - offset_from: 6, - offset_to: 11, - position: 1, - text: "world".to_owned(), - position_length: 1, - }, - Token { - offset_from: 13, - offset_to: 16, - position: 2, - text: "你".to_owned(), - position_length: 1, - }, - Token { - offset_from: 16, - offset_to: 19, - position: 3, - text: "好".to_owned(), - position_length: 1, - }, - Token { - offset_from: 19, - offset_to: 22, - position: 4, - text: "世".to_owned(), - position_length: 1, - }, - Token { - offset_from: 22, - offset_to: 25, - position: 
5, - text: "界".to_owned(), - position_length: 1, - }, - Token { - offset_from: 27, - offset_to: 34, - position: 6, - text: "bonjour".to_owned(), - position_length: 1, - }, - Token { - offset_from: 35, - offset_to: 40, - position: 7, - text: "monde".to_owned(), - position_length: 1, - }, - ]; - - assert_eq!(res, expected); - } - - #[test] - fn test_chinese_tokenizer_no_space() { - let text = "Hello你好bonjour"; - - let mut tokenizer = create_default_quickwit_tokenizer_manager() - .get("chinese_compatible") - .unwrap(); - let mut text_stream = tokenizer.token_stream(text); - - let mut res = Vec::new(); - while let Some(tok) = text_stream.next() { - res.push(tok.clone()); - } - - let expected = [ - Token { - offset_from: 0, - offset_to: 5, - position: 0, - text: "hello".to_owned(), - position_length: 1, - }, - Token { - offset_from: 5, - offset_to: 8, - position: 1, - text: "你".to_owned(), - position_length: 1, - }, - Token { - offset_from: 8, - offset_to: 11, - position: 2, - text: "好".to_owned(), - position_length: 1, - }, - Token { - offset_from: 11, - offset_to: 18, - position: 3, - text: "bonjour".to_owned(), - position_length: 1, - }, - ]; - - assert_eq!(res, expected); - } - - proptest::proptest! { - #[test] - fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") { - let mut cn_tok = create_default_quickwit_tokenizer_manager().get("chinese_compatible").unwrap(); - let mut default_tok = create_default_quickwit_tokenizer_manager().get("default").unwrap(); - - let mut text_stream = cn_tok.token_stream(&text); - - let mut cn_res = Vec::new(); - while let Some(tok) = text_stream.next() { - cn_res.push(tok.clone()); - } - - let mut text_stream = default_tok.token_stream(&text); - - let mut default_res = Vec::new(); - while let Some(tok) = text_stream.next() { - default_res.push(tok.clone()); - } - - assert_eq!(cn_res, default_res); - } - } - - #[test] - fn test_code_tokenizer_in_tokenizer_manager() { - let mut code_tokenizer = create_default_quickwit_tokenizer_manager() - .get("source_code") - .unwrap(); - let mut token_stream = code_tokenizer.token_stream("PigCaféFactory2"); - let mut tokens = Vec::new(); - while let Some(token) = token_stream.next() { - tokens.push(token.text.to_string()); - } - assert_eq!(tokens, vec!["pig", "café", "factory", "2"]) - } + use super::CodeTokenizer; #[test] fn test_code_tokenizer() { diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs new file mode 100644 index 00000000000..d0ebfb1bb72 --- /dev/null +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -0,0 +1,133 @@ +// Copyright (C) 2023 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +mod chinese_compatible; +mod code_tokenizer; +#[cfg(feature = "multilang")] +mod multilang; + +use once_cell::sync::Lazy; +use tantivy::tokenizer::{ + LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, TokenizerManager, +}; + +use self::chinese_compatible::ChineseTokenizer; +pub use self::code_tokenizer::CodeTokenizer; +#[cfg(feature = "multilang")] +pub use self::multilang::MultiLangTokenizer; + +pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; + +/// Quickwit's tokenizer/analyzer manager. +pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { + let tokenizer_manager = TokenizerManager::default(); + let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .build(); + tokenizer_manager.register("raw", raw_tokenizer); + + let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .filter(LowerCaser) + .build(); + tokenizer_manager.register("chinese_compatible", chinese_tokenizer); + tokenizer_manager.register( + "source_code", + TextAnalyzer::builder(CodeTokenizer::default()) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .filter(LowerCaser) + .build(), + ); + #[cfg(feature = "multilang")] + tokenizer_manager.register( + "multilang_default", + TextAnalyzer::builder(MultiLangTokenizer::default()) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .filter(LowerCaser) + .build(), + ); + tokenizer_manager +} + +fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager { + let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .build(); + let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default()) + .filter(LowerCaser) + .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) + .build(); + let tokenizer_manager = TokenizerManager::new(); + tokenizer_manager.register("raw", raw_tokenizer); + tokenizer_manager.register("lowercase", lower_case_tokenizer); + tokenizer_manager +} + +pub fn get_quickwit_fastfield_normalizer_manager() -> &'static TokenizerManager { + static QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER: Lazy = + Lazy::new(create_quickwit_fastfield_normalizer_manager); + &QUICKWIT_FAST_FIELD_NORMALIZER_MANAGER +} + +#[cfg(test)] +mod tests { + + #[test] + fn test_tokenizers_in_manager() { + let tokenizer_manager = super::create_default_quickwit_tokenizer_manager(); + tokenizer_manager.get("chinese_compatible").unwrap(); + tokenizer_manager.get("default").unwrap(); + tokenizer_manager.get("raw").unwrap(); + } + + #[test] + fn test_raw_tokenizer() { + let tokenizer_manager = super::create_default_quickwit_tokenizer_manager(); + let my_haiku = r#" + white sandy beach + a strong wind is coming + sand in my face + "#; + let my_long_text = "a text, that is just too long, no one will type it, no one will like \ + it, no one shall find it. 
I just need some more chars, now you may \
+                            not pass.";
+
+        let mut tokenizer = tokenizer_manager.get("raw").unwrap();
+        let mut haiku_stream = tokenizer.token_stream(my_haiku);
+        assert!(haiku_stream.advance());
+        assert!(!haiku_stream.advance());
+        let mut other_tokenizer = tokenizer_manager.get("raw").unwrap();
+        let mut other_stream = other_tokenizer.token_stream(my_long_text);
+        assert!(other_stream.advance());
+        assert!(!other_stream.advance());
+    }
+
+    #[test]
+    fn test_code_tokenizer_in_tokenizer_manager() {
+        let mut code_tokenizer = super::create_default_quickwit_tokenizer_manager()
+            .get("source_code")
+            .unwrap();
+        let mut token_stream = code_tokenizer.token_stream("PigCaféFactory2");
+        let mut tokens = Vec::new();
+        while let Some(token) = token_stream.next() {
+            tokens.push(token.text.to_string());
+        }
+        assert_eq!(tokens, vec!["pig", "café", "factory", "2"])
+    }
+}
diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs
new file mode 100644
index 00000000000..9d45b8bb7cd
--- /dev/null
+++ b/quickwit/quickwit-query/src/tokenizers/multilang.rs
@@ -0,0 +1,342 @@
+// Copyright (C) 2023 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use lindera_core::mode::Mode;
+use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
+use lindera_tokenizer::token::Token as LinderaToken;
+use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer;
+use once_cell::sync::Lazy;
+use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer};
+use tracing::info;
+use whichlang::{detect_language, Lang};
+
+// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not
+// `lindera_tantivy::tokenizer::LinderaTokenizer` to avoid a costly copy of the
+// lindera dictionaries each time we clone the `MultiLangTokenizer`.
+
+/// Mandarin Chinese tokenizer.
+static CMN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
+    let cmn_dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::CcCedict),
+        path: None,
+    };
+    let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config)
+        .expect("Lindera `CcCedict` dictionary must be present");
+    info!("loading dict");
+    LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal)
+});
+
+/// Japanese tokenizer.
+static JPN_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
+    let jpn_dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::IPADIC),
+        path: None,
+    };
+    let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config)
+        .expect("Lindera `IPAD` dictionary must be present");
+    info!("loading dict");
+    LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal)
+});
+
+/// Korean tokenizer.
+static KOR_TOKENIZER: Lazy<LinderaTokenizer> = Lazy::new(|| {
+    let kor_dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::KoDic),
+        path: None,
+    };
+    let kor_dictionary = load_dictionary_from_config(kor_dictionary_config)
+        .expect("Lindera `KoDic` dictionary must be present");
+    info!("loading dict");
+    LinderaTokenizer::new(kor_dictionary, None, Mode::Normal)
+});
+
+/// Multilanguage tokenizer that uses the `whichlang` crate to detect the language of the text
+/// and uses the appropriate tokenizer for the detected language:
+/// - Lindera for Chinese, Japanese, and Korean.
+/// - Quickwit's default tokenizer for other languages.
+/// It is possible to bypass the language detection by prefixing the text with the language code
+/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the Korean tokenizer.
+/// Currently supported prefixes are:
+/// - `KOR:` for the Korean tokenizer
+/// - `JPN:` for the Japanese tokenizer
+/// - `CMN:` for the Chinese tokenizer
+/// - `ENG:` for Quickwit's default tokenizer
+#[derive(Clone, Default)]
+pub struct MultiLangTokenizer {
+    default_tokenizer: SimpleTokenizer,
+    token: Token,
+}
+
+impl Tokenizer for MultiLangTokenizer {
+    type TokenStream<'a> = MultiLanguageTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> {
+        self.token.reset();
+        let (language_prefix, text_to_tokenize) = get_language_from_prefix(text);
+        // If the text is empty, we return an empty token stream.
+        // `whichlang::detect_language` panics if the text is empty.
+        if text.trim().is_empty() {
+            return MultiLanguageTokenStream::Empty;
+        }
+        let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize));
+        match language {
+            Lang::Cmn => {
+                let lindera_token_stream = LinderaTokenStream {
+                    tokens: CMN_TOKENIZER
+                        .tokenize(text_to_tokenize)
+                        .expect("tokenize method should never fail"),
+                    token: &mut self.token,
+                };
+                MultiLanguageTokenStream::Lindera(lindera_token_stream)
+            }
+            Lang::Jpn => {
+                let lindera_token_stream = LinderaTokenStream {
+                    tokens: JPN_TOKENIZER
+                        .tokenize(text_to_tokenize)
+                        .expect("tokenize method should never fail"),
+                    token: &mut self.token,
+                };
+                MultiLanguageTokenStream::Lindera(lindera_token_stream)
+            }
+            Lang::Kor => {
+                let lindera_token_stream = LinderaTokenStream {
+                    tokens: KOR_TOKENIZER
+                        .tokenize(text_to_tokenize)
+                        .expect("tokenize method should never fail"),
+                    token: &mut self.token,
+                };
+                MultiLanguageTokenStream::Lindera(lindera_token_stream)
+            }
+            _ => MultiLanguageTokenStream::Simple(
+                self.default_tokenizer.token_stream(text_to_tokenize),
+            ),
+        }
+    }
+}
+
+/// Gets the language defined by a prefix `{ID}:text`, where `ID` is the 3-letter language code
+/// used by whichlang, and returns the language and the text without the prefix. If the prefix
+/// is not recognized, the language is `None` and the text is returned unchanged.
+fn get_language_from_prefix(text: &str) -> (Option<Lang>, &str) {
+    let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())];
+    // TODO: refactor.
+    let prefix_language = match prefix_bytes {
+        b"CMN:" => Some(Lang::Cmn),
+        b"ENG:" => Some(Lang::Eng),
+        b"JPN:" => Some(Lang::Jpn),
+        b"KOR:" => Some(Lang::Kor),
+        _ => None,
+    };
+    let text_without_prefix = if prefix_language.is_some() {
+        // This is safe as we know that the prefix is made of 4 ASCII characters.
+        &text[4..]
+    } else {
+        text
+    };
+    (prefix_language, text_without_prefix)
+}
+
+pub enum MultiLanguageTokenStream<'a> {
+    Empty,
+    Lindera(LinderaTokenStream<'a>),
+    Simple(SimpleTokenStream<'a>),
+}
+
+impl<'a> TokenStream for MultiLanguageTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        match self {
+            MultiLanguageTokenStream::Empty => false,
+            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(),
+            MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(),
+        }
+    }
+
+    fn token(&self) -> &Token {
+        match self {
+            MultiLanguageTokenStream::Empty => {
+                panic!("Cannot call token() on an empty token stream.")
+            }
+            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(),
+            MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(),
+        }
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        match self {
+            MultiLanguageTokenStream::Empty => {
+                panic!("Cannot call token_mut() on an empty token stream.")
+            }
+            MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(),
+            MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(),
+        }
+    }
+}
+
+pub struct LinderaTokenStream<'a> {
+    pub tokens: Vec<LinderaToken<'a>>,
+    pub token: &'a mut Token,
+}
+
+impl<'a> TokenStream for LinderaTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        if self.tokens.is_empty() {
+            return false;
+        }
+        let token = self.tokens.remove(0);
+        self.token.text = token.text.to_string();
+        self.token.offset_from = token.byte_start;
+        self.token.offset_to = token.byte_end;
+        self.token.position = token.position;
+        self.token.position_length = token.position_length;
+
+        true
+    }
+
+    fn token(&self) -> &Token {
+        self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.token
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+
+    use super::{get_language_from_prefix, MultiLangTokenizer, MultiLanguageTokenStream};
+
+    fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec<Token> {
+        let mut tokens: Vec<Token> = vec![];
+        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
+        tokens
+    }
+
+    #[test]
+    fn test_multilanguage_tokenizer_cmn() {
+        let mut tokenizer = MultiLangTokenizer::default();
+        let tokens = test_helper(
+            tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"),
+        );
+        assert_eq!(tokens.len(), 19);
+        {
+            let token = &tokens[0];
+            assert_eq!(token.text, "地址");
+            assert_eq!(token.offset_from, 0);
+            assert_eq!(token.offset_to, 6);
+            assert_eq!(token.position, 0);
+            assert_eq!(token.position_length, 1);
+        }
+    }
+
+    #[test]
+    fn test_multilanguage_tokenizer_jpn() {
+        let mut tokenizer = MultiLangTokenizer::default();
+        {
+            let tokens = test_helper(tokenizer.token_stream("すもももももももものうち"));
+            assert_eq!(tokens.len(), 7);
+            {
+                let token = &tokens[0];
+                assert_eq!(token.text, "すもも");
+                assert_eq!(token.offset_from, 0);
+                assert_eq!(token.offset_to, 9);
+                assert_eq!(token.position, 0);
+                assert_eq!(token.position_length, 1);
+            }
+        }
+        {
+            // Force usage of JPN tokenizer.
+            let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち"));
+            assert_eq!(tokens.len(), 7);
+        }
+        {
+            // Force usage of ENG tokenizer.
+            // This tokenizer will return only one token.
+            let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち"));
+            assert_eq!(tokens.len(), 1);
+        }
+    }
+
+    #[test]
+    fn test_multilanguage_tokenizer_kor() {
+        let mut tokenizer = MultiLangTokenizer::default();
+        {
+            let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다."));
+            assert_eq!(tokens.len(), 11);
+            {
+                let token = &tokens[0];
+                assert_eq!(token.text, "일본");
+                assert_eq!(token.offset_from, 0);
+                assert_eq!(token.offset_to, 6);
+                assert_eq!(token.position, 0);
+                assert_eq!(token.position_length, 1);
+            }
+        }
+        {
+            let tokens =
+                test_helper(tokenizer.token_stream("KOR:일본입니다. 매우 멋진 단어입니다."));
+            assert_eq!(tokens.len(), 11);
+        }
+        {
+            let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다"));
+            assert_eq!(tokens.len(), 1);
+        }
+    }
+
+    #[test]
+    fn test_multilanguage_tokenizer_with_empty_string() {
+        let mut tokenizer = MultiLangTokenizer::default();
+        {
+            let tokens = test_helper(tokenizer.token_stream(""));
+            assert_eq!(tokens.len(), 0);
+        }
+        {
+            let tokens = test_helper(tokenizer.token_stream(" "));
+            assert_eq!(tokens.len(), 0);
+        }
+    }
+
+    #[test]
+    fn test_multilanguage_process_language_prefix() {
+        {
+            let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち");
+            assert_eq!(lang, Some(whichlang::Lang::Jpn));
+            assert_eq!(text, "すもももももももものうち");
+        }
+        {
+            let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元");
+            assert_eq!(lang, Some(whichlang::Lang::Cmn));
+            assert_eq!(text, "地址1,包含無效的字元");
+        }
+        {
+            let (lang, text) = get_language_from_prefix("ENG:my address");
+            assert_eq!(lang, Some(whichlang::Lang::Eng));
+            assert_eq!(text, "my address");
+        }
+        {
+            let (lang, text) = get_language_from_prefix("UNK:my address");
+            assert!(lang.is_none());
+            assert_eq!(text, "UNK:my address");
+        }
+        {
+            let (lang, text) = get_language_from_prefix("");
+            assert!(lang.is_none());
+            assert_eq!(text, "");
+        }
+    }
+}
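
A minimal usage sketch of the language-prefix bypass described in the `MultiLangTokenizer` doc comment, placed after the patch for reference only. The `quickwit_query::tokenizers::multilang` import path and the `collect_tokens` helper are assumptions for illustration (adjust to however the module is actually re-exported); the expected token counts mirror the Japanese test above.

// Sketch only: not part of the patch. Assumes `MultiLangTokenizer` is reachable
// at this path; adjust the import to the actual module layout.
use quickwit_query::tokenizers::multilang::MultiLangTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

// Hypothetical helper: drains a token stream into the token texts.
fn collect_tokens(tokenizer: &mut MultiLangTokenizer, text: &str) -> Vec<String> {
    let mut token_stream = tokenizer.token_stream(text);
    let mut texts: Vec<String> = Vec::new();
    token_stream.process(&mut |token: &Token| texts.push(token.text.clone()));
    texts
}

fn main() {
    let mut tokenizer = MultiLangTokenizer::default();
    // No prefix: whichlang detects Japanese, so the IPADIC-based lindera tokenizer splits the text.
    assert_eq!(collect_tokens(&mut tokenizer, "すもももももももものうち").len(), 7);
    // `ENG:` bypasses detection and routes to the default simple tokenizer,
    // which keeps the whole Japanese run as a single token.
    assert_eq!(collect_tokens(&mut tokenizer, "ENG:すもももももももものうち").len(), 1);
}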