From adfb0f778c8216a2b4a7f01b5dfb7a40983d5dd0 Mon Sep 17 00:00:00 2001 From: Marcono1234 Date: Tue, 31 Oct 2023 01:09:27 +0100 Subject: [PATCH] Fix multi-language detection exceptions for multi-script text There were multiple issues: - `end` was previously not correctly set after a new section was started. This could erroneously lead to `end <= start` if there was a trailing single character in a different script. - `lettersCount` was one too high after a new section was started (it was set to 1 and then incremented again for the same letter). --- .../lingua/internal/MultiLanguageDetection.kt | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/MultiLanguageDetection.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/MultiLanguageDetection.kt index 82d3c89f..70bc0681 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/MultiLanguageDetection.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/MultiLanguageDetection.kt @@ -96,6 +96,13 @@ private open class PotentialSection( // Cache text to reduce number of created substrings private var cachedText: String? 
= null, ) { + init { + check(start < end) + check(lettersCount > 0) + // Should have at most as many letters as there are chars in section + check(lettersCount <= end - start) + } + fun getStart() = start fun getEnd() = end fun getLettersCount() = lettersCount @@ -182,24 +189,26 @@ private fun splitPotentialSections(text: String): MutableList if (char.isLetter()) { val script = UnicodeScript.of(char.code) - if (start != -1 && (hasLogograms || lettersCount >= minSectionLength) && + if (start == -1) { + // Start a new section + start = index + } + // Or check if current section should end + else if ((hasLogograms || lettersCount >= minSectionLength) && lastScript != null && !lastScript!!.belongsToSameLanguageAs(script) ) { sections.add(PotentialSection(start, index, lettersCount, text)) // Current letter is start of new section start = index - lettersCount = 1 + // Set to 0 instead of 1 because it is directly incremented below + lettersCount = 0 hasLogograms = false - } else { - if (start == -1) { - start = index - } - - // Mark current letter as potential last letter - end = index + 1 } + // Mark current letter as potential last letter + end = index + 1 + lastScript = script lettersCount++ hasLogograms = hasLogograms || char.isLogogram()