Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] Enable multi-token spell-checking #10052

Merged
merged 9 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.WordUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
Expand All @@ -47,6 +48,7 @@ public class MultiWordChunker extends AbstractDisambiguator {
private final String filename;
private final boolean allowFirstCapitalized;
private final boolean allowAllUppercase;
private final boolean allowTitlecase;

private volatile boolean initialized;
private Map<String, Integer> mStartSpace;
Expand All @@ -68,7 +70,7 @@ public class MultiWordChunker extends AbstractDisambiguator {
* @param filename text file with multiwords and tags
*/
public MultiWordChunker(String filename) {
this(filename, false, false);
this(filename, false, false, false);
}

/**
Expand All @@ -77,17 +79,21 @@ public MultiWordChunker(String filename) {
* multiword can be capitalized
* @param allowAllUppercase if set to {@code true}, the all uppercase
* version of the multiword is allowed
* @param allowTitlecase if set to {@code true}, titlecased variants
* of multi-token words are accepted
*/
public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase) {
public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, boolean allowTitlecase) {
this.filename = filename;
this.allowFirstCapitalized = allowFirstCapitalized;
this.allowAllUppercase = allowAllUppercase;
this.allowTitlecase = allowTitlecase;
}

public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, String defaultTag) {
public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, boolean allowTitlecase, String defaultTag) {
this.filename = filename;
this.allowFirstCapitalized = allowFirstCapitalized;
this.allowAllUppercase = allowAllUppercase;
this.allowTitlecase = allowTitlecase;
this.defaultTag = defaultTag;
}

Expand Down Expand Up @@ -137,18 +143,7 @@ private void fillMaps(Map<String, Integer> mStartSpace, Map<String, Integer> mSt
String originalToken = interner.computeIfAbsent(tokenAndTag[0], Function.identity());
String tag = interner.computeIfAbsent((defaultTag != null ? defaultTag:tokenAndTag[1]), Function.identity());
tokens.add(originalToken);
if (allowFirstCapitalized) {
String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken);
if (!mFull.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) {
tokens.add(tokenFirstCapitalized);
}
}
if (allowAllUppercase) {
String tokenAllUppercase = originalToken.toUpperCase();
if (!mFull.containsKey(tokenAllUppercase) && !originalToken.equals(tokenAllUppercase)) {
tokens.add(tokenAllUppercase);
}
}
tokens.addAll(getTokenLettercaseVariants(originalToken, mFull));
for (String token : tokens) {
boolean containsSpace = token.indexOf(' ') > 0;
String firstToken;
Expand Down Expand Up @@ -185,6 +180,36 @@ private void fillMaps(Map<String, Integer> mStartSpace, Map<String, Integer> mSt
}
}

/**
 * Builds the letter-case variants of a multiword entry that should be accepted in addition
 * to the entry exactly as written.
 * <p>
 * Depending on the flags given to the constructor, the candidates are, in this order: the
 * all-uppercase form, the form with only the first character uppercased, the naive titlecase
 * form (every word capitalized) and the "smart" titlecase form produced by
 * {@link StringTools#titlecaseGlobal(String)}. Titlecase forms are only generated when
 * first-letter capitalisation is allowed as well, when the entry consists of more than one
 * space-separated part, and when every part starts with a lowercase character.
 * <p>
 * A candidate is dropped when it equals the original entry, when it is already a key of
 * {@code tokenMap}, or when it has already been produced by an earlier step, so the result
 * never contains duplicates and never shadows a spelling that is already registered.
 *
 * @param originalToken the entry as written in the multiword file
 * @param tokenMap      entries registered so far, keyed by surface form
 * @return the additional variants, possibly empty, never {@code null}
 */
public List<String> getTokenLettercaseVariants(String originalToken, Map<String, AnalyzedToken> tokenMap) {
  List<String> newTokens = new ArrayList<>();
  if (allowAllUppercase) {
    addVariantIfNew(originalToken.toUpperCase(), originalToken, tokenMap, newTokens);
  }
  if (allowFirstCapitalized) {
    addVariantIfNew(StringTools.uppercaseFirstChar(originalToken), originalToken, tokenMap, newTokens);
    // Titlecasing is only relevant for multi-token entries that are entirely lowercase,
    // and it is deliberately tied to first-letter capitalisation being allowed.
    if (allowTitlecase && originalToken.split(" ").length > 1 && StringTools.allStartWithLowercase(originalToken)) {
      addVariantIfNew(WordUtils.capitalize(originalToken), originalToken, tokenMap, newTokens);
      addVariantIfNew(StringTools.titlecaseGlobal(originalToken), originalToken, tokenMap, newTokens);
    }
  }
  return newTokens;
}

/**
 * Appends {@code variant} to {@code variants} unless it is the original entry, is already
 * registered in {@code tokenMap}, or was already collected.
 */
private static void addVariantIfNew(String variant, String originalToken,
                                    Map<String, AnalyzedToken> tokenMap, List<String> variants) {
  if (!variant.equals(originalToken) && !tokenMap.containsKey(variant) && !variants.contains(variant)) {
    variants.add(variant);
  }
}

@Override
public AnalyzedSentence disambiguate(AnalyzedSentence input) throws IOException {
return disambiguate(input, null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.languagetool.tools;

import com.google.common.collect.Sets;
import com.google.common.xml.XmlEscapers;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.Nullable;
Expand All @@ -29,6 +30,8 @@
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.util.regex.Pattern.*;

Expand Down Expand Up @@ -75,6 +78,42 @@ public enum ApiPrintMode {
private static final Pattern NOT_WORD_STR = compile("[^\\p{L}]+", DOTALL);
private static final Pattern PATTERN = compile("(?U)[^\\p{Space}\\p{Alnum}\\p{Punct}]");
private static final Pattern DIACRIT_MARKS = compile("[\\p{InCombiningDiacriticalMarks}]");
// Sets of words used for titlecasing in a few locales; useful for named entities in foreign languages, esp. English
// Each per-language set below is wrapped in Collections.unmodifiableSet and holds lowercase entries.
// English words.
private static final Set<String> ENGLISH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("of", "in", "on", "the", "a", "an", "and", "or"))
);
// Portuguese words.
private static final Set<String> PORTUGUESE_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("e", "ou", "que",
"de", "do", "dos", "da", "das",
"o", "a", "os", "as",
"no", "nos", "na", "nas",
"ao", "aos", "à", "às"))
);
// French words.
private static final Set<String> FRENCH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("et", "ou", "que", "qui",
"de", "du", "des", "en",
"le", "les", "la",
"un", "une",
"à", "au", "aux"))
);
// Spanish words.
// NOTE(review): "nel" does not look like a standard Spanish word — confirm it is intended.
private static final Set<String> SPANISH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("y", "e", "o", "u", "que",
"el", "la", "los", "las",
"un", "unos", "una", "unas",
"del", "nel", "de", "en", "a", "al"))
);
// German words.
private static final Set<String> GERMAN_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("von", "in", "im", "an", "am", "vom", "und", "oder", "dass", "ob",
"der", "die", "das", "dem", "den", "des",
"ein", "eines", "einem", "einen", "einer", "eine",
"kein", "keines", "keinem", "keinen", "keiner", "keine"))
);
// Dutch words.
private static final Set<String> DUTCH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList("van", "in", "de", "het", "een", "en", "of"))
);


// Union of all per-language sets above, built once at class initialisation by
// collectAllTitleCaseExceptions(). It must stay declared after the per-language sets
// because static initialisers run in textual order.
private static final Set<String> ALL_TITLECASE_EXCEPTIONS = collectAllTitleCaseExceptions();

private StringTools() {
// only static stuff
Expand Down Expand Up @@ -217,6 +256,19 @@ public static boolean startsWithLowercase(String str) {
return Character.isLowerCase(str.charAt(0));
}

/**
 * Checks whether every space-separated part of {@code str} starts with a lowercase
 * character, as decided by {@link #startsWithLowercase(String)}. When splitting on a
 * single space yields fewer than two parts, the whole string is checked instead.
 *
 * @param str the string to inspect
 * @return {@code true} if all parts (or the string as a whole) start with a lowercase character
 */
public static boolean allStartWithLowercase(String str) {
  String[] words = str.split(" ");
  if (words.length >= 2) {
    for (int i = 0; i < words.length; i++) {
      if (!startsWithLowercase(words[i])) {
        return false;
      }
    }
    return true;
  }
  // Zero or one part: fall back to looking at the unsplit string.
  return startsWithLowercase(str);
}

/**
* Return <code>str</code> modified so that its first character is now an
* uppercase character. If <code>str</code> starts with non-alphabetic
Expand Down Expand Up @@ -246,6 +298,44 @@ public static String uppercaseFirstChar(@Nullable String str, Language language)
}
}

/**
 * Builds the union of all per-language titlecase exception sets.
 * The result is wrapped as an unmodifiable set, like the per-language sets it is built
 * from, because it is stored in a shared static constant.
 *
 * @return an unmodifiable set containing every word of every per-language exception set
 */
private static Set<String> collectAllTitleCaseExceptions() {
  List<Set<String>> setList = Arrays.asList(ENGLISH_TITLECASE_EXCEPTIONS, PORTUGUESE_TITLECASE_EXCEPTIONS,
    FRENCH_TITLECASE_EXCEPTIONS, SPANISH_TITLECASE_EXCEPTIONS, GERMAN_TITLECASE_EXCEPTIONS, DUTCH_TITLECASE_EXCEPTIONS);
  Set<String> union = new HashSet<>();
  for (Set<String> exceptions : setList) {
    union.addAll(exceptions);
  }
  return Collections.unmodifiableSet(union);
}

/**
 * Title case a string ignoring a list of words. These words are ignored due to titlecasing conventions in the most
 * frequent languages. Differs from {@link #convertToTitleCaseIteratingChars(String)} in that it is less aggressive,
 * i.e., we do not force titlecase in all caps words (e.g. IDEA does not become Idea).
 * This method behaves the same regardless of the language, and is rather aggressive in its ignoring of words.
 * We can, possibly, in the future, have language-specific titlecasing conventions.
 * <p>
 * The string is split on single spaces. The first part always gets its first character uppercased. Every later
 * part that is one of the exception words (compared case-insensitively and independently of the default locale)
 * gets its first character lowercased if it was capitalized; all other parts get their first character uppercased.
 *
 * @param str the string to titlecase, may be {@code null}
 * @return the titlecased string, or {@code null} if {@code str} is {@code null}
 */
@Contract("!null -> !null")
@Nullable
public static String titlecaseGlobal(@Nullable final String str) {
  if (str == null) {
    // The parameter is declared @Nullable, so answer null instead of failing.
    return null;
  }
  String[] strParts = str.split(" ");
  if (strParts.length == 1) {
    return uppercaseFirstChar(str);
  }
  StringJoiner titlecasedStr = new StringJoiner(" ");
  for (int i = 0; i < strParts.length; i++) {
    String strPart = strParts[i];
    // Locale.ROOT keeps the lookup stable under locales with special casing rules (e.g. Turkish dotless i).
    boolean isException = i > 0 && ALL_TITLECASE_EXCEPTIONS.contains(strPart.toLowerCase(Locale.ROOT));
    if (isException) {
      titlecasedStr.add(lowercaseFirstCharIfCapitalized(strPart));
    } else {
      titlecasedStr.add(uppercaseFirstChar(strPart));
    }
  }
  return titlecasedStr.toString();
}

/**
* Return <code>str</code> modified so that its first character is now a
* lowercase character. If <code>str</code> starts with non-alphabetic
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import org.languagetool.tagging.xx.DemoTagger;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
Expand Down Expand Up @@ -37,7 +39,7 @@ public void setUp() throws Exception {

@Test
public void testDisambiguate1() throws IOException {
MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true, true);
MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true, true, true);

AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence("ah for shame");
AnalyzedSentence disambiguated = multiWordChunker.disambiguate(analyzedSentence);
Expand Down Expand Up @@ -99,4 +101,21 @@ public void testDisambiguate2RemoveOtherReadings() throws IOException {
assertFalse(tokens[5].getReadings().toString().contains("FakePosTag"));
}

/**
 * Checks which letter-case variants the chunker generates for an all-lowercase entry
 * and for an entry that already contains capitalized words.
 */
@Test
public void testLettercaseVariants() throws IOException {
  // Chunker with first-letter capitalisation, all-uppercase and titlecase variants enabled.
  MultiWordChunker chunker = new MultiWordChunker("/yy/multiwords.txt", true, true, true);
  String rhythmAndBlues = "rhythm and blues";
  String venusDeMilo = "Vênus de Milo";
  Map<String, AnalyzedToken> knownTokens = new HashMap<>();
  knownTokens.put(rhythmAndBlues, new AnalyzedToken(rhythmAndBlues, "NCMS000_", rhythmAndBlues));
  knownTokens.put(venusDeMilo, new AnalyzedToken(venusDeMilo, "NCFSS00_", venusDeMilo));

  // All-lowercase entry: every kind of variant is expected.
  List<String> lowercaseEntryVariants = chunker.getTokenLettercaseVariants(rhythmAndBlues, knownTokens);
  String[] expectedVariants = {
    "Rhythm and blues",   // simple upcase of first word
    "Rhythm And Blues",   // naïve titlecase
    "Rhythm and Blues",   // smarter titlecase
    "RHYTHM AND BLUES"    // all caps
  };
  for (String expected : expectedVariants) {
    assertTrue(lowercaseEntryVariants.contains(expected));
  }

  // Entry with capitalized words: no titlecasing and no downcasing, only all caps.
  List<String> capitalizedEntryVariants = chunker.getTokenLettercaseVariants(venusDeMilo, knownTokens);
  String[] unexpectedVariants = {
    "Vênus De Milo",      // naïve titlecase
    "vênus de milo"       // downcased
  };
  for (String unexpected : unexpectedVariants) {
    assertFalse(capitalizedEntryVariants.contains(unexpected));
  }
  assertTrue(capitalizedEntryVariants.contains("VÊNUS DE MILO")); // all caps
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -262,4 +262,20 @@ public void testIsCamelCase() {
assertTrue(StringTools.isCamelCase("iSomeTHING"));
}

/**
 * Checks {@code StringTools.titlecaseGlobal} on phrases in several languages, including
 * inputs whose exception words are capitalized and must be lowercased.
 */
@Test
public void testTitlecaseGlobal() {
  // Each row is { expected result, input }.
  String[][] cases = {
    {"The Lord of the Rings", "the lord of the rings"},
    {"Rhythm and Blues", "rhythm And blues"},
    {"Memória de Leitura", "memória de leitura"},
    {"Fond du Lac", "fond du lac"},
    {"El Niño de las Islas", "el niño de Las islas"},
  };
  for (String[] testCase : cases) {
    String expected = testCase[0];
    String input = testCase[1];
    assertEquals(expected, StringTools.titlecaseGlobal(input));
  }
}

/**
 * Checks {@code StringTools.allStartWithLowercase} on multi-word and single-word inputs.
 */
@Test
public void testAllStartWithLowercase() {
  // Multi-word input: every word must start with a lowercase character.
  boolean allLowerPhrase = StringTools.allStartWithLowercase("the lord of the rings");
  assertTrue(allLowerPhrase);
  boolean mixedPhrase = StringTools.allStartWithLowercase("the Fellowship of the Ring");
  assertFalse(mixedPhrase);

  // Single-word input: only the first character matters.
  boolean lowerWord = StringTools.allStartWithLowercase("bilbo");
  assertTrue(lowerWord);
  boolean capitalizedWord = StringTools.allStartWithLowercase("Baggins");
  assertFalse(capitalizedWord);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
*/
public class CatalanHybridDisambiguator extends AbstractDisambiguator {

private final MultiWordChunker chunker = new MultiWordChunker("/ca/multiwords.txt", true, true);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000");
private final MultiWordChunker chunker = new MultiWordChunker("/ca/multiwords.txt", true, true, false);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false,"NPCN000");
private final Disambiguator disambiguator;

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ public class GermanRuleDisambiguator extends AbstractDisambiguator {
private final Disambiguator disambiguator;

private final MultiWordChunker multitokenSpeller = new MultiWordChunker(
"/de/multitoken-ignore.txt", false, false, MultiWordChunker.tagForNotAddingTags);
"/de/multitoken-ignore.txt", false, false, false, MultiWordChunker.tagForNotAddingTags);

private final MultiWordChunker multitokenSpeller2 = new MultiWordChunker(
"/de/multitoken-suggest.txt", false, false, MultiWordChunker.tagForNotAddingTags);
"/de/multitoken-suggest.txt", false, false, false, MultiWordChunker.tagForNotAddingTags);

private final MultiWordChunker multitokenSpeller3 = new MultiWordChunker(
"/spelling_global.txt", false, false, MultiWordChunker.tagForNotAddingTags);
"/spelling_global.txt", false, false, false, MultiWordChunker.tagForNotAddingTags);

public GermanRuleDisambiguator(Language lang) {
disambiguator = new XmlRuleDisambiguator(lang, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
*/
public class EnglishHybridDisambiguator extends AbstractDisambiguator {

private final MultiWordChunker chunker = new MultiWordChunker("/en/multiwords.txt", true, true);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", true, true, MultiWordChunker.tagForNotAddingTags);
private final MultiWordChunker chunker = new MultiWordChunker("/en/multiwords.txt", true, true, false);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", true, true, false, MultiWordChunker.tagForNotAddingTags);
private final Disambiguator disambiguator;

public EnglishHybridDisambiguator(Language lang) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
*/
public class SpanishHybridDisambiguator extends AbstractDisambiguator {

private final MultiWordChunker chunker = new MultiWordChunker("/es/multiwords.txt", true, true);
private final Disambiguator chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000");
private final MultiWordChunker chunker = new MultiWordChunker("/es/multiwords.txt", true, true, false);
private final Disambiguator chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false, "NPCN000");
private final Disambiguator disambiguator;

public SpanishHybridDisambiguator(Language lang) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

public class FrenchHybridDisambiguator extends AbstractDisambiguator {

private final MultiWordChunker chunker = new MultiWordChunker("/fr/multiwords.txt", true, true);
private final MultiWordChunker chunker = new MultiWordChunker("/fr/multiwords.txt", true, true, false);
private final Disambiguator disambiguator = new XmlRuleDisambiguator(new French(), true);

public FrenchHybridDisambiguator() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
*/
public class PortugueseHybridDisambiguator extends AbstractDisambiguator {

private final MultiWordChunker chunker = new MultiWordChunker("/pt/multiwords.txt", true, true);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000");
private final MultiWordChunker chunker = new MultiWordChunker("/pt/multiwords.txt", true, true, true);
private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, true,"NPCN000");
private final Disambiguator disambiguator;

public PortugueseHybridDisambiguator(Language lang) {
Expand Down
Loading