Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nl] prevent 2+ part compound words from being accepted #9971

Merged
merged 4 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public String[] getCountries() {
@NotNull
@Override
public Tagger createDefaultTagger() {
return new DutchTagger();
return DutchTagger.INSTANCE;
}

@Nullable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import com.google.common.collect.ImmutableSet;
import org.languagetool.*;
import org.languagetool.rules.RuleMatch;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.nl.DutchTagger;

import java.io.IOException;
import java.nio.file.Files;
Expand Down Expand Up @@ -558,14 +558,13 @@ public class CompoundAcceptor {
}
}

private final Tagger tagger;
private DutchTagger dutchTagger = DutchTagger.INSTANCE;

CompoundAcceptor() {
tagger = Languages.getLanguageForShortCode("nl").getTagger();
}

public CompoundAcceptor(Tagger tagger) {
this.tagger = tagger;
public CompoundAcceptor(DutchTagger dutchTagger) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constructor (lines 566-568) seems unnecessary. It is never used, and the tagger is initialized in line 561.

this.dutchTagger = dutchTagger;
}

boolean acceptCompound(String word) {
Expand All @@ -576,7 +575,7 @@ boolean acceptCompound(String word) {
String part1 = word.substring(0, i);
String part2 = word.substring(i);
if (acceptCompound(part1, part2)) {
//System.out.println(part1+part2 + " -> accepted");
System.out.println(part1+part2 + " -> accepted");
return true;
}
}
Expand Down Expand Up @@ -604,7 +603,7 @@ boolean acceptCompound(String part1, String part2) {
if (part1.endsWith("s") && !part1Exceptions.contains(part1.substring(0, part1.length() -1)) && !alwaysNeedsS.contains(part1) && !noS.contains(part1) && !part1.contains("-")) {
for (String suffix : alwaysNeedsS) {
if (part1lc.endsWith(suffix)) {
return isNoun(part2) && isExistingWord(part1.substring(0, part1.length() - 1)) && spellingOk(part2);
return isNoun(part2) && isExistingWord(part1lc.substring(0, part1lc.length() - 1)) && spellingOk(part2);
}
}
return needsS.contains(part1lc) && isNoun(part2) && spellingOk(part1.substring(0, part1.length() - 1)) && spellingOk(part2);
Expand All @@ -614,21 +613,22 @@ boolean acceptCompound(String part1, String part2) {
part2 = part2.substring(1);
return noS.contains(part1lc) && isNoun(part2) && spellingOk(part1) && spellingOk(part2) && hasCollidingVowels(part1, part2);
} else {
return (noS.contains(part1lc) || part1Exceptions.contains(part1lc)) && isNoun(part2) && spellingOk(part1) && spellingOk(part2) && !hasCollidingVowels(part1, part2);
return (noS.contains(part1lc) || part1Exceptions.contains(part1lc)) && isNoun(part2) && spellingOk(part1) && !hasCollidingVowels(part1, part2);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}

boolean isNoun(String word) throws IOException {
List<AnalyzedTokenReadings> part2Readings = tagger.tag(Collections.singletonList(word));
return part2Readings.stream().anyMatch(k -> k.hasPosTagStartingWith("ZNW")) && !part2Exceptions.contains(word) ;
/**
 * Reports whether {@code word} is accepted as a noun.
 * <p>
 * A word is accepted when it is not listed in {@code part2Exceptions} and at least one
 * reading returned by {@code dutchTagger.getPostags(word)} carries a POS tag that starts
 * with {@code "ZNW"}. Readings without a POS tag are skipped explicitly instead of being
 * guarded by an {@code assert}, so the result is the same whether or not the JVM runs
 * with assertions enabled.
 *
 * @param word the candidate word
 * @return {@code true} if the word is accepted as a noun, {@code false} otherwise
 * @throws IOException declared for existing callers; nothing in this body visibly throws it
 */
private boolean isNoun(String word) throws IOException {
  // The exception list does not depend on the individual reading, so test it once up front.
  if (part2Exceptions.contains(word)) {
    return false;
  }
  return dutchTagger.getPostags(word).stream().anyMatch(reading -> {
    String posTag = reading.getPOSTag();
    // A reading may lack a POS tag (isExistingWord checks for exactly that); treat it as "not a noun".
    return posTag != null && posTag.startsWith("ZNW");
  });
}

private boolean isExistingWord(String word) throws IOException {
List<AnalyzedTokenReadings> part2Readings = tagger.tag(Collections.singletonList(word));
return part2Readings.stream().noneMatch(AnalyzedTokenReadings::isPosTagUnknown);
return dutchTagger.getPostags(word).stream().anyMatch(k -> k.getPOSTag() != null);
}

private boolean hasCollidingVowels(String part1, String part2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
*/
public class DutchTagger extends BaseTagger {

public static final DutchTagger INSTANCE = new DutchTagger();
private static final Pattern PATTERN1_A = compile("([^aeiouáéíóú])(á)([^aeiouáéíóú])");
private static final Pattern PATTERN1_E = compile("([^aeiouáéíóú])(é)([^aeiouáéíóú])");
private static final Pattern PATTERN1_I = compile("([^aeiouáéíóú])(í)([^aeiouáéíóú])");
Expand Down Expand Up @@ -69,22 +70,22 @@ public DutchTagger() {
super("/nl/dutch.dict", new Locale("nl"));
}
private static final Set<String> alwaysNeedsHet = ImmutableSet.of(
"patroon",
"punt",
"gemaal",
"weer",
"kussen",
"deel"
"patroon",
"punt",
"gemaal",
"weer",
"kussen",
"deel"
);
private static final Set<String> alwaysNeedsDe = ImmutableSet.of(
"keten",
"boor",
"dans"
"keten",
"boor",
"dans"
);
private static final Set<String> alwaysNeedsMrv = ImmutableSet.of(
"pies",
"koeken",
"heden"
"pies",
"koeken",
"heden"
);
// custom code to deal with words carrying optional accents
@Override
Expand Down Expand Up @@ -250,6 +251,11 @@ public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) {
return tokenReadings;
}

/**
 * Returns the readings for a single word, looked up directly through the word tagger.
 * <p>
 * The lookup does not go through {@link #tag(List)}. Per the original note on this method,
 * that is deliberate: it keeps the tagger from passing the word back to CompoundAcceptor,
 * which would otherwise end in a tagging loop.
 * NOTE(review): the path from tag() into CompoundAcceptor is not visible in this excerpt - confirm.
 *
 * @param word the single word to look up
 * @return the result of {@code asAnalyzedTokenListForTaggedWords} for the word tagger's output
 */
public List<AnalyzedToken> getPostags(String word) {
return asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
}

private void addTokens(List<AnalyzedToken> taggedTokens, List<AnalyzedToken> l) {
if (taggedTokens != null) {
l.addAll(taggedTokens);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,13 @@
import org.junit.Ignore;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class CompoundAcceptorTest {

@Test
public void testAcceptCompound() throws IOException {
public void testAcceptCompound() {
CompoundAcceptor acceptor = new CompoundAcceptor();

assertTrue(acceptor.acceptCompound("bedrijfsregels"));
Expand All @@ -51,7 +49,9 @@ public void testAcceptCompound() throws IOException {
assertTrue(acceptor.acceptCompound("kunstomlijning"));
assertTrue(acceptor.acceptCompound("webomlijning"));
assertFalse(acceptor.acceptCompound("lingsboek"));
assertTrue(acceptor.acceptCompound("gezondheidsinfluencers"));

assertTrue(acceptor.acceptCompound("webschoolboek"));
assertFalse(acceptor.acceptCompound("gezondheidsomlijningssvervangingsinfluencers"));

assertFalse(acceptor.acceptCompound("Papiersversnipperaar"));

Expand Down Expand Up @@ -104,12 +104,11 @@ public void testAcceptCompound() throws IOException {

assertTrue(acceptor.acceptCompound("auto-uitlaat"));
assertFalse(acceptor.acceptCompound("autouitlaat"));

}

@Ignore("Use for interactive debugging")
@Test
public void testAcceptCompoundInternal() throws IOException {
public void testAcceptCompoundInternal() {
CompoundAcceptor acceptor = new CompoundAcceptor();
assertTrue(acceptor.acceptCompound("passagiers", "schip"));
assertTrue(acceptor.acceptCompound("papier", "versnipperaar"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ public void testTagger() throws IOException {
TestTools.myAssert("beoordelingsgeschiedenisje", "beoordelingsgeschiedenisje/[beoordelingsgeschiedenis]ZNW:EKV:VRK:HET", tokenizer, tagger);
TestTools.myAssert("Beoordelingsgeschiedenisjes", "Beoordelingsgeschiedenisjes/[beoordelingsgeschiedenis]ZNW:MRV:VRK:DE_", tokenizer, tagger);

// Test compound words with 2 parts
TestTools.myAssert("beroertegeschiedenisje", "beroertegeschiedenisje/[beroertegeschiedenis]ZNW:EKV:VRK:HET", tokenizer, tagger);
// Test compound words with 3 parts
TestTools.myAssert("gastkritiekgeschiedenis", "gastkritiekgeschiedenis/[null]null", tokenizer, tagger);
// Test compound words with 3+ parts
TestTools.myAssert("haarhalfbergnacht", "haarhalfbergnacht/[null]null", tokenizer, tagger);

// This is not modified, as it's already found in dictionary. If it was, getCompoundPOS would give it postag ZNW:EKV, from "mout".
TestTools.myAssert("havermout", "havermout/[havermout]ZNW:EKV:DE_", tokenizer, tagger);

Expand Down