diff --git a/languagetool-language-modules/nl/src/main/java/org/languagetool/rules/nl/DutchInflector.java b/languagetool-language-modules/nl/src/main/java/org/languagetool/rules/nl/DutchInflector.java index 1effc1fd3377..7e5b4aeb6a46 100644 --- a/languagetool-language-modules/nl/src/main/java/org/languagetool/rules/nl/DutchInflector.java +++ b/languagetool-language-modules/nl/src/main/java/org/languagetool/rules/nl/DutchInflector.java @@ -9,29 +9,189 @@ public class DutchInflector { public static final DutchInflector INSTANCE = new DutchInflector(); protected final CachingWordListLoader wordListLoader = new CachingWordListLoader(); + //NOUNS protected final Set nouns_de = new ObjectOpenHashSet<>(); protected final Set nouns_het = new ObjectOpenHashSet<>(); + protected final Set nouns_de_sf = new ObjectOpenHashSet<>(); + protected final Set nouns_het_sf = new ObjectOpenHashSet<>(); private static final String NOUNS_DE = "nl/inflector/nouns_de.txt"; private static final String NOUNS_HET = "nl/inflector/nouns_het.txt"; + private static final String NOUNS_DE_SF = "nl/inflector/nouns_de_sf.txt"; + private static final String NOUNS_HET_SF = "nl/inflector/nouns_het_sf.txt"; + //VERBS + protected final Set verbs_xde = new ObjectOpenHashSet<>(); + protected final Set verbs_xte = new ObjectOpenHashSet<>(); + private static final String VERBS_XDE = "nl/inflector/verbs_xde.txt"; + private static final String VERBS_XTE = "nl/inflector/verbs_xte.txt"; public DutchInflector() { nouns_de.addAll(wordListLoader.loadWords(NOUNS_DE)); nouns_het.addAll(wordListLoader.loadWords(NOUNS_HET)); + // for nouns where the consonant changes when plural + nouns_de_sf.addAll(wordListLoader.loadWords(NOUNS_DE_SF)); + nouns_het_sf.addAll(wordListLoader.loadWords(NOUNS_HET_SF)); + // for verbs + verbs_xde.addAll(wordListLoader.loadWords(VERBS_XDE)); + verbs_xte.addAll(wordListLoader.loadWords(VERBS_XTE)); } public List getPOSTag(String word) { List result = new ArrayList<>(2); result.add(null); result.add(null); - if (nouns_de.contains(word)) { - //add logic to check for DE nouns - result.set(0, "ZNW:EKV:DE_"); - result.set(1, word); - } else if (nouns_het.contains(word)) { - //add logic to check for HET nouns - result.set(0, "ZNW:EKV:HET"); - result.set(1, word); + + // Check all noun sets + String[] tagAndLemma = checkAllLemmas(word); + if (tagAndLemma != null) { + result.set(0, tagAndLemma[0]); + result.set(1, tagAndLemma[1]); } return result; } -} + + private String[] checkAllLemmas(String word) { + String[] tagAndLemma; + + // check nouns + tagAndLemma = checkLemmas(word, nouns_de, this::nounsDe); + if (tagAndLemma != null) return tagAndLemma; + + tagAndLemma = checkLemmas(word, nouns_het, this::nounsHet); + if (tagAndLemma != null) return tagAndLemma; + + tagAndLemma = checkLemmas(word, nouns_de_sf, this::nounsDe); + if (tagAndLemma != null) return tagAndLemma; + + tagAndLemma = checkLemmas(word, nouns_het_sf, this::nounsHet); + if (tagAndLemma != null) return tagAndLemma; + + // check verbs + // to do: add verb check even if noun was found + tagAndLemma = checkLemmas(word, verbs_xde, this::verbsXde); + if (tagAndLemma != null) return tagAndLemma; + + tagAndLemma = checkLemmas(word, verbs_xte, this::verbsXte); + if (tagAndLemma != null) return tagAndLemma; + + return null; + } + + private String[] checkLemmas(String word, Set lemmas, inflectionLogic logic) { + for (String lemma : lemmas) { + String foundTag = logic.apply(word, lemma); + if (foundTag != null) { + //System.out.println(word + " gets tag " + foundTag); + return new String[]{foundTag, lemma}; + } + } + return null; + } + + private String nounsDe(String word, String lemma) { + if (word.equals(lemma)) return "ZNW:EKV:DE_"; + String commonTag = nounsCommonInflection(word, lemma); + if (commonTag != null) return commonTag; + return null; + } + + private String nounsHet(String word, String lemma) { + if (word.equals(lemma)) return "ZNW:EKV:HET"; + String commonTag = nounsCommonInflection(word, lemma); + if (commonTag != null) return commonTag; + return null; + } + + private String nounsCommonInflection(String word, String lemma) { + if ( word.equals(lemma + "je")){ + return "ZNW:EKV:VRK:HET"; + } else if ( word.equals(lemma + "jes")){ + return "ZNW:MRV:VRK:DE_"; + } else if (lemma.endsWith("f") && (word.equals(lemma.substring(0, lemma.length() - 1) + "ven"))){ + return "ZNW:MRV:DE_"; + } else if (lemma.endsWith("s") && (word.equals(lemma.substring(0, lemma.length() - 1) + "zen"))){ + return "ZNW:MRV:DE_"; + } else if ( word.equals(lemma + "en")){ + return "ZNW:MRV:DE_"; + } + return null; + } + + private String verbsCommonInflection(String word, String lemma, String fpp) { + if (word.equals(lemma)) { + return "WKW:TGW:INF"; + } else if (word.equals(lemma + "de")) { + return "WKW:ODW:VRB"; + } else if (word.equals(lemma + "den")) { + return "WKW:ODW:MRV:DE_"; + } else if (word.equals(fpp + "t")) { + return "WKW:TGW:3EP"; + } else if (word.equals(fpp + "d")) { + return "WKW:ODW:ONV"; + } else if (word.equals(fpp)) { + return "WKW:TGW:1EP"; + } + return null; + } + + private String verbsXde(String word, String lemma) { + // spartelen > spartelde + String fpp = lemma.substring(0, lemma.length() - 2); + String commonTag = verbsCommonInflection(word, lemma, fpp); + if (commonTag != null) return commonTag; + if (word.equals(fpp + "de")){ + return "WKW:VLT:1EP"; + } else if (word.equals(fpp + "den")){ + return "WKW:VLT:INF"; + } else if (word.equals("ge" + fpp + "d")){ + return "WKW:VTD:ONV"; + } else if (word.equals("ge" + fpp + "de")){ + return "WKW:VTD:VRB"; + } else if (word.equals("ge" + fpp + "den")){ + return "WKW:VTD:ZNW:MRV:DE_"; + } else if (word.equals(fpp + "det")){ + return "WKW:VLT:GIJ"; + } + return null; + } + + private String verbsXte(String word, String lemma) { + // janken > jankte + String fpp = lemma.substring(0, lemma.length() - 2); + String commonTag = verbsCommonInflection(word, lemma, fpp); + if (commonTag != null) return commonTag; + if (word.equals(fpp + "te")){ + return "WKW:VLT:1EP"; + } else if (word.equals(fpp + "ten")){ + return "WKW:VLT:INF"; + } else if (word.equals("ge" + fpp + "t")){ + return "WKW:VTD:ONV"; + } else if (word.equals("ge" + fpp + "te")){ + return "WKW:VTD:VRB"; + } else if (word.equals("ge" + fpp + "ten")){ + return "WKW:VTD:ZNW:MRV:DE_"; + } else if (word.equals(fpp + "tet")){ + return "WKW:VLT:GIJ"; + } + return null; + } + + private String verbsDoubleCons(String word, String lemma) { + // stoffen > stofte + return null; + + } + + private interface inflectionLogic { + String apply(String word, String lemma); + } + + public static void main(String[] args) { + DutchInflector inflector = new DutchInflector(); + List result = inflector.getPOSTag("kaasje"); + if (result.get(0) != null) { + System.out.println("POS: " + result.get(0) + ", Lemma: " + result.get(1)); + } else { + System.out.println("Word not found in inflector."); + } + } +} \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchTagger.java b/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchTagger.java index 5b5f7a8ed673..6558d6a3cba0 100644 --- a/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchTagger.java +++ b/languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchTagger.java @@ -230,8 +230,8 @@ public List tag(List sentenceTokens) { word = originalWord; if (l.isEmpty()) { - List newValue = inflector.getPOSTag(originalWord); - l.add(new AnalyzedToken(originalWord, newValue.get(0), newValue.get(1))); + List inflectorOutput = inflector.getPOSTag(originalWord); + l.add(new AnalyzedToken(originalWord, inflectorOutput.get(0), inflectorOutput.get(1))); } AnalyzedTokenReadings atr = new AnalyzedTokenReadings(l, pos); diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_de_sf.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_de_sf.txt new file mode 100644 index 000000000000..2ac1f48071a9 --- /dev/null +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_de_sf.txt @@ -0,0 +1,2 @@ +zweetkuif +oogzalfspons \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het.txt index 73a71397c486..11024ce81da1 100644 --- a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het.txt +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het.txt @@ -1 +1 @@ -bouwvakkersgebed \ No newline at end of file +giraffenboek \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het_sf.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het_sf.txt new file mode 100644 index 000000000000..a460d997354d --- /dev/null +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/nouns_het_sf.txt @@ -0,0 +1,2 @@ +capibaramotief +kwakhuis \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xde.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xde.txt new file mode 100644 index 000000000000..10fbce5bebca --- /dev/null +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xde.txt @@ -0,0 +1 @@ +druikelen \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xte.txt b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xte.txt new file mode 100644 index 000000000000..fb809ae48619 --- /dev/null +++ b/languagetool-language-modules/nl/src/main/resources/org/languagetool/resource/nl/inflector/verbs_xte.txt @@ -0,0 +1 @@ +brieksen \ No newline at end of file diff --git a/languagetool-language-modules/nl/src/test/java/org/languagetool/tagging/nl/DutchTaggerTest.java b/languagetool-language-modules/nl/src/test/java/org/languagetool/tagging/nl/DutchTaggerTest.java index 916a16d3d79b..036fa80fcc41 100644 --- a/languagetool-language-modules/nl/src/test/java/org/languagetool/tagging/nl/DutchTaggerTest.java +++ b/languagetool-language-modules/nl/src/test/java/org/languagetool/tagging/nl/DutchTaggerTest.java @@ -63,8 +63,15 @@ public void testTagger() throws IOException { TestTools.myAssert("beoordelingsgeschiedenisje", "beoordelingsgeschiedenisje/[beoordelingsgeschiedenis]ZNW:EKV:VRK:HET", tokenizer, tagger); TestTools.myAssert("Beoordelingsgeschiedenisjes", "Beoordelingsgeschiedenisjes/[beoordelingsgeschiedenis]ZNW:MRV:VRK:DE_", tokenizer, tagger); + TestTools.myAssert("schriebelkaasje", "schriebelkaasje/[schriebelkaas]ZNW:EKV:VRK:HET", tokenizer, tagger); TestTools.myAssert("schriebelkaas", "schriebelkaas/[schriebelkaas]ZNW:EKV:DE_", tokenizer, tagger); - TestTools.myAssert("bouwvakkersgebed", "bouwvakkersgebed/[bouwvakkersgebed]ZNW:EKV:HET", tokenizer, tagger); + TestTools.myAssert("giraffenboekjes", "giraffenboekjes/[giraffenboek]ZNW:MRV:VRK:DE_", tokenizer, tagger); + TestTools.myAssert("gedruikelden", "gedruikelden/[druikelen]WKW:VTD:ZNW:MRV:DE_", tokenizer, tagger); + TestTools.myAssert("briekst", "briekst/[brieksen]WKW:TGW:3EP", tokenizer, tagger); + TestTools.myAssert("gedruikelden", "gedruikelden/[druikelen]WKW:VTD:ZNW:MRV:DE_", tokenizer, tagger); + TestTools.myAssert("capibaramotieven", "capibaramotieven/[capibaramotief]ZNW:MRV:DE_", tokenizer, tagger); + TestTools.myAssert("oogzalfsponsje", "oogzalfsponsje/[oogzalfspons]ZNW:EKV:VRK:HET", tokenizer, tagger); + TestTools.myAssert("zweetkuiven", "zweetkuiven/[zweetkuif]ZNW:MRV:DE_", tokenizer, tagger); // Test regions TestTools.myAssert("Zuidoost-Gouda", "Zuidoost-Gouda/[Gouda]ENM:LOC:PTS", tokenizer, tagger);