Skip to content

Commit

Permalink
[nl] add inflector basics
Browse files Browse the repository at this point in the history
  • Loading branch information
mark-baas committed Jul 10, 2024
1 parent 44b3d35 commit 8ae39ea
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,189 @@
/**
 * Rule-based fallback that guesses a POS tag and lemma for a Dutch word by
 * matching it against inflected forms of lemmas read from word lists.
 *
 * <p>Six word lists are loaded once in the constructor: "de" nouns, "het"
 * nouns, two further noun lists (one per article) whose file names carry an
 * {@code _sf} suffix, and two verb lists ({@code xde} and {@code xte}).
 * A lookup is a linear scan: every lemma in every list is inflected and
 * compared against the queried word until one form matches.
 */
public class DutchInflector {
  // Resource paths of the word lists.
  private static final String NOUNS_DE = "nl/inflector/nouns_de.txt";
  private static final String NOUNS_HET = "nl/inflector/nouns_het.txt";
  private static final String NOUNS_DE_SF = "nl/inflector/nouns_de_sf.txt";
  private static final String NOUNS_HET_SF = "nl/inflector/nouns_het_sf.txt";
  private static final String VERBS_XDE = "nl/inflector/verbs_xde.txt";
  private static final String VERBS_XTE = "nl/inflector/verbs_xte.txt";

  public static final DutchInflector INSTANCE = new DutchInflector();

  protected final CachingWordListLoader wordListLoader = new CachingWordListLoader();

  // NOUNS
  protected final Set<String> nouns_de = new ObjectOpenHashSet<>();
  protected final Set<String> nouns_het = new ObjectOpenHashSet<>();
  // nouns where the final consonant changes in the plural
  protected final Set<String> nouns_de_sf = new ObjectOpenHashSet<>();
  protected final Set<String> nouns_het_sf = new ObjectOpenHashSet<>();

  // VERBS
  protected final Set<String> verbs_xde = new ObjectOpenHashSet<>();
  protected final Set<String> verbs_xte = new ObjectOpenHashSet<>();

  /** Loads all noun and verb word lists into their in-memory sets. */
  public DutchInflector() {
    nouns_de.addAll(wordListLoader.loadWords(NOUNS_DE));
    nouns_het.addAll(wordListLoader.loadWords(NOUNS_HET));
    nouns_de_sf.addAll(wordListLoader.loadWords(NOUNS_DE_SF));
    nouns_het_sf.addAll(wordListLoader.loadWords(NOUNS_HET_SF));
    verbs_xde.addAll(wordListLoader.loadWords(VERBS_XDE));
    verbs_xte.addAll(wordListLoader.loadWords(VERBS_XTE));
  }

  /**
   * Looks up a POS tag and lemma for {@code word}.
   *
   * @param word the token to analyse; {@code null} is treated as "not found"
   * @return a mutable two-element list: index 0 holds the POS tag and index 1
   *         the lemma; both elements are {@code null} when nothing matched
   */
  public List<String> getPOSTag(String word) {
    List<String> result = new ArrayList<>(2);
    result.add(null);
    result.add(null);
    if (word == null) {
      return result;
    }
    // Single lookup path that always reaches the return below: an exact
    // match on a noun lemma is already covered by nounsDe / nounsHet.
    String[] tagAndLemma = checkAllLemmas(word);
    if (tagAndLemma != null) {
      result.set(0, tagAndLemma[0]);
      result.set(1, tagAndLemma[1]);
    }
    return result;
  }

  /**
   * Tries every word list in a fixed order (de nouns, het nouns, de "_sf"
   * nouns, het "_sf" nouns, xde verbs, xte verbs) and returns the first hit.
   *
   * @return {@code {tag, lemma}} or {@code null} when no list produced a match
   */
  private String[] checkAllLemmas(String word) {
    String[] tagAndLemma;

    // check nouns
    tagAndLemma = checkLemmas(word, nouns_de, this::nounsDe);
    if (tagAndLemma != null) {
      return tagAndLemma;
    }
    tagAndLemma = checkLemmas(word, nouns_het, this::nounsHet);
    if (tagAndLemma != null) {
      return tagAndLemma;
    }
    tagAndLemma = checkLemmas(word, nouns_de_sf, this::nounsDe);
    if (tagAndLemma != null) {
      return tagAndLemma;
    }
    tagAndLemma = checkLemmas(word, nouns_het_sf, this::nounsHet);
    if (tagAndLemma != null) {
      return tagAndLemma;
    }

    // check verbs
    // to do: add verb check even if noun was found
    tagAndLemma = checkLemmas(word, verbs_xde, this::verbsXde);
    if (tagAndLemma != null) {
      return tagAndLemma;
    }
    return checkLemmas(word, verbs_xte, this::verbsXte);
  }

  /**
   * Applies {@code logic} to {@code word} and each lemma in {@code lemmas}.
   *
   * @return {@code {tag, lemma}} for the first lemma whose rule yields a tag,
   *         or {@code null} if none does
   */
  private String[] checkLemmas(String word, Set<String> lemmas, InflectionLogic logic) {
    for (String lemma : lemmas) {
      String foundTag = logic.apply(word, lemma);
      if (foundTag != null) {
        return new String[]{foundTag, lemma};
      }
    }
    return null;
  }

  /** Tag for {@code word} as a form of the "de" noun {@code lemma}, or {@code null}. */
  private String nounsDe(String word, String lemma) {
    if (word.equals(lemma)) {
      return "ZNW:EKV:DE_";
    }
    return nounsCommonInflection(word, lemma);
  }

  /** Tag for {@code word} as a form of the "het" noun {@code lemma}, or {@code null}. */
  private String nounsHet(String word, String lemma) {
    if (word.equals(lemma)) {
      return "ZNW:EKV:HET";
    }
    return nounsCommonInflection(word, lemma);
  }

  /**
   * Noun forms whose tag does not depend on the article of the lemma:
   * {@code lemma + "je"}, {@code lemma + "jes"}, the plural with
   * {@code f -> ven} or {@code s -> zen} replacing the last letter, and the
   * plain {@code lemma + "en"} plural.
   *
   * @return the matching tag, or {@code null} if {@code word} is none of these forms
   */
  private String nounsCommonInflection(String word, String lemma) {
    if (word.equals(lemma + "je")) {
      return "ZNW:EKV:VRK:HET";
    } else if (word.equals(lemma + "jes")) {
      return "ZNW:MRV:VRK:DE_";
    } else if (lemma.endsWith("f") && word.equals(lemma.substring(0, lemma.length() - 1) + "ven")) {
      return "ZNW:MRV:DE_";
    } else if (lemma.endsWith("s") && word.equals(lemma.substring(0, lemma.length() - 1) + "zen")) {
      return "ZNW:MRV:DE_";
    } else if (word.equals(lemma + "en")) {
      return "ZNW:MRV:DE_";
    }
    return null;
  }

  /**
   * Verb forms shared by the xde and xte verb classes.
   *
   * @param word  the token being analysed
   * @param lemma the verb lemma as stored in the word list
   * @param fpp   the lemma with its last two characters removed
   * @return the matching tag, or {@code null}
   */
  private String verbsCommonInflection(String word, String lemma, String fpp) {
    if (word.equals(lemma)) {
      return "WKW:TGW:INF";
    } else if (word.equals(lemma + "de")) {
      return "WKW:ODW:VRB";
    } else if (word.equals(lemma + "den")) {
      return "WKW:ODW:MRV:DE_";
    } else if (word.equals(fpp + "t")) {
      return "WKW:TGW:3EP";
    } else if (word.equals(fpp + "d")) {
      // NOTE(review): the other ODW forms above are built from the full lemma
      // (lemma + "de" / "den"); this one uses fpp. Possibly lemma + "d" was
      // intended - confirm before changing.
      return "WKW:ODW:ONV";
    } else if (word.equals(fpp)) {
      return "WKW:TGW:1EP";
    }
    return null;
  }

  /** Verbs whose past forms are built with "d", e.g. spartelen &gt; spartelde. */
  private String verbsXde(String word, String lemma) {
    return weakVerbInflection(word, lemma, "d");
  }

  /** Verbs whose past forms are built with "t", e.g. janken &gt; jankte. */
  private String verbsXte(String word, String lemma) {
    return weakVerbInflection(word, lemma, "t");
  }

  /**
   * Shared implementation of {@link #verbsXde} and {@link #verbsXte}; the two
   * rule sets differ only in the consonant ({@code dental}) used for the past
   * forms.
   *
   * @return the matching tag, or {@code null} (also for lemmas shorter than
   *         two characters, which have no stem to inflect)
   */
  private String weakVerbInflection(String word, String lemma, String dental) {
    // Guard against blank or one-letter word-list entries: substring below
    // would otherwise throw StringIndexOutOfBoundsException.
    if (lemma.length() < 2) {
      return null;
    }
    String fpp = lemma.substring(0, lemma.length() - 2);
    String commonTag = verbsCommonInflection(word, lemma, fpp);
    if (commonTag != null) {
      return commonTag;
    }
    String past = fpp + dental;       // e.g. "spartel" + "d"
    String participle = "ge" + past;  // e.g. "gesparteld"
    if (word.equals(past + "e")) {
      return "WKW:VLT:1EP";
    } else if (word.equals(past + "en")) {
      return "WKW:VLT:INF";
    } else if (word.equals(participle)) {
      return "WKW:VTD:ONV";
    } else if (word.equals(participle + "e")) {
      return "WKW:VTD:VRB";
    } else if (word.equals(participle + "en")) {
      return "WKW:VTD:ZNW:MRV:DE_";
    } else if (word.equals(past + "et")) {
      return "WKW:VLT:GIJ";
    }
    return null;
  }

  /** Placeholder for verbs such as stoffen &gt; stofte; not implemented yet, always {@code null}. */
  private String verbsDoubleCons(String word, String lemma) {
    // stoffen > stofte
    return null;
  }

  /** Strategy that maps a (word, lemma) pair to a POS tag, or {@code null} when the word is not a form of the lemma. */
  @FunctionalInterface
  private interface InflectionLogic {
    String apply(String word, String lemma);
  }

  /** Manual smoke test: prints the analysis of "kaasje". */
  public static void main(String[] args) {
    DutchInflector inflector = new DutchInflector();
    List<String> result = inflector.getPOSTag("kaasje");
    if (result.get(0) != null) {
      System.out.println("POS: " + result.get(0) + ", Lemma: " + result.get(1));
    } else {
      System.out.println("Word not found in inflector.");
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) {
word = originalWord;

if (l.isEmpty()) {
List<String> newValue = inflector.getPOSTag(originalWord);
l.add(new AnalyzedToken(originalWord, newValue.get(0), newValue.get(1)));
List<String> inflectorOutput = inflector.getPOSTag(originalWord);
l.add(new AnalyzedToken(originalWord, inflectorOutput.get(0), inflectorOutput.get(1)));
}

AnalyzedTokenReadings atr = new AnalyzedTokenReadings(l, pos);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
zweetkuif
oogzalfspons
Original file line number Diff line number Diff line change
@@ -1 +1 @@
bouwvakkersgebed
giraffenboek
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
capibaramotief
kwakhuis
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
druikelen
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
brieksen
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,15 @@ public void testTagger() throws IOException {
TestTools.myAssert("beoordelingsgeschiedenisje", "beoordelingsgeschiedenisje/[beoordelingsgeschiedenis]ZNW:EKV:VRK:HET", tokenizer, tagger);
TestTools.myAssert("Beoordelingsgeschiedenisjes", "Beoordelingsgeschiedenisjes/[beoordelingsgeschiedenis]ZNW:MRV:VRK:DE_", tokenizer, tagger);

TestTools.myAssert("schriebelkaasje", "schriebelkaasje/[schriebelkaas]ZNW:EKV:VRK:HET", tokenizer, tagger);
TestTools.myAssert("schriebelkaas", "schriebelkaas/[schriebelkaas]ZNW:EKV:DE_", tokenizer, tagger);
TestTools.myAssert("bouwvakkersgebed", "bouwvakkersgebed/[bouwvakkersgebed]ZNW:EKV:HET", tokenizer, tagger);
TestTools.myAssert("giraffenboekjes", "giraffenboekjes/[giraffenboek]ZNW:MRV:VRK:DE_", tokenizer, tagger);
TestTools.myAssert("gedruikelden", "gedruikelden/[druikelen]WKW:VTD:ZNW:MRV:DE_", tokenizer, tagger);
TestTools.myAssert("briekst", "briekst/[brieksen]WKW:TGW:3EP", tokenizer, tagger);
TestTools.myAssert("gedruikelden", "gedruikelden/[druikelen]WKW:VTD:ZNW:MRV:DE_", tokenizer, tagger);
TestTools.myAssert("capibaramotieven", "capibaramotieven/[capibaramotief]ZNW:MRV:DE_", tokenizer, tagger);
TestTools.myAssert("oogzalfsponsje", "oogzalfsponsje/[oogzalfspons]ZNW:EKV:VRK:HET", tokenizer, tagger);
TestTools.myAssert("zweetkuiven", "zweetkuiven/[zweetkuif]ZNW:MRV:DE_", tokenizer, tagger);

// Test regions
TestTools.myAssert("Zuidoost-Gouda", "Zuidoost-Gouda/[Gouda]ENM:LOC:PTS", tokenizer, tagger);
Expand Down

0 comments on commit 8ae39ea

Please sign in to comment.