Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[de] compound improvements #10311

Merged
merged 35 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c7fe5f4
[de] take old refactor code
affemitkaraffe Feb 8, 2024
2a38cf2
[de] update infix files and verb stem lists
affemitkaraffe Feb 13, 2024
6f025c2
[de] make infix s checks stricter
affemitkaraffe Feb 21, 2024
8bc8a39
[de] require infix s for certain suffixes
affemitkaraffe Feb 21, 2024
60b06d0
[de] make SUB:NOM:SIN default for first part of a compound
affemitkaraffe Mar 7, 2024
3dafa34
[de] update infix s list
affemitkaraffe Mar 19, 2024
635f580
[de] improve infix s for '...tag' (Sonntagnachmittag, SonntagSfahrer)
affemitkaraffe Apr 2, 2024
f36961f
[de] part1 needs to be plural in some cases
affemitkaraffe Apr 2, 2024
7d10f32
[de] don't allow old spelling
affemitkaraffe Apr 9, 2024
2565454
[de] improve recognition of gender neutral compounds
affemitkaraffe Apr 10, 2024
132607f
[de] improve gender-neutral compounds + Link/Links
affemitkaraffe Apr 23, 2024
605bbac
[de] update POS tags, verb stems, ...
affemitkaraffe Apr 25, 2024
08a1279
[de] update lists
affemitkaraffe Apr 26, 2024
55b9dda
[de] improve singular/plural
affemitkaraffe May 7, 2024
2627181
[de] *part1*s that do not have a POS tag
affemitkaraffe May 17, 2024
2c70f09
[de] diff fixes
affemitkaraffe Jun 15, 2024
2cd7e2c
resolve conflicts
affemitkaraffe Jun 19, 2024
4816ba8
add morphologically correct but semantically incorrect compounds
affemitkaraffe Jun 20, 2024
bb85e7a
review changes
affemitkaraffe Jun 20, 2024
ff5fd8c
renaming
affemitkaraffe Jun 20, 2024
267bb23
[de] add comments to .txt lists + small improvements
affemitkaraffe Jun 26, 2024
1252359
merge
affemitkaraffe Jun 26, 2024
3a55ef0
adapt to naming conventions
affemitkaraffe Jun 26, 2024
628710b
remove checks for 'has no infix s' because default is 'has no infix s'
affemitkaraffe Jun 26, 2024
70a6f92
test cases to cover more code + remove unused code
affemitkaraffe Jun 26, 2024
ecf793c
Links can be derived from Link or links
affemitkaraffe Jun 26, 2024
e70d21d
remove unused files
affemitkaraffe Jun 26, 2024
c0983af
remove unused files
affemitkaraffe Jun 26, 2024
775f5bc
add missing words from unused files
affemitkaraffe Jun 27, 2024
70a0003
undo some review changes
affemitkaraffe Jun 27, 2024
35c1cd9
precompile regex
affemitkaraffe Jun 27, 2024
46918dd
Merge remote-tracking branch 'origin/master' into affe071
affemitkaraffe Jun 27, 2024
41e2f31
merge
affemitkaraffe Jun 27, 2024
593eee7
merge conflicts
affemitkaraffe Jul 10, 2024
52c98d0
prefer abscheide over abscheid
affemitkaraffe Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ public class GermanSpellerRule extends CompoundAwareHunspellRule {
// Matches any string ending in "gruße", "schaf", "schafs" or "schafen".
// NOTE(review): presumably compound endings that are usually typos - call site not visible here, confirm.
private static final Pattern COMPOUND_END_TYPOS = compile(".*(gruße|schaf(s|en)?)$");
// Matches any string ending in one of the listed noun suffixes (-heit, -ion, -ität, -keit, -ling, -ung, -schaft, -tum).
// NOTE(review): presumably the suffixes after which a first compound part requires an infix "s" - confirm against the call site.
private static final Pattern INFIX_S_SUFFIXES = compile(".*(heit|ion|ität|keit|ling|ung|schaft|tum)$");
// Lists lowercase stems, each optionally followed by "s".
// NOTE(review): the name suggests first parts that occur both with and without infix "s"
// (e.g. "Sonntagnachmittag" vs. "Sonntagsfahrer") - confirm against the call site.
private static final Pattern WECHSELINFIX = compile("(arbeit|dienstag|donnerstag|freitag|montag|mittwoch|link|recht|samstag|sonntag|verband)s?");
private static final Pattern NEEDS_TO_BE_PLURAL = compile("adresse|aktie|antenne|apache|arbeitnehmerin|autor|bakterie|bauer|bisexuelle|bürge|blume|börse|buche|däne|debatte|decke|diakon(in)?|drohne|druide|ehre|eibe|emittent(in)?|elfe|elle|enge|erde|erste|esche|fassade|farbe|felge|ferien|fluor|frage|frau|förde|galle|gerät|gilde|göttin|halt|heide|historie|hose|hund|jungfer|kante|kathode|katze|kette|kid|klasse|kirche|klaue|klinge|knappe|koeffizient|kojote|kontrahent|kontrolle|krake|kralle|kranke|krähe|kraut|kuriosität|kurve|kusine|küste|laterne|laute|legende|lehne|leise|leuchte|lippe|loge|lotse|länge|läuse|löwe|lücke|made|maske|maßnahme|menge|mensch|metapher|methode|metropole|miene|miete|million|miniatur|mitte|maus|mücke|mühle|nerv|niederlage|nixe|nonne|note|obdachlose|ode|organist|panne|parzelle|pate|patient|petze|pfanne|pfeife|platte|polle|pomade|pomeranze|posse|prise|prominente|prälat|puppe|pädophile|radikale|rakete|rampe|ranke|rate|rendite|repressalie|rest|riese|rinde|rind|robbe|robe|romanist|rose|ross|route|nummer|runde|röhre|rübe|salbe|schabe|schale|scheide|schelle|schenke|schere|sphäre|dicke|kröte|schlampe|schlange|schluchte|schmiere|schnake|schnalle|schneide|schnelle|schokolade|schotte|schwabe|schwalbe|schwule|seele|seide|hölle|höhle|seite|sonne|sorge|spanne|sparte|sperre|spitze|sproße|spule|steppe|straße|streife|studie|stunde|stütze|tabelle|tinte|tote|toilette|traube|treffe|treppe|truhe|träne|tunte|tüte|urne|vene|versicherte|verwandte|virtuose|vorname|waffe|wanne|ware|watte|wehe|welle|wiese");
private static final Pattern NEEDS_TO_BE_PLURAL = compile("adresse|aktie|antenne|apache|arbeitnehmerin|autor|bakterie|bauer|bisexuelle|bürge|blume|börse|buche|däne|debatte|decke|diakon(in)?|drohne|druide|ehre|eibe|emittent(in)?|elfe|elle|enge|erde|erste|esche|fassade|farbe|felge|ferien|fluor|frage|frau|förde|galle|gerät|gilde|göttin|halt|heide|historie|hose|hund|jungfer|kante|kathode|katze|kette|kid|klasse|kirche|klaue|klinge|knappe|koeffizient|kojote|kontrahent|krake|kralle|kranke|krähe|kraut|kuriosität|kurve|kusine|küste|laterne|laute|legende|lehne|leise|leuchte|lippe|loge|lotse|länge|läuse|löwe|lücke|made|maske|maßnahme|menge|mensch|metapher|methode|metropole|miene|miete|million|miniatur|mitte|maus|mücke|mühle|nerv|niederlage|nixe|nonne|note|obdachlose|ode|organist|panne|parzelle|pate|patient|petze|pfanne|pfeife|platte|polle|pomade|pomeranze|posse|prise|prominente|prälat|puppe|pädophile|radikale|rakete|rampe|ranke|rate|rendite|repressalie|rest|riese|rinde|rind|robbe|robe|romanist|rose|ross|route|nummer|runde|röhre|rübe|salbe|schabe|schale|scheide|schelle|schenke|schere|sphäre|dicke|kröte|schlampe|schlange|schluchte|schmiere|schnake|schnalle|schneide|schnelle|schokolade|schotte|schwabe|schwalbe|schwule|seele|seide|hölle|höhle|seite|sonne|sorge|spanne|sparte|sperre|spitze|sproße|spule|steppe|straße|streife|studie|stunde|stütze|tabelle|tinte|tote|toilette|traube|treffe|treppe|truhe|träne|tunte|tüte|urne|vene|versicherte|verwandte|virtuose|vorname|waffe|wanne|ware|watte|wehe|welle|wiese");
// First compound parts that are never accepted as-is: processTwoPartCompounds returns false
// when part1 (without hyphen, first character lowercased) fully matches this pattern.
private static final Pattern INVALID_COMP_PART = compile("kontrolle|perspektive|schule|sprache|stelle|suche");
// Lists four verb infinitives.
// NOTE(review): presumably nominalized infinitives that take a singular object as first part - usage not visible here, confirm.
private static final Pattern SUBINF_SINGULAR_OBJECT = compile("putzen|rauchen|sein|spielen");
// Matches "gebe"/"nehme" followed by either "r" (+ optional "s", "n", "in", "innen") or "nde" (+ optional "m", "n", "r"),
// e.g. "geber", "nehmerinnen", "gebenden".
// NOTE(review): presumably second parts that attach to "Arbeit" without infix "s" - confirm against the call site.
private static final Pattern ARBEIT_COMP = compile("(gebe|nehme)(r(s|n|innen|in)?|nde[mnr]?)");
// Lists lowercase second-part candidates.
// NOTE(review): presumably second parts that attach to "Link" rather than "Links" - confirm against the call site.
private static final Pattern LINK_COMP = compile("element|inhalt|liste|portal|text|titel|tracking|verzeichnis");
Expand All @@ -160,6 +161,7 @@ public class GermanSpellerRule extends CompoundAwareHunspellRule {
// Word sets handed to loadFile(...) in the constructor together with a resource path:
//   germanPrefixes <- /de/german_prefix.txt
//   verbStems      <- /de/verb_stems.txt
//   verbPrefixes   <- /de/verb_prefixes.txt
//   otherPrefixes  <- /de/other_prefixes.txt
//   oldSpelling    <- /de/alt_neu.csv
// NOTE(review): all sets except germanPrefixes are static, non-final HashSets, yet they are passed to
// loadFile(...) from the constructor, so every new rule instance touches the shared sets. HashSet is
// not thread-safe - confirm that rules are never constructed concurrently, or consider loading once.
private final Set<String> germanPrefixes = new HashSet<>();
private static Set<String> verbStems = new HashSet<>();
private static Set<String> verbPrefixes = new HashSet<>();
private static Set<String> otherPrefixes = new HashSet<>();
private static Set<String> oldSpelling = new HashSet<>();
private static final Map<StringMatcher, Function<String,List<String>>> ADDITIONAL_SUGGESTIONS = new HashMap<>();
static {
Expand Down Expand Up @@ -1749,6 +1751,7 @@ public GermanSpellerRule(ResourceBundle messages, German language, UserConfig us
loadFile("/de/german_prefix.txt", germanPrefixes);
loadFile("/de/verb_stems.txt", verbStems);
loadFile("/de/verb_prefixes.txt", verbPrefixes);
loadFile("/de/other_prefixes.txt", otherPrefixes);
loadFile("/de/alt_neu.csv", oldSpelling);
}

Expand Down Expand Up @@ -2465,6 +2468,10 @@ private boolean processTwoPartCompounds(String part1, String part2) throws IOExc
boolean part2upcasedIsNoun = isNoun(part2upcased);
boolean part2upcasedIsMispelled = isMisspelled(uppercaseFirstChar(part2upcased));

if (INVALID_COMP_PART.matcher(lowercaseFirstChar(part1WithoutHyphen)).matches()) {
return false;
}

String part1_without_infix_s = part1upcased;

// Sometimes part1 requires singular or plural
Expand Down Expand Up @@ -2524,8 +2531,8 @@ private boolean processTwoPartCompounds(String part1, String part2) throws IOExc
return true;
}
if (part2upcasedIsNoun && !part2upcasedIsMispelled &&
// *part1* is acronym, e. g. "SEO-Expertinnen"
isAllUppercase(removeTrailingSAndHyphen(part1)) && !isMisspelled(removeTrailingSAndHyphen(part1))) {
// *part1* is acronym or other prefix, e. g. "SEO-Expertinnen", "Sprachvariante"
((isAllUppercase(removeTrailingSAndHyphen(part1)) && !isMisspelled(removeTrailingSAndHyphen(part1))) || isOtherPrefix(part1))) {
return true;
}
return false;
Expand Down Expand Up @@ -2761,6 +2768,10 @@ private boolean isNounNomPlu(String word) throws IOException {
return getTagger().tag(singletonList(word)).stream().anyMatch(k -> k.hasPosTagStartingWith("SUB:NOM:PLU"));
}

/**
 * Looks up {@code word} in the {@code otherPrefixes} set after lowercasing its first character.
 *
 * @param word candidate first part of a compound
 * @return {@code true} if the normalized word is contained in {@code otherPrefixes}
 */
private boolean isOtherPrefix(String word) throws IOException {
  final String normalized = lowercaseFirstChar(word);
  return otherPrefixes.contains(normalized);
}

/**
 * Looks up {@code word} in the {@code verbPrefixes} set after lowercasing its first character.
 *
 * @param word candidate prefix
 * @return {@code true} if the normalized word is contained in {@code verbPrefixes}
 */
private boolean isVerbPrefix(String word) throws IOException {
  final String normalized = lowercaseFirstChar(word);
  return verbPrefixes.contains(normalized);
}
Expand Down
Loading