-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[sv] Adding ngram support for Swedish. Adding some initial infra and …
…some rules with tests.
- Loading branch information
Showing
7 changed files
with
481 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
68 changes: 68 additions & 0 deletions
68
...e-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de) | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.sv; | ||
|
||
import org.languagetool.Language; | ||
import org.languagetool.languagemodel.LanguageModel; | ||
import org.languagetool.rules.ngrams.ConfusionProbabilityRule; | ||
import org.languagetool.rules.Example; | ||
import org.languagetool.rules.patterns.PatternToken; | ||
|
||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.ResourceBundle; | ||
|
||
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.posRegex; | ||
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token; | ||
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex; | ||
|
||
/** | ||
* @since 2.7 | ||
*/ | ||
public class SwedishConfusionProbabilityRule extends ConfusionProbabilityRule { | ||
|
||
private static final List<String> EXCEPTIONS = Arrays.asList( | ||
// Use all-lowercase, matches will be case-insensitive. | ||
"god sak" | ||
); | ||
|
||
private static final List<List<PatternToken>> ANTI_PATTERNS = Arrays.asList( | ||
Arrays.asList( | ||
// "De små öronen" "Dessa små öron" | ||
tokenRegex("de|dessa|dom"), | ||
token("små"), | ||
posRegex("NN:PLU") | ||
) | ||
); | ||
|
||
public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) { | ||
this(messages, languageModel, language, 3); | ||
} | ||
|
||
public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language, int grams) { | ||
super(messages, languageModel, language, grams, EXCEPTIONS, ANTI_PATTERNS); | ||
addExamplePair(Example.wrong("Ett <marker>streck</marker> mot horisonten."), | ||
Example.fixed("Ett <marker>sträck</marker> mot horisonten.")); | ||
} | ||
|
||
protected boolean isCommonWord(String token) { | ||
return token.matches("[\\wåäöüßÅÄÖÜ]+"); | ||
} | ||
|
||
} |
103 changes: 103 additions & 0 deletions
103
...etool-language-modules/sv/src/main/java/org/languagetool/rules/sv/UpperCaseNgramRule.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2020 Daniel Naber (http://www.danielnaber.de) | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.sv; | ||
|
||
import org.languagetool.AnalyzedSentence; | ||
import org.languagetool.AnalyzedTokenReadings; | ||
import org.languagetool.Language; | ||
import org.languagetool.languagemodel.LanguageModel; | ||
import org.languagetool.UserConfig; | ||
import org.languagetool.rules.*; | ||
import org.languagetool.rules.ngrams.Probability; | ||
|
||
import java.util.*; | ||
|
||
import static org.languagetool.tools.StringTools.*; | ||
|
||
/** | ||
* Finds some(!) words written uppercase that should be spelled lowercase and vice versa. | ||
* @since 6.2+ | ||
*/ | ||
public class UpperCaseNgramRule extends Rule { | ||
|
||
private static final int THRESHOLD = 50; | ||
private static final Set<String> relevantWords = new HashSet<>(Arrays.asList( | ||
"maj", "Maj", | ||
"Måndag", "måndag" | ||
)); | ||
|
||
private final LanguageModel lm; | ||
|
||
public UpperCaseNgramRule(ResourceBundle messages, LanguageModel lm, Language langUser, UserConfig userConfig) { | ||
super(messages); | ||
super.setCategory(Categories.CASING.getCategory(messages)); | ||
this.lm = Objects.requireNonNull(lm); | ||
//setDefaultTempOff(); // fixme! | ||
setLocQualityIssueType(ITSIssueType.Misspelling); | ||
addExamplePair(Example.wrong("Antagningen sker 15 <marker>Maj</marker>."), | ||
Example.fixed("Antagningen sker 15 <marker>maj</marker>.")); | ||
} | ||
|
||
@Override | ||
public final String getId() { | ||
return "SV_UPPER_CASE_NGRAM"; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return "Granskar ord som ofta skrivs med liten eller stor begynnelsebokstav när det ska vara tvärsom"; | ||
} | ||
|
||
@Override | ||
public RuleMatch[] match(AnalyzedSentence sentence) { | ||
List<RuleMatch> matches = new ArrayList<>(); | ||
AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace(); | ||
for (int i = 1; i < tokens.length; i++) { | ||
AnalyzedTokenReadings token = tokens[i]; | ||
String tokenStr = token.getToken(); | ||
if (i + 1 < tokens.length && relevantWords.contains(tokenStr) && !isAllUppercase(tokenStr)) { | ||
String ucToken = uppercaseFirstChar(tokenStr); | ||
String lcToken = lowercaseFirstChar(tokenStr); | ||
List<String> ucList = Arrays.asList(tokens[i - 1].getToken(), ucToken, tokens[i + 1].getToken()); | ||
List<String> lcList = Arrays.asList(tokens[i - 1].getToken(), lcToken, tokens[i + 1].getToken()); | ||
Probability ucProb = lm.getPseudoProbability(ucList); | ||
Probability lcProb = lm.getPseudoProbability(lcList); | ||
if (startsWithUppercase(tokenStr)) { | ||
double ratio = lcProb.getProb() / ucProb.getProb(); | ||
if (ratio > THRESHOLD) { | ||
String msg = "Menar du verbet '" + lcToken + "'? Oftast är det initialförkotningar och egennamn som skrivs med stor bokstav."; | ||
RuleMatch match = new RuleMatch(this, sentence, token.getStartPos(), token.getEndPos(), msg); | ||
match.setSuggestedReplacement(lcToken); | ||
matches.add(match); | ||
} | ||
} else { | ||
double ratio = ucProb.getProb() / lcProb.getProb(); | ||
if (ratio > THRESHOLD) { | ||
String msg = "Menar du substantivet '" + ucToken + "'? Oftast är det initialförkortningar och egennamn som skrivs med stor bokstav."; | ||
RuleMatch match = new RuleMatch(this, sentence, token.getStartPos(), token.getEndPos(), msg); | ||
match.setSuggestedReplacement(ucToken); | ||
matches.add(match); | ||
} | ||
} | ||
} | ||
} | ||
return toRuleMatchArray(matches); | ||
} | ||
|
||
} |
17 changes: 17 additions & 0 deletions
17
...ol-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Swedish confusion sets | ||
# Line format: | ||
# <word1>|<description1>; <word2>|<description2>; <factor> # optional comment | ||
# <word1> and <word2> are words that can easily be confused | ||
# <description> will be used in the error message to explain the word (optional) | ||
# <factor> is how many times more probable the other word must be | ||
# for the text to be considered potentially incorrect. | ||
# Use a higher value for better precision but lower recall. | ||
# Precision (p) and recall (r) values in the comments come from ConfusionRuleEvaluator | ||
# The number after recall is the number of sentences used for evaluation. | ||
# Order is relevant for ambiguous cases like 'know' ('no' or 'now') where the match | ||
# is used whose pair comes first in this file. | ||
# Alphabetical order on each line is also important! | ||
# | ||
dem; dom; 100 | ||
streck; sträck; 25 | ||
|
Oops, something went wrong.