Skip to content

Commit

Permalink
[sv] Adding ngram support for Swedish. Adding some initial infra and …
Browse files Browse the repository at this point in the history
…some rules with tests.
  • Loading branch information
ljo committed Jul 25, 2023
1 parent aaa88c4 commit cf1172f
Show file tree
Hide file tree
Showing 7 changed files with 481 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.languagetool.Language;
import org.languagetool.LanguageMaintainedState;
import org.languagetool.UserConfig;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.*;
import org.languagetool.rules.spelling.SpellingCheckRule;
import org.languagetool.rules.spelling.hunspell.HunspellRule;
Expand All @@ -35,16 +36,24 @@
import org.languagetool.tagging.sv.SwedishTagger;
import org.languagetool.tokenizers.SRXSentenceTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
//import org.languagetool.tokenizers.sv.SwedishWordTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import static java.util.Arrays.asList;

/**
* @deprecated this language is unmaintained in LT and might be removed in a future release if we cannot find contributors for it (deprecated since 3.6)
* Actively maintained since v6.2+
*
* Deprecated in 3.6, but actively maintained again since v6.2+
*
*/
@Deprecated
public class Swedish extends Language {
public class Swedish extends Language implements AutoCloseable {

private LanguageModel languageModel;

@Override
public String getName() {
Expand Down Expand Up @@ -72,6 +81,56 @@ public SentenceTokenizer createDefaultSentenceTokenizer() {
return new SRXSentenceTokenizer(this);
}

/*
@Override
public Tokenizer createDefaultWordTokenizer() {
return new SwedishWordTokenizer();
}
*/

@Override
public synchronized LanguageModel getLanguageModel(File indexDir) throws IOException {
  // Hand the currently cached model (possibly null) to initLanguageModel and cache
  // whatever it returns, so the same field is reused by later calls and by close().
  LanguageModel model = initLanguageModel(indexDir, languageModel);
  languageModel = model;
  return model;
}

@Override
public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel, UserConfig userConfig) throws IOException {
  // Rules that need an ngram language model. A SwedishNgramProbabilityRule is
  // not part of the returned list yet.
  Rule upperCaseRule = new UpperCaseNgramRule(messages, languageModel, this, userConfig);
  Rule confusionRule = new SwedishConfusionProbabilityRule(messages, languageModel, this);
  return asList(upperCaseRule, confusionRule);
}

/*
@Override
public List<Rule> getRelevantLanguageModelCapableRules(ResourceBundle messages, @Nullable LanguageModel lm, GlobalConfig globalConfig, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
if (lm != null && motherTongue != null) {
if ("en".equals(motherTongue.getShortCode())) {
return asList(new SwedishForEnglishNativesFalseFriendRule(messages, lm, motherTongue, this));
} else if ("de".equals(motherTongue.getShortCode())) {
return asList(new SwedishForGermansFalseFriendRule(messages, lm, motherTongue, this));
} else if ("da".equals(motherTongue.getShortCode())) {
return asList(new SwedishForDanesFalseFriendRule(messages, lm, motherTongue, this));
} else if ("no".equals(motherTongue.getShortCode())) {
return asList(new SwedishForNorwegiansFalseFriendRule(messages, lm, motherTongue, this));
}
}
return asList();
}
@Override
public boolean hasNGramFalseFriendRule(Language motherTongue) {
return motherTongue != null && (
"en".equals(motherTongue.getShortCode()) ||
"de".equals(motherTongue.getShortCode()) ||
"da".equals(motherTongue.getShortCode()) ||
"no".equals(motherTongue.getShortCode()));
}
*/

@Override
public Disambiguator createDefaultDisambiguator() {
return new SwedishHybridDisambiguator();
Expand Down Expand Up @@ -118,4 +177,41 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
  // Spell checking is delegated to a HunspellRule for this language; the last two
  // constructor arguments are deliberately passed as null.
  SpellingCheckRule spellingRule = new HunspellRule(messages, this, null, null);
  return spellingRule;
}

/** @since 6.2+ */
// @Override
// public String getOpeningDoubleQuote() {
// return "”";
// }

/** @since 6.2+ */
// @Override
// public String getClosingDoubleQuote() {
// return "”";
// }

/** @since 6.2+ */
// @Override
// public String getOpeningSingleQuote() {
// return "’";
// }

/** @since 6.2+ */
// @Override
// public String getClosingSingleQuote() {
// return "’";
// }

/**
 * Releases the cached language model. Does nothing when no model has been
 * assigned to this instance.
 * @since 6.2+
 */
@Override
public void close() throws Exception {
  if (languageModel == null) {
    return;
  }
  languageModel.close();
}

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.sv;

import org.languagetool.Language;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
import org.languagetool.rules.Example;
import org.languagetool.rules.patterns.PatternToken;

import java.util.Arrays;
import java.util.List;
import java.util.ResourceBundle;

import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.posRegex;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex;

/**
 * Confusion-pair rule for Swedish, backed by an ngram language model.
 * Supplies the Swedish exception phrases and anti-patterns to the generic
 * {@link ConfusionProbabilityRule}.
 * @since 6.2
 */
public class SwedishConfusionProbabilityRule extends ConfusionProbabilityRule {

  private static final List<String> EXCEPTIONS = Arrays.asList(
    // Use all-lowercase, matches will be case-insensitive.
    "god sak"
  );

  private static final List<List<PatternToken>> ANTI_PATTERNS = Arrays.asList(
    Arrays.asList(
      // "De små öronen" "Dessa små öron"
      tokenRegex("de|dessa|dom"),
      token("små"),
      posRegex("NN:PLU")
    )
  );

  // Compiled once; String.matches() would recompile the expression on every call.
  private static final java.util.regex.Pattern COMMON_WORD =
    java.util.regex.Pattern.compile("[\\wåäöüßÅÄÖÜ]+");

  /**
   * Creates the rule with a context size of 3 ngrams.
   */
  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
    this(messages, languageModel, language, 3);
  }

  /**
   * @param grams ngram size handed to the base rule
   */
  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language, int grams) {
    super(messages, languageModel, language, grams, EXCEPTIONS, ANTI_PATTERNS);
    addExamplePair(Example.wrong("Ett <marker>streck</marker> mot horisonten."),
                   Example.fixed("Ett <marker>sträck</marker> mot horisonten."));
  }

  /**
   * Returns true if the whole token consists of word characters, including the
   * listed Swedish/German letters that {@code \w} does not cover by default.
   */
  // NOTE(review): presumably overrides a hook of the base rule; confirm and add @Override.
  protected boolean isCommonWord(String token) {
    return COMMON_WORD.matcher(token).matches();
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2020 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.sv;

import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.UserConfig;
import org.languagetool.rules.*;
import org.languagetool.rules.ngrams.Probability;

import java.util.*;

import static org.languagetool.tools.StringTools.*;

/**
 * Finds some(!) words written uppercase that should be spelled lowercase and vice versa.
 * <p>For each word from a fixed list, the rule asks the language model for the
 * pseudo-probability of the trigram (previous token, word, next token) once with the
 * word's first character in upper case and once in lower case. If the casing not used
 * in the text is more than {@code THRESHOLD} times as probable, it is suggested.
 * @since 6.2
 */
public class UpperCaseNgramRule extends Rule {

  /** Factor by which the other casing must be more probable before a match is reported. */
  private static final int THRESHOLD = 50;

  /** The only tokens this rule looks at; the lookup is case-sensitive. */
  private static final Set<String> RELEVANT_WORDS = new HashSet<>(Arrays.asList(
    "maj", "Maj",
    "Måndag", "måndag"
  ));

  private final LanguageModel lm;

  /**
   * @param messages bundle passed to the base rule and used to resolve the casing category
   * @param lm language model used to compare the two casings, must not be {@code null}
   * @param langUser currently not used by this rule
   * @param userConfig currently not used by this rule
   */
  public UpperCaseNgramRule(ResourceBundle messages, LanguageModel lm, Language langUser, UserConfig userConfig) {
    super(messages);
    super.setCategory(Categories.CASING.getCategory(messages));
    this.lm = Objects.requireNonNull(lm);
    //setDefaultTempOff(); // fixme!
    setLocQualityIssueType(ITSIssueType.Misspelling);
    addExamplePair(Example.wrong("Antagningen sker 15 <marker>Maj</marker>."),
                   Example.fixed("Antagningen sker 15 <marker>maj</marker>."));
  }

  @Override
  public final String getId() {
    return "SV_UPPER_CASE_NGRAM";
  }

  @Override
  public String getDescription() {
    return "Granskar ord som ofta skrivs med liten eller stor begynnelsebokstav när det ska vara tvärsom";
  }

  /**
   * Checks every relevant token that has both a left and a right neighbour.
   * @return one match per token whose other casing is clearly more probable in context
   */
  @Override
  public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> matches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    // Index 0 and the last index are never candidates: the trigram needs tokens[i - 1] and tokens[i + 1].
    for (int i = 1; i + 1 < tokens.length; i++) {
      AnalyzedTokenReadings token = tokens[i];
      String tokenStr = token.getToken();
      if (!RELEVANT_WORDS.contains(tokenStr) || isAllUppercase(tokenStr)) {
        continue;
      }
      String prev = tokens[i - 1].getToken();
      String next = tokens[i + 1].getToken();
      String ucToken = uppercaseFirstChar(tokenStr);
      String lcToken = lowercaseFirstChar(tokenStr);
      Probability ucProb = lm.getPseudoProbability(Arrays.asList(prev, ucToken, next));
      Probability lcProb = lm.getPseudoProbability(Arrays.asList(prev, lcToken, next));
      // NOTE(review): the messages speak of a verb/noun distinction, which looks inherited
      // from another language's rule; confirm the wording fits the words listed above.
      if (startsWithUppercase(tokenStr)) {
        double ratio = lcProb.getProb() / ucProb.getProb();
        if (ratio > THRESHOLD) {
          String msg = "Menar du verbet '" + lcToken + "'? Oftast är det initialförkortningar och egennamn som skrivs med stor bokstav.";
          matches.add(createMatch(sentence, token, msg, lcToken));
        }
      } else {
        double ratio = ucProb.getProb() / lcProb.getProb();
        if (ratio > THRESHOLD) {
          String msg = "Menar du substantivet '" + ucToken + "'? Oftast är det initialförkortningar och egennamn som skrivs med stor bokstav.";
          matches.add(createMatch(sentence, token, msg, ucToken));
        }
      }
    }
    return toRuleMatchArray(matches);
  }

  /** Builds a match spanning {@code token} that carries a single suggested replacement. */
  private RuleMatch createMatch(AnalyzedSentence sentence, AnalyzedTokenReadings token, String msg, String replacement) {
    RuleMatch match = new RuleMatch(this, sentence, token.getStartPos(), token.getEndPos(), msg);
    match.setSuggestedReplacement(replacement);
    return match;
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Swedish confusion sets
# Line format:
# <word1>|<description1>; <word2>|<description2>; <factor> # optional comment
# <word1> and <word2> are words that can easily be confused
# <description> will be used in the error message to explain the word (optional)
# <factor> is how many times more probable the other word must be
# for the text to be considered potentially incorrect.
# Use a higher value for better precision but lower recall.
# Precision (p) and recall (r) values in the comments come from ConfusionRuleEvaluator
# The number after recall is the number of sentences used for evaluation.
# Order is relevant for ambiguous cases like 'know' ('no' or 'now') where the match
# is used whose pair comes first in this file.
# Alphabetical order on each line is also important!
#
dem; dom; 100
streck; sträck; 25

Loading

0 comments on commit cf1172f

Please sign in to comment.