-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[pt] Add logic for state info and toponym check
- Loading branch information
p-goulart
committed
Dec 22, 2023
1 parent
e031851
commit a9ddac2
Showing 9 changed files with 331 additions and 17 deletions.
There are no files selected for viewing
46 changes: 46 additions & 0 deletions
46
...etool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.*; | ||
|
||
public class BrazilianStateInfo { | ||
Set<String> ambiguousStates = new HashSet<>(Arrays.asList("RJ", "SP")); | ||
static final BrazilianStateInfoMap map = new BrazilianStateInfoMap(); | ||
String name; | ||
String abbreviation; | ||
String[] articles; | ||
String capital; | ||
|
||
BrazilianStateInfo(String name, String abbreviation, String[] articles, String capital) { | ||
this.name = name; | ||
this.abbreviation = abbreviation; | ||
this.articles = articles.clone(); | ||
this.capital = capital; | ||
} | ||
|
||
public boolean isAmbiguous() { | ||
return ambiguousStates.contains(abbreviation); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return String.format("<%s|%s|%s|%s>", name, abbreviation, Arrays.toString(articles), capital); | ||
} | ||
} |
33 changes: 33 additions & 0 deletions
33
...ol-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMap.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.Map; | ||
|
||
public class BrazilianStateInfoMap { | ||
private final Map<String, BrazilianStateInfo> map; | ||
|
||
public BrazilianStateInfoMap() { | ||
map = new BrazilianStateInfoMapLoader().buildMap(); | ||
} | ||
|
||
public BrazilianStateInfo get(String stateAbbreviation) { | ||
return map.get(stateAbbreviation); | ||
} | ||
} |
51 changes: 51 additions & 0 deletions
51
...guage-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMapLoader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import static org.languagetool.JLanguageTool.getDataBroker; | ||
|
||
public class BrazilianStateInfoMapLoader { | ||
private final String stateMappingFilename = "pt/state_name_mapping.txt"; | ||
|
||
private List<String> getStateMappingLines() { | ||
return getDataBroker().getFromResourceDirAsLines(stateMappingFilename); | ||
} | ||
|
||
public Map<String, BrazilianStateInfo> buildMap() { | ||
List<String> stateMappingLines = getStateMappingLines(); | ||
Map<String, BrazilianStateInfo> stateMap = new HashMap<>(); | ||
for (String line : stateMappingLines) { | ||
if (!line.startsWith("#") && !line.trim().isEmpty()) { | ||
String[] columns = line.split("\t"); | ||
if (columns.length == 4) { | ||
This comment has been minimized.
Sorry, something went wrong. |
||
String stateName = columns[0]; | ||
String abbreviation = columns[1]; | ||
String[] articles = columns[2].split(","); | ||
String capital = columns[3]; | ||
stateMap.put(abbreviation, new BrazilianStateInfo(stateName, abbreviation, articles, capital)); | ||
} | ||
} | ||
} | ||
return stateMap; | ||
} | ||
} |
2 changes: 1 addition & 1 deletion
2
...l-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...anguage-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
69 changes: 69 additions & 0 deletions
69
...-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import org.languagetool.AnalyzedSentence; | ||
import org.languagetool.rules.RuleMatch; | ||
import org.languagetool.rules.patterns.RegexRuleFilter; | ||
|
||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.regex.Matcher; | ||
import java.util.stream.Collectors; | ||
|
||
public class BrazilianToponymStateCheckFilter extends RegexRuleFilter { | ||
private static final BrazilianToponymMap map = new BrazilianToponymMap(); | ||
private static final BrazilianStateInfoMap stateMap = new BrazilianStateInfoMap(); | ||
|
||
@Override | ||
public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> arguments, AnalyzedSentence sentence, Matcher matcher) { | ||
// Group #2 isn't used *for now*, but at some point we may want to utilise this rule to also fix the separator | ||
// in one fell swoop, so let's leave it as a matching group. | ||
String toponym = matcher.group(1); | ||
String state = matcher.group(3); | ||
|
||
// If it isn't a city in *any* state, it's prob. not intended as a city, so we don't perform the check. | ||
if (!map.isValidToponym(toponym)) { | ||
return null; | ||
} | ||
if (map.isToponymInState(toponym, state)) { | ||
return null; | ||
} | ||
BrazilianToponymStateCheckResult checkResult = map.getStatesWithMunicipality(toponym); | ||
setStateAbbrevSuggestions(match, checkResult); | ||
setMessage(match, checkResult, stateMap.get(state)); | ||
return match; | ||
} | ||
|
||
private void setStateAbbrevSuggestions(RuleMatch match, BrazilianToponymStateCheckResult checkResult) { | ||
match.setSuggestedReplacements( | ||
checkResult.states.stream() | ||
.map(stateInfo -> stateInfo.abbreviation) | ||
.collect(Collectors.toList()) | ||
); | ||
} | ||
|
||
private void setMessage(RuleMatch match, BrazilianToponymStateCheckResult checkResult, BrazilianStateInfo wrongState) { | ||
String inTheStateOf = String.format("no estado %s %s", | ||
new PortuguesePreposition("de").contractWith(wrongState.articles[0]), | ||
wrongState.name); | ||
String message = String.format("O município %s não fica %s.", checkResult.matchedToponym, inTheStateOf); | ||
match.setMessage(message); | ||
} | ||
} |
32 changes: 32 additions & 0 deletions
32
...-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckResult.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
|
||
package org.languagetool.rules.pt; | ||
|
||
import java.util.List; | ||
|
||
public class BrazilianToponymStateCheckResult { | ||
public final List<BrazilianStateInfo> states; | ||
public final String matchedToponym; | ||
|
||
BrazilianToponymStateCheckResult(List<BrazilianStateInfo> states, String matchedToponym) { | ||
this.states = states; | ||
this.matchedToponym = matchedToponym; | ||
} | ||
} |
64 changes: 64 additions & 0 deletions
64
...ol-language-modules/pt/src/main/java/org/languagetool/rules/pt/PortuguesePreposition.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2023 Pedro Goulart | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
|
||
package org.languagetool.rules.pt; | ||
|
||
import java.util.Objects; | ||
|
||
/**
 * A Portuguese preposition that can be combined with an article, producing the contracted form
 * for "de", "em" and "a" (e.g. "de" + "o" = "do") and a plain space-separated form otherwise.
 */
public class PortuguesePreposition {
  // Base (uncontracted) form of the preposition, as returned by parseContraction.
  String value;
  // Prefix that is put directly in front of the article when contracting.
  String contractedOnset;

  /**
   * @param fullForm the preposition, either in its base form or already contracted (e.g. "do");
   *                 it is reduced to its base form by {@link #parseContraction(String)}
   */
  PortuguesePreposition(String fullForm) {
    this.value = parseContraction(fullForm);
    switch (value) {
      case "em": contractedOnset = "n"; break;
      case "de": contractedOnset = "d"; break;
      case "a": contractedOnset = "a"; break;
      // Prepositions without a contracted form are simply followed by a space.
      default: contractedOnset = value + " ";
    }
  }

  /**
   * @return the base form of the preposition
   */
  @Override
  public String toString() {
    return value;
  }

  /**
   * Maps a form to its base preposition using prefix heuristics: anything starting with "d"
   * becomes "de"; "em" or anything starting with "n" becomes "em"; anything starting with "a"
   * or "à" becomes "a". Every other form is returned unchanged.
   */
  private String parseContraction(String fullForm) {
    if (fullForm.startsWith("d")) {
      return "de";
    }
    if (fullForm.equals("em") || fullForm.startsWith("n")) {
      return "em";
    }
    if (fullForm.startsWith("a") || fullForm.startsWith("à")) {
      return "a";
    }
    return fullForm;
  }

  /**
   * Combines this preposition with an article.
   *
   * @param article the article to contract with; {@code "0"} means "no article". A
   *                {@code null}, empty or blank article is treated the same way, so that
   *                missing data yields the bare preposition rather than "dnull" or a
   *                dangling "d"/"n"
   * @return the bare preposition when there is no article, otherwise the contracted form
   *         (e.g. "do", "na", "à") or, for non-contracting prepositions, "preposition article"
   */
  public String contractWith(String article) {
    if (article == null || article.trim().isEmpty() || Objects.equals(article, "0")) {
      return value;
    }
    String contracted = contractedOnset + article;
    // "a" + "a"/"as" would give "aa"/"aas"; Portuguese writes this as "à"/"às".
    contracted = contracted.replace("aa", "à");
    return contracted;
  }
}
Maybe add an `else` here that makes the loading fail, so that invalid lines get noticed (instead of just being ignored)?