Skip to content

Commit

Permalink
[pt] Add logic for state info and toponym check
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Dec 22, 2023
1 parent e031851 commit a9ddac2
Show file tree
Hide file tree
Showing 9 changed files with 331 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.pt;

import java.util.*;

/**
 * Data holder for a single Brazilian state: its name, its abbreviation, the article forms used
 * with its name, and its capital.
 */
public class BrazilianStateInfo {
  // Abbreviations reported as ambiguous by isAmbiguous(). Shared by every instance and
  // read-only, so it is built once instead of once per state object.
  // NOTE(review): presumably these are states whose name is also a city name — confirm.
  // Declared before `map` so it is already assigned when `map`'s initializer runs.
  static final Set<String> ambiguousStates =
    Collections.unmodifiableSet(new HashSet<>(Arrays.asList("RJ", "SP")));
  static final BrazilianStateInfoMap map = new BrazilianStateInfoMap();
  String name;
  String abbreviation;
  String[] articles;
  String capital;

  /**
   * @param name         state name
   * @param abbreviation state abbreviation (the value checked by {@link #isAmbiguous()})
   * @param articles     article forms for the state; copied, so later changes to the caller's
   *                     array do not affect this object. Must not be null.
   * @param capital      name of the state's capital
   */
  BrazilianStateInfo(String name, String abbreviation, String[] articles, String capital) {
    this.name = name;
    this.abbreviation = abbreviation;
    this.articles = articles.clone();
    this.capital = capital;
  }

  /**
   * @return true if this state's abbreviation is one of the abbreviations listed as ambiguous
   */
  public boolean isAmbiguous() {
    return ambiguousStates.contains(abbreviation);
  }

  /**
   * @return a debug representation of the form {@code <name|abbreviation|[articles]|capital>}
   */
  @Override
  public String toString() {
    return String.format("<%s|%s|%s|%s>", name, abbreviation, Arrays.toString(articles), capital);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.pt;

import java.util.Map;

/**
 * Lookup table of {@link BrazilianStateInfo} objects keyed by state abbreviation.
 * The table is built once, in the constructor, by {@link BrazilianStateInfoMapLoader}.
 */
public class BrazilianStateInfoMap {
  private final Map<String, BrazilianStateInfo> infoByAbbreviation;

  /** Builds the table by delegating to a freshly created loader. */
  public BrazilianStateInfoMap() {
    BrazilianStateInfoMapLoader loader = new BrazilianStateInfoMapLoader();
    this.infoByAbbreviation = loader.buildMap();
  }

  /**
   * @param stateAbbreviation key to look up
   * @return the entry stored under that key, or null if there is none
   */
  public BrazilianStateInfo get(String stateAbbreviation) {
    BrazilianStateInfo info = infoByAbbreviation.get(stateAbbreviation);
    return info;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.pt;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.languagetool.JLanguageTool.getDataBroker;

public class BrazilianStateInfoMapLoader {
private final String stateMappingFilename = "pt/state_name_mapping.txt";

private List<String> getStateMappingLines() {
return getDataBroker().getFromResourceDirAsLines(stateMappingFilename);
}

public Map<String, BrazilianStateInfo> buildMap() {
List<String> stateMappingLines = getStateMappingLines();
Map<String, BrazilianStateInfo> stateMap = new HashMap<>();
for (String line : stateMappingLines) {
if (!line.startsWith("#") && !line.trim().isEmpty()) {
String[] columns = line.split("\t");
if (columns.length == 4) {

This comment has been minimized.

Copy link
@danielnaber

danielnaber Dec 22, 2023

Member

Maybe add an else here that makes the loading fail, so that invalid lines get noticed (instead of just being ignored)?

String stateName = columns[0];
String abbreviation = columns[1];
String[] articles = columns[2].split(",");
String capital = columns[3];
stateMap.put(abbreviation, new BrazilianStateInfo(stateName, abbreviation, articles, capital));
}
}
}
return stateMap;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2012 Jaume Ortolà i Font
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2012 Jaume Ortolà i Font
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
Expand All @@ -22,13 +22,16 @@
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;

public class BrazilianToponymMap {
private final Map<String, List<String>> map;
private final Map<String, List<String>> toponymMap;
private static final BrazilianStateInfoMap stateMap = new BrazilianStateInfoMap();

BrazilianToponymMap() {
map = new BrazilianToponymMapLoader().buildMap();
toponymMap = new BrazilianToponymMapLoader().buildMap();
}

// Since the actual toponym string is only a heuristic, it could be that we match more than we need, e.g.:
Expand All @@ -50,22 +53,38 @@ private <T> T toponymIter(String toponym, Function<String, T> processor, T defau

public boolean isValidToponym(String toponym) {
return toponymIter(toponym, toponymToCheck ->
map.values().stream().anyMatch(list -> list.contains(toponymToCheck)) ? true : null,
toponymMap.values().stream().anyMatch(list -> list.contains(toponymToCheck)) ? true : null,
false);
}

public List<String> getStatesWithMunicipality(String toponym) {
List<String> states = new ArrayList<>();
map.forEach((state, municipalities) -> {
if (municipalities.contains(toponym)) {
states.add(state);
}
});
return states;
public boolean isToponymInState(String toponym, String state) {
List<String> municipalities = toponymMap.get(state);
if (municipalities == null) {
return false;
}
Function<String, Boolean> processor = municipalities::contains;
return toponymIter(toponym, processor, false);
}

public boolean isToponymInState(String toponym, String state) {
List<String> municipalities = map.get(state);
return municipalities != null && municipalities.contains(toponym);
public BrazilianToponymStateCheckResult getStatesWithMunicipality(String toponym) {
List<BrazilianStateInfo> allMatchedStates = new ArrayList<>();
AtomicReference<String> matchedToponym = new AtomicReference<>(null);
String[] originalToponymParts = toponym.split(" ");
String[] normalizedToponymParts = toponym.replace('-', ' ').toLowerCase().split(" ");
for (int i = 0; i < normalizedToponymParts.length; i++) {
String normalizedToponymToCheck = String.join(" ",
Arrays.copyOfRange(normalizedToponymParts, i, normalizedToponymParts.length));
for (Map.Entry<String, List<String>> entry : toponymMap.entrySet()) {
if (entry.getValue().contains(normalizedToponymToCheck)) {
allMatchedStates.add(stateMap.get(entry.getKey()));
if (matchedToponym.get() == null) {
String prettyToponym = String.join(" ",
Arrays.copyOfRange(originalToponymParts, i, originalToponymParts.length));
matchedToponym.set(prettyToponym);
}
}
}
}
return new BrazilianToponymStateCheckResult(allMatchedStates, matchedToponym.get());
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2012 Jaume Ortolà i Font
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.pt;

import org.languagetool.AnalyzedSentence;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.RegexRuleFilter;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.stream.Collectors;

/**
 * Regex rule filter that keeps a match only when the captured toponym is a known municipality
 * that is not listed under the captured state. For such matches it sets the message and offers
 * the abbreviations of the states where the toponym was found as replacements.
 */
public class BrazilianToponymStateCheckFilter extends RegexRuleFilter {
  private static final BrazilianToponymMap map = new BrazilianToponymMap();
  private static final BrazilianStateInfoMap stateMap = new BrazilianStateInfoMap();

  /**
   * @param matcher regex matcher whose group 1 is the toponym and group 3 the state abbreviation
   * @return {@code match} with message and suggestions set, or null when the match is discarded
   */
  @Override
  public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> arguments, AnalyzedSentence sentence, Matcher matcher) {
    // Group #2 isn't used *for now*, but at some point we may want to utilise this rule to also fix the separator
    // in one fell swoop, so let's leave it as a matching group.
    String toponym = matcher.group(1);
    String state = matcher.group(3);

    // If it isn't a city in *any* state, it's prob. not intended as a city, so we don't perform the check.
    if (!map.isValidToponym(toponym)) {
      return null;
    }
    if (map.isToponymInState(toponym, state)) {
      return null;
    }
    // Without state info the message cannot be built; discard the match rather than fail with
    // a NullPointerException on an abbreviation missing from the state map.
    BrazilianStateInfo wrongState = stateMap.get(state);
    if (wrongState == null) {
      return null;
    }
    BrazilianToponymStateCheckResult checkResult = map.getStatesWithMunicipality(toponym);
    setStateAbbrevSuggestions(match, checkResult);
    // The lookup may not report which part of the text it matched; fall back to the raw capture
    // so the message never contains the text "null".
    String shownToponym = checkResult.matchedToponym != null ? checkResult.matchedToponym : toponym;
    setMessage(match, shownToponym, wrongState);
    return match;
  }

  /** Offers each matched state's abbreviation once, skipping states with no info available. */
  private void setStateAbbrevSuggestions(RuleMatch match, BrazilianToponymStateCheckResult checkResult) {
    List<String> abbreviations = checkResult.states.stream()
      .filter(stateInfo -> stateInfo != null)
      .map(stateInfo -> stateInfo.abbreviation)
      .distinct()
      .collect(Collectors.toList());
    match.setSuggestedReplacements(abbreviations);
  }

  /** Sets the message saying that the municipality is not in {@code wrongState}. */
  private void setMessage(RuleMatch match, String shownToponym, BrazilianStateInfo wrongState) {
    // "de" is contracted with the first article listed for the state.
    String inTheStateOf = String.format("no estado %s %s",
      new PortuguesePreposition("de").contractWith(wrongState.articles[0]),
      wrongState.name);
    String message = String.format("O município %s não fica %s.", shownToponym, inTheStateOf);
    match.setMessage(message);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/

package org.languagetool.rules.pt;

import java.util.List;

/**
 * Pairs a list of state records with the toponym text that was matched. Both values are
 * stored exactly as passed to the constructor.
 */
public class BrazilianToponymStateCheckResult {
  /** State records for the toponym; the caller's list is kept by reference, not copied. */
  public final List<BrazilianStateInfo> states;
  /** The toponym text that was matched; stored as given, so it is null if null was passed. */
  public final String matchedToponym;

  BrazilianToponymStateCheckResult(final List<BrazilianStateInfo> states, final String matchedToponym) {
    this.matchedToponym = matchedToponym;
    this.states = states;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2023 Pedro Goulart
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/

package org.languagetool.rules.pt;

import java.util.Objects;

/**
 * A Portuguese preposition reduced to its base form, able to contract itself with an article
 * (for example "de" + "o" gives "do", "em" + "a" gives "na", "a" + "a" gives "à").
 */
public class PortuguesePreposition {
  // Marker used in place of an article to say that no article is used.
  private static final String NO_ARTICLE = "0";

  String value;
  String contractedOnset;

  /**
   * @param fullForm the preposition, either plain ("de", "em", "a") or already contracted
   *                 ("do", "na", "à"); must not be null
   * @throws NullPointerException if {@code fullForm} is null
   */
  PortuguesePreposition(String fullForm) {
    Objects.requireNonNull(fullForm, "fullForm");
    this.value = parseContraction(fullForm);
    switch (value) {
      case "em": contractedOnset = "n"; break;
      case "de": contractedOnset = "d"; break;
      case "a": contractedOnset = "a"; break;
      // Prepositions without a contracted form are simply followed by a space and the article.
      default: contractedOnset = value + " ";
    }
  }

  /** @return the base form of the preposition */
  @Override
  public String toString() {
    return value;
  }

  /**
   * Maps a possibly contracted form to its base preposition using the first letter only:
   * forms starting with "d" become "de", "em" and forms starting with "n" become "em", forms
   * starting with "a" or "à" become "a". Anything else is returned unchanged.
   */
  private String parseContraction(String fullForm) {
    if (fullForm.startsWith("d")) {
      return "de";
    }
    if (fullForm.equals("em") || fullForm.startsWith("n")) {
      return "em";
    }
    if (fullForm.startsWith("a") || fullForm.startsWith("à")) {
      return "a";
    }
    return fullForm;
  }

  /**
   * Joins this preposition with an article.
   *
   * @param article the article to contract with; "0", null, or an empty string mean that no
   *                article is used
   * @return the contracted form, or the bare preposition when no article is used
   */
  public String contractWith(String article) {
    // A missing article used to produce "dnull" (null) or a bare "d" (empty string).
    if (article == null || article.isEmpty() || NO_ARTICLE.equals(article)) {
      return value;
    }
    // "a" + "a(s)" is written with a grave accent: "à(s)".
    return (contractedOnset + article).replace("aa", "à");
  }
}

0 comments on commit a9ddac2

Please sign in to comment.