diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfo.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfo.java new file mode 100644 index 000000000000..89fd479f5dbf --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfo.java @@ -0,0 +1,46 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules.pt; + +import java.util.*; + +public class BrazilianStateInfo { + Set ambiguousStates = new HashSet<>(Arrays.asList("RJ", "SP")); + static final BrazilianStateInfoMap map = new BrazilianStateInfoMap(); + String name; + String abbreviation; + String[] articles; + String capital; + + BrazilianStateInfo(String name, String abbreviation, String[] articles, String capital) { + this.name = name; + this.abbreviation = abbreviation; + this.articles = articles.clone(); + this.capital = capital; + } + + public boolean isAmbiguous() { + return ambiguousStates.contains(abbreviation); + } + + @Override + public String toString() { + return String.format("<%s|%s|%s|%s>", name, abbreviation, Arrays.toString(articles), capital); + } +} diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMap.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMap.java new file mode 100644 index 000000000000..06a86bcba5a7 --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMap.java @@ -0,0 +1,33 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules.pt; + +import java.util.Map; + +public class BrazilianStateInfoMap { + private final Map map; + + public BrazilianStateInfoMap() { + map = new BrazilianStateInfoMapLoader().buildMap(); + } + + public BrazilianStateInfo get(String stateAbbreviation) { + return map.get(stateAbbreviation); + } +} diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMapLoader.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMapLoader.java new file mode 100644 index 000000000000..b8ea0a4eee19 --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianStateInfoMapLoader.java @@ -0,0 +1,51 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules.pt; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.languagetool.JLanguageTool.getDataBroker; + +public class BrazilianStateInfoMapLoader { + private final String stateMappingFilename = "pt/state_name_mapping.txt"; + + private List getStateMappingLines() { + return getDataBroker().getFromResourceDirAsLines(stateMappingFilename); + } + + public Map buildMap() { + List stateMappingLines = getStateMappingLines(); + Map stateMap = new HashMap<>(); + for (String line : stateMappingLines) { + if (!line.startsWith("#") && !line.trim().isEmpty()) { + String[] columns = line.split("\t"); + if (columns.length == 4) { + String stateName = columns[0]; + String abbreviation = columns[1]; + String[] articles = columns[2].split(","); + String capital = columns[3]; + stateMap.put(abbreviation, new BrazilianStateInfo(stateName, abbreviation, articles, capital)); + } + } + } + return stateMap; + } +} diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java index 091c073c0633..8e690072b97a 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java @@ -1,5 +1,5 @@ /* LanguageTool, a natural language style checker - * Copyright (C) 2012 Jaume Ortolà i Font + * Copyright (C) 2023 Pedro Goulart * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public diff --git 
a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMap.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMap.java index 1dad280aaee7..888fa536cd70 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMap.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMap.java @@ -1,5 +1,5 @@ /* LanguageTool, a natural language style checker - * Copyright (C) 2012 Jaume Ortolà i Font + * Copyright (C) 2023 Pedro Goulart * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -22,13 +22,16 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; public class BrazilianToponymMap { - private final Map> map; + private final Map> toponymMap; + private static final BrazilianStateInfoMap stateMap = new BrazilianStateInfoMap(); BrazilianToponymMap() { - map = new BrazilianToponymMapLoader().buildMap(); + toponymMap = new BrazilianToponymMapLoader().buildMap(); } // Since the actually toponym string is only a heuristic, it could be that we match more than we need, e.g.: @@ -50,22 +53,38 @@ private T toponymIter(String toponym, Function processor, T defau public boolean isValidToponym(String toponym) { return toponymIter(toponym, toponymToCheck -> - map.values().stream().anyMatch(list -> list.contains(toponymToCheck)) ? true : null, + toponymMap.values().stream().anyMatch(list -> list.contains(toponymToCheck)) ? 
true : null, false); } - public List getStatesWithMunicipality(String toponym) { - List states = new ArrayList<>(); - map.forEach((state, municipalities) -> { - if (municipalities.contains(toponym)) { - states.add(state); - } - }); - return states; + public boolean isToponymInState(String toponym, String state) { + List municipalities = toponymMap.get(state); + if (municipalities == null) { + return false; + } + Function processor = municipalities::contains; + return toponymIter(toponym, processor, false); } - public boolean isToponymInState(String toponym, String state) { - List municipalities = map.get(state); - return municipalities != null && municipalities.contains(toponym); + public BrazilianToponymStateCheckResult getStatesWithMunicipality(String toponym) { + List allMatchedStates = new ArrayList<>(); + AtomicReference matchedToponym = new AtomicReference<>(null); + String[] originalToponymParts = toponym.split(" "); + String[] normalizedToponymParts = toponym.replace('-', ' ').toLowerCase().split(" "); + for (int i = 0; i < normalizedToponymParts.length; i++) { + String normalizedToponymToCheck = String.join(" ", + Arrays.copyOfRange(normalizedToponymParts, i, normalizedToponymParts.length)); + for (Map.Entry> entry : toponymMap.entrySet()) { + if (entry.getValue().contains(normalizedToponymToCheck)) { + allMatchedStates.add(stateMap.get(entry.getKey())); + if (matchedToponym.get() == null) { + String prettyToponym = String.join(" ", + Arrays.copyOfRange(originalToponymParts, i, originalToponymParts.length)); + matchedToponym.set(prettyToponym); + } + } + } + } + return new BrazilianToponymStateCheckResult(allMatchedStates, matchedToponym.get()); } } diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java index 9bd7b7b2933f..24537ba43aba 100644 --- 
a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java @@ -1,5 +1,5 @@ /* LanguageTool, a natural language style checker - * Copyright (C) 2012 Jaume Ortolà i Font + * Copyright (C) 2023 Pedro Goulart * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckFilter.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckFilter.java new file mode 100644 index 000000000000..e34f4e2f379f --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckFilter.java @@ -0,0 +1,69 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package org.languagetool.rules.pt; + +import org.languagetool.AnalyzedSentence; +import org.languagetool.rules.RuleMatch; +import org.languagetool.rules.patterns.RegexRuleFilter; + +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.stream.Collectors; + +public class BrazilianToponymStateCheckFilter extends RegexRuleFilter { + private static final BrazilianToponymMap map = new BrazilianToponymMap(); + private static final BrazilianStateInfoMap stateMap = new BrazilianStateInfoMap(); + + @Override + public RuleMatch acceptRuleMatch(RuleMatch match, Map arguments, AnalyzedSentence sentence, Matcher matcher) { + // Group #2 isn't used *for now*, but at some point we may want to utilise this rule to also fix the separator + // in one fell swoop, so let's leave it as a matching group. + String toponym = matcher.group(1); + String state = matcher.group(3); + + // If it isn't a city in *any* state, it's prob. not intended as a city, so we don't perform the check. 
+ if (!map.isValidToponym(toponym)) { + return null; + } + if (map.isToponymInState(toponym, state)) { + return null; + } + BrazilianToponymStateCheckResult checkResult = map.getStatesWithMunicipality(toponym); + setStateAbbrevSuggestions(match, checkResult); + setMessage(match, checkResult, stateMap.get(state)); + return match; + } + + private void setStateAbbrevSuggestions(RuleMatch match, BrazilianToponymStateCheckResult checkResult) { + match.setSuggestedReplacements( + checkResult.states.stream() + .map(stateInfo -> stateInfo.abbreviation) + .collect(Collectors.toList()) + ); + } + + private void setMessage(RuleMatch match, BrazilianToponymStateCheckResult checkResult, BrazilianStateInfo wrongState) { + String inTheStateOf = String.format("no estado %s %s", + new PortuguesePreposition("de").contractWith(wrongState.articles[0]), + wrongState.name); + String message = String.format("O município %s não fica %s.", checkResult.matchedToponym, inTheStateOf); + match.setMessage(message); + } +} diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckResult.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckResult.java new file mode 100644 index 000000000000..025a4bb5c16a --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymStateCheckResult.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package org.languagetool.rules.pt; + +import java.util.List; + +public class BrazilianToponymStateCheckResult { + public final List states; + public final String matchedToponym; + + BrazilianToponymStateCheckResult(List states, String matchedToponym) { + this.states = states; + this.matchedToponym = matchedToponym; + } +} diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/PortuguesePreposition.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/PortuguesePreposition.java new file mode 100644 index 000000000000..fdf84b87a10d --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/PortuguesePreposition.java @@ -0,0 +1,64 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2023 Pedro Goulart + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * A Portuguese preposition reduced to its base form ("de", "em", "a", or the input as-is)
 * that can be fused with an article: "de" + "o" gives "do", "em" + "a" gives "na",
 * "a" + "a" gives a-grave. Other prepositions are joined to the article with a space.
 */
public class PortuguesePreposition {
  // Article marker meaning "this noun takes no article".
  private static final String NO_ARTICLE = "0";
  // a-grave, written as an escape so the class compiles under any source encoding.
  private static final String A_GRAVE = "\u00e0";

  /** Base form of the preposition, e.g. "de" for "do"/"da"/"dos". */
  final String value;
  /** Text that replaces the preposition in front of an article ("d", "n", "a", or value + space). */
  final String contractedOnset;

  /**
   * @param fullForm the preposition, plain or already contracted (e.g. "de", "do", "na")
   * @throws NullPointerException if {@code fullForm} is {@code null}
   */
  PortuguesePreposition(String fullForm) {
    this.value = parseContraction(fullForm);
    this.contractedOnset = onsetFor(value);
  }

  /** @return the base form of the preposition */
  @Override
  public String toString() {
    return value;
  }

  /** Picks the prefix that fuses with an article for the given base form. */
  private static String onsetFor(String base) {
    switch (base) {
      case "em":
        return "n";
      case "de":
        return "d";
      case "a":
        return "a";
      default:
        // Non-contracting prepositions are simply followed by a space and the article.
        return base + " ";
    }
  }

  /**
   * Maps a possibly contracted form back to its base preposition using first-letter
   * heuristics; anything unrecognised is returned unchanged.
   */
  private static String parseContraction(String fullForm) {
    if (fullForm.startsWith("d")) {
      return "de";
    }
    if (fullForm.equals("em") || fullForm.startsWith("n")) {
      return "em";
    }
    if (fullForm.startsWith("a") || fullForm.startsWith(A_GRAVE)) {
      return "a";
    }
    return fullForm;
  }

  /**
   * Fuses this preposition with an article.
   *
   * @param article the article to attach; {@code "0"}, an empty string or {@code null}
   *                all mean "no article"
   * @return the contracted form, or the bare preposition when there is no article
   */
  public String contractWith(String article) {
    // Previously only "0" was recognised, so "" produced a dangling onset such as "d"
    // and null produced "dnull".
    if (article == null || article.isEmpty() || NO_ARTICLE.equals(article)) {
      return value;
    }
    String contracted = contractedOnset + article;
    // Crasis: preposition "a" + article "a"/"as" is written with a-grave.
    return contracted.replace("aa", A_GRAVE);
  }
}