-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[pt] Add logic for Brazilian municipality/state mapping #9946
Merged
Merged
Changes from 2 commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
49 changes: 49 additions & 0 deletions
49
...l-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2012 Jaume Ortolà i Font | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import org.languagetool.AnalyzedSentence; | ||
import org.languagetool.rules.RuleMatch; | ||
import org.languagetool.rules.patterns.RegexRuleFilter; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.regex.Matcher; | ||
|
||
public class BrazilianToponymFilter extends RegexRuleFilter { | ||
@Override | ||
public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> arguments, AnalyzedSentence sentence, Matcher matcher) { | ||
String toponym = matcher.group(1); | ||
String underlined = matcher.group(2); | ||
String state = matcher.group(3); | ||
|
||
// TODO: read this from user options or something... | ||
String suggestion = "–" + state; | ||
if (suggestion.equals(underlined)) { | ||
return null; | ||
} | ||
BrazilianToponymMap map = new BrazilianToponymMap(); | ||
// If it isn't a city in *any* state, it's prob. not intended as a city, so we don't perform the check. | ||
if (!map.isValidToponym(toponym)) { | ||
return null; | ||
} | ||
match.setSuggestedReplacement(suggestion); | ||
return match; | ||
} | ||
} |
71 changes: 71 additions & 0 deletions
71
...tool-language-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMap.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2012 Jaume Ortolà i Font | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
public class BrazilianToponymMap { | ||
private final Map<String, List<String>> map; | ||
|
||
BrazilianToponymMap() { | ||
map = new BrazilianToponymMapLoader().buildMap(); | ||
} | ||
|
||
// Since the actually toponym string is only a heuristic, it could be that we match more than we need, e.g.: | ||
// "Venho do Rio de Janeiro" will match, hungrily, the whole thing, rather than just "Rio de Janeiro". | ||
// To account for this, we loop, dropping the leftmost element of the toponym until we can't check any more. | ||
private <T> T toponymIter(String toponym, Function<String, T> processor, T defaultValue) { | ||
String normalisedToponym = toponym.replace('-', ' ').toLowerCase(); | ||
String[] toponymParts = normalisedToponym.split(" "); | ||
int toponymLength = toponymParts.length; | ||
for (int i = 0; i < toponymLength; i++) { | ||
String toponymToCheck = String.join(" ", Arrays.copyOfRange(toponymParts, i, toponymLength)); | ||
T result = processor.apply(toponymToCheck); | ||
if (result != null) { | ||
return result; | ||
} | ||
} | ||
return defaultValue; | ||
} | ||
|
||
public boolean isValidToponym(String toponym) { | ||
return toponymIter(toponym, toponymToCheck -> | ||
map.values().stream().anyMatch(list -> list.contains(toponymToCheck)) ? true : null, | ||
false); | ||
} | ||
|
||
public List<String> getStatesWithMunicipality(String toponym) { | ||
List<String> states = new ArrayList<>(); | ||
map.forEach((state, municipalities) -> { | ||
if (municipalities.contains(toponym)) { | ||
states.add(state); | ||
} | ||
}); | ||
return states; | ||
} | ||
|
||
public boolean isToponymInState(String toponym, String state) { | ||
List<String> municipalities = map.get(state); | ||
return municipalities != null && municipalities.contains(toponym); | ||
} | ||
} |
75 changes: 75 additions & 0 deletions
75
...anguage-modules/pt/src/main/java/org/languagetool/rules/pt/BrazilianToponymMapLoader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2012 Jaume Ortolà i Font | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.*; | ||
import java.util.stream.Collectors; | ||
|
||
import static org.languagetool.JLanguageTool.getDataBroker; | ||
|
||
public class BrazilianToponymMapLoader { | ||
private final String toponymFilepath = "pt/brazilian_municipalities"; | ||
private final List<String> states = Arrays.asList( | ||
"AC", // Acre | ||
"AL", // Alagoas | ||
"AP", // Amapá | ||
"AM", // Amazonas | ||
"BA", // Bahia | ||
"CE", // Ceará | ||
"DF", // Distrito Federal | ||
"ES", // Espírito Santo | ||
"GO", // Goiás | ||
"MA", // Maranhão | ||
"MT", // Mato Grosso | ||
"MS", // Mato Grosso do Sul | ||
"MG", // Minas Gerais | ||
"PA", // Pará | ||
"PB", // Paraíba | ||
"PR", // Paraná | ||
"PE", // Pernambuco | ||
"PI", // Piauí | ||
"RJ", // Rio de Janeiro | ||
"RN", // Rio Grande do Norte | ||
"RS", // Rio Grande do Sul | ||
"RO", // Rondônia | ||
"RR", // Roraima | ||
"SC", // Santa Catarina | ||
"SP", // São Paulo | ||
"SE", // Sergipe | ||
"TO" // Tocantins | ||
); | ||
|
||
BrazilianToponymMapLoader() { | ||
} | ||
|
||
private List<String> getToponymsFromState(String state) { | ||
List<String> toponyms = getDataBroker().getFromResourceDirAsLines(toponymFilepath + "/" + state + ".tsv"); | ||
return toponyms.stream() | ||
.map(toponym -> toponym.replace('-', ' ').toLowerCase()) | ||
.collect(Collectors.toList()); | ||
} | ||
|
||
public Map<String, List<String>> buildMap() { | ||
Map<String, List<String>> map = new HashMap<>(); | ||
for (String state : states) { | ||
map.put(state, getToponymsFromState(state)); | ||
} | ||
return map; | ||
} | ||
} |
22 changes: 22 additions & 0 deletions
22
...odules/pt/src/main/resources/org/languagetool/resource/pt/brazilian_municipalities/AC.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Acrelândia | ||
Assis Brasil | ||
Brasiléia | ||
Bujari | ||
Capixaba | ||
Cruzeiro do Sul | ||
Epitaciolândia | ||
Feijó | ||
Jordão | ||
Manoel Urbano | ||
Marechal Thaumaturgo | ||
Mâncio Lima | ||
Plácido de Castro | ||
Porto Acre | ||
Porto Walter | ||
Rio Branco | ||
Rodrigues Alves | ||
Santa Rosa do Purus | ||
Sena Madureira | ||
Senador Guiomard | ||
Tarauacá | ||
Xapuri |
102 changes: 102 additions & 0 deletions
102
...odules/pt/src/main/resources/org/languagetool/resource/pt/brazilian_municipalities/AL.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
Água Branca | ||
Anadia | ||
Arapiraca | ||
Atalaia | ||
Barra de Santo Antônio | ||
Barra de São Miguel | ||
Batalha | ||
Belém | ||
Belo Monte | ||
Boca da Mata | ||
Branquinha | ||
Cacimbinhas | ||
Cajueiro | ||
Campestre | ||
Campo Alegre | ||
Campo Grande | ||
Canapi | ||
Capela | ||
Carneiros | ||
Chã Preta | ||
Coité do Noia | ||
Colônia Leopoldina | ||
Coqueiro Seco | ||
Coruripe | ||
Craíbas | ||
Delmiro Gouveia | ||
Dois Riachos | ||
Estrela de Alagoas | ||
Feira Grande | ||
Feliz Deserto | ||
Flexeiras | ||
Girau do Ponciano | ||
Ibateguara | ||
Igaci | ||
Igreja Nova | ||
Inhapi | ||
Jacaré dos Homens | ||
Jacuípe | ||
Japaratinga | ||
Jaramataia | ||
Jequiá da Praia | ||
Joaquim Gomes | ||
Jundiá | ||
Junqueiro | ||
Lagoa da Canoa | ||
Limoeiro de Anadia | ||
Maceió | ||
Major Izidoro | ||
Mar Vermelho | ||
Maragogi | ||
Maravilha | ||
Marechal Deodoro | ||
Maribondo | ||
Mata Grande | ||
Matriz de Camaragibe | ||
Messias | ||
Minador do Negrão | ||
Monteirópolis | ||
Murici | ||
Novo Lino | ||
Olho d'Água das Flores | ||
Olho d'Água do Casado | ||
Olho d'Água Grande | ||
Olivença | ||
Ouro Branco | ||
Palestina | ||
Palmeira dos Índios | ||
Pão de Açúcar | ||
Pariconha | ||
Paripueira | ||
Passo de Camaragibe | ||
Paulo Jacinto | ||
Penedo | ||
Piaçabuçu | ||
Pilar | ||
Pindoba | ||
Piranhas | ||
Poço das Trincheiras | ||
Porto Calvo | ||
Porto de Pedras | ||
Porto Real do Colégio | ||
Quebrangulo | ||
Rio Largo | ||
Roteiro | ||
Santa Luzia do Norte | ||
Santana do Ipanema | ||
Santana do Mundaú | ||
São Brás | ||
São José da Laje | ||
São José da Tapera | ||
São Luís do Quitunde | ||
São Miguel dos Campos | ||
São Miguel dos Milagres | ||
São Sebastião | ||
Satuba | ||
Senador Rui Palmeira | ||
Tanque d'Arca | ||
Taquarana | ||
Teotônio Vilela | ||
Traipu | ||
União dos Palmares | ||
Viçosa |
62 changes: 62 additions & 0 deletions
62
...odules/pt/src/main/resources/org/languagetool/resource/pt/brazilian_municipalities/AM.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
Alvarães | ||
Amaturá | ||
Anamã | ||
Anori | ||
Apuí | ||
Atalaia do Norte | ||
Autazes | ||
Barcelos | ||
Barreirinha | ||
Benjamin Constant | ||
Beruri | ||
Boa Vista do Ramos | ||
Boca do Acre | ||
Borba | ||
Caapiranga | ||
Canutama | ||
Carauari | ||
Careiro | ||
Careiro da Várzea | ||
Coari | ||
Codajás | ||
Eirunepé | ||
Envira | ||
Fonte Boa | ||
Guajará | ||
Humaitá | ||
Ipixuna | ||
Iranduba | ||
Itacoatiara | ||
Itamarati | ||
Itapiranga | ||
Japurá | ||
Juruá | ||
Jutaí | ||
Lábrea | ||
Manacapuru | ||
Manaquiri | ||
Manaus | ||
Manicoré | ||
Maraã | ||
Maués | ||
Nhamundá | ||
Nova Olinda do Norte | ||
Novo Airão | ||
Novo Aripuanã | ||
Parintins | ||
Pauini | ||
Presidente Figueiredo | ||
Rio Preto da Eva | ||
Santa Isabel do Rio Negro | ||
Santo Antônio do Içá | ||
São Gabriel da Cachoeira | ||
São Paulo de Olivença | ||
São Sebastião do Uatumã | ||
Silves | ||
Tabatinga | ||
Tapauá | ||
Tefé | ||
Tonantins | ||
Uarini | ||
Urucará | ||
Urucurituba |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to make this a static constant? We maybe don't want to re-initialize the object every time there's a potential match.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we should definitely do that. I was certain I had, but I think I moved it around a bit and forgot to put it back.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done