Skip to content

Commit

Permalink
[pt] Ignore probable English-language toponyms (#10674)
Browse files Browse the repository at this point in the history
* [pt] Add UK counties & councils to spelling

* [pt] Tag likely English toponyms with ignore

 - if unknown (i.e. untagged) words precede a US, Canadian, Australian,
   or British state, province, county or council area, we run the
   English check on them;

 - if they are *valid* English, we tag them with _english_ignore_;

 - this should help us stop correcting English-language toponyms;

 - in the future, I suppose we could have a more global solution for all
   sorts of toponyms... why not have a list of German/Austrian
   Bundesländer, French départements, Italian... pizza slices, etc.

* [pt] Add English-language toponym speller tests
  • Loading branch information
p-goulart authored Jun 26, 2024
1 parent 3f35ff4 commit 190624a
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3769,6 +3769,152 @@
</rule>
</rulegroup>

<rulegroup id="IGNORE_PROBABLE_ENGLISH_TOPONYMS" name="Ignore words with a high likelihood of being English-language toponyms">
<!--
  All rules in this group share one shape:
    1. the marked token: tagged UNKNOWN, starting with an uppercase letter and at least two characters long;
    2. a comma with no space before it;
    3. (rules #2 to #9 only) an optional Portuguese lead-in, each part independently optional:
       "em"/"no"/"na"/"nos"/"nas", then "estado"/"província"/"condado"/"ducado", then "de"/"do"/"da"/"dos"/"das";
    4. the name of a US, Canadian, Australian or British subnational division.
  The rules differ only in how the name in step 4 is spelled out.
  On a match, IsEnglishWordFilter is given pattern position 1 (the marked token) and a reading with the
  POS tag _english_ignore_ is added to that token.
  NOTE(review): presumably the filter discards the match when the marked word is not valid English;
  confirm against the filter implementation.
-->
<rule> <!-- #1: the comma is followed directly by a US state abbreviation (entity state_abbrev_us); no lead-in words -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" case_sensitive="yes">&state_abbrev_us;</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #2: single-token names of states, counties, provinces (entity english_subnational_toponyms) -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<!-- optional lead-in, e.g. "no condado de" -->
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token regexp="yes" case_sensitive="yes">&english_subnational_toponyms;</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #3: multi-token names that start with "Novo"/"Nova", e.g. Nova Hampshire, Novo México -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token regexp="yes" case_sensitive="yes">Nov[oa]</token>
<token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_new;</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #4: multi-token names that end in "do Norte"/"do Sul", e.g. Carolina do Norte -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_north_south;</token>
<token>do</token>
<token regexp="yes" case_sensitive="yes">Norte|Sul</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #5: multi-token names that end in another geographical adjective, e.g. Virgínia Ocidental, Austrália Meridional -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_with_adj;</token>
<token regexp="yes" case_sensitive="yes">Setentrional|Meridional|Ocidental|Oriental</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #6: multi-token name spelled out literally: Rhode Island -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token case_sensitive="yes">Rhode</token>
<token case_sensitive="yes">Island</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #7: multi-token name spelled out literally: Terra Nova e Labrador (Newfoundland and Labrador) -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token case_sensitive="yes">Terra</token>
<token case_sensitive="yes">Nova</token>
<token case_sensitive="yes">e</token>
<token case_sensitive="yes">Labrador</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #8: multi-token name spelled out literally: Colúmbia Britânica (British Columbia) -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token case_sensitive="yes">Colúmbia</token>
<token case_sensitive="yes">Britânica</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>

<rule> <!-- #9: multi-token name spelled out literally: Ilha do Príncipe Eduardo (Prince Edward Island) -->
<pattern>
<marker>
<token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
</marker>
<token spacebefore="no" postag="_PUNCT_COMMA"/>
<token regexp="yes" min="0">em|n[oa]s?</token>
<token regexp="yes" min="0">estado|província|condado|ducado</token>
<token regexp="yes" min="0">d([oa]s?|e)</token>
<token case_sensitive="yes">Ilha</token>
<token case_sensitive="yes">do</token>
<token case_sensitive="yes">Príncipe</token>
<token case_sensitive="yes">Eduardo</token>
</pattern>
<filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
<disambig action="add"><wd pos="_english_ignore_"/></disambig>
</rule>
</rulegroup>

<rulegroup id="IGNORE_ENGLISH_WORDS" name="Label English words">
<rule> <!-- #1 -->
<pattern>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
<!ENTITY english_forward "as?|no|[cs]ome|for|(?-i)I|[td]o">

<!-- Portuguese-specific -->
<!ENTITY english_suffixes "((ing|hood|ship|ction)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|less|ful)">
<!ENTITY english_suffixes "((ing|hood|ship|ction|land)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|shire|less|ful)">
<!ENTITY english_suffixed_word "\p{L}+&english_suffixes;">

<!ENTITY english_prefixes "(over|under|out|after|with|through)">
Expand All @@ -30,6 +30,58 @@
<!ENTITY english_contracted_will "I|you|s?he|we|they|who|what">
<!ENTITY english_contracted_would "I|you|s?he|we|they|who|that|what">

<!-- Uses Portuguese-language names if they exist, since otherwise we are dealing with a misspelling. -->
<!-- <!ENTITY state_abbrev_us "AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY"> -->
<!-- Single-token US state names, in their Portuguese spelling where one exists. -->
<!-- Multi-token names (e.g. Nova Iorque, Carolina do Norte) are deliberately not listed here. -->
<!ENTITY us_states
"Alabama|Alasca|Arizona|Arkansas|Califórnia|Colorado|Connecticut|Delaware|Flórida|
Geórgia|Havaí|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Luisiana|Maine|Maryland|
Massachusetts|Michigan|Minnesota|Missouri|Mississippi|Montana|Nebraska|Nevada|
Ohio|Oklahoma|Oregon|Pensilvânia|Tennessee|Texas|Utah|Vermont|Virgínia|Washington|
Wisconsin|Wyoming
">

<!-- multi-token: Nova Hampshire, Nova Jersey, Novo México, Nova Iorque, Rhode Island,
Carolinas, Dakotas, Virgínia Ocidental -->

<!-- Canadian provinces and territories with single-token names -->
<!ENTITY canadian_provinces "Yukon|Nunavut|Alberta|Saskatchewan|Ontário|Quebec|Manitoba">

<!-- multi-token: Nova Escócia, Novo Brunswick, Colúmbia Britânica, Ilha do Príncipe Eduardo, Terra Nova e Labrador -->

<!-- Australian states with single-token names -->
<!ENTITY australian_states "Vitória|Tasmânia|Queensland">

<!-- multi-token: Nova Gales do Sul, Austrália Meridional/Ocidental, Território do Norte -->

<!-- UK counties; only 'Cornualha' has a PT name; either way, all of these must be accepted by the speller! -->
<!-- This list contains Scottish council areas and other toponyms that are not really technically counties and whatnot; -->
<!-- for the purposes of what this entity is for as of 21 Jun 2024, that's not an issue... If someone needs to -->
<!-- make this more *accurately* reflect current counties, council areas, ceremonial counties, and whatnot, be my guest. -->
<!-- Also the Welsh ones are in English, not Welsh. And a bunch of them might be historical only. -->
<!-- Single-token UK county and council-area names, joined as regex alternatives. -->
<!-- Every name needs a "|" separator, including the last one on each line: without it, two names -->
<!-- fuse into a single alternative that matches neither of them. -->
<!ENTITY uk_counties
"Avon|Bedfordshire|Berkshire|Buckinghamshire|Cambridgeshire|Cheshire|Cornualha|Cumberland|Cumbria|
Derbyshire|Devon|Dorset|Durham|Suffolk|Sussex|Essex|Gloucestershire|Hampshire|Herefordshire|
Hertfordshire|Huntingdonshire|Kent|Lancashire|Leicestershire|Lincolnshire|Merseyside|Middlesex|
Norfolk|Northamptonshire|Northumberland|Nottinghamshire|Oxfordshire|Rutland|Shropshire|Somerset|
Staffordshire|Surrey|Yorkshire|Warwickshire|Westmorland|Wiltshire|Worcestershire|
Inverclyde|Dunbartonshire|Glasgow|Renfrewshire|Lanarkshire|Falkirk|Edimburgo|Midlothian|Lothian|
Clackmannanshire|Fife|Dundee|Angus|Monmouthshire|Glamorgan|Carmarthenshire|Pembrokeshire|
Cardiganshire|Brecknockshire|Radnorshire|Montgomeryshire|Denbighshire|Flintshire|Merionethshire|
Caernarvonshire|Anglesey|Borders|Strathclyde|Dumfries|Galloway|Grampian|Highland|Shetland|
Caithness|Ross|Cromarty|Aberdeenshire|Banffshire|Kincardineshire|Forfarshire|Perthshire|
Argyll|Ayrshire|Stirlingshire|Kinross|Peeblesshire|Selkirkshire|Berwickshire|
Roxburghshire|Kirkcudbrightshire|Wigtownshire|Orkney
">

<!-- I'm not including here those that actually begin with "North/South/East/West" even in their customary Portuguese names, -->
<!-- since those words are already english_ignore triggers. Here we have only the ones that are single-token or whose -->
<!-- names in PT are entirely translated or contain Portuguese terms (e.g. "do Sul", "Meridional", etc.). -->

<!-- Single-token names: the union of us_states, canadian_provinces, australian_states and uk_counties. -->
<!ENTITY english_subnational_toponyms "(&us_states;|&canadian_provinces;|&australian_states;|&uk_counties;)">
<!-- Second word of names that start with "Novo"/"Nova" (e.g. Nova Iorque, Novo México). -->
<!ENTITY english_subnational_toponyms_new "(Hampshire|Jersey|Jérsia|México|Iorque|Escócia|Brunswick|Gales)">
<!-- First word of names that end in "do Norte"/"do Sul" (e.g. Carolina do Norte). -->
<!ENTITY english_subnational_toponyms_north_south "(Carolina|Dakota|Território)">
<!-- First word of names that end in a geographical adjective (e.g. Virgínia Ocidental, Austrália Meridional). -->
<!ENTITY english_subnational_toponyms_with_adj "(Virgínia|Austrália)">


<!-- Largely taken from French, but some words have been removed and others added -->
<!ENTITY english_word_list "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,90 @@ Emmy
Jeff
Larry
Penguin

Bedfordshire
Buckinghamshire
Cambridgeshire
Cheshire
Cumberland
Cumbria
Derbyshire
Devon
Dorset
Durham
Suffolk
Sussex
Essex
Gloucestershire
Hampshire
Herefordshire
Hertfordshire
Huntingdonshire
Kent
Lancashire
Leicestershire
Lincolnshire
Merseyside
Middlesex
Northamptonshire
Northumberland
Nottinghamshire
Oxfordshire
Rutland
Shropshire
Somerset
Suffolk
Yorkshire
Warwickshire
Westmorland
Wiltshire
Worcestershire
Inverclyde
Dunbartonshire
Glasgow
Renfrewshire
Lanarkshire
Falkirk
Midlothian
Lothian
Clackmannanshire
Dundee
Monmouthshire
Glamorgan
Carmarthenshire
Pembrokeshire
Cardiganshire
Brecknockshire
Radnorshire
Montgomeryshire
Denbighshire
Flintshire
Merionethshire
Caernarvonshire
Anglesey
Borders
Strathclyde
Dumfries
Galloway
Grampian
Highland
Shetland
Caithness
Cromarty
Aberdeenshire
Banffshire
Kincardineshire
Forfarshire
Perthshire
Ayrshire
Stirlingshire
Clackmannanshire
Kinross
Peeblesshire
Selkirkshire
Berwickshire
Roxburghshire
Kirkcudbrightshire
Wigtownshire
Orkney

Original file line number Diff line number Diff line change
Expand Up @@ -609,11 +609,19 @@ public void testIgnoreEnglishWordsInPortuguese() throws IOException {
"Narra, segundo o historiador americano Will Durant, uma das maiores aventuras da história humana.",
"Duas décadas mais tarde, os Gipsy Kings incorporaram aquilo.",
"Valente teve três irmãos, um dos quais, Silvio Francesco, também esteve no show business.",
"O lema do estado de Nova Hampshire é Livre Free or Die"
"O lema do estado de Nova Hampshire é Livre Free or Die",
// English-language toponyms that may not be in the PT speller, but we can verify that they're valid
// based on proximity with state/province/county names.
// Note that this does not fact-check whether the toponym actually belongs in the state/province/whatever :p
"Aconteceu na cidade de Victor Harbor, Austrália Meridional.", // just comma, and PT name of state
"Aconteceu no distrito de Tamworth, no condado de Staffordshire.", // "no condado de"
"A pequena cidade de Bethany Beach, em Delaware.", // "em"
"O vilarejo de Goose Bay, na província de Terra Nova e Labrador.", // "na província de"
"Morava, na época, em Keene, estado de Nova Hampshire.", // "estado de"
};
for (String sentence : noErrorSentences) {
List<RuleMatch> matches = lt.check(sentence);
assert matches.isEmpty();
assert matches.isEmpty() : "Unexpected match in: " + sentence;
}
HashMap<String, String> errorSentences = new HashMap<>();
errorSentences.put("Foi uma melhora substantial.", "substancial"); // single word
Expand Down

0 comments on commit 190624a

Please sign in to comment.