[pt] Ignore probable English-language toponyms (#10674)

* [pt] Add UK counties & councils to spelling * [pt] Tag likely English toponyms with ignore - if unknown (i.e. untagged) words precede a US, Canadian, Australian, or British state, province, county or council area, we run the English check on them; - if they are *valid* English, we tag them with _english_ignore_; - this should help us stop correcting English-language toponyms; - in the future, I suppose we could have a more global solution for all sorts of toponyms... why not have a list of German/Austrian Bundesländer, French départements, Italian... pizza slices, etc. * [pt] Add English-language toponym speller tests
languagetool-org · Jun 26, 2024 · 190624a · 190624a
1 parent 3f35ff4
commit 190624a
Show file tree

Hide file tree

Showing 4 changed files with 296 additions and 3 deletions.
diff --git a/...ol-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml b/...ol-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml
@@ -3769,6 +3769,152 @@
       </rule>
   </rulegroup>
 
+  <rulegroup id="IGNORE_PROBABLE_ENGLISH_TOPONYMS" name="Ignore words with a high likelihood of being English-language toponyms">
+      <!-- All rules in this group share one shape: the marker is a token tagged UNKNOWN that starts with an
+           uppercase letter and has at least two characters; it is followed by a comma token with no space
+           before it, then (rules #2 to #9) up to three optional Portuguese linking tokens such as
+           "no estado de", then the name of a state, province, county or council area.
+           Each rule adds a _english_ignore_ reading to the marker, gated by IsEnglishWordFilter
+           (per the commit notes, only words that are valid English get the tag). -->
+      <rule> <!-- #1: marker, comma, then a US state abbreviation taken from state_abbrev_us (e.g. "TX") -->
+          <!-- NOTE(review): the only state_abbrev_us declaration shown in english.ent is commented out; confirm the entity is declared elsewhere. -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" case_sensitive="yes">&state_abbrev_us;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #2: single-token names of states, counties, provinces (english_subnational_toponyms), e.g. ", no condado de Staffordshire" -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #3: two-token names that start with "Nova"/"Novo" (english_subnational_toponyms_new), e.g. "Nova Hampshire" -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">Nov[oa]</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_new;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #4: names followed by "do Norte" or "do Sul" (english_subnational_toponyms_north_south), e.g. "Carolina do Norte" -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_north_south;</token>
+              <token>do</token>
+              <token regexp="yes" case_sensitive="yes">Norte|Sul</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #5: names followed by another geographical adjective (english_subnational_toponyms_with_adj), e.g. "Austrália Meridional" -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_with_adj;</token>
+              <token regexp="yes" case_sensitive="yes">Setentrional|Meridional|Ocidental|Oriental</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #6: the literal two-token name "Rhode Island" (case-sensitive) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Rhode</token>
+              <token case_sensitive="yes">Island</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #7: the literal four-token name "Terra Nova e Labrador" (Newfoundland and Labrador) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Terra</token>
+              <token case_sensitive="yes">Nova</token>
+              <token case_sensitive="yes">e</token>
+              <token case_sensitive="yes">Labrador</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #8: the literal two-token name "Colúmbia Britânica" (British Columbia) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Colúmbia</token>
+              <token case_sensitive="yes">Britânica</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #9: the literal four-token name "Ilha do Príncipe Eduardo" (Prince Edward Island) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Ilha</token>
+              <token case_sensitive="yes">do</token>
+              <token case_sensitive="yes">Príncipe</token>
+              <token case_sensitive="yes">Eduardo</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+  </rulegroup>
+
   <rulegroup id="IGNORE_ENGLISH_WORDS" name="Label English words">
       <rule> <!-- #1 -->
           <pattern>

diff --git a/...-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent b/...-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent
@@ -16,7 +16,7 @@
 <!ENTITY english_forward "as?|no|[cs]ome|for|(?-i)I|[td]o">
 
 <!-- Portuguese-specific -->
-<!ENTITY english_suffixes "((ing|hood|ship|ction)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|less|ful)">
+<!ENTITY english_suffixes "((ing|hood|ship|ction|land)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|shire|less|ful)">
 <!ENTITY english_suffixed_word "\p{L}+&english_suffixes;">
 
 <!ENTITY english_prefixes "(over|under|out|after|with|through)">
@@ -30,6 +30,58 @@
 <!ENTITY english_contracted_will "I|you|s?he|we|they|who|what">
 <!ENTITY english_contracted_would "I|you|s?he|we|they|who|that|what">
 
+<!-- Uses Portuguese-language names if they exist, since otherwise we are dealing with a misspelling. -->
+<!-- <!ENTITY state_abbrev_us "AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY"> -->
+<!-- Single-token US state names, in their Portuguese spelling where one exists; included in
+     english_subnational_toponyms. Each alternative must be spelled exactly as it appears in running text,
+     otherwise that state silently never matches.
+     NOTE(review): the line breaks and indentation are part of the entity's replacement text; confirm the
+     rule loader tolerates that whitespace inside the alternation. -->
+<!ENTITY us_states
+  "Alabama|Alasca|Arizona|Arkansas|Califórnia|Colorado|Connecticut|Delaware|Flórida|
+   Geórgia|Havaí|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Luisiana|Maine|Maryland|
+   Massachusetts|Michigan|Minnesota|Missouri|Mississippi|Montana|Nebraska|Nevada|
+   Ohio|Oklahoma|Oregon|Pensilvânia|Tennessee|Texas|Utah|Vermont|Virgínia|Washington|
+   Wisconsin|Wyoming
+  ">
+
+<!-- multi-token: Nova Hampshire, Nova Jersey, Novo México, Nova Iorque, Rhode Island,
+     Carolinas, Dakotas, Virgínia Ocidental -->
+
+<!-- Canadian provinces/territories whose names are a single token; included in english_subnational_toponyms -->
+<!ENTITY canadian_provinces "Yukon|Nunavut|Alberta|Saskatchewan|Ontário|Quebec|Manitoba">
+
+<!-- multi-token: Nova Escócia, Novo Brunswick, Colúmbia Britânica, Ilha do Príncipe Eduardo, Terra Nova e Labrador -->
+
+<!-- Australian states whose names are a single token; included in english_subnational_toponyms -->
+<!ENTITY australian_states "Vitória|Tasmânia|Queensland">
+
+<!-- multi-token: Nova Gales do Sul, Austrália Meridional/Ocidental, Território do Norte -->
+
+<!-- UK counties; only 'Cornualha' has a PT name; either way, all of these must be accepted by the speller! -->
+<!-- This list contains Scottish council areas and other toponyms that are not really technically counties and whatnot; -->
+<!-- for the purposes of what this entity is for as of 21 Jun 2024, that's not an issue... If someone needs to -->
+<!-- make this more *accurately* reflect current counties, council areas, ceremonial counties, and whatnot, be my guest. -->
+<!-- Also the Welsh ones are in English, not Welsh. And a bunch of them might be historical only. -->
+<!-- Alternation of UK county / council-area names; included in english_subnational_toponyms.
+     Every line of names except the last must end with '|', otherwise the last name on that line and the
+     first name on the next line fuse into a single alternative that matches neither.
+     NOTE(review): the line breaks and indentation are part of the entity's replacement text; confirm the
+     rule loader tolerates that whitespace inside the alternation. -->
+<!ENTITY uk_counties
+  "Avon|Bedfordshire|Berkshire|Buckinghamshire|Cambridgeshire|Cheshire|Cornualha|Cumberland|Cumbria|
+   Derbyshire|Devon|Dorset|Durham|Suffolk|Sussex|Essex|Gloucestershire|Hampshire|Herefordshire|
+   Hertfordshire|Huntingdonshire|Kent|Lancashire|Leicestershire|Lincolnshire|Merseyside|Middlesex|
+   Norfolk|Northamptonshire|Northumberland|Nottinghamshire|Oxfordshire|Rutland|Shropshire|Somerset|
+   Staffordshire|Surrey|Yorkshire|Warwickshire|Westmorland|Wiltshire|Worcestershire|
+   Inverclyde|Dunbartonshire|Glasgow|Renfrewshire|Lanarkshire|Falkirk|Edimburgo|Midlothian|Lothian|
+   Clackmannanshire|Fife|Dundee|Angus|Monmouthshire|Glamorgan|Carmarthenshire|Pembrokeshire|
+   Cardiganshire|Brecknockshire|Radnorshire|Montgomeryshire|Denbighshire|Flintshire|Merionethshire|
+   Caernarvonshire|Anglesey|Borders|Strathclyde|Dumfries|Galloway|Grampian|Highland|Shetland|
+   Caithness|Ross|Cromarty|Aberdeenshire|Banffshire|Kincardineshire|Forfarshire|Perthshire|
+   Argyll|Ayrshire|Stirlingshire|Kinross|Peeblesshire|Selkirkshire|Berwickshire|
+   Roxburghshire|Kirkcudbrightshire|Wigtownshire|Orkney
+">
+
+<!-- I'm not including here those that actually begin with "North/South/East/West" even in their customary Portuguese names, -->
+<!-- since those words are already english_ignore triggers. Here we have only the ones that are single-token or whose -->
+<!-- names in PT are entirely translated or contain Portuguese terms (e.g. "do Sul", "Meridional", etc.). -->
+
+<!-- Union of the single-token lists (us_states, canadian_provinces, australian_states, uk_counties); used by toponym rule #2. -->
+<!ENTITY english_subnational_toponyms "(&us_states;|&canadian_provinces;|&australian_states;|&uk_counties;)">
+<!-- Second token of names that start with "Nova"/"Novo" (e.g. "Nova Iorque"); used by toponym rule #3. -->
+<!ENTITY english_subnational_toponyms_new "(Hampshire|Jersey|Jérsia|México|Iorque|Escócia|Brunswick|Gales)">
+<!-- First token of names followed by "do Norte"/"do Sul" (e.g. "Carolina do Norte"); used by toponym rule #4. -->
+<!ENTITY english_subnational_toponyms_north_south "(Carolina|Dakota|Território)">
+<!-- First token of names followed by Setentrional/Meridional/Ocidental/Oriental (e.g. "Virgínia Ocidental"); used by toponym rule #5. -->
+<!ENTITY english_subnational_toponyms_with_adj "(Virgínia|Austrália)">
+
 
 <!-- Largely taken from French, but some words have been removed and others added -->
 <!ENTITY english_word_list "

diff --git a/...uagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/spelling.txt b/...uagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/spelling.txt
@@ -346,3 +346,90 @@ Emmy
 Jeff
 Larry
 Penguin
+
+Bedfordshire
+Buckinghamshire
+Cambridgeshire
+Cheshire
+Cumberland
+Cumbria
+Derbyshire
+Devon
+Dorset
+Durham
+Suffolk
+Sussex
+Essex
+Gloucestershire
+Hampshire
+Herefordshire
+Hertfordshire
+Huntingdonshire
+Kent
+Lancashire
+Leicestershire
+Lincolnshire
+Merseyside
+Middlesex
+Northamptonshire
+Northumberland
+Nottinghamshire
+Oxfordshire
+Rutland
+Shropshire
+Somerset
+Yorkshire
+Warwickshire
+Westmorland
+Wiltshire
+Worcestershire
+Inverclyde
+Dunbartonshire
+Glasgow
+Renfrewshire
+Lanarkshire
+Falkirk
+Midlothian
+Lothian
+Clackmannanshire
+Dundee
+Monmouthshire
+Glamorgan
+Carmarthenshire
+Pembrokeshire
+Cardiganshire
+Brecknockshire
+Radnorshire
+Montgomeryshire
+Denbighshire
+Flintshire
+Merionethshire
+Caernarvonshire
+Anglesey
+Borders
+Strathclyde
+Dumfries
+Galloway
+Grampian
+Highland
+Shetland
+Caithness
+Cromarty
+Aberdeenshire
+Banffshire
+Kincardineshire
+Forfarshire
+Perthshire
+Ayrshire
+Stirlingshire
+Kinross
+Peeblesshire
+Selkirkshire
+Berwickshire
+Roxburghshire
+Kirkcudbrightshire
+Wigtownshire
+Orkney
+
diff --git a/languagetool-standalone/src/test/java/org/languagetool/JLanguageToolTest.java b/languagetool-standalone/src/test/java/org/languagetool/JLanguageToolTest.java
@@ -609,11 +609,19 @@ public void testIgnoreEnglishWordsInPortuguese() throws IOException {
       "Narra, segundo o historiador americano Will Durant, uma das maiores aventuras da história humana.",
       "Duas décadas mais tarde, os Gipsy Kings incorporaram aquilo.",
       "Valente teve três irmãos, um dos quais, Silvio Francesco, também esteve no show business.",
-      "O lema do estado de Nova Hampshire é Livre Free or Die"
+      "O lema do estado de Nova Hampshire é Livre Free or Die",
+      // English-language toponyms that may not be in the PT speller, but we can verify that they're valid
+      // based on proximity with state/province/county names.
+      // Note that this does not fact-check whether the toponym actually belongs in the state/province/whatever :p
+      "Aconteceu na cidade de Victor Harbor, Austrália Meridional.",  // just comma, and PT name of state
+      "Aconteceu no distrito de Tamworth, no condado de Staffordshire.",  // "no condado de"
+      "A pequena cidade de Bethany Beach, em Delaware.",  // "em"
+      "O vilarejo de Goose Bay, na província de Terra Nova e Labrador.",  // "na província de"
+      "Morava, na época, em Keene, estado de Nova Hampshire.",  // "estado de"
     };
     for (String sentence : noErrorSentences) {
       List<RuleMatch> matches = lt.check(sentence);
-      assert matches.isEmpty();
+      assert matches.isEmpty() : "Unexpected match in: " + sentence;
     }
     HashMap<String, String> errorSentences = new HashMap<>();
     errorSentences.put("Foi uma melhora substantial.", "substancial");  // single word