[pt] Tag likely English toponyms with ignore

- if unknown (i.e. untagged) words precede a US, Canadian, Australian, or British state, province, county or council area, we run the English check on them; - if they are *valid* English, we tag them with _english_ignore_; - this should help us stop correcting English-language toponyms; - in the future, I suppose we could have a more global solution for all sorts of toponyms... why not have a list of German/Austrian Bundesländer, French départements, Italian... pizza slices, etc.
languagetool-org · Jun 25, 2024 · 9bdfebc · 9bdfebc
1 parent 44fb0b8
commit 9bdfebc
Show file tree

Hide file tree

Showing 2 changed files with 199 additions and 1 deletion.
diff --git a/...ol-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml b/...ol-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml
@@ -3769,6 +3769,152 @@
       </rule>
   </rulegroup>
 
+  <rulegroup id="IGNORE_PROBABLE_ENGLISH_TOPONYMS" name="Ignore words with a high likelihood of being English-language toponyms">
+      <!-- Every rule in this group marks a capitalised token tagged UNKNOWN that is directly followed by a
+           comma and then by a US, Canadian, Australian or British subnational toponym. The marked token is
+           passed to IsEnglishWordFilter (formPositions:1) and the disambiguation action adds the
+           _english_ignore_ reading to it. Rules #2 to #9 also accept an optional Portuguese lead-in such as
+           "no estado de" between the comma and the toponym. -->
+      <rule> <!-- #1: comma followed by a US state abbreviation -->
+          <!-- NOTE(review): the only state_abbrev_us declaration visible in entities/english.ent is commented
+               out; confirm the entity is declared somewhere, otherwise the reference below cannot resolve. -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" case_sensitive="yes">&state_abbrev_us;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #2: single-token names of states, counties and provinces -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #3: two-token names that start with "Novo"/"Nova" (e.g. Nova Iorque) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">Nov[oa]</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_new;</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #4: names that end in "do Norte" or "do Sul" (e.g. Carolina do Norte) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_north_south;</token>
+              <token>do</token>
+              <token regexp="yes" case_sensitive="yes">Norte|Sul</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #5: names followed by a geographical adjective (e.g. Virgínia Ocidental) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token regexp="yes" case_sensitive="yes">&english_subnational_toponyms_with_adj;</token>
+              <token regexp="yes" case_sensitive="yes">Setentrional|Meridional|Ocidental|Oriental</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #6: the two-token name "Rhode Island" -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Rhode</token>
+              <token case_sensitive="yes">Island</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #7: the four-token name "Terra Nova e Labrador" (Newfoundland and Labrador) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Terra</token>
+              <token case_sensitive="yes">Nova</token>
+              <token case_sensitive="yes">e</token>
+              <token case_sensitive="yes">Labrador</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #8: the two-token name "Colúmbia Britânica" (British Columbia) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Colúmbia</token>
+              <token case_sensitive="yes">Britânica</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+
+      <rule> <!-- #9: the four-token name "Ilha do Príncipe Eduardo" (Prince Edward Island) -->
+          <pattern>
+              <marker>
+                  <token regexp="yes" postag="UNKNOWN" case_sensitive="yes">\p{Lu}.+</token>
+              </marker>
+              <token spacebefore="no" postag="_PUNCT_COMMA"/>
+              <token regexp="yes" min="0">em|n[oa]s?</token>
+              <token regexp="yes" min="0">estado|província|condado|ducado</token>
+              <token regexp="yes" min="0">d([oa]s?|e)</token>
+              <token case_sensitive="yes">Ilha</token>
+              <token case_sensitive="yes">do</token>
+              <token case_sensitive="yes">Príncipe</token>
+              <token case_sensitive="yes">Eduardo</token>
+          </pattern>
+          <filter class="org.languagetool.rules.IsEnglishWordFilter" args="formPositions:1"/>
+          <disambig action="add"><wd pos="_english_ignore_"/></disambig>
+      </rule>
+  </rulegroup>
+
   <rulegroup id="IGNORE_ENGLISH_WORDS" name="Label English words">
       <rule> <!-- #1 -->
           <pattern>

diff --git a/...-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent b/...-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent
@@ -16,7 +16,7 @@
 <!ENTITY english_forward "as?|no|[cs]ome|for|(?-i)I|[td]o">
 
 <!-- Portuguese-specific -->
-<!ENTITY english_suffixes "((ing|hood|ship|ction)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|less|ful)">
+<!ENTITY english_suffixes "((ing|hood|ship|ction|land)s?|(ness)(es)?|ed|ly|ity|ary|ish|logy|nomy|cracy|shire|less|ful)">
 <!ENTITY english_suffixed_word "\p{L}+&english_suffixes;">
 
 <!ENTITY english_prefixes "(over|under|out|after|with|through)">
@@ -30,6 +30,58 @@
 <!ENTITY english_contracted_will "I|you|s?he|we|they|who|what">
 <!ENTITY english_contracted_would "I|you|s?he|we|they|who|that|what">
 
+<!-- Uses Portuguese-language names if they exist, since otherwise we are dealing with a misspelling. -->
+<!-- <!ENTITY state_abbrev_us "AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY"> -->
+<!-- US states whose customary Portuguese name is a single token. The value is used as a regex alternation, -->
+<!-- so every alternative must be spelled exactly as the speller accepts it. -->
+<!ENTITY us_states
+  "Alabama|Alasca|Arizona|Arkansas|Califórnia|Colorado|Connecticut|Delaware|Flórida|
+   Geórgia|Havaí|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Luisiana|Maine|Maryland|
+   Massachusetts|Michigan|Minnesota|Missouri|Mississippi|Montana|Nebraska|Nevada|
+   Ohio|Oklahoma|Oregon|Pensilvânia|Tennessee|Texas|Utah|Vermont|Virgínia|Washington|
+   Wisconsin|Wyoming
+  ">
+
+<!-- multi-token: Nova Hampshire, Nova Jersey, Novo México, Nova Iorque, Rhode Island,
+     Carolinas, Dakotas, Virgínia Ocidental -->
+
+<!-- Canadian provinces and territories whose customary Portuguese name is a single token -->
+<!ENTITY canadian_provinces "Yukon|Nunavut|Alberta|Saskatchewan|Ontário|Quebec|Manitoba">
+
+<!-- multi-token: Nova Escócia, Novo Brunswick, Colúmbia Britânica, Ilha do Príncipe Eduardo, Terra Nova e Labrador -->
+
+<!-- Australian states whose customary Portuguese name is a single token -->
+<!ENTITY australian_states "Vitória|Tasmânia|Queensland">
+
+<!-- multi-token: Nova Gales do Sul, Austrália Meridional/Ocidental, Território do Norte -->
+
+<!-- UK counties; most keep their English names ('Cornualha' and 'Edimburgo' are the Portuguese forms); -->
+<!-- either way, all of these must be accepted by the speller! -->
+<!-- This list contains Scottish council areas and other toponyms that are not really technically counties and whatnot; -->
+<!-- for the purposes of what this entity is for as of 21 Jun 2024, that's not an issue... If someone needs to -->
+<!-- make this more *accurately* reflect current counties, council areas, ceremonial counties, and whatnot, be my guest. -->
+<!-- Also the Welsh ones are in English, not Welsh. And a bunch of them might be historical only. -->
+<!-- The value is a regex alternation: every line except the last must end with '|', otherwise the last name -->
+<!-- on one line and the first name on the next merge into a single alternative that never matches. -->
+<!ENTITY uk_counties
+  "Avon|Bedfordshire|Berkshire|Buckinghamshire|Cambridgeshire|Cheshire|Cornualha|Cumberland|Cumbria|
+   Derbyshire|Devon|Dorset|Durham|Suffolk|Sussex|Essex|Gloucestershire|Hampshire|Herefordshire|
+   Hertfordshire|Huntingdonshire|Kent|Lancashire|Leicestershire|Lincolnshire|Merseyside|Middlesex|
+   Norfolk|Northamptonshire|Northumberland|Nottinghamshire|Oxfordshire|Rutland|Shropshire|Somerset|
+   Staffordshire|Surrey|Yorkshire|Warwickshire|Westmorland|Wiltshire|Worcestershire|
+   Inverclyde|Dunbartonshire|Glasgow|Renfrewshire|Lanarkshire|Falkirk|Edimburgo|Midlothian|Lothian|
+   Clackmannanshire|Fife|Dundee|Angus|Monmouthshire|Glamorgan|Carmarthenshire|Pembrokeshire|
+   Cardiganshire|Brecknockshire|Radnorshire|Montgomeryshire|Denbighshire|Flintshire|Merionethshire|
+   Caernarvonshire|Anglesey|Borders|Strathclyde|Dumfries|Galloway|Grampian|Highland|Shetland|
+   Caithness|Ross|Cromarty|Aberdeenshire|Banffshire|Kincardineshire|Forfarshire|Perthshire|
+   Argyll|Ayrshire|Stirlingshire|Kinross|Peeblesshire|Selkirkshire|Berwickshire|
+   Roxburghshire|Kirkcudbrightshire|Wigtownshire|Orkney
+">
+
+<!-- I'm not including here those that actually begin with "North/South/East/West" even in their customary Portuguese names, -->
+<!-- since those words are already english_ignore triggers. Here we have only the ones that are single-token or whose -->
+<!-- names in PT are entirely translated or contain Portuguese terms (e.g. "do Sul", "Meridional", etc.). -->
+
+<!-- Single-token names: union of the US, Canadian, Australian and UK lists; matched as the toponym token of rule #2. -->
+<!ENTITY english_subnational_toponyms "(&us_states;|&canadian_provinces;|&australian_states;|&uk_counties;)">
+<!-- Second token of names that start with "Novo"/"Nova" (rule #3 matches the "Nov[oa]" token itself). -->
+<!ENTITY english_subnational_toponyms_new "(Hampshire|Jersey|Jérsia|México|Iorque|Escócia|Brunswick|Gales)">
+<!-- First token of names that rule #4 requires to be followed by "do Norte" or "do Sul". -->
+<!ENTITY english_subnational_toponyms_north_south "(Carolina|Dakota|Território)">
+<!-- First token of names that rule #5 requires to be followed by Setentrional/Meridional/Ocidental/Oriental. -->
+<!ENTITY english_subnational_toponyms_with_adj "(Virgínia|Austrália)">
+
 
 <!-- Largely taken from French, but some words have been removed and others added -->
 <!ENTITY english_word_list "