From 9bdfebc1eb95c95c615adb66e60287d1a7059216 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 21 Jun 2024 12:41:43 +0200 Subject: [PATCH] [pt] Tag likely English toponyms with ignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - if unknown (i.e. untagged) words precede a US, Canadian, Australian, or British state, province, county or council area, we run the English check on them; - if they are *valid* English, we tag them with _english_ignore_; - this should help us stop correcting English-language toponyms; - in the future, I suppose we could have a more global solution for all sorts of toponyms... why not have a list of German/Austrian Bundesländer, French départements, Italian... pizza slices, etc. --- .../resource/pt/disambiguation.xml | 146 ++++++++++++++++++ .../resource/pt/entities/english.ent | 54 ++++++- 2 files changed, 199 insertions(+), 1 deletion(-) diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml index fae27927f070..bbf7d9c245c0 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml @@ -3769,6 +3769,152 @@ + + + + + \p{Lu}.+ + + + &state_abbrev_us; + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + &english_subnational_toponyms; + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + Nov[oa] + &english_subnational_toponyms_new; + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + &english_subnational_toponyms_north_south; + do + Norte|Sul + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? 
+ estado|província|condado|ducado + d([oa]s?|e) + &english_subnational_toponyms_with_adj; + Setentrional|Meridional|Ocidental|Oriental + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + Rhode + Island + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + Terra + Nova + e + Labrador + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + Colúmbia + Britânica + + + + + + + + + \p{Lu}.+ + + + em|n[oa]s? + estado|província|condado|ducado + d([oa]s?|e) + Ilha + do + Príncipe + Eduardo + + + + + + diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent index 20afc2efae5f..e213c15d4d4a 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/english.ent @@ -16,7 +16,7 @@ - + @@ -30,6 +30,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +