From bff243c70ca533b2bc95275a264028737d9217e4 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Thu, 20 Jun 2024 12:26:15 +0200 Subject: [PATCH] [pt] Add English contraction rule - the English ignore logic balks at stuff like "whats", which the English tagger fails to tag, and therefore doesn't get recognised as English; - in order to *kind of* address it, or at least help users and maybe convince them to stop adding "whats" to their personal dictionaries, we're adding this rule here, with relevant suggestions; - global English spellchecking still requires a lot of thinking, but this rule here should be safe. --- .../org/languagetool/language/Portuguese.java | 4 +- .../pt/EnglishContractionSpellingRule.java | 91 ++++++++ .../rules/pt/english_contractions.txt | 198 ++++++++++++++++++ 3 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/EnglishContractionSpellingRule.java create mode 100644 languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/english_contractions.txt diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/language/Portuguese.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/language/Portuguese.java index 19dcdd48b72a..91c364d44646 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/language/Portuguese.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/language/Portuguese.java @@ -153,7 +153,8 @@ public List getRelevantRules(ResourceBundle messages, UserConfig userConfi new PortugueseUnitConversionRule(messages), new PortugueseReadabilityRule(messages, this, userConfig, true), new PortugueseReadabilityRule(messages, this, userConfig, false), - new DoublePunctuationRule(messages) + new DoublePunctuationRule(messages), + new EnglishContractionSpellingRule(messages, this) ); } @@ -255,6 +256,7 @@ public boolean isAdvancedTypographyEnabled() { 
id2prio.put("AUX_VERBO", -45); id2prio.put("ENSINO_A_DISTANCIA", -45); id2prio.put("OQ_O_QUE_ORTHOGRAPHY", -45); + id2prio.put("PT_ENGLISH_CONTRACTION_ORTHOGRAPHY", -45); id2prio.put("EMAIL_SEM_HIFEN", -45); // HIGHER THAN SPELLER // MORFOLOGIK SPELLER FITS HERE AT -50 --------------------- // SPELLER (-50) id2prio.put("PRETERITO_PERFEITO", -51); // LOWER THAN SPELLER diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/EnglishContractionSpellingRule.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/EnglishContractionSpellingRule.java new file mode 100644 index 000000000000..0f9d9d5089f0 --- /dev/null +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/rules/pt/EnglishContractionSpellingRule.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.rules.pt;
+
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import org.languagetool.Language;
+import org.languagetool.rules.*;
+
+/**
+ * Flags English contractions typed without their apostrophe (e.g. {@code Hasnt})
+ * in Portuguese text and suggests the apostrophised form (e.g. {@code Hasn't}).
+ * Copied from the English rule to help users with code-switching and improve
+ * tagging tokens with _english_ignore_; pairs come from english_contractions.txt.
+ *
+ * @author Marcin Miłkowski
+ * @since 2.5 (NOTE(review): tag inherited from the English original; confirm)
+ */
+public class EnglishContractionSpellingRule extends AbstractSimpleReplaceRule {
+
+  public static final String ENGLISH_CONTRACTION_SPELLING_RULE = "PT_ENGLISH_CONTRACTION_ORTHOGRAPHY";
+
+  private static final Map<String, List<String>> wrongWords = loadFromPath("/pt/english_contractions.txt");
+  private static final Locale PT_LOCALE = new Locale("pt");
+
+  @Override
+  public Map<String, List<String>> getWrongWords() {
+    return wrongWords;
+  }
+
+  public EnglishContractionSpellingRule(ResourceBundle messages, Language language) {
+    super(messages, language);
+    super.setCategory(Categories.TYPOS.getCategory(messages));
+    setLocQualityIssueType(ITSIssueType.Misspelling);
+    addExamplePair(Example.wrong("Ele adorava assistir <marker>whats</marker> cooking às sextas-feiras."),
+                   Example.fixed("Ele adorava assistir <marker>what's</marker> cooking às sextas-feiras."));
+    super.setCheckLemmas(false); // lemma checking is switched off for this rule
+  }
+
+  @Override
+  public final String getId() {
+    return ENGLISH_CONTRACTION_SPELLING_RULE;
+  }
+
+  @Override
+  public String getDescription() {
+    return "Ortografia de contrações inglesas";
+  }
+
+  @Override
+  public String getShort() {
+    return "Erro de ortografia inglesa";
+  }
+
+  /** Builds the user-facing message; only the first suggested replacement is quoted. */
+  @Override
+  public String getMessage(String tokenStr, List<String> replacements) {
+    return "Caso seja uma contração da língua inglesa, prefira \"" + replacements.get(0) + "\".";
+  }
+
+  @Override
+  public boolean isCaseSensitive() {
+    return true; // entries are listed per casing; e.g. only capitalised "Im"/"Id" are flagged
+  }
+
+  @Override
+  public Locale getLocale() {
+    return PT_LOCALE;
+  }
+
+}
+
diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/english_contractions.txt b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/english_contractions.txt
new file mode 100644
index 000000000000..344201f93474
--- /dev/null
+++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/english_contractions.txt
@@ -0,0 +1,198 @@
+#Common English contractions
+#Used because our speller dictionaries
+#do not include forms with the apostrophe
+#The file is case sensitive to avoid false alarms for "Ill" and "Im"
+#(c) 2013 Marcin Milkowski
+#Licensed under LGPL
+aint=ain't
+ainT=ain't
+Aint=Ain't
+AINT=AIN'T
+arent=aren't
+arenT=aren't
+Arent=Aren't
+ARENT=AREN'T
+#cant=can't
+couldnt=couldn't
+couldnT=couldn't
+Couldnt=Couldn't
+COULDNT=COULDN'T
+didnt=didn't
+didnT=didn't
+Didnt=Didn't
+doesnt=doesn't
+doesnT=doesn't
+Doesnt=Doesn't
+DOESNT=DOESN'T
+dont=don't
+donT=don't
+Dont=Don't
+DONT=DON'T
+hadnt=hadn't
+hadnT=hadn't
+Hadnt=Hadn't
+HADNT=HADN'T
+hasnt=hasn't
+hasnT=hasn't
+Hasnt=Hasn't
+havent=haven't
+havenT=haven't
+Havent=Haven't
+HAVENT=HAVEN'T
+hed=he'd
+Hed=He'd
+#hell=he'll #too ambiguous
+hes=he's
+heS=he's
+Hes=He's
+howd=how'd
+Howd=How'd
+hows=how's
+Hows=How's
+howll=how'll
+Howll=How'll
+Id=I'd|ID
+Im=I'm
+#Ive=I've
+isnt=isn't
+isnT=isn't
+Isnt=Isn't
+ISNT=ISN'T
+#its=it's
+#lets=let's
+mightnt=mightn't
+Mightnt=Mightn't
+MIGHTNT=MIGHTN'T
+mustve=must've
+Mustve=Must've
+MUSTVE=MUST'VE
+mustnt=mustn't
+Mustnt=Mustn't
+MUSTNT=MUSTN'T
+neednt=needn't
+Neednt=Needn't
+NEEDNT=NEEDN'T
+oclock=o'clock
+Oclock=O'clock
+OCLOCK=O'CLOCK
+shant=shan't
+Shant=Shan't +SHANT=SHAN'T +#shed=she'd +#shell=she'll +shes=she's +sheS=she's +Shes=She's +SHES=SHE'S +shouldve=should've +Shouldve=Should've +SHOULDVE=SHOULD'VE +shouldnt=shouldn't +shouldnT=shouldn't +Shouldnt=Shouldn't +SHOULDNT=SHOULDN'T +thatd=that'd +Thatd=That'd +THATD=THAT'D +thats=that's +Thats=That's +THATS=THAT'S +thered=there'd +Thered=There'd +THERED=THERE'D +theres=there's +Theres=There's +THERES=THERE'S +therere=there are +THERERE=THERE ARE +Therere=There are +theyd=they'd +Theyd=They'd +THEYD=THEY'D +theyll=they'll +Theyll=They'll +THEYLL=THEY'LL +theyre=they're +Theyre=They're +THEYRE=THEY'RE +theyve=they've +Theyve=They've +THEYVE=THEY'VE +wasnt=wasn't +wasnT=wasn't +WASNT=WASN'T +Wasnt=Wasn't +#wed=we'd -- false alarms, needs a proper rule +#not Wed: short for Wednesday +#were=we're +#well=we'll +weve=we've +Weve=We've +WEVE=WE'VE +werent=weren't +werenT=weren't +Werent=Weren't +WERENT=WEREN'T +whatll=what'll +Whatll=What'll +WHATLL=WHAT'LL +whatre=what're +Whatre=What're +WHATRE=WHAT'RE +whats=what's +Whats=What's +WHATS=WHAT'S +whatd=what'd +Whatd=What'd +WHATD=WHAT'D +whatve=what've +Whatve=What've +WHATVE=WHAT'VE +whens=when's +Whens=When's +WHENS=WHEN'S +wheres=where's +Wheres=Where's +WHERES=WHERE'S +whereve=where've|wherever +Whereve=Where've|Wherever +WHEREVE=WHERE'VE|WHEREVER +whod=who'd +Whod=Who'd +WHOD=WHO'D +wholl=who'll +Wholl=Who'll +WHOLL=WHO'LL +#whore=who're +whos=who's +Whos=Who's +WHOS=WHO'S +whove=who've +Whove=Who've +WHOVE=WHO'VE +whys=why's +Whys=Why's +WHYS=WHY'S +#wont=won't +wouldve=would've +Wouldve=Would've +WOULDVE=WOULD'VE +wouldnt=wouldn't +wouldnT=wouldn't +Wouldnt=Wouldn't +WOULDNT=WOULDN'T +yall=y'all +Yall=Y'all +YALL=Y'ALL +youd=you'd +Youd=You'd +YOUD=YOU'D +youll=you'll +Youll=You'll +YOULL=YOU'LL +youre=you're +Youre=You're +YOURE=YOU'RE +youve=you've +Youve=You've +YOUVE=YOU'VE \ No newline at end of file