Skip to content

Commit

Permalink
[pt] Add English contraction rule
Browse files Browse the repository at this point in the history
 - the English ignore logic balks at stuff like "whats", which the
   English tagger fails to tag, and therefore doesn't get recognised
   as English;

 - in order to *kind of* address it, or at least help users and maybe
   convince them to stop adding "whats" to their personal dictionaries,
   we're adding this rule here, with relevant suggestions;

 - global English spellchecking still requires a lot of thinking, but
   this rule here should be safe.
  • Loading branch information
p-goulart committed Jun 25, 2024
1 parent 3b4af24 commit 724e5d1
Show file tree
Hide file tree
Showing 3 changed files with 292 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,8 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
new PortugueseUnitConversionRule(messages),
new PortugueseReadabilityRule(messages, this, userConfig, true),
new PortugueseReadabilityRule(messages, this, userConfig, false),
new DoublePunctuationRule(messages)
new DoublePunctuationRule(messages),
new EnglishContractionSpellingRule(messages, this)
);
}

Expand Down Expand Up @@ -255,6 +256,7 @@ public boolean isAdvancedTypographyEnabled() {
id2prio.put("AUX_VERBO", -45);
id2prio.put("ENSINO_A_DISTANCIA", -45);
id2prio.put("OQ_O_QUE_ORTHOGRAPHY", -45);
id2prio.put("PT_ENGLISH_CONTRACTION_ORTHOGRAPHY", -45);
id2prio.put("EMAIL_SEM_HIFEN", -45); // HIGHER THAN SPELLER
// MORFOLOGIK SPELLER FITS HERE AT -50 --------------------- // SPELLER (-50)
id2prio.put("PRETERITO_PERFEITO", -51); // LOWER THAN SPELLER
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.pt;

import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.ResourceBundle;

import org.languagetool.Language;
import org.languagetool.rules.*;

/**
 * A simple-replace rule that flags English contractions written without their
 * apostrophe and suggests the correct spelling, e.g. {@code Hasn't} for
 * {@code Hasnt} or {@code what's} for {@code whats}.
 *
 * <p>This was copied to Portuguese from English to help users with code-switching and
 * improve tagging tokens with _english_ignore_. The {@code @author} and {@code @since}
 * tags below are carried over from that English original.
 *
 * <p>The wrong-form/suggestion pairs are loaded once from
 * {@code /pt/english_contractions.txt}; lookups are case-sensitive
 * (see {@link #isCaseSensitive()}).
 *
 * @author Marcin Miłkowski
 * @since 2.5
 */
public class EnglishContractionSpellingRule extends AbstractSimpleReplaceRule {

  /** Rule id returned by {@link #getId()}. */
  public static final String ENGLISH_CONTRACTION_SPELLING_RULE = "PT_ENGLISH_CONTRACTION_ORTHOGRAPHY";

  /** Wrong form mapped to its suggested replacements; loaded once at class initialisation. */
  private static final Map<String, List<String>> WRONG_WORDS = loadFromPath("/pt/english_contractions.txt");
  private static final Locale PT_LOCALE = new Locale("pt");

  /**
   * Creates the rule, files it under the typos category as a misspelling, registers
   * one wrong/fixed example pair, and switches lemma checking off.
   *
   * @param messages resource bundle used to resolve the category
   * @param language language passed on to the parent rule
   */
  public EnglishContractionSpellingRule(ResourceBundle messages, Language language) {
    super(messages, language);
    setCategory(Categories.TYPOS.getCategory(messages));
    setLocQualityIssueType(ITSIssueType.Misspelling);
    addExamplePair(Example.wrong("Ele adorava assistir <marker>whats</marker> cooking às sextas-feiras."),
                   Example.fixed("Ele adorava assistir <marker>what's</marker> cooking às sextas-feiras."));
    // TODO(review): no help URL is set. The English-language article
    // https://languagetool.org/insights/post/grammar-contractions/ was left disabled here;
    // confirm whether it (or a Portuguese equivalent) should be linked.
    setCheckLemmas(false);
  }

  /**
   * @return the wrong-form to suggestions map loaded from the contractions resource
   */
  @Override
  public Map<String, List<String>> getWrongWords() {
    return WRONG_WORDS;
  }

  @Override
  public final String getId() {
    return ENGLISH_CONTRACTION_SPELLING_RULE;
  }

  @Override
  public String getDescription() {
    return "Ortografia de contrações inglesas";
  }

  @Override
  public String getShort() {
    return "Erro de ortografia inglesa";
  }

  /**
   * Builds the user-facing message for a match.
   *
   * @param tokenStr the matched token (not used in the message)
   * @param replacements suggested replacements; only the first one is quoted
   * @return the message naming the first suggestion, or the short message when
   *         no suggestion is available
   */
  @Override
  public String getMessage(String tokenStr, List<String> replacements) {
    // Guard against a missing suggestion so this public method never throws
    // IndexOutOfBoundsException / NullPointerException.
    if (replacements == null || replacements.isEmpty()) {
      return getShort();
    }
    return "Caso seja uma contração da língua inglesa, prefira \"" + replacements.get(0) + "\".";
  }

  /**
   * @return {@code true}: entries are matched with their exact casing
   */
  @Override
  public boolean isCaseSensitive() {
    return true;
  }

  @Override
  public Locale getLocale() {
    return PT_LOCALE;
  }

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#Common English contractions
#Used because our speller dictionaries
#do not include forms with the apostrophe
#The file is case sensitive to avoid false alarms for "Ill" and "Im"
#(c) 2013 Marcin Milkowski
#Licensed under LGPL
aint=ain't
ainT=ain't
Aint=Ain't
AINT=AIN'T
arent=aren't
arenT=aren't
Arent=Aren't
ARENT=AREN'T
#cant=can't
couldnt=couldn't
couldnT=couldn't
Couldnt=Couldn't
COULDNT=COULDN'T
didnt=didn't
didnT=didn't
Didnt=Didn't
doesnt=doesn't
doesnT=doesn't
Doesnt=Doesn't
DOESNT=DOESN'T
dont=don't
donT=don't
Dont=Don't
DONT=DON'T
hadnt=hadn't
hadnT=hadn't
Hadnt=Hadn't
HADNT=HADN'T
hasnt=hasn't
hasnT=hasn't
Hasnt=Hasn't
havent=haven't
havenT=haven't
Havent=Haven't
HAVENT=HAVEN'T
hed=he'd
Hed=He'd
#hell=he'll #too ambiguous
hes=he's
heS=he's
Hes=He's
howd=how'd
Howd=How'd
hows=how's
Hows=How's
howll=how'll
Howll=How'll
Id=I'd|ID
Im=I'm
#Ive=I've
isnt=isn't
isnT=isn't
Isnt=Isn't
ISNT=ISN'T
#its=it's
#lets=let's
mightnt=mightn't
Mightnt=Mightn't
MIGHTNT=MIGHTN'T
mustve=must've
Mustve=Must've
MUSTVE=MUST'VE
mustnt=mustn't
Mustnt=Mustn't
MUSTNT=MUSTN'T
neednt=needn't
Neednt=Needn't
NEEDNT=NEEDN'T
oclock=o'clock
Oclock=O'clock
OCLOCK=O'CLOCK
shant=shan't
Shant=Shan't
SHANT=SHAN'T
#shed=she'd
#shell=she'll
shes=she's
sheS=she's
Shes=She's
SHES=SHE'S
shouldve=should've
Shouldve=Should've
SHOULDVE=SHOULD'VE
shouldnt=shouldn't
shouldnT=shouldn't
Shouldnt=Shouldn't
SHOULDNT=SHOULDN'T
thatd=that'd
Thatd=That'd
THATD=THAT'D
thats=that's
Thats=That's
THATS=THAT'S
thered=there'd
Thered=There'd
THERED=THERE'D
theres=there's
Theres=There's
THERES=THERE'S
therere=there are
THERERE=THERE ARE
Therere=There are
theyd=they'd
Theyd=They'd
THEYD=THEY'D
theyll=they'll
Theyll=They'll
THEYLL=THEY'LL
theyre=they're
Theyre=They're
THEYRE=THEY'RE
theyve=they've
Theyve=They've
THEYVE=THEY'VE
wasnt=wasn't
wasnT=wasn't
WASNT=WASN'T
Wasnt=Wasn't
#wed=we'd -- false alarms, needs a proper rule
#not Wed: short for Wednesday
#were=we're
#well=we'll
weve=we've
Weve=We've
WEVE=WE'VE
werent=weren't
werenT=weren't
Werent=Weren't
WERENT=WEREN'T
whatll=what'll
Whatll=What'll
WHATLL=WHAT'LL
whatre=what're
Whatre=What're
WHATRE=WHAT'RE
whats=what's
Whats=What's
WHATS=WHAT'S
whatd=what'd
Whatd=What'd
WHATD=WHAT'D
whatve=what've
Whatve=What've
WHATVE=WHAT'VE
whens=when's
Whens=When's
WHENS=WHEN'S
wheres=where's
Wheres=Where's
WHERES=WHERE'S
whereve=where've|wherever
Whereve=Where've|Wherever
WHEREVE=WHERE'VE|WHEREVER
whod=who'd
Whod=Who'd
WHOD=WHO'D
wholl=who'll
Wholl=Who'll
WHOLL=WHO'LL
#whore=who're
whos=who's
Whos=Who's
WHOS=WHO'S
whove=who've
Whove=Who've
WHOVE=WHO'VE
whys=why's
Whys=Why's
WHYS=WHY'S
#wont=won't
wouldve=would've
Wouldve=Would've
WOULDVE=WOULD'VE
wouldnt=wouldn't
wouldnT=wouldn't
Wouldnt=Wouldn't
WOULDNT=WOULDN'T
yall=y'all
Yall=Y'all
YALL=Y'ALL
youd=you'd
Youd=You'd
YOUD=YOU'D
youll=you'll
Youll=You'll
YOULL=YOU'LL
youre=you're
Youre=You're
YOURE=YOU'RE
youve=you've
Youve=You've
YOUVE=YOU'VE

0 comments on commit 724e5d1

Please sign in to comment.