-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- the English ignore logic balks at stuff like "whats", which the English tagger fails to tag, and therefore doesn't get recognised as English; - in order to *kind of* address it, or at least help users and maybe convince them to stop adding "whats" to their personal dictionaries, we're adding this rule here, with relevant suggestions; - global English spellchecking still requires a lot of thinking, but this rule here should be safe.
- Loading branch information
Showing
3 changed files
with
292 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
91 changes: 91 additions & 0 deletions
91
...ge-modules/pt/src/main/java/org/languagetool/rules/pt/EnglishContractionSpellingRule.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules.pt; | ||
|
||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.ResourceBundle; | ||
|
||
import org.languagetool.Language; | ||
import org.languagetool.rules.*; | ||
|
||
/** | ||
* A rule that matches words or phrases which should not be used and suggests | ||
* correct ones instead, e.g. {@code Hasnt} instead of {@code Hasn't}. | ||
* This was copied to Portuguese from English to help users with code-switching and improve | ||
* tagging tokens with _english_ignore_. | ||
* | ||
* @author Marcin Miłkowski | ||
* @since 2.5 | ||
*/ | ||
public class EnglishContractionSpellingRule extends AbstractSimpleReplaceRule { | ||
|
||
public static final String ENGLISH_CONTRACTION_SPELLING_RULE = "PT_ENGLISH_CONTRACTION_ORTHOGRAPHY"; | ||
|
||
private static final Map<String, List<String>> wrongWords = loadFromPath("/pt/english_contractions.txt"); | ||
private static final Locale PT_LOCALE = new Locale("pt"); | ||
|
||
@Override | ||
public Map<String, List<String>> getWrongWords() { | ||
return wrongWords; | ||
} | ||
|
||
public EnglishContractionSpellingRule(ResourceBundle messages, Language language) { | ||
super(messages, language); | ||
super.setCategory(Categories.TYPOS.getCategory(messages)); | ||
setLocQualityIssueType(ITSIssueType.Misspelling); | ||
addExamplePair(Example.wrong("Ele adorava assistir <marker>whats</marker> cooking às sextas-feiras."), | ||
Example.fixed("Ele adorava assistir <marker>what's</marker> cooking às sextas-feiras.")); | ||
// setUrl(Tools.getUrl("https://languagetool.org/insights/post/grammar-contractions/")); | ||
super.setCheckLemmas(false); | ||
} | ||
|
||
@Override | ||
public final String getId() { | ||
return ENGLISH_CONTRACTION_SPELLING_RULE; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return "Ortografia de contrações inglesas"; | ||
} | ||
|
||
@Override | ||
public String getShort() { | ||
return "Erro de ortografia inglesa"; | ||
} | ||
|
||
@Override | ||
public String getMessage(String tokenStr, List<String> replacements) { | ||
return "Caso seja uma contração da língua inglesa, prefira \"" + replacements.get(0) + "\"."; | ||
} | ||
|
||
@Override | ||
public boolean isCaseSensitive() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public Locale getLocale() { | ||
return PT_LOCALE; | ||
} | ||
|
||
} | ||
|
198 changes: 198 additions & 0 deletions
198
...language-modules/pt/src/main/resources/org/languagetool/rules/pt/english_contractions.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
#Common English contractions | ||
#Used because our speller dictionaries | ||
#do not include forms with the apostrophe | ||
#The file is case sensitive to avoid false alarms for "Ill" and "Im" | ||
#(c) 2013 Marcin Milkowski | ||
#Licensed under LGPL | ||
aint=ain't | ||
ainT=ain't | ||
Aint=Ain't | ||
AINT=AIN'T | ||
arent=aren't | ||
arenT=aren't | ||
Arent=Aren't | ||
ARENT=AREN'T | ||
#cant=can't | ||
couldnt=couldn't | ||
couldnT=couldn't | ||
Couldnt=Couldn't | ||
COULDNT=COULDN'T | ||
didnt=didn't | ||
didnT=didn't | ||
Didnt=Didn't | ||
doesnt=doesn't | ||
doesnT=doesn't | ||
Doesnt=Doesn't | ||
DOESNT=DOESN'T | ||
dont=don't | ||
donT=don't | ||
Dont=Don't | ||
DONT=DON'T | ||
hadnt=hadn't | ||
hadnT=hadn't | ||
Hadnt=Hadn't | ||
HADNT=HADN'T | ||
hasnt=hasn't | ||
hasnT=hasn't | ||
Hasnt=Hasn't | ||
havent=haven't | ||
havenT=haven't | ||
Havent=Haven't | ||
HAVENT=HAVEN'T | ||
hed=he'd | ||
Hed=He'd | ||
#hell=he'll #too ambiguous | ||
hes=he's | ||
heS=he's | ||
Hes=He's | ||
howd=how'd | ||
Howd=How'd | ||
hows=how's | ||
Hows=How's | ||
howll=how'll | ||
Howll=How'll | ||
Id=I'd|ID | ||
Im=I'm | ||
#Ive=I've | ||
isnt=isn't | ||
isnT=isn't | ||
Isnt=Isn't | ||
ISNT=ISN'T | ||
#its=it's | ||
#lets=let's | ||
mightnt=mightn't | ||
Mightnt=Mightn't | ||
MIGHTNT=MIGHTN'T | ||
mustve=must've | ||
Mustve=Must've | ||
MUSTVE=MUST'VE | ||
mustnt=mustn't | ||
Mustnt=Mustn't | ||
MUSTNT=MUSTN'T | ||
neednt=needn't | ||
Neednt=Needn't | ||
NEEDNT=NEEDN'T | ||
oclock=o'clock | ||
Oclock=O'clock | ||
OCLOCK=O'CLOCK | ||
shant=shan't | ||
Shant=Shan't | ||
SHANT=SHAN'T | ||
#shed=she'd | ||
#shell=she'll | ||
shes=she's | ||
sheS=she's | ||
Shes=She's | ||
SHES=SHE'S | ||
shouldve=should've | ||
Shouldve=Should've | ||
SHOULDVE=SHOULD'VE | ||
shouldnt=shouldn't | ||
shouldnT=shouldn't | ||
Shouldnt=Shouldn't | ||
SHOULDNT=SHOULDN'T | ||
thatd=that'd | ||
Thatd=That'd | ||
THATD=THAT'D | ||
thats=that's | ||
Thats=That's | ||
THATS=THAT'S | ||
thered=there'd | ||
Thered=There'd | ||
THERED=THERE'D | ||
theres=there's | ||
Theres=There's | ||
THERES=THERE'S | ||
therere=there are | ||
THERERE=THERE ARE | ||
Therere=There are | ||
theyd=they'd | ||
Theyd=They'd | ||
THEYD=THEY'D | ||
theyll=they'll | ||
Theyll=They'll | ||
THEYLL=THEY'LL | ||
theyre=they're | ||
Theyre=They're | ||
THEYRE=THEY'RE | ||
theyve=they've | ||
Theyve=They've | ||
THEYVE=THEY'VE | ||
wasnt=wasn't | ||
wasnT=wasn't | ||
WASNT=WASN'T | ||
Wasnt=Wasn't | ||
#wed=we'd -- false alarms, needs a proper rule | ||
#not Wed: short for Wednesday | ||
#were=we're | ||
#well=we'll | ||
weve=we've | ||
Weve=We've | ||
WEVE=WE'VE | ||
werent=weren't | ||
werenT=weren't | ||
Werent=Weren't | ||
WERENT=WEREN'T | ||
whatll=what'll | ||
Whatll=What'll | ||
WHATLL=WHAT'LL | ||
whatre=what're | ||
Whatre=What're | ||
WHATRE=WHAT'RE | ||
whats=what's | ||
Whats=What's | ||
WHATS=WHAT'S | ||
whatd=what'd | ||
Whatd=What'd | ||
WHATD=WHAT'D | ||
whatve=what've | ||
Whatve=What've | ||
WHATVE=WHAT'VE | ||
whens=when's | ||
Whens=When's | ||
WHENS=WHEN'S | ||
wheres=where's | ||
Wheres=Where's | ||
WHERES=WHERE'S | ||
whereve=where've|wherever | ||
Whereve=Where've|Wherever | ||
WHEREVE=WHERE'VE|WHEREVER | ||
whod=who'd | ||
Whod=Who'd | ||
WHOD=WHO'D | ||
wholl=who'll | ||
Wholl=Who'll | ||
WHOLL=WHO'LL | ||
#whore=who're | ||
whos=who's | ||
Whos=Who's | ||
WHOS=WHO'S | ||
whove=who've | ||
Whove=Who've | ||
WHOVE=WHO'VE | ||
whys=why's | ||
Whys=Why's | ||
WHYS=WHY'S | ||
#wont=won't | ||
wouldve=would've | ||
Wouldve=Would've | ||
WOULDVE=WOULD'VE | ||
wouldnt=wouldn't | ||
wouldnT=wouldn't | ||
Wouldnt=Wouldn't | ||
WOULDNT=WOULDN'T | ||
yall=y'all | ||
Yall=Y'all | ||
YALL=Y'ALL | ||
youd=you'd | ||
Youd=You'd | ||
YOUD=YOU'D | ||
youll=you'll | ||
Youll=You'll | ||
YOULL=YOU'LL | ||
youre=you're | ||
Youre=You're | ||
YOURE=YOU'RE | ||
youve=you've | ||
Youve=You've | ||
YOUVE=YOU'VE |