Skip to content

Commit

Permalink
[pt] Fix DEGREE disambiguation rule
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Dec 19, 2023
1 parent d065676 commit 1857815
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,14 @@
<!-- Contribuintes opensource, deem uma olhada aqui: https://dev.languagetool.org/#portuguese -->

<!DOCTYPE rules [
<!ENTITY % messages SYSTEM "../../resource/pt/entities/messages.ent" >
<!ENTITY % datetime SYSTEM "../../resource/pt/entities/datetime.ent" >
<!ENTITY % misc SYSTEM "../../resource/pt/entities/misc.ent" >
<!ENTITY % abbrev SYSTEM "../../resource/pt/entities/abbrev.ent" >
<!ENTITY % paronyms SYSTEM "../../resource/pt/entities/paronyms.ent" >
<!ENTITY % verbs SYSTEM "../../resource/pt/entities/verbs.ent" >
<!ENTITY % languages SYSTEM "../../resource/pt/entities/languages.ent">
<!ENTITY % postal SYSTEM "../../resource/pt/entities/postal.ent" >
<!ENTITY % chars SYSTEM "../../../resource/pt/entities/chars.ent" >
%messages;
<!ENTITY % chars SYSTEM "../../resource/pt/entities/chars.ent" >
%datetime;
%misc;
%abbrev;
Expand All @@ -43,7 +41,7 @@
%languages;
%postal;
%chars;
]>
]>

<rules lang="pt" xsi:noNamespaceSchemaLocation="../../../../../../../../../languagetool-core/src/main/resources/org/languagetool/resource/disambiguation.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xs="http://www.w3.org/2001/XMLSchema">

Expand Down Expand Up @@ -2889,46 +2887,53 @@
<!-- Created by Tiago F. Santos, Portuguese rule -->
<rule>
<pattern>
<token regexp="yes">1[&deg;′″‴] ?[CFKNSEWO]?
<token regexp="yes">1[°′″‴] ?[CFKNSEWO]?
<exception>1º</exception></token>
</pattern>
<disambig action="replace"><wd pos="NCMS000"/></disambig>
</rule>
<rule>
<pattern>
<token regexp="yes">[\d,. ]+[&deg;′″‴][CFKNSEWO]?
<exception regexp='yes'>1[&deg;′″‴].?</exception>
<exception regexp='yes'>\d+&ordm;</exception></token>
<token regexp="yes">[\d,. ]+[°′″‴][CFKNSEWO]?
<exception regexp='yes'>1[°′″‴].?</exception>
<exception regexp='yes'>\d+º</exception></token> <!-- ordm! -->
</pattern>
<disambig action="replace"><wd pos="NCMP000"/></disambig>
</rule>
<rule>
<pattern>
<token regexp="yes">1[&deg;′″‴]</token>
<token regexp="yes">1[°′″‴]</token>
<token regexp="yes" spacebefore='yes'>[CFKNSEWO]
<exception case_sensitive='yes' regexp='yes'>[eo]</exception></token>
</pattern>
<disambig action="replace"><wd pos="NCMS000"/><wd pos="NCMN000"/></disambig>
</rule>
<rule>
<pattern>
<token regexp="yes">[\d,. ]+[&deg;′″‴]</token>
<token regexp="yes">[\d,. ]+[°′″‴]</token>
<token regexp="yes" spacebefore='yes'>[CFKNSEWO]
<exception case_sensitive='yes' regexp='yes'>[eo]</exception></token>
</pattern>
<disambig action="replace"><wd pos="NCMP000"/><wd pos="NCMN000"/></disambig>
</rule>
<rule>
<pattern>
<token regexp="yes">[&deg;][CFKNSEWO]</token>
<token regexp="yes">[°][CFKNSEWO]</token>
</pattern>
<disambig action="replace"><wd pos="NCMN000"/></disambig>
</rule>
<rule>
<pattern>
<token regexp="yes">[\d,. ]*[&deg;′″‴][CFKNSEWO]</token>
</pattern>
<disambig action="ignore_spelling"/>
<pattern>
<!-- p-goulart@2023-12-19 - DESC: explicitly case-insensitive (fix lettercase in XML) -->
<token regexp="yes" case_sensitive="no">(?:\d+[\d,.]*)?[°′″‴][CFKNSEWO]?</token>
</pattern>
<disambig action="ignore_spelling"/>
</rule>
<rule>
<pattern>
<token regexp="yes" case_sensitive="no">(?:&number_token;)?º&degree_abbrevs;?</token>
</pattern>
<disambig action="ignore_spelling"/>
</rule>
</rulegroup>

Expand Down Expand Up @@ -3994,8 +3999,8 @@
</rule>

<rule id="ORDINAL_SUPERSCRIPT_IGNORE">
<pattern>
<token regexp="yes">\d+[&ordf;&ordm;&sup_a;&sup_o;&deg;][&sup_s;]?\d*</token>
<pattern> <!-- // ordf, ordm, sup_a, sup_o, deg, sup_s -->
<token regexp="yes">\d+[ªºᵃᵒ°][ˢ]?\d*</token>
</pattern>
<disambig action="ignore_spelling"/>
</rule>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
<!ENTITY minus_sign "−">

<!-- ORDINALS, SUPERSCRIPTS, DEGREE SIGNS, AND OTHER NONSENSE 😩 -->
<!-- For some reason, entities containing special characters do NOT work in the disambiguation.xml file.
I've tried referencing the hex or decimal entities explicitly, and that also fails, including when they are
used directly in the disambiguation file. There might be something off with the encoding, but I don't know where. -->
<!ENTITY ordf "ª">
<!ENTITY ordm "º">

Expand All @@ -28,10 +31,12 @@
<!ENTITY operadores_matematicos "[-x\.·\*\/\^\|~¬±×÷ϐϑϒϕϰϱϴϵ϶؆؇‖′″‴⁀⁄⁒⁺⁻⁼⁽⁾₊₋₌₍₎∀∁∂∃∄∅∆∇∈∉∊∋∌∍∎∏∐∑−∓∔∕∖∗∘∙√∛∜∝∞∟∠∡∢∣∤∥∦∧∨∩∪∫∬∭∮∯∰∱∲∳∴∵∶∷∸∹∺∻∼∽∾∿≀≁≂≃≄≅≆≇≈≉≊≋≌≍≎≏≐≑≒≓≔≕≖≗≘≙≚≛≜≝≞≟≠≡≢≣≤≥≦≧≨≩≪≫≬≭≮≯≰≱≲≳≴≵≶≷≸≹≺≻≼≽≾≿⊀⊁⊂⊃⊄⊅⊆⊇⊈⊉⊊⊋⊌⊍⊎⊏⊐⊑⊒⊓⊔⊕⊖⊗⊘⊙⊚⊛⊜⊝⊞⊟⊠⊡⊢⊣⊤⊥⊦⊧⊨⊩⊪⊫⊬⊭⊮⊯⊰⊱⊲⊳⊴⊵⊶⊷⊸⊹⊺⊻⊼⊽⊾⊿⋀⋁⋂⋃⋄⋅⋆⋇⋈⋉⋊⋋⋌⋍⋎⋏⋐⋑⋒⋓⋔⋕⋖⋗⋘⋙⋚⋛⋜⋝⋞⋟⋠⋡⋢⋣⋤⋥⋦⋧⋨⋩⋪⋫⋬⋭⋮⋯⋰⋱⋲⋳⋴⋵⋶⋷⋸⋹⋺⋻⋼⋽⋾⋿→⇋]|\=|\+">

<!ENTITY currency_symbols "\p{Lu}*[฿₿₵¢₡$₫֏€ƒ₲₴₭₾₺₼₦₱£៛₽₹₪৳₸₮₩¥¤]">
<!ENTITY degree_abbrevs "[CFKNSEWO]">

<!-- Convenience entity, should encompass all valid numbers that the PT tokeniser doesn't split. -->
<!ENTITY number_token "&minus_sign;?\d+[\d,.]*">
<!ENTITY number_token_no_decimal "\d+(\.\d{3})*">
<!ENTITY degree_token "&number_token;&nnbsp;?&deg;">
<!-- No decimals or negatives, as that makes no sense, only thousands allowed -->
<!ENTITY ordinal_token "&number_token_no_decimal;&any_ord;">

Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
<!ENTITY % languages SYSTEM "../../resource/pt/entities/languages.ent">
<!ENTITY % postal SYSTEM "../../resource/pt/entities/postal.ent" >
<!ENTITY % hyphenised SYSTEM "../../resource/pt/entities/hyphenised.ent" >
<!ENTITY % chars SYSTEM "../../../resource/pt/entities/chars.ent" >
<!ENTITY % chars SYSTEM "../../resource/pt/entities/chars.ent" >
%messages;
%datetime;
%misc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ USA
<!ENTITY % verbs SYSTEM "../../resource/pt/entities/verbs.ent" >
<!ENTITY % languages SYSTEM "../../resource/pt/entities/languages.ent">
<!ENTITY % postal SYSTEM "../../resource/pt/entities/postal.ent" >
<!ENTITY % chars SYSTEM "../../../resource/pt/entities/chars.ent" >
<!ENTITY % chars SYSTEM "../../resource/pt/entities/chars.ent" >
%messages;
%datetime;
%misc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,15 @@ public void testPortugueseSpellerDoesNotCorrectOrdinalSuperscripts() throws Exce
assertNoErrors("21ª", ltBR, ruleBR);
}

/**
 * Checks that no errors are reported (via {@code assertNoErrors} with the {@code ltBR} /
 * {@code ruleBR} fixtures) for degree expressions.
 *
 * <p>The inputs cover the degree sign (°) and the masculine ordinal indicator (º), comma and
 * dot decimal separators, and both upper- and lowercase unit letters.
 */
@Test
public void testPortugueseSpellerDoesNotCorrectDegreeExpressions() throws Exception {
  String[] degreeExpressions = {"1,0°", "2°c", "3°C", "4,0ºc", "5.0ºc"};
  for (String expression : degreeExpressions) {
    assertNoErrors(expression, ltBR, ruleBR);
  }
}

@Test
public void testPortugueseSpellerDoesNotCorrectCopyrightSymbol() throws Exception {
assertNoErrors("Copyright©", ltBR, ruleBR);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,17 @@ public void testDoNotTokeniseOrdinalSuperscript() {
testTokenise("22.a", new String[]{"22.a"});
testTokenise("23as", new String[]{"23as"});
testTokenise("24.as", new String[]{"24.as"});
// Degree sign, just to be absolutely sure
}

/**
 * Checks that degree expressions are kept as a single token.
 *
 * <p>The inputs cover a bare degree sign after an integer, comma and dot decimal separators,
 * and upper- and lowercase unit letters after the degree sign.
 */
@Test
public void testDoNotTokeniseDegreeExpressions() {
  String[] degreeExpressions = {
    "25°", "26,0°", "27.0°", "28,0°C", "29.0°C", "30,0°c", "31.0°c"
  };
  for (String expression : degreeExpressions) {
    // The expected token list is the input itself, i.e. the expression must not be split.
    testTokenise(expression, new String[]{expression});
  }
}

@Test
Expand Down

0 comments on commit 1857815

Please sign in to comment.