From fcf1898ab7bae3f7308d991e225c3016e460b352 Mon Sep 17 00:00:00 2001 From: Alberto Fernandez Date: Sat, 29 Feb 2020 16:49:06 +0100 Subject: [PATCH] Support for US-ASCII if no charset is detected and all characters tested are pure ASCII, we can assume the encoding is ASCII. Fix #30 --- README.md | 1 + .../java/org/mozilla/universalchardet/Constants.java | 4 +++- .../mozilla/universalchardet/UniversalDetector.java | 5 ++++- .../BasicFileEncodingDetectionTest.java | 11 +++++++++++ .../mozilla/universalchardet/ShortStringTests.java | 2 +- src/test/resources/ascii.txt | 11 +++++++++++ 6 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 src/test/resources/ascii.txt diff --git a/README.md b/README.md index 2bbc1d1..ca5164f 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ http://hg.mozilla.org/mozilla-central/file/tip/extensions/universalchardet/ - Others - WINDOWS-1252 + - US-ASCII All supported encodings are listed in ``org.mozilla.universalchardet.Constants.`` diff --git a/src/main/java/org/mozilla/universalchardet/Constants.java b/src/main/java/org/mozilla/universalchardet/Constants.java index d784fe0..4f45136 100644 --- a/src/main/java/org/mozilla/universalchardet/Constants.java +++ b/src/main/java/org/mozilla/universalchardet/Constants.java @@ -55,7 +55,9 @@ public final class Constants { public static final String CHARSET_UTF_32BE = "UTF-32BE".intern(); public static final String CHARSET_UTF_32LE = "UTF-32LE".intern(); public static final String CHARSET_TIS620 = "TIS620".intern(); - + public static final String CHARSET_US_ASCCI = "US-ASCII".intern(); + + // WARNING: Listed below are charsets which Java does not support. 
public static final String CHARSET_HZ_GB_2312 = "HZ-GB-2312".intern(); // Simplified Chinese public static final String CHARSET_X_ISO_10646_UCS_4_3412 = "X-ISO-10646-UCS-4-3412".intern(); // Malformed UTF-32 diff --git a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java index a6b045c..3344577 100644 --- a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java +++ b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java @@ -302,7 +302,10 @@ public void dataEnd() } } else if (this.inputState == InputState.ESC_ASCII) { // do nothing - } else { + } else if (this.inputState == InputState.PURE_ASCII && this.gotData) { + this.detectedCharset = Constants.CHARSET_US_ASCCI; + } + else { // do nothing } } diff --git a/src/test/java/org/mozilla/universalchardet/BasicFileEncodingDetectionTest.java b/src/test/java/org/mozilla/universalchardet/BasicFileEncodingDetectionTest.java index 61a8723..17ccffe 100644 --- a/src/test/java/org/mozilla/universalchardet/BasicFileEncodingDetectionTest.java +++ b/src/test/java/org/mozilla/universalchardet/BasicFileEncodingDetectionTest.java @@ -7,6 +7,17 @@ import org.junit.Test; public class BasicFileEncodingDetectionTest { + + + public BasicFileEncodingDetectionTest() throws IOException { + super(); + } + + + @Test + public void testASCII() throws IOException { + Assert.assertEquals("US-ASCII", getFileEncoding("ascii.txt")); + } @Test public void testUTF8 () throws IOException { diff --git a/src/test/java/org/mozilla/universalchardet/ShortStringTests.java b/src/test/java/org/mozilla/universalchardet/ShortStringTests.java index 8d57ae4..5d410ae 100644 --- a/src/test/java/org/mozilla/universalchardet/ShortStringTests.java +++ b/src/test/java/org/mozilla/universalchardet/ShortStringTests.java @@ -53,7 +53,7 @@ public void testDecodeBytesBetterStats() { @Test public void testShortString() throws UnsupportedEncodingException { - 
Assert.assertNull(guessCharsetName("abcd".getBytes())); + Assert.assertEquals("US-ASCII", guessCharsetName("abcd".getBytes())); // Assert.assertNull(guessCharsetName("Ábcd".getBytes("ISO-8859-15"))); } diff --git a/src/test/resources/ascii.txt b/src/test/resources/ascii.txt new file mode 100644 index 0000000..c01f4ac --- /dev/null +++ b/src/test/resources/ascii.txt @@ -0,0 +1,11 @@ + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean massa ipsum, accumsan at tincidunt at, viverra non nisi. Praesent interdum justo at quam vehicula, vel semper ligula porta. Nam elementum est et velit gravida dignissim. Praesent bibendum bibendum leo id interdum. Vestibulum vel mollis risus, eget consequat odio. Fusce magna magna, rhoncus quis tincidunt eu, laoreet quis lacus. Nullam at ligula id lacus cursus convallis. Mauris ut vulputate mi, quis sodales quam. In diam diam, lacinia ac cursus eget, bibendum sodales risus. Sed est nibh, vestibulum ac euismod in, dictum ullamcorper odio. + +Etiam at tristique est. Mauris quis erat at risus dignissim euismod. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas iaculis malesuada pharetra. Quisque semper elit vel ex pellentesque fermentum vitae accumsan tellus. Maecenas ut blandit eros. Vestibulum orci lectus, volutpat in sapien vel, lobortis blandit sapien. Etiam tristique est et eros placerat iaculis. + +Phasellus ac mollis est. Suspendisse at nulla ullamcorper, tincidunt nisi quis, aliquam augue. Donec id neque purus. Duis risus neque, pharetra et laoreet bibendum, dictum ut tortor. In malesuada viverra condimentum. Donec cursus lorem risus, a consequat tellus hendrerit non. Pellentesque a feugiat mauris. Ut vel dui sed tortor pharetra tristique ac feugiat ligula. Integer viverra consectetur augue, nec blandit dolor pellentesque tincidunt. Duis ut augue sed dolor efficitur vehicula a iaculis turpis. Suspendisse ultrices, nunc in posuere sagittis, erat elit feugiat lacus, id hendrerit elit neque ut velit. 
Mauris eu varius massa. Duis gravida, mauris a consequat blandit, neque diam interdum justo, non tincidunt justo elit id mi. Curabitur hendrerit nisi purus, id luctus mi maximus nec. + +Pellentesque in volutpat diam, sagittis varius quam. In metus mauris, aliquam gravida feugiat eu, condimentum sed augue. Cras cursus, sem ut venenatis lacinia, quam ipsum pharetra risus, eu scelerisque odio orci non turpis. In commodo, felis accumsan pellentesque tempor, ligula nisl ultrices quam, eu facilisis tellus erat sed dolor. Suspendisse et nibh id nisi aliquet tempus non sed nisi. Donec cursus purus quam, nec dictum felis tempor ac. Nunc eu est orci. Sed fringilla diam id tortor pulvinar, vel faucibus turpis malesuada. Morbi pharetra sit amet dolor eget faucibus. Nam in tristique sem. Ut et eros ac lorem bibendum mattis. Aliquam erat volutpat. Nunc congue mattis neque, eu maximus nisi pellentesque at. + +Quisque laoreet faucibus justo a auctor. Nunc condimentum erat tempor, placerat nulla vel, convallis urna. Praesent rhoncus nulla orci, vitae laoreet eros elementum quis. Quisque sed pellentesque tortor, id aliquet libero. Donec ultrices rhoncus felis nec congue. Donec eu ullamcorper massa. Aliquam at auctor elit. \ No newline at end of file