From 8ba6fb3c145dc8323280343efc83ae873c8f99ad Mon Sep 17 00:00:00 2001 From: Alberto Fernandez Date: Fri, 27 Mar 2020 18:57:46 +0100 Subject: [PATCH] Only detect US-ASCII if there are printable characters, new lines and tabs #33 --- .../universalchardet/UniversalDetector.java | 10 +++++- .../Bug33USASCIIToGenerous.java | 34 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 src/test/java/org/mozilla/universalchardet/Bug33USASCIIToGenerous.java diff --git a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java index 3344577..3cce54c 100644 --- a/src/main/java/org/mozilla/universalchardet/UniversalDetector.java +++ b/src/main/java/org/mozilla/universalchardet/UniversalDetector.java @@ -83,6 +83,7 @@ public enum InputState private boolean done; private boolean start; private boolean gotData; + private boolean onlyPrintableASCII = true; private byte lastChar; private String detectedCharset; @@ -192,6 +193,13 @@ public void handleData(final byte[] buf, int offset, int length) (c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) { this.inputState = InputState.ESC_ASCII; } + if (this.inputState == InputState.PURE_ASCII && onlyPrintableASCII) { + onlyPrintableASCII = + (c >= 0x20 && c <= 0x7e) // Printable characters + || c == 0x0A // New Line + || c == 0x0D // Carriage return + || c== 0x09; // TAB + } this.lastChar = buf[i]; } } // for end @@ -302,7 +310,7 @@ public void dataEnd() } } else if (this.inputState == InputState.ESC_ASCII) { // do nothing - } else if (this.inputState == InputState.PURE_ASCII && this.gotData) { + } else if (this.inputState == InputState.PURE_ASCII && this.onlyPrintableASCII) { this.detectedCharset = Constants.CHARSET_US_ASCCI; } else { diff --git a/src/test/java/org/mozilla/universalchardet/Bug33USASCIIToGenerous.java b/src/test/java/org/mozilla/universalchardet/Bug33USASCIIToGenerous.java new file mode 100644 index 
0000000..c8db5ee --- /dev/null +++ b/src/test/java/org/mozilla/universalchardet/Bug33USASCIIToGenerous.java @@ -0,0 +1,34 @@ +package org.mozilla.universalchardet; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +public class Bug33USASCIIToGenerous { + + public Bug33USASCIIToGenerous() { + super(); + } + + + + @Test + @Ignore("Not sure") + public void testUTF16 () throws IOException { + Assert.assertEquals("UTF-16BE", detect("ab".getBytes("UTF-16BE"))); + Assert.assertEquals("UTF-16LE", detect("ab".getBytes("UTF-16LE"))); + } + @Test + public void testZipHeader() throws IOException { + byte[] zipHeader = new byte[]{0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x02, 0x00}; + Assert.assertNull(detect(zipHeader)); + } + + private String detect(byte[] data) throws IOException { + return UniversalDetector.detectCharset(new ByteArrayInputStream(data)); + } + +}