Skip to content

Commit

Permalink
Only detect US-ASCII if there are printable characters, new lines and …
Browse files Browse the repository at this point in the history
…tabs #33
  • Loading branch information
albfernandez committed Mar 27, 2020
1 parent f6efa33 commit 8ba6fb3
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public enum InputState
private boolean done;
private boolean start;
private boolean gotData;
private boolean onlyPrintableASCII = true;
private byte lastChar;
private String detectedCharset;

Expand Down Expand Up @@ -192,6 +193,13 @@ public void handleData(final byte[] buf, int offset, int length)
(c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) {
this.inputState = InputState.ESC_ASCII;
}
if (this.inputState == InputState.PURE_ASCII && onlyPrintableASCII) {
onlyPrintableASCII =
(c >= 0x20 && c <= 0x7e) // Printable characters
|| c == 0x0A // New Line
|| c == 0x0D // Carriage return
|| c== 0x09; // TAB
}
this.lastChar = buf[i];
}
} // for end
Expand Down Expand Up @@ -302,7 +310,7 @@ public void dataEnd()
}
} else if (this.inputState == InputState.ESC_ASCII) {
// do nothing
} else if (this.inputState == InputState.PURE_ASCII && this.gotData) {
} else if (this.inputState == InputState.PURE_ASCII && this.onlyPrintableASCII) {
this.detectedCharset = Constants.CHARSET_US_ASCCI;
}
else {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.mozilla.universalchardet;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;

/**
 * Regression tests for bug 33, where US-ASCII was reported too generously.
 *
 * <p>Each test feeds a small byte sequence to
 * {@code UniversalDetector.detectCharset} and checks the reported charset name.
 */
public class Bug33USASCIIToGenerous {

    /** Creates the test instance; there is no state to set up. */
    public Bug33USASCIIToGenerous() {
        super();
    }

    /**
     * Checks that the text {@code "ab"} encoded as UTF-16 (big and little
     * endian) is reported with the matching charset name.
     *
     * <p>Currently disabled via {@code @Ignore}; the recorded reason is
     * "Not sure", i.e. the expected outcome has not been settled.
     *
     * @throws IOException if encoding the sample text or reading the data fails
     */
    @Test
    @Ignore("Not sure")
    public void testUTF16() throws IOException {
        final byte[] bigEndian = "ab".getBytes("UTF-16BE");
        Assert.assertEquals("UTF-16BE", detect(bigEndian));

        final byte[] littleEndian = "ab".getBytes("UTF-16LE");
        Assert.assertEquals("UTF-16LE", detect(littleEndian));
    }

    /**
     * Checks that the first eight bytes of a ZIP file produce no detected
     * charset: every byte is below 0x80, but several are control characters
     * rather than printable text.
     *
     * @throws IOException if reading the data fails
     */
    @Test
    public void testZipHeader() throws IOException {
        // "PK" 0x03 0x04 signature followed by four more header bytes.
        final byte[] localFileHeader = {0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x02, 0x00};
        final String charset = detect(localFileHeader);
        Assert.assertNull(charset);
    }

    /**
     * Runs charset detection over an in-memory byte array.
     *
     * @param data the bytes to analyse
     * @return whatever {@code UniversalDetector.detectCharset} reports for the
     *         data (the ZIP test expects {@code null} for non-text input)
     * @throws IOException if the detector fails while reading the stream
     */
    private String detect(byte[] data) throws IOException {
        final ByteArrayInputStream stream = new ByteArrayInputStream(data);
        return UniversalDetector.detectCharset(stream);
    }

}

0 comments on commit 8ba6fb3

Please sign in to comment.