From 57716fc4512a84092f4f6bad087a9c706e316b40 Mon Sep 17 00:00:00 2001 From: Artem Fedorov Date: Tue, 7 Nov 2017 20:31:57 +0300 Subject: [PATCH] Fix/random csv/file encoding (#30) * fix read multibytes char with RandomAccessFile * fix buffered reader position * fix random buffered reader * fix buffered reader * add test --- .../com/blazemeter/csv/BufferedReaderExt.java | 10 +++- .../blazemeter/csv/RandomBufferedReader.java | 48 +++++++++++++++++-- .../blazemeter/csv/RandomCSVReaderTest.java | 48 +++++++++++++++++++ .../src/test/resources/text.csv | 5 ++ 4 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 random-csv-data-set/src/test/resources/text.csv diff --git a/random-csv-data-set/src/main/java/com/blazemeter/csv/BufferedReaderExt.java b/random-csv-data-set/src/main/java/com/blazemeter/csv/BufferedReaderExt.java index 11c0b22..97412ad 100644 --- a/random-csv-data-set/src/main/java/com/blazemeter/csv/BufferedReaderExt.java +++ b/random-csv-data-set/src/main/java/com/blazemeter/csv/BufferedReaderExt.java @@ -22,8 +22,14 @@ public int getPos() { @Override public int read() throws IOException { - pos++; - return super.read(); + int res = super.read(); + if (res <= Byte.MAX_VALUE) { + pos++; + } else { + byte[] buf = new String(new char[]{(char) res}).getBytes(); + pos += buf.length; + } + return res; } @Override diff --git a/random-csv-data-set/src/main/java/com/blazemeter/csv/RandomBufferedReader.java b/random-csv-data-set/src/main/java/com/blazemeter/csv/RandomBufferedReader.java index d043c29..f159f23 100644 --- a/random-csv-data-set/src/main/java/com/blazemeter/csv/RandomBufferedReader.java +++ b/random-csv-data-set/src/main/java/com/blazemeter/csv/RandomBufferedReader.java @@ -1,9 +1,6 @@ package com.blazemeter.csv; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.io.Reader; +import java.io.*; public class RandomBufferedReader extends BufferedReader { @@ -21,7 +18,48 @@ public void
seek(long pos) throws IOException { @Override public int read() throws IOException { - return raf.read(); + return readUtf8Char(raf); + } + + public static int readUtf8Char(final DataInput dataInput) throws IOException { + int char1, char2, char3; + + try { + char1 = dataInput.readByte() & 0xff; + switch (char1 >> 4) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + /* 0xxxxxxx*/ + return (char) char1; + case 12: + case 13: + /* 110x xxxx 10xx xxxx*/ + char2 = dataInput.readByte() & 0xff; + if ((char2 & 0xC0) != 0x80) { + throw new UTFDataFormatException("malformed input"); + } + return (char) (((char1 & 0x1F) << 6) | (char2 & 0x3F)); + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + char2 = dataInput.readByte() & 0xff; + char3 = dataInput.readByte() & 0xff; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { + throw new UTFDataFormatException("malformed input"); + } + return (char) (((char1 & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0)); + default: + /* 10xx xxxx, 1111 xxxx */ + throw new UTFDataFormatException("malformed input"); + } + } catch (EOFException ex) { + return -1; + } } @Override diff --git a/random-csv-data-set/src/test/java/com/blazemeter/csv/RandomCSVReaderTest.java b/random-csv-data-set/src/test/java/com/blazemeter/csv/RandomCSVReaderTest.java index 6b6a70c..1543149 100644 --- a/random-csv-data-set/src/test/java/com/blazemeter/csv/RandomCSVReaderTest.java +++ b/random-csv-data-set/src/test/java/com/blazemeter/csv/RandomCSVReaderTest.java @@ -1,5 +1,6 @@ package com.blazemeter.csv; +import com.mchange.util.AssertException; import kg.apc.emulators.TestJMeterUtils; import org.junit.BeforeClass; import org.junit.Test; @@ -67,6 +68,53 @@ public void testReadWithHeaderAndRepeat() throws Exception { assertEquals("1393227741258", reader.getNextRecord()[0]); } + @Test + public void testMultiBytes() throws Exception { + String path = this.getClass().getResource("/text.csv").getPath(); + + 
RandomCSVReader reader = new RandomCSVReader(path, "UTF-8", ";", true, false, false, false); + + assertEquals("[firstname, lastname, street, city]", + Arrays.toString(reader.getHeader())); + + String[] record; + + for (int i = 0; i < 4; i++) { + assertTrue(reader.hasNextRecord()); + record = reader.getNextRecord(); + assertRecord(record); + } + + assertFalse(reader.hasNextRecord()); + } + + private void assertRecord(String[] record) { + switch (record[0]) { + case "Hänsel" : + assertEquals("Mustermann", record[1]); + assertEquals("Einbahnstraße", record[2]); + assertEquals("Hamburg", record[3]); + break; + case "André" : + assertEquals("Lecompte", record[1]); + assertEquals("Rue du marché", record[2]); + assertEquals("Moÿ-de-l'Aisne", record[3]); + break; + case "Ἀλέξανδρος" : + assertEquals("Павлов", record[1]); + assertEquals("Большая Пироговская улица", record[2]); + assertEquals("Москва́", record[3]); + break; + case "בנימין" : // idea shows incorrect this line. firstname is real Benjamin -> 'בנימין' + assertEquals("يعقوب", record[1]); + assertEquals("Street", record[2]); + assertEquals("Megapolis", record[3]); + break; + default: + throw new AssertException("No such firstname in csv file " + record[0]); + } + } + @Test public void testRecordsCount() throws Exception { String path = this.getClass().getResource("/JMeterCsvResults.csv").getPath(); diff --git a/random-csv-data-set/src/test/resources/text.csv b/random-csv-data-set/src/test/resources/text.csv new file mode 100644 index 0000000..3da7a3f --- /dev/null +++ b/random-csv-data-set/src/test/resources/text.csv @@ -0,0 +1,5 @@ +firstname;lastname;street;city +Hänsel;Mustermann;Einbahnstraße;Hamburg +André;Lecompte;Rue du marché;Moÿ-de-l'Aisne +Ἀλέξανδρος;Павлов;Большая Пироговская улица;Москва́ +בנימין;يعقوب;Street;Megapolis \ No newline at end of file