From 133a217190bf2ca16279f35ce8ed20819fe33fd1 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Mon, 21 Oct 2024 14:12:49 +0200 Subject: [PATCH] fast parse --- .../sysds/runtime/io/FrameReaderTextCSV.java | 44 +++++++++++++------ .../sysds/runtime/io/IOUtilFunctions.java | 2 +- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java index 400b2aced90..c531c54855b 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextCSV.java @@ -168,6 +168,19 @@ private void parseLine(String cellStr, String delim, Array[] destA , int row, int clen, double dfillValue, String sfillValue, boolean isFill, Set naValues) { try{ + int from = 0, to = 0; + final int len = cellStr.length(); + final int delimLen = delim.length(); + int c = 0; + while(from < len) { // for all tokens + to = IOUtilFunctions.getTo(cellStr, from, delim, len, delimLen); + String cell = cellStr.substring(from, to); + assignCellGeneric(row, destA, cell, naValues, isFill, dfillValue, sfillValue, false, c); + c++; + from = to + delimLen; + } + + String[] parts = IOUtilFunctions.splitCSV(cellStr, delim, clen); assignColumns(row, (int)clen, destA, parts, naValues, isFill, dfillValue, sfillValue); IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen); @@ -189,15 +202,23 @@ private boolean assignColumnsGeneric(int row, int nCol, Array[] destA, Strin boolean isFill, double dfillValue, String sfillValue) { boolean emptyValuesFound = false; for(int col = 0; col < nCol; col++) { - emptyValuesFound = assignCellGeneric(row, destA, parts, naValues, isFill, dfillValue, sfillValue, emptyValuesFound, col); + emptyValuesFound = assignCellGeneric(row, destA, parts[col], naValues, isFill, dfillValue, sfillValue, emptyValuesFound, col); } + return emptyValuesFound; + } + private boolean assignColumnsNoFillNoNan(int row, int nCol, Array[] destA, String[] parts){ + boolean emptyValuesFound = false; + for(int col = 0; col < nCol; col++) { + emptyValuesFound = assignCellNoNan(row, destA, parts[col], emptyValuesFound, col); + } return emptyValuesFound; } - private boolean assignCellGeneric(int row, Array[] destA, String[] parts, Set naValues, boolean isFill, + + private static boolean assignCellGeneric(int row, Array[] destA, String val, Set naValues, boolean isFill, double dfillValue, String sfillValue, boolean emptyValuesFound, int col) { - String part = IOUtilFunctions.trim(parts[col]); + String part = IOUtilFunctions.trim(val); if(part == null || part.isEmpty() || (naValues != null && naValues.contains(part))) { if(isFill && dfillValue != 0) destA[col].set(row, sfillValue); @@ -208,17 +229,12 @@ private boolean assignCellGeneric(int row, Array[] destA, String[] parts, Set return emptyValuesFound; } - private boolean assignColumnsNoFillNoNan(int row, int nCol, Array[] destA, String[] parts){ - - boolean emptyValuesFound = false; - for(int col = 0; col < nCol; col++) { - String part = IOUtilFunctions.trim(parts[col]); - if(part.isEmpty()) - emptyValuesFound = true; - else - destA[col].set(row, part); - } - + private static boolean assignCellNoNan(int row, Array[] destA, String val, boolean emptyValuesFound, int col) { + String part = IOUtilFunctions.trim(val); + if(part.isEmpty()) + emptyValuesFound = true; + else + destA[col].set(row, part); return emptyValuesFound; } diff --git a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java index 0feb5f484df..4cd9f24a419 100644 --- a/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java +++ b/src/main/java/org/apache/sysds/runtime/io/IOUtilFunctions.java @@ -369,7 +369,7 @@ private static boolean isEmptyMatch(final String str, final int from, final Stri * @param dLen The length of the delimiter string * @return The next index. */ - private static int getTo(final String str, final int from, final String delim, + public static int getTo(final String str, final int from, final String delim, final int len, final int dLen) { final char cq = CSV_QUOTE_CHAR; final int fromP1 = from + 1;