From 0d9be06ff366b5b36527f817d4e820ba5224a844 Mon Sep 17 00:00:00 2001 From: kevinyhzou Date: Fri, 27 Oct 2023 16:05:48 +0800 Subject: [PATCH] Bug fix csv file cr at end of line --- .../csv_with_cr_end_of_line.csv | 2 ++ .../GlutenClickHouseHiveTableSuite.scala | 23 +++++++++++++++++++ cpp-ch/local-engine/Common/CHUtil.cpp | 1 + 3 files changed, 26 insertions(+) create mode 100644 backends-clickhouse/src/test/resources/text-data/cr_end_of_line/csv_with_cr_end_of_line.csv diff --git a/backends-clickhouse/src/test/resources/text-data/cr_end_of_line/csv_with_cr_end_of_line.csv b/backends-clickhouse/src/test/resources/text-data/cr_end_of_line/csv_with_cr_end_of_line.csv new file mode 100644 index 0000000000000..077ca2c84c539 --- /dev/null +++ b/backends-clickhouse/src/test/resources/text-data/cr_end_of_line/csv_with_cr_end_of_line.csv @@ -0,0 +1,2 @@ +A,110,208819249 +B,112,208819248 C,123,783434434 diff --git a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseHiveTableSuite.scala b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseHiveTableSuite.scala index 5e45bb99d2d8b..0cf3496b9fa1d 100644 --- a/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseHiveTableSuite.scala +++ b/backends-clickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseHiveTableSuite.scala @@ -1005,4 +1005,27 @@ class GlutenClickHouseHiveTableSuite() ) } } + + test("GLUTEN-3548: Bug fix csv allow cr end of line") { + val data_path = rootPath + "/text-data/cr_end_of_line" + spark.sql(s""" + | CREATE TABLE test_tbl_3548( + | a string, + | b string, + | c string) + | ROW FORMAT SERDE + | 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES ( + | 'field.delim'=',' + | ) + | STORED AS INPUTFORMAT + | 'org.apache.hadoop.mapred.TextInputFormat' + |OUTPUTFORMAT + | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' + |LOCATION '$data_path' + |""".stripMargin) + val select_sql = "select * from test_tbl_3548" + compareResultsAgainstVanillaSpark(select_sql, compareResult = true, _ => {}) + spark.sql("DROP TABLE test_tbl_3548") + } } diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp b/cpp-ch/local-engine/Common/CHUtil.cpp index 698eb25686d74..af10b6de62479 100644 --- a/cpp-ch/local-engine/Common/CHUtil.cpp +++ b/cpp-ch/local-engine/Common/CHUtil.cpp @@ -598,6 +598,7 @@ void BackendInitializerUtil::initSettings(std::map & b settings.set("input_format_parquet_import_nested", true); settings.set("input_format_json_read_numbers_as_strings", true); settings.set("input_format_json_read_bools_as_numbers", false); + settings.set("input_format_csv_allow_cr_at_end_of_line", true); settings.set("output_format_orc_string_as_string", true); settings.set("output_format_parquet_version", "1.0"); settings.set("output_format_parquet_compression_method", "snappy");