diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc index 14ab972e60d88..c42775dd82798 100644 --- a/cpp/src/arrow/dataset/file_csv.cc +++ b/cpp/src/arrow/dataset/file_csv.cc @@ -526,6 +526,8 @@ Result> CsvFragmentScanOptions::from( for (auto field : schema->fields()) { column_types[field->name()] = field->type(); } + } else if (key == "strings_can_be_null") { + options->convert_options.strings_can_be_null = parseBool(value); } else { return Status::Invalid("Config " + it.first + "is not supported."); } diff --git a/cpp/src/arrow/engine/CMakeLists.txt b/cpp/src/arrow/engine/CMakeLists.txt index 946425edb8cd5..fcaa242b11487 100644 --- a/cpp/src/arrow/engine/CMakeLists.txt +++ b/cpp/src/arrow/engine/CMakeLists.txt @@ -22,7 +22,6 @@ arrow_install_all_headers("arrow/engine") set(ARROW_SUBSTRAIT_SRCS substrait/expression_internal.cc substrait/extended_expression_internal.cc - substrait/extension_internal.cc substrait/extension_set.cc substrait/extension_types.cc substrait/options.cc diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 480cf30d3033f..8ec68a42cb96a 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -1537,5 +1537,17 @@ Result> ToProto( return std::move(out); } +Status FromProto(const substrait::Expression::Literal& literal, + std::unordered_map& out) { + ARROW_RETURN_IF(!literal.has_map(), Status::Invalid("Literal does not have a map.")); + auto literalMap = literal.map(); + auto size = literalMap.key_values_size(); + for (auto i = 0; i < size; i++) { + substrait::Expression_Literal_Map_KeyValue keyValue = literalMap.key_values(i); + out.emplace(keyValue.key().string(), keyValue.value().string()); + } + return Status::OK(); +} + } // namespace engine } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h b/cpp/src/arrow/engine/substrait/expression_internal.h index 2ce2ee76af20b..9be81b7ab674e 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.h +++ b/cpp/src/arrow/engine/substrait/expression_internal.h @@ -61,5 +61,9 @@ ARROW_ENGINE_EXPORT Result FromProto(const substrait::AggregateFunction&, bool is_hash, const ExtensionSet&, const ConversionOptions&); +ARROW_ENGINE_EXPORT +Status FromProto(const substrait::Expression::Literal& literal, + std::unordered_map& out); + } // namespace engine } // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/extension_internal.cc b/cpp/src/arrow/engine/substrait/extension_internal.cc deleted file mode 100644 index 3b9382763ca0e..0000000000000 --- a/cpp/src/arrow/engine/substrait/extension_internal.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This API is EXPERIMENTAL. - -#include "arrow/engine/substrait/extension_internal.h" - -#include "substrait/algebra.pb.h" - -namespace arrow { -namespace engine { - -Status FromProto(const substrait::extensions::AdvancedExtension& extension, - std::unordered_map& out) { - ARROW_RETURN_IF(!extension.has_enhancement(), - Status::Invalid("AdvancedExtension does not have enhancement.")); - const auto& enhancement = extension.enhancement(); - substrait::Expression expression; - ARROW_RETURN_IF(!enhancement.UnpackTo(&expression), - Status::Invalid("Unpack the Expression failed.")); - ARROW_RETURN_IF(!expression.has_literal(), - Status::Invalid("Expression does not have a literal.")); - auto literal = expression.literal(); - ARROW_RETURN_IF(!literal.has_map(), Status::Invalid("Literal does not have a map.")); - auto literalMap = literal.map(); - auto size = literalMap.key_values_size(); - for (auto i = 0; i < size; i++) { - substrait::Expression_Literal_Map_KeyValue keyValue = literalMap.key_values(i); - out.emplace(keyValue.key().string(), keyValue.value().string()); - } - return Status::OK(); -} -} // namespace engine -} // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/extension_internal.h b/cpp/src/arrow/engine/substrait/extension_internal.h deleted file mode 100644 index 84ae57c3de03b..0000000000000 --- a/cpp/src/arrow/engine/substrait/extension_internal.h +++ /dev/null @@ -1,44 +0,0 @@ - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This API is EXPERIMENTAL. - -#pragma once - -#include - -#include "arrow/compute/type_fwd.h" -#include "arrow/engine/substrait/extension_set.h" -#include "arrow/engine/substrait/options.h" -#include "arrow/engine/substrait/relation.h" -#include "arrow/engine/substrait/visibility.h" -#include "arrow/result.h" -#include "arrow/status.h" - -#include "substrait/extensions/extensions.pb.h" // IWYU pragma: export - -namespace arrow { -namespace engine { - -/// Convert a Substrait ExtendedExpression to a vector of expressions and output names -ARROW_ENGINE_EXPORT -Status FromProto(const substrait::extensions::AdvancedExtension& extension, - std::unordered_map& out); - -} // namespace engine -} // namespace arrow diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc index 8d97410772eb8..02e5c71715aaf 100644 --- a/cpp/src/arrow/engine/substrait/serde.cc +++ b/cpp/src/arrow/engine/substrait/serde.cc @@ -36,7 +36,6 @@ #include "arrow/dataset/file_base.h" #include "arrow/engine/substrait/expression_internal.h" #include "arrow/engine/substrait/extended_expression_internal.h" -#include "arrow/engine/substrait/extension_internal.h" #include "arrow/engine/substrait/extension_set.h" #include "arrow/engine/substrait/plan_internal.h" #include "arrow/engine/substrait/relation.h" @@ -249,10 +248,13 @@ Result DeserializeExpressions( } Status DeserializeMap(const Buffer& buf, - std::unordered_map out) { - ARROW_ASSIGN_OR_RAISE(auto advanced_extension, - ParseFromBuffer(buf)); - return FromProto(advanced_extension, out); + std::unordered_map& out) { + // ARROW_ASSIGN_OR_RAISE(auto advanced_extension, + // ParseFromBuffer(buf)); + // return FromProto(advanced_extension, out); + ARROW_ASSIGN_OR_RAISE(auto literal, + ParseFromBuffer(buf)); + return FromProto(literal, out); } namespace { diff --git a/cpp/src/arrow/engine/substrait/serde.h b/cpp/src/arrow/engine/substrait/serde.h index 1f6d261c2671b..6312ec2394d1b 100644 --- a/cpp/src/arrow/engine/substrait/serde.h +++ b/cpp/src/arrow/engine/substrait/serde.h @@ -185,7 +185,7 @@ ARROW_ENGINE_EXPORT Result DeserializeExpressions( ExtensionSet* ext_set_out = NULLPTR); ARROW_ENGINE_EXPORT Status -DeserializeMap(const Buffer& buf, std::unordered_map out); +DeserializeMap(const Buffer& buf, std::unordered_map& out); /// \brief Deserializes a Substrait Type message to the corresponding Arrow type /// diff --git a/java/dataset/src/main/cpp/jni_wrapper.cc b/java/dataset/src/main/cpp/jni_wrapper.cc index 0fdaa9234d3af..f041b6e9f1b3c 100644 --- a/java/dataset/src/main/cpp/jni_wrapper.cc +++ b/java/dataset/src/main/cpp/jni_wrapper.cc @@ -705,6 +705,15 @@ Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory( std::shared_ptr scan_options = JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); file_format->default_fragment_scan_options = scan_options; +#ifdef ARROW_CSV + if (file_format_id == 3) { + std::shared_ptr csv_file_format = + std::dynamic_pointer_cast(file_format); + csv_file_format->parse_options = + std::dynamic_pointer_cast(scan_options) + ->parse_options; + } +#endif } arrow::dataset::FileSystemFactoryOptions options; std::shared_ptr d = @@ -733,6 +742,15 @@ Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactoryWithFi std::shared_ptr scan_options = JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map)); file_format->default_fragment_scan_options = scan_options; +#ifdef ARROW_CSV + if (file_format_id == 3) { + std::shared_ptr csv_file_format = + std::dynamic_pointer_cast(file_format); + csv_file_format->parse_options = + std::dynamic_pointer_cast(scan_options) + ->parse_options; + } +#endif } arrow::dataset::FileSystemFactoryOptions options; diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java index 844448411a0c3..bd83f0d7e879f 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/scanner/FragmentScanOptions.java @@ -23,6 +23,7 @@ import org.apache.arrow.dataset.substrait.util.ConvertUtil; import io.substrait.proto.AdvancedExtension; +import io.substrait.proto.Expression; public interface FragmentScanOptions { String typeName(); @@ -42,9 +43,11 @@ default ByteBuffer serializeMap(Map config) { return null; } - AdvancedExtension extension = ConvertUtil.expressionToExtension(ConvertUtil.mapToExpression(config)); - ByteBuffer buf = ByteBuffer.allocateDirect(extension.getSerializedSize()); - buf.put(extension.toByteArray()); + Expression.Literal literal = ConvertUtil.mapToExpressionLiteral(config); + +// AdvancedExtension extension = ConvertUtil.expressionToExtension(ConvertUtil.mapToExpression(config)); + ByteBuffer buf = ByteBuffer.allocateDirect(literal.getSerializedSize()); + buf.put(literal.toByteArray()); return buf; } } diff --git a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java index 658ac19345f93..31a4023af727b 100644 --- a/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java +++ b/java/dataset/src/main/java/org/apache/arrow/dataset/substrait/util/ConvertUtil.java @@ -19,9 +19,6 @@ import java.util.Map; -import com.google.protobuf.Any; - -import io.substrait.proto.AdvancedExtension; import io.substrait.proto.Expression; public class ConvertUtil { @@ -31,7 +28,7 @@ public class ConvertUtil { * * @return Substrait Expression */ - public static Expression mapToExpression(Map values) { + public static Expression.Literal mapToExpressionLiteral(Map values) { Expression.Literal.Builder literalBuilder = Expression.Literal.newBuilder(); Expression.Literal.Map.KeyValue.Builder keyValueBuilder = Expression.Literal.Map.KeyValue.newBuilder(); @@ -44,19 +41,6 @@ public static Expression mapToExpression(Map values) { mapBuilder.addKeyValues(keyValueBuilder.build()); } literalBuilder.setMap(mapBuilder.build()); - return Expression.newBuilder().setLiteral(literalBuilder.build()).build(); - } - - /** - * Add substrait expression to AdvancedExtension. - * - * @param expr Substrait Expression. - * @return Substrait AdvancedExtension - */ - public static AdvancedExtension expressionToExtension(Expression expr) { - AdvancedExtension.Builder extensionBuilder = AdvancedExtension.newBuilder(); - Any.Builder builder = extensionBuilder.getEnhancementBuilder(); - builder.setValue(expr.toByteString()); - return extensionBuilder.build(); + return literalBuilder.build(); } }