Optimize get_json_object Spark function using simdjson (5179)

marin-ma · Jul 3, 2024 · 1c90fc1 · 1c90fc1
1 parent fc0b9fa
commit 1c90fc1
Show file tree

Hide file tree

Showing 6 changed files with 370 additions and 1 deletion.
diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst
@@ -0,0 +1,35 @@
+==============
+JSON Functions
+==============
+
+JSON Format
+-----------
+
+JSON is a language-independent data format that represents data as
+human-readable text. A JSON text can represent a number, a boolean, a
+string, an array, an object, or a null, with slightly different grammar.
+For instance, a JSON text representing a string must escape all characters
+and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON
+text representing a number does not need to, such as ``123``. A JSON text
+representing an array must enclose the array elements in square brackets,
+such as ``[1,2,3]``. More detailed grammar can be found in
+`this JSON introduction`_.
+
+.. _this JSON introduction: https://www.json.org
+
+JSON Functions
+--------------
+
+.. spark:function:: get_json_object(jsonString, path) -> varchar
+
+    Returns a json object, represented by VARCHAR, from ``jsonString`` by searching ``path``.
+    Valid ``path`` should start with '$' and then contain "[index]", "['field']" or ".field"
+    to define a JSON path. Here are some examples: "$.a" "$.a.b", "$[0]['a'].b". Returns
+    ``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed.
+    Also returns NULL if ``path`` doesn't exist. ::
+
+        SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b'
+        SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}'
+        SELECT get_json_object('{"a":3}', '$.b'); -- NULL (unexisting field)
+        SELECT get_json_object('{"a"-3}'', '$.a'); -- NULL (malformed JSON string)
+        SELECT get_json_object('{"a":3}'', '.a'); -- NULL (malformed JSON path)
diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt
@@ -50,7 +50,8 @@ target_link_libraries(
   velox_functions_spark_specialforms
   velox_is_null_functions
   velox_functions_util
-  Folly::folly)
+  Folly::folly
+  simdjson)
 
 set_property(TARGET velox_functions_spark PROPERTY JOB_POOL_COMPILE
                                                    high_memory_pool)

diff --git a/velox/functions/sparksql/JsonFunctions.h b/velox/functions/sparksql/JsonFunctions.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/functions/prestosql/json/SIMDJsonUtil.h"
+
+namespace facebook::velox::functions::sparksql {
+
+template <typename T>
+struct GetJsonObjectFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  // ASCII input always produces ASCII result.
+  static constexpr bool is_default_ascii_behavior = true;
+
+  FOLLY_ALWAYS_INLINE void initialize(
+      const std::vector<TypePtr>& /*inputTypes*/,
+      const core::QueryConfig& config,
+      const arg_type<Varchar>* /*json*/,
+      const arg_type<Varchar>* jsonPath) {
+    if (jsonPath != nullptr) {
+      if (checkJsonPath(*jsonPath)) {
+        jsonPath_ = removeSingleQuotes(*jsonPath);
+      }
+    }
+  }
+
+  FOLLY_ALWAYS_INLINE bool call(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>& json,
+      const arg_type<Varchar>& jsonPath) {
+    // Spark requires the first char in jsonPath is '$'.
+    if (!checkJsonPath(jsonPath)) {
+      return false;
+    }
+    // jsonPath is "$".
+    if (jsonPath.size() == 1) {
+      result.append(json);
+      return true;
+    }
+    simdjson::ondemand::document jsonDoc;
+    simdjson::padded_string paddedJson(json.data(), json.size());
+    if (simdjsonParse(paddedJson).get(jsonDoc)) {
+      return false;
+    }
+
+    auto rawResult = jsonPath_.has_value()
+        ? jsonDoc.at_path(jsonPath_.value().data())
+        : jsonDoc.at_path(removeSingleQuotes(jsonPath));
+    if (rawResult.error()) {
+      return false;
+    }
+
+    if (!extractStringResult(rawResult, result)) {
+      return false;
+    }
+
+    const char* currentPos;
+    jsonDoc.current_location().get(currentPos);
+    return isValidEndingCharacter(currentPos);
+  }
+
+ private:
+  FOLLY_ALWAYS_INLINE bool checkJsonPath(StringView jsonPath) {
+    // Spark requires the first char in jsonPath is '$'.
+    if (jsonPath.size() < 1 || jsonPath.data()[0] != '$') {
+      return false;
+    }
+    return true;
+  }
+
+  // Spark's json path requires field name surrounded by single quotes if it is
+  // specified in "[]". But simdjson lib requires not. This method just removes
+  // such single quotes, e.g., converts "['a']['b']" to "[a][b]".
+  std::string removeSingleQuotes(StringView jsonPath) {
+    // Skip the initial "$".
+    std::string result(jsonPath.data() + 1, jsonPath.size() - 1);
+    size_t pairEnd = 0;
+    while (true) {
+      auto pairBegin = result.find("['", pairEnd);
+      if (pairBegin == std::string::npos) {
+        break;
+      }
+      pairEnd = result.find("]", pairBegin);
+      if (pairEnd == std::string::npos || result[pairEnd - 1] != '\'') {
+        return "-1";
+      }
+      result.erase(pairEnd - 1, 1);
+      result.erase(pairBegin + 1, 1);
+      pairEnd -= 2;
+    }
+    return result;
+  }
+
+  // Returns true if no error.
+  bool extractStringResult(
+      simdjson::simdjson_result<simdjson::ondemand::value> rawResult,
+      out_type<Varchar>& result) {
+    std::stringstream ss;
+    switch (rawResult.type()) {
+      // For number and bool types, we need to explicitly get the value
+      // for specific types instead of using `ss << rawResult`. Thus, we
+      // can make simdjson's internal parsing position moved and then we
+      // can check the validity of ending character.
+      case simdjson::ondemand::json_type::number: {
+        switch (rawResult.get_number_type()) {
+          case simdjson::ondemand::number_type::unsigned_integer: {
+            uint64_t numberResult;
+            if (!rawResult.get_uint64().get(numberResult)) {
+              ss << numberResult;
+              result.append(ss.str());
+              return true;
+            }
+            return false;
+          }
+          case simdjson::ondemand::number_type::signed_integer: {
+            int64_t numberResult;
+            if (!rawResult.get_int64().get(numberResult)) {
+              ss << numberResult;
+              result.append(ss.str());
+              return true;
+            }
+            return false;
+          }
+          case simdjson::ondemand::number_type::floating_point_number: {
+            double numberResult;
+            if (!rawResult.get_double().get(numberResult)) {
+              ss << rawResult;
+              result.append(ss.str());
+              return true;
+            }
+            return false;
+          }
+          default:
+            VELOX_UNREACHABLE();
+        }
+      }
+      case simdjson::ondemand::json_type::boolean: {
+        bool boolResult;
+        if (!rawResult.get_bool().get(boolResult)) {
+          result.append(boolResult ? "true" : "false");
+          return true;
+        }
+        return false;
+      }
+      case simdjson::ondemand::json_type::string: {
+        std::string_view stringResult;
+        if (!rawResult.get_string().get(stringResult)) {
+          result.append(stringResult);
+          return true;
+        }
+        return false;
+      }
+      case simdjson::ondemand::json_type::object: {
+        // For nested case, e.g., for "{"my": {"hello": 10}}", "$.my" will
+        // return an object type.
+        ss << rawResult;
+        result.append(ss.str());
+        return true;
+      }
+      case simdjson::ondemand::json_type::array: {
+        ss << rawResult;
+        result.append(ss.str());
+        return true;
+      }
+      default: {
+        return false;
+      }
+    }
+  }
+
+  // This is a simple validation by checking whether the obtained result is
+  // followed by valid char. Because ondemand parsing we are using ignores json
+  // format validation for characters following the current parsing position.
+  // As json doc is padded with NULL characters, it's safe to do recursively
+  // check.
+  bool isValidEndingCharacter(const char* currentPos) {
+    char endingChar = *currentPos;
+    if (endingChar == ',' || endingChar == '}' || endingChar == ']') {
+      return true;
+    }
+    // These chars can be prior to a valid ending char.
+    if (endingChar == ' ' || endingChar == '\r' || endingChar == '\n' ||
+        endingChar == '\t') {
+      return isValidEndingCharacter(currentPos++);
+    }
+    return false;
+  }
+
+  std::optional<std::string> jsonPath_;
+};
+
+} // namespace facebook::velox::functions::sparksql
diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp
@@ -35,6 +35,7 @@
 #include "velox/functions/sparksql/DateTimeFunctions.h"
 #include "velox/functions/sparksql/Hash.h"
 #include "velox/functions/sparksql/In.h"
+#include "velox/functions/sparksql/JsonFunctions.h"
 #include "velox/functions/sparksql/LeastGreatest.h"
 #include "velox/functions/sparksql/MightContain.h"
 #include "velox/functions/sparksql/MonotonicallyIncreasingId.h"
@@ -174,6 +175,9 @@ void registerFunctions(const std::string& prefix) {
 
   registerRegexpReplace(prefix);
 
+  registerFunction<GetJsonObjectFunction, Varchar, Varchar, Varchar>(
+      {prefix + "get_json_object"});
+
   // Register string functions.
   registerFunction<sparksql::ChrFunction, Varchar, int64_t>({prefix + "chr"});
   registerFunction<AsciiFunction, int32_t, Varchar>({prefix + "ascii"});

diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt
@@ -32,6 +32,7 @@ add_executable(
   ElementAtTest.cpp
   HashTest.cpp
   InTest.cpp
+  JsonFunctionsTest.cpp
   LeastGreatestTest.cpp
   MakeDecimalTest.cpp
   MakeTimestampTest.cpp