diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 7dab8c82f69..9217f5bad83 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -163,7 +163,7 @@ def configure(self): def layout(self): cmake_layout(self, src_folder="src") - def _requires_rapidjson(self): + def _requires_json_library(self): return self.options.with_json or self.options.encryption def requirements(self): @@ -185,8 +185,9 @@ def requirements(self): self.requires("google-cloud-cpp/1.40.1") if self.options.with_grpc: self.requires("grpc/1.50.0") - if self._requires_rapidjson(): + if self._requires_json_library(): self.requires("rapidjson/1.1.0") + self.requires("simdjson/4.2.2") if self.options.with_llvm: self.requires("llvm-core/13.0.0") if self.options.with_openssl: @@ -371,6 +372,7 @@ def generate(self): tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.dependencies["lz4"].options.shared) tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy) tc.variables["RapidJSON_SOURCE"] = "SYSTEM" + tc.variables["SIMDJSON_SOURCE"] = "SYSTEM" tc.variables["Snappy_SOURCE"] = "SYSTEM" if self.options.with_snappy: tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.dependencies["snappy"].options.shared) @@ -559,8 +561,9 @@ def package_info(self): self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace") if self.options.with_cuda: self.cpp_info.components["libarrow"].requires.append("cuda::cuda") - if self._requires_rapidjson(): + if self._requires_json_library(): self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson") + self.cpp_info.components["libarrow"].requires.append("simdjson::simdjson") if self.options.with_s3: # https://github.com/apache/arrow/blob/6b268f62a8a172249ef35f093009c740c32e1f36/cpp/src/arrow/CMakeLists.txt#L98 self.cpp_info.components["libarrow"].requires.extend([f"aws-sdk-cpp::{x}" for x in ["cognito-identity", "core", "identity-management", "s3", "sts"]]) diff --git 
a/ci/docker/alpine-linux-3.22-cpp.dockerfile b/ci/docker/alpine-linux-3.22-cpp.dockerfile index 48907e61a4a..fe5a0c443f9 100644 --- a/ci/docker/alpine-linux-3.22-cpp.dockerfile +++ b/ci/docker/alpine-linux-3.22-cpp.dockerfile @@ -59,6 +59,7 @@ RUN apk add \ re2-dev \ rsync \ samurai \ + simdjson-dev \ snappy-dev \ sqlite-dev \ thrift-dev \ diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 44c845bb17e..1ec5ea57446 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -71,6 +71,7 @@ RUN apt-get update -y -q && \ libpsl-dev \ libre2-dev \ librtmp-dev \ + libsimdjson-dev \ libsnappy-dev \ libsqlite3-dev \ libssh-dev \ diff --git a/ci/docker/debian-13-cpp.dockerfile b/ci/docker/debian-13-cpp.dockerfile index ca96b4177ff..aca4b244644 100644 --- a/ci/docker/debian-13-cpp.dockerfile +++ b/ci/docker/debian-13-cpp.dockerfile @@ -71,6 +71,7 @@ RUN apt-get update -y -q && \ libpsl-dev \ libre2-dev \ librtmp-dev \ + libsimdjson-dev \ libsnappy-dev \ libsqlite3-dev \ libssh-dev \ diff --git a/ci/docker/debian-experimental-cpp.dockerfile b/ci/docker/debian-experimental-cpp.dockerfile index d37b58e2307..b0a67e227f5 100644 --- a/ci/docker/debian-experimental-cpp.dockerfile +++ b/ci/docker/debian-experimental-cpp.dockerfile @@ -65,6 +65,7 @@ RUN if [ -n "${gcc}" ]; then \ libpsl-dev \ libre2-dev \ librtmp-dev \ + libsimdjson-dev \ libsnappy-dev \ libsqlite3-dev \ libssh-dev \ diff --git a/ci/docker/fedora-42-cpp.dockerfile b/ci/docker/fedora-42-cpp.dockerfile index cabb066fec3..fd04edb6915 100644 --- a/ci/docker/fedora-42-cpp.dockerfile +++ b/ci/docker/fedora-42-cpp.dockerfile @@ -60,6 +60,7 @@ RUN dnf update -y && \ python-pip \ rapidjson-devel \ re2-devel \ + simdjson-devel \ snappy-devel \ thrift-devel \ utf8proc-devel \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 88a27efe335..01500c86e94 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ 
b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -93,6 +93,7 @@ RUN apt-get update -y -q && \ libradospp-dev \ libre2-dev \ librtmp-dev \ + libsimdjson-dev \ libsnappy-dev \ libsqlite3-dev \ libssh-dev \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 0347d452d7b..1af6543626b 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -94,6 +94,7 @@ RUN apt-get update -y -q && \ libradospp-dev \ libre2-dev \ librtmp-dev \ + libsimdjson-dev \ libsnappy-dev \ libsqlite3-dev \ libssh-dev \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 79b64dbc2a4..62e8b103935 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -280,6 +280,7 @@ else -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ -Dre2_SOURCE=${re2_SOURCE:-} \ + -DSIMDJSON_SOURCE=${SIMDJSON_SOURCE:-} \ -DSnappy_SOURCE=${Snappy_SOURCE:-} \ -DThrift_SOURCE=${Thrift_SOURCE:-} \ -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index b95d6491457..c48e7a7b5bc 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -64,6 +64,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES re2 Protobuf RapidJSON + simdjson Snappy Substrait Thrift @@ -211,6 +212,8 @@ macro(build_dependency DEPENDENCY_NAME) build_rapidjson() elseif("${DEPENDENCY_NAME}" STREQUAL "re2") build_re2() + elseif("${DEPENDENCY_NAME}" STREQUAL "simdjson") + build_simdjson() elseif("${DEPENDENCY_NAME}" STREQUAL "Snappy") build_snappy() elseif("${DEPENDENCY_NAME}" STREQUAL "Substrait") @@ -381,6 +384,7 @@ endif() if(ARROW_PARQUET) set(ARROW_WITH_RAPIDJSON ON) + set(ARROW_WITH_SIMDJSON ON) set(ARROW_WITH_THRIFT ON) endif() @@ -409,6 +413,7 @@ endif() if(ARROW_JSON OR ARROW_FLIGHT_SQL_ODBC) set(ARROW_WITH_RAPIDJSON ON) + set(ARROW_WITH_SIMDJSON ON) endif() if(ARROW_ORC OR 
ARROW_FLIGHT) @@ -780,6 +785,14 @@ else() "https://github.com/aws/s2n-tls/archive/${ARROW_S2N_TLS_BUILD_VERSION}.tar.gz") endif() +if(DEFINED ENV{ARROW_SIMDJSON_URL}) + set(SIMDJSON_SOURCE_URL "$ENV{ARROW_SIMDJSON_URL}") +else() + set_urls(SIMDJSON_SOURCE_URL + "https://github.com/simdjson/simdjson/archive/${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz" + "${THIRDPARTY_MIRROR_URL}/simdjson-${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz") +endif() + if(DEFINED ENV{ARROW_SNAPPY_URL}) set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}") else() @@ -2613,6 +2626,45 @@ if(ARROW_WITH_RAPIDJSON) FALSE) endif() +macro(build_simdjson) + message(STATUS "Building simdjson from source") + set(SIMDJSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/simdjson_ep/src/simdjson_ep-install") + set(SIMDJSON_CMAKE_ARGS + ${EP_COMMON_CMAKE_ARGS} + -DSIMDJSON_JUST_LIBRARY=ON + -DSIMDJSON_BUILD_STATIC_LIB=ON + "-DCMAKE_INSTALL_PREFIX=${SIMDJSON_PREFIX}") + + externalproject_add(simdjson_ep + ${EP_COMMON_OPTIONS} + PREFIX "${CMAKE_BINARY_DIR}" + URL ${SIMDJSON_SOURCE_URL} + URL_HASH "SHA256=${ARROW_SIMDJSON_BUILD_SHA256_CHECKSUM}" + CMAKE_ARGS ${SIMDJSON_CMAKE_ARGS}) + + set(SIMDJSON_INCLUDE_DIR "${SIMDJSON_PREFIX}/include") + set(SIMDJSON_STATIC_LIB + "${SIMDJSON_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}simdjson${CMAKE_STATIC_LIBRARY_SUFFIX}") + + # The include directory must exist before it is referenced by a target. 
+ file(MAKE_DIRECTORY "${SIMDJSON_INCLUDE_DIR}") + + add_library(simdjson::simdjson STATIC IMPORTED) + set_target_properties(simdjson::simdjson + PROPERTIES IMPORTED_LOCATION "${SIMDJSON_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${SIMDJSON_INCLUDE_DIR}") + add_dependencies(simdjson::simdjson simdjson_ep) + + set(SIMDJSON_VENDORED TRUE) +endmacro() + +if(ARROW_WITH_SIMDJSON) + resolve_dependency(simdjson + USE_CONFIG + IS_RUNTIME_DEPENDENCY + FALSE) +endif() + macro(build_xsimd) message(STATUS "Building xsimd from source") set(XSIMD_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index df9b783d531..0e96682b0f9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -622,6 +622,11 @@ if(ARROW_WITH_RAPIDJSON) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE RapidJSON) endforeach() endif() +if(ARROW_WITH_SIMDJSON) + foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) + target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE simdjson::simdjson) + endforeach() +endif() if(ARROW_WITH_ZLIB) foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ZLIB::ZLIB) @@ -1003,6 +1008,7 @@ if(ARROW_JSON) json/reader.cc) foreach(ARROW_JSON_TARGET ${ARROW_JSON_TARGETS}) target_link_libraries(${ARROW_JSON_TARGET} PRIVATE RapidJSON) + target_link_libraries(${ARROW_JSON_TARGET} PRIVATE simdjson::simdjson) endforeach() else() set(ARROW_JSON_TARGET_SHARED) diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 326eb24d083..3f97633e397 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -171,7 +171,7 @@ TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithoutFloat) { } TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) { - auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, NaN]"); + auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 
2.0, \"NaN\"]"); ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({float64_array})); ASSERT_FALSE(chunked_array->Equals(chunked_array)); @@ -182,7 +182,7 @@ TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) { } TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithFloat) { - auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, NaN]"); + auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, \"NaN\"]"); ASSERT_OK_AND_ASSIGN(auto struct_array, StructArray::Make({float64_array}, {"Float64Type"})); ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array})); diff --git a/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc b/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc index 53c28032b82..55172f2519d 100644 --- a/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc @@ -930,24 +930,24 @@ TEST(TestCumulativeMean, ConvenienceFunction) { TEST(TestCumulative, NaN) { // addition with NaN is always NaN - CheckVectorUnary("cumulative_sum", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), - ArrayFromJSON(float64(), "[1, 3, NaN, NaN, NaN]")); + CheckVectorUnary("cumulative_sum", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"), + ArrayFromJSON(float64(), R"([1, 3, "NaN", "NaN", "NaN"])")); // multiply with Nan is always NaN - CheckVectorUnary("cumulative_prod", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), - ArrayFromJSON(float64(), "[1, 2, NaN, NaN, NaN]")); + CheckVectorUnary("cumulative_prod", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"), + ArrayFromJSON(float64(), R"([1, 2, "NaN", "NaN", "NaN"])")); // max with NaN is always ignored because Nan > a always returns false - CheckVectorUnary("cumulative_max", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), + CheckVectorUnary("cumulative_max", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"), ArrayFromJSON(float64(), "[1, 2, 2, 4, 5]")); // min with NaN is always ignored because Nan < a 
always returns false - CheckVectorUnary("cumulative_min", ArrayFromJSON(float64(), "[5, 4, NaN, 2, 1]"), + CheckVectorUnary("cumulative_min", ArrayFromJSON(float64(), "[5, 4, \"NaN\", 2, 1]"), ArrayFromJSON(float64(), "[5, 4, 4, 2, 1]")); // mean with NaN is always Nan - CheckVectorUnary("cumulative_mean", ArrayFromJSON(float64(), "[5, 4, NaN, 2, 1]"), - ArrayFromJSON(float64(), "[5, 4.5, NaN, NaN, NaN]")); + CheckVectorUnary("cumulative_mean", ArrayFromJSON(float64(), "[5, 4, \"NaN\", 2, 1]"), + ArrayFromJSON(float64(), R"([5, 4.5, "NaN", "NaN", "NaN"])")); } } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/json/from_string.cc b/cpp/src/arrow/json/from_string.cc index e35a362f5a2..e17c614f67e 100644 --- a/cpp/src/arrow/json/from_string.cc +++ b/cpp/src/arrow/json/from_string.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -32,23 +33,21 @@ #include "arrow/array/builder_union.h" #include "arrow/chunked_array.h" #include "arrow/json/from_string.h" +#include "arrow/result.h" #include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" +#include "arrow/util/unreachable.h" #include "arrow/util/value_parsing.h" -#include "arrow/json/rapidjson_defs.h" +#include -#include -#include -#include -#include -#include - -namespace rj = arrow::rapidjson; +namespace sj = simdjson::ondemand; namespace arrow { @@ -62,30 +61,21 @@ using ::arrow::internal::checked_pointer_cast; namespace { -constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; - -const char* JsonTypeName(rj::Type json_type) { - switch (json_type) { - case rapidjson::kNullType: - return "null"; - case rapidjson::kFalseType: - return "false"; - case rapidjson::kTrueType: - return "true"; - 
case rapidjson::kObjectType: - return "object"; - case rapidjson::kArrayType: - return "array"; - case rapidjson::kStringType: - return "string"; - case rapidjson::kNumberType: - return "number"; - default: - return "unknown"; - } +// TODO.TAE constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; + +const char* JsonTypeName(sj::json_type type) { + switch (type) { + case sj::json_type::array: return "array"; + case sj::json_type::object: return "object"; + case sj::json_type::number: return "number"; + case sj::json_type::string: return "string"; + case sj::json_type::boolean: return "boolean"; + case sj::json_type::null: return "null"; + default: return "unknown"; + } } -Status JSONTypeError(const char* expected_type, rj::Type json_type) { +Status JSONTypeError(const char* expected_type, sj::json_type json_type) { return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", JsonTypeName(json_type)); } @@ -96,11 +86,12 @@ class JSONConverter { virtual Status Init() { return Status::OK(); } - virtual Status AppendValue(const rj::Value& json_obj) = 0; + virtual Status AppendValue(sj::value& json_obj) = 0; Status AppendNull() { return this->builder()->AppendNull(); } - virtual Status AppendValues(const rj::Value& json_array) = 0; + // TODO.TAE returns the number of elements that were appended + virtual Result AppendValues(sj::array& json_array) = 0; virtual std::shared_ptr builder() = 0; @@ -124,20 +115,18 @@ Status GetConverter(const std::shared_ptr&, template class ConcreteConverter : public JSONConverter { public: - Result SizeOfJSONArray(const rj::Value& json_obj) { - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - return json_obj.Size(); - } - - Status AppendValues(const rj::Value& json_array) final { + Result AppendValues(sj::array& json_array) final { auto self = static_cast(this); - ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); - for (uint32_t i = 0; i < size; 
++i) { - RETURN_NOT_OK(self->AppendValue(json_array[i])); + int32_t num_elements = 0; + for (auto element : json_array) { + sj::value value; + if(element.get(value) != simdjson::SUCCESS){ + return Status::Invalid("Could not iterate elements of array: ", json_array.raw_json()); + } + RETURN_NOT_OK(self->AppendValue(value)); + num_elements++; } - return Status::OK(); + return num_elements; } const std::shared_ptr& value_type() { @@ -167,11 +156,11 @@ class NullConverter final : public ConcreteConverter { builder_ = std::make_shared(); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return AppendNull(); } - return JSONTypeError("null", json_obj.GetType()); + return JSONTypeError("null", json_obj.type()); } std::shared_ptr builder() override { return builder_; } @@ -190,17 +179,19 @@ class BooleanConverter final : public ConcreteConverter { builder_ = std::make_shared(); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return AppendNull(); } - if (json_obj.IsBool()) { - return builder_->Append(json_obj.GetBool()); + bool bool_value; + if (json_obj.get(bool_value) == simdjson::SUCCESS) { + return builder_->Append(bool_value); } - if (json_obj.IsInt()) { - return builder_->Append(json_obj.GetInt() != 0); + int64_t int_value; + if (json_obj.get(int_value) == simdjson::SUCCESS) { + return builder_->Append(int_value != 0); } - return JSONTypeError("boolean", json_obj.GetType()); + return JSONTypeError("boolean", json_obj.type()); } std::shared_ptr builder() override { return builder_; } @@ -214,11 +205,12 @@ class BooleanConverter final : public ConcreteConverter { // Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) template -enable_if_physical_signed_integer ConvertNumber(const rj::Value& json_obj, 
+enable_if_physical_signed_integer ConvertNumber(sj::value& json_obj, const DataType& type, typename T::c_type* out) { - if (json_obj.IsInt64()) { - int64_t v64 = json_obj.GetInt64(); + + int64_t v64; + if (json_obj.get(v64) == simdjson::SUCCESS) { *out = static_cast(v64); if (*out == v64) { return Status::OK(); @@ -227,17 +219,18 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json } } else { *out = static_cast(0); - return JSONTypeError("signed int", json_obj.GetType()); + return JSONTypeError("int", json_obj.type()); } } // Convert single unsigned integer value template -enable_if_unsigned_integer ConvertNumber(const rj::Value& json_obj, +enable_if_unsigned_integer ConvertNumber(sj::value& json_obj, const DataType& type, typename T::c_type* out) { - if (json_obj.IsUint64()) { - uint64_t v64 = json_obj.GetUint64(); + + uint64_t v64; + if (json_obj.get(v64) == simdjson::SUCCESS) { *out = static_cast(v64); if (*out == v64) { return Status::OK(); @@ -246,46 +239,125 @@ enable_if_unsigned_integer ConvertNumber(const rj::Value& json_obj, } } else { *out = static_cast(0); - return JSONTypeError("unsigned int", json_obj.GetType()); + return JSONTypeError("unsigned int", json_obj.type()); } } // Convert float16/HalfFloatType template -enable_if_half_float ConvertNumber(const rj::Value& json_obj, +enable_if_half_float ConvertNumber(sj::value& json_obj, const DataType& type, uint16_t* out) { - if (json_obj.IsDouble()) { - double f64 = json_obj.GetDouble(); + double f64; + if (json_obj.get(f64) == simdjson::SUCCESS) { *out = Float16(f64).bits(); return Status::OK(); - } else if (json_obj.IsUint()) { - uint32_t u32t = json_obj.GetUint(); - double f64 = static_cast(u32t); + } + uint64_t u64t; + if (json_obj.get(u64t) == simdjson::SUCCESS) {  // error_code is non-zero on failure, so compare explicitly + auto f64 = static_cast(u64t); *out = Float16(f64).bits(); return Status::OK(); - } else if (json_obj.IsInt()) { - int32_t i32t = json_obj.GetInt(); - double f64 = static_cast(i32t); + } + int64_t i64t; + if (json_obj.get(i64t) == simdjson::SUCCESS) {  // same: only use i64t when the read succeeded + auto 
f64 = static_cast(i64t); *out = Float16(f64).bits(); return Status::OK(); - } else { - *out = static_cast(0); - return JSONTypeError("unsigned int", json_obj.GetType()); } + std::string_view str; + if (json_obj.get(str) == simdjson::SUCCESS) { + if(str == "NaN") { + *out = Float16(std::numeric_limits::quiet_NaN()).bits(); + return Status::OK(); + } + else if (str == "Inf" || str == "Infinity") { + *out = Float16(std::numeric_limits::infinity()).bits(); + return Status::OK(); + } + else if (str == "-Inf" || str == "-Infinity") { + *out = Float16(-std::numeric_limits::infinity()).bits(); + return Status::OK(); + } + } + *out = static_cast(0); + return JSONTypeError("number", json_obj.type()); } // Convert single floating point value template -enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, +enable_if_physical_floating_point ConvertNumber(sj::value& json_obj, const DataType& type, typename T::c_type* out) { - if (json_obj.IsNumber()) { - *out = static_cast(json_obj.GetDouble()); + sj::number number; + if (json_obj.get(number) == simdjson::SUCCESS) { + *out = static_cast(number.as_double()); return Status::OK(); - } else { - *out = static_cast(0); - return JSONTypeError("number", json_obj.GetType()); } + std::string_view str; + if (json_obj.get(str) == simdjson::SUCCESS) { + if(str == "NaN") { + *out = static_cast(std::numeric_limits::quiet_NaN()); + return Status::OK(); + } + else if (str == "Inf" || str == "Infinity") { + *out = static_cast(std::numeric_limits::infinity()); + return Status::OK(); + } + else if (str == "-Inf" || str == "-Infinity") { + *out = static_cast(-std::numeric_limits::infinity()); + return Status::OK(); + } + } + *out = static_cast(0); + return JSONTypeError("number", json_obj.type()); +} + + +// ------------------------------------------------------------------------ +// Helper to process a JSON array with exactly N elements, calling a handler for each. 
+// Each handler is a callable taking sj::value& and returning Status. +template +Status ProcessJsonArrayElements(sj::array& json_array, const char* error_context, + Handlers&&... handlers) { + constexpr size_t expected_size = sizeof...(Handlers); + auto it = json_array.begin(); + auto end = json_array.end(); + + size_t index = 0; + Status result = Status::OK(); + + auto process_one = [&](auto&& handler) -> bool { + if (!result.ok()) return false; + + if (it == end) { + result = Status::Invalid(error_context, " must have exactly ", expected_size, + " elements, had ", index); + return false; + } + + sj::value element; + auto error = (*it).get(element); + if (error) { + result = Status::Invalid("Failed to get element ", index, " from ", error_context); + return false; + } + + result = handler(element); + ++it; + ++index; + return result.ok(); + }; + + // Use fold expression to process all handlers in order + (process_one(std::forward(handlers)) && ...); + + if (!result.ok()) return result; + + if (it != end) { + return Status::Invalid(error_context, " must have exactly ", expected_size, + " elements, had more"); + } + return Status::OK(); } // ------------------------------------------------------------------------ @@ -303,8 +375,8 @@ class IntegerConverter final Status Init() override { return this->MakeConcreteBuilder(&builder_); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } c_type value; @@ -330,8 +402,8 @@ class FloatConverter final : public ConcreteConverterMakeConcreteBuilder(&builder_); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } c_type value; @@ -360,22 +432,22 @@ class DecimalConverter final Status Init() override { return 
this->MakeConcreteBuilder(&builder_); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - if (json_obj.IsString()) { + std::string_view string_value; + if (json_obj.get(string_value) == simdjson::SUCCESS) { int32_t precision, scale; DecimalValue d; - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); - RETURN_NOT_OK(DecimalValue::FromString(view, &d, &precision, &scale)); + RETURN_NOT_OK(DecimalValue::FromString(string_value, &d, &precision, &scale)); if (scale != decimal_type_->scale()) { return Status::Invalid("Invalid scale for decimal: expected ", decimal_type_->scale(), ", got ", scale); } return builder_->Append(d); } - return JSONTypeError("decimal string", json_obj.GetType()); + return JSONTypeError("decimal string", json_obj.type()); } std::shared_ptr builder() override { return builder_; } @@ -405,21 +477,19 @@ class TimestampConverter final : public ConcreteConverter { builder_ = std::make_shared(type, default_memory_pool()); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } int64_t value; - if (json_obj.IsNumber()) { - RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); - } else if (json_obj.IsString()) { - std::string_view view(json_obj.GetString(), json_obj.GetStringLength()); + std::string_view view;  // a JSON string is parsed as a timestamp; any other value must convert as an integer + if (json_obj.get(view) == simdjson::SUCCESS) { if (!ParseValue(*timestamp_type_, view.data(), view.size(), &value)) { return Status::Invalid("couldn't parse timestamp from ", view); } } else { - return JSONTypeError("timestamp", json_obj.GetType()); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); } return builder_->Append(value); } @@ -441,21 +511,25 @@ class 
DayTimeIntervalConverter final builder_ = std::make_shared(default_memory_pool()); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - DayTimeIntervalType::DayMilliseconds value; - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 2) { - return Status::Invalid( - "day time interval pair must have exactly two elements, had ", json_obj.Size()); + + sj::array array; + if (json_obj.get(array) != simdjson::SUCCESS) { + return JSONTypeError("array", json_obj.type()); } - RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.days)); - RETURN_NOT_OK( - ConvertNumber(json_obj[1], *this->type_, &value.milliseconds)); + + DayTimeIntervalType::DayMilliseconds value; + RETURN_NOT_OK(ProcessJsonArrayElements( + array, "day-time interval", + [this, &value](sj::value& elem) { + return ConvertNumber(elem, *this->type_, &value.days); + }, + [this, &value](sj::value& elem) { + return ConvertNumber(elem, *this->type_, &value.milliseconds); + })); return builder_->Append(value); } @@ -473,23 +547,28 @@ class MonthDayNanoIntervalConverter final builder_ = std::make_shared(default_memory_pool()); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - MonthDayNanoIntervalType::MonthDayNanos value; - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 3) { - return Status::Invalid( - "month_day_nano_interval must have exactly 3 elements, had ", json_obj.Size()); + + sj::array array; + if (json_obj.get(array) != simdjson::SUCCESS) { + return JSONTypeError("array", json_obj.type()); } - RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.months)); - 
RETURN_NOT_OK(ConvertNumber(json_obj[1], *this->type_, &value.days)); - RETURN_NOT_OK( - ConvertNumber(json_obj[2], *this->type_, &value.nanoseconds)); + MonthDayNanoIntervalType::MonthDayNanos value; + RETURN_NOT_OK(ProcessJsonArrayElements( + array, "month-day-nano interval", + [this, &value](sj::value& elem) { + return ConvertNumber(elem, *this->type_, &value.months); + }, + [this, &value](sj::value& elem) { + return ConvertNumber(elem, *this->type_, &value.days); + }, + [this, &value](sj::value& elem) { + return ConvertNumber(elem, *this->type_, &value.nanoseconds); + })); return builder_->Append(value); } @@ -510,15 +589,15 @@ class StringConverter final Status Init() override { return this->MakeConcreteBuilder(&builder_); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - if (json_obj.IsString()) { - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); + std::string_view view; + if (json_obj.get(view) == simdjson::SUCCESS) { return builder_->Append(view); } else { - return JSONTypeError("string", json_obj.GetType()); + return JSONTypeError("string", json_obj.type()); } } @@ -541,12 +620,12 @@ class FixedSizeBinaryConverter final Status Init() override { return this->MakeConcreteBuilder(&builder_); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - if (json_obj.IsString()) { - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); + std::string_view view; + if (json_obj.get(view) == simdjson::SUCCESS) { if (view.length() != static_cast(builder_->byte_width())) { std::stringstream ss; ss << "Invalid string length " << view.length() << " in JSON input for " @@ -555,7 +634,7 @@ class FixedSizeBinaryConverter final } 
return builder_->Append(view); } else { - return JSONTypeError("string", json_obj.GetType()); + return JSONTypeError("string", json_obj.type()); } } @@ -588,14 +667,17 @@ class VarLengthListLikeConverter final return Status::OK(); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } + sj::array array; + if(json_obj.get(array) != simdjson::SUCCESS){ + return JSONTypeError("array", json_obj.type()); + } // Extend the child converter with this JSON array - ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); - RETURN_NOT_OK(builder_->Append(true, size)); - return child_converter_->AppendValues(json_obj); + ARROW_ASSIGN_OR_RAISE(int32_t size, child_converter_->AppendValues(array)); + return builder_->Append(true, size); } std::shared_ptr builder() override { return builder_; } @@ -623,29 +705,31 @@ class MapConverter final : public ConcreteConverter { return Status::OK(); } - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } RETURN_NOT_OK(builder_->Append()); - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); + sj::array array; + if (json_obj.get(array) != simdjson::SUCCESS) { + return JSONTypeError("array", json_obj.type()); } - auto size = json_obj.Size(); - for (uint32_t i = 0; i < size; ++i) { - const auto& json_pair = json_obj[i]; - if (!json_pair.IsArray()) { - return JSONTypeError("array", json_pair.GetType()); - } - if (json_pair.Size() != 2) { - return Status::Invalid("key item pair must have exactly two elements, had ", - json_pair.Size()); - } - if (json_pair[0].IsNull()) { - return Status::Invalid("null key is invalid"); + + for (auto json_pair : array) { + sj::array json_pair_array; + if (json_pair.get(json_pair_array) != 
simdjson::SUCCESS) { + return JSONTypeError("array", json_pair.type()); } - RETURN_NOT_OK(key_converter_->AppendValue(json_pair[0])); - RETURN_NOT_OK(item_converter_->AppendValue(json_pair[1])); + + RETURN_NOT_OK(ProcessJsonArrayElements( + json_pair_array, "key-item pair", + [this](sj::value& key) { + if (key.is_null()) { + return Status::Invalid("null key is invalid"); + } + return key_converter_->AppendValue(key); + }, + [this](sj::value& item) { return item_converter_->AppendValue(item); })); } return Status::OK(); } @@ -674,15 +758,19 @@ class FixedSizeListConverter final : public ConcreteConverterAppendNull(); } RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array - RETURN_NOT_OK(child_converter_->AppendValues(json_obj)); - if (json_obj.GetArray().Size() != static_cast(list_size_)) { - return Status::Invalid("incorrect list size ", json_obj.GetArray().Size()); + sj::array array; + if(json_obj.get(array) != simdjson::SUCCESS){ + return JSONTypeError("array", json_obj.type()); + } + ARROW_ASSIGN_OR_RAISE(int32_t size, child_converter_->AppendValues(array)); + if (size != list_size_) { + return Status::Invalid("incorrect list size ", size); } return Status::OK(); } @@ -718,45 +806,52 @@ class StructConverter final : public ConcreteConverter { // Append a JSON value that is either an array of N elements in order // or an object mapping struct names to values (omitted struct members // are mapped to null). 
- Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - if (json_obj.IsArray()) { - auto size = json_obj.Size(); + sj::array array; + if (json_obj.get(array) == simdjson::SUCCESS) { auto expected_size = static_cast(type_->num_fields()); - if (size != expected_size) { - return Status::Invalid("Expected array of size ", expected_size, - ", got array of size ", size); + uint32_t i = 0; + for (auto child : array) { + // TODO.TAE no unsafe + sj::value child_value = child.value_unsafe(); + RETURN_NOT_OK(child_converters_[i]->AppendValue(child_value)); + ++i; } - for (uint32_t i = 0; i < size; ++i) { - RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + if (i != expected_size) { + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", i); } return builder_->Append(); } - if (json_obj.IsObject()) { - auto remaining = json_obj.MemberCount(); - auto num_children = type_->num_fields(); - for (int32_t i = 0; i < num_children; ++i) { + sj::object object; + if (json_obj.get(object) == simdjson::SUCCESS) { + size_t remaining_num_fields_in_json; + if(object.count_fields().get(remaining_num_fields_in_json) != simdjson::SUCCESS){ + return Status::Invalid("Malformed json object: ", object.raw_json()); + } + auto num_fields = type_->num_fields(); + for (int32_t i = 0; i < num_fields; ++i) { const auto& field = type_->field(i); - auto it = json_obj.FindMember(field->name()); - if (it != json_obj.MemberEnd()) { - --remaining; - RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + auto it = object.find_field_unordered(field->name()); + sj::value value; + if (it.get(value) == simdjson::SUCCESS) { + --remaining_num_fields_in_json; + RETURN_NOT_OK(child_converters_[i]->AppendValue(value)); } else { RETURN_NOT_OK(child_converters_[i]->AppendNull()); } } - if (remaining > 0) { - rj::StringBuffer 
sb; - rj::Writer writer(sb); - json_obj.Accept(writer); + if (remaining_num_fields_in_json > 0) { + std::string_view raw_json = json_obj.raw_json(); return Status::Invalid("Unexpected members in JSON object for type ", - type_->ToString(), " Object: ", sb.GetString()); + type_->ToString(), " Object: ", raw_json); } return builder_->Append(); } - return JSONTypeError("array or object", json_obj.GetType()); + return JSONTypeError("array or object", json_obj.type()); } std::shared_ptr builder() override { return builder_; } @@ -801,40 +896,49 @@ class UnionConverter final : public ConcreteConverter { // Append a JSON value that must be a 2-long array, containing the type_id // and value of the UnionArray's slot. - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { + Status AppendValue(sj::value& json_obj) override { + if (json_obj.is_null()) { return this->AppendNull(); } - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 2) { - return Status::Invalid("Expected [type_id, value] pair, got array of size ", - json_obj.Size()); - } - const auto& id_obj = json_obj[0]; - if (!id_obj.IsInt()) { - return JSONTypeError("int", id_obj.GetType()); - } - auto id = static_cast(id_obj.GetInt()); - auto child_num = type_id_to_child_num_[id]; - if (child_num == -1) { - return Status::Invalid("type_id ", id, " not found in ", *type_); + sj::array array; + if (json_obj.get(array) != simdjson::SUCCESS) { + return JSONTypeError("array", json_obj.type()); } - auto child_converter = child_converters_[child_num]; - if (mode_ == UnionMode::SPARSE) { - RETURN_NOT_OK(checked_cast(*builder_).Append(id)); - for (auto&& other_converter : child_converters_) { - if (other_converter != child_converter) { - RETURN_NOT_OK(other_converter->AppendNull()); - } - } - } else { - RETURN_NOT_OK(checked_cast(*builder_).Append(id)); - } - return child_converter->AppendValue(json_obj[1]); + int8_t id = 0; + 
std::shared_ptr child_converter; + + RETURN_NOT_OK(ProcessJsonArrayElements( + array, "[type_id, value] pair", + [this, &id, &child_converter](sj::value& id_elem) { + int64_t id_value; + if (id_elem.get(id_value) != simdjson::SUCCESS) { + return JSONTypeError("int", id_elem.type()); + } + id = static_cast(id_value); + auto child_num = type_id_to_child_num_[id]; + if (child_num == -1) { + return Status::Invalid("type_id ", id, " not found in ", *type_); + } + child_converter = child_converters_[child_num]; + + if (mode_ == UnionMode::SPARSE) { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + for (auto&& other_converter : child_converters_) { + if (other_converter != child_converter) { + RETURN_NOT_OK(other_converter->AppendNull()); + } + } + } else { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + } + return Status::OK(); + }, + [&child_converter](sj::value& value_elem) { + return child_converter->AppendValue(value_elem); + })); + return Status::OK(); } std::shared_ptr builder() override { return builder_; } @@ -980,15 +1084,23 @@ Result> ArrayFromJSONString(const std::shared_ptr converter; RETURN_NOT_OK(GetConverter(type, &converter)); - rj::Document json_doc; - json_doc.Parse(json_string.data(), json_string.length()); - if (json_doc.HasParseError()) { - return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", - GetParseError_En(json_doc.GetParseError())); + // TODO we should not copy the whole string. 
Maybe we can move the requirement of padding to users of this function + simdjson::padded_string padded_string{json_string}; + + sj::parser parser; + sj::document json_doc; + auto error = parser.iterate(padded_string).get(json_doc); + if (error) { + return Status::Invalid("JSON parse error: ", simdjson::error_message(error)); + } + + sj::array array; + if(json_doc.get(array) != simdjson::SUCCESS){ + return JSONTypeError("array", json_doc.type()); } // The JSON document should be an array, append it - RETURN_NOT_OK(converter->AppendValues(json_doc)); + RETURN_NOT_OK(converter->AppendValues(array)); std::shared_ptr out; RETURN_NOT_OK(converter->Finish(&out)); return out; @@ -1036,15 +1148,24 @@ Result> ScalarFromJSONString( std::shared_ptr converter; RETURN_NOT_OK(GetConverter(type, &converter)); - rj::Document json_doc; - json_doc.Parse(json_string.data(), json_string.length()); - if (json_doc.HasParseError()) { - return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", - GetParseError_En(json_doc.GetParseError())); + // TODO we should not copy the whole string. 
Maybe we can move the requirement of padding to users of this function + simdjson::padded_string padded_string{json_string}; + + sj::parser parser; + sj::document json_doc; + auto error = parser.iterate(padded_string).get(json_doc); + if (error) { + return Status::Invalid("JSON parse error: ", simdjson::error_message(error)); + } + + sj::value value; + if(json_doc.get(value) != simdjson::SUCCESS){ + return JSONTypeError("value", json_doc.type()); } + RETURN_NOT_OK(converter->AppendValue(value)); + std::shared_ptr array; - RETURN_NOT_OK(converter->AppendValue(json_doc)); RETURN_NOT_OK(converter->Finish(&array)); DCHECK_EQ(array->length(), 1); return array->GetScalar(0); diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4516b808a84..ce10b33be70 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -265,7 +265,7 @@ TEST_F(TestRecordBatchEqualsSameAddress, FloatType) { auto schema = ::arrow::schema({f0, f1}); auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); - auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, 2.0, NaN]"); + auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, 2.0, \"NaN\"]"); auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); auto b1 = b0; @@ -287,7 +287,7 @@ TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) { auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); auto a1 = ArrayFromJSON( - f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3, "f3": NaN}])"); + f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3, "f3": "NaN"}])"); auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); auto b1 = b0; @@ -966,8 +966,8 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { std::vector> fields = {f0, f1}; auto schema = ::arrow::schema(fields); - auto a0 = ArrayFromJSON(float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9]"); - auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, NaN, 60, 70, 80, 90]"); + auto a0 = ArrayFromJSON(float32(), "[\"NaN\", 2, 3, 4, 5, 6, 
7, 8, 9]"); + auto a1 = ArrayFromJSON(float32(), "[10, 20, 30, 40, \"NaN\", 60, 70, 80, 90]"); auto batch = RecordBatch::Make(schema, length, {a0, a1}); @@ -979,7 +979,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) { const int64_t f32_size = sizeof(float); std::vector f_strides = {f32_size, f32_size * shape[0]}; std::shared_ptr tensor_expected = TensorFromJSON( - float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + float32(), R"(["NaN", 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, "NaN", 60, 70, 80, 90])", shape, f_strides); EXPECT_FALSE(tensor_expected->Equals(*tensor)); @@ -1010,7 +1010,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { const int64_t f64_size = sizeof(double); std::vector f_strides = {f64_size, f64_size * shape[0]}; std::shared_ptr tensor_expected = TensorFromJSON( - float64(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + float64(), "[\"NaN\", 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, \"NaN\", 60, 70, 80, 90]", shape, f_strides); EXPECT_FALSE(tensor_expected->Equals(*tensor)); @@ -1023,7 +1023,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { std::vector strides = {f64_size * shape[1], f64_size}; std::shared_ptr tensor_expected_row = TensorFromJSON( - float64(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + float64(), "[\"NaN\", 10, 2, 20, 3, 30, 4, 40, 5, \"NaN\", 6, 60, 7, 70, 8, 80, 9, 90]", shape, strides); EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row)); @@ -1075,7 +1075,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { const int64_t f32_size = sizeof(float); std::vector f_strides_2 = {f32_size, f32_size * shape[0]}; std::shared_ptr tensor_expected_2 = TensorFromJSON( - float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + float32(), R"(["NaN", 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, "NaN", 60, 70, 80, 90])", shape, f_strides_2); EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); @@ -1088,7 +1088,7 @@ 
TEST_F(TestRecordBatch, ToTensorSupportedNullToNan) { std::vector strides_2 = {f32_size * shape[1], f32_size}; std::shared_ptr tensor2_expected_row = TensorFromJSON( - float32(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + float32(), "[\"NaN\", 10, 2, 20, 3, 30, 4, 40, 5, \"NaN\", 6, 60, 7, 70, 8, 80, 9, 90]", shape, strides_2); EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row)); @@ -1107,7 +1107,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) { auto a0 = ArrayFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]"); auto a1 = ArrayFromJSON(int16(), "[10, 20, 30, 40, 50, 60, 70, 80, 90]"); - auto a2 = ArrayFromJSON(float32(), "[100, 200, 300, NaN, 500, 600, 700, 800, 900]"); + auto a2 = ArrayFromJSON(float32(), "[100, 200, 300, \"NaN\", 500, 600, 700, 800, 900]"); // Single column std::vector> fields = {f0}; @@ -1168,7 +1168,7 @@ TEST_F(TestRecordBatch, ToTensorSupportedTypesMixed) { std::shared_ptr tensor_expected_2 = TensorFromJSON(float64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, " - "60, 70, 80, 90, 100, 200, 300, NaN, 500, 600, 700, 800, 900]", + "60, 70, 80, 90, 100, 200, 300, \"NaN\", 500, 600, 700, 800, 900]", shape2, f_strides_2); EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 692671910b8..da1c8a8857a 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -273,11 +273,11 @@ TEST(TestTableEqualityFloatType, SingedZero) { TEST(TestTableEqualityFloatType, Infinity) { auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); auto table = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": "Inf"}])"}); auto table_different_inf = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": -Inf}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, 
{"f0": 2, "f1": 5.0}, {"f0": 3, "f1": "-Inf"}])"}); auto table_same_inf = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": "Inf"}])"}); ASSERT_FALSE(table->Equals(*table_different_inf)); ASSERT_TRUE(table->Equals(*table_same_inf)); @@ -286,9 +286,9 @@ TEST(TestTableEqualityFloatType, Infinity) { TEST(TestTableEqualityFloatType, NaN) { auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); auto table = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": "NaN"}, {"f0": 3, "f1": 6.0}])"}); auto other_table = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": "NaN"}, {"f0": 3, "f1": 6.0}])"}); auto options = EqualOptions::Defaults(); ASSERT_FALSE(table->Equals(*other_table, options)); @@ -338,7 +338,7 @@ TEST(TestTableEqualitySameAddress, NestedTypesWithoutFloatType) { TEST(TestTableEqualitySameAddress, FloatType) { auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); auto table = TableFromJSON( - schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": "NaN"}, {"f0": 3, "f1": 6.0}])"}); auto other_table = table; auto options = EqualOptions::Defaults(); @@ -351,7 +351,7 @@ TEST(TestTableEqualitySameAddress, NestedTypesWithFloatType) { {field("f0", int32()), field("f1", struct_({{"f2", utf8()}, {"f3", float64()}}))}); auto table = TableFromJSON( schema, - {R"([{"f0": 1, "f1": {"f2": "4", "f3": 7.0}}, {"f0": 2, "f1": {"f2": "5", "f3": NaN}}, {"f0": 3,"f1": {"f2" : "6", "f3": 9.0}}])"}); + {R"([{"f0": 1, "f1": {"f2": "4", "f3": 7.0}}, {"f0": 2, "f1": {"f2": "5", "f3": "NaN"}}, {"f0": 3,"f1": {"f2" : "6", "f3": 
9.0}}])"}); auto other_table = table; auto options = EqualOptions::Defaults(); diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 442cde2c9c0..cacf96d9ec1 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -99,6 +99,8 @@ ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0 # warnings. ARROW_RAPIDJSON_BUILD_VERSION=232389d4f1012dddec4ef84861face2d2ba85709 ARROW_RAPIDJSON_BUILD_SHA256_CHECKSUM=b9290a9a6d444c8e049bd589ab804e0ccf2b05dc5984a19ed5ae75d090064806 +ARROW_SIMDJSON_BUILD_VERSION=v4.2.4 +ARROW_SIMDJSON_BUILD_SHA256_CHECKSUM=6f942d018561a6c30838651a386a17e6e4abbfc396afd0f62740dea1810dedea # RE2 2023-03-01 is pinned to avoid Abseil dependency. Versions after 2023-06-01 # require Abseil, which would add significant build time and complexity, particularly # for CRAN builds. This version includes musl libc support (GH-48010). @@ -162,6 +164,7 @@ DEPENDENCIES=( "ARROW_ORC_URL orc-${ARROW_ORC_BUILD_VERSION}.tar.gz https://www.apache.org/dyn/closer.lua/orc/orc-${ARROW_ORC_BUILD_VERSION}/orc-${ARROW_ORC_BUILD_VERSION}.tar.gz?action=download" "ARROW_PROTOBUF_URL protobuf-${ARROW_PROTOBUF_BUILD_VERSION}.tar.gz https://github.com/google/protobuf/releases/download/${ARROW_PROTOBUF_BUILD_VERSION}/protobuf-all-${ARROW_PROTOBUF_BUILD_VERSION:1}.tar.gz" "ARROW_RAPIDJSON_URL rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz" + "ARROW_SIMDJSON_URL simdjson-${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz https://github.com/simdjson/simdjson/archive/${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz" "ARROW_RE2_URL re2-${ARROW_RE2_BUILD_VERSION}.tar.gz https://github.com/google/re2/archive/${ARROW_RE2_BUILD_VERSION}.tar.gz" "ARROW_S2N_TLS_URL s2n-${ARROW_S2N_TLS_BUILD_VERSION}.tar.gz https://github.com/aws/s2n-tls/archive/${ARROW_S2N_TLS_BUILD_VERSION}.tar.gz" "ARROW_SNAPPY_URL snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz 
https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz"