Skip to content
Draft
9 changes: 6 additions & 3 deletions ci/conan/all/conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def configure(self):
def layout(self):
cmake_layout(self, src_folder="src")

def _requires_rapidjson(self):
def _requires_json_library(self):
return self.options.with_json or self.options.encryption

def requirements(self):
Expand All @@ -185,8 +185,9 @@ def requirements(self):
self.requires("google-cloud-cpp/1.40.1")
if self.options.with_grpc:
self.requires("grpc/1.50.0")
if self._requires_rapidjson():
if self._requires_json_library():
self.requires("rapidjson/1.1.0")
self.requires("simdjson/4.2.2")
if self.options.with_llvm:
self.requires("llvm-core/13.0.0")
if self.options.with_openssl:
Expand Down Expand Up @@ -371,6 +372,7 @@ def generate(self):
tc.variables["ARROW_LZ4_USE_SHARED"] = bool(self.dependencies["lz4"].options.shared)
tc.variables["ARROW_WITH_SNAPPY"] = bool(self.options.with_snappy)
tc.variables["RapidJSON_SOURCE"] = "SYSTEM"
# The dependency is registered with Arrow's CMake as "simdjson" (lower case),
# and the sibling keys (RapidJSON_SOURCE, Snappy_SOURCE) spell the prefix
# exactly like the dependency name; CMake variable names are case-sensitive.
tc.variables["simdjson_SOURCE"] = "SYSTEM"
tc.variables["Snappy_SOURCE"] = "SYSTEM"
if self.options.with_snappy:
tc.variables["ARROW_SNAPPY_USE_SHARED"] = bool(self.dependencies["snappy"].options.shared)
Expand Down Expand Up @@ -559,8 +561,9 @@ def package_info(self):
self.cpp_info.components["libarrow"].requires.append("libbacktrace::libbacktrace")
if self.options.with_cuda:
self.cpp_info.components["libarrow"].requires.append("cuda::cuda")
if self._requires_rapidjson():
if self._requires_json_library():
self.cpp_info.components["libarrow"].requires.append("rapidjson::rapidjson")
self.cpp_info.components["libarrow"].requires.append("simdjson::simdjson")
if self.options.with_s3:
# https://github.com/apache/arrow/blob/6b268f62a8a172249ef35f093009c740c32e1f36/cpp/src/arrow/CMakeLists.txt#L98
self.cpp_info.components["libarrow"].requires.extend([f"aws-sdk-cpp::{x}" for x in ["cognito-identity", "core", "identity-management", "s3", "sts"]])
Expand Down
1 change: 1 addition & 0 deletions ci/docker/alpine-linux-3.22-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ RUN apk add \
re2-dev \
rsync \
samurai \
simdjson-dev \
snappy-dev \
sqlite-dev \
thrift-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/debian-12-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ RUN apt-get update -y -q && \
libpsl-dev \
libre2-dev \
librtmp-dev \
libsimdjson-dev \
libsnappy-dev \
libsqlite3-dev \
libssh-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/debian-13-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ RUN apt-get update -y -q && \
libpsl-dev \
libre2-dev \
librtmp-dev \
libsimdjson-dev \
libsnappy-dev \
libsqlite3-dev \
libssh-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/debian-experimental-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ RUN if [ -n "${gcc}" ]; then \
libpsl-dev \
libre2-dev \
librtmp-dev \
libsimdjson-dev \
libsnappy-dev \
libsqlite3-dev \
libssh-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/fedora-42-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ RUN dnf update -y && \
python-pip \
rapidjson-devel \
re2-devel \
simdjson-devel \
snappy-devel \
thrift-devel \
utf8proc-devel \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/ubuntu-22.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ RUN apt-get update -y -q && \
libradospp-dev \
libre2-dev \
librtmp-dev \
libsimdjson-dev \
libsnappy-dev \
libsqlite3-dev \
libssh-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/docker/ubuntu-24.04-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ RUN apt-get update -y -q && \
libradospp-dev \
libre2-dev \
librtmp-dev \
libsimdjson-dev \
libsnappy-dev \
libsqlite3-dev \
libssh-dev \
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/cpp_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ else
-DProtobuf_SOURCE=${Protobuf_SOURCE:-} \
-DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \
-Dre2_SOURCE=${re2_SOURCE:-} \
-Dsimdjson_SOURCE=${simdjson_SOURCE:-${SIMDJSON_SOURCE:-}} \
-DSnappy_SOURCE=${Snappy_SOURCE:-} \
-DThrift_SOURCE=${Thrift_SOURCE:-} \
-Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \
Expand Down
52 changes: 52 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES
re2
Protobuf
RapidJSON
simdjson
Snappy
Substrait
Thrift
Expand Down Expand Up @@ -211,6 +212,8 @@ macro(build_dependency DEPENDENCY_NAME)
build_rapidjson()
elseif("${DEPENDENCY_NAME}" STREQUAL "re2")
build_re2()
elseif("${DEPENDENCY_NAME}" STREQUAL "simdjson")
build_simdjson()
elseif("${DEPENDENCY_NAME}" STREQUAL "Snappy")
build_snappy()
elseif("${DEPENDENCY_NAME}" STREQUAL "Substrait")
Expand Down Expand Up @@ -381,6 +384,7 @@ endif()

if(ARROW_PARQUET)
set(ARROW_WITH_RAPIDJSON ON)
set(ARROW_WITH_SIMDJSON ON)
set(ARROW_WITH_THRIFT ON)
endif()

Expand Down Expand Up @@ -409,6 +413,7 @@ endif()

if(ARROW_JSON OR ARROW_FLIGHT_SQL_ODBC)
set(ARROW_WITH_RAPIDJSON ON)
set(ARROW_WITH_SIMDJSON ON)
endif()

if(ARROW_ORC OR ARROW_FLIGHT)
Expand Down Expand Up @@ -780,6 +785,14 @@ else()
"https://github.com/aws/s2n-tls/archive/${ARROW_S2N_TLS_BUILD_VERSION}.tar.gz")
endif()

# Choose where the simdjson source archive is fetched from.
# Default: hand set_urls() the upstream archive URL followed by the
# THIRDPARTY_MIRROR_URL copy, both keyed on ARROW_SIMDJSON_BUILD_VERSION.
# Override: if the ARROW_SIMDJSON_URL environment variable is defined, use
# its value verbatim instead.
if(NOT DEFINED ENV{ARROW_SIMDJSON_URL})
  set_urls(SIMDJSON_SOURCE_URL
           "https://github.com/simdjson/simdjson/archive/${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz"
           "${THIRDPARTY_MIRROR_URL}/simdjson-${ARROW_SIMDJSON_BUILD_VERSION}.tar.gz")
else()
  set(SIMDJSON_SOURCE_URL "$ENV{ARROW_SIMDJSON_URL}")
endif()

if(DEFINED ENV{ARROW_SNAPPY_URL})
set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}")
else()
Expand Down Expand Up @@ -2613,6 +2626,45 @@ if(ARROW_WITH_RAPIDJSON)
FALSE)
endif()

# Build simdjson from source with ExternalProject and expose the result as the
# imported static target simdjson::simdjson.
#
# Reads:  EP_COMMON_CMAKE_ARGS, EP_COMMON_OPTIONS, SIMDJSON_SOURCE_URL,
#         ARROW_SIMDJSON_BUILD_SHA256_CHECKSUM
# Sets:   SIMDJSON_PREFIX, SIMDJSON_CMAKE_ARGS, SIMDJSON_INCLUDE_DIR,
#         SIMDJSON_STATIC_LIB, SIMDJSON_VENDORED
macro(build_simdjson)
  message(STATUS "Building simdjson from source")
  set(SIMDJSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/simdjson_ep/src/simdjson_ep-install")
  set(SIMDJSON_INCLUDE_DIR "${SIMDJSON_PREFIX}/include")
  # Computed before externalproject_add() so it can be declared as a byproduct.
  set(SIMDJSON_STATIC_LIB
      "${SIMDJSON_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}simdjson${CMAKE_STATIC_LIBRARY_SUFFIX}"
  )
  # NOTE(review): confirm SIMDJSON_JUST_LIBRARY / SIMDJSON_BUILD_STATIC_LIB are
  # still recognised by the pinned simdjson version.
  set(SIMDJSON_CMAKE_ARGS
      ${EP_COMMON_CMAKE_ARGS}
      -DSIMDJSON_JUST_LIBRARY=ON
      -DSIMDJSON_BUILD_STATIC_LIB=ON
      # SIMDJSON_STATIC_LIB hard-codes "lib/"; pin the install libdir so the
      # archive cannot land in lib64/ on multilib distributions.
      -DCMAKE_INSTALL_LIBDIR=lib
      "-DCMAKE_INSTALL_PREFIX=${SIMDJSON_PREFIX}")

  # BUILD_BYPRODUCTS tells generators such as Ninja which step produces the
  # archive referenced by IMPORTED_LOCATION below; without it Ninja fails with
  # "missing and no known rule to make it".
  externalproject_add(simdjson_ep
                      ${EP_COMMON_OPTIONS}
                      PREFIX "${CMAKE_BINARY_DIR}"
                      URL ${SIMDJSON_SOURCE_URL}
                      URL_HASH "SHA256=${ARROW_SIMDJSON_BUILD_SHA256_CHECKSUM}"
                      CMAKE_ARGS ${SIMDJSON_CMAKE_ARGS}
                      BUILD_BYPRODUCTS "${SIMDJSON_STATIC_LIB}")

  # The include directory must exist before it is referenced by a target.
  file(MAKE_DIRECTORY "${SIMDJSON_INCLUDE_DIR}")

  add_library(simdjson::simdjson STATIC IMPORTED)
  set_target_properties(simdjson::simdjson
                        PROPERTIES IMPORTED_LOCATION "${SIMDJSON_STATIC_LIB}"
                                   INTERFACE_INCLUDE_DIRECTORIES
                                   "${SIMDJSON_INCLUDE_DIR}")
  # Anything linking simdjson::simdjson must wait for the external build.
  add_dependencies(simdjson::simdjson simdjson_ep)

  # NOTE(review): if static Arrow libraries are meant to bundle vendored
  # dependencies, this target may also need registering for bundling - confirm
  # against how the other vendored static libraries in this file are handled.
  set(SIMDJSON_VENDORED TRUE)
endmacro()

# Resolve the simdjson dependency only when ARROW_WITH_SIMDJSON is enabled.
# NOTE(review): USE_CONFIG is passed without a value while
# IS_RUNTIME_DEPENDENCY is followed by FALSE - confirm resolve_dependency()
# parses USE_CONFIG as a bare flag rather than a one-value keyword (in which
# case it would need an explicit TRUE).
if(ARROW_WITH_SIMDJSON)
  resolve_dependency(simdjson USE_CONFIG IS_RUNTIME_DEPENDENCY FALSE)
endif()

macro(build_xsimd)
message(STATUS "Building xsimd from source")
set(XSIMD_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install")
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,11 @@ if(ARROW_WITH_RAPIDJSON)
target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE RapidJSON)
endforeach()
endif()
# When simdjson support is enabled, link simdjson::simdjson into every target
# listed in ARROW_UTIL_TARGETS. PRIVATE keeps simdjson's usage requirements
# (such as its include directories) from propagating to consumers of those
# targets.
if(ARROW_WITH_SIMDJSON)
foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS})
target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE simdjson::simdjson)
endforeach()
endif()
if(ARROW_WITH_ZLIB)
foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS})
target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ZLIB::ZLIB)
Expand Down Expand Up @@ -1003,6 +1008,7 @@ if(ARROW_JSON)
json/reader.cc)
foreach(ARROW_JSON_TARGET ${ARROW_JSON_TARGETS})
target_link_libraries(${ARROW_JSON_TARGET} PRIVATE RapidJSON)
target_link_libraries(${ARROW_JSON_TARGET} PRIVATE simdjson::simdjson)
endforeach()
else()
set(ARROW_JSON_TARGET_SHARED)
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/chunked_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithoutFloat) {
}

TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) {
auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, NaN]");
auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, \"NaN\"]");
ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({float64_array}));

ASSERT_FALSE(chunked_array->Equals(chunked_array));
Expand All @@ -182,7 +182,7 @@ TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) {
}

TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithFloat) {
auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, NaN]");
auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, \"NaN\"]");
ASSERT_OK_AND_ASSIGN(auto struct_array,
StructArray::Make({float64_array}, {"Float64Type"}));
ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array}));
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,24 +930,24 @@ TEST(TestCumulativeMean, ConvenienceFunction) {

TEST(TestCumulative, NaN) {
// addition with NaN is always NaN
CheckVectorUnary("cumulative_sum", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"),
ArrayFromJSON(float64(), "[1, 3, NaN, NaN, NaN]"));
CheckVectorUnary("cumulative_sum", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"),
ArrayFromJSON(float64(), R"([1, 3, "NaN", "NaN", "NaN"])"));

// multiply with Nan is always NaN
CheckVectorUnary("cumulative_prod", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"),
ArrayFromJSON(float64(), "[1, 2, NaN, NaN, NaN]"));
CheckVectorUnary("cumulative_prod", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"),
ArrayFromJSON(float64(), R"([1, 2, "NaN", "NaN", "NaN"])"));

// max with NaN is always ignored because Nan > a always returns false
CheckVectorUnary("cumulative_max", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"),
CheckVectorUnary("cumulative_max", ArrayFromJSON(float64(), "[1, 2, \"NaN\", 4, 5]"),
ArrayFromJSON(float64(), "[1, 2, 2, 4, 5]"));

// min with NaN is always ignored because Nan < a always returns false
CheckVectorUnary("cumulative_min", ArrayFromJSON(float64(), "[5, 4, NaN, 2, 1]"),
CheckVectorUnary("cumulative_min", ArrayFromJSON(float64(), "[5, 4, \"NaN\", 2, 1]"),
ArrayFromJSON(float64(), "[5, 4, 4, 2, 1]"));

// mean with NaN is always Nan
CheckVectorUnary("cumulative_mean", ArrayFromJSON(float64(), "[5, 4, NaN, 2, 1]"),
ArrayFromJSON(float64(), "[5, 4.5, NaN, NaN, NaN]"));
CheckVectorUnary("cumulative_mean", ArrayFromJSON(float64(), "[5, 4, \"NaN\", 2, 1]"),
ArrayFromJSON(float64(), R"([5, 4.5, "NaN", "NaN", "NaN"])"));
}
} // namespace compute
} // namespace arrow
Loading