From d93c3fc65a7f29918239cc4ad50929a1ad3829e1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Nov 2024 11:33:39 -0800 Subject: [PATCH 1/4] Add version config (#17312) Resolves #3155. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/17312 --- cpp/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a3815748e2c..1c13f65fe3c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -36,6 +36,8 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERS ) endif() +rapids_cmake_write_version_file(include/cudf/version_config.hpp) + # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value in first run and subsequent runs. @@ -1126,6 +1128,9 @@ install( DESTINATION ${lib_dir} EXPORT cudf-exports ) +install(FILES ${CUDF_BINARY_DIR}/include/cudf/version_config.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudf +) set(_components_export_string) if(TARGET cudftestutil) From a7194f612adfb5ab9591d5f1bfb5bde4efe97eb7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Nov 2024 11:37:46 -0800 Subject: [PATCH 2/4] Fix reading of single-row unterminated CSV files (#17305) Fixed the logic in the CSV reader that led to empty output instead of producing a table with a single column and one row. Added tests to make sure the new logic does not cause regressions. Also did some small clean up around the fix. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17305 --- cpp/src/io/csv/reader_impl.cu | 42 +++++++++++++----------------- python/cudf/cudf/tests/test_csv.py | 17 ++++++++++++ 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 72fca75c56b..6c84b53db46 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -118,47 +118,41 @@ string removeQuotes(string str, char quotechar) } /** - * @brief Parse the first row to set the column names in the raw_csv parameter. - * The first row can be either the header row, or the first data row + * @brief Parse a row of input to get the column names. The row can either be the header, or the + * first data row. If the header is not used, column names are generated automatically. */ -std::vector get_column_names(std::vector const& header, +std::vector get_column_names(std::vector const& row, parse_options_view const& parse_opts, int header_row, std::string prefix) { - std::vector col_names; - - // If there is only a single character then it would be the terminator - if (header.size() <= 1) { return col_names; } - - std::vector first_row = header; + // Empty row, return empty column names vector + if (row.empty()) { return {}; } + std::vector col_names; bool quotation = false; - for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { + for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == parse_opts.quotechar) { - quotation = !quotation; - } + if (row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || - (!quotation && first_row[pos] == parse_opts.terminator) || - (!quotation && first_row[pos] == parse_opts.delimiter)) { + if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) || + (!quotation && row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { + if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && - first_row[pos - 1] == '\r') { + if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' && + row[pos - 1] == '\r') { --col_name_len; } - string const new_col_name(first_row.data() + prev, col_name_len); + string const new_col_name(row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); } else { // This is the first data row, add the automatically generated name @@ -166,14 +160,14 @@ std::vector get_column_names(std::vector const& header, } // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes + // a blank line following the header. In this case, row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } + if (!quotation && row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set - while (parse_opts.multi_delimiter && pos < first_row.size() && - first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { + while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter && + row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8800275bf67..ac772c47e3a 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type(): result = cudf.read_csv(StringIO(data), header=None).columns expected = pd.read_csv(StringIO(data), header=None).columns pd.testing.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize("buffer", ["1", '"one"']) +def test_read_single_unterminated_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (1, 1)) + + +@pytest.mark.parametrize("buffer", ["\n", "\r\n"]) +def test_read_empty_only_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (0, 0)) + + +def test_read_empty_only_row_custom_terminator(): + gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*") + assert_eq(gdf.shape, (0, 0)) From 66c5a2d724f14b7062d97c87cbf782fca9f9fabb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 14 Nov 2024 17:26:29 -0600 Subject: [PATCH 3/4] prefer wheel-provided libcudf.so in load_library(), use RTLD_LOCAL (#17316) Contributes to https://github.com/rapidsai/build-planning/issues/118 Modifies `libcudf.load_library()` in the following ways: * prefer wheel-provided `libcudf.so` to system installation * expose environment variable `RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY` for switching that preference * load `libcudf.so` with `RTLD_LOCAL`, to prevent adding symbols to the global namespace ([dlopen docs](https://linux.die.net/man/3/dlopen)) ## Notes for Reviewers ### How I tested this Locally (x86_64, CUDA 12, Python 3.12), built `libcudf`, `pylibcudf`, and `cudf` wheels from this branch, then `libcuspatial` and `cuspatial` from the corresponding cuspatial branch. Ran `cuspatial`'s unit tests, and tried setting the environment variable and inspecting `ld`'s logs to confirm that the environment variable changed the loading and search behavior. e.g. ```shell # clear ld cache to avoid cheating rm -f /etc/ld.so.cache ldconfig # try using an env variable to say "prefer the system-installed version" LD_DEBUG=libs \ LD_DEBUG_OUTPUT=/tmp/out.txt \ RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY=true \ python -c "import cuspatial; print(cuspatial.__version__)" cat /tmp/out.txt.* > prefer-system.txt # (then manually looked through those logs to confirm it searched places like /usr/lib64 and /lib64) ``` # Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17316 --- python/libcudf/libcudf/load.py | 70 ++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py index bf27ecfa7f5..a91fbb7aecf 100644 --- a/python/libcudf/libcudf/load.py +++ b/python/libcudf/libcudf/load.py @@ -16,8 +16,36 @@ import ctypes import os +# Loading with RTLD_LOCAL adds the library itself to the loader's +# loaded library cache without loading any symbols into the global +# namespace. This allows libraries that express a dependency on +# this library to be loaded later and successfully satisfy this dependency +# without polluting the global symbol table with symbols from +# libcudf that could conflict with symbols from other DSOs. +PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL + + +def _load_system_installation(soname: str): + """Try to dlopen() the library indicated by ``soname`` + Raises ``OSError`` if library cannot be loaded. + """ + return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG) + + +def _load_wheel_installation(soname: str): + """Try to dlopen() the library indicated by ``soname`` + + Returns ``None`` if the library cannot be loaded. + """ + if os.path.isfile( + lib := os.path.join(os.path.dirname(__file__), "lib64", soname) + ): + return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG) + return None + def load_library(): + """Dynamically load libcudf.so and its dependencies""" try: # libkvikio must be loaded before libcudf because libcudf references its symbols import libkvikio @@ -29,28 +57,28 @@ def load_library(): # we assume the library is discoverable on system paths. pass - # Dynamically load libcudf.so. Prefer a system library if one is present to - # avoid clobbering symbols that other packages might expect, but if no - # other library is present use the one in the wheel. + prefer_system_installation = ( + os.getenv("RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY", "false").lower() + != "false" + ) + + soname = "libcudf.so" libcudf_lib = None - try: - libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL) - except OSError: - # If neither of these directories contain the library, we assume we are in an - # environment where the C++ library is already installed somewhere else and the - # CMake build of the libcudf Python package was a no-op. - # - # Note that this approach won't work for real editable installs of the libcudf package. - # scikit-build-core has limited support for importlib.resources so there isn't a clean - # way to support that case yet. - for lib_dir in ("lib", "lib64"): - if os.path.isfile( - lib := os.path.join( - os.path.dirname(__file__), lib_dir, "libcudf.so" - ) - ): - libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL) - break + if prefer_system_installation: + # Prefer a system library if one is present to + # avoid clobbering symbols that other packages might expect, but if no + # other library is present use the one in the wheel. + try: + libcudf_lib = _load_system_installation(soname) + except OSError: + libcudf_lib = _load_wheel_installation(soname) + else: + # Prefer the libraries bundled in this package. If they aren't found + # (which might be the case in builds where the library was prebuilt before + # packaging the wheel), look for a system installation. + libcudf_lib = _load_wheel_installation(soname) + if libcudf_lib is None: + libcudf_lib = _load_system_installation(soname) # The caller almost never needs to do anything with this library, but no # harm in offering the option since this object at least provides a handle From 927ae9c19e9ae2bcf1a0d5db4b61380e325f5bc9 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 14 Nov 2024 19:14:19 -0500 Subject: [PATCH 4/4] Do not exclude nanoarrow and flatbuffers from installation if statically linked (#17322) Had an issue crop up in spark-rapids-jni where we statically link arrow and the build started to fail due to change #17308. Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17322 --- cpp/cmake/thirdparty/get_flatbuffers.cmake | 9 +++++++-- cpp/cmake/thirdparty/get_nanoarrow.cmake | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_flatbuffers.cmake b/cpp/cmake/thirdparty/get_flatbuffers.cmake index 6a889566300..39521049c85 100644 --- a/cpp/cmake/thirdparty/get_flatbuffers.cmake +++ b/cpp/cmake/thirdparty/get_flatbuffers.cmake @@ -15,14 +15,19 @@ # Use CPM to find or clone flatbuffers function(find_and_configure_flatbuffers VERSION) + if(NOT BUILD_SHARED_LIBS) + set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) + else() + set(_exclude_from_all EXCLUDE_FROM_ALL TRUE) + endif() + rapids_cpm_find( flatbuffers ${VERSION} GLOBAL_TARGETS flatbuffers CPM_ARGS GIT_REPOSITORY https://github.com/google/flatbuffers.git GIT_TAG v${VERSION} - GIT_SHALLOW TRUE - EXCLUDE_FROM_ALL TRUE + GIT_SHALLOW TRUE ${_exclude_from_all} ) rapids_export_find_package_root( diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 4fc742dea2e..c440643037b 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -19,14 +19,19 @@ function(find_and_configure_nanoarrow) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + if(NOT BUILD_SHARED_LIBS) + set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) + else() + set(_exclude_from_all EXCLUDE_FROM_ALL TRUE) + endif() + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" - EXCLUDE_FROM_ALL TRUE + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports)