From d93c3fc65a7f29918239cc4ad50929a1ad3829e1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 14 Nov 2024 11:33:39 -0800
Subject: [PATCH 1/4] Add version config (#17312)

Resolves #3155.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/17312
---
 cpp/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a3815748e2c..1c13f65fe3c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -36,6 +36,8 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERS
   )
 endif()
 
+rapids_cmake_write_version_file(include/cudf/version_config.hpp)
+
 # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
 # have different values for the `Threads::Threads` target. Setting this flag ensures
 # `Threads::Threads` is the same value in first run and subsequent runs.
@@ -1126,6 +1128,9 @@ install(
   DESTINATION ${lib_dir}
   EXPORT cudf-exports
 )
+install(FILES ${CUDF_BINARY_DIR}/include/cudf/version_config.hpp
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudf
+)
 
 set(_components_export_string)
 if(TARGET cudftestutil)

From a7194f612adfb5ab9591d5f1bfb5bde4efe97eb7 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Nov 2024 11:37:46 -0800
Subject: [PATCH 2/4] Fix reading of single-row unterminated CSV files (#17305)

Fixed the logic in the CSV reader that led to empty output instead of producing a table with a single column and one row.
Added tests to make sure the new logic does not cause regressions.
Also did some small clean up around the fix.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17305
---
 cpp/src/io/csv/reader_impl.cu      | 42 +++++++++++++-----------------
 python/cudf/cudf/tests/test_csv.py | 17 ++++++++++++
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 72fca75c56b..6c84b53db46 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -118,47 +118,41 @@ string removeQuotes(string str, char quotechar)
 }
 
 /**
- * @brief Parse the first row to set the column names in the raw_csv parameter.
- * The first row can be either the header row, or the first data row
+ * @brief Parse a row of input to get the column names. The row can either be the header, or the
+ * first data row. If the header is not used, column names are generated automatically.
  */
-std::vector<std::string> get_column_names(std::vector<char> const& header,
+std::vector<std::string> get_column_names(std::vector<char> const& row,
                                           parse_options_view const& parse_opts,
                                           int header_row,
                                           std::string prefix)
 {
-  std::vector<std::string> col_names;
-
-  // If there is only a single character then it would be the terminator
-  if (header.size() <= 1) { return col_names; }
-
-  std::vector<char> first_row = header;
+  // Empty row, return empty column names vector
+  if (row.empty()) { return {}; }
 
+  std::vector<std::string> col_names;
   bool quotation = false;
-  for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) {
+  for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) {
     // Flip the quotation flag if current character is a quotechar
-    if (first_row[pos] == parse_opts.quotechar) {
-      quotation = !quotation;
-    }
+    if (row[pos] == parse_opts.quotechar) { quotation = !quotation; }
     // Check if end of a column/row
-    else if (pos == first_row.size() - 1 ||
-             (!quotation && first_row[pos] == parse_opts.terminator) ||
-             (!quotation && first_row[pos] == parse_opts.delimiter)) {
+    if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) ||
+        (!quotation && row[pos] == parse_opts.delimiter)) {
       // This is the header, add the column name
       if (header_row >= 0) {
         // Include the current character, in case the line is not terminated
         int col_name_len = pos - prev + 1;
         // Exclude the delimiter/terminator is present
-        if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) {
+        if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) {
           --col_name_len;
         }
         // Also exclude '\r' character at the end of the column name if it's
         // part of the terminator
-        if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' &&
-            first_row[pos - 1] == '\r') {
+        if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' &&
+            row[pos - 1] == '\r') {
           --col_name_len;
         }
 
-        string const new_col_name(first_row.data() + prev, col_name_len);
+        string const new_col_name(row.data() + prev, col_name_len);
         col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar));
       } else {
         // This is the first data row, add the automatically generated name
@@ -166,14 +160,14 @@ std::vector<std::string> get_column_names(std::vector<char> const& header,
       }
 
       // Stop parsing when we hit the line terminator; relevant when there is
-      // a blank line following the header. In this case, first_row includes
+      // a blank line following the header. In this case, row includes
       // multiple line terminators at the end, as the new recStart belongs to
       // a line that comes after the blank line(s)
-      if (!quotation && first_row[pos] == parse_opts.terminator) { break; }
+      if (!quotation && row[pos] == parse_opts.terminator) { break; }
 
       // Skip adjacent delimiters if delim_whitespace is set
-      while (parse_opts.multi_delimiter && pos < first_row.size() &&
-             first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) {
+      while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter &&
+             row[pos + 1] == parse_opts.delimiter) {
         ++pos;
       }
       prev = pos + 1;
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 8800275bf67..ac772c47e3a 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type():
         result = cudf.read_csv(StringIO(data), header=None).columns
     expected = pd.read_csv(StringIO(data), header=None).columns
     pd.testing.assert_index_equal(result, expected, exact=True)
+
+
+@pytest.mark.parametrize("buffer", ["1", '"one"'])
+def test_read_single_unterminated_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (1, 1))
+
+
+@pytest.mark.parametrize("buffer", ["\n", "\r\n"])
+def test_read_empty_only_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (0, 0))
+
+
+def test_read_empty_only_row_custom_terminator():
+    gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*")
+    assert_eq(gdf.shape, (0, 0))

From 66c5a2d724f14b7062d97c87cbf782fca9f9fabb Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Thu, 14 Nov 2024 17:26:29 -0600
Subject: [PATCH 3/4] prefer wheel-provided libcudf.so in load_library(), use
 RTLD_LOCAL (#17316)

Contributes to https://github.com/rapidsai/build-planning/issues/118

Modifies `libcudf.load_library()` in the following ways:

* prefer wheel-provided `libcudf.so` to system installation
* expose environment variable `RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY` for switching that preference
* load `libcudf.so` with `RTLD_LOCAL`, to prevent adding symbols to the global namespace ([dlopen docs](https://linux.die.net/man/3/dlopen))

## Notes for Reviewers

### How I tested this

Locally (x86_64, CUDA 12, Python 3.12), built `libcudf`, `pylibcudf`, and `cudf` wheels from this branch, then `libcuspatial` and `cuspatial` from the corresponding cuspatial branch. Ran `cuspatial`'s unit tests, and tried setting the environment variable and inspecting `ld`'s logs to confirm that the environment variable changed the loading and search behavior.

e.g.

```shell
# clear ld cache to avoid cheating
rm -f /etc/ld.so.cache
ldconfig

# try using an env variable to say "prefer the system-installed version"
LD_DEBUG=libs \
LD_DEBUG_OUTPUT=/tmp/out.txt \
RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY=true \
python -c "import cuspatial; print(cuspatial.__version__)"

cat /tmp/out.txt.* > prefer-system.txt
# (then manually looked through those logs to confirm it searched places like /usr/lib64 and /lib64)
```

#

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17316
---
 python/libcudf/libcudf/load.py | 70 ++++++++++++++++++++++++----------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py
index bf27ecfa7f5..a91fbb7aecf 100644
--- a/python/libcudf/libcudf/load.py
+++ b/python/libcudf/libcudf/load.py
@@ -16,8 +16,36 @@
 import ctypes
 import os
 
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libcudf that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Raises ``OSError`` if library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+
+    Returns ``None`` if the library cannot be loaded.
+    """
+    if os.path.isfile(
+        lib := os.path.join(os.path.dirname(__file__), "lib64", soname)
+    ):
+        return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG)
+    return None
+
 
 def load_library():
+    """Dynamically load libcudf.so and its dependencies"""
     try:
         # libkvikio must be loaded before libcudf because libcudf references its symbols
         import libkvikio
@@ -29,28 +57,28 @@ def load_library():
         # we assume the library is discoverable on system paths.
         pass
 
-    # Dynamically load libcudf.so. Prefer a system library if one is present to
-    # avoid clobbering symbols that other packages might expect, but if no
-    # other library is present use the one in the wheel.
+    prefer_system_installation = (
+        os.getenv("RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY", "false").lower()
+        != "false"
+    )
+
+    soname = "libcudf.so"
     libcudf_lib = None
-    try:
-        libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL)
-    except OSError:
-        # If neither of these directories contain the library, we assume we are in an
-        # environment where the C++ library is already installed somewhere else and the
-        # CMake build of the libcudf Python package was a no-op.
-        #
-        # Note that this approach won't work for real editable installs of the libcudf package.
-        # scikit-build-core has limited support for importlib.resources so there isn't a clean
-        # way to support that case yet.
-        for lib_dir in ("lib", "lib64"):
-            if os.path.isfile(
-                lib := os.path.join(
-                    os.path.dirname(__file__), lib_dir, "libcudf.so"
-                )
-            ):
-                libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL)
-                break
+    if prefer_system_installation:
+        # Prefer a system library if one is present to
+        # avoid clobbering symbols that other packages might expect, but if no
+        # other library is present use the one in the wheel.
+        try:
+            libcudf_lib = _load_system_installation(soname)
+        except OSError:
+            libcudf_lib = _load_wheel_installation(soname)
+    else:
+        # Prefer the libraries bundled in this package. If they aren't found
+        # (which might be the case in builds where the library was prebuilt before
+        # packaging the wheel), look for a system installation.
+        libcudf_lib = _load_wheel_installation(soname)
+        if libcudf_lib is None:
+            libcudf_lib = _load_system_installation(soname)
 
     # The caller almost never needs to do anything with this library, but no
     # harm in offering the option since this object at least provides a handle

From 927ae9c19e9ae2bcf1a0d5db4b61380e325f5bc9 Mon Sep 17 00:00:00 2001
From: Mike Wilson <hyperbolic2346@users.noreply.github.com>
Date: Thu, 14 Nov 2024 19:14:19 -0500
Subject: [PATCH 4/4] Do not exclude nanoarrow and flatbuffers from
 installation if statically linked (#17322)

Had an issue crop up in spark-rapids-jni where we statically link arrow and the build started to fail due to change #17308.

Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - Gera Shegalov (https://github.com/gerashegalov)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/17322
---
 cpp/cmake/thirdparty/get_flatbuffers.cmake | 9 +++++++--
 cpp/cmake/thirdparty/get_nanoarrow.cmake   | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_flatbuffers.cmake b/cpp/cmake/thirdparty/get_flatbuffers.cmake
index 6a889566300..39521049c85 100644
--- a/cpp/cmake/thirdparty/get_flatbuffers.cmake
+++ b/cpp/cmake/thirdparty/get_flatbuffers.cmake
@@ -15,14 +15,19 @@
 # Use CPM to find or clone flatbuffers
 function(find_and_configure_flatbuffers VERSION)
 
+  if(NOT BUILD_SHARED_LIBS)
+    set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
+  else()
+    set(_exclude_from_all EXCLUDE_FROM_ALL TRUE)
+  endif()
+
   rapids_cpm_find(
     flatbuffers ${VERSION}
     GLOBAL_TARGETS flatbuffers
     CPM_ARGS
     GIT_REPOSITORY https://github.com/google/flatbuffers.git
     GIT_TAG v${VERSION}
-    GIT_SHALLOW TRUE
-    EXCLUDE_FROM_ALL TRUE
+    GIT_SHALLOW TRUE ${_exclude_from_all}
   )
 
   rapids_export_find_package_root(
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index 4fc742dea2e..c440643037b 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -19,14 +19,19 @@ function(find_and_configure_nanoarrow)
   set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
   rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
 
+  if(NOT BUILD_SHARED_LIBS)
+    set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
+  else()
+    set(_exclude_from_all EXCLUDE_FROM_ALL TRUE)
+  endif()
+
   # Currently we need to always build nanoarrow so we don't pickup a previous installed version
   set(CPM_DOWNLOAD_nanoarrow ON)
   rapids_cpm_find(
     nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
-    OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
-    EXCLUDE_FROM_ALL TRUE
+    OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
   rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports)