Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into cudf-polars-chunked-parquet-reader
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored Nov 15, 2024
2 parents b398172 + d475dca commit 2116d94
Show file tree
Hide file tree
Showing 35 changed files with 1,168 additions and 229 deletions.
4 changes: 2 additions & 2 deletions ci/test_wheel_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ rapids-logger "Installing cudf_polars and its dependencies"
# generate constraints (possibly pinning to oldest supported versions of dependencies)
rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt

# echo to expand wildcard before adding `[test]` requires for pip
# echo to expand wildcard before adding `[test,experimental]` requires for pip
python -m pip install \
-v \
--constraint ./constraints.txt \
"$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
"$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,experimental]" \
"$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"

Expand Down
5 changes: 5 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERS
)
endif()

rapids_cmake_write_version_file(include/cudf/version_config.hpp)

# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
# have different values for the `Threads::Threads` target. Setting this flag ensures
# `Threads::Threads` is the same value in first run and subsequent runs.
Expand Down Expand Up @@ -1126,6 +1128,9 @@ install(
DESTINATION ${lib_dir}
EXPORT cudf-exports
)
install(FILES ${CUDF_BINARY_DIR}/include/cudf/version_config.hpp
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudf
)

set(_components_export_string)
if(TARGET cudftestutil)
Expand Down
9 changes: 7 additions & 2 deletions cpp/cmake/thirdparty/get_flatbuffers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,19 @@
# Use CPM to find or clone flatbuffers
function(find_and_configure_flatbuffers VERSION)

if(NOT BUILD_SHARED_LIBS)
set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
else()
set(_exclude_from_all EXCLUDE_FROM_ALL TRUE)
endif()

rapids_cpm_find(
flatbuffers ${VERSION}
GLOBAL_TARGETS flatbuffers
CPM_ARGS
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v${VERSION}
GIT_SHALLOW TRUE
EXCLUDE_FROM_ALL TRUE
GIT_SHALLOW TRUE ${_exclude_from_all}
)

rapids_export_find_package_root(
Expand Down
9 changes: 7 additions & 2 deletions cpp/cmake/thirdparty/get_nanoarrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,19 @@ function(find_and_configure_nanoarrow)
set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")

if(NOT BUILD_SHARED_LIBS)
set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
else()
set(_exclude_from_all EXCLUDE_FROM_ALL TRUE)
endif()

# Currently we need to always build nanoarrow so we don't pick up a previously installed version
set(CPM_DOWNLOAD_nanoarrow ON)
rapids_cpm_find(
nanoarrow 0.6.0.dev
GLOBAL_TARGETS nanoarrow
CPM_ARGS
OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
EXCLUDE_FROM_ALL TRUE
OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
)
set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports)
Expand Down
42 changes: 18 additions & 24 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,62 +118,56 @@ string removeQuotes(string str, char quotechar)
}

/**
* @brief Parse the first row to set the column names in the raw_csv parameter.
* The first row can be either the header row, or the first data row
* @brief Parse a row of input to get the column names. The row can either be the header, or the
* first data row. If the header is not used, column names are generated automatically.
*/
std::vector<std::string> get_column_names(std::vector<char> const& header,
std::vector<std::string> get_column_names(std::vector<char> const& row,
parse_options_view const& parse_opts,
int header_row,
std::string prefix)
{
std::vector<std::string> col_names;

// If there is only a single character then it would be the terminator
if (header.size() <= 1) { return col_names; }

std::vector<char> first_row = header;
// Empty row, return empty column names vector
if (row.empty()) { return {}; }

std::vector<std::string> col_names;
bool quotation = false;
for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) {
for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) {
// Flip the quotation flag if current character is a quotechar
if (first_row[pos] == parse_opts.quotechar) {
quotation = !quotation;
}
if (row[pos] == parse_opts.quotechar) { quotation = !quotation; }
// Check if end of a column/row
else if (pos == first_row.size() - 1 ||
(!quotation && first_row[pos] == parse_opts.terminator) ||
(!quotation && first_row[pos] == parse_opts.delimiter)) {
if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) ||
(!quotation && row[pos] == parse_opts.delimiter)) {
// This is the header, add the column name
if (header_row >= 0) {
// Include the current character, in case the line is not terminated
int col_name_len = pos - prev + 1;
// Exclude the delimiter/terminator if present
if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) {
if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) {
--col_name_len;
}
// Also exclude '\r' character at the end of the column name if it's
// part of the terminator
if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' &&
first_row[pos - 1] == '\r') {
if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' &&
row[pos - 1] == '\r') {
--col_name_len;
}

string const new_col_name(first_row.data() + prev, col_name_len);
string const new_col_name(row.data() + prev, col_name_len);
col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar));
} else {
// This is the first data row, add the automatically generated name
col_names.push_back(prefix + std::to_string(col_names.size()));
}

// Stop parsing when we hit the line terminator; relevant when there is
// a blank line following the header. In this case, first_row includes
// a blank line following the header. In this case, row includes
// multiple line terminators at the end, as the new recStart belongs to
// a line that comes after the blank line(s)
if (!quotation && first_row[pos] == parse_opts.terminator) { break; }
if (!quotation && row[pos] == parse_opts.terminator) { break; }

// Skip adjacent delimiters if delim_whitespace is set
while (parse_opts.multi_delimiter && pos < first_row.size() &&
first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) {
while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter &&
row[pos + 1] == parse_opts.delimiter) {
++pos;
}
prev = pos + 1;
Expand Down
30 changes: 30 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ files:
- test_cpp
- test_python_common
- test_python_cudf
- test_python_cudf_common
- test_python_dask_cudf
- test_python_pylibcudf
- test_python_cudf_pandas
- test_python_cudf_polars
test_static_build:
output: none
includes:
Expand All @@ -59,6 +61,7 @@ files:
- cuda_version
- py_version
- test_python_common
- test_python_cudf_common
- test_python_cudf
- test_python_cudf_pandas
test_python_cudf:
Expand All @@ -67,13 +70,15 @@ files:
- cuda_version
- py_version
- test_python_common
- test_python_cudf_common
- test_python_cudf
test_python_other:
output: none
includes:
- cuda_version
- py_version
- test_python_common
- test_python_cudf_common
- test_python_dask_cudf
test_java:
output: none
Expand Down Expand Up @@ -152,6 +157,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_common
- test_python_cudf
py_build_libcudf:
output: pyproject
Expand Down Expand Up @@ -216,6 +222,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_common
- test_python_pylibcudf
py_test_pandas_cudf:
output: pyproject
Expand Down Expand Up @@ -248,6 +255,14 @@ files:
includes:
- run_cudf_polars
- depends_on_pylibcudf
py_run_cudf_polars_experimental:
output: pyproject
pyproject_dir: python/cudf_polars
extras:
table: project.optional-dependencies
key: experimental
includes:
- run_cudf_polars_experimental
py_test_cudf_polars:
output: pyproject
pyproject_dir: python/cudf_polars
Expand All @@ -256,6 +271,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_polars
py_build_dask_cudf:
output: pyproject
pyproject_dir: python/dask_cudf
Expand All @@ -281,6 +297,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_common
- test_python_dask_cudf
py_build_cudf_kafka:
output: pyproject
Expand Down Expand Up @@ -313,6 +330,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_common
py_build_custreamz:
output: pyproject
pyproject_dir: python/custreamz
Expand All @@ -337,6 +355,7 @@ files:
key: test
includes:
- test_python_common
- test_python_cudf_common
channels:
- rapidsai
- rapidsai-nightly
Expand Down Expand Up @@ -730,6 +749,11 @@ dependencies:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=1.11,<1.14
run_cudf_polars_experimental:
common:
- output_types: [conda, requirements, pyproject]
packages:
- rapids-dask-dependency==24.12.*,>=0.0.0a0
run_dask_cudf:
common:
- output_types: [conda, requirements, pyproject]
Expand Down Expand Up @@ -779,6 +803,7 @@ dependencies:
- pytest<8
- pytest-cov
- pytest-xdist
test_python_cudf_common:
specific:
# Define additional constraints for testing with oldest dependencies.
- output_types: [conda, requirements]
Expand Down Expand Up @@ -884,6 +909,11 @@ dependencies:
- pyarrow==14.0.1
- matrix:
packages:
test_python_cudf_polars:
common:
- output_types: [conda, requirements, pyproject]
packages:
- *numpy
depends_on_libcudf:
common:
- output_types: conda
Expand Down
Loading

0 comments on commit 2116d94

Please sign in to comment.