Skip to content

Commit

Permalink
Add batch reading of strings
Browse files Browse the repository at this point in the history
  • Loading branch information
bmcdonald3 committed Nov 3, 2023
1 parent 2e3bdfd commit 13392a1
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 0 deletions.
81 changes: 81 additions & 0 deletions cpp-comparison/read-parquet-old-strings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include "read-parquet.h"

int64_t cpp_getStringColumnNumBytes(std::string filename, std::string colname, void* chpl_offsets, int64_t numElems, int64_t startIdx) {
try {
auto offsets = (int64_t*)chpl_offsets;
int64_t byteSize = 0;

std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::OpenFile(filename, false);

std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
int num_row_groups = file_metadata->num_row_groups();

int64_t i = 0;
for (int r = 0; r < num_row_groups; r++) {
std::shared_ptr<parquet::RowGroupReader> row_group_reader =
parquet_reader->RowGroup(r);

int64_t values_read = 0;

std::shared_ptr<parquet::ColumnReader> column_reader;

int64_t idx = file_metadata -> schema() -> ColumnIndex(colname);

if(idx < 0) {
std::string dname(colname);
std::string fname(filename);
std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
std::cout << msg << std::endl;
return -1;
}
column_reader = row_group_reader->Column(idx);

int16_t definition_level;
parquet::ByteArrayReader* ba_reader =
static_cast<parquet::ByteArrayReader*>(column_reader.get());

std::vector<parquet::ByteArray> string_values(10);
int64_t numRead = 0;
while (ba_reader->HasNext() && numRead < numElems) {
//parquet::ByteArray value;
(void)ba_reader->ReadBatch(10, &definition_level, nullptr, string_values.data(), &values_read);
/*if(values_read > 0) {
offsets[i] = value.len + 1;
byteSize += value.len + 1;
numRead += values_read;
} else {
offsets[i] = 1;
byteSize+=1;
numRead+=1;
}
i++;*/
}
return byteSize;
}
return -1;
} catch (const std::exception& e) {
return -1;
}
}

int main(int argc, char** argv) {
std::string filename = argv[1];
std::string colname = argv[2];

int numElems = 1000000000;

// This is the length of each string, so it is the number
// of strings in size
void* chpl_offsets = (void*)malloc(numElems*sizeof(int));

std::cout << "Getting number of bytes " << std::endl;;
auto start = std::chrono::high_resolution_clock::now();
auto byteSize = cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, 0);
auto finish = std::chrono::high_resolution_clock::now();
auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(finish-start);
std::cout << milliseconds.count()/1000.0 << "s\n";
std::cout << "Byte size " << byteSize << std::endl;

return 0;
}
69 changes: 69 additions & 0 deletions cpp-comparison/read-parquet-strings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "read-parquet.h"

int64_t cpp_getStringColumnNumBytes(std::string filename, std::string colname, void* chpl_offsets, int64_t numElems, int64_t startIdx) {
try {
auto offsets = (int64_t*)chpl_offsets;
int64_t byteSize = 0;

std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::OpenFile(filename, false);

std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
int num_row_groups = file_metadata->num_row_groups();

int64_t i = 0;
for (int r = 0; r < num_row_groups; r++) {
std::shared_ptr<parquet::RowGroupReader> row_group_reader =
parquet_reader->RowGroup(r);

int64_t values_read = 0;

std::shared_ptr<parquet::ColumnReader> column_reader;

int64_t idx = file_metadata -> schema() -> ColumnIndex(colname);

if(idx < 0) {
std::string dname(colname);
std::string fname(filename);
std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
std::cout << msg << std::endl;
return -1;
}
column_reader = row_group_reader->Column(idx);

int16_t definition_level;
parquet::ByteArrayReader* ba_reader =
static_cast<parquet::ByteArrayReader*>(column_reader.get());

int64_t numRead = 0;
std::vector<parquet::ByteArray> string_values(numElems);

(void)ba_reader->ReadBatch(numElems, &definition_level, nullptr, string_values.data(), &values_read);
return byteSize;
}
return -1;
} catch (const std::exception& e) {
return -1;
}
}

int main(int argc, char** argv) {
std::string filename = argv[1];
std::string colname = argv[2];

int numElems = 1000;

// This is the length of each string, so it is the number
// of strings in size
void* chpl_offsets = (void*)malloc(numElems*sizeof(int));

std::cout << "Getting number of bytes " << std::endl;;
auto start = std::chrono::high_resolution_clock::now();
auto byteSize = cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, 0);
auto finish = std::chrono::high_resolution_clock::now();
auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(finish-start);
std::cout << milliseconds.count()/1000.0 << "s\n";
std::cout << "Byte size " << byteSize << std::endl;

return 0;
}

0 comments on commit 13392a1

Please sign in to comment.