forked from Bears-R-Us/arkouda
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2e3bdfd
commit 13392a1
Showing
2 changed files
with
150 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#include "read-parquet.h" | ||
|
||
int64_t cpp_getStringColumnNumBytes(std::string filename, std::string colname, void* chpl_offsets, int64_t numElems, int64_t startIdx) { | ||
try { | ||
auto offsets = (int64_t*)chpl_offsets; | ||
int64_t byteSize = 0; | ||
|
||
std::unique_ptr<parquet::ParquetFileReader> parquet_reader = | ||
parquet::ParquetFileReader::OpenFile(filename, false); | ||
|
||
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata(); | ||
int num_row_groups = file_metadata->num_row_groups(); | ||
|
||
int64_t i = 0; | ||
for (int r = 0; r < num_row_groups; r++) { | ||
std::shared_ptr<parquet::RowGroupReader> row_group_reader = | ||
parquet_reader->RowGroup(r); | ||
|
||
int64_t values_read = 0; | ||
|
||
std::shared_ptr<parquet::ColumnReader> column_reader; | ||
|
||
int64_t idx = file_metadata -> schema() -> ColumnIndex(colname); | ||
|
||
if(idx < 0) { | ||
std::string dname(colname); | ||
std::string fname(filename); | ||
std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; | ||
std::cout << msg << std::endl; | ||
return -1; | ||
} | ||
column_reader = row_group_reader->Column(idx); | ||
|
||
int16_t definition_level; | ||
parquet::ByteArrayReader* ba_reader = | ||
static_cast<parquet::ByteArrayReader*>(column_reader.get()); | ||
|
||
std::vector<parquet::ByteArray> string_values(10); | ||
int64_t numRead = 0; | ||
while (ba_reader->HasNext() && numRead < numElems) { | ||
//parquet::ByteArray value; | ||
(void)ba_reader->ReadBatch(10, &definition_level, nullptr, string_values.data(), &values_read); | ||
/*if(values_read > 0) { | ||
offsets[i] = value.len + 1; | ||
byteSize += value.len + 1; | ||
numRead += values_read; | ||
} else { | ||
offsets[i] = 1; | ||
byteSize+=1; | ||
numRead+=1; | ||
} | ||
i++;*/ | ||
} | ||
return byteSize; | ||
} | ||
return -1; | ||
} catch (const std::exception& e) { | ||
return -1; | ||
} | ||
} | ||
|
||
int main(int argc, char** argv) { | ||
std::string filename = argv[1]; | ||
std::string colname = argv[2]; | ||
|
||
int numElems = 1000000000; | ||
|
||
// This is the length of each string, so it is the number | ||
// of strings in size | ||
void* chpl_offsets = (void*)malloc(numElems*sizeof(int)); | ||
|
||
std::cout << "Getting number of bytes " << std::endl;; | ||
auto start = std::chrono::high_resolution_clock::now(); | ||
auto byteSize = cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, 0); | ||
auto finish = std::chrono::high_resolution_clock::now(); | ||
auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(finish-start); | ||
std::cout << milliseconds.count()/1000.0 << "s\n"; | ||
std::cout << "Byte size " << byteSize << std::endl; | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#include "read-parquet.h" | ||
|
||
int64_t cpp_getStringColumnNumBytes(std::string filename, std::string colname, void* chpl_offsets, int64_t numElems, int64_t startIdx) { | ||
try { | ||
auto offsets = (int64_t*)chpl_offsets; | ||
int64_t byteSize = 0; | ||
|
||
std::unique_ptr<parquet::ParquetFileReader> parquet_reader = | ||
parquet::ParquetFileReader::OpenFile(filename, false); | ||
|
||
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata(); | ||
int num_row_groups = file_metadata->num_row_groups(); | ||
|
||
int64_t i = 0; | ||
for (int r = 0; r < num_row_groups; r++) { | ||
std::shared_ptr<parquet::RowGroupReader> row_group_reader = | ||
parquet_reader->RowGroup(r); | ||
|
||
int64_t values_read = 0; | ||
|
||
std::shared_ptr<parquet::ColumnReader> column_reader; | ||
|
||
int64_t idx = file_metadata -> schema() -> ColumnIndex(colname); | ||
|
||
if(idx < 0) { | ||
std::string dname(colname); | ||
std::string fname(filename); | ||
std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; | ||
std::cout << msg << std::endl; | ||
return -1; | ||
} | ||
column_reader = row_group_reader->Column(idx); | ||
|
||
int16_t definition_level; | ||
parquet::ByteArrayReader* ba_reader = | ||
static_cast<parquet::ByteArrayReader*>(column_reader.get()); | ||
|
||
int64_t numRead = 0; | ||
std::vector<parquet::ByteArray> string_values(numElems); | ||
|
||
(void)ba_reader->ReadBatch(numElems, &definition_level, nullptr, string_values.data(), &values_read); | ||
return byteSize; | ||
} | ||
return -1; | ||
} catch (const std::exception& e) { | ||
return -1; | ||
} | ||
} | ||
|
||
int main(int argc, char** argv) { | ||
std::string filename = argv[1]; | ||
std::string colname = argv[2]; | ||
|
||
int numElems = 1000; | ||
|
||
// This is the length of each string, so it is the number | ||
// of strings in size | ||
void* chpl_offsets = (void*)malloc(numElems*sizeof(int)); | ||
|
||
std::cout << "Getting number of bytes " << std::endl;; | ||
auto start = std::chrono::high_resolution_clock::now(); | ||
auto byteSize = cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, 0); | ||
auto finish = std::chrono::high_resolution_clock::now(); | ||
auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(finish-start); | ||
std::cout << milliseconds.count()/1000.0 << "s\n"; | ||
std::cout << "Byte size " << byteSize << std::endl; | ||
|
||
return 0; | ||
} |