forked from Bears-R-Us/arkouda
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add benchmark for CSV Read and write perf (Bears-R-Us#3189)
* Add benchmark for CSV Read and write perf This does the following: - Updates the `time_ak_write`, `time_ak_read`, and `check_correctness` functions in IO.py to take a `fileFormat` argument instead of the old `parquet` argument. - `fileFormat` is an enum to distinguish between the three different filetypes, namely- hdf5, parquet, and csv. - Add support for testing performance of csv reads and write in the above functions - Remove the unused `seed` argument from `time_ak_read` - Add a new benchmark `csvIO.py`. Currently CSV read performance is pretty bad and the benchmark just says 0.0 because of rounding. I have a follow up task to improve CSV read performance. - Update `multiIO.py`, `parquetIO.py`, and `multiParquetIO.py` to work with the new changes to `IO.py` described in the first bullet - Small change to contributing.rst (we omit the `.py`) Signed-off-by: Shreyas Khandekar <[email protected]> * Add graph infrastructure Signed-off-by: Shreyas Khandekar <[email protected]> * Fix errors with correctness tests Signed-off-by: Shreyas Khandekar <[email protected]> * Fix issue with array Truth values in correctness tests Signed-off-by: Shreyas Khandekar <[email protected]> * Change write to read in output for time_ak_read Signed-off-by: Shreyas Khandekar <[email protected]> * Increase significant digits in rate output to 4 Signed-off-by: Shreyas Khandekar <[email protected]> --------- Signed-off-by: Shreyas Khandekar <[email protected]>
- Loading branch information
1 parent
fc95156
commit ea7ef8b
Showing
9 changed files
with
249 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
|
||
from IO import * | ||
|
||
# Dtypes exercised by this CSV IO benchmark; also used to validate --dtype.
TYPES = ("int64", "float64", "uint64", "str")
|
||
|
||
def create_parser():
    """Build the argument parser for the CSV IO benchmark.

    Returns
    -------
    argparse.ArgumentParser
        Parser with positional ``hostname``/``port`` for the arkouda server
        and optional flags controlling problem size, trials, dtype, target
        path, and which phase (write / read / delete) to run.
    """
    # NOTE: description previously said "Parquet" — copy/paste from
    # parquetIO.py; this file benchmarks CSV reads/writes.
    parser = argparse.ArgumentParser(description="Measure performance of CSV reads/writes.")
    parser.add_argument("hostname", help="Hostname of arkouda server")
    parser.add_argument("port", type=int, help="Port of arkouda server")
    parser.add_argument(
        "-n", "--size", type=int, default=10**6, help="Problem size: length of array to read/write"
    )
    parser.add_argument(
        "-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
    )
    parser.add_argument(
        "-d", "--dtype", default="int64", help="Dtype of array ({})".format(", ".join(TYPES))
    )
    parser.add_argument(
        "--correctness-only",
        default=False,
        action="store_true",
        help="Only check correctness, not performance.",
    )
    parser.add_argument(
        "-p",
        "--path",
        # os.path.join fixes the old `os.getcwd() + "ak-io-test"` default,
        # which was missing a separator and pointed *next to* the cwd
        # (e.g. /home/userak-io-test) instead of inside it.
        default=os.path.join(os.getcwd(), "ak-io-test"),
        help="Target path for measuring read/write rates",
    )
    parser.add_argument(
        "-s", "--seed", default=None, type=int, help="Value to initialize random number generator"
    )
    parser.add_argument(
        "-w",
        "--only-write",
        default=False,
        action="store_true",
        help="Only write the files; files will not be removed",
    )
    parser.add_argument(
        "-r",
        "--only-read",
        default=False,
        action="store_true",
        help="Only read the files; files will not be removed",
    )
    parser.add_argument(
        "-f",
        "--only-delete",
        default=False,
        action="store_true",
        help="Only delete files created from writing with this benchmark",
    )
    parser.add_argument(
        "-l", "--files-per-loc", type=int, default=1, help="Number of files to create per locale"
    )
    parser.add_argument(
        "-c",
        "--compression",
        default="",
        action="store",
        help="Compression types to run Parquet benchmarks against. Comma delimited list (NO SPACES) allowing "
        "for multiple. Accepted values: none, snappy, gzip, brotli, zstd, and lz4"
    )
    return parser
|
||
|
||
if __name__ == "__main__":
    import sys

    # Parse CLI args and reject dtypes this benchmark does not cover.
    parser = create_parser()
    args = parser.parse_args()
    if args.dtype not in TYPES:
        raise ValueError("Dtype must be {}, not {}".format("/".join(TYPES), args.dtype))
    # Connect to the arkouda server before issuing any array operations.
    ak.verbose = False
    ak.connect(args.hostname, args.port)
    # Empty -c/--compression means "all compressions" via COMPRESSIONS
    # (imported from IO via star-import — presumably the full list; confirm
    # against IO.py). Otherwise use the user's comma-separated list.
    comp_str = args.compression
    comp_types = COMPRESSIONS if comp_str == "" else comp_str.lower().split(",")

    # Correctness-only mode: validate every dtype, then exit without timing.
    if args.correctness_only:
        for dtype in TYPES:
            check_correctness(dtype, args.path, args.seed, FileFormat.CSV)
        sys.exit(0)

    print("array size = {:,}".format(args.size))
    print("number of trials = ", args.trials)

    # NOTE(review): the -f/--only-delete flag is parsed by create_parser but
    # never checked here — other IO benchmarks appear to gate a
    # remove_files-only path on it; confirm whether that branch was dropped
    # intentionally.
    if args.only_write:
        # Write-only: leave the files on disk for a later --only-read run.
        time_ak_write(
            args.size,
            args.files_per_loc,
            args.trials,
            args.dtype,
            args.path,
            args.seed,
            FileFormat.CSV,
            comp_types,
        )
    elif args.only_read:
        # Read-only: time reads of files produced by a prior --only-write run.
        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.CSV, comp_types)
    else:
        # Default: time a write, then a read, then clean up the files.
        time_ak_write(
            args.size,
            args.files_per_loc,
            args.trials,
            args.dtype,
            args.path,
            args.seed,
            FileFormat.CSV,
            comp_types,
        )
        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.CSV, comp_types)
        remove_files(args.path)

    sys.exit(0)
Oops, something went wrong.