Skip to content

Commit

Permalink
Made imdb download (data_imdb) function atomic (#14225)
Browse files Browse the repository at this point in the history
* Made imdb download (data_imdb) function atomic

* Removed numfmt

* Removed numfmt

* Fix bug, add imdb to list of benchmarks in help

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
Spaarsh and alamb authored Jan 23, 2025
1 parent 44cf77f commit 72c0df4
Showing 1 changed file with 42 additions and 12 deletions.
54 changes: 42 additions & 12 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ external_aggr: External aggregation benchmark
h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
**********
* Supported Configuration (Environment Variables)
Expand Down Expand Up @@ -536,23 +537,52 @@ data_imdb() {
done

if [ "$convert_needed" = true ]; then
if [ ! -f "${imdb_dir}/imdb.tgz" ]; then
echo "Downloading IMDB dataset..."
# Expected size of the dataset
expected_size="1263193115" # 1.18 GB

echo -n "Looking for imdb.tgz... "
if [ -f "${imdb_temp_gz}" ]; then
echo "found"
echo -n "Checking size... "
OUTPUT_SIZE=$(wc -c "${imdb_temp_gz}" 2>/dev/null | awk '{print $1}' || true)

#Checking the size of the existing file
if [ "${OUTPUT_SIZE}" = "${expected_size}" ]; then
# Existing file is of the expected size, no need for download
echo "OK ${OUTPUT_SIZE}"
else
# Existing file is partially installed, remove it and initiate a new download
echo "MISMATCH"
echo "Size less than expected: ${OUTPUT_SIZE} found, ${expected_size} required"
echo "Downloading IMDB dataset..."
rm -f "${imdb_temp_gz}"

# Download the dataset
curl -o "${imdb_temp_gz}" "${imdb_url}"

# Size check of the installed file
DOWNLOADED_SIZE=$(wc -c "${imdb_temp_gz}" | awk '{print $1}')
if [ "${DOWNLOADED_SIZE}" != "${expected_size}" ]; then
echo "Error: Download size mismatch"
echo "Expected: ${expected_size}"
echo "Got: ${DOWNLADED_SIZE}"
echo "Please re-initiate the download"
return 1
fi
fi
else
# No existing file found, initiate a new download
echo "not found"
echo "Downloading IMDB dataset ${expected_size} expected)..."
# Download the dataset
curl -o "${imdb_temp_gz}" "${imdb_url}"

# Extract the dataset
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
else
echo "IMDB.tgz already exists."

# Extract the dataset
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
fi

# Extract the dataset
tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
$CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
echo "IMDB dataset downloaded and extracted."

else
echo "IMDB dataset already exists and contains required parquet files."
fi
Expand Down

0 comments on commit 72c0df4

Please sign in to comment.