"""Benchmark zarr against HDF5 for storing synthetic trajectory data.

The script generates deterministic trajectories, writes each one to a zarr
store and to an HDF5 file using the same group layout, and then plots

* on-disk size versus number of atoms, and
* time to open a file and touch every frame versus number of atoms.

The synthetic data generator is based on
MDAnalysisTests/data/coordinates/create_data.py.

``zarr``, ``h5py`` and ``matplotlib`` are imported inside the functions that
use them so that the data generator stays importable with only NumPy
installed.
"""

import os
import subprocess
import time

import numpy as np

# Atom counts for which trajectory files are written and sized.
ATOM_COUNTS = range(1000, 50001, 500)
# Atom counts for which iteration is timed (a subset of ATOM_COUNTS).
TIMED_ATOM_COUNTS = range(1000, 50001, 2000)
# Number of frames in every benchmark trajectory.
N_FRAMES = 5


def generate_traj(n_atoms, frames):
    """Create a deterministic synthetic trajectory.

    Parameters
    ----------
    n_atoms : int
        Number of atoms per frame.
    frames : int
        Number of frames to generate.

    Returns
    -------
    list
        ``[step, dimensions, positions, velocities, forces, time]`` where

        * ``step`` is an int32 array ``(frames,)`` holding ``0..frames-1``,
        * ``dimensions`` is a float64 array ``(frames, 6)`` of box values,
        * ``positions``, ``velocities`` and ``forces`` are float32 arrays
          ``(frames, n_atoms, 3)``,
        * ``time`` is a float32 array ``(frames,)`` holding ``0..frames-1``.
    """
    pos = np.arange(3 * n_atoms).reshape(n_atoms, 3)
    orig_box = np.array([81.1, 82.2, 83.3, 75, 80, 85], dtype=np.float32)

    step = np.arange(frames, dtype=np.int32)
    times = np.arange(frames, dtype=np.float32)
    positions = np.empty((frames, n_atoms, 3), dtype=np.float32)
    dimensions = np.empty((frames, 6))

    for i in range(frames):
        # A float scale factor keeps the product out of int64 arithmetic,
        # which would overflow once 2**i no longer fits in 64 bits.
        positions[i] = 2.0 ** i * pos
        dimensions[i, :3] = orig_box[:3] + i
        dimensions[i, 3:] = orig_box[3:] + i * 0.1

    velocities = positions / 10
    forces = positions / 100

    # The first element is the per-frame step array, not the frame count:
    # callers store it directly as the 'step' dataset.
    return [step, dimensions, positions, velocities, forces, times]


def _zarr_path(n_atoms, frames):
    """Return the path of the zarr store for the given trajectory size."""
    return f'zarrfiles/zarr_{n_atoms}_{frames}.zarr'


def _h5_path(n_atoms, frames):
    """Return the path of the HDF5 file for the given trajectory size."""
    return f'h5files/h5_{n_atoms}_{frames}.h5'


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it is missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _write_traj(root, n_atoms, frames, **value_opts):
    """Write a generated trajectory below *root*.

    *root* is an open zarr or h5py group; only ``create_group`` and
    ``create_dataset`` are called on it.  The layout written is::

        particles/group1/box/edges/{step,value}
        particles/group1/{position,velocity,force}/{value,step,time}

    *value_opts* are the backend-specific compression keywords and are
    applied to the three per-atom ``value`` datasets only.
    """
    step, dimensions, positions, velocities, forces, times = generate_traj(
        n_atoms, frames
    )

    group1 = root.create_group('particles').create_group('group1')
    edges = group1.create_group('box').create_group('edges')
    edges.create_dataset('step', data=step, dtype=np.int32)
    edges.create_dataset('value', data=dimensions, dtype=np.float32)

    per_atom = (
        ('position', positions),
        ('velocity', velocities),
        ('force', forces),
    )
    for name, values in per_atom:
        element = group1.create_group(name)
        # One chunk per frame, so reading a frame touches exactly one chunk.
        element.create_dataset('value', data=values, chunks=(1, n_atoms, 3),
                               dtype=np.float32, **value_opts)
        element.create_dataset('step', data=step, dtype=np.int32)
        element.create_dataset('time', data=times, dtype=np.float32)


def create_zarr_traj(n_atoms, frames, compressor):
    """Write a synthetic trajectory to a zarr store and return its path.

    Parameters
    ----------
    n_atoms : int
        Number of atoms per frame.
    frames : int
        Number of frames.
    compressor
        Compressor passed to zarr for the position, velocity and force
        ``value`` arrays.

    Returns
    -------
    str
        Path of the store that was written.
    """
    import zarr

    path = _zarr_path(n_atoms, frames)
    _ensure_parent_dir(path)
    # mode='w' replaces an existing store, mirroring the 'w' mode used for
    # the HDF5 file, so the benchmark can be re-run without
    # "group already exists" errors.
    root = zarr.open(path, mode='w')
    _write_traj(root, n_atoms, frames, compressor=compressor)

    # Return filename to make it easy to open file
    return path


def create_hdf5_traj(n_atoms, frames, compression, compression_opts):
    """Write a synthetic trajectory to an HDF5 file and return its path.

    Uses the same group layout as :func:`create_zarr_traj` so that the two
    formats can be compared fairly.

    Parameters
    ----------
    n_atoms : int
        Number of atoms per frame.
    frames : int
        Number of frames.
    compression, compression_opts
        Passed to h5py for the position, velocity and force ``value``
        datasets.

    Returns
    -------
    str
        Path of the file that was written.
    """
    import h5py

    path = _h5_path(n_atoms, frames)
    _ensure_parent_dir(path)
    with h5py.File(path, 'w') as root:
        _write_traj(root, n_atoms, frames, compression=compression,
                    compression_opts=compression_opts)

    # Return filename to make it easy to open file
    return path


def _disk_usage_kib(path):
    """Return the disk usage of *path* in KiB as reported by ``du``.

    ``-k`` pins the unit to 1024-byte blocks; without it the block size
    used by ``du -s`` depends on the platform and environment.
    """
    completed = subprocess.run(['du', '-sk', path], check=True,
                               capture_output=True, text=True)
    return int(completed.stdout.split()[0])


def zarr_filesize(filename):
    """Return the on-disk size of a zarr store in KiB."""
    return _disk_usage_kib(filename)


def h5_filesize(filename):
    """Return the on-disk size of an HDF5 file in KiB."""
    return _disk_usage_kib(filename)


def _touch_every_frame(pos_vals):
    """Read every frame of *pos_vals* and return the sum of element [0][0].

    ``pos_vals[i]`` is indexed first on purpose: it loads frame ``i`` before
    a single value is picked out of it, which is the access being timed.
    """
    total = 0
    for i in range(len(pos_vals)):
        # arbitrary task that requires accessing third dimension
        total += pos_vals[i][0][0]
    return total


def zarr_iterate_frames(filename):
    """Time opening a zarr trajectory and touching every position frame.

    Returns the elapsed wall-clock time in seconds.
    """
    import zarr

    start = time.perf_counter()
    # Read-only: mode='a' would silently create an empty store when the
    # path is wrong instead of failing.
    root = zarr.open(filename, mode='r')
    _touch_every_frame(root['particles/group1/position/value'])
    return time.perf_counter() - start


def h5_iterate_frames(filename):
    """Time opening, iterating and closing an HDF5 trajectory.

    Returns the elapsed wall-clock time in seconds.
    """
    import h5py

    start = time.perf_counter()
    with h5py.File(filename, 'r') as root:
        _touch_every_frame(root['particles/group1/position/value'])
    # The clock is stopped after the file has been closed.
    return time.perf_counter() - start


def create_benchmark_files(atom_counts=ATOM_COUNTS, frames=N_FRAMES):
    """Write one zarr store and one HDF5 file for every atom count."""
    import zarr

    # The compressor does not depend on the atom count; build it once.
    compressor = zarr.Blosc(cname='zstd', clevel=9,
                            shuffle=zarr.Blosc.SHUFFLE)
    for n_atoms in atom_counts:
        create_zarr_traj(n_atoms, frames, compressor)
        create_hdf5_traj(n_atoms, frames, compression='gzip',
                         compression_opts=9)


def plot_filesizes(atom_counts=ATOM_COUNTS, frames=N_FRAMES):
    """Scatter-plot file size against number of atoms for both formats.

    The files must already exist (see :func:`create_benchmark_files`).
    Returns the matplotlib figure.
    """
    import matplotlib.pyplot as plt

    counts = list(atom_counts)
    z_filesize = [zarr_filesize(_zarr_path(n, frames)) for n in counts]
    h_filesize = [h5_filesize(_h5_path(n, frames)) for n in counts]

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of atoms')
    ax.set_ylabel('Trajectory filesize (KiB)')
    ax.scatter(counts, z_filesize, c='blue', s=10, label='zarr')
    ax.scatter(counts, h_filesize, c='red', s=10, label='hdf5')
    ax.legend()
    return fig


def plot_iteration_times(atom_counts=TIMED_ATOM_COUNTS, frames=N_FRAMES):
    """Scatter-plot frame iteration time against number of atoms.

    The files must already exist (see :func:`create_benchmark_files`).
    Returns the matplotlib figure.
    """
    import matplotlib.pyplot as plt

    counts = list(atom_counts)
    z_iteration_time = [
        zarr_iterate_frames(_zarr_path(n, frames)) for n in counts
    ]
    h_iteration_time = [
        h5_iterate_frames(_h5_path(n, frames)) for n in counts
    ]

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of atoms')
    ax.set_ylabel('Iteration time (s)')
    ax.scatter(counts, z_iteration_time, c='blue', label='zarr')
    ax.scatter(counts, h_iteration_time, c='red', label='hdf5')
    ax.legend()
    return fig


def main():
    """Create the benchmark files and show both comparison plots."""
    import matplotlib.pyplot as plt

    create_benchmark_files()
    plot_filesizes()
    plot_iteration_times()
    plt.show()


if __name__ == "__main__":
    main()