[GLUTEN-7641][VL] Add Gluten benchmark scripts #7642

Merged · 13 commits · Nov 14, 2024
42 changes: 42 additions & 0 deletions tools/notebook/README.md
@@ -0,0 +1,42 @@
# Set Up, Build and Benchmark Spark/Gluten with Jupyter Notebook
> **Member:** Is the PR a work in progress or ready to merge? As far as I can see, the contents of `tools/notebook` and `tools/workload` are identical.

> **Contributor:** WIP

> **Contributor (author):** @zhztheplayer Moved the contents of `tools/notebook` to `tools/workload/benchmark_velox`.


This guide provides notebooks and scripts for conducting performance testing in Gluten. The standard approach involves setting up the test environment on a bare-metal machine or cloud instance and running performance tests with TPC-H/TPC-DS workloads. These scripts enable users to reproduce our performance results in their own environment.

## Environment Setup

The recommended OS is Ubuntu 22.04 with kernel 5.15. To prepare the environment, run [initialize.ipynb](./initialize.ipynb) (see the example run command after this list), which will:

- Install system dependencies and set up Jupyter Notebook
- Configure Hadoop and Spark
- Configure kernel parameters
- Install monitoring tools (e.g., sar, emon)
> **Contributor (inline comment):** Let's remove emon.

- Build Gluten using Docker
- Generate TPC-H/TPC-DS tables
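
If you prefer to run the setup notebook non-interactively, a minimal sketch using Papermill (introduced in the next section) is shown below; the output file name is illustrative only:

```bash
# Illustrative: execute the setup notebook headlessly with Papermill,
# keeping the executed copy for review (the output name is hypothetical).
papermill initialize.ipynb initialize_output.ipynb
```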

## Running TPC-H/TPC-DS Benchmarks

To run TPC-H/TPC-DS benchmarks, use [tpc_workload.ipynb](./tpc_workload.ipynb). You can create a copy of the notebook and modify its parameters to run different workloads. However, creating and modifying a copy each time you change workloads can be inconvenient. Instead, it's recommended to use Papermill to pass parameters via the command line for greater flexibility.
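
Papermill is available from PyPI if it is not already installed:

```bash
pip install papermill
```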

The required parameters are specified in [params.yaml.template](./params.yaml.template). To use it, create your own YAML file by copying and modifying the template. The command to run the notebook is:

```bash
papermill tpc_workload.ipynb -f params.yaml gluten_tpch.ipynb
```
After execution, the output notebook will be saved as `gluten_tpch.ipynb`.
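
For orientation, a parameter file might look like the sketch below; the `workload` key is the one used in the override example that follows, and any other contents are hypothetical. [params.yaml.template](./params.yaml.template) remains the authoritative reference:

```yaml
# Hypothetical excerpt of a params.yaml; copy params.yaml.template for the real keys.
workload: tpch   # 'tpch' or 'tpcds'
```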

If you want to override individual parameters, you can specify them via the `-p` option, which takes precedence over the values defined in `params.yaml`. For example, to switch to the TPC-DS workload, run:

```bash
papermill tpc_workload.ipynb -f params.yaml -p workload tpcds gluten_tpcds.ipynb
```

Please refer to the Papermill documentation for additional usage details.

We also provide a script, [run_tpc_workload.sh](./run_tpc_workload.sh), which wraps the Papermill command and automatically names the output notebook with a timestamp and application ID so that existing output files are not overwritten.
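
A minimal sketch of the idea behind such a wrapper (illustrative only; the actual script also appends the application ID, which is omitted here):

```bash
#!/bin/bash
# Sketch, not the actual run_tpc_workload.sh: tag the output notebook with a
# timestamp so repeated runs do not overwrite each other.
ts=$(date +%Y%m%d_%H%M%S)
papermill tpc_workload.ipynb -f params.yaml "gluten_tpch_${ts}.ipynb"
```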

## Analyzing Performance Results

You can check the **Show Performance** section in the output notebook after execution. It shows the CPU utilization for each query and draws charts of CPU utilization, memory throughput, disk throughput and utilization, network throughput, and page faults.

For more detailed metrics, we use a performance analysis cluster to analyze the output from the event log and system monitors (TBD).

81 changes: 81 additions & 0 deletions tools/notebook/init_disks.py
@@ -0,0 +1,81 @@
"""Interactive helper to format raw disks and mount the resulting partitions.

Must be run as root. Requires the third-party 'questionary' package.
"""
import json
import subprocess
import sys

import questionary

def yes_or_no(question):
    """Prompt until the user answers 'yes' or 'no'; 'quit' exits the program."""
while True:
user_input = input(question + '(yes/no/quit): ')
if user_input.lower() == 'yes':
return True
elif user_input.lower() == 'no':
return False
elif user_input.lower() == 'quit':
sys.exit(0)
else:
continue

def filter_empty_str(l):
return [x for x in l if x]

def run_and_log(cmd):
    """Echo the shell command in green, then run it and print its output."""
print('\033[92m' + '>>> Running command: ' + repr(cmd) + '\033[0m')
result = subprocess.run(cmd, check=True, shell=True, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

def init_disks():
    """Wipe selected disks, create a GPT table with one partition, and format it as ext4."""
    # Major device numbers: 7 = loop, 8 = SCSI/SATA, 259 = NVMe (blkext).
    all_disks = filter_empty_str(subprocess.run("lsblk -I 7,8,259 -npd --output NAME".split(' '), capture_output=True, text=True).stdout.split('\n'))
if not all_disks:
print("No disks found on system. Exit.")
sys.exit(0)

answer = False
disks = []
while not answer:
disks = questionary.checkbox('Select disks to initialize:', choices=all_disks).ask()
answer = yes_or_no('Confirm selected:\n' + '\n'.join(disks) + '\n')

if not disks:
print('No disks are selected.')
return

    for d in disks:
        print('Initializing {} ...'.format(d))
        run_and_log('wipefs -a {}'.format(d))
        # Drive fdisk non-interactively: 'g' creates a new GPT table, 'n' creates
        # a partition with the default number/start/end, 'w' writes the changes.
        run_and_log('echo "g\nw\n" | fdisk {}'.format(d))
        run_and_log('echo "n\n\n\n\nw\n" | fdisk {}'.format(d))
        # Partition naming: NVMe devices use a 'p' separator (nvme0n1p1),
        # while sd devices do not (sda1).
        part = '{}p1'.format(d) if d[-1].isdigit() else '{}1'.format(d)
        run_and_log('mkfs.ext4 {}'.format(part))

def mount_partitions():
    """Label the selected ext4 partitions data0..dataN and mount each at /data<N>."""
    # Dump the block-device tree as JSON, then parse it to find ext4 partitions.
    subprocess.run('lsblk -pf --json > lsblk.json', shell=True)
partitions = []
with open('lsblk.json', 'r') as f:
data = json.load(f)
for d in data['blockdevices']:
if 'children' in d:
for c in d['children']:
if c['fstype'] == 'ext4':
partitions.append(c['name'])
    answer = False
    while not answer:
        partitions = questionary.checkbox('Select partitions to create mount points:', choices=partitions).ask()
        answer = yes_or_no('Confirm selected:\n' + '\n'.join(partitions) + '\n')

    if not partitions:
        print('No partitions are selected.')
        return

    for i, p in enumerate(partitions):
        d = 'data{}'.format(i)
        # Clear any stale label, then label the partition data<i>.
        run_and_log('e2label {} ""'.format(p))
        run_and_log('e2label {} {}'.format(p, d))
run_and_log('mkdir -p /{}'.format(d))
run_and_log('mount -L {} /{}'.format(d, d))

def choose():
choice = questionary.select('Select operation:', choices=['Format disks', 'Mount partitions']).ask()
print(choice)
if choice == 'Format disks':
init_disks()
elif choice == 'Mount partitions':
mount_partitions()

if __name__ == '__main__':
choose()
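
A possible invocation, assuming the `questionary` dependency is installed; root privileges are required because the script calls `wipefs`, `fdisk`, `mkfs.ext4`, and `mount`:

```bash
# Install the interactive-prompt dependency, then run the helper as root.
pip install questionary
sudo python3 init_disks.py
```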