Skip to content

Commit

Permalink
Add support for eager mode performance (#1539)
Browse files Browse the repository at this point in the history
* Add support for eager mode performance

Summary:

Added a "compile" field to "extra_info" that allows us to record eager mode performance as well

Context: eager, eager + compile, and eager + compile + autoquant can all have performance improvements/changes over time, so we want to track:

(1) eager perf on some previous date (configurable by user)
(2) current eager perf
(3) current compile perf
(4) current autoquant + compile perf

Test Plan:
tested locally:
https://gist.github.com/jerryzh168/2a15322b0c8f40f35e52956837c67fec

Reviewers:

Subscribers:

Tasks:

Tags:

* move min_sqnr

* format

* remove redundant headers

* add upload_to_s3 script

* format
  • Loading branch information
jerryzh168 authored Jan 11, 2025
1 parent 24a78fe commit 6d6aa01
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 16 deletions.
32 changes: 19 additions & 13 deletions examples/sam2_amg_server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def set_autoquant(mask_generator, autoquant_type, min_sqnr):
mask_generator.predictor._transforms_device = mask_generator.predictor.device
torch.set_float32_matmul_precision("high")
# NOTE: this fails when we run
# python server.py ~/checkpoints/sam2 large --port 8000 --host localhost --fast --use_autoquant --unittest
# python server.py ~/checkpoints/sam2 large --port 8000 --host localhost --fast --autoquant_type autoquant --unittest
# https://gist.github.com/jerryzh168/d337cb5de0a1dec306069fe48ac8225e
# mask_generator.predictor.model.sam_mask_decoder = autoquant(mask_generator.predictor.model.sam_mask_decoder, qtensor_class_list=DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)

Expand Down Expand Up @@ -508,7 +508,7 @@ def main(

# since autoquant is replicating what furious mode is doing, don't use these two together
if autoquant_type is not None:
assert not furious, "use autoquant can't be used together with furious"
assert not furious, "autoquant can't be used together with furious"
set_autoquant(mask_generator, autoquant_type, min_sqnr)

with open("dog.jpg", "rb") as f:
Expand Down Expand Up @@ -568,10 +568,22 @@ def main(
benchmark_fn(image_tensors_to_masks, random_images, mask_generator)

if output_json_path:
headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
headers = [
"name",
"dtype",
"min_sqnr",
"compile",
"device",
"arch",
"metric",
"actual",
"target",
]
name = "sam2-" + model_type
arch = get_arch_name()
dtype = autoquant_type or "noquant"
# boolean flag to indicate whether it's eager or compile
compile = fast
(
avg_time_per_run,
max_memory_allocated_bytes,
Expand All @@ -580,24 +592,19 @@ def main(
memory_result = [
name,
dtype,
min_sqnr,
compile,
device,
arch,
"memory(MiB)",
max_memory_allocated_bytes,
None,
]
memory_percent_result = [
name,
dtype,
device,
arch,
"memory(%)",
max_memory_allocated_percentage,
None,
]
performance_result = [
name,
dtype,
min_sqnr,
compile,
device,
arch,
"time_s(avg)",
Expand All @@ -610,7 +617,6 @@ def main(
else write_json_result_ossci
)
write_json_result(output_json_path, headers, memory_result)
write_json_result(output_json_path, headers, memory_percent_result)
write_json_result(output_json_path, headers, performance_result)

if profile is not None:
Expand Down
73 changes: 73 additions & 0 deletions scripts/upload_to_s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import io
import json
import os
from functools import lru_cache
from typing import Any

import boto3


@lru_cache
def get_s3_resource() -> Any:
    """Return the boto3 S3 service resource, created once and cached per process."""
    resource = boto3.resource("s3")
    return resource


def upload_to_s3(
    bucket_name: str,
    key: str,
    json_path: str,
) -> None:
    """Upload a newline-delimited JSON (JSON Lines) benchmark file to S3.

    Each non-blank line of the input file is parsed as a standalone JSON
    document, re-serialized one document per line, and uploaded as a single
    object under ``key`` in ``bucket_name``. Upload failures are printed and
    swallowed, so callers should treat this as best-effort.

    Args:
        bucket_name: Destination S3 bucket name.
        key: Object key to write within the bucket.
        json_path: Path to the local JSON Lines file to upload.
    """
    print(f"Writing {json_path} documents to S3")
    data = []
    # NOTE(review): the path is normalized to a .json extension before
    # reading — confirm callers never pass a non-.json path they expect
    # to be read verbatim.
    with open(f"{os.path.splitext(json_path)[0]}.json", "r") as f:
        for line in f:
            # Skip blank lines so a trailing newline or accidental empty
            # line does not crash json.loads.
            if line.strip():
                data.append(json.loads(line))

    body = io.StringIO()
    for benchmark_entry in data:
        json.dump(benchmark_entry, body)
        body.write("\n")

    try:
        get_s3_resource().Object(bucket_name, key).put(
            Body=body.getvalue(),
            ContentType="application/json",
        )
    except Exception as e:
        # Best-effort: report the failure and return rather than raising
        # into the caller (benchmark runs should not die on upload errors).
        print("fail to upload to s3:", e)
        return
    print("Done!")


if __name__ == "__main__":
    import argparse
    import datetime
    import socket

    parser = argparse.ArgumentParser(
        description="Upload benchmark result json file to clickhouse"
    )
    parser.add_argument(
        "--json-path",
        type=str,
        help="json file path to upload to click house",
        required=True,
    )
    args = parser.parse_args()

    # Timestamp for midnight (00:00) of today, so every upload from the
    # same day shares one key suffix.
    today = datetime.date.today()
    today = datetime.datetime.combine(today, datetime.time.min)
    today_timestamp = str(int(today.timestamp()))
    print("Today timestamp:", today_timestamp)

    # Short hostname (equivalent to `hostname -s`) via the stdlib instead
    # of spawning a subprocess — portable to platforms where the hostname
    # CLI differs or is absent.
    hostname = socket.gethostname().split(".")[0]

    upload_to_s3(
        "ossci-benchmarks",
        f"v3/pytorch/ao/{hostname}/torchao-models-" + today_timestamp + ".json",
        args.json_path,
    )
14 changes: 13 additions & 1 deletion torchao/_models/llama/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,7 @@ def callback(x):
"name",
"dtype",
"min_sqnr",
"compile",
"device",
"arch",
"metric",
Expand All @@ -1037,11 +1038,22 @@ def callback(x):
name = checkpoint_path.parent.name
arch = get_arch_name()
dtype = quantization or "noquant"
memory_result = [name, dtype, min_sqnr, device, arch, "mem/s", bandwidth, None]
memory_result = [
name,
dtype,
min_sqnr,
compile,
device,
arch,
"mem/s",
bandwidth,
None,
]
performance_result = [
name,
dtype,
min_sqnr,
compile,
device,
arch,
"tok/s",
Expand Down
5 changes: 5 additions & 0 deletions torchao/_models/sam/eval_combo.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,7 @@ def mlp_only(mod, name):
"name",
"dtype",
"min_sqnr",
"compile",
"device",
"arch",
"metric",
Expand All @@ -651,10 +652,13 @@ def mlp_only(mod, name):
name = sam_model_type
arch = get_arch_name()
dtype = compress or "noquant"
# boolean flag to indicate whether compile is used
compile = use_compile != "False"
memory_result = [
name,
dtype,
min_sqnr,
compile,
device,
arch,
"memory(MiB)",
Expand All @@ -665,6 +669,7 @@ def mlp_only(mod, name):
name,
dtype,
min_sqnr,
compile,
device,
arch,
"img_s(avg)",
Expand Down
8 changes: 6 additions & 2 deletions torchao/_models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def write_json_result_ossci(output_json_path, headers, row):
"name": "TorchAO benchmark",
"mode": "inference",
"dtype": mapping_headers["dtype"],
"min_sqnr": mapping_headers["min_sqnr"],
"extra_info": {
"device": mapping_headers["device"],
"arch": mapping_headers["arch"],
"min_sqnr": mapping_headers["min_sqnr"],
# True means compile is enabled, False means eager mode
"complie": mapping_headers["compile"],
},
},
"model": {
Expand Down Expand Up @@ -80,10 +82,12 @@ def write_json_result_local(output_json_path, headers, row):
"name": "TorchAO benchmark",
"mode": "inference",
"dtype": mapping_headers["dtype"],
"min_sqnr": mapping_headers["min_sqnr"],
"extra_info": {
"device": mapping_headers["device"],
"arch": mapping_headers["arch"],
"min_sqnr": mapping_headers["min_sqnr"],
# True means compile is enabled, False means eager mode
"complie": mapping_headers["compile"],
},
},
"model": {
Expand Down

0 comments on commit 6d6aa01

Please sign in to comment.