From fe06a847b827a78a0fc907be46614984092aa8d7 Mon Sep 17 00:00:00 2001
From: changwangss
Date: Tue, 11 Jun 2024 00:13:31 -0700
Subject: [PATCH] support extension

Signed-off-by: changwangss
---
 .../quantization/llm_quantization_recipes.md | 26 +++++++++----------
 .../quantization/run_benchmark.sh            |  4 +--
 .../quantization/run_generation_sq.py        | 17 +++++++-----
 .../transformers/utils/config.py             |  1 +
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
index ce1764bc0bb..f3c697d0680 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md
@@ -61,7 +61,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.85
 ```
 
@@ -115,7 +115,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.9
 ```
 
@@ -169,7 +169,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.5
 ```
 
@@ -340,7 +340,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.8
 ```
 
@@ -394,7 +394,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.9
 ```
 
@@ -500,7 +500,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.95
 ```
 
@@ -554,7 +554,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.65
 ```
 
@@ -662,7 +662,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.75
 ```
 
@@ -715,7 +715,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.9
 ```
 
@@ -768,7 +768,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.6
 ```
 
@@ -821,7 +821,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.7
 ```
 
@@ -874,7 +874,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.75
 ```
 
@@ -927,7 +927,7 @@ python run_generation_sq.py \
     --tasks lambada_openai \
     --sq \
     --accuracy \
-    --batch_size 56 \
+    --eval_batch_size 56 \
     --alpha 0.75
 ```
 
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
index e74b7077be0..be12520c902 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
@@ -73,7 +73,7 @@ function run_benchmark {
         extra_cmd=$extra_cmd" --tasks ${lm_eval_tasks}"
     elif [[ ${mode} == "benchmark" ]]; then
        mode_cmd=" --benchmark "
-        extra_cmd=$extra_cmd" --iters ${iters}"
+        extra_cmd=$extra_cmd" --benchmark_iters ${iters}"
     else
         echo "Error: No such mode: ${mode}"
         exit 1
@@ -248,7 +248,7 @@ function run_benchmark {
     elif [ "${script}" == "run_generation_cpu_woq.py" ];then
         python -u ./${script} \
             --model ${model_name_or_path} \
-            --batch_size ${batch_size} \
+            --eval_batch_size ${batch_size} \
             ${mode_cmd} \
             ${extra_cmd}
     else
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py
index b32655bac12..7b34ea720f7 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py
@@ -43,11 +43,12 @@
 )
 # ============Benchmark configs==============
 parser.add_argument("--benchmark", action="store_true")
-parser.add_argument("--iters", default=100, type=int, help="num iter")
+parser.add_argument("--benchmark_iters", default=100, type=int, help="number of benchmark iterations")
+parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size for benchmark")
 parser.add_argument("--num_warmup", default=10, type=int, help="num warmup")
 # ============Accuracy configs==============
 parser.add_argument("--accuracy", action="store_true")
-parser.add_argument("--batch_size", default=56, type=int, help="batch size num.")
+parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size for accuracy evaluation.")
 parser.add_argument(
     "--tasks",
     default="lambada_openai",
@@ -65,6 +66,7 @@
 parser.add_argument(
     "--seq_len", default=512, type=int, help="Smooth quant calibration input length."
 )
+parser.add_argument("--batch_size", default=1, type=int, help="Smooth quant calibration batch size.")
 # sq alpha "auto" parameters
 parser.add_argument("--scale_sharing", action="store_true")
 parser.add_argument(
@@ -138,6 +140,7 @@
         tokenizer=tokenizer,
         seq_len=args.seq_len,
         n_samples=args.n_samples,
+        batch_size=args.batch_size,
         excluded_precisions=excluded_precisions,
         alpha=args.alpha if args.alpha == "auto" else float(args.alpha),
         scale_sharing=args.scale_sharing,
@@ -205,7 +208,7 @@
 
     # start
     total_time = 0.0
-    num_iter = args.iters
+    num_iter = args.benchmark_iters
     num_warmup = args.num_warmup
     total_token_num = 0
     eos_token_id = tokenizer.eos_token_id
@@ -215,7 +218,7 @@
         # for chatglm2 only
         if hasattr(tokenizer, "build_chat_input"):
             input_ids = tokenizer.build_chat_input(prompt)["input_ids"]
-            input_ids = input_ids.repeat(args.batch_size, 1)
+            input_ids = input_ids.repeat(args.benchmark_batch_size, 1)
             eos_token_id = [
                 tokenizer.eos_token_id,
                 tokenizer.get_command("<|user|>"),
@@ -225,11 +228,11 @@
         elif hasattr(tokenizer, "build_prompt"):
             build_prompt = tokenizer.build_prompt(prompt)
             input_ids = tokenizer(
-                [build_prompt] * args.batch_size, return_tensors="pt"
+                [build_prompt] * args.benchmark_batch_size, return_tensors="pt"
             ).input_ids
         else:
             input_ids = tokenizer(
-                [prompt] * args.batch_size, return_tensors="pt"
+                [prompt] * args.benchmark_batch_size, return_tensors="pt"
             ).input_ids
         gen_ids = user_model.generate(
             input_ids,
@@ -270,7 +273,7 @@
         user_model=user_model,
         tasks=args.tasks,
         device="cpu",
-        batch_size=args.batch_size,
+        batch_size=args.eval_batch_size,
     )
     results = evaluate(args)
     for task_name in args.tasks.split(","):
diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 4fec7d089c4..3f6da8c8a05 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -780,6 +780,7 @@ def __init__(
         self.ipex_opt_llm = ipex_opt_llm
         self.num_beams = num_beams
         self.excluded_precisions = excluded_precisions
+        self.batch_size = kwargs.pop("batch_size", 1)
 
 
 class RtnConfig(ITREXQuantizationConfigMixin):
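
Note for reviewers: a minimal sketch of how the example script is driven after this patch. This is illustrative only, not part of the patch; the model path and the calibration batch size value below are placeholder assumptions, while the flag names and defaults come from the diff above.

```bash
# SmoothQuant + accuracy: the lm-eval batch size moves to --eval_batch_size,
# while --batch_size now means the SmoothQuant calibration batch size
# (default 1) and is forwarded to the quantization config as the new
# batch_size kwarg.
python run_generation_sq.py \
    --model EleutherAI/gpt-j-6b \
    --tasks lambada_openai \
    --sq \
    --accuracy \
    --batch_size 8 \
    --eval_batch_size 56 \
    --alpha 0.85

# Benchmarking: --iters is renamed to --benchmark_iters, and the generation
# batch size is controlled by the new --benchmark_batch_size flag.
python run_generation_sq.py \
    --model EleutherAI/gpt-j-6b \
    --benchmark \
    --benchmark_iters 100 \
    --benchmark_batch_size 1
```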