diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json
index 260c21dad7a..2957e1008ad 100644
--- a/examples/.config/pytorch_optimize.json
+++ b/examples/.config/pytorch_optimize.json
@@ -1580,7 +1580,8 @@
         "params": {
           "topology": "mistral_7b_autoround",
           "task": "generation",
-          "output_model": "saved_results"
+          "output_model": "saved_results",
+          "weight_dtype": "int4_clip"
         }
       },
       "benchmark": {
@@ -1590,11 +1591,10 @@
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
-          "batch_size": "112",
+          "batch_size": "10",
           "iters": "100",
           "int8": "false",
-          "config": "saved_results",
-          "weight_dtype": "int4_clip"
+          "config": "saved_results"
         }
       }
     },
@@ -1616,7 +1616,7 @@
           "task": "generation",
           "mode": "benchmark",
           "backend": "neuralspeed",
-          "batch_size": "112",
+          "batch_size": "10",
           "iters": "100",
           "int8": "false",
           "config": "saved_results"
@@ -1642,7 +1642,7 @@
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
-          "batch_size": "112",
+          "batch_size": "10",
           "iters": "100",
           "int8": "false",
           "config": "saved_results"
@@ -1732,7 +1732,7 @@
           "task": "generation",
           "backend": "neuralspeed",
           "mode": "benchmark",
-          "batch_size": "112",
+          "batch_size": "10",
           "iters": "100",
           "int8": "false",
           "config": "saved_results",
@@ -1750,7 +1750,7 @@
           "task": "generation",
           "mode": "benchmark",
           "backend": "neuralspeed",
-          "batch_size": "112",
+          "batch_size": "10",
           "iters": "100",
           "int8": "false",
           "config": "saved_results",
diff --git a/examples/huggingface/neural_speed/perplexity/requirements.txt b/examples/huggingface/neural_speed/perplexity/requirements.txt
index edf08020849..ae2da4b19d9 100644
--- a/examples/huggingface/neural_speed/perplexity/requirements.txt
+++ b/examples/huggingface/neural_speed/perplexity/requirements.txt
@@ -13,4 +13,4 @@ tiktoken
 py-cpuinfo
 cmake
 gguf
-neural-speed==1.0a0
+neural-speed
diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt
index a5d89166463..34ce3169441 100644
--- a/examples/huggingface/neural_speed/requirements.txt
+++ b/examples/huggingface/neural_speed/requirements.txt
@@ -1,5 +1,5 @@
 intel_extension_for_transformers
-neural-speed==1.0a0
+neural-speed
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
 sentencepiece
 gguf
diff --git a/examples/huggingface/neural_speed/run_accuracy.py b/examples/huggingface/neural_speed/run_accuracy.py
index 27da2985674..3a38f6440cd 100644
--- a/examples/huggingface/neural_speed/run_accuracy.py
+++ b/examples/huggingface/neural_speed/run_accuracy.py
@@ -19,7 +19,7 @@
 parser = argparse.ArgumentParser(description="Evaluate diff for a model")
 parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf", help="path to model")
 parser.add_argument('--tasks', type=str, default="lambada_openai")
-parser.add_argument('--model_format', type=str, default="runtime")
+parser.add_argument('--model_format', type=str, default="neural_speed")
 parser.add_argument('--use_gptq', action='store_true')
 parser.add_argument('--batch_size', type=int, default=1)
 args = parser.parse_args()
@@ -27,7 +27,7 @@
 model_args=f'pretrained="{args.model_name}",dtype=float32,trust_remote_code=True'
 if args.use_gptq:
     model_args += ",use_gptq=True"
-if args.model_format == "runtime":
+if args.model_format == "neural_speed":
     results = evaluate(
         model="hf-causal",
         model_args=model_args,
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
index 130c3ef6fd4..ed1e1d78ce5 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
@@ -13,7 +13,8 @@ bitsandbytes #baichuan
 transformers_stream_generator
 tiktoken #qwen
 einops #qwen
-neural-speed
+git+https://github.com/intel/neural-speed.git@v1.0.1.dev0
 auto-round
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+huggingface_hub
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
index 6822469a9ff..6c65bf26133 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
@@ -163,6 +163,8 @@ function run_benchmark {
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
+    elif [ "${topology}" = "mistral_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
+        model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     fi

     if [[ ${int8} == "true" ]]; then
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
index 8fa664160c9..956331387d3 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
@@ -250,7 +250,10 @@
 args.model = args.peft_model_id if args.peft_model_id is not None else args.model

 # Generation
-generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
+if args.use_neural_speed:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
+else:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)

 # mp/sq/woq/bitsandbytes config setting
 quantization_config = None
@@ -478,10 +481,9 @@

 if args.benchmark:
     user_model = (
-        user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model
+        user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) else user_model
     )
     prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
-
     input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
     print("---- Prompt size:", input_size)
@@ -521,7 +523,7 @@
         toc = time.time()
         # please check the gen_ids if include input_ids.
         input_tokens_num = input_ids.numel()
-        output_tokens_num = gen_ids.numel() - input_tokens_num
+        output_tokens_num = torch.tensor(gen_ids).numel() - input_tokens_num
         print(gen_text, flush=True)
         if i >= num_warmup:
             total_time += toc - tic
@@ -534,18 +536,30 @@
     print("Throughput: {} samples/sec".format(throughput))

 if args.accuracy:
-    user_model = (user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model)
+    user_model = (user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) \
+        else user_model)
     args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model)
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
+    pretrained = ',pretrained=' + args.model
     args._commit_hash = "main" if args._commit_hash is None else args._commit_hash
+    eval_args = "tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + \
+        args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code)
+    if args.use_neural_speed:
+        eval_args += pretrained
+        q_conf = user_model.config.quantization_config
+        if isinstance(q_conf, dict):
+            q_algo = q_conf.get("quant_method", None)
+        else:
+            q_algo = q_conf.quant_method.value
+        if q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"]:
+            eval_args += ",use_gptq=True"
     results = evaluate(
         model="hf-causal",
-        model_args="pretrained=" + args.model + ",tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + args._commit_hash +
-        ",trust_remote_code=" + str(args.trust_remote_code),
+        model_args=eval_args,
         user_model=user_model,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        model_format="neural_speed" if args.use_neural_speed else "torch",
     )
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
index 09ef3869e19..b18d7f3888d 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
@@ -323,7 +323,7 @@

     results = evaluate(
         model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + \
+        model_args='tokenizer=' + args.model + \
             ',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
         user_model=user_model,
         batch_size=args.batch_size,
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
index eeb1150fad5..a2753405a4c 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
@@ -220,7 +220,7 @@ function run_tuning {
         extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
     elif [ "${topology}" = "mistral_7b_rtn" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
-        extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
+        extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
        extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py
index ae5941ebde1..f0850a10d48 100644
--- a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py
+++ b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py
@@ -124,6 +124,8 @@ def evaluate(model,
     }
     if user_model:
         kwargs["init_empty_weights"] = True
+        if "pretrained" not in model_args:
+            model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args

     if device == "hpu":
         # if hpu, set user_model
diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py
index d6c66cda2c8..638bfdbd591 100644
--- a/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py
+++ b/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py
@@ -465,7 +465,7 @@ def add_special_tokens(self) -> bool:
         """
         if self._add_special_tokens is not None:
             return self._add_special_tokens
-        elif self.model_format == "runtime":
+        elif self.model_format == "neural_speed":
             return True
         elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
             return False
@@ -614,7 +614,7 @@ class AutoCausalLM(HuggingFaceAutoLM):

     def __init__(self, *args, pretrained, model_format, **kwargs):
         self.model_format = model_format
-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, GPTQConfig, AutoRoundConfig
             use_gptq = kwargs.pop("use_gptq", False)
             if use_gptq:
@@ -623,11 +623,11 @@ def __init__(self, *args, pretrained, model_format, **kwargs):
                 self.woq_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4")
         super().__init__(*args, pretrained=pretrained, model_format=model_format, **kwargs)

-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from transformers import AutoTokenizer, TextStreamer
             from intel_extension_for_transformers.transformers import AutoModelForCausalLM
             self.runtime_model = AutoModelForCausalLM.from_pretrained(pretrained, quantization_config=self.woq_config,
-                                                                      trust_remote_code=kwargs.get("trust_remote_code", False))
+                                                                      use_neural_speed=True, trust_remote_code=kwargs.get("trust_remote_code", False))

         if self.model_format == "onnx":
             if not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) and \
@@ -758,7 +758,7 @@ def _model_call(
             input_bs, input_len = inputs.shape
             bos = torch.tensor([64790, 64792]).repeat(input_bs, 1)
             inputs = torch.cat((bos, inputs), 1)
-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             out = self.runtime_model(inputs, reinit=True, logits_all=True, ignore_padding=True)
             output = {"logits": torch.from_numpy(out)}
         elif self.model_format != "onnx":
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index a087489f4e0..86afcf33105 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -398,12 +398,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         else:
             use_neural_speed = False
-        if hasattr(config, "quantization_config") and not use_neural_speed:
+        if hasattr(config, "quantization_config"):
             if config.quantization_config is None:
                 logger.warning(
                     "Quantization_config loading failed. If you want to load saved "
                     "low bit model, please check your quantizate_config.json."
                 )
+            elif use_neural_speed:
+                if not os.path.exists(pretrained_model_name_or_path):
+                    from huggingface_hub import snapshot_download
+                    pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path,
+                                                                      allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
+                                                                      )
+                if quantization_config is None:
+                    ConfigInit = {"rtn": RtnConfig,
+                                  "awq": AwqConfig,
+                                  "teq": TeqConfig,
+                                  "gptq": GPTQConfig,
+                                  "autoround": AutoRoundConfig,
+                                  }
+                    quantization_config = config.quantization_config
+                    assert quantization_config.get("quant_method", None) in ConfigInit, \
+                        "Detect this model is not a low-bit model."
+                    quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config)
             else:
                 logger.info(
                     "quantization_config: {}".format(config.quantization_config)
                 )
@@ -556,11 +573,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     scale_dtype=quantization_config.scale_dtype,
                     compute_dtype=quantization_config.compute_dtype,
                     use_ggml=quantization_config.use_ggml,
-                    use_quant=(
-                        quantization_config.use_quant
-                        if hasattr(quantization_config, "use_quant")
-                        else False
-                    ),
+                    use_quant=True,
                     use_gptq=quantization_config.quant_method.value == "gptq"
                     or quantization_config.quant_method.value == "autoround",
                     use_awq=quantization_config.quant_method.value == "awq",