This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

fix for token latency (#1520)
louie-tsai authored May 13, 2024
1 parent c169bec commit ae7a4ae
Showing 1 changed file with 45 additions and 34 deletions.
@@ -57,16 +57,6 @@
"Library imports"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b63fa0ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "from time import time"
- ]
- },
{
"cell_type": "markdown",
"id": "c3c6d1e2-61f1-4ee4-98c7-4f6202e7f2ea",
@@ -93,16 +83,19 @@
"outputs": [],
"source": [
"# Build chatbot\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig, GenerationConfig\n",
"config = PipelineConfig(model_name_or_path='Intel/neural-chat-7b-v3-1')\n",
"chatbot = build_chatbot(config)\n",
"\n",
"# Perform inference/generate a response\n",
"start = time()\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"end = time()\n",
"print(response)\n",
"print(\"%.5f seconds\" %(end-start))"
"\n",
"gen_config = GenerationConfig(return_stats=True, format_version=\"v2\")\n",
"results, _ = chatbot.predict_stream(\"Tell me about Intel Xeon Scalable Processors.\", config=gen_config)\n",
"stream_text = \"\"\n",
"for text in results:\n",
" stream_text += text\n",
"print(stream_text)\n",
"\n"
]
},
{
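
Note on the hunk above: the manual `time()`-based measurement (and the now-dead `from time import time` cell removed earlier) is replaced by `predict_stream` with `GenerationConfig(return_stats=True, format_version="v2")`, so the pipeline reports its own token latency. A minimal sketch of the new pattern follows; the `perf_counter` cross-check is an illustration added here, not part of the commit:

```python
# Sketch of the streaming pattern this commit introduces, assuming the same
# model as the notebook. predict_stream returns a generator plus a second
# value the notebook discards.
from time import perf_counter

from intel_extension_for_transformers.neural_chat import (
    GenerationConfig,
    PipelineConfig,
    build_chatbot,
)

chatbot = build_chatbot(PipelineConfig(model_name_or_path="Intel/neural-chat-7b-v3-1"))
gen_config = GenerationConfig(return_stats=True, format_version="v2")
results, _ = chatbot.predict_stream(
    "Tell me about Intel Xeon Scalable Processors.", config=gen_config
)

start = perf_counter()
first_chunk_at = None
chunks = []
for text in results:
    if first_chunk_at is None:
        first_chunk_at = perf_counter()  # rough wall-clock time to first streamed chunk
    chunks.append(text)

print("".join(chunks))
if first_chunk_at is not None:
    print("first chunk after %.5f seconds" % (first_chunk_at - start))
```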
@@ -124,18 +117,20 @@
"outputs": [],
"source": [
"# Build chatbot in BF16\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig, GenerationConfig\n",
"from intel_extension_for_transformers.transformers import MixedPrecisionConfig\n",
"mix_config = MixedPrecisionConfig(dtype=\"bfloat16\")\n",
"config = PipelineConfig(model_name_or_path='Intel/neural-chat-7b-v3-1',\n",
" optimization_config=MixedPrecisionConfig(dtype='bfloat16'))\n",
" optimization_config=mix_config)\n",
"chatbot = build_chatbot(config)\n",
"\n",
"# Perform inference/generate a response\n",
"start = time()\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"end = time()\n",
"print(response)\n",
"print(\"%.5f seconds\" %(end-start))"
"gen_config = GenerationConfig(return_stats=True, format_version=\"v2\")\n",
"results, _ = chatbot.predict_stream(\"Tell me about Intel Xeon Scalable Processors.\", config=gen_config)\n",
"stream_text = \"\"\n",
"for text in results:\n",
" stream_text += text\n",
"print(stream_text)\n"
]
},
{
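
The BF16 cell makes the same switch to streaming; the only other change is hoisting the `MixedPrecisionConfig` into a named `mix_config`. A rough throughput sketch under the same assumptions, counting streamed chunks rather than true tokenizer tokens:

```python
# Sketch only: estimate chunks-per-second for the BF16 pipeline shown above.
from time import perf_counter

from intel_extension_for_transformers.neural_chat import (
    GenerationConfig,
    PipelineConfig,
    build_chatbot,
)
from intel_extension_for_transformers.transformers import MixedPrecisionConfig

config = PipelineConfig(
    model_name_or_path="Intel/neural-chat-7b-v3-1",
    optimization_config=MixedPrecisionConfig(dtype="bfloat16"),
)
chatbot = build_chatbot(config)

results, _ = chatbot.predict_stream(
    "Tell me about Intel Xeon Scalable Processors.",
    config=GenerationConfig(return_stats=True, format_version="v2"),
)

start = perf_counter()
n_chunks = sum(1 for _ in results)  # drain the stream, counting chunks
elapsed = perf_counter() - start
print("%d chunks in %.2f s (%.2f chunks/s)" % (n_chunks, elapsed, n_chunks / elapsed))
```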
@@ -155,16 +150,20 @@
"source": [
"# Build chatbot with INT4 weight-only quantization, computations in AMX INT8\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
"from intel_extension_for_transformers.transformers import RtnConfig\n",
"from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig\n",
"from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig\n",
"config = PipelineConfig(model_name_or_path=\"Intel/neural-chat-7b-v3-1\",\n",
" optimization_config=RtnConfig(bits=4, compute_dtype=\"int8\", weight_dtype=\"int4_fullrange\"), \n",
"config = PipelineConfig(model_name_or_path='Intel/neural-chat-7b-v3-1',\n",
" optimization_config=WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4_fullrange\"), \n",
" loading_config=LoadingModelConfig(use_neural_speed=False))\n",
"chatbot = build_chatbot(config)\n",
"\n",
"# Perform inference/generate a response\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
"gen_config = GenerationConfig(return_stats=True, format_version=\"v2\")\n",
"results, _ = chatbot.predict_stream(\"Tell me about Intel Xeon Scalable Processors.\", config=gen_config)\n",
"stream_text = \"\"\n",
"for text in results:\n",
" stream_text += text\n",
"print(stream_text)"
]
},
{
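
This cell also swaps `RtnConfig(bits=4, ...)` back to `WeightOnlyQuantConfig` on the INT4 weight-only path, and it reuses the `GenerationConfig` imported two cells earlier. Since all three pipelines now share one streaming pattern, a small comparison harness is easy to sketch; it assumes enough memory to build each 7B pipeline in turn and uses only names already shown in this diff:

```python
# Sketch: run the same streamed query through the three configs in this diff.
from intel_extension_for_transformers.neural_chat import (
    GenerationConfig,
    PipelineConfig,
    build_chatbot,
)
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.transformers import (
    MixedPrecisionConfig,
    WeightOnlyQuantConfig,
)

MODEL = "Intel/neural-chat-7b-v3-1"
variants = {
    "default": PipelineConfig(model_name_or_path=MODEL),
    "bf16": PipelineConfig(
        model_name_or_path=MODEL,
        optimization_config=MixedPrecisionConfig(dtype="bfloat16"),
    ),
    "int4-woq": PipelineConfig(
        model_name_or_path=MODEL,
        optimization_config=WeightOnlyQuantConfig(
            compute_dtype="int8", weight_dtype="int4_fullrange"
        ),
        loading_config=LoadingModelConfig(use_neural_speed=False),
    ),
}

gen_config = GenerationConfig(return_stats=True, format_version="v2")
for name, cfg in variants.items():
    chatbot = build_chatbot(cfg)  # builds a fresh 7B pipeline per variant
    results, _ = chatbot.predict_stream(
        "Tell me about Intel Xeon Scalable Processors.", config=gen_config
    )
    print(name, "->", "".join(results)[:120])
```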
@@ -184,8 +183,7 @@
"outputs": [],
"source": [
"# OPTIONAL: log in to HuggingFace to access Llama2\n",
"export HUGGINGFACE_TOKEN=None #@TODO: enter in HF token here\n",
"!huggingface-cli login --token $HUGGINGFACE_TOKEN --add-to-git-credential"
"#!huggingface-cli login --token <@TODO: enter in HF token here> --add-to-git-credential"
]
},
{
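
The login cell becomes a commented-out CLI call so no token is stored in the notebook. A programmatic alternative via huggingface_hub is sketched below; it is not part of this commit, and it reads the token from an environment variable to keep it out of the file:

```python
# OPTIONAL sketch: programmatic Hugging Face login instead of the CLI.
import os

from huggingface_hub import login

token = os.environ.get("HUGGINGFACE_TOKEN")  # set outside the notebook
if token:
    login(token=token, add_to_git_credential=True)
```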
@@ -310,7 +308,9 @@
"cell_type": "code",
"execution_count": null,
"id": "103340b5-1b57-486c-9fba-3060de63ab42",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Build chatbot with AST and TTS plugin\n",
@@ -338,6 +338,17 @@
"Open the audio files using your own audio player to hear the query and response. "
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "25157d11-15c5-4d56-a4f4-5294db04a332",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import IPython\n",
+ "IPython.display.Audio(\"response.wav\")"
+ ]
+ },
{
"cell_type": "markdown",
"id": "e93a0779-8c26-4c62-a514-64bf9c58896f",
@@ -389,7 +400,7 @@
" TextGenerationFinetuningConfig,\n",
")\n",
"from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n",
"model_args = ModelArguments(model_name_or_path=\"Intel/neural-chat-7b-v3-1\")\n",
"model_args = ModelArguments(model_name_or_path='Intel/neural-chat-7b-v3-1')\n",
"data_args = DataArguments(train_file=\"alpaca_data.json\")\n",
"training_args = TrainingArguments(\n",
" output_dir='./finetuned_model_path',\n",
@@ -434,7 +445,7 @@
"from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
"from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig\n",
"\n",
"config = PipelineConfig(model_name_or_path=\"Intel/neural-chat-7b-v3-1\",\n",
"config = PipelineConfig(model_name_or_path='Intel/neural-chat-7b-v3-1',\n",
" loading_config=LoadingModelConfig(peft_path=\"./finetuned_model_path\"))\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
@@ -467,7 +478,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.10.13"
}
},
"nbformat": 4,
