diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb index 4336028c99d..71ddbac5f8d 100644 --- a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb +++ b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> make sure you are running in a conda environment with Python 3.10\n", + "> make sure you are running in a conda environment\n", "\n", "[Intel® Extension for Transformers Neural Chat](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat) provides a lot of plugins to meet different users' scenarios. In this notebook we will show you how to create a TalkingBot on your local laptop with **Intel CPU** (no GPU needed).\n", "\n", @@ -36,7 +36,8 @@ "metadata": {}, "outputs": [], "source": [ - "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav" + "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav\n", + "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt" ] }, { @@ -88,9 +89,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Directly load given int4 model to do inference\n", + "### Optimize the model with quantization to do inference\n", "\n", - "Here for quick demo, we just use a given int4 model to generate text. If you want to convert your int4 model manually, please refer to next cell." 
+ "This conversion will generate a quantized LLM model under `runtime_outs/`. Next time it will load the model directly without re-quantization." ] }, { @@ -99,53 +100,22 @@ "metadata": {}, "outputs": [], "source": [ + "# Get the quantized model\n", "from transformers import AutoTokenizer, TextStreamer\n", "from neural_speed import Model\n", - "\n", - "prompt = in_text\n", - "\n", - "model_name = \"meta-llama/Llama-2-7b-chat-hf\"\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", - "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n", - "\n", - "model = Model()\n", - "model.tokenizer = tokenizer\n", - "model.init_from_bin(model_name=\"llama\", model_path=\"ne_llama_q.bin\", max_new_tokens=43, do_sample=False)\n", - "\n", - "streamer = TextStreamer(tokenizer)\n", - "outputs = model.generate(inputs, streamer=streamer)\n", - "output_text = tokenizer.batch_decocde(outputs)[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convert int4 model to do inference\n", - "\n", - "This conversion will generate a int4 model `ne_llama_q.bin` that the above cell needs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "from intel_extension_for_transformers.transformers import RtnConfig\n", - "from transformers import AutoTokenizer, TextStreamer\n", "from intel_extension_for_transformers.transformers import AutoModel\n", "\n", - "model_name = \"meta-llama/Llama-2-7b-chat-hf\" # Please first download the model and replace this model_name with the local path\n", - "woq_config = RtnConfig(bits=4, compute_type=\"int8\", weight_dtype=\"int4\")\n", - "prompt = \"Who is andy grove\"\n", - "\n", + "model_name = \"meta-llama/Llama-2-7b-chat-hf\" # You can first download the model and replace this model_name with the local path\n", "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", + "prompt = in_text\n", "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n", + "\n", + "\n", + "woq_config = RtnConfig(bits=8, compute_dtype=\"int8\", weight_dtype=\"int8\")\n", "streamer = TextStreamer(tokenizer)\n", "model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)\n", - "\n", - "outputs = model.generate(inputs, streamer=streamer, max_new_tokens=20)\n", + "outputs = model.generate(inputs, streamer=streamer, max_new_tokens=100) # Change the max_new_tokens here to control the output length\n", "output_text = tokenizer.batch_decode(outputs)[0]" ] }, @@ -153,25 +123,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Text To Speech" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tts = TextToSpeech()" + "## Text To Speech\n", + "\n", + "This is to convert the output text to audio and saved the output as `output.wav`." 
] }, { @@ -180,7 +134,9 @@ "metadata": {}, "outputs": [], "source": [ - "result_path = tts.text2speech(output_text, \"output.wav\")" + "from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech\n", + "tts = TextToSpeech()\n", + "result_path = tts.text2speech(output_text[:290], \"output.wav\") # Truncate part of the input text as you needed" ] }, { @@ -190,7 +146,7 @@ "outputs": [], "source": [ "from IPython.display import Audio\n", - "Audio(result_path, rate=16000)" + "Audio(r\"./output.wav\", rate=16000)" ] } ], @@ -210,7 +166,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.9.0" } }, "nbformat": 4,