From 53fd1402b94d10159f4489fbb40b45d15ef95ff2 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Mon, 13 May 2024 10:55:01 +0800
Subject: [PATCH] update talkingbot pc notebook (#1521)

---
 .../pc/build_talkingbot_on_pc.ipynb           | 84 +++++--------------
 1 file changed, 20 insertions(+), 64 deletions(-)

diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb
index 4336028c99d..71ddbac5f8d 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb
@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "> make sure you are running in a conda environment with Python 3.10\n",
+    "> make sure you are running in a conda environment\n",
     "\n",
     "[Intel® Extension for Transformers Neural Chat](https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/neural_chat) provides a lot of plugins to meet different users' scenarios. In this notebook we will show you how to create a TalkingBot on your local laptop with **Intel CPU** (no GPU needed).\n",
     "\n",
@@ -36,7 +36,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav"
+    "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav\n",
+    "!curl -O https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt"
    ]
   },
   {
@@ -88,9 +89,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Directly load given int4 model to do inference\n",
+    "### Optimize the model with quantization to do inference\n",
     "\n",
-    "Here for quick demo, we just use a given int4 model to generate text. If you want to convert your int4 model manually, please refer to next cell."
+    "This conversion generates a quantized LLM under `runtime_outs/`. On subsequent runs, the quantized model is loaded directly without re-quantization."
    ]
   },
   {
@@ -99,53 +100,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Get the quantized model\n",
     "from transformers import AutoTokenizer, TextStreamer\n",
     "from neural_speed import Model\n",
-    "\n",
-    "prompt = in_text\n",
-    "\n",
-    "model_name = \"meta-llama/Llama-2-7b-chat-hf\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
-    "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
-    "\n",
-    "model = Model()\n",
-    "model.tokenizer = tokenizer\n",
-    "model.init_from_bin(model_name=\"llama\", model_path=\"ne_llama_q.bin\", max_new_tokens=43, do_sample=False)\n",
-    "\n",
-    "streamer = TextStreamer(tokenizer)\n",
-    "outputs = model.generate(inputs, streamer=streamer)\n",
-    "output_text = tokenizer.batch_decocde(outputs)[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Convert int4 model to do inference\n",
-    "\n",
-    "This conversion will generate a int4 model `ne_llama_q.bin` that the above cell needs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "from intel_extension_for_transformers.transformers import RtnConfig\n",
-    "from transformers import AutoTokenizer, TextStreamer\n",
     "from intel_extension_for_transformers.transformers import AutoModel\n",
     "\n",
-    "model_name = \"meta-llama/Llama-2-7b-chat-hf\"    # Please first download the model and replace this model_name with the local path\n",
-    "woq_config = RtnConfig(bits=4, compute_type=\"int8\", weight_dtype=\"int4\")\n",
-    "prompt = \"Who is andy grove\"\n",
-    "\n",
+    "model_name = \"meta-llama/Llama-2-7b-chat-hf\"    # You can first download the model and replace this model_name with the local path\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+    "prompt = in_text\n",
     "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
+    "\n",
+    "\n",
+    "woq_config = RtnConfig(bits=8, compute_dtype=\"int8\", weight_dtype=\"int8\")\n",
     "streamer = TextStreamer(tokenizer)\n",
     "model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)\n",
-    "\n",
-    "outputs = model.generate(inputs, streamer=streamer, max_new_tokens=20)\n",
+    "outputs = model.generate(inputs, streamer=streamer, max_new_tokens=100)   # Change the max_new_tokens here to control the output length\n",
     "output_text = tokenizer.batch_decode(outputs)[0]"
    ]
   },
@@ -153,25 +123,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Text To Speech"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts = TextToSpeech()"
+    "## Text To Speech\n",
+    "\n",
+    "This step converts the output text to audio and saves the result as `output.wav`."
    ]
   },
   {
@@ -180,7 +134,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "result_path = tts.text2speech(output_text, \"output.wav\")"
+    "from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech\n",
+    "tts = TextToSpeech()\n",
+    "result_path = tts.text2speech(output_text[:290], \"output.wav\")  # Truncate the input text as needed"
    ]
   },
   {
@@ -190,7 +146,7 @@
    "outputs": [],
    "source": [
     "from IPython.display import Audio\n",
-    "Audio(result_path, rate=16000)"
+    "Audio(r\"./output.wav\", rate=16000)"
    ]
   }
  ],
@@ -210,7 +166,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,