From 755ba0e6a3e3873f5c73c13723044ee0472a9f89 Mon Sep 17 00:00:00 2001
From: VincyZhang
Date: Tue, 4 Apr 2023 23:33:25 +0800
Subject: [PATCH] update Readme (#57)

---
 README.md                                        | 23 +++++++++++++++----
 conda_meta/meta.yaml                             |  2 +-
 docs/architecture.md                             |  2 +-
 .../deployment/neural_engine/gpt-j/README.md     |  8 +++----
 .../neural_engine/stable_diffusion/README.md     | 12 +++++-----
 intel_extension_for_transformers/version.py      |  2 +-
 6 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 9c14b193db4..a45b8c6a5de 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,27 @@
-# Intel® Extension for Transformers: Accelerating Transformer-based Models on Intel Platforms
-
-Intel® Extension for Transformers is an innovative toolkit to accelerate Transformer-based models on Intel platforms, in particular effective on 4th Intel Xeon Scalable processor Sapphire Rapids (codenamed [Sapphire Rapids](https://www.intel.com/content/www/us/en/products/docs/processors/xeon-accelerated/4th-gen-xeon-scalable-processors.html)). The toolkit provides the key features and examples as below:
+
+Intel® Extension for Transformers
+===========================
+
+An innovative toolkit to accelerate Transformer-based models on Intel platforms
+
+[Architecture](./docs/architecture.md)   |   [NeuralChat](./examples/optimization/pytorch/huggingface/language-modeling/chatbot)   |   [Examples](./docs/examples.md)   |   [Documentations](https://intel.github.io/intel-extension-for-transformers/latest/docs/Welcome.html)
+
 
-* Seamless user experience of model compressions on Transformers-based models by extending [Hugging Face transformers](https://github.com/huggingface/transformers) APIs and leveraging [Intel® Neural Compressor](https://github.com/intel/neural-compressor)
+---
+
+Intel® Extension for Transformers is an innovative toolkit to accelerate Transformer-based models on Intel platforms, particularly effective on the 4th Gen Intel® Xeon® Scalable processor (codenamed [Sapphire Rapids](https://www.intel.com/content/www/us/en/products/docs/processors/xeon-accelerated/4th-gen-xeon-scalable-processors.html)). The toolkit provides the key features and examples below:
+
+
+* Seamless user experience of model compressions on Transformer-based models by extending [Hugging Face transformers](https://github.com/huggingface/transformers) APIs and leveraging [Intel® Neural Compressor](https://github.com/intel/neural-compressor)
 
 * Advanced software optimizations and unique compression-aware runtime (released with NeurIPS 2022's paper [Fast Distilbert on CPUs](https://arxiv.org/abs/2211.07715) and [QuaLA-MiniLM: a Quantized Length Adaptive MiniLM](https://arxiv.org/abs/2210.17114), and NeurIPS 2021's paper [Prune Once for All: Sparse Pre-Trained Language Models](https://arxiv.org/abs/2111.05754))
 
-* Accelerated end-to-end Transformer-based applications such as [Stable Diffusion](./examples/optimization/pytorch/huggingface/textual_inversion), [GPT-J-6B](./examples/optimization/pytorch/huggingface/language-modeling/inference/README.md#GPT-J), [BLOOM-176B](./examples/optimization/pytorch/huggingface/language-modeling/inference/README.md#BLOOM-176B), [T5](https://github.com/intel/intel-extension-for-transformers/blob/main/examples/optimization/pytorch/huggingface/summarization/quantization), and [SetFit](./docs/tutorials/pytorch/text-classification/SetFit_model_compression_AGNews.ipynb) by leveraging Intel AI software such as [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch)
+* Optimized Transformer-based model packages such as [Stable Diffusion](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/deployment/neural_engine/stable_diffusion), [GPT-J-6B](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/deployment/neural_engine/gpt-j), [GPT-NEOX](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/optimization/pytorch/huggingface/language-modeling/quantization/inc#2-validated-model-list), [BLOOM-176B](./examples/optimization/pytorch/huggingface/language-modeling/inference/README.md#BLOOM-176B), [T5](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/optimization/pytorch/huggingface/summarization/quantization#2-validated-model-list), and [Flan-T5](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/optimization/pytorch/huggingface/summarization/quantization#2-validated-model-list), and end-to-end workflows such as [SetFit-based text classification](./docs/tutorials/pytorch/text-classification/SetFit_model_compression_AGNews.ipynb) and [document level sentiment analysis (DLSA)](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/E2E-solution/DLSA)
+
+* [NeuralChat](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/optimization/pytorch/huggingface/language-modeling/chatbot), a custom chatbot trained on Intel CPUs through parameter-efficient fine-tuning ([PEFT](https://github.com/huggingface/peft)) on domain knowledge
 
 ## Installation
@@ -64,9 +77,9 @@ output = model(**input).logits.argmax().item()
 Model Compression
+NeuralChat
 Neural Engine
 Kernel Libraries
-Examples
 MODEL COMPRESSION
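The hunk above carries `output = model(**input).logits.argmax().item()` as its context line from the README's quickstart. For reviewers reading the patch on its own, a minimal self-contained sketch of that style of call is shown below; the checkpoint name and the input sentence are illustrative assumptions, not values referenced by this PR.

```python
# Minimal sketch of the quickstart-style call referenced by the hunk context above.
# The checkpoint below is an illustrative stand-in, not one shipped by this patch.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize a sample sentence and run a forward pass.
input = tokenizer("Intel Extension for Transformers speeds up my models.", return_tensors="pt")

# Same pattern as the README quickstart: pick the highest-scoring class id.
output = model(**input).logits.argmax().item()
print(output)
```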
diff --git a/conda_meta/meta.yaml b/conda_meta/meta.yaml
index af7c0cdb713..2ea9f0b02ac 100644
--- a/conda_meta/meta.yaml
+++ b/conda_meta/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "1.0" %}
+{% set version = "1.0.0" %}
 {% set buildnumber = 0 %}
 package:
   name: intel_extension_for_transformers
diff --git a/docs/architecture.md b/docs/architecture.md
index 6bfa0e5c90f..0e5c72a7c02 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,4 +1,4 @@
 # Architecture of Intel® Extension for Transformers
-arch
+arch
\ No newline at end of file
diff --git a/examples/deployment/neural_engine/gpt-j/README.md b/examples/deployment/neural_engine/gpt-j/README.md
index ef9d28e36c1..d7e228c37e5 100644
--- a/examples/deployment/neural_engine/gpt-j/README.md
+++ b/examples/deployment/neural_engine/gpt-j/README.md
@@ -26,17 +26,17 @@ export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 ```
 ## Performance
-### SingleNode inference
-The fp32 model are in huggingface [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B), int8 model has been publiced on [Intel/gpt-j-6B-pytorch-int8-static](https://huggingface.co/Intel/gpt-j-6B-pytorch-int8-static).
-#### Generate IR
+The fp32 model is [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B), and the int8 model has been published at [Intel/gpt-j-6B-pytorch-int8-static](https://huggingface.co/Intel/gpt-j-6B-pytorch-int8-static).
+
+### Generate IR
 ```bash
 python gen_ir.py --model=EleutherAI/gpt-j-6B --dtype=bf16 --output_model='./ir' --pt_file='new.pt'  # dtype could be fp32 / int8 / bf16
 ```
 - When the input dtype is fp32 or bf16, the pt file will be saved automatically if it does not exist.
 - When the input dtype is int8, the pt file must already exist.
-#### Inference
+### Inference
 ```bash
 # supports a single socket and multiple sockets
 OMP_NUM_THREADS=<physical cores> numactl -m <node> -C <cpu list> python run_gptj.py --max-new-tokens 32 --ir_path <path to ir>
 ```
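For reviewers who want to try the renumbered GPT-J steps above, a concrete invocation of the inference command might look as follows; the core count, NUMA node, and IR path are assumptions for a hypothetical single-socket machine, not values stated in this README.

```bash
# Hypothetical single-socket example: 56 physical cores on NUMA node 0,
# with the IR generated into ./ir by the gen_ir.py step above. Adjust to your machine.
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_gptj.py --max-new-tokens 32 --ir_path ./ir
```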
diff --git a/examples/deployment/neural_engine/stable_diffusion/README.md b/examples/deployment/neural_engine/stable_diffusion/README.md
index 70ebb9536d6..748a01882ec 100644
--- a/examples/deployment/neural_engine/stable_diffusion/README.md
+++ b/examples/deployment/neural_engine/stable_diffusion/README.md
@@ -46,7 +46,7 @@ export WEIGHT_SHARING=1
 export INST_NUM=<inst num>
 ```
 # End-to-End Workflow
-## Prepare Models
+## 1. Prepare Models
 
 Stable Diffusion mainly includes three ONNX models: text_encoder, unet, and vae_decoder.
 
@@ -54,7 +54,7 @@ The pretrained model [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4)
 Here we take CompVis/stable-diffusion-v1-4 as an example.
 
-### Download Models
+### 1.1 Download Models
 Export FP32 ONNX models from the huggingface diffusers module with the following command:
 
 ```python
@@ -66,7 +66,7 @@ By setting --bf16 to export FP32 and BF16 models.
 python prepare_model.py --input_model=CompVis/stable-diffusion-v1-4 --output_path=./model --bf16
 ```
 
-### Compile Models
+### 1.2 Compile Models
 Export the three FP32 ONNX sub-models of Stable Diffusion to Neural Engine IRs.
 
 ```bash
@@ -105,7 +105,7 @@ python export_ir.py --onnx_model=./model/unet_bf16/model.onnx --pattern_config=unet_pattern.conf --output_path=./bf16_ir/unet/
 python export_ir.py --onnx_model=./model/vae_decoder_bf16/bf16-model.onnx --pattern_config=vae_decoder_pattern.conf --output_path=./bf16_ir/vae_decoder/
 ```
 
-## Performance
+## 2. Performance
 
 The Python API command is as follows:
 ```python
@@ -116,7 +116,7 @@ GLOG_minloglevel=2 python run_executor.py --ir_path=./fp32_ir --mode=performance
 GLOG_minloglevel=2 python run_executor.py --ir_path=./bf16_ir --mode=performance
 ```
 
-## Accuracy
+## 3. Accuracy
 The Frechet Inception Distance (FID) metric is used to evaluate accuracy. In this case we check the FID score between the PyTorch image and the Neural Engine image.
 Set --accuracy to check the FID score.
 
@@ -129,7 +129,7 @@ GLOG_minloglevel=2 python run_executor.py --ir_path=./fp32_ir --mode=accuracy
 GLOG_minloglevel=2 python run_executor.py --ir_path=./bf16_ir --mode=accuracy
 ```
 
-## Text-to-image
+## 4. Try Text to Image
 
 Try using one sentence to create a picture!
diff --git a/intel_extension_for_transformers/version.py b/intel_extension_for_transformers/version.py
index 7d9df720300..702a01df8a1 100644
--- a/intel_extension_for_transformers/version.py
+++ b/intel_extension_for_transformers/version.py
@@ -17,4 +17,4 @@
 """The neural engine version file."""
 
-__version__ = "1.0"
+__version__ = "1.0.0"
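As a recap of the stable_diffusion README changes above, the renumbered steps chain together roughly as sketched below; the text_encoder pattern-config name and its FP32 input path are assumptions made by analogy with the unet and vae_decoder lines visible in the hunks, not paths confirmed by this patch.

```bash
# 1. Prepare Models: export FP32 and BF16 ONNX models from the Hugging Face diffusers pipeline.
python prepare_model.py --input_model=CompVis/stable-diffusion-v1-4 --output_path=./model --bf16

# 2. Compile the three sub-models to Neural Engine IR.
#    The text_encoder line is assumed by analogy with the unet/vae_decoder lines above.
python export_ir.py --onnx_model=./model/text_encoder_fp32/model.onnx --pattern_config=text_encoder_pattern.conf --output_path=./bf16_ir/text_encoder/
python export_ir.py --onnx_model=./model/unet_bf16/model.onnx --pattern_config=unet_pattern.conf --output_path=./bf16_ir/unet/
python export_ir.py --onnx_model=./model/vae_decoder_bf16/bf16-model.onnx --pattern_config=vae_decoder_pattern.conf --output_path=./bf16_ir/vae_decoder/

# 3. Benchmark the BF16 IR, then check its FID score.
GLOG_minloglevel=2 python run_executor.py --ir_path=./bf16_ir --mode=performance
GLOG_minloglevel=2 python run_executor.py --ir_path=./bf16_ir --mode=accuracy
```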