Make model type backwards compatible #1212

Merged · 13 commits · Feb 1, 2025
.github/workflows/linux-cpu-x64-build.yml (1 addition, 4 deletions)

@@ -10,6 +10,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
   ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"

@@ -84,10 +85,6 @@ jobs:
         python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
         python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Verify Build Artifacts
         if: always()
         continue-on-error: true
.github/workflows/linux-cpu-x64-nightly-build.yml (1 addition, 4 deletions)

@@ -12,6 +12,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ort_dir: "onnxruntime-linux-x64-1.18.0"
   ort_zip: "onnxruntime-linux-x64-1.18.0.tgz"
   ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.18.0/onnxruntime-linux-x64-1.18.0.tgz"

@@ -55,10 +56,6 @@ jobs:
         python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
         python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Run the python tests
         run: |
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
.github/workflows/linux-gpu-x64-build.yml (1 addition, 4 deletions)

@@ -12,6 +12,7 @@ concurrency:
   cancel-in-progress: true

 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux
   ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json"

@@ -109,10 +110,6 @@ jobs:
         bash -c " \
           /usr/bin/cmake --build --preset linux_gcc_cuda_release"

-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
-
       - name: Install the onnxruntime-genai Python wheel and run python test
         run: |
           echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
.github/workflows/mac-cpu-arm64-build.yml (1 addition, 1 deletion)

@@ -10,6 +10,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"
 jobs:

@@ -86,7 +87,6 @@ jobs:
       - name: Run the python tests
         run: |
           source genai-macos-venv/bin/activate
-          export HF_TOKEN="12345"
           export ORTGENAI_LOG_ORT_LIB=1
           python3 -m pip install requests
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
.github/workflows/win-cpu-x64-build.yml (1 addition, 4 deletions)

@@ -11,6 +11,7 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   binaryDir: 'build/cpu/win-x64'
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
   ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime"

@@ -91,10 +92,6 @@ jobs:
         python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
         python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
-
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
.github/workflows/win-cuda-x64-build.yml (1 addition, 4 deletions)

@@ -12,6 +12,7 @@ concurrency:
   cancel-in-progress: true

 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   AZCOPY_AUTO_LOGIN_TYPE: MSI
   AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
   cuda_dir: "${{ github.workspace }}\\cuda_sdk"

@@ -80,10 +81,6 @@ jobs:
         python -m pip install -r test\python\cuda\ort\requirements.txt
         python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
-
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
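The six workflow edits above are all the same change: the per-job "Use Dummy HuggingFace Token" step, which wrote HF_TOKEN=12345 into GITHUB_ENV, is deleted, and the real token is instead injected once at the workflow level from the repository secret secrets.HF_TOKEN. Test code then only needs to read the variable from its environment. A minimal sketch of that consumer side (the helper name and the skip-on-missing behavior are illustrative assumptions, not code from this PR):

    import os

    def get_hf_token():
        # Hypothetical helper: the workflows above export HF_TOKEN at the
        # workflow level, so any test process inherits it via the environment.
        token = os.environ.get("HF_TOKEN")
        if not token:
            # e.g. a PR from a fork, where repository secrets are unavailable
            print("HF_TOKEN is not set; skipping tests that need gated models")
        return token

One consequence worth noting: on pull requests from forks, GitHub does not expose repository secrets, so secrets.HF_TOKEN resolves to an empty string and tests should treat the token as optional rather than assume the old dummy value.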
examples/python/model-chat.py (36 additions, 20 deletions)

@@ -29,24 +29,38 @@ def main(args):
         search_options['batch_size'] = 1

     if args.verbose: print(search_options)

+    # Get model type
+    model_type = None
+    if hasattr(model, "type"):
+        model_type = model.type
+    else:
+        import json, os
+
+        with open(os.path.join(args.model_path, "genai_config.json"), "r") as f:
+            genai_config = json.load(f)
+        model_type = genai_config["model"]["type"]
+
     # Set chat template
     if args.chat_template:
         if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
             raise ValueError("Chat template must have exactly one pair of curly braces with input word in it, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
     else:
-        if model.type.startswith("phi2") or model.type.startswith("phi3"):
+        if model_type.startswith("phi2") or model_type.startswith("phi3"):
             args.chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
-        elif model.type.startswith("phi4"):
+        elif model_type.startswith("phi4"):
             args.chat_template = '<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>'
-        elif model.type.startswith("llama3"):
+        elif model_type.startswith("llama3"):
             args.chat_template = '<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
-        elif model.type.startswith("llama2"):
+        elif model_type.startswith("llama2"):
             args.chat_template = '<s>{input}'
+        elif model_type.startswith("qwen2"):
+            args.chat_template = '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n'
         else:
-            raise ValueError(f"Chat Template for model type {model.type} is not known. Please provide chat template using --chat_template")
+            raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template")

     if args.verbose:
-        print("Model type is:", model.type)
+        print("Model type is:", model_type)
         print("Chat Template is:", args.chat_template)

     params = og.GeneratorParams(model)

@@ -55,16 +69,22 @@ def main(args):
     if args.verbose: print("Generator created")

     # Set system prompt
-    if model.type.startswith('phi2') or model.type.startswith('phi3'):
-        system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
-    elif model.type.startswith('phi4'):
-        system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
-    elif model.type.startswith("llama3"):
-        system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
-    elif model.type.startswith("llama2"):
-        system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+    if "<|" in args.system_prompt and "|>" in args.system_prompt:
+        # User-provided system template already has tags
+        system_prompt = args.system_prompt
     else:
-        system_prompt = args.system_prompt
+        if model_type.startswith('phi2') or model_type.startswith('phi3'):
+            system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
+        elif model_type.startswith('phi4'):
+            system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
+        elif model_type.startswith("llama3"):
+            system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
+        elif model_type.startswith("llama2"):
+            system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+        elif model_type.startswith("qwen2"):
+            system_prompt = f"<|im_start|>system\n{args.system_prompt}<|im_end|>\n"
+        else:
+            system_prompt = args.system_prompt

     system_tokens = tokenizer.encode(system_prompt)
     generator.append_tokens(system_tokens)

@@ -79,11 +99,7 @@

         if args.timings: started_timestamp = time.time()

-        # If there is a chat template, use it
-        prompt = text
-        if args.chat_template:
-            prompt = f'{args.chat_template.format(input=text)}'
-
+        prompt = f'{args.chat_template.format(input=text)}'
         input_tokens = tokenizer.encode(prompt)

         generator.append_tokens(input_tokens)
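The heart of the backwards-compatibility fix is in the first hunk above: newer onnxruntime-genai builds expose the model type directly as model.type, while older builds only record it in the model folder's genai_config.json, so the script probes with hasattr before falling back to the config file. The same logic, pulled out into a standalone helper (the function name is mine; the lookup logic is lifted from the diff):

    import json
    import os

    def get_model_type(model, model_path):
        # Newer onnxruntime-genai bindings expose the type directly on the
        # model object.
        if hasattr(model, "type"):
            return model.type
        # Older bindings do not, so fall back to the genai_config.json that
        # ships alongside every exported model.
        with open(os.path.join(model_path, "genai_config.json"), "r") as f:
            genai_config = json.load(f)
        return genai_config["model"]["type"]

Also note the small behavioral change in the last hunk: since a chat template is now always resolved, either user-supplied or picked by model type, the generation loop applies args.chat_template.format(input=text) unconditionally instead of guarding on if args.chat_template.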
examples/python/model-qa.py (35 additions, 19 deletions)

@@ -26,37 +26,57 @@ def main(args):
         search_options['batch_size'] = 1

     if args.verbose: print(search_options)

+    # Get model type
+    model_type = None
+    if hasattr(model, "type"):
+        model_type = model.type
+    else:
+        import json, os
+
+        with open(os.path.join(args.model_path, "genai_config.json"), "r") as f:
+            genai_config = json.load(f)
+        model_type = genai_config["model"]["type"]
+
     # Set chat template
     if args.chat_template:
         if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
             raise ValueError("Chat template must have exactly one pair of curly braces with input word in it, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
     else:
-        if model.type.startswith("phi2") or model.type.startswith("phi3"):
+        if model_type.startswith("phi2") or model_type.startswith("phi3"):
             args.chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
-        elif model.type.startswith("phi4"):
+        elif model_type.startswith("phi4"):
             args.chat_template = '<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>'
-        elif model.type.startswith("llama3"):
+        elif model_type.startswith("llama3"):
             args.chat_template = '<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
-        elif model.type.startswith("llama2"):
+        elif model_type.startswith("llama2"):
             args.chat_template = '<s>{input}'
+        elif model_type.startswith("qwen2"):
+            args.chat_template = '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n'
         else:
-            raise ValueError(f"Chat Template for model type {model.type} is not known. Please provide chat template using --chat_template")
+            raise ValueError(f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template")

     params = og.GeneratorParams(model)
     params.set_search_options(**search_options)
     generator = og.Generator(model, params)

     # Set system prompt
-    if model.type.startswith('phi2') or model.type.startswith('phi3'):
-        system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
-    elif model.type.startswith('phi4'):
-        system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
-    elif model.type.startswith("llama3"):
-        system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
-    elif model.type.startswith("llama2"):
-        system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+    if "<|" in args.system_prompt and "|>" in args.system_prompt:
+        # User-provided system template already has tags
+        system_prompt = args.system_prompt
     else:
-        system_prompt = args.system_prompt
+        if model_type.startswith('phi2') or model_type.startswith('phi3'):
+            system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
+        elif model_type.startswith('phi4'):
+            system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
+        elif model_type.startswith("llama3"):
+            system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
+        elif model_type.startswith("llama2"):
+            system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+        elif model_type.startswith("qwen2"):
+            system_prompt = f"<|im_start|>system\n{args.system_prompt}<|im_end|>\n"
+        else:
+            system_prompt = args.system_prompt

     system_tokens = tokenizer.encode(system_prompt)
     generator.append_tokens(system_tokens)

@@ -71,11 +91,7 @@

         if args.timings: started_timestamp = time.time()

-        # If there is a chat template, use it
-        prompt = text
-        if args.chat_template:
-            prompt = f'{args.chat_template.format(input=text)}'
-
+        prompt = f'{args.chat_template.format(input=text)}'
         input_tokens = tokenizer.encode(prompt)

         generator.append_tokens(input_tokens)
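model-qa.py receives the same three changes as model-chat.py: the hasattr/genai_config.json fallback for the model type, the qwen2 chat and system templates, and a heuristic that leaves the system prompt untouched when the user has already wrapped it in special tokens. That heuristic plus the per-model templates could be condensed into a table-driven helper; a sketch (the dict-based restructuring is mine, while the tag check and template strings come from the diff):

    def build_system_prompt(model_type, system_prompt):
        # If the prompt already contains special tokens of the form <|...|>,
        # assume the user supplied a fully formatted system message.
        if "<|" in system_prompt and "|>" in system_prompt:
            return system_prompt
        templates = {
            "phi2": "<|system|>\n{p}<|end|>",
            "phi3": "<|system|>\n{p}<|end|>",
            "phi4": "<|im_start|>system<|im_sep|>\n{p}<|im_end|>",
            "llama3": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{p}<|eot_id|>",
            "llama2": "<s>[INST] <<SYS>>\n{p}\n<</SYS>>",
            "qwen2": "<|im_start|>system\n{p}<|im_end|>\n",
        }
        for prefix, template in templates.items():
            if model_type.startswith(prefix):
                return template.format(p=system_prompt)
        return system_prompt  # unknown model type: pass the prompt through

    # Example: build_system_prompt("qwen2", "You are a helpful assistant.")
    # -> '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n'

Note that the tag heuristic is deliberately loose: any "<|" and "|>" anywhere in the prompt suppresses templating, so a prompt that merely mentions those character sequences is also passed through verbatim.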