microsoft · ajindal1 · Jan 27, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 27, 2025
diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py
@@ -34,10 +34,14 @@ def main(args):
         if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
             raise ValueError("Chat template must have exactly one pair of curly braces with input word in it, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
     else:
-        if model.type.startswith("phi"):
+        if model.type.startswith("phi2") or model.type.startswith("phi3"):
             args.chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
-        elif model.type.startswith("llama"):
-            args.chat_template = '<|start_header_id|>user<|end_header_id|>{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
+        elif model.type.startswith("phi4"):
+            args.chat_template = '<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>'
+        elif model.type.startswith("llama3"):
+            args.chat_template = '<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
+        elif model.type.startswith("llama2"):
+            args.chat_template = '<s>{input}'
         else:
             raise ValueError(f"Chat Template for model type {model.type} is not known. Please provide chat template using --chat_template")
 
@@ -51,7 +55,17 @@ def main(args):
     if args.verbose: print("Generator created")
 
     # Set system prompt
-    system_prompt = args.system_prompt
+    if model.type.startswith('phi2') or model.type.startswith('phi3'):
+        system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
+    elif model.type.startswith('phi4'):
+        system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
+    elif model.type.startswith("llama3"):
+        system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
+    elif model.type.startswith("llama2"):
+        system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+    else:
+        system_prompt = args.system_prompt
+
     system_tokens = tokenizer.encode(system_prompt)
     generator.append_tokens(system_tokens)
     system_prompt_length = len(system_tokens)
@@ -103,7 +117,7 @@ def main(args):
             run_time = time.time() - first_token_timestamp
             print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps")
 
-        # Rewind the generator to the system prompt
+        # Rewind the generator to the end of the system prompt; this discards the conversation so far, so the model keeps no memory of earlier turns.
         if args.rewind:
             generator.rewind_to(system_prompt_length)
 

diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
@@ -30,13 +30,34 @@ def main(args):
     if args.chat_template:
         if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
             raise ValueError("Chat template must have exactly one pair of curly braces with input word in it, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
+    else:
+        if model.type.startswith("phi2") or model.type.startswith("phi3"):
+            args.chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
+        elif model.type.startswith("phi4"):
+            args.chat_template = '<|im_start|>user<|im_sep|>\n{input}<|im_end|>\n<|im_start|>assistant<|im_sep|>'
+        elif model.type.startswith("llama3"):
+            args.chat_template = '<|start_header_id|>user<|end_header_id|>\n{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'
+        elif model.type.startswith("llama2"):
+            args.chat_template = '<s>{input}'
+        else:
+            raise ValueError(f"Chat Template for model type {model.type} is not known. Please provide chat template using --chat_template")
 
     params = og.GeneratorParams(model)
     params.set_search_options(**search_options)
     generator = og.Generator(model, params)
 
     # Set system prompt
-    system_prompt = args.system_prompt
+    if model.type.startswith('phi2') or model.type.startswith('phi3'):
+        system_prompt = f"<|system|>\n{args.system_prompt}<|end|>"
+    elif model.type.startswith('phi4'):
+        system_prompt = f"<|im_start|>system<|im_sep|>\n{args.system_prompt}<|im_end|>"
+    elif model.type.startswith("llama3"):
+        system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{args.system_prompt}<|eot_id|>"
+    elif model.type.startswith("llama2"):
+        system_prompt = f"<s>[INST] <<SYS>>\n{args.system_prompt}\n<</SYS>>"
+    else:
+        system_prompt = args.system_prompt
+
     system_tokens = tokenizer.encode(system_prompt)
     generator.append_tokens(system_tokens)
     system_prompt_length = len(system_tokens)
@@ -89,7 +110,7 @@ def main(args):
             run_time = time.time() - first_token_timestamp
             print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps")
 
-        # Rewind the generator to the system prompt
+        # Rewind the generator to the end of the system prompt; this discards the conversation so far, so the model keeps no memory of earlier turns.
         if args.rewind:
             generator.rewind_to(system_prompt_length)
 
@@ -108,6 +129,6 @@ def main(args):
     parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false')
     parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}')
     parser.add_argument('-s', '--system_prompt', type=str, default='You are a helpful AI assistant.', help='System prompt to use for the prompt.')
-    parser.add_argument('-r', '--rewind', action='store_true', default=False, help='Rewind to the system prompt after each generation. Defaults to false')
+    parser.add_argument('-r', '--rewind', action='store_true', default=True, help='Rewind to the system prompt after each generation. Defaults to true')
     args = parser.parse_args()
     main(args)