
Commit

Merge branch 'main' into aciddelgado/fix_top_k
aciddelgado committed Jan 21, 2025
2 parents 601c261 + 471e715 commit e8ad028
Showing 6 changed files with 17 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ See documentation at https://onnxruntime.ai/docs/genai.

| Support matrix | Supported now | Under development | On the roadmap |
| -------------- | ------------- | ----------------- | -------------- |
-| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> | Whisper | Stable diffusion |
+| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> AMD OLMo | Whisper | Stable diffusion |
| API | Python <br/> C# <br/> C/C++ <br/> Java ^ | Objective-C | |
| Platform | Linux <br/> Windows <br/> Mac ^ <br/> Android ^ | | iOS |
| Architecture | x86 <br/> x64 <br/> Arm64 ~ | | |
2 changes: 1 addition & 1 deletion src/models/model.cpp
@@ -590,7 +590,7 @@ std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, con
}

std::shared_ptr<Model> CreateModel(OrtEnv& ort_env, std::unique_ptr<Config> config) {
-std::set<std::string> llm_types = {"chatglm", "decoder", "gemma", "gemma2", "granite", "llama", "mistral", "nemotron", "phi", "phimoe", "phi3", "phi3small", "qwen2"};
+std::set<std::string> llm_types = {"chatglm", "decoder", "gemma", "gemma2", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2"};
if (config->model.type == "gpt2")
return std::make_shared<Gpt_Model>(std::move(config), ort_env);
if (llm_types.find(config->model.type) != llm_types.end())
2 changes: 1 addition & 1 deletion src/ort_genai_c.cpp
@@ -365,7 +365,7 @@ OgaResult* OGA_API_CALL OgaGenerator_GetOutput(const OgaGenerator* oga_generator
throw std::runtime_error("Unexpected error. Trying to access DML memory but the project is not compiled with DML.");
#endif
} else {
-throw std::runtime_error("Unsupported device type: " + static_cast<int>(device_type));
+throw std::runtime_error("Unsupported device type: " + std::to_string(static_cast<int>(device_type)));
}

auto tensor = std::make_shared<Generators::Tensor>(std::move(ortvalue_clone));
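
For context on the ort_genai_c.cpp change above: in C++, adding an integer directly to a string literal performs pointer arithmetic on the underlying `const char*` rather than concatenation, so the old code produced a truncated (or out-of-bounds) error message; converting the value with `std::to_string` first makes the `+` a proper string concatenation.
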
1 change: 1 addition & 0 deletions src/python/py/models/README.md
@@ -39,6 +39,7 @@ The tool currently supports the following model architectures.
- Nemotron
- Phi
- Qwen
+- AMD OLMo

It is intended for supporting the latest, popular state-of-the-art models.

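
As a usage illustration for the architecture list above, a model listed here is exported through the `builder.py` script changed below. The following is a hypothetical sketch only: the module path `onnxruntime_genai.models.builder`, the trailing `cache_dir` parameter, the empty `input_path` convention, and the chosen `precision`/`execution_provider` values are assumptions inferred from the call sites visible in this diff, not confirmed by it.

```python
# Hypothetical sketch: exporting the newly supported AMD OLMo checkpoint with the
# model builder. The import path and the cache_dir parameter are assumptions; only
# the leading parameters of create_model are visible in this diff.
from onnxruntime_genai.models.builder import create_model

create_model(
    model_name="amd/AMD-OLMo-1B-SFT-DPO",  # Hugging Face ID added to the test paths below
    input_path="",                          # assumed: empty means "download from the Hub"
    output_dir="./amd-olmo-onnx",
    precision="int4",
    execution_provider="cpu",
    cache_dir="./hf_cache",
)
```
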
12 changes: 11 additions & 1 deletion src/python/py/models/builder.py
@@ -330,7 +330,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):

genai_config = {
"model": {
"bos_token_id": config.bos_token_id if hasattr(config, "bos_token_id") else 1, # config.bos_token_id not present in ChatGLM model configs.
"bos_token_id": config.bos_token_id if hasattr(config, "bos_token_id") and config.bos_token_id != None else 1, # config.bos_token_id not present in ChatGLM model configs.
"context_length": self.context_length,
"decoder": {
"session_options" : {
@@ -3068,6 +3068,14 @@ def make_layer(self, layer_id, layer)
layer.self_attn = layer.self_attn if hasattr(layer, 'self_attn') else layer.self_attention
super().make_layer(layer_id, layer)

+class OLMoModel(Model):
+def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
+super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
+
+def make_layernorm(self, layer_id, layernorm, skip, simple, location):
+layernorm.weight = torch.ones(self.hidden_size)
+layernorm.bias = torch.zeros(self.hidden_size)
+super().make_layernorm(layer_id, layernorm, skip, simple, location)

class GraniteModel(MistralModel):
def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
@@ -3200,6 +3208,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
onnx_model = MistralModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "NemotronForCausalLM":
onnx_model = NemotronModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "OlmoForCausalLM":
onnx_model = OLMoModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "PhiForCausalLM":
onnx_model = PhiModel(config, io_dtype, precision, execution_provider, cache_dir, extra_options)
elif config.architectures[0] == "Phi3ForCausalLM" and config.max_position_embeddings == config.original_max_position_embeddings:
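
On the new `OLMoModel` class above: OLMo-family checkpoints use a non-parametric LayerNorm, so their layer-norm modules carry no learned scale or shift for the builder's generic `make_layernorm` path to serialize; the override fabricates an identity scale (ones) and a zero shift, which leaves the normalization numerically unchanged. A small, illustrative PyTorch check of that equivalence (not part of the commit):

```python
# Illustrative check: LayerNorm with weight = ones and bias = zeros matches a
# non-parametric LayerNorm, which is what the OLMoModel override relies on.
import torch

hidden_size = 16
x = torch.randn(2, hidden_size)

non_parametric = torch.nn.LayerNorm(hidden_size, elementwise_affine=False)

parametric = torch.nn.LayerNorm(hidden_size)
with torch.no_grad():
    parametric.weight.copy_(torch.ones(hidden_size))   # what the override supplies
    parametric.bias.copy_(torch.zeros(hidden_size))

assert torch.allclose(non_parametric(x), parametric(x))
print("outputs match")
```
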
2 changes: 2 additions & 0 deletions test/python/_test_utils.py
@@ -55,6 +55,8 @@ def run_subprocess(
def get_model_paths():
hf_paths = {
"phi-2": "microsoft/phi-2",
"olmo": "amd/AMD-OLMo-1B-SFT-DPO",
"qwen": "Qwen/Qwen2.5-0.5B",
"phi-3.5": "microsoft/Phi-3.5-mini-instruct",
# "llama-3.2": "meta-llama/Llama-3.2-1B-instruct",
"granite-3.0": "ibm-granite/granite-3.0-2b-instruct",
