Add DML tests #1219

Open
wants to merge 6 commits into main
7 changes: 6 additions & 1 deletion .github/workflows/win-directml-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
@@ -80,7 +81,7 @@ jobs:

- name: Configure CMake
run: |
cmake --preset windows_x64_directml_release -DTEST_PHI2=False
cmake --preset windows_x64_directml_release -DTEST_PHI2=True

- name: Build with CMake
run: |
@@ -93,6 +94,10 @@ jobs:
python -m pip install -r test\python\directml\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e

- name: Verify Build Artifacts
if: always()
continue-on-error: true
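The HF_TOKEN secret added to the workflow env is presumably what the new e2e step uses to download the Phi-2 model from Hugging Face. A minimal sketch of how a download step typically picks it up, assuming huggingface_hub is used (the repo's actual download path may differ):

import os
from huggingface_hub import snapshot_download  # assumed dependency, not confirmed by this diff

# The workflow exports HF_TOKEN, so it is visible to the test process as an env var.
token = os.environ.get("HF_TOKEN")
local_dir = snapshot_download("microsoft/phi-2", token=token)  # model id chosen for illustration
print(local_dir)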
73 changes: 70 additions & 3 deletions test/c_api_tests.cpp
@@ -14,6 +14,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -148,6 +150,7 @@ TEST(CAPITests, MaxLength) {
generator->AppendTokens(input_ids_0.data(), input_ids_0.size());
EXPECT_THROW(generator->AppendTokens(input_ids_1.data(), input_ids_1.size()), std::runtime_error);

#if !USE_DML
// Batch size 3 case
std::vector<int32_t> input_ids_2{1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
0, 0, 0, 52, 104, 52, 53, 54, 55, 56,
@@ -158,10 +161,12 @@

generator = OgaGenerator::Create(*model, *params);
EXPECT_THROW(generator->AppendTokens(input_ids_2.data(), input_ids_2.size()), std::runtime_error);
#endif
}

// DML doesn't support batch_size > 1
TEST(CAPITests, EndToEndPhiBatch) {
#if TEST_PHI2
#if TEST_PHI2 && !USE_DML
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

@@ -191,6 +196,65 @@ TEST(CAPITests, EndToEndPhiBatch) {
auto out_string = tokenizer->Decode(generator->GetSequenceData(i), generator->GetSequenceCount(i));
std::cout << "Decoded string:" << out_string << std::endl;
}

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 50256, 50256, 50256, 50256, 50256, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278, 6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417, 6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974,
49, 1381, 389, 7427, 17252, 0, 50256, 50256, 50256, 50256, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256};

for (size_t i = 0; i < 3; i++) {
const auto sequence_length = generator->GetSequenceCount(i);
const auto* sequence_data = generator->GetSequenceData(i);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[i * 40];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
}
#endif
}

TEST(CAPITests, EndToEndPhi) {
#if TEST_PHI2
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

const char* input_strings[] = {
"This is a test."
};

auto input_sequences = OgaSequences::Create();
for (auto& string : input_strings)
tokenizer->Encode(string, *input_sequences);

auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 40);

auto generator = OgaGenerator::Create(*model, *params);
generator->AppendTokenSequences(*input_sequences);

while (!generator->IsDone()) {
generator->GenerateNextToken();
}

// Decode the generated sequence
auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0));
std::cout << "Decoded string:" << out_string << std::endl;

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278,
6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417,
6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974, 82, 1039, 889, 263, 3684};

const auto sequence_length = generator->GetSequenceCount(0);
const auto* sequence_data = generator->GetSequenceData(0);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[0];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
#endif
}

@@ -443,7 +507,8 @@ TEST(CAPITests, SetTerminate) {
#endif
}

#if TEST_PHI2
// DML doesn't support batch_size > 1
#if TEST_PHI2 && !USE_DML

struct Phi2Test {
Phi2Test() {
@@ -521,12 +586,14 @@ TEST(CAPITests, TopKTopPCAPI) {
test.Run();
}

#endif // TEST_PHI2
#endif // TEST_PHI2 && !USE_DML

#if TEST_PHI2
TEST(CAPITests, AdaptersTest) {
#ifdef USE_CUDA
using OutputType = Ort::Float16_t;
#elif defined(USE_DML)
using OutputType = Ort::Float16_t;
#else
using OutputType = float;
#endif
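The new EndToEndPhi test (and the batch variant above) compares generated token IDs against hard-coded golden sequences with memcmp. A minimal sketch of how such golden IDs could be regenerated through the Python API if the model or defaults change (hypothetical helper, not part of this PR; the local model path is an assumption):

import onnxruntime_genai as og

# Assumed local layout; the C++ tests resolve PHI2_PATH to MODEL_PATH "phi-2/int4/<ep>".
model = og.Model("test/test_models/phi-2/int4/cpu")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=40, batch_size=1)  # mirrors the C++ test settings

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("This is a test."))
while not generator.is_done():
    generator.generate_next_token()

# Paste the printed IDs into expected_output in the C++ test.
print(list(generator.get_sequence(0)))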
65 changes: 65 additions & 0 deletions test/model_tests.cpp
@@ -14,6 +14,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -274,5 +276,68 @@ Print all primes between 1 and n
std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
#endif
}
#endif

#if USE_DML && TEST_PHI2
TEST(ModelTests, TestApiDml) {

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}

TEST(ModelTests, TestTopKDml) {
auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.top_k = 3;
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}
#endif
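Both C++ test files now resolve PHI2_PATH per execution provider (cuda, dml, cpu). The Python API tests below rely on a phi2_for(device) fixture for the same purpose; a hypothetical sketch of such a helper, mirroring the phi-2/int4/<ep> layout used here (the real fixture in this repo may be implemented differently):

import os

def phi2_for(device: str, test_models: str = "test/test_models") -> str:
    # Mirrors the C++ PHI2_PATH macro: MODEL_PATH "phi-2/int4/<ep>".
    assert device in ("cpu", "cuda", "dml"), f"unexpected device: {device}"
    return os.path.join(test_models, "phi-2", "int4", device)

# Example: phi2_for("dml") -> "test/test_models/phi-2/int4/dml"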
2 changes: 2 additions & 0 deletions test/python/test_onnxruntime_genai.py
@@ -87,6 +87,8 @@ def main():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu")
if og.is_cuda_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda")
if og.is_dml_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml")

# Run ONNX Runtime GenAI tests
run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
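The driver now downloads a DML model whenever og.is_dml_available() reports true. The API tests are parametrized over a devices list; a minimal sketch of how that list is typically assembled from the same availability checks (assumed; the actual definition in test_onnxruntime_genai_api.py may differ):

import onnxruntime_genai as og

# Devices the parametrized tests can run on.
devices = ["cpu"]
if og.is_cuda_available():
    devices.append("cuda")
if og.is_dml_available():
    devices.append("dml")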
32 changes: 32 additions & 0 deletions test/python/test_onnxruntime_genai_api.py
@@ -247,6 +247,9 @@ def test_tokenizer_stream(device, phi2_for):
)
@pytest.mark.parametrize("device", devices)
def test_batching(device, phi2_for):
if device == "dml":
pytest.skip("EP DML does not support batching")

model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

@@ -259,6 +262,32 @@ def test_batching(device, phi2_for):
params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode_batch(prompts))
while not generator.is_done():
generator.generate_next_token()
for i in range(len(prompts)):
print(tokenizer.decode(generator.get_sequence(i)))


# TODO: CUDA pipelines use Python 3.6 and cannot download models, since downloading them
# requires PyTorch and HF transformers. This test should be re-enabled once the pipeline is updated.
@pytest.mark.skipif(
sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
reason="Python 3.8 is required for downloading models.",
)
@pytest.mark.parametrize("device", devices)
def test_e2e(device, phi2_for):
model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

prompts = [
"This is a test.",
]

params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

if device == "dml":
params.try_graph_capture_with_max_batch_size(len(prompts))

@@ -617,6 +646,9 @@ def _export_adapter(adapter, adapter_file_name):
adapter_paths.append(adapter_file_name)

return adapter_model_path, adapter_paths

if device == "dml":
pytest.skip("EP DML does not support adapters")

model_path, adapter_paths = _prepare_adapter_model(test_data_path)
model = og.Model(model_path)