diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml
index 678573606..0277983a3 100644
--- a/.github/workflows/win-directml-x64-build.yml
+++ b/.github/workflows/win-directml-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
   AZCOPY_AUTO_LOGIN_TYPE: MSI
   AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
   ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
@@ -80,7 +81,7 @@ jobs:
 
     - name: Configure CMake
       run: |
-        cmake --preset windows_x64_directml_release -DTEST_PHI2=False
+        cmake --preset windows_x64_directml_release -DTEST_PHI2=True
 
     - name: Build with CMake
       run: |
@@ -93,6 +94,10 @@ jobs:
        python -m pip install -r test\python\directml\ort\requirements.txt
        python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
 
+    - name: Run the Python Tests
+      run: |
+        python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
+
     - name: Verify Build Artifacts
       if: always()
       continue-on-error: true
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp
index 40835a42a..b0c6c6f18 100644
--- a/test/c_api_tests.cpp
+++ b/test/c_api_tests.cpp
@@ -14,6 +14,8 @@
 #ifndef PHI2_PATH
 #if USE_CUDA
 #define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
+#elif USE_DML
+#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
 #else
 #define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
 #endif
@@ -148,6 +150,7 @@ TEST(CAPITests, MaxLength) {
   generator->AppendTokens(input_ids_0.data(), input_ids_0.size());
   EXPECT_THROW(generator->AppendTokens(input_ids_1.data(), input_ids_1.size()), std::runtime_error);
 
+#if !USE_DML
   // Batch size 3 case
   std::vector<int32_t> input_ids_2{1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 0, 0, 0, 52, 104, 52, 53, 54, 55, 56,
@@ -158,10 +161,12 @@
 
   generator = OgaGenerator::Create(*model, *params);
   EXPECT_THROW(generator->AppendTokens(input_ids_2.data(), input_ids_2.size()), std::runtime_error);
+#endif
 }
 
+// DML doesn't support batch_size > 1
 TEST(CAPITests, EndToEndPhiBatch) {
-#if TEST_PHI2
+#if TEST_PHI2 && !USE_DML
   auto model = OgaModel::Create(PHI2_PATH);
   auto tokenizer = OgaTokenizer::Create(*model);
 
@@ -191,6 +196,65 @@
 
     auto out_string = tokenizer->Decode(generator->GetSequenceData(i), generator->GetSequenceCount(i));
     std::cout << "Decoded string:" << out_string << std::endl;
   }
+
+  // Verify outputs match expected outputs
+  std::vector<int32_t> expected_output{
+      1212, 318, 257, 1332, 13, 50256, 50256, 50256, 50256, 50256, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278, 6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417, 6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974,
+      49, 1381, 389, 7427, 17252, 0, 50256, 50256, 50256, 50256, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
+      464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256};
+
+  for (size_t i = 0; i < 3; i++) {
+    const auto sequence_length = generator->GetSequenceCount(i);
+    const auto* sequence_data = generator->GetSequenceData(i);
+
+    ASSERT_LE(sequence_length, 40);
+
+    const auto* expected_output_start = &expected_output[i * 40];
+    EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
+  }
+#endif
+}
+
+TEST(CAPITests, EndToEndPhi) {
+#if TEST_PHI2
+  auto model = OgaModel::Create(PHI2_PATH);
+  auto tokenizer = OgaTokenizer::Create(*model);
+
+  const char* input_strings[] = {
+      "This is a test."
+  };
+
+  auto input_sequences = OgaSequences::Create();
+  for (auto& string : input_strings)
+    tokenizer->Encode(string, *input_sequences);
+
+  auto params = OgaGeneratorParams::Create(*model);
+  params->SetSearchOption("max_length", 40);
+
+  auto generator = OgaGenerator::Create(*model, *params);
+  generator->AppendTokenSequences(*input_sequences);
+
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  // Decode the generated sequence
+  auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0));
+  std::cout << "Decoded string:" << out_string << std::endl;
+
+  // Verify outputs match expected outputs
+  std::vector<int32_t> expected_output{
+      1212, 318, 257, 1332, 13, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278,
+      6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417,
+      6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974, 82, 1039, 889, 263, 3684};
+
+  const auto sequence_length = generator->GetSequenceCount(0);
+  const auto* sequence_data = generator->GetSequenceData(0);
+
+  ASSERT_LE(sequence_length, 40);
+
+  const auto* expected_output_start = &expected_output[0];
+  EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
 #endif
 }
 
@@ -443,7 +507,8 @@ TEST(CAPITests, SetTerminate) {
 #endif
 }
 
-#if TEST_PHI2
+// DML doesn't support batch_size > 1
+#if TEST_PHI2 && !USE_DML
 
 struct Phi2Test {
   Phi2Test() {
@@ -521,12 +586,14 @@ TEST(CAPITests, TopKTopPCAPI) {
   test.Run();
 }
 
-#endif  // TEST_PHI2
+#endif  // TEST_PHI2 && !USE_DML
 
 #if TEST_PHI2
 TEST(CAPITests, AdaptersTest) {
 #ifdef USE_CUDA
   using OutputType = Ort::Float16_t;
+#elif defined(USE_DML)
+  using OutputType = Ort::Float16_t;
 #else
   using OutputType = float;
 #endif
diff --git a/test/model_tests.cpp b/test/model_tests.cpp
index 321d1ac46..c5dc29cae 100644
--- a/test/model_tests.cpp
+++ b/test/model_tests.cpp
@@ -14,6 +14,8 @@
 #ifndef PHI2_PATH
 #if USE_CUDA
 #define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
+#elif USE_DML
+#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
 #else
 #define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
 #endif
@@ -274,5 +276,68 @@ Print all primes between 1 and n
   std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
 #endif
 }
+#endif
+
+#if USE_DML && TEST_PHI2
+TEST(ModelTests, TestApiDml) {
+  auto prompt = R"(
+def print_prime(n):
+'''
+Print all primes between 1 and n
+'''
+)";
+
+  std::cout << "With prompt:" << prompt << "\r\n";
+
+  auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
+  auto tokenizer = model->CreateTokenizer();
+  auto tokens = tokenizer->Encode(prompt);
+
+  auto params = Generators::CreateGeneratorParams(*model);
+  params->search.batch_size = 1;
+  params->search.max_length = 128;
+
+  // Generator version
+  auto generator = Generators::CreateGenerator(*model, *params);
+  generator->AppendTokens(Generators::cpu_span<int32_t>(tokens.data(), tokens.size()));
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  auto result = generator->GetSequence(0);
+
+  std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
+}
+
+TEST(ModelTests, TestTopKDml) {
+  auto prompt = R"(
+def print_prime(n):
+'''
+Print all primes between 1 and n
+'''
+)";
+
+  std::cout << "With prompt:" << prompt << "\r\n";
+
+  auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
+  auto tokenizer = model->CreateTokenizer();
+  auto tokens = tokenizer->Encode(prompt);
+
+  auto params = Generators::CreateGeneratorParams(*model);
+  params->search.top_k = 3;
+  params->search.batch_size = 1;
+  params->search.max_length = 128;
+
+  // Generator version
+  auto generator = Generators::CreateGenerator(*model, *params);
+  generator->AppendTokens(Generators::cpu_span<int32_t>(tokens.data(), tokens.size()));
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  auto result = generator->GetSequence(0);
+
+  std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
+}
 #endif
\ No newline at end of file
diff --git a/test/python/test_onnxruntime_genai.py b/test/python/test_onnxruntime_genai.py
index 212de1cfd..35115760b 100644
--- a/test/python/test_onnxruntime_genai.py
+++ b/test/python/test_onnxruntime_genai.py
@@ -87,6 +87,8 @@ def main():
         output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu")
         if og.is_cuda_available():
             output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda")
+        if og.is_dml_available():
+            output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml")
 
     # Run ONNX Runtime GenAI tests
     run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py
index 445cbf19e..6220e8e36 100644
--- a/test/python/test_onnxruntime_genai_api.py
+++ b/test/python/test_onnxruntime_genai_api.py
@@ -247,6 +247,9 @@ def test_tokenizer_stream(device, phi2_for):
 )
 @pytest.mark.parametrize("device", devices)
 def test_batching(device, phi2_for):
+    if device == "dml":
+        pytest.skip("EP DML does not support batching")
+
     model = og.Model(phi2_for(device))
     tokenizer = og.Tokenizer(model)
 
@@ -259,6 +262,32 @@ def test_batching(device, phi2_for):
     params = og.GeneratorParams(model)
     params.set_search_options(max_length=20, batch_size=len(prompts))  # To run faster
 
+    generator = og.Generator(model, params)
+    generator.append_tokens(tokenizer.encode_batch(prompts))
+    while not generator.is_done():
+        generator.generate_next_token()
+    for i in range(len(prompts)):
+        print(tokenizer.decode(generator.get_sequence(i)))
+
+
+# TODO: CUDA pipelines use python3.6 and do not have a way to download models since downloading models
+# requires pytorch and hf transformers. This test should be re-enabled once the pipeline is updated.
+@pytest.mark.skipif(
+    sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
+    reason="Python 3.8 is required for downloading models.",
+)
+@pytest.mark.parametrize("device", devices)
+def test_e2e(device, phi2_for):
+    model = og.Model(phi2_for(device))
+    tokenizer = og.Tokenizer(model)
+
+    prompts = [
+        "This is a test.",
+    ]
+
+    params = og.GeneratorParams(model)
+    params.set_search_options(max_length=20, batch_size=len(prompts))  # To run faster
+
     if device == "dml":
         params.try_graph_capture_with_max_batch_size(len(prompts))
 
@@ -617,6 +646,9 @@ def _export_adapter(adapter, adapter_file_name):
         adapter_paths.append(adapter_file_name)
 
         return adapter_model_path, adapter_paths
+
+    if device == "dml":
+        pytest.skip("EP DML does not support adapters")
 
     model_path, adapter_paths = _prepare_adapter_model(test_data_path)
     model = og.Model(model_path)