Add DML tests #1219

Open
wants to merge 6 commits into main
7 changes: 6 additions & 1 deletion .github/workflows/win-directml-x64-build.yml
@@ -12,6 +12,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1"
@@ -80,7 +81,7 @@ jobs:

- name: Configure CMake
run: |
cmake --preset windows_x64_directml_release -DTEST_PHI2=False
cmake --preset windows_x64_directml_release -DTEST_PHI2=True

- name: Build with CMake
run: |
@@ -93,6 +94,10 @@ jobs:
python -m pip install -r test\python\directml\ort\requirements.txt
python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps

- name: Run the Python Tests
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e

- name: Verify Build Artifacts
if: always()
continue-on-error: true
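The HF_TOKEN secret added to the workflow env is presumably what the new e2e step uses to download the Phi-2 model from Hugging Face. A minimal sketch of how a download step typically picks it up, assuming huggingface_hub is used (the repo's actual download path may differ):

import os
from huggingface_hub import snapshot_download  # assumed dependency, not confirmed by this diff

# The workflow exports HF_TOKEN, so it is visible to the test process as an env var.
token = os.environ.get("HF_TOKEN")
local_dir = snapshot_download("microsoft/phi-2", token=token)  # model id chosen for illustration
print(local_dir)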
73 changes: 70 additions & 3 deletions test/c_api_tests.cpp
@@ -14,6 +14,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -148,6 +150,7 @@ TEST(CAPITests, MaxLength) {
generator->AppendTokens(input_ids_0.data(), input_ids_0.size());
EXPECT_THROW(generator->AppendTokens(input_ids_1.data(), input_ids_1.size()), std::runtime_error);

#if !USE_DML
// Batch size 3 case
std::vector<int32_t> input_ids_2{1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
0, 0, 0, 52, 104, 52, 53, 54, 55, 56,
@@ -158,10 +161,12 @@

generator = OgaGenerator::Create(*model, *params);
EXPECT_THROW(generator->AppendTokens(input_ids_2.data(), input_ids_2.size()), std::runtime_error);
#endif
}

// DML doesn't support batch_size > 1
TEST(CAPITests, EndToEndPhiBatch) {
#if TEST_PHI2
#if TEST_PHI2 && !USE_DML
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

@@ -191,6 +196,65 @@ TEST(CAPITests, EndToEndPhiBatch) {
auto out_string = tokenizer->Decode(generator->GetSequenceData(i), generator->GetSequenceCount(i));
std::cout << "Decoded string:" << out_string << std::endl;
}

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 50256, 50256, 50256, 50256, 50256, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278, 6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417, 6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974,
49, 1381, 389, 7427, 17252, 0, 50256, 50256, 50256, 50256, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13, 198, 50284, 37811, 628, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256};

for (size_t i = 0; i < 3; i++) {
const auto sequence_length = generator->GetSequenceCount(i);
const auto* sequence_data = generator->GetSequenceData(i);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[i * 40];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
}
#endif
}

TEST(CAPITests, EndToEndPhi) {
#if TEST_PHI2
auto model = OgaModel::Create(PHI2_PATH);
auto tokenizer = OgaTokenizer::Create(*model);

const char* input_strings[] = {
"This is a test."
};

auto input_sequences = OgaSequences::Create();
for (auto& string : input_strings)
tokenizer->Encode(string, *input_sequences);

auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 40);

auto generator = OgaGenerator::Create(*model, *params);
generator->AppendTokenSequences(*input_sequences);

while (!generator->IsDone()) {
generator->GenerateNextToken();
}

// Decode the generated sequence
auto out_string = tokenizer->Decode(generator->GetSequenceData(0), generator->GetSequenceCount(0));
std::cout << "Decoded string:" << out_string << std::endl;

// Verify outputs match expected outputs
std::vector<int32_t> expected_output{
1212, 318, 257, 1332, 13, 198, 50280, 2, 16926, 1330, 1635, 10412, 6617, 278,
6335, 32994, 21857, 13849, 38665, 82, 21815, 1108, 9557, 40755, 27446, 2417,
6381, 6, 7131, 6, 14870, 31314, 21411, 46009, 3974, 82, 1039, 889, 263, 3684};

const auto sequence_length = generator->GetSequenceCount(0);
const auto* sequence_data = generator->GetSequenceData(0);

ASSERT_LE(sequence_length, 40);

const auto* expected_output_start = &expected_output[0];
EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t)));
#endif
}

@@ -443,7 +507,8 @@ TEST(CAPITests, SetTerminate) {
#endif
}

#if TEST_PHI2
// DML doesn't support batch_size > 1
#if TEST_PHI2 && !USE_DML

struct Phi2Test {
Phi2Test() {
@@ -521,12 +586,14 @@ TEST(CAPITests, TopKTopPCAPI) {
test.Run();
}

#endif // TEST_PHI2
#endif // TEST_PHI2 && !USE_DML

#if TEST_PHI2
TEST(CAPITests, AdaptersTest) {
#ifdef USE_CUDA
using OutputType = Ort::Float16_t;
#elif defined(USE_DML)
using OutputType = Ort::Float16_t;
#else
using OutputType = float;
#endif
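The new EndToEndPhi test (and the batch variant above) compares generated token IDs against hard-coded golden sequences with memcmp. A minimal sketch of how such golden IDs could be regenerated through the Python API if the model or defaults change (hypothetical helper, not part of this PR; the local model path is an assumption):

import onnxruntime_genai as og

# Assumed local layout; the C++ tests resolve PHI2_PATH to MODEL_PATH "phi-2/int4/<ep>".
model = og.Model("test/test_models/phi-2/int4/cpu")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=40, batch_size=1)  # mirrors the C++ test settings

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("This is a test."))
while not generator.is_done():
    generator.generate_next_token()

# Paste the printed IDs into expected_output in the C++ test.
print(list(generator.get_sequence(0)))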
65 changes: 65 additions & 0 deletions test/model_tests.cpp
@@ -14,6 +14,8 @@
#ifndef PHI2_PATH
#if USE_CUDA
#define PHI2_PATH MODEL_PATH "phi-2/int4/cuda"
#elif USE_DML
#define PHI2_PATH MODEL_PATH "phi-2/int4/dml"
#else
#define PHI2_PATH MODEL_PATH "phi-2/int4/cpu"
#endif
@@ -274,5 +276,68 @@ Print all primes between 1 and n
std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
#endif
}
#endif

#if USE_DML && TEST_PHI2
TEST(ModelTests, TestApiDml) {

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}

TEST(ModelTests, TestTopKDml) {
auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto model = Generators::CreateModel(Generators::GetOrtEnv(), PHI2_PATH);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

auto params = Generators::CreateGeneratorParams(*model);
params->search.top_k = 3;
params->search.batch_size = 1;
params->search.max_length = 128;

// Generator version
auto generator = Generators::CreateGenerator(*model, *params);
generator->AppendTokens(Generators::cpu_span<int>(tokens.data(), tokens.size()));
while (!generator->IsDone()) {
generator->GenerateNextToken();
}

auto result = generator->GetSequence(0);

std::cout << tokenizer->Decode(result.CopyDeviceToCpu()) << "\r\n";
}
#endif
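Both C++ test files now resolve PHI2_PATH per execution provider (cuda, dml, cpu). The Python API tests below rely on a phi2_for(device) fixture for the same purpose; a hypothetical sketch of such a helper, mirroring the phi-2/int4/<ep> layout used here (the real fixture in this repo may be implemented differently):

import os

def phi2_for(device: str, test_models: str = "test/test_models") -> str:
    # Mirrors the C++ PHI2_PATH macro: MODEL_PATH "phi-2/int4/<ep>".
    assert device in ("cpu", "cuda", "dml"), f"unexpected device: {device}"
    return os.path.join(test_models, "phi-2", "int4", device)

# Example: phi2_for("dml") -> "test/test_models/phi-2/int4/dml"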
2 changes: 2 additions & 0 deletions test/python/test_onnxruntime_genai.py
@@ -87,6 +87,8 @@ def main():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu")
if og.is_cuda_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda")
if og.is_dml_available():
output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml")

# Run ONNX Runtime GenAI tests
run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
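The driver now downloads a DML model whenever og.is_dml_available() reports true. The API tests are parametrized over a devices list; a minimal sketch of how that list is typically assembled from the same availability checks (assumed; the actual definition in test_onnxruntime_genai_api.py may differ):

import onnxruntime_genai as og

# Devices the parametrized tests can run on.
devices = ["cpu"]
if og.is_cuda_available():
    devices.append("cuda")
if og.is_dml_available():
    devices.append("dml")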
32 changes: 32 additions & 0 deletions test/python/test_onnxruntime_genai_api.py
@@ -247,6 +247,9 @@ def test_tokenizer_stream(device, phi2_for):
)
@pytest.mark.parametrize("device", devices)
def test_batching(device, phi2_for):
if device == "dml":
pytest.skip("EP DML does not support batching")

model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

@@ -259,6 +262,32 @@ def test_batching(device, phi2_for):
params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode_batch(prompts))
while not generator.is_done():
generator.generate_next_token()
for i in range(len(prompts)):
print(tokenizer.decode(generator.get_sequence(i)))


# TODO: CUDA pipelines use Python 3.6 and cannot download models, since downloading them
# requires PyTorch and HF transformers. This test should be re-enabled once the pipeline is updated.
@pytest.mark.skipif(
sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
reason="Python 3.8 is required for downloading models.",
)
@pytest.mark.parametrize("device", devices)
def test_e2e(device, phi2_for):
model = og.Model(phi2_for(device))
tokenizer = og.Tokenizer(model)

prompts = [
"This is a test.",
]

params = og.GeneratorParams(model)
params.set_search_options(max_length=20, batch_size=len(prompts)) # To run faster

if device == "dml":
params.try_graph_capture_with_max_batch_size(len(prompts))

@@ -617,6 +646,9 @@ def _export_adapter(adapter, adapter_file_name):
adapter_paths.append(adapter_file_name)

return adapter_model_path, adapter_paths

if device == "dml":
pytest.skip("EP DML does not support adapters")

model_path, adapter_paths = _prepare_adapter_model(test_data_path)
model = og.Model(model_path)