diff --git a/README.md b/README.md index 290058040..2c5f8f02b 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ --- -[![Build Status](https://github.com/KernelTuner/kernel_tuner/actions/workflows/build-test-python-package.yml/badge.svg)](https://github.com/KernelTuner/kernel_tuner/actions/workflows/build-test-python-package.yml) +[![Build Status](https://github.com/KernelTuner/kernel_tuner/actions/workflows/test-python-package.yml/badge.svg)](https://github.com/KernelTuner/kernel_tuner/actions/workflows/test-python-package.yml) [![CodeCov Badge](https://codecov.io/gh/KernelTuner/kernel_tuner/branch/master/graph/badge.svg)](https://codecov.io/gh/KernelTuner/kernel_tuner) [![PyPi Badge](https://img.shields.io/pypi/v/kernel_tuner.svg?colorB=blue)](https://pypi.python.org/pypi/kernel_tuner/) [![Zenodo Badge](https://zenodo.org/badge/54894320.svg)](https://zenodo.org/badge/latestdoi/54894320) diff --git a/doc/source/observers.rst b/doc/source/observers.rst index 174e6a01a..df4013734 100644 --- a/doc/source/observers.rst +++ b/doc/source/observers.rst @@ -112,3 +112,11 @@ More information about PMT can be found here: https://git.astron.nl/RD/pmt/ +NCUObserver +~~~~~~~~~~~ + +The NCUObserver can be used to automatically extract performance counters during tuning using NVIDIA's Nsight Compute profiler. +The NCUObserver relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics + +.. 
autoclass:: kernel_tuner.observers.ncu.NCUObserver + diff --git a/examples/cuda/vector_add_observers_ncu.py b/examples/cuda/vector_add_observers_ncu.py new file mode 100644 index 000000000..589420a3f --- /dev/null +++ b/examples/cuda/vector_add_observers_ncu.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +"""This is the minimal example from the README""" +import json + +import numpy +from kernel_tuner import tune_kernel +from kernel_tuner.observers.ncu import NCUObserver + +def tune(): + + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i