From 931cccef85f2a1ededd56f317121db9dd94aa892 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Sat, 21 Aug 2021 19:29:44 -0600 Subject: [PATCH 01/20] Remove usage of the word "allocate" in reference to local memory See https://stackoverflow.com/questions/68869806/how-is-performance-affected-by-using-numba-cuda-local-array-compared-with-numb --- docs/source/cuda/memory.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index 7b134fe5d9e..b17d9e8ced2 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -152,21 +152,23 @@ traditional dynamic memory management. Local memory ============ -Local memory is an area of memory private to each thread. Using local -memory helps allocate some scratchpad area when scalar local variables -are not enough. The memory is allocated once for the duration of the kernel, -unlike traditional dynamic memory management. +Local memory is an area of memory private to each thread. Using local memory +helps reserve some scratchpad area to variables when scalar local variables are +not enough. The memory is allocated once for the duration of the kernel, unlike +traditional dynamic memory management. .. function:: numba.cuda.local.array(shape, type) :noindex: - Allocate a local array of the given *shape* and *type* on the device. + Create a variable representing a local array of the given *shape* and *type* + on the device that corresponds to a chunk of statically defined memory. *shape* is either an integer or a tuple of integers representing the array's - dimensions and must be a simple constant expression. *type* is a - :ref:`Numba type ` of the elements needing to be stored in the - array. The array is private to the current thread. An array-like object is - returned which can be read and written to like any standard array - (e.g. through indexing). + dimensions and must be a simple constant expression. *type* is a :ref:`Numba + type ` of the elements needing to be stored in the array. The + array is private to the current thread. An array-like object is returned + which can be read and written to like any standard array (e.g. through + indexing). See also the Local Memory section of `Device Memory Accesses + `_ Constant memory =============== From 4123b0f4a6e725b6924be717ca4a23dae9043b6c Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 15:10:12 +0530 Subject: [PATCH 02/20] fix:updated url to use issue template --- numba/core/errors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 7f52311ec2a..7968f11e560 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -349,7 +349,7 @@ def termcolor(): Unsupported functionality was found in the code Numba was trying to compile. If this functionality is important to you please file a feature request at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=bug_report.md """ interpreter_error_info = """ @@ -359,7 +359,7 @@ def termcolor(): environment variable to non-zero, and then rerun the code). 
If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md To see Python/NumPy features supported by the latest release of Numba visit: https://numba.pydata.org/numba-doc/latest/reference/pysupported.html @@ -376,7 +376,7 @@ def termcolor(): https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md If you think your code should work with Numba. %s """ % feedback_details @@ -395,7 +395,7 @@ def termcolor(): If you think your code should work with Numba, please report the error message and traceback, along with a minimal reproducer at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=bug_report.md """ reportable_issue_info = """ From e3bfaf048137e043525a89cba3196950f3fdbd59 Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 16:14:27 +0530 Subject: [PATCH 03/20] fix:updated url in error message and feature request to use issue template --- numba/core/errors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 7968f11e560..c4b872dc957 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -349,7 +349,7 @@ def termcolor(): Unsupported functionality was found in the code Numba was trying to compile. If this functionality is important to you please file a feature request at: -https://github.com/numba/numba/issues/new?template=bug_report.md +https://github.com/numba/numba/issues/new?template=feature_request.md """ interpreter_error_info = """ @@ -359,7 +359,8 @@ def termcolor(): environment variable to non-zero, and then rerun the code). If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md To see Python/NumPy features supported by the latest release of Numba visit: https://numba.pydata.org/numba-doc/latest/reference/pysupported.html @@ -376,7 +377,8 @@ def termcolor(): https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md If you think your code should work with Numba. %s """ % feedback_details From 645f34c3a16bd2c8fa2dd0391246aa12f1d631ee Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 16:25:05 +0530 Subject: [PATCH 04/20] fix:updated url for error report and feature request --- numba/core/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index c4b872dc957..061954ab5f0 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -358,8 +358,8 @@ def termcolor(): without Numba? 
(To temporarily disable Numba JIT, set the `NUMBA_DISABLE_JIT` environment variable to non-zero, and then rerun the code). -If the code is valid and the unsupported functionality is important to you -please file a feature request at: +If the code is valid and the unsupported functionality is important to you +please file a feature request at: https://github.com/numba/numba/issues/new?template=feature_request.md To see Python/NumPy features supported by the latest release of Numba visit: From ca35fc77c027f7a4e4f3faa08aa5cfb4ffe08b49 Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 17:00:04 +0530 Subject: [PATCH 05/20] fix:updated url for error report and feature request using issue template --- numba/core/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 061954ab5f0..b69321cd332 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -358,7 +358,7 @@ def termcolor(): without Numba? (To temporarily disable Numba JIT, set the `NUMBA_DISABLE_JIT` environment variable to non-zero, and then rerun the code). -If the code is valid and the unsupported functionality is important to you +If the code is valid and the unsupported functionality is important to you please file a feature request at: https://github.com/numba/numba/issues/new?template=feature_request.md From dcdde27a260c1fc414e57c771f6b69c23e76d6e3 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:31:11 +0100 Subject: [PATCH 06/20] test_inspect_cli: Decode exception with default (utf-8) codec Fixes #7337. --- numba/tests/test_help.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/test_help.py b/numba/tests/test_help.py index a78f05f0a3c..8199d4c93c4 100644 --- a/numba/tests/test_help.py +++ b/numba/tests/test_help.py @@ -89,4 +89,4 @@ def test_inspect_cli(self): with self.assertRaises(subprocess.CalledProcessError) as raises: subprocess.check_output(cmds, stderr=subprocess.STDOUT) self.assertIn("\'foo\' is not supported", - raises.exception.stdout.decode('ascii')) + raises.exception.stdout.decode()) From 6ce6a6d10957440df2119b6c51fe14c95bdc22a9 Mon Sep 17 00:00:00 2001 From: Sterling Baird <45469701+sgbaird@users.noreply.github.com> Date: Sat, 28 Aug 2021 01:49:04 -0600 Subject: [PATCH 07/20] Update docs/source/cuda/memory.rst Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- docs/source/cuda/memory.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index b17d9e8ced2..c91c46dc588 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -167,8 +167,11 @@ traditional dynamic memory management. type ` of the elements needing to be stored in the array. The array is private to the current thread. An array-like object is returned which can be read and written to like any standard array (e.g. through - indexing). See also the Local Memory section of `Device Memory Accesses - `_ + indexing). + + .. seealso:: The Local Memory section of `Device Memory Accesses + `_ + in the CUDA programming guide. 
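As an illustrative sketch only (assuming a float32 input whose rows have exactly 8 columns, since the shape argument must be a compile-time constant), a kernel might use a per-thread local array as scratch space like this::

    from numba import cuda, float32
    import numpy as np

    @cuda.jit
    def reverse_rows(arr):
        i = cuda.grid(1)
        if i < arr.shape[0]:
            # scratchpad private to this thread; the shape (8) is a constant
            scratch = cuda.local.array(8, float32)
            for j in range(8):
                scratch[j] = arr[i, 8 - 1 - j]
            for j in range(8):
                arr[i, j] = scratch[j]

    data = np.arange(32, dtype=np.float32).reshape(4, 8)
    reverse_rows[1, 32](data)  # each thread reverses one row in place

The function name, launch configuration, and the fixed row width of 8 are assumptions made for the example, not values taken from the patches above.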
Constant memory =============== From e3a7eeaa09fd33f7333a9532a56b3029c09104be Mon Sep 17 00:00:00 2001 From: sgbaird Date: Sat, 28 Aug 2021 01:51:56 -0600 Subject: [PATCH 08/20] Update memory.rst --- docs/source/cuda/memory.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index c91c46dc588..1b434b962d0 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -152,16 +152,15 @@ traditional dynamic memory management. Local memory ============ -Local memory is an area of memory private to each thread. Using local memory -helps reserve some scratchpad area to variables when scalar local variables are -not enough. The memory is allocated once for the duration of the kernel, unlike -traditional dynamic memory management. +Local memory is an area of memory private to each thread. Using local +memory helps allocate some scratchpad area when scalar local variables +are not enough. The memory is allocated once for the duration of the kernel, +unlike traditional dynamic memory management. .. function:: numba.cuda.local.array(shape, type) :noindex: - Create a variable representing a local array of the given *shape* and *type* - on the device that corresponds to a chunk of statically defined memory. + Allocate a local array of the given *shape* and *type* on the device. *shape* is either an integer or a tuple of integers representing the array's dimensions and must be a simple constant expression. *type* is a :ref:`Numba type ` of the elements needing to be stored in the array. The From 35ff818de2431796b4887a5072a85836f011108c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:05 -0600 Subject: [PATCH 09/20] Create helper.py --- numba/cuda/kernels/device/helper.py | 349 ++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 numba/cuda/kernels/device/helper.py diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py new file mode 100644 index 00000000000..074a964b90d --- /dev/null +++ b/numba/cuda/kernels/device/helper.py @@ -0,0 +1,349 @@ +""" +Helper functions for distance matrix computations. +""" + +import os +from math import sqrt +from numba import jit +from time import time + +inline = os.environ["INLINE"] + + +class Timer(object): + """ + Simple timer class. + + https://stackoverflow.com/a/5849861/13697228 + Usage + ----- + with Timer("description"): + # do stuff + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + self.tstart = time() + + def __exit__(self, type, value, traceback): + if self.name: + print("[%s]" % self.name,) + print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) + + +@jit(inline=inline) +def copy(a, out): + """ + Copy a into out. + + Parameters + ---------- + a : vector + values to copy. + out : vector + values to copy into. + + Returns + ------- + None. + + """ + for i in range(len(a)): + i = int(i) + out[i] = a[i] + + +@jit(inline=inline) +def insertionSort(arr): + """ + Perform insertion sorting on a vector. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + Returns + ------- + None. 
+ + """ + + for i in range(1, len(arr)): + key = arr[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + j -= 1 + arr[j + 1] = key + + +@jit(inline=inline) +def insertionArgSort(arr, ids): + """ + Perform insertion sorting on a vector and track the sorting indices. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + ids : 1D array + Initialized array to fill with sorting indices. + + Returns + ------- + None. + + """ + + # fill ids + for i in range(len(arr)): + ids[i] = i + + for i in range(1, len(arr)): + key = arr[i] + id_key = ids[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + ids[j + 1] = ids[j] + j -= 1 + arr[j + 1] = key + ids[j + 1] = id_key + + +@jit(inline=inline) +def concatenate(vec, vec2, out): + """ + Concatenate two vectors. + + It also reassigns the values of out back into vec and vec2 to prevent odd + roundoff issues. + + Parameters + ---------- + vec : vector + First vector to concatenate. + vec2 : vector + Second vector to concatenate. + out : vector + Initialization of concatenated vector. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) + n2 = len(vec2) + # populate out + for i in range(n): + out[i] = vec[i] + for i in range(n2): + out[i + n] = vec2[i] + + +@jit(inline=inline) +def diff(vec, out): + """ + Compute gradient. + + Parameters + ---------- + vec : vector of floats + The vector for which to calculate the gradient. + out : vector of floats + Initialization of output vector (one less element than vec) which + contains the gradient. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) - 1 + # progressively add next elements + for i in range(n): + out[i] = vec[i + 1] - vec[i] + +# maybe there's a faster implementation somewhere + + +@jit(inline=inline) +def bisect_right(a, v, ids): + """ + Return indices where to insert items in v in list a, assuming a is sorted. + + Parameters + ---------- + arr : 1D array + Array for which to perform a binary search. + v : 1D array + Values to perform the binary search for. + ids : 1D array + Initialized list of indices, same size as v. + + Returns + ------- + None. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, a.insert(x) will + insert just after the rightmost x already there. Optional args lo + (default 0) and hi (default len(a)) bound the slice of a to be searched. + + Source: modified from + https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa + """ + n = len(a) + n2 = len(v) + for i in range(n2): + lo = 0 + mid = 0 + hi = n + x = v[i] + while lo < hi: + mid = (lo + hi) // 2 + # Use __lt__ to match the logic in list.sort() and in heapq + if x < a[mid]: + hi = mid + else: + lo = mid + 1 + ids[i] = lo + + +@jit(inline=inline) +def sort_by_indices(v, ids, out): + """ + Sort v by ids and assign to out, as in out = v[ids]. + + Parameters + ---------- + v : vector + values to sort. + ids : vector + indices to use for sorting. + out : vector + preallocated vector to put sorted values into. + + Returns + ------- + None. 
+ + """ + for i in range(len(ids)): + idx = ids[i] + out[i] = v[idx] + + +@jit(inline=inline) +def cumsum(vec, out): + """ + Return the cumulative sum of the elements in a vector. + + Referenced https://stackoverflow.com/a/15889203/13697228 + + Parameters + ---------- + vec : 1D array + Vector for which to compute the cumulative sum. + + out: 1D array + Initialized vector which stores the cumulative sum. + + Returns + ------- + out : 1D array + The cumulative sum of elements in vec (same shape as vec). + + """ + # number of elements + n = len(vec) + # initialize + total = 0.0 + # progressively add next elements + for i in range(n): + total += vec[i] + out[i] = total + + +@jit(inline=inline) +def divide(v, b, out): + """ + Divide a vector by a scalar. + + Parameters + ---------- + v : vector + vector numerator. + b : float + scalar denominator. + out : vector + initialized vector to populate with divided values. + + Returns + ------- + None. + + """ + for i in range(len(v)): + out[i] = v[i] / b + + +@jit(inline=inline) +def integrate(u_cdf, v_cdf, deltas, p): + """ + Integrate between two vectors using a p-norm. + + Parameters + ---------- + u_cdf : numeric vector + First CDF. + v_cdf : numeric vector + Second CDF of same length as a. + deltas : numeric vector + The differences between the original vectors a and b. + p : int, optional + The finite power for which to compute the p-norm. The default is 2. + + If p == 1 or p == 2, use of power is avoided, which introduces an overhead + # of about 15% + Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa + However, without support for math.square or np.square (2021-08-19), + math.pow is required for p == 2. + + Returns + ------- + out : numeric scalar + The p-norm of a and b. + + """ + n = len(u_cdf) + out = 0.0 + if p == 1: + for i in range(n): + out += abs(u_cdf[i] - v_cdf[i]) * deltas[i] + elif p == 2: + for i in range(n): + out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i] + out = sqrt(out) + elif p > 2: + for i in range(n): + out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] + out = out ** (1 / p) + return out From 367ac21cfed35bba5a3047b60a136ebe73bc949c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:11 -0600 Subject: [PATCH 10/20] Create myjit.py --- numba/cuda/kernels/device/myjit.py | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 numba/cuda/kernels/device/myjit.py diff --git a/numba/cuda/kernels/device/myjit.py b/numba/cuda/kernels/device/myjit.py new file mode 100644 index 00000000000..bc83d768b74 --- /dev/null +++ b/numba/cuda/kernels/device/myjit.py @@ -0,0 +1,87 @@ +""" +Decorator to assign the right jit for different targets. +""" +from numba import cuda, jit +from Lib import inspect +# from Lib.textwrap import dedent + + +# HACK +def get_myjit(target="cuda", + inline="never", + device=True, + parallel=False, + debug=False, + opt=True + ): + """ + Get a decorator that assigns the right jit for different targets. + + Parameters + ---------- + target : str, optional + Whether to compute on "cuda" or on "cpu". The default is "cuda". + self.inline : str, optional + Whether to inline functions. The default is "never". + + Returns + ------- + myjit : function + The decorator with the specified keyword arguments. + + """ + if debug and opt: + raise ValueError("debug and opt should not both be True") + + def myjit(f): + """ + Decorator to assign the right jit for different targets. 
+ + In case of non-cuda targets, all instances of `cuda.local.array` + are replaced by `np.empty`. This is a dirty fix, hopefully in the + near future numba will support numpy array allocation and this will + not be necessary anymore + + Modified from: https://github.com/numba/numba/issues/2571 + + Parameters + ---------- + f : function + function for which the decorator is applied to. + + Returns + ------- + newfun : function + cuda.jit or jit version of f. + + """ + source = inspect.getsource(f).splitlines() + assert '@myjit' in source[0] or '@mycudajit' in source[0] + indent_spaces = len(source[0]) - len(source[0].lstrip()) + source = [s[indent_spaces:] for s in source] + source = '\n'.join(source[1:]) + '\n' + + # source = inspect.dedent(source) + + if target == 'cuda': + source = source.replace('prange', 'range') + exec(source) + fun = eval(f.__name__) + newfun = cuda.jit(f, device=device, inline=inline, + debug=debug, opt=opt) + # needs to be exported to globals + globals()[f.__name__] = newfun + return newfun + + elif target == 'cpu': + source = source.replace('cuda.local.array', 'np.empty') + if not parallel: + source = source.replace('prange', 'range') + exec(source) + fun = eval(f.__name__) + newfun = jit(fun, nopython=True, inline=inline) + # needs to be exported to globals + globals()[f.__name__] = newfun + return newfun + + return myjit From ce4dd4da2a114cf27fcfa46e2201ed8edd3061f6 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:30 -0600 Subject: [PATCH 11/20] Create test_helper.py --- numba/cuda/tests/cudapy/test_helper.py | 83 ++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 numba/cuda/tests/cudapy/test_helper.py diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py new file mode 100644 index 00000000000..81d2666fa8a --- /dev/null +++ b/numba/cuda/tests/cudapy/test_helper.py @@ -0,0 +1,83 @@ +""" +Test helper functions for distance matrix calculations. 
+""" +import os +import numpy as np +from numba.cuda.testing import unittest, CUDATestCase +import numba.cuda.kernels.device.helper as hp +from numpy.testing import assert_allclose, assert_equal + +bits = int(os.environ["MACHINE_BITS"]) + +if bits == 32: + np_float = np.float32 + np_int = np.float32 +elif bits == 64: + np_float = np.float64 + np_int = np.float64 + +tol = 1e-5 + + +class TestHelperFunctions(CUDATestCase): + def test_concatenate(self): + vec = np.random.rand(5) + vec2 = np.random.rand(10) + check = np.concatenate((vec, vec2)) + out = np.zeros(len(vec) + len(vec2), dtype=np_float) + hp.concatenate(vec, vec2, out) + + assert_allclose(out, check) + vec[0] = 100.0 + assert_allclose(out, check) + + def test_diff(self): + vec = np.random.rand(10) + out = np.zeros(len(vec) - 1) + hp.diff(vec, out) + check = np.diff(vec) + assert_equal(out, check) + + def test_bisect_right(self): + a = np.random.rand(10) + a.sort() + v = np.random.rand(5) + ids = np.zeros_like(v[:-1], dtype=np_int) + hp.bisect_right(a, v[:-1], ids) + check = np.searchsorted(a, v[:-1], side="right") + assert_equal(ids, check) + + def test_sort_by_indices(self): + v = np.random.rand(10) + ids = np.arange(len(v)) + np.random.shuffle(ids) + out = np.zeros_like(v) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_equal(out, check) + + v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731]) + ids = np.array([1, 2, 3, 3, 3]) + out = np.zeros_like(ids, dtype=np_float) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_allclose(out, check) + + def test_cumsum(self): + vec = np.random.rand(10) + out = np.zeros_like(vec, dtype=np_float) + hp.cumsum(vec, out) + check = np.cumsum(vec) + assert_allclose(out, check) + + def test_divide(self): + v = np.random.rand(10) + b = 4.62 + out = np.zeros_like(v, dtype=np_float) + hp.divide(v, b, out) + check = v / b + assert_allclose(out, check) + + +if __name__ == '__main__': + unittest.main() From 0463c09b4a8bf5458c8cb977d0e05ad0b55491cd Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:35 -0600 Subject: [PATCH 12/20] Create test_dist_matrix.py --- numba/cuda/tests/cudapy/test_dist_matrix.py | 230 ++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py new file mode 100644 index 00000000000..cf5fa5c5e22 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -0,0 +1,230 @@ +# -*- coding: utf-8 -*- +""" +Test distance matrix calculations. +""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os + +os.environ["INLINE"] = "never" + +from numba.cuda.kernels.device.helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa + + +rows = 6 +cols = 100 + +testQ = True +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. 
+ The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4)] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. + + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0: i] + uw = u_uw[i: n] + v = v_vw[0: i] + vw = v_vw[i: n] + # calculate distance + distance = scipy_wasserstein_distance( + u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. 
+ + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + U, V, U_weights, V_weights = test_data() + + Utest = U[0:6] + Vtest = V[0:6] + Uwtest = U_weights[0:6] + Vwtest = V_weights[0:6] + + for target in ["cuda", "cpu"]: + from numba.cuda.kernels.dist_matrix import dist_matrix # noqa + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, + metric=metric, target=target) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric, target=target) # noqa + if verbose_test: + print(one_set, '\n') + + # compile + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric, target=target) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric, target=target) # noqa + if verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric, target=target) # noqa + if verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + target=target + ) + if verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + tol = 1e-5 + assert_allclose( + one_set.ravel(), one_set_check.ravel(), rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set.ravel(), two_set_check.ravel(), rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + one_set_sparse, one_sparse_check, rtol=tol, + err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set_sparse, two_sparse_check, rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + + +if __name__ == '__main__': + unittest.main() From 61f6927ff758818943dbdf5d10447f7f51b1adab Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:39:49 -0600 Subject: [PATCH 13/20] Create dist_matrix.py --- numba/cuda/kernels/dist_matrix.py | 776 ++++++++++++++++++++++++++++++ 1 file changed, 776 insertions(+) create mode 100644 numba/cuda/kernels/dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py new file mode 100644 index 00000000000..7561cb3fd81 --- /dev/null +++ b/numba/cuda/kernels/dist_matrix.py @@ -0,0 +1,776 @@ +""" +Compute a distance matrix for various cases. 
+ +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import numba.cuda.kernels.device.helper as hp +import os +import numpy as np +from math import sqrt, ceil + +# CUDA Simulator not working +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit, prange # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +from numba.types import int32, float32, int64, float64 # noqa +from numba.cuda.kernels.device.myjit import get_myjit # noqa +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 +os.environ["MACHINE_BITS"] = str(bits) + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.") + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 +TPB = 16 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +@ jit(inline=inline) +def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. 
+ cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. + """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False") # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True") # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # 
sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, + u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, + v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@ jit(inline=inline) +def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. + + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa + + +@ jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d + + +@ jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." 
+ + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance( + u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. + mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. 
+ + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix(U, + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. + Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + "However, os.environ[\"COLUMNS\"] does not match the " + "number of columns in U. Reset the environment variable " + "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python.") + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? 
(useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray( + U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device(np.asarray( + U_weights, dtype=np_float), stream=stream) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray( + V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray( + pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray( + out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, + cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, + cuda_out, metric_num) + + out = 
cuda_out.copy_to_host(stream=stream) + + return out + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. + +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" From aa413a966fc3c28b9ff646f7d9e2a13f673ff20b Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:39:59 -0600 Subject: [PATCH 14/20] Update helper.py --- numba/cuda/kernels/device/helper.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py index 074a964b90d..3ff059b3d7e 100644 --- a/numba/cuda/kernels/device/helper.py +++ b/numba/cuda/kernels/device/helper.py @@ -1,13 +1,10 @@ -""" -Helper functions for distance matrix computations. -""" - +"""Helper functions for distance matrix computations.""" import os from math import sqrt from numba import jit from time import time -inline = os.environ["INLINE"] +inline = os.environ.get("INLINE", "never") class Timer(object): @@ -25,9 +22,11 @@ def __init__(self, name=None): self.name = name def __enter__(self): + """Enter the timer.""" self.tstart = time() def __exit__(self, type, value, traceback): + """Exit the timer.""" if self.name: print("[%s]" % self.name,) print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) @@ -72,7 +71,6 @@ def insertionSort(arr): None. """ - for i in range(1, len(arr)): key = arr[i] @@ -105,7 +103,6 @@ def insertionArgSort(arr, ids): None. 
""" - # fill ids for i in range(len(arr)): ids[i] = i @@ -347,3 +344,12 @@ def integrate(u_cdf, v_cdf, deltas, p): out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] out = out ** (1 / p) return out + + +# %% CODE GRAVEYARD +# from numba.core import config +# config_keys = dir(config) +# if "INLINE" in config_keys: +# inline = config.INLINE +# else: +# inline = "never" From 0bc56b2602c4f673e9fb036988864172ca6d26ee Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:40:12 -0600 Subject: [PATCH 15/20] Update test_dist_matrix.py --- numba/cuda/tests/cudapy/test_dist_matrix.py | 209 +++++++++++--------- 1 file changed, 116 insertions(+), 93 deletions(-) diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py index cf5fa5c5e22..4ad8f948a16 100644 --- a/numba/cuda/tests/cudapy/test_dist_matrix.py +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -1,23 +1,32 @@ # -*- coding: utf-8 -*- -""" -Test distance matrix calculations. -""" +"""Test distance matrix calculations using CUDA/Numba.""" from scipy.spatial.distance import euclidean, cdist from scipy.stats import wasserstein_distance as scipy_wasserstein_distance from numpy.testing import assert_allclose import numpy as np import os +from importlib import reload +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" from numba.cuda.kernels.device.helper import Timer # noqa from numba.cuda.testing import unittest, CUDATestCase # noqa +import numba.cuda.kernels.dist_matrix # noqa +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(numba.cuda.kernels.dist_matrix) +dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix -rows = 6 -cols = 100 - -testQ = True verbose_test = True @@ -47,7 +56,7 @@ def test_dist_matrix(self): """ - def test_data(): + def test_data(rows, cols): """ Generate seeded test values and weights for two distributions. 
@@ -131,100 +140,114 @@ def join_wasserstein(U, V, Uw, Vw): V_Vw = np.concatenate((V, Vw), axis=1) return U_Uw, V_Vw + tol = 1e-5 + # test and production data pairs = np.array([(0, 1), (1, 2), (2, 3)]) i, j = pairs[0] - U, V, U_weights, V_weights = test_data() - - Utest = U[0:6] - Vtest = V[0:6] - Uwtest = U_weights[0:6] - Vwtest = V_weights[0:6] - for target in ["cuda", "cpu"]: - from numba.cuda.kernels.dist_matrix import dist_matrix # noqa - for metric in ['euclidean', 'wasserstein']: - print('[' + target.upper() + "_" + metric.upper() + ']') + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 1000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric) # noqa + if verbose_test: + print(one_set, '\n') - if testQ: # compile - dist_matrix(Utest, U_weights=Uwtest, - metric=metric, target=target) - with Timer("one set"): - one_set = dist_matrix( - U, U_weights=U_weights, metric=metric, target=target) # noqa - if verbose_test: - print(one_set, '\n') - - # compile - dist_matrix(Utest, V=Vtest, U_weights=Uwtest, - V_weights=Vwtest, metric=metric, target=target) - with Timer("two set"): - two_set = dist_matrix(U, V=V, U_weights=U_weights, - V_weights=V_weights, metric=metric, target=target) # noqa - if verbose_test: - print(two_set, '\n') - - one_set_sparse = dist_matrix( - U, U_weights=U_weights, pairs=pairs, metric=metric, target=target) # noqa - if verbose_test: - print(one_set_sparse, '\n') - - two_set_sparse = dist_matrix( - U, - V=V, - U_weights=U_weights, - V_weights=V_weights, - pairs=pairs, - metric=metric, - target=target - ) - if verbose_test: - print(two_set_sparse, '\n') - - if testQ: - if metric == "euclidean": - with Timer("one set check (cdist)"): - one_set_check = cdist(U, U) + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric) # noqa + if testQ and verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa + if testQ and verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + 
one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + assert_allclose( + one_set.ravel(), one_set_check.ravel(), rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set.ravel(), two_set_check.ravel(), rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + one_set_sparse, one_sparse_check, rtol=tol, + err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set_sparse, two_sparse_check, rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + elif metric == "euclidean": with Timer("two set check (cdist)"): two_set_check = cdist(U, V) - one_sparse_check = [ - euclidean(U[i], U[j]) for i, j in pairs] - two_sparse_check = [ - euclidean(U[i], V[j]) for i, j in pairs] - - elif metric == "wasserstein": - U_Uw, V_Vw = join_wasserstein( - U, V, U_weights, V_weights) - - with Timer("one set check (cdist)"): - one_set_check = cdist( - U_Uw, U_Uw, metric=my_wasserstein_distance) - with Timer("two set check (cdist)"): - two_set_check = cdist( - U_Uw, V_Vw, metric=my_wasserstein_distance) - - one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa - for i, j in pairs] - two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa - for i, j in pairs] - - # check results - tol = 1e-5 - assert_allclose( - one_set.ravel(), one_set_check.ravel(), rtol=tol, - err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set.ravel(), two_set_check.ravel(), rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - one_set_sparse, one_sparse_check, rtol=tol, - err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set_sparse, two_sparse_check, rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - if __name__ == '__main__': unittest.main() + + +# TF = [ +# np.allclose(one_set, one_set_check), +# np.allclose(two_set, two_set_check), +# np.allclose(one_set_sparse, one_sparse_check), +# np.allclose(two_set_sparse, two_sparse_check) +# ] From 571af8dfcd8ac684c662126bd3e2adbff741c29c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:58:34 -0600 Subject: [PATCH 16/20] Revert "Create myjit.py" This reverts commit 367ac21cfed35bba5a3047b60a136ebe73bc949c. --- numba/cuda/kernels/device/myjit.py | 87 ------------------------------ 1 file changed, 87 deletions(-) delete mode 100644 numba/cuda/kernels/device/myjit.py diff --git a/numba/cuda/kernels/device/myjit.py b/numba/cuda/kernels/device/myjit.py deleted file mode 100644 index bc83d768b74..00000000000 --- a/numba/cuda/kernels/device/myjit.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Decorator to assign the right jit for different targets. -""" -from numba import cuda, jit -from Lib import inspect -# from Lib.textwrap import dedent - - -# HACK -def get_myjit(target="cuda", - inline="never", - device=True, - parallel=False, - debug=False, - opt=True - ): - """ - Get a decorator that assigns the right jit for different targets. - - Parameters - ---------- - target : str, optional - Whether to compute on "cuda" or on "cpu". The default is "cuda". 
- self.inline : str, optional - Whether to inline functions. The default is "never". - - Returns - ------- - myjit : function - The decorator with the specified keyword arguments. - - """ - if debug and opt: - raise ValueError("debug and opt should not both be True") - - def myjit(f): - """ - Decorator to assign the right jit for different targets. - - In case of non-cuda targets, all instances of `cuda.local.array` - are replaced by `np.empty`. This is a dirty fix, hopefully in the - near future numba will support numpy array allocation and this will - not be necessary anymore - - Modified from: https://github.com/numba/numba/issues/2571 - - Parameters - ---------- - f : function - function for which the decorator is applied to. - - Returns - ------- - newfun : function - cuda.jit or jit version of f. - - """ - source = inspect.getsource(f).splitlines() - assert '@myjit' in source[0] or '@mycudajit' in source[0] - indent_spaces = len(source[0]) - len(source[0].lstrip()) - source = [s[indent_spaces:] for s in source] - source = '\n'.join(source[1:]) + '\n' - - # source = inspect.dedent(source) - - if target == 'cuda': - source = source.replace('prange', 'range') - exec(source) - fun = eval(f.__name__) - newfun = cuda.jit(f, device=device, inline=inline, - debug=debug, opt=opt) - # needs to be exported to globals - globals()[f.__name__] = newfun - return newfun - - elif target == 'cpu': - source = source.replace('cuda.local.array', 'np.empty') - if not parallel: - source = source.replace('prange', 'range') - exec(source) - fun = eval(f.__name__) - newfun = jit(fun, nopython=True, inline=inline) - # needs to be exported to globals - globals()[f.__name__] = newfun - return newfun - - return myjit From 0eb6b8d322332788765335028235cdbb9cd83088 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 07:43:48 -0600 Subject: [PATCH 17/20] bad call to MACHINE_BITS --- numba/cuda/tests/cudapy/test_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py index 81d2666fa8a..67383ca9093 100644 --- a/numba/cuda/tests/cudapy/test_helper.py +++ b/numba/cuda/tests/cudapy/test_helper.py @@ -7,7 +7,7 @@ import numba.cuda.kernels.device.helper as hp from numpy.testing import assert_allclose, assert_equal -bits = int(os.environ["MACHINE_BITS"]) +bits = int(os.environ.get("MACHINE_BITS", "32")) if bits == 32: np_float = np.float32 From 3727396d1a03baf4bb27913dc52db5ac373d0c26 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 08:07:31 -0600 Subject: [PATCH 18/20] Update dist_matrix.py --- numba/cuda/kernels/dist_matrix.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py index 7561cb3fd81..df96846c7f6 100644 --- a/numba/cuda/kernels/dist_matrix.py +++ b/numba/cuda/kernels/dist_matrix.py @@ -21,7 +21,7 @@ from numba import cuda, jit, prange # noqa from numba.cuda.testing import unittest, CUDATestCase # noqa from numba.types import int32, float32, int64, float64 # noqa -from numba.cuda.kernels.device.myjit import get_myjit # noqa +# HACK from numba.cuda.kernels.device.myjit import get_myjit # from numba.cuda.cudadrv.driver import driver # TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. 
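One caveat for the environment-variable parsing in the hunk above: bool() applied to any non-empty string is True, so bool(os.environ.get("USE_64", "0")) and bool(os.environ.get("FASTMATH", "1")) evaluate to True even when the variable is set to "0". A stricter parse is sketched below; the helper name env_flag is illustrative and not part of this patch.

import os

def env_flag(name, default="0"):
    """Interpret common truthy/falsy spellings of an environment variable."""
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes", "on")

os.environ["USE_64"] = "0"
print(bool(os.environ.get("USE_64", "0")))  # True -- any non-empty string is truthy
print(env_flag("USE_64"))                   # False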
@@ -63,6 +63,7 @@ nb_float = np_float nb_int = np_int +# HACK # a "hacky" way to get compatibility between @njit and @cuda.jit # myjit = get_myjit(target=target, inline=inline) # mycudajit = get_myjit(device=False, target=target, inline=inline) @@ -492,11 +493,11 @@ def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): def dist_matrix(U, - V=None, - U_weights=None, - V_weights=None, - pairs=None, - metric="euclidean"): # noqa + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa """ Compute distance matrices using Numba/CUDA. From 87bba9139b139b4213a00f216412c071ef4c7127 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 20:33:15 -0600 Subject: [PATCH 19/20] remove scipy-like functionality --- numba/cuda/kernels/dist_matrix.py | 777 -------------------- numba/cuda/tests/cudapy/test_dist_matrix.py | 253 ------- 2 files changed, 1030 deletions(-) delete mode 100644 numba/cuda/kernels/dist_matrix.py delete mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py deleted file mode 100644 index df96846c7f6..00000000000 --- a/numba/cuda/kernels/dist_matrix.py +++ /dev/null @@ -1,777 +0,0 @@ -""" -Compute a distance matrix for various cases. - -Cases: - within a single set of vectors (like pdist) - between two sets of vectors (like cdist) - between prespecified pairs (i.e. sparse) for either one or two sets of - vectors. - -Various distance metrics are available. -""" -import numba.cuda.kernels.device.helper as hp -import os -import numpy as np -from math import sqrt, ceil - -# CUDA Simulator not working -# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" -# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" - -from numba import cuda, jit, prange # noqa -from numba.cuda.testing import unittest, CUDATestCase # noqa -from numba.types import int32, float32, int64, float64 # noqa -# HACK from numba.cuda.kernels.device.myjit import get_myjit -# from numba.cuda.cudadrv.driver import driver -# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. - -inline = os.environ.get("INLINE", "never") -fastmath = bool(os.environ.get("FASTMATH", "1")) -cols = os.environ.get("COLUMNS") -USE_64 = bool(os.environ.get("USE_64", "0")) -target = os.environ.get("TARGET", "cuda") - -if USE_64 is None: - USE_64 = False -if USE_64: - bits = 64 - nb_float = float64 - nb_int = int64 - np_float = np.float64 - np_int = np.int64 -else: - bits = 32 - nb_float = float32 - nb_int = int32 - np_float = np.float32 - np_int = np.int32 -os.environ["MACHINE_BITS"] = str(bits) - -if cols is not None: - cols = int(cols) - cols_plus_1 = cols + 1 - tot_cols = cols * 2 - tot_cols_minus_1 = tot_cols - 1 -else: - raise KeyError("For performance reasons and architecture constraints " - "the number of columns of U (which is the same as V) " - "must be defined as the environment variable, COLUMNS, " - "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.") - -# override types -if target == "cpu": - nb_float = np_float - nb_int = np_int - -# HACK -# a "hacky" way to get compatibility between @njit and @cuda.jit -# myjit = get_myjit(target=target, inline=inline) -# mycudajit = get_myjit(device=False, target=target, inline=inline) - -# local array shapes needs to be defined globally due to lack of dynamic -# array allocation support. Don't wrap with np.int32, etc. 
see -# https://github.com/numba/numba/issues/7314 -TPB = 16 - -# np.savetxt('100-zeros.csv', -# np.zeros(tmp, dtype=np.int32), -# delimiter=",") -# OK to take cols from .shape -# cols = np.genfromtxt('100-zeros.csv', -# dtype=np.int32, -# delimiter=",").shape[0] - - -@ jit(inline=inline) -def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa - r""" # noqa - Compute distance between two 1D distributions :math:`u` and :math:`v`. - - The respective CDFs are :math:`U` and :math:`V`, and the - statistical distance is defined as: - .. math:: - l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} - p is a positive parameter; p = 1 gives the Wasserstein distance, - p = 2 gives the energy distance. - - Parameters - ---------- - u_values, v_values : array_like - Values observed in the (empirical) distribution. - u_weights, v_weights : array_like - Weight for each value. - `u_weights` (resp. `v_weights`) must have the same length as - `u_values` (resp. `v_values`). If the weight sum differs from - 1, it must still be positive and finite so that the weights can - be normalized to sum to 1. - p : scalar - positive parameter that determines the type of cdf distance. - presorted : bool - Whether u and v have been sorted already *and* u_weights and - v_weights have been sorted using the same indices used to sort - u and v, respectively. - cumweighted : bool - Whether u_weights and v_weights have been converted to their - cumulative weights via e.g. np.cumsum(). - prepended : bool - Whether a zero has been prepended to accumated, sorted - u_weights and v_weights. - - By setting presorted, cumweighted, *and* prepended to False, the - computationproceeds proceeds in the same fashion as _cdf_distance - from scipy.stats. - - Returns - ------- - distance : float - The computed distance between the distributions. - - Notes - ----- - The input distributions can be empirical, therefore coming from - samples whose values are effectively inputs of the function, or - they can be seen as generalized functions, in which case they are - weighted sums of Dirac delta functions located at the specified - values. - - References - ---------- - .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, - Hoyer, Munos "The Cramer Distance as a Solution to Biased - Wasserstein Gradients" (2017). :arXiv:`1705.10743`. 
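For reference, the computation described in this docstring can be written on the host with NumPy in a few lines, mirroring the scipy.stats CDF-distance approach. This is a sketch for cross-checking the device code; the function name np_cdf_distance is illustrative and not part of this patch.

import numpy as np

def np_cdf_distance(u, v, u_weights, v_weights, p=1):
    # Host-side reference sketch for the device implementation (illustrative).
    u_sorter = np.argsort(u)
    v_sorter = np.argsort(v)
    uv = np.concatenate((u, v))
    uv.sort(kind="mergesort")
    deltas = np.diff(uv)                     # widths between consecutive values
    # positions of all values within each sorted sample (cf. bisect_right / searchsorted)
    u_cdf_idx = u[u_sorter].searchsorted(uv[:-1], "right")
    v_cdf_idx = v[v_sorter].searchsorted(uv[:-1], "right")
    # empirical CDFs from prepended, accumulated, normalized weights
    u_cw = np.concatenate(([0], np.cumsum(u_weights[u_sorter])))
    v_cw = np.concatenate(([0], np.cumsum(v_weights[v_sorter])))
    u_cdf = u_cw[u_cdf_idx] / u_cw[-1]
    v_cdf = v_cw[v_cdf_idx] / v_cw[-1]
    # l_p(u, v) = (sum(|U - V|**p * deltas)) ** (1/p); p = 1 gives the Wasserstein distance
    return np.sum(np.abs(u_cdf - v_cdf) ** p * deltas) ** (1.0 / p)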
- """ - # allocate local float arrays - # combined vector - uv = cuda.local.array(tot_cols, nb_float) - uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) - - # CDFs - u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) - v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) - - # allocate local int arrays - # CDF indices via binary search - u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) - v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) - - u_cdf_sorted_cumweights = cuda.local.array( - tot_cols_minus_1, nb_float) - v_cdf_sorted_cumweights = cuda.local.array( - tot_cols_minus_1, nb_float) - - # short-circuit - if presorted and cumweighted and prepended: - u_sorted = u - v_sorted = v - - u_0_cumweights = u_weights - v_0_cumweights = v_weights - - # sorting, accumulating, and prepending (for compatibility) - else: - # check arguments - if not presorted and (cumweighted or prepended): - raise ValueError( - "if cumweighted or prepended are True, then presorted cannot be False") # noqa - - if (not presorted or not cumweighted) and prepended: - raise ValueError( - "if prepended is True, then presorted and cumweighted must both be True") # noqa - - # sorting - if not presorted: - # local arrays - u_sorted = cuda.local.array(cols, np_float) - v_sorted = cuda.local.array(cols, np_float) - - u_sorter = cuda.local.array(cols, nb_int) - v_sorter = cuda.local.array(cols, nb_int) - - u_sorted_weights = cuda.local.array(cols, nb_float) - v_sorted_weights = cuda.local.array(cols, nb_float) - - # local copy since quickArgSortIterative sorts in-place - hp.copy(u, u_sorted) - hp.copy(v, v_sorted) - - # sorting - hp.insertionArgSort(u_sorted, u_sorter) - hp.insertionArgSort(v_sorted, v_sorter) - - # inplace to avoid extra cuda local array - hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) - hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) - - # cumulative weights - if not cumweighted: - # local arrays - u_cumweights = cuda.local.array(cols, nb_float) - v_cumweights = cuda.local.array(cols, nb_float) - # accumulate - hp.cumsum(u_sorted_weights, u_cumweights) - hp.cumsum(v_sorted_weights, v_cumweights) - - # prepend weights with zero - if not prepended: - zero = cuda.local.array(1, nb_float) - - u_0_cumweights = cuda.local.array( - cols_plus_1, nb_float) - v_0_cumweights = cuda.local.array( - cols_plus_1, nb_float) - - hp.concatenate(zero, u_cumweights, u_0_cumweights) - hp.concatenate(zero, v_cumweights, v_0_cumweights) - - # concatenate u and v into uv - hp.concatenate(u_sorted, v_sorted, uv) - - # sorting - # quickSortIterative(uv, uv_stack) - hp.insertionSort(uv) - - # Get the respective positions of the values of u and v among the - # values of both distributions. See also np.searchsorted - hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) - hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) - - # empirical CDFs - hp.sort_by_indices(u_0_cumweights, u_cdf_indices, - u_cdf_sorted_cumweights) - hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) - - hp.sort_by_indices(v_0_cumweights, v_cdf_indices, - v_cdf_sorted_cumweights) - hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) - - # # Integration - hp.diff(uv, uv_deltas) # See also np.diff - - out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) - - return out - - -@ jit(inline=inline) -def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa - r""" - Compute the first Wasserstein distance between two 1D distributions. 
- - This distance is also known as the earth mover's distance, since it can be - seen as the minimum amount of "work" required to transform :math:`u` into - :math:`v`, where "work" is measured as the amount of distribution weight - that must be moved, multiplied by the distance it has to be moved. - - Source - ------ - https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa - - Parameters - ---------- - u_values, v_values : array_like - Values observed in the (empirical) distribution. - u_weights, v_weights : array_like, optional - Weight for each value. If unspecified, each value is assigned the same - weight. - `u_weights` (resp. `v_weights`) must have the same length as - `u_values` (resp. `v_values`). If the weight sum differs from 1, it - must still be positive and finite so that the weights can be normalized - to sum to 1. - - Returns - ------- - distance : float - The computed distance between the distributions. - - Notes - ----- - The input distributions can be empirical, therefore coming from samples - whose values are effectively inputs of the function, or they can be seen as - generalized functions, in which case they are weighted sums of Dirac delta - functions located at the specified values. - - References - ---------- - .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, - Munos "The Cramer Distance as a Solution to Biased Wasserstein - Gradients" (2017). :arXiv:`1705.10743`. - """ - return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa - - -@ jit(inline=inline) -def euclidean_distance(a, b): - """ - Calculate Euclidean distance between vectors a and b. - - Parameters - ---------- - a : 1D array - First vector. - b : 1D array - Second vector. - - Returns - ------- - d : numeric scalar - Euclidean distance between vectors a and b. - """ - d = 0 - for i in range(len(a)): - d += (b[i] - a[i]) ** 2 - d = sqrt(d) - return d - - -@ jit(inline=inline) -def compute_distance(u, v, u_weights, v_weights, metric_num): - """ - Calculate weighted distance between two vectors, u and v. - - Parameters - ---------- - u : 1D array of float - First vector. - v : 1D array of float - Second vector. - u_weights : 1D array of float - Weights for u. - v_weights : 1D array of float - Weights for v. - metric_num : int - Which metric to use (0 == "euclidean", 1=="wasserstein"). - - Raises - ------ - NotImplementedError - "Specified metric is mispelled or has not been implemented yet. - If not implemented, consider submitting a pull request." - - Returns - ------- - d : float - Weighted distance between u and v. - - """ - if metric_num == 0: - d = euclidean_distance(u, v) - elif metric_num == 1: - # assume u and v are presorted, and weights are sorted by u and v - d = wasserstein_distance( - u, v, u_weights, v_weights, True, True, True) - else: - raise NotImplementedError( - "Specified metric is mispelled or has not been implemented yet. " - "If not implemented, consider submitting a pull request." - ) - return d - - -@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " - "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) -def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): - """ - Calculate sparse pairwise distances between two sets of vectors for pairs. - - Parameters - ---------- - mat : numeric cuda array - First set of vectors for which to compute a single pairwise distance. 
- mat2 : numeric cuda array - Second set of vectors for which to compute a single pairwise distance. - pairs : cuda array of 2-tuples - All pairs for which distances are to be computed. - out : numeric cuda array - The initialized array which will be populated with distances. - - Raises - ------ - ValueError - Both matrices should have the same number of columns. - - Returns - ------- - None. - - """ - k = cuda.grid(1) - - npairs = pairs.shape[0] - - if k < npairs: - pair = pairs[k] - # extract indices of the pair for which the distance will be computed - i, j = pair - - u = U[i] - v = V[j] - uw = U_weights[i] - vw = V_weights[j] - - out[k] = compute_distance(u, v, uw, vw, metric_num) - - -@cuda.jit( - "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), - fastmath=fastmath, -) -def one_set_distance_matrix(U, U_weights, out, metric_num): - """ - Calculate pairwise distances within single set of vectors. - - Parameters - ---------- - U : 2D array of float - Vertically stacked vectors. - U_weights : 2D array of float - Vertically stacked weight vectors. - out : 2D array of float - Initialized matrix to populate with pairwise distances. - metric_num : int - Which metric to use (0 == "euclidean", 1=="wasserstein"). - - Returns - ------- - None. - - """ - i, j = cuda.grid(2) - - dm_rows = U.shape[0] - dm_cols = U.shape[0] - - if i < j and i < dm_rows and j < dm_cols and i != j: - u = U[i] - v = U[j] - uw = U_weights[i] - vw = U_weights[j] - d = compute_distance(u, v, uw, vw, metric_num) - out[i, j] = d - out[j, i] = d - - -# faster compilation *and* runtimes with explicit signature -@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " - "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) -def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): - """ - Calculate pairwise distances between two sets of vectors. - - Parameters - ---------- - U : 2D array of float - Vertically stacked vectors. - V : 2D array of float - Vertically stacked vectors. - U_weights : 2D array of float - Vertically stacked weight vectors. - V_weights : 2D array of float - Vertically stacked weight vectors. - out : 2D array of float - Pairwise distance matrix between two sets of vectors. - metric_num : int - Distance metric number {"euclidean": 0, "wasserstein": 1}. - - Returns - ------- - None. - - """ - i, j = cuda.grid(2) - - # distance matrix shape - dm_rows = U.shape[0] - dm_cols = V.shape[0] - - if i < dm_rows and j < dm_cols: - u = U[i] - v = V[j] - uw = U_weights[i] - vw = V_weights[j] - d = compute_distance(u, v, uw, vw, metric_num) - out[i, j] = d - - -def dist_matrix(U, - V=None, - U_weights=None, - V_weights=None, - pairs=None, - metric="euclidean"): # noqa - """ - Compute distance matrices using Numba/CUDA. - - Parameters - ---------- - mat : array - First set of vectors for which to compute pairwise distances. - - mat2 : array, optional - Second set of vectors for which to compute pairwise distances. - If not specified, then mat2 is a copy of mat. - - pairs : array, optional - List of 2-tuples which contain the indices for which to compute - distances. If mat2 was specified, then the second index accesses - mat2 instead of mat. If not specified, then the pairs are - auto-generated. If mat2 was specified, all combinations of the two - vector sets are used. If mat2 isn't specified, then only the upper - triangle (minus diagonal) pairs are computed. - - metric : str, optional - Possible options are 'euclidean', 'wasserstein'. 
- Defaults to Euclidean distance. These are converted to integers - internally due to Numba's lack of support for string arguments - (2021-08-14). See compute_distance() for other keys. - - For example: - 0 - 'euclidean' - 1 - 'wasserstein' - target : str, optional - Which target to use: "cuda" or "cpu". Default is "cuda". - inline : str, optional - Whether to inline functions: "always" or "never". Default is "never". - fastmath : bool, optional - Whether to use fastmath or not. The default is True. - USE_64 : bool, optional - Whether to use 64 bit or 34 bit types. The default is False. - - Raises - ------ - ValueError - DESCRIPTION. - NotImplementedError - DESCRIPTION. - - Returns - ------- - out : array - A pairwise distance matrix, or if pairs are specified, then a - vector of distances corresponding to the pairs. - - """ - rows, cols_check = U.shape - - if cols_check != cols: - raise KeyError("For performance reasons and architecture constraints " - "the number of columns of U (which is the same as V) " - "must be defined as the environment variable: COLUMNS. " - "However, os.environ[\"COLUMNS\"] does not match the " - "number of columns in U. Reset the environment variable " - "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " - "defined at the top of your script and make sure that " - "dist_matrix is reloaded. You may also need to restart " - "Python.") - - if V is not None and cols is not V.shape[1]: - raise ValueError("The number of columns for U and V should match") - - # is it a distance matrix between two sets of vectors? - # (rather than within a single set) - isXY = V is not None - - # were pairs specified? (useful for sparse matrix generation) - pairQ = pairs is not None - - # block, grid, and out shape - if pairQ: - block_dim = TPB - npairs = pairs.shape[0] - grid_dim = ceil(npairs / block_dim) - else: - block_dim = (TPB, TPB) - blockspergrid_x = ceil(rows / block_dim[0]) - if isXY: - blockspergrid_y = ceil(V.shape[0] / block_dim[1]) - else: - blockspergrid_y = ceil(rows / block_dim[1]) - grid_dim = (blockspergrid_x, blockspergrid_y) - - # CUDA setup - stream = cuda.stream() - - # sorting and cumulative weights - if metric == "wasserstein": - # presort values (and weights by sorted value indices) - U_sorter = np.argsort(U) - U = np.take_along_axis(U, U_sorter, axis=-1) - U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) - - # calculate cumulative weights - U_weights = np.cumsum(U_weights, axis=1) - - # prepend a column of zeros - zero = np.zeros((U_weights.shape[0], 1)) - U_weights = np.column_stack((zero, U_weights)) - - # do the same for V and V_weights - if isXY: - V_sorter = np.argsort(V) - V = np.take_along_axis(V, V_sorter, axis=-1) - V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) - V_weights = np.cumsum(V_weights, axis=1) - V_weights = np.column_stack((zero, V_weights)) - - # assign dummy arrays - if V is None: - V = np.zeros((2, 2)) - - if U_weights is None: - U_weights = np.zeros((2, 2)) - - if V_weights is None: - V_weights = np.zeros((2, 2)) - - if pairs is None: - pairs = np.zeros((2, 2)) - - # assign metric_num based on specified metric (no string support) - metric_dict = {"euclidean": 0, "wasserstein": 1} - metric_num = metric_dict[metric] - - # # same for target_num - # target_dict = {"cuda": 0, "cpu": 1} - # target_num = target_dict[target] - - m = U.shape[0] - - if isXY: - m2 = V.shape[0] - else: - m2 = m - - if pairQ: - shape = (npairs,) - else: - shape = (m, m2) - - # values should be initialized instead of using 
cuda.device_array - out = np.zeros(shape) - - if target == "cuda": - # copying/allocating to GPU - cU = cuda.to_device(np.asarray( - U, dtype=np_float), stream=stream) - - cU_weights = cuda.to_device(np.asarray( - U_weights, dtype=np_float), stream=stream) - - if isXY or pairQ: - cV = cuda.to_device(np.asarray( - V, dtype=np_float), stream=stream) - cV_weights = cuda.to_device( - np.asarray(V_weights, dtype=np_float), stream=stream - ) - - if pairQ: - cpairs = cuda.to_device(np.asarray( - pairs, dtype=np_int), stream=stream) - - cuda_out = cuda.to_device(np.asarray( - out, dtype=np_float), stream=stream) - - elif target == "cpu": - cU = U - cV = V - cU_weights = U_weights - cV_weights = V_weights - cpairs = pairs - cuda_out = out - - # distance matrix between two sets of vectors - if isXY and not pairQ: - fn = two_set_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) - - # specified pairwise distances within single set of vectors - elif not isXY and pairQ: - fn = sparse_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cU, cU_weights, cU_weights, cpairs, - cuda_out, metric_num) - - # distance matrix within single set of vectors - elif not isXY and not pairQ: - fn = one_set_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cU_weights, cuda_out, metric_num) - - # specified pairwise distances within single set of vectors - elif isXY and pairQ: - fn = sparse_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cV, cU_weights, cV_weights, cpairs, - cuda_out, metric_num) - - out = cuda_out.copy_to_host(stream=stream) - - return out - -# %% Code Graveyard - -# u_stack = cuda.local.array(cols, nb_int) -# v_stack = cuda.local.array(cols, nb_int) - -# copy(u_sorted, utmp) -# copy(v_sorted, vtmp) - -# copy(u_weights, u_sorted_weights) -# copy(v_weights, v_sorted_weights) - -# stack = [0] * size -# ids = list(range(len(arr))) - -# u_sorted_weights = cuda.local.array(cols, nb_float) -# v_sorted_weights = cuda.local.array(cols, nb_float) - -# sorting stack -# uv_stack = cuda.local.array(tot_cols, nb_int) - -# def sort_cum_prepend(X, X_weights): -# # presort values (and weights by sorted value indices) -# X_sorter = np.argsort(X) -# X = np.take_along_axis(X, X_sorter, axis=-1) -# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) -# # calculate cumulative weights -# X_weights = np.cumsum(X_weights) -# # prepend a column of zeros -# zero = np.zeros((X_weights.shape[0], 1)) -# X_weights = np.column_stack((zero, X_weights)) - -# def mean_squared_error(y, y_pred, squared=False): -# """ -# Return mean squared error (MSE) without using sklearn. - -# If squared == True, then return root mean squared error (RMSE). - -# Parameters -# ---------- -# y : 1D numeric array -# "True" or first set of values. -# y_pred : 1D numeric array -# "Predicted" or second set of values. - -# Returns -# ------- -# rmse : numeric scalar -# DESCRIPTION. 
- -# """ -# mse = np.mean((y - y_pred)**2) -# if squared is True: -# rmse = np.sqrt(mse) -# return rmse -# else: -# return mse -# return mse - -# opt = False -# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" -# opt = True -# os.environ["NUMBA_BOUNDSCHECK"] = "0" -# os.environ["OPT"] = "0" diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py deleted file mode 100644 index 4ad8f948a16..00000000000 --- a/numba/cuda/tests/cudapy/test_dist_matrix.py +++ /dev/null @@ -1,253 +0,0 @@ -# -*- coding: utf-8 -*- -"""Test distance matrix calculations using CUDA/Numba.""" -from scipy.spatial.distance import euclidean, cdist -from scipy.stats import wasserstein_distance as scipy_wasserstein_distance -from numpy.testing import assert_allclose -import numpy as np -import os -from importlib import reload - -# os.environ["NUMBA_DISABLE_JIT"] = "1" - -# number of columns of U and V must be set as env var before import dist_matrix -cols = 100 -os.environ["COLUMNS"] = str(cols) - -# other environment variables (set before importing dist_matrix) -os.environ["USE_64"] = "0" -os.environ["INLINE"] = "never" -os.environ["FASTMATH"] = "1" -os.environ["TARGET"] = "cuda" - -from numba.cuda.kernels.device.helper import Timer # noqa -from numba.cuda.testing import unittest, CUDATestCase # noqa -import numba.cuda.kernels.dist_matrix # noqa - -# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) -reload(numba.cuda.kernels.dist_matrix) -dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix - -verbose_test = True - - -class TestDistMat(CUDATestCase): - """Test distance matrix calculations on GPU for various metrics.""" - - def test_dist_matrix(self): - """ - Loop through distance metrics and perform unit tests. - - The four test cases are: - pairwise distances within a single set of vectors - pairwise distances between two sets of vectors - sparse pairwise distances within a single set of vectors - sparse pairwise distances between two sets of vectors - - The ground truth for Euclidean comes from cdist. - The ground truth for Earth Mover's distance (1-Wasserstein) is via - a scipy.stats function. - - Helper functions are used to generate test data and support the use of - Wasserstein distances in cdist. - - Returns - ------- - None. - - """ - - def test_data(rows, cols): - """ - Generate seeded test values and weights for two distributions. - - Returns - ------- - U : 2D array - Values of first distribution. - V : 2D array - Values of second distribution. - U_weights : 2D array - Weights of first distribution. - V_weights : 2D array - Weights of second distribution. - - """ - np.random.seed(42) - [U, V, U_weights, V_weights] = [ - np.random.rand(rows, cols) for i in range(4)] - return U, V, U_weights, V_weights - - def my_wasserstein_distance(u_uw, v_vw): - """ - Return Earth Mover's distance using concatenated values and weights. - - Parameters - ---------- - u_uw : 1D numeric array - Horizontally stacked values and weights of first distribution. - v_vw : TYPE - Horizontally stacked values and weights of second distribution. - - Returns - ------- - distance : numeric scalar - Earth Mover's distance given two distributions. 
- - """ - # split into values and weights - n = len(u_uw) - i = n // 2 - u = u_uw[0: i] - uw = u_uw[i: n] - v = v_vw[0: i] - vw = v_vw[i: n] - # calculate distance - distance = scipy_wasserstein_distance( - u, v, u_weights=uw, v_weights=vw) - return distance - - def join_wasserstein(U, V, Uw, Vw): - """ - Horizontally stack values and weights for each distribution. - - Weights are added as additional columns to values. - - Example: - u_uw, v_vw = join_wasserstein(u, v, uw, vw) - d = my_wasserstein_distance(u_uw, v_vw) - cdist(u_uw, v_vw, metric=my_wasserstein_distance) - - Parameters - ---------- - u : 1D or 2D numeric array - First set of distribution values. - v : 1D or 2D numeric array - Second set of values of distribution values. - uw : 1D or 2D numeric array - Weights for first distribution. - vw : 1D or 2D numeric array - Weights for second distribution. - - Returns - ------- - u_uw : 1D or 2D numeric array - Horizontally stacked values and weights of first distribution. - v_vw : TYPE - Horizontally stacked values and weights of second distribution. - - """ - U_Uw = np.concatenate((U, Uw), axis=1) - V_Vw = np.concatenate((V, Vw), axis=1) - return U_Uw, V_Vw - - tol = 1e-5 - - # test and production data - pairs = np.array([(0, 1), (1, 2), (2, 3)]) - i, j = pairs[0] - - for testQ in [True, False]: - # large # of rows with testQ == True is slow due to use of - # cdist and non-jitted scipy-wasserstein - if testQ: - rows = 6 - else: - rows = 1000 - print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) - U, V, U_weights, V_weights = test_data(rows, cols) - Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa - - for target in ["cuda"]: # "cpu" not implemented - for metric in ['euclidean', 'wasserstein']: - print('[' + target.upper() + "_" + metric.upper() + ']') - - if testQ: - # compile - dist_matrix(Utest, U_weights=Uwtest, metric=metric) - with Timer("one set"): - one_set = dist_matrix( - U, U_weights=U_weights, metric=metric) # noqa - if verbose_test: - print(one_set, '\n') - - # compile - dist_matrix(Utest, V=Vtest, U_weights=Uwtest, - V_weights=Vwtest, metric=metric) - with Timer("two set"): - two_set = dist_matrix(U, V=V, U_weights=U_weights, - V_weights=V_weights, metric=metric) # noqa - if testQ and verbose_test: - print(two_set, '\n') - - one_set_sparse = dist_matrix( - U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa - if testQ and verbose_test: - print(one_set_sparse, '\n') - - two_set_sparse = dist_matrix( - U, - V=V, - U_weights=U_weights, - V_weights=V_weights, - pairs=pairs, - metric=metric, - ) - if testQ and verbose_test: - print(two_set_sparse, '\n') - - if testQ: - if metric == "euclidean": - with Timer("one set check (cdist)"): - one_set_check = cdist(U, U) - with Timer("two set check (cdist)"): - two_set_check = cdist(U, V) - - one_sparse_check = [ - euclidean(U[i], U[j]) for i, j in pairs] - two_sparse_check = [ - euclidean(U[i], V[j]) for i, j in pairs] - - elif metric == "wasserstein": - U_Uw, V_Vw = join_wasserstein( - U, V, U_weights, V_weights) - - with Timer("one set check (cdist)"): - one_set_check = cdist( - U_Uw, U_Uw, metric=my_wasserstein_distance) - with Timer("two set check (cdist)"): - two_set_check = cdist( - U_Uw, V_Vw, metric=my_wasserstein_distance) - - one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa - for i, j in pairs] - two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa - for i, j in pairs] - - # check results - assert_allclose( - one_set.ravel(), 
one_set_check.ravel(), rtol=tol, - err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set.ravel(), two_set_check.ravel(), rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - one_set_sparse, one_sparse_check, rtol=tol, - err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set_sparse, two_sparse_check, rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - elif metric == "euclidean": - with Timer("two set check (cdist)"): - two_set_check = cdist(U, V) - - -if __name__ == '__main__': - unittest.main() - - -# TF = [ -# np.allclose(one_set, one_set_check), -# np.allclose(two_set, two_set_check), -# np.allclose(one_set_sparse, one_sparse_check), -# np.allclose(two_set_sparse, two_sparse_check) -# ] From f862f168a0cf15ef5b0455c4ff6eb1ef89d7224b Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 20:38:42 -0600 Subject: [PATCH 20/20] Revert "remove scipy-like functionality" This reverts commit 87bba9139b139b4213a00f216412c071ef4c7127. --- numba/cuda/kernels/dist_matrix.py | 777 ++++++++++++++++++++ numba/cuda/tests/cudapy/test_dist_matrix.py | 253 +++++++ 2 files changed, 1030 insertions(+) create mode 100644 numba/cuda/kernels/dist_matrix.py create mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py new file mode 100644 index 00000000000..df96846c7f6 --- /dev/null +++ b/numba/cuda/kernels/dist_matrix.py @@ -0,0 +1,777 @@ +""" +Compute a distance matrix for various cases. + +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import numba.cuda.kernels.device.helper as hp +import os +import numpy as np +from math import sqrt, ceil + +# CUDA Simulator not working +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit, prange # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +from numba.types import int32, float32, int64, float64 # noqa +# HACK from numba.cuda.kernels.device.myjit import get_myjit +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 +os.environ["MACHINE_BITS"] = str(bits) + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + "via e.g. 
`os.environ[\"COLUMNS\"] = \"100\"`.") + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# HACK +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 +TPB = 16 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +@ jit(inline=inline) +def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. + cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. 
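The presorted/cumweighted/prepended short-circuit described above assumes the caller has already sorted the values, reordered the weights by the same indices, accumulated them, and prepended a zero. A small NumPy sketch of that host-side preprocessing follows; it mirrors what dist_matrix() below does for the Wasserstein metric, and the helper name presort_and_accumulate is illustrative, not part of this patch.

import numpy as np

def presort_and_accumulate(X, X_weights):
    # Illustrative helper mirroring the Wasserstein preprocessing in dist_matrix().
    sorter = np.argsort(X, axis=-1)
    X = np.take_along_axis(X, sorter, axis=-1)             # presorted values
    w = np.take_along_axis(X_weights, sorter, axis=-1)     # weights sorted by the same indices
    w = np.cumsum(w, axis=1)                               # cumulative weights
    zero = np.zeros((w.shape[0], 1))
    w = np.column_stack((zero, w))                         # prepend a column of zeros
    return X, w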
+ """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False") # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True") # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, + u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, + v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@ jit(inline=inline) +def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. 
+ + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa + + +@ jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d + + +@ jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." + + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance( + u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. 
+ mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix(U, + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. 
+ Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + "However, os.environ[\"COLUMNS\"] does not match the " + "number of columns in U. Reset the environment variable " + "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python.") + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? (useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using 
cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray( + U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device(np.asarray( + U_weights, dtype=np_float), stream=stream) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray( + V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray( + pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray( + out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, + cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, + cuda_out, metric_num) + + out = cuda_out.copy_to_host(stream=stream) + + return out + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. 
+ +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py new file mode 100644 index 00000000000..4ad8f948a16 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +"""Test distance matrix calculations using CUDA/Numba.""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os +from importlib import reload + +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" +os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" + +from numba.cuda.kernels.device.helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +import numba.cuda.kernels.dist_matrix # noqa + +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(numba.cuda.kernels.dist_matrix) +dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix + +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. + The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(rows, cols): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4)] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. 
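+
+            Example (hypothetical 1D value arrays u, v and weight arrays
+            uw, vw, where u matches uw and v matches vw in length):
+                u_uw = np.concatenate((u, uw))
+                v_vw = np.concatenate((v, vw))
+                d = my_wasserstein_distance(u_uw, v_vw)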
+ + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0: i] + uw = u_uw[i: n] + v = v_vw[0: i] + vw = v_vw[i: n] + # calculate distance + distance = scipy_wasserstein_distance( + u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + tol = 1e-5 + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 1000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric) # noqa + if verbose_test: + print(one_set, '\n') + + # compile + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric) # noqa + if testQ and verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa + if testQ and verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + assert_allclose( + one_set.ravel(), 
one_set_check.ravel(), rtol=tol,
+                            err_msg="one set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set.ravel(), two_set_check.ravel(), rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            one_set_sparse, one_sparse_check, rtol=tol,
+                            err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set_sparse, two_sparse_check, rtol=tol,
+                            err_msg="two set {} {} sparse distance matrix inaccurate".format(target, metric))  # noqa
+                    elif metric == "euclidean":
+                        # timing-only run for the large-row case; results are
+                        # not asserted because the non-jitted reference is too
+                        # slow to compare exhaustively
+                        with Timer("two set check (cdist)"):
+                            two_set_check = cdist(U, V)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+# TF = [
+#     np.allclose(one_set, one_set_check),
+#     np.allclose(two_set, two_set_check),
+#     np.allclose(one_set_sparse, one_sparse_check),
+#     np.allclose(two_set_sparse, two_sparse_check)
+# ]
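
For reference, the host-side preprocessing in dist_matrix (per-row argsort of the values, cumulative sums of the weights, and a prepended zero column) sets up the CDF-based formulation of the weighted 1-Wasserstein distance, the same formulation used by scipy.stats.wasserstein_distance. The sketch below is not part of the patch and is not the CUDA kernel; it is a plain NumPy reference, with illustrative names and assuming 1D NumPy arrays, for the per-pair quantity the 'wasserstein' metric is expected to reproduce:

    import numpy as np

    def wasserstein_1d(u, v, uw, vw):
        """Weighted 1-Wasserstein distance between two 1D samples."""
        u_sorter, v_sorter = np.argsort(u), np.argsort(v)
        # common support and gaps between consecutive support points
        all_values = np.sort(np.concatenate((u, v)))
        deltas = np.diff(all_values)
        # cumulative weights with a prepended zero, as in dist_matrix above
        u_cum = np.concatenate(([0.0], np.cumsum(uw[u_sorter])))
        v_cum = np.concatenate(([0.0], np.cumsum(vw[v_sorter])))
        # CDFs of u and v evaluated within each gap of the common support
        u_idx = np.searchsorted(u[u_sorter], all_values[:-1], side="right")
        v_idx = np.searchsorted(v[v_sorter], all_values[:-1], side="right")
        u_cdf = u_cum[u_idx] / u_cum[-1]
        v_cdf = v_cum[v_idx] / v_cum[-1]
        # integrate the absolute difference of the CDFs
        return np.sum(np.abs(u_cdf - v_cdf) * deltas)

For 1D inputs this agrees with scipy.stats.wasserstein_distance(u, v, u_weights=uw, v_weights=vw), which is what test_dist_matrix uses as ground truth.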