From 931cccef85f2a1ededd56f317121db9dd94aa892 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Sat, 21 Aug 2021 19:29:44 -0600 Subject: [PATCH 01/20] Remove usage of the word "allocate" in reference to local memory See https://stackoverflow.com/questions/68869806/how-is-performance-affected-by-using-numba-cuda-local-array-compared-with-numb --- docs/source/cuda/memory.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index 7b134fe5d9e..b17d9e8ced2 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -152,21 +152,23 @@ traditional dynamic memory management. Local memory ============ -Local memory is an area of memory private to each thread. Using local -memory helps allocate some scratchpad area when scalar local variables -are not enough. The memory is allocated once for the duration of the kernel, -unlike traditional dynamic memory management. +Local memory is an area of memory private to each thread. Using local memory +helps reserve some scratchpad area to variables when scalar local variables are +not enough. The memory is allocated once for the duration of the kernel, unlike +traditional dynamic memory management. .. function:: numba.cuda.local.array(shape, type) :noindex: - Allocate a local array of the given *shape* and *type* on the device. + Create a variable representing a local array of the given *shape* and *type* + on the device that corresponds to a chunk of statically defined memory. *shape* is either an integer or a tuple of integers representing the array's - dimensions and must be a simple constant expression. *type* is a - :ref:`Numba type ` of the elements needing to be stored in the - array. The array is private to the current thread. An array-like object is - returned which can be read and written to like any standard array - (e.g. through indexing). + dimensions and must be a simple constant expression. *type* is a :ref:`Numba + type ` of the elements needing to be stored in the array. The + array is private to the current thread. An array-like object is returned + which can be read and written to like any standard array (e.g. through + indexing). See also the Local Memory section of `Device Memory Accesses + `_ Constant memory =============== From 4123b0f4a6e725b6924be717ca4a23dae9043b6c Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 15:10:12 +0530 Subject: [PATCH 02/20] fix:updated url to use issue template --- numba/core/errors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 7f52311ec2a..7968f11e560 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -349,7 +349,7 @@ def termcolor(): Unsupported functionality was found in the code Numba was trying to compile. If this functionality is important to you please file a feature request at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=bug_report.md """ interpreter_error_info = """ @@ -359,7 +359,7 @@ def termcolor(): environment variable to non-zero, and then rerun the code). 
If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md To see Python/NumPy features supported by the latest release of Numba visit: https://numba.pydata.org/numba-doc/latest/reference/pysupported.html @@ -376,7 +376,7 @@ def termcolor(): https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md If you think your code should work with Numba. %s """ % feedback_details @@ -395,7 +395,7 @@ def termcolor(): If you think your code should work with Numba, please report the error message and traceback, along with a minimal reproducer at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=bug_report.md """ reportable_issue_info = """ From e3bfaf048137e043525a89cba3196950f3fdbd59 Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 16:14:27 +0530 Subject: [PATCH 03/20] fix:updated url in error message and feature request to use issue template --- numba/core/errors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 7968f11e560..c4b872dc957 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -349,7 +349,7 @@ def termcolor(): Unsupported functionality was found in the code Numba was trying to compile. If this functionality is important to you please file a feature request at: -https://github.com/numba/numba/issues/new?template=bug_report.md +https://github.com/numba/numba/issues/new?template=feature_request.md """ interpreter_error_info = """ @@ -359,7 +359,8 @@ def termcolor(): environment variable to non-zero, and then rerun the code). If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md To see Python/NumPy features supported by the latest release of Numba visit: https://numba.pydata.org/numba-doc/latest/reference/pysupported.html @@ -376,7 +377,8 @@ def termcolor(): https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new?template=bug_report.md +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md If you think your code should work with Numba. %s """ % feedback_details From 645f34c3a16bd2c8fa2dd0391246aa12f1d631ee Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 16:25:05 +0530 Subject: [PATCH 04/20] fix:updated url for error report and feature request --- numba/core/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index c4b872dc957..061954ab5f0 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -358,8 +358,8 @@ def termcolor(): without Numba? 
(To temporarily disable Numba JIT, set the `NUMBA_DISABLE_JIT` environment variable to non-zero, and then rerun the code). -If the code is valid and the unsupported functionality is important to you -please file a feature request at: +If the code is valid and the unsupported functionality is important to you +please file a feature request at: https://github.com/numba/numba/issues/new?template=feature_request.md To see Python/NumPy features supported by the latest release of Numba visit: From ca35fc77c027f7a4e4f3faa08aa5cfb4ffe08b49 Mon Sep 17 00:00:00 2001 From: Debargha Saha Date: Thu, 26 Aug 2021 17:00:04 +0530 Subject: [PATCH 05/20] fix:updated url for error report and feature request using issue template --- numba/core/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/core/errors.py b/numba/core/errors.py index 061954ab5f0..b69321cd332 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -358,7 +358,7 @@ def termcolor(): without Numba? (To temporarily disable Numba JIT, set the `NUMBA_DISABLE_JIT` environment variable to non-zero, and then rerun the code). -If the code is valid and the unsupported functionality is important to you +If the code is valid and the unsupported functionality is important to you please file a feature request at: https://github.com/numba/numba/issues/new?template=feature_request.md From dcdde27a260c1fc414e57c771f6b69c23e76d6e3 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:31:11 +0100 Subject: [PATCH 06/20] test_inspect_cli: Decode exception with default (utf-8) codec Fixes #7337. --- numba/tests/test_help.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/test_help.py b/numba/tests/test_help.py index a78f05f0a3c..8199d4c93c4 100644 --- a/numba/tests/test_help.py +++ b/numba/tests/test_help.py @@ -89,4 +89,4 @@ def test_inspect_cli(self): with self.assertRaises(subprocess.CalledProcessError) as raises: subprocess.check_output(cmds, stderr=subprocess.STDOUT) self.assertIn("\'foo\' is not supported", - raises.exception.stdout.decode('ascii')) + raises.exception.stdout.decode()) From 6ce6a6d10957440df2119b6c51fe14c95bdc22a9 Mon Sep 17 00:00:00 2001 From: Sterling Baird <45469701+sgbaird@users.noreply.github.com> Date: Sat, 28 Aug 2021 01:49:04 -0600 Subject: [PATCH 07/20] Update docs/source/cuda/memory.rst Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com> --- docs/source/cuda/memory.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index b17d9e8ced2..c91c46dc588 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -167,8 +167,11 @@ traditional dynamic memory management. type ` of the elements needing to be stored in the array. The array is private to the current thread. An array-like object is returned which can be read and written to like any standard array (e.g. through - indexing). See also the Local Memory section of `Device Memory Accesses - `_ + indexing). + + .. seealso:: The Local Memory section of `Device Memory Accesses + `_ + in the CUDA programming guide. 
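As an illustrative sketch only (assuming a float32 input whose rows have exactly 8 columns, since the shape argument must be a compile-time constant), a kernel might use a per-thread local array as scratch space like this::

    from numba import cuda, float32
    import numpy as np

    @cuda.jit
    def reverse_rows(arr):
        i = cuda.grid(1)
        if i < arr.shape[0]:
            # scratchpad private to this thread; the shape (8) is a constant
            scratch = cuda.local.array(8, float32)
            for j in range(8):
                scratch[j] = arr[i, 8 - 1 - j]
            for j in range(8):
                arr[i, j] = scratch[j]

    data = np.arange(32, dtype=np.float32).reshape(4, 8)
    reverse_rows[1, 32](data)  # each thread reverses one row in place

The function name, launch configuration, and the fixed row width of 8 are assumptions made for the example, not values taken from the patches above.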
Constant memory =============== From e3a7eeaa09fd33f7333a9532a56b3029c09104be Mon Sep 17 00:00:00 2001 From: sgbaird Date: Sat, 28 Aug 2021 01:51:56 -0600 Subject: [PATCH 08/20] Update memory.rst --- docs/source/cuda/memory.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index c91c46dc588..1b434b962d0 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -152,16 +152,15 @@ traditional dynamic memory management. Local memory ============ -Local memory is an area of memory private to each thread. Using local memory -helps reserve some scratchpad area to variables when scalar local variables are -not enough. The memory is allocated once for the duration of the kernel, unlike -traditional dynamic memory management. +Local memory is an area of memory private to each thread. Using local +memory helps allocate some scratchpad area when scalar local variables +are not enough. The memory is allocated once for the duration of the kernel, +unlike traditional dynamic memory management. .. function:: numba.cuda.local.array(shape, type) :noindex: - Create a variable representing a local array of the given *shape* and *type* - on the device that corresponds to a chunk of statically defined memory. + Allocate a local array of the given *shape* and *type* on the device. *shape* is either an integer or a tuple of integers representing the array's dimensions and must be a simple constant expression. *type* is a :ref:`Numba type ` of the elements needing to be stored in the array. The From 35ff818de2431796b4887a5072a85836f011108c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:05 -0600 Subject: [PATCH 09/20] Create helper.py --- numba/cuda/kernels/device/helper.py | 349 ++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 numba/cuda/kernels/device/helper.py diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py new file mode 100644 index 00000000000..074a964b90d --- /dev/null +++ b/numba/cuda/kernels/device/helper.py @@ -0,0 +1,349 @@ +""" +Helper functions for distance matrix computations. +""" + +import os +from math import sqrt +from numba import jit +from time import time + +inline = os.environ["INLINE"] + + +class Timer(object): + """ + Simple timer class. + + https://stackoverflow.com/a/5849861/13697228 + Usage + ----- + with Timer("description"): + # do stuff + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + self.tstart = time() + + def __exit__(self, type, value, traceback): + if self.name: + print("[%s]" % self.name,) + print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) + + +@jit(inline=inline) +def copy(a, out): + """ + Copy a into out. + + Parameters + ---------- + a : vector + values to copy. + out : vector + values to copy into. + + Returns + ------- + None. + + """ + for i in range(len(a)): + i = int(i) + out[i] = a[i] + + +@jit(inline=inline) +def insertionSort(arr): + """ + Perform insertion sorting on a vector. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + Returns + ------- + None. 
+ + """ + + for i in range(1, len(arr)): + key = arr[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + j -= 1 + arr[j + 1] = key + + +@jit(inline=inline) +def insertionArgSort(arr, ids): + """ + Perform insertion sorting on a vector and track the sorting indices. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + ids : 1D array + Initialized array to fill with sorting indices. + + Returns + ------- + None. + + """ + + # fill ids + for i in range(len(arr)): + ids[i] = i + + for i in range(1, len(arr)): + key = arr[i] + id_key = ids[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + ids[j + 1] = ids[j] + j -= 1 + arr[j + 1] = key + ids[j + 1] = id_key + + +@jit(inline=inline) +def concatenate(vec, vec2, out): + """ + Concatenate two vectors. + + It also reassigns the values of out back into vec and vec2 to prevent odd + roundoff issues. + + Parameters + ---------- + vec : vector + First vector to concatenate. + vec2 : vector + Second vector to concatenate. + out : vector + Initialization of concatenated vector. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) + n2 = len(vec2) + # populate out + for i in range(n): + out[i] = vec[i] + for i in range(n2): + out[i + n] = vec2[i] + + +@jit(inline=inline) +def diff(vec, out): + """ + Compute gradient. + + Parameters + ---------- + vec : vector of floats + The vector for which to calculate the gradient. + out : vector of floats + Initialization of output vector (one less element than vec) which + contains the gradient. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) - 1 + # progressively add next elements + for i in range(n): + out[i] = vec[i + 1] - vec[i] + +# maybe there's a faster implementation somewhere + + +@jit(inline=inline) +def bisect_right(a, v, ids): + """ + Return indices where to insert items in v in list a, assuming a is sorted. + + Parameters + ---------- + arr : 1D array + Array for which to perform a binary search. + v : 1D array + Values to perform the binary search for. + ids : 1D array + Initialized list of indices, same size as v. + + Returns + ------- + None. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, a.insert(x) will + insert just after the rightmost x already there. Optional args lo + (default 0) and hi (default len(a)) bound the slice of a to be searched. + + Source: modified from + https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa + """ + n = len(a) + n2 = len(v) + for i in range(n2): + lo = 0 + mid = 0 + hi = n + x = v[i] + while lo < hi: + mid = (lo + hi) // 2 + # Use __lt__ to match the logic in list.sort() and in heapq + if x < a[mid]: + hi = mid + else: + lo = mid + 1 + ids[i] = lo + + +@jit(inline=inline) +def sort_by_indices(v, ids, out): + """ + Sort v by ids and assign to out, as in out = v[ids]. + + Parameters + ---------- + v : vector + values to sort. + ids : vector + indices to use for sorting. + out : vector + preallocated vector to put sorted values into. + + Returns + ------- + None. 
+ + """ + for i in range(len(ids)): + idx = ids[i] + out[i] = v[idx] + + +@jit(inline=inline) +def cumsum(vec, out): + """ + Return the cumulative sum of the elements in a vector. + + Referenced https://stackoverflow.com/a/15889203/13697228 + + Parameters + ---------- + vec : 1D array + Vector for which to compute the cumulative sum. + + out: 1D array + Initialized vector which stores the cumulative sum. + + Returns + ------- + out : 1D array + The cumulative sum of elements in vec (same shape as vec). + + """ + # number of elements + n = len(vec) + # initialize + total = 0.0 + # progressively add next elements + for i in range(n): + total += vec[i] + out[i] = total + + +@jit(inline=inline) +def divide(v, b, out): + """ + Divide a vector by a scalar. + + Parameters + ---------- + v : vector + vector numerator. + b : float + scalar denominator. + out : vector + initialized vector to populate with divided values. + + Returns + ------- + None. + + """ + for i in range(len(v)): + out[i] = v[i] / b + + +@jit(inline=inline) +def integrate(u_cdf, v_cdf, deltas, p): + """ + Integrate between two vectors using a p-norm. + + Parameters + ---------- + u_cdf : numeric vector + First CDF. + v_cdf : numeric vector + Second CDF of same length as a. + deltas : numeric vector + The differences between the original vectors a and b. + p : int, optional + The finite power for which to compute the p-norm. The default is 2. + + If p == 1 or p == 2, use of power is avoided, which introduces an overhead + # of about 15% + Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa + However, without support for math.square or np.square (2021-08-19), + math.pow is required for p == 2. + + Returns + ------- + out : numeric scalar + The p-norm of a and b. + + """ + n = len(u_cdf) + out = 0.0 + if p == 1: + for i in range(n): + out += abs(u_cdf[i] - v_cdf[i]) * deltas[i] + elif p == 2: + for i in range(n): + out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i] + out = sqrt(out) + elif p > 2: + for i in range(n): + out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] + out = out ** (1 / p) + return out From 367ac21cfed35bba5a3047b60a136ebe73bc949c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:11 -0600 Subject: [PATCH 10/20] Create myjit.py --- numba/cuda/kernels/device/myjit.py | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 numba/cuda/kernels/device/myjit.py diff --git a/numba/cuda/kernels/device/myjit.py b/numba/cuda/kernels/device/myjit.py new file mode 100644 index 00000000000..bc83d768b74 --- /dev/null +++ b/numba/cuda/kernels/device/myjit.py @@ -0,0 +1,87 @@ +""" +Decorator to assign the right jit for different targets. +""" +from numba import cuda, jit +from Lib import inspect +# from Lib.textwrap import dedent + + +# HACK +def get_myjit(target="cuda", + inline="never", + device=True, + parallel=False, + debug=False, + opt=True + ): + """ + Get a decorator that assigns the right jit for different targets. + + Parameters + ---------- + target : str, optional + Whether to compute on "cuda" or on "cpu". The default is "cuda". + self.inline : str, optional + Whether to inline functions. The default is "never". + + Returns + ------- + myjit : function + The decorator with the specified keyword arguments. + + """ + if debug and opt: + raise ValueError("debug and opt should not both be True") + + def myjit(f): + """ + Decorator to assign the right jit for different targets. 
+ + In case of non-cuda targets, all instances of `cuda.local.array` + are replaced by `np.empty`. This is a dirty fix, hopefully in the + near future numba will support numpy array allocation and this will + not be necessary anymore + + Modified from: https://github.com/numba/numba/issues/2571 + + Parameters + ---------- + f : function + function for which the decorator is applied to. + + Returns + ------- + newfun : function + cuda.jit or jit version of f. + + """ + source = inspect.getsource(f).splitlines() + assert '@myjit' in source[0] or '@mycudajit' in source[0] + indent_spaces = len(source[0]) - len(source[0].lstrip()) + source = [s[indent_spaces:] for s in source] + source = '\n'.join(source[1:]) + '\n' + + # source = inspect.dedent(source) + + if target == 'cuda': + source = source.replace('prange', 'range') + exec(source) + fun = eval(f.__name__) + newfun = cuda.jit(f, device=device, inline=inline, + debug=debug, opt=opt) + # needs to be exported to globals + globals()[f.__name__] = newfun + return newfun + + elif target == 'cpu': + source = source.replace('cuda.local.array', 'np.empty') + if not parallel: + source = source.replace('prange', 'range') + exec(source) + fun = eval(f.__name__) + newfun = jit(fun, nopython=True, inline=inline) + # needs to be exported to globals + globals()[f.__name__] = newfun + return newfun + + return myjit From ce4dd4da2a114cf27fcfa46e2201ed8edd3061f6 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:30 -0600 Subject: [PATCH 11/20] Create test_helper.py --- numba/cuda/tests/cudapy/test_helper.py | 83 ++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 numba/cuda/tests/cudapy/test_helper.py diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py new file mode 100644 index 00000000000..81d2666fa8a --- /dev/null +++ b/numba/cuda/tests/cudapy/test_helper.py @@ -0,0 +1,83 @@ +""" +Test helper functions for distance matrix calculations. 
+""" +import os +import numpy as np +from numba.cuda.testing import unittest, CUDATestCase +import numba.cuda.kernels.device.helper as hp +from numpy.testing import assert_allclose, assert_equal + +bits = int(os.environ["MACHINE_BITS"]) + +if bits == 32: + np_float = np.float32 + np_int = np.float32 +elif bits == 64: + np_float = np.float64 + np_int = np.float64 + +tol = 1e-5 + + +class TestHelperFunctions(CUDATestCase): + def test_concatenate(self): + vec = np.random.rand(5) + vec2 = np.random.rand(10) + check = np.concatenate((vec, vec2)) + out = np.zeros(len(vec) + len(vec2), dtype=np_float) + hp.concatenate(vec, vec2, out) + + assert_allclose(out, check) + vec[0] = 100.0 + assert_allclose(out, check) + + def test_diff(self): + vec = np.random.rand(10) + out = np.zeros(len(vec) - 1) + hp.diff(vec, out) + check = np.diff(vec) + assert_equal(out, check) + + def test_bisect_right(self): + a = np.random.rand(10) + a.sort() + v = np.random.rand(5) + ids = np.zeros_like(v[:-1], dtype=np_int) + hp.bisect_right(a, v[:-1], ids) + check = np.searchsorted(a, v[:-1], side="right") + assert_equal(ids, check) + + def test_sort_by_indices(self): + v = np.random.rand(10) + ids = np.arange(len(v)) + np.random.shuffle(ids) + out = np.zeros_like(v) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_equal(out, check) + + v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731]) + ids = np.array([1, 2, 3, 3, 3]) + out = np.zeros_like(ids, dtype=np_float) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_allclose(out, check) + + def test_cumsum(self): + vec = np.random.rand(10) + out = np.zeros_like(vec, dtype=np_float) + hp.cumsum(vec, out) + check = np.cumsum(vec) + assert_allclose(out, check) + + def test_divide(self): + v = np.random.rand(10) + b = 4.62 + out = np.zeros_like(v, dtype=np_float) + hp.divide(v, b, out) + check = v / b + assert_allclose(out, check) + + +if __name__ == '__main__': + unittest.main() From 0463c09b4a8bf5458c8cb977d0e05ad0b55491cd Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 01:30:35 -0600 Subject: [PATCH 12/20] Create test_dist_matrix.py --- numba/cuda/tests/cudapy/test_dist_matrix.py | 230 ++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py new file mode 100644 index 00000000000..cf5fa5c5e22 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -0,0 +1,230 @@ +# -*- coding: utf-8 -*- +""" +Test distance matrix calculations. +""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os + +os.environ["INLINE"] = "never" + +from numba.cuda.kernels.device.helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa + + +rows = 6 +cols = 100 + +testQ = True +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. 
+ The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4)] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. + + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0: i] + uw = u_uw[i: n] + v = v_vw[0: i] + vw = v_vw[i: n] + # calculate distance + distance = scipy_wasserstein_distance( + u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. 
+ + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + U, V, U_weights, V_weights = test_data() + + Utest = U[0:6] + Vtest = V[0:6] + Uwtest = U_weights[0:6] + Vwtest = V_weights[0:6] + + for target in ["cuda", "cpu"]: + from numba.cuda.kernels.dist_matrix import dist_matrix # noqa + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, + metric=metric, target=target) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric, target=target) # noqa + if verbose_test: + print(one_set, '\n') + + # compile + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric, target=target) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric, target=target) # noqa + if verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric, target=target) # noqa + if verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + target=target + ) + if verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + tol = 1e-5 + assert_allclose( + one_set.ravel(), one_set_check.ravel(), rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set.ravel(), two_set_check.ravel(), rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + one_set_sparse, one_sparse_check, rtol=tol, + err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set_sparse, two_sparse_check, rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + + +if __name__ == '__main__': + unittest.main() From 61f6927ff758818943dbdf5d10447f7f51b1adab Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:39:49 -0600 Subject: [PATCH 13/20] Create dist_matrix.py --- numba/cuda/kernels/dist_matrix.py | 776 ++++++++++++++++++++++++++++++ 1 file changed, 776 insertions(+) create mode 100644 numba/cuda/kernels/dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py new file mode 100644 index 00000000000..7561cb3fd81 --- /dev/null +++ b/numba/cuda/kernels/dist_matrix.py @@ -0,0 +1,776 @@ +""" +Compute a distance matrix for various cases. 
+ +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import numba.cuda.kernels.device.helper as hp +import os +import numpy as np +from math import sqrt, ceil + +# CUDA Simulator not working +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit, prange # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +from numba.types import int32, float32, int64, float64 # noqa +from numba.cuda.kernels.device.myjit import get_myjit # noqa +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 +os.environ["MACHINE_BITS"] = str(bits) + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.") + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 +TPB = 16 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +@ jit(inline=inline) +def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. 
+ cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. + """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False") # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True") # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # 
sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, + u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, + v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@ jit(inline=inline) +def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. + + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa + + +@ jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d + + +@ jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." 
+ + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance( + u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. + mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. 
+ + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix(U, + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. + Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + "However, os.environ[\"COLUMNS\"] does not match the " + "number of columns in U. Reset the environment variable " + "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python.") + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? 
(useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray( + U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device(np.asarray( + U_weights, dtype=np_float), stream=stream) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray( + V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray( + pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray( + out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, + cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, + cuda_out, metric_num) + + out = 
cuda_out.copy_to_host(stream=stream) + + return out + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. + +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" From aa413a966fc3c28b9ff646f7d9e2a13f673ff20b Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:39:59 -0600 Subject: [PATCH 14/20] Update helper.py --- numba/cuda/kernels/device/helper.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py index 074a964b90d..3ff059b3d7e 100644 --- a/numba/cuda/kernels/device/helper.py +++ b/numba/cuda/kernels/device/helper.py @@ -1,13 +1,10 @@ -""" -Helper functions for distance matrix computations. -""" - +"""Helper functions for distance matrix computations.""" import os from math import sqrt from numba import jit from time import time -inline = os.environ["INLINE"] +inline = os.environ.get("INLINE", "never") class Timer(object): @@ -25,9 +22,11 @@ def __init__(self, name=None): self.name = name def __enter__(self): + """Enter the timer.""" self.tstart = time() def __exit__(self, type, value, traceback): + """Exit the timer.""" if self.name: print("[%s]" % self.name,) print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) @@ -72,7 +71,6 @@ def insertionSort(arr): None. """ - for i in range(1, len(arr)): key = arr[i] @@ -105,7 +103,6 @@ def insertionArgSort(arr, ids): None. 
""" - # fill ids for i in range(len(arr)): ids[i] = i @@ -347,3 +344,12 @@ def integrate(u_cdf, v_cdf, deltas, p): out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] out = out ** (1 / p) return out + + +# %% CODE GRAVEYARD +# from numba.core import config +# config_keys = dir(config) +# if "INLINE" in config_keys: +# inline = config.INLINE +# else: +# inline = "never" From 0bc56b2602c4f673e9fb036988864172ca6d26ee Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:40:12 -0600 Subject: [PATCH 15/20] Update test_dist_matrix.py --- numba/cuda/tests/cudapy/test_dist_matrix.py | 209 +++++++++++--------- 1 file changed, 116 insertions(+), 93 deletions(-) diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py index cf5fa5c5e22..4ad8f948a16 100644 --- a/numba/cuda/tests/cudapy/test_dist_matrix.py +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -1,23 +1,32 @@ # -*- coding: utf-8 -*- -""" -Test distance matrix calculations. -""" +"""Test distance matrix calculations using CUDA/Numba.""" from scipy.spatial.distance import euclidean, cdist from scipy.stats import wasserstein_distance as scipy_wasserstein_distance from numpy.testing import assert_allclose import numpy as np import os +from importlib import reload +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" from numba.cuda.kernels.device.helper import Timer # noqa from numba.cuda.testing import unittest, CUDATestCase # noqa +import numba.cuda.kernels.dist_matrix # noqa +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(numba.cuda.kernels.dist_matrix) +dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix -rows = 6 -cols = 100 - -testQ = True verbose_test = True @@ -47,7 +56,7 @@ def test_dist_matrix(self): """ - def test_data(): + def test_data(rows, cols): """ Generate seeded test values and weights for two distributions. 
@@ -131,100 +140,114 @@ def join_wasserstein(U, V, Uw, Vw): V_Vw = np.concatenate((V, Vw), axis=1) return U_Uw, V_Vw + tol = 1e-5 + # test and production data pairs = np.array([(0, 1), (1, 2), (2, 3)]) i, j = pairs[0] - U, V, U_weights, V_weights = test_data() - - Utest = U[0:6] - Vtest = V[0:6] - Uwtest = U_weights[0:6] - Vwtest = V_weights[0:6] - for target in ["cuda", "cpu"]: - from numba.cuda.kernels.dist_matrix import dist_matrix # noqa - for metric in ['euclidean', 'wasserstein']: - print('[' + target.upper() + "_" + metric.upper() + ']') + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 1000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric) # noqa + if verbose_test: + print(one_set, '\n') - if testQ: # compile - dist_matrix(Utest, U_weights=Uwtest, - metric=metric, target=target) - with Timer("one set"): - one_set = dist_matrix( - U, U_weights=U_weights, metric=metric, target=target) # noqa - if verbose_test: - print(one_set, '\n') - - # compile - dist_matrix(Utest, V=Vtest, U_weights=Uwtest, - V_weights=Vwtest, metric=metric, target=target) - with Timer("two set"): - two_set = dist_matrix(U, V=V, U_weights=U_weights, - V_weights=V_weights, metric=metric, target=target) # noqa - if verbose_test: - print(two_set, '\n') - - one_set_sparse = dist_matrix( - U, U_weights=U_weights, pairs=pairs, metric=metric, target=target) # noqa - if verbose_test: - print(one_set_sparse, '\n') - - two_set_sparse = dist_matrix( - U, - V=V, - U_weights=U_weights, - V_weights=V_weights, - pairs=pairs, - metric=metric, - target=target - ) - if verbose_test: - print(two_set_sparse, '\n') - - if testQ: - if metric == "euclidean": - with Timer("one set check (cdist)"): - one_set_check = cdist(U, U) + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric) # noqa + if testQ and verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa + if testQ and verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + 
one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + assert_allclose( + one_set.ravel(), one_set_check.ravel(), rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set.ravel(), two_set_check.ravel(), rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + one_set_sparse, one_sparse_check, rtol=tol, + err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set_sparse, two_sparse_check, rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + elif metric == "euclidean": with Timer("two set check (cdist)"): two_set_check = cdist(U, V) - one_sparse_check = [ - euclidean(U[i], U[j]) for i, j in pairs] - two_sparse_check = [ - euclidean(U[i], V[j]) for i, j in pairs] - - elif metric == "wasserstein": - U_Uw, V_Vw = join_wasserstein( - U, V, U_weights, V_weights) - - with Timer("one set check (cdist)"): - one_set_check = cdist( - U_Uw, U_Uw, metric=my_wasserstein_distance) - with Timer("two set check (cdist)"): - two_set_check = cdist( - U_Uw, V_Vw, metric=my_wasserstein_distance) - - one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa - for i, j in pairs] - two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa - for i, j in pairs] - - # check results - tol = 1e-5 - assert_allclose( - one_set.ravel(), one_set_check.ravel(), rtol=tol, - err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set.ravel(), two_set_check.ravel(), rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - one_set_sparse, one_sparse_check, rtol=tol, - err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set_sparse, two_sparse_check, rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - if __name__ == '__main__': unittest.main() + + +# TF = [ +# np.allclose(one_set, one_set_check), +# np.allclose(two_set, two_set_check), +# np.allclose(one_set_sparse, one_sparse_check), +# np.allclose(two_set_sparse, two_sparse_check) +# ] From 571af8dfcd8ac684c662126bd3e2adbff741c29c Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 06:58:34 -0600 Subject: [PATCH 16/20] Revert "Create myjit.py" This reverts commit 367ac21cfed35bba5a3047b60a136ebe73bc949c. --- numba/cuda/kernels/device/myjit.py | 87 ------------------------------ 1 file changed, 87 deletions(-) delete mode 100644 numba/cuda/kernels/device/myjit.py diff --git a/numba/cuda/kernels/device/myjit.py b/numba/cuda/kernels/device/myjit.py deleted file mode 100644 index bc83d768b74..00000000000 --- a/numba/cuda/kernels/device/myjit.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Decorator to assign the right jit for different targets. -""" -from numba import cuda, jit -from Lib import inspect -# from Lib.textwrap import dedent - - -# HACK -def get_myjit(target="cuda", - inline="never", - device=True, - parallel=False, - debug=False, - opt=True - ): - """ - Get a decorator that assigns the right jit for different targets. - - Parameters - ---------- - target : str, optional - Whether to compute on "cuda" or on "cpu". The default is "cuda". 
- self.inline : str, optional - Whether to inline functions. The default is "never". - - Returns - ------- - myjit : function - The decorator with the specified keyword arguments. - - """ - if debug and opt: - raise ValueError("debug and opt should not both be True") - - def myjit(f): - """ - Decorator to assign the right jit for different targets. - - In case of non-cuda targets, all instances of `cuda.local.array` - are replaced by `np.empty`. This is a dirty fix, hopefully in the - near future numba will support numpy array allocation and this will - not be necessary anymore - - Modified from: https://github.com/numba/numba/issues/2571 - - Parameters - ---------- - f : function - function for which the decorator is applied to. - - Returns - ------- - newfun : function - cuda.jit or jit version of f. - - """ - source = inspect.getsource(f).splitlines() - assert '@myjit' in source[0] or '@mycudajit' in source[0] - indent_spaces = len(source[0]) - len(source[0].lstrip()) - source = [s[indent_spaces:] for s in source] - source = '\n'.join(source[1:]) + '\n' - - # source = inspect.dedent(source) - - if target == 'cuda': - source = source.replace('prange', 'range') - exec(source) - fun = eval(f.__name__) - newfun = cuda.jit(f, device=device, inline=inline, - debug=debug, opt=opt) - # needs to be exported to globals - globals()[f.__name__] = newfun - return newfun - - elif target == 'cpu': - source = source.replace('cuda.local.array', 'np.empty') - if not parallel: - source = source.replace('prange', 'range') - exec(source) - fun = eval(f.__name__) - newfun = jit(fun, nopython=True, inline=inline) - # needs to be exported to globals - globals()[f.__name__] = newfun - return newfun - - return myjit From 0eb6b8d322332788765335028235cdbb9cd83088 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 07:43:48 -0600 Subject: [PATCH 17/20] bad call to MACHINE_BITS --- numba/cuda/tests/cudapy/test_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py index 81d2666fa8a..67383ca9093 100644 --- a/numba/cuda/tests/cudapy/test_helper.py +++ b/numba/cuda/tests/cudapy/test_helper.py @@ -7,7 +7,7 @@ import numba.cuda.kernels.device.helper as hp from numpy.testing import assert_allclose, assert_equal -bits = int(os.environ["MACHINE_BITS"]) +bits = int(os.environ.get("MACHINE_BITS", "32")) if bits == 32: np_float = np.float32 From 3727396d1a03baf4bb27913dc52db5ac373d0c26 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 08:07:31 -0600 Subject: [PATCH 18/20] Update dist_matrix.py --- numba/cuda/kernels/dist_matrix.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py index 7561cb3fd81..df96846c7f6 100644 --- a/numba/cuda/kernels/dist_matrix.py +++ b/numba/cuda/kernels/dist_matrix.py @@ -21,7 +21,7 @@ from numba import cuda, jit, prange # noqa from numba.cuda.testing import unittest, CUDATestCase # noqa from numba.types import int32, float32, int64, float64 # noqa -from numba.cuda.kernels.device.myjit import get_myjit # noqa +# HACK from numba.cuda.kernels.device.myjit import get_myjit # from numba.cuda.cudadrv.driver import driver # TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. 
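One caveat for the environment-variable parsing in the hunk above: bool() applied to any non-empty string is True, so bool(os.environ.get("USE_64", "0")) and bool(os.environ.get("FASTMATH", "1")) evaluate to True even when the variable is set to "0". A stricter parse is sketched below; the helper name env_flag is illustrative and not part of this patch.

import os

def env_flag(name, default="0"):
    """Interpret common truthy/falsy spellings of an environment variable."""
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes", "on")

os.environ["USE_64"] = "0"
print(bool(os.environ.get("USE_64", "0")))  # True -- any non-empty string is truthy
print(env_flag("USE_64"))                   # False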
@@ -63,6 +63,7 @@ nb_float = np_float nb_int = np_int +# HACK # a "hacky" way to get compatibility between @njit and @cuda.jit # myjit = get_myjit(target=target, inline=inline) # mycudajit = get_myjit(device=False, target=target, inline=inline) @@ -492,11 +493,11 @@ def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): def dist_matrix(U, - V=None, - U_weights=None, - V_weights=None, - pairs=None, - metric="euclidean"): # noqa + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa """ Compute distance matrices using Numba/CUDA. From 87bba9139b139b4213a00f216412c071ef4c7127 Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 20:33:15 -0600 Subject: [PATCH 19/20] remove scipy-like functionality --- numba/cuda/kernels/dist_matrix.py | 777 -------------------- numba/cuda/tests/cudapy/test_dist_matrix.py | 253 ------- 2 files changed, 1030 deletions(-) delete mode 100644 numba/cuda/kernels/dist_matrix.py delete mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py deleted file mode 100644 index df96846c7f6..00000000000 --- a/numba/cuda/kernels/dist_matrix.py +++ /dev/null @@ -1,777 +0,0 @@ -""" -Compute a distance matrix for various cases. - -Cases: - within a single set of vectors (like pdist) - between two sets of vectors (like cdist) - between prespecified pairs (i.e. sparse) for either one or two sets of - vectors. - -Various distance metrics are available. -""" -import numba.cuda.kernels.device.helper as hp -import os -import numpy as np -from math import sqrt, ceil - -# CUDA Simulator not working -# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" -# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" - -from numba import cuda, jit, prange # noqa -from numba.cuda.testing import unittest, CUDATestCase # noqa -from numba.types import int32, float32, int64, float64 # noqa -# HACK from numba.cuda.kernels.device.myjit import get_myjit -# from numba.cuda.cudadrv.driver import driver -# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. - -inline = os.environ.get("INLINE", "never") -fastmath = bool(os.environ.get("FASTMATH", "1")) -cols = os.environ.get("COLUMNS") -USE_64 = bool(os.environ.get("USE_64", "0")) -target = os.environ.get("TARGET", "cuda") - -if USE_64 is None: - USE_64 = False -if USE_64: - bits = 64 - nb_float = float64 - nb_int = int64 - np_float = np.float64 - np_int = np.int64 -else: - bits = 32 - nb_float = float32 - nb_int = int32 - np_float = np.float32 - np_int = np.int32 -os.environ["MACHINE_BITS"] = str(bits) - -if cols is not None: - cols = int(cols) - cols_plus_1 = cols + 1 - tot_cols = cols * 2 - tot_cols_minus_1 = tot_cols - 1 -else: - raise KeyError("For performance reasons and architecture constraints " - "the number of columns of U (which is the same as V) " - "must be defined as the environment variable, COLUMNS, " - "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.") - -# override types -if target == "cpu": - nb_float = np_float - nb_int = np_int - -# HACK -# a "hacky" way to get compatibility between @njit and @cuda.jit -# myjit = get_myjit(target=target, inline=inline) -# mycudajit = get_myjit(device=False, target=target, inline=inline) - -# local array shapes needs to be defined globally due to lack of dynamic -# array allocation support. Don't wrap with np.int32, etc. 
see -# https://github.com/numba/numba/issues/7314 -TPB = 16 - -# np.savetxt('100-zeros.csv', -# np.zeros(tmp, dtype=np.int32), -# delimiter=",") -# OK to take cols from .shape -# cols = np.genfromtxt('100-zeros.csv', -# dtype=np.int32, -# delimiter=",").shape[0] - - -@ jit(inline=inline) -def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa - r""" # noqa - Compute distance between two 1D distributions :math:`u` and :math:`v`. - - The respective CDFs are :math:`U` and :math:`V`, and the - statistical distance is defined as: - .. math:: - l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} - p is a positive parameter; p = 1 gives the Wasserstein distance, - p = 2 gives the energy distance. - - Parameters - ---------- - u_values, v_values : array_like - Values observed in the (empirical) distribution. - u_weights, v_weights : array_like - Weight for each value. - `u_weights` (resp. `v_weights`) must have the same length as - `u_values` (resp. `v_values`). If the weight sum differs from - 1, it must still be positive and finite so that the weights can - be normalized to sum to 1. - p : scalar - positive parameter that determines the type of cdf distance. - presorted : bool - Whether u and v have been sorted already *and* u_weights and - v_weights have been sorted using the same indices used to sort - u and v, respectively. - cumweighted : bool - Whether u_weights and v_weights have been converted to their - cumulative weights via e.g. np.cumsum(). - prepended : bool - Whether a zero has been prepended to accumated, sorted - u_weights and v_weights. - - By setting presorted, cumweighted, *and* prepended to False, the - computationproceeds proceeds in the same fashion as _cdf_distance - from scipy.stats. - - Returns - ------- - distance : float - The computed distance between the distributions. - - Notes - ----- - The input distributions can be empirical, therefore coming from - samples whose values are effectively inputs of the function, or - they can be seen as generalized functions, in which case they are - weighted sums of Dirac delta functions located at the specified - values. - - References - ---------- - .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, - Hoyer, Munos "The Cramer Distance as a Solution to Biased - Wasserstein Gradients" (2017). :arXiv:`1705.10743`. 
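For reference, the computation described in this docstring can be written on the host with NumPy in a few lines, mirroring the scipy.stats CDF-distance approach. This is a sketch for cross-checking the device code; the function name np_cdf_distance is illustrative and not part of this patch.

import numpy as np

def np_cdf_distance(u, v, u_weights, v_weights, p=1):
    # Host-side reference sketch for the device implementation (illustrative).
    u_sorter = np.argsort(u)
    v_sorter = np.argsort(v)
    uv = np.concatenate((u, v))
    uv.sort(kind="mergesort")
    deltas = np.diff(uv)                     # widths between consecutive values
    # positions of all values within each sorted sample (cf. bisect_right / searchsorted)
    u_cdf_idx = u[u_sorter].searchsorted(uv[:-1], "right")
    v_cdf_idx = v[v_sorter].searchsorted(uv[:-1], "right")
    # empirical CDFs from prepended, accumulated, normalized weights
    u_cw = np.concatenate(([0], np.cumsum(u_weights[u_sorter])))
    v_cw = np.concatenate(([0], np.cumsum(v_weights[v_sorter])))
    u_cdf = u_cw[u_cdf_idx] / u_cw[-1]
    v_cdf = v_cw[v_cdf_idx] / v_cw[-1]
    # l_p(u, v) = (sum(|U - V|**p * deltas)) ** (1/p); p = 1 gives the Wasserstein distance
    return np.sum(np.abs(u_cdf - v_cdf) ** p * deltas) ** (1.0 / p)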
- """ - # allocate local float arrays - # combined vector - uv = cuda.local.array(tot_cols, nb_float) - uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) - - # CDFs - u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) - v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) - - # allocate local int arrays - # CDF indices via binary search - u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) - v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) - - u_cdf_sorted_cumweights = cuda.local.array( - tot_cols_minus_1, nb_float) - v_cdf_sorted_cumweights = cuda.local.array( - tot_cols_minus_1, nb_float) - - # short-circuit - if presorted and cumweighted and prepended: - u_sorted = u - v_sorted = v - - u_0_cumweights = u_weights - v_0_cumweights = v_weights - - # sorting, accumulating, and prepending (for compatibility) - else: - # check arguments - if not presorted and (cumweighted or prepended): - raise ValueError( - "if cumweighted or prepended are True, then presorted cannot be False") # noqa - - if (not presorted or not cumweighted) and prepended: - raise ValueError( - "if prepended is True, then presorted and cumweighted must both be True") # noqa - - # sorting - if not presorted: - # local arrays - u_sorted = cuda.local.array(cols, np_float) - v_sorted = cuda.local.array(cols, np_float) - - u_sorter = cuda.local.array(cols, nb_int) - v_sorter = cuda.local.array(cols, nb_int) - - u_sorted_weights = cuda.local.array(cols, nb_float) - v_sorted_weights = cuda.local.array(cols, nb_float) - - # local copy since quickArgSortIterative sorts in-place - hp.copy(u, u_sorted) - hp.copy(v, v_sorted) - - # sorting - hp.insertionArgSort(u_sorted, u_sorter) - hp.insertionArgSort(v_sorted, v_sorter) - - # inplace to avoid extra cuda local array - hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) - hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) - - # cumulative weights - if not cumweighted: - # local arrays - u_cumweights = cuda.local.array(cols, nb_float) - v_cumweights = cuda.local.array(cols, nb_float) - # accumulate - hp.cumsum(u_sorted_weights, u_cumweights) - hp.cumsum(v_sorted_weights, v_cumweights) - - # prepend weights with zero - if not prepended: - zero = cuda.local.array(1, nb_float) - - u_0_cumweights = cuda.local.array( - cols_plus_1, nb_float) - v_0_cumweights = cuda.local.array( - cols_plus_1, nb_float) - - hp.concatenate(zero, u_cumweights, u_0_cumweights) - hp.concatenate(zero, v_cumweights, v_0_cumweights) - - # concatenate u and v into uv - hp.concatenate(u_sorted, v_sorted, uv) - - # sorting - # quickSortIterative(uv, uv_stack) - hp.insertionSort(uv) - - # Get the respective positions of the values of u and v among the - # values of both distributions. See also np.searchsorted - hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) - hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) - - # empirical CDFs - hp.sort_by_indices(u_0_cumweights, u_cdf_indices, - u_cdf_sorted_cumweights) - hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) - - hp.sort_by_indices(v_0_cumweights, v_cdf_indices, - v_cdf_sorted_cumweights) - hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) - - # # Integration - hp.diff(uv, uv_deltas) # See also np.diff - - out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) - - return out - - -@ jit(inline=inline) -def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa - r""" - Compute the first Wasserstein distance between two 1D distributions. 
- - This distance is also known as the earth mover's distance, since it can be - seen as the minimum amount of "work" required to transform :math:`u` into - :math:`v`, where "work" is measured as the amount of distribution weight - that must be moved, multiplied by the distance it has to be moved. - - Source - ------ - https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa - - Parameters - ---------- - u_values, v_values : array_like - Values observed in the (empirical) distribution. - u_weights, v_weights : array_like, optional - Weight for each value. If unspecified, each value is assigned the same - weight. - `u_weights` (resp. `v_weights`) must have the same length as - `u_values` (resp. `v_values`). If the weight sum differs from 1, it - must still be positive and finite so that the weights can be normalized - to sum to 1. - - Returns - ------- - distance : float - The computed distance between the distributions. - - Notes - ----- - The input distributions can be empirical, therefore coming from samples - whose values are effectively inputs of the function, or they can be seen as - generalized functions, in which case they are weighted sums of Dirac delta - functions located at the specified values. - - References - ---------- - .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, - Munos "The Cramer Distance as a Solution to Biased Wasserstein - Gradients" (2017). :arXiv:`1705.10743`. - """ - return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa - - -@ jit(inline=inline) -def euclidean_distance(a, b): - """ - Calculate Euclidean distance between vectors a and b. - - Parameters - ---------- - a : 1D array - First vector. - b : 1D array - Second vector. - - Returns - ------- - d : numeric scalar - Euclidean distance between vectors a and b. - """ - d = 0 - for i in range(len(a)): - d += (b[i] - a[i]) ** 2 - d = sqrt(d) - return d - - -@ jit(inline=inline) -def compute_distance(u, v, u_weights, v_weights, metric_num): - """ - Calculate weighted distance between two vectors, u and v. - - Parameters - ---------- - u : 1D array of float - First vector. - v : 1D array of float - Second vector. - u_weights : 1D array of float - Weights for u. - v_weights : 1D array of float - Weights for v. - metric_num : int - Which metric to use (0 == "euclidean", 1=="wasserstein"). - - Raises - ------ - NotImplementedError - "Specified metric is mispelled or has not been implemented yet. - If not implemented, consider submitting a pull request." - - Returns - ------- - d : float - Weighted distance between u and v. - - """ - if metric_num == 0: - d = euclidean_distance(u, v) - elif metric_num == 1: - # assume u and v are presorted, and weights are sorted by u and v - d = wasserstein_distance( - u, v, u_weights, v_weights, True, True, True) - else: - raise NotImplementedError( - "Specified metric is mispelled or has not been implemented yet. " - "If not implemented, consider submitting a pull request." - ) - return d - - -@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " - "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) -def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): - """ - Calculate sparse pairwise distances between two sets of vectors for pairs. - - Parameters - ---------- - mat : numeric cuda array - First set of vectors for which to compute a single pairwise distance. 
- mat2 : numeric cuda array - Second set of vectors for which to compute a single pairwise distance. - pairs : cuda array of 2-tuples - All pairs for which distances are to be computed. - out : numeric cuda array - The initialized array which will be populated with distances. - - Raises - ------ - ValueError - Both matrices should have the same number of columns. - - Returns - ------- - None. - - """ - k = cuda.grid(1) - - npairs = pairs.shape[0] - - if k < npairs: - pair = pairs[k] - # extract indices of the pair for which the distance will be computed - i, j = pair - - u = U[i] - v = V[j] - uw = U_weights[i] - vw = V_weights[j] - - out[k] = compute_distance(u, v, uw, vw, metric_num) - - -@cuda.jit( - "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), - fastmath=fastmath, -) -def one_set_distance_matrix(U, U_weights, out, metric_num): - """ - Calculate pairwise distances within single set of vectors. - - Parameters - ---------- - U : 2D array of float - Vertically stacked vectors. - U_weights : 2D array of float - Vertically stacked weight vectors. - out : 2D array of float - Initialized matrix to populate with pairwise distances. - metric_num : int - Which metric to use (0 == "euclidean", 1=="wasserstein"). - - Returns - ------- - None. - - """ - i, j = cuda.grid(2) - - dm_rows = U.shape[0] - dm_cols = U.shape[0] - - if i < j and i < dm_rows and j < dm_cols and i != j: - u = U[i] - v = U[j] - uw = U_weights[i] - vw = U_weights[j] - d = compute_distance(u, v, uw, vw, metric_num) - out[i, j] = d - out[j, i] = d - - -# faster compilation *and* runtimes with explicit signature -@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " - "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) -def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): - """ - Calculate pairwise distances between two sets of vectors. - - Parameters - ---------- - U : 2D array of float - Vertically stacked vectors. - V : 2D array of float - Vertically stacked vectors. - U_weights : 2D array of float - Vertically stacked weight vectors. - V_weights : 2D array of float - Vertically stacked weight vectors. - out : 2D array of float - Pairwise distance matrix between two sets of vectors. - metric_num : int - Distance metric number {"euclidean": 0, "wasserstein": 1}. - - Returns - ------- - None. - - """ - i, j = cuda.grid(2) - - # distance matrix shape - dm_rows = U.shape[0] - dm_cols = V.shape[0] - - if i < dm_rows and j < dm_cols: - u = U[i] - v = V[j] - uw = U_weights[i] - vw = V_weights[j] - d = compute_distance(u, v, uw, vw, metric_num) - out[i, j] = d - - -def dist_matrix(U, - V=None, - U_weights=None, - V_weights=None, - pairs=None, - metric="euclidean"): # noqa - """ - Compute distance matrices using Numba/CUDA. - - Parameters - ---------- - mat : array - First set of vectors for which to compute pairwise distances. - - mat2 : array, optional - Second set of vectors for which to compute pairwise distances. - If not specified, then mat2 is a copy of mat. - - pairs : array, optional - List of 2-tuples which contain the indices for which to compute - distances. If mat2 was specified, then the second index accesses - mat2 instead of mat. If not specified, then the pairs are - auto-generated. If mat2 was specified, all combinations of the two - vector sets are used. If mat2 isn't specified, then only the upper - triangle (minus diagonal) pairs are computed. - - metric : str, optional - Possible options are 'euclidean', 'wasserstein'. 
- Defaults to Euclidean distance. These are converted to integers - internally due to Numba's lack of support for string arguments - (2021-08-14). See compute_distance() for other keys. - - For example: - 0 - 'euclidean' - 1 - 'wasserstein' - target : str, optional - Which target to use: "cuda" or "cpu". Default is "cuda". - inline : str, optional - Whether to inline functions: "always" or "never". Default is "never". - fastmath : bool, optional - Whether to use fastmath or not. The default is True. - USE_64 : bool, optional - Whether to use 64 bit or 34 bit types. The default is False. - - Raises - ------ - ValueError - DESCRIPTION. - NotImplementedError - DESCRIPTION. - - Returns - ------- - out : array - A pairwise distance matrix, or if pairs are specified, then a - vector of distances corresponding to the pairs. - - """ - rows, cols_check = U.shape - - if cols_check != cols: - raise KeyError("For performance reasons and architecture constraints " - "the number of columns of U (which is the same as V) " - "must be defined as the environment variable: COLUMNS. " - "However, os.environ[\"COLUMNS\"] does not match the " - "number of columns in U. Reset the environment variable " - "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " - "defined at the top of your script and make sure that " - "dist_matrix is reloaded. You may also need to restart " - "Python.") - - if V is not None and cols is not V.shape[1]: - raise ValueError("The number of columns for U and V should match") - - # is it a distance matrix between two sets of vectors? - # (rather than within a single set) - isXY = V is not None - - # were pairs specified? (useful for sparse matrix generation) - pairQ = pairs is not None - - # block, grid, and out shape - if pairQ: - block_dim = TPB - npairs = pairs.shape[0] - grid_dim = ceil(npairs / block_dim) - else: - block_dim = (TPB, TPB) - blockspergrid_x = ceil(rows / block_dim[0]) - if isXY: - blockspergrid_y = ceil(V.shape[0] / block_dim[1]) - else: - blockspergrid_y = ceil(rows / block_dim[1]) - grid_dim = (blockspergrid_x, blockspergrid_y) - - # CUDA setup - stream = cuda.stream() - - # sorting and cumulative weights - if metric == "wasserstein": - # presort values (and weights by sorted value indices) - U_sorter = np.argsort(U) - U = np.take_along_axis(U, U_sorter, axis=-1) - U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) - - # calculate cumulative weights - U_weights = np.cumsum(U_weights, axis=1) - - # prepend a column of zeros - zero = np.zeros((U_weights.shape[0], 1)) - U_weights = np.column_stack((zero, U_weights)) - - # do the same for V and V_weights - if isXY: - V_sorter = np.argsort(V) - V = np.take_along_axis(V, V_sorter, axis=-1) - V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) - V_weights = np.cumsum(V_weights, axis=1) - V_weights = np.column_stack((zero, V_weights)) - - # assign dummy arrays - if V is None: - V = np.zeros((2, 2)) - - if U_weights is None: - U_weights = np.zeros((2, 2)) - - if V_weights is None: - V_weights = np.zeros((2, 2)) - - if pairs is None: - pairs = np.zeros((2, 2)) - - # assign metric_num based on specified metric (no string support) - metric_dict = {"euclidean": 0, "wasserstein": 1} - metric_num = metric_dict[metric] - - # # same for target_num - # target_dict = {"cuda": 0, "cpu": 1} - # target_num = target_dict[target] - - m = U.shape[0] - - if isXY: - m2 = V.shape[0] - else: - m2 = m - - if pairQ: - shape = (npairs,) - else: - shape = (m, m2) - - # values should be initialized instead of using 
cuda.device_array - out = np.zeros(shape) - - if target == "cuda": - # copying/allocating to GPU - cU = cuda.to_device(np.asarray( - U, dtype=np_float), stream=stream) - - cU_weights = cuda.to_device(np.asarray( - U_weights, dtype=np_float), stream=stream) - - if isXY or pairQ: - cV = cuda.to_device(np.asarray( - V, dtype=np_float), stream=stream) - cV_weights = cuda.to_device( - np.asarray(V_weights, dtype=np_float), stream=stream - ) - - if pairQ: - cpairs = cuda.to_device(np.asarray( - pairs, dtype=np_int), stream=stream) - - cuda_out = cuda.to_device(np.asarray( - out, dtype=np_float), stream=stream) - - elif target == "cpu": - cU = U - cV = V - cU_weights = U_weights - cV_weights = V_weights - cpairs = pairs - cuda_out = out - - # distance matrix between two sets of vectors - if isXY and not pairQ: - fn = two_set_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) - - # specified pairwise distances within single set of vectors - elif not isXY and pairQ: - fn = sparse_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cU, cU_weights, cU_weights, cpairs, - cuda_out, metric_num) - - # distance matrix within single set of vectors - elif not isXY and not pairQ: - fn = one_set_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cU_weights, cuda_out, metric_num) - - # specified pairwise distances within single set of vectors - elif isXY and pairQ: - fn = sparse_distance_matrix - if target == "cuda": - fn = fn[grid_dim, block_dim] - fn(cU, cV, cU_weights, cV_weights, cpairs, - cuda_out, metric_num) - - out = cuda_out.copy_to_host(stream=stream) - - return out - -# %% Code Graveyard - -# u_stack = cuda.local.array(cols, nb_int) -# v_stack = cuda.local.array(cols, nb_int) - -# copy(u_sorted, utmp) -# copy(v_sorted, vtmp) - -# copy(u_weights, u_sorted_weights) -# copy(v_weights, v_sorted_weights) - -# stack = [0] * size -# ids = list(range(len(arr))) - -# u_sorted_weights = cuda.local.array(cols, nb_float) -# v_sorted_weights = cuda.local.array(cols, nb_float) - -# sorting stack -# uv_stack = cuda.local.array(tot_cols, nb_int) - -# def sort_cum_prepend(X, X_weights): -# # presort values (and weights by sorted value indices) -# X_sorter = np.argsort(X) -# X = np.take_along_axis(X, X_sorter, axis=-1) -# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) -# # calculate cumulative weights -# X_weights = np.cumsum(X_weights) -# # prepend a column of zeros -# zero = np.zeros((X_weights.shape[0], 1)) -# X_weights = np.column_stack((zero, X_weights)) - -# def mean_squared_error(y, y_pred, squared=False): -# """ -# Return mean squared error (MSE) without using sklearn. - -# If squared == True, then return root mean squared error (RMSE). - -# Parameters -# ---------- -# y : 1D numeric array -# "True" or first set of values. -# y_pred : 1D numeric array -# "Predicted" or second set of values. - -# Returns -# ------- -# rmse : numeric scalar -# DESCRIPTION. 
- -# """ -# mse = np.mean((y - y_pred)**2) -# if squared is True: -# rmse = np.sqrt(mse) -# return rmse -# else: -# return mse -# return mse - -# opt = False -# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" -# opt = True -# os.environ["NUMBA_BOUNDSCHECK"] = "0" -# os.environ["OPT"] = "0" diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py deleted file mode 100644 index 4ad8f948a16..00000000000 --- a/numba/cuda/tests/cudapy/test_dist_matrix.py +++ /dev/null @@ -1,253 +0,0 @@ -# -*- coding: utf-8 -*- -"""Test distance matrix calculations using CUDA/Numba.""" -from scipy.spatial.distance import euclidean, cdist -from scipy.stats import wasserstein_distance as scipy_wasserstein_distance -from numpy.testing import assert_allclose -import numpy as np -import os -from importlib import reload - -# os.environ["NUMBA_DISABLE_JIT"] = "1" - -# number of columns of U and V must be set as env var before import dist_matrix -cols = 100 -os.environ["COLUMNS"] = str(cols) - -# other environment variables (set before importing dist_matrix) -os.environ["USE_64"] = "0" -os.environ["INLINE"] = "never" -os.environ["FASTMATH"] = "1" -os.environ["TARGET"] = "cuda" - -from numba.cuda.kernels.device.helper import Timer # noqa -from numba.cuda.testing import unittest, CUDATestCase # noqa -import numba.cuda.kernels.dist_matrix # noqa - -# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) -reload(numba.cuda.kernels.dist_matrix) -dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix - -verbose_test = True - - -class TestDistMat(CUDATestCase): - """Test distance matrix calculations on GPU for various metrics.""" - - def test_dist_matrix(self): - """ - Loop through distance metrics and perform unit tests. - - The four test cases are: - pairwise distances within a single set of vectors - pairwise distances between two sets of vectors - sparse pairwise distances within a single set of vectors - sparse pairwise distances between two sets of vectors - - The ground truth for Euclidean comes from cdist. - The ground truth for Earth Mover's distance (1-Wasserstein) is via - a scipy.stats function. - - Helper functions are used to generate test data and support the use of - Wasserstein distances in cdist. - - Returns - ------- - None. - - """ - - def test_data(rows, cols): - """ - Generate seeded test values and weights for two distributions. - - Returns - ------- - U : 2D array - Values of first distribution. - V : 2D array - Values of second distribution. - U_weights : 2D array - Weights of first distribution. - V_weights : 2D array - Weights of second distribution. - - """ - np.random.seed(42) - [U, V, U_weights, V_weights] = [ - np.random.rand(rows, cols) for i in range(4)] - return U, V, U_weights, V_weights - - def my_wasserstein_distance(u_uw, v_vw): - """ - Return Earth Mover's distance using concatenated values and weights. - - Parameters - ---------- - u_uw : 1D numeric array - Horizontally stacked values and weights of first distribution. - v_vw : TYPE - Horizontally stacked values and weights of second distribution. - - Returns - ------- - distance : numeric scalar - Earth Mover's distance given two distributions. 
- - """ - # split into values and weights - n = len(u_uw) - i = n // 2 - u = u_uw[0: i] - uw = u_uw[i: n] - v = v_vw[0: i] - vw = v_vw[i: n] - # calculate distance - distance = scipy_wasserstein_distance( - u, v, u_weights=uw, v_weights=vw) - return distance - - def join_wasserstein(U, V, Uw, Vw): - """ - Horizontally stack values and weights for each distribution. - - Weights are added as additional columns to values. - - Example: - u_uw, v_vw = join_wasserstein(u, v, uw, vw) - d = my_wasserstein_distance(u_uw, v_vw) - cdist(u_uw, v_vw, metric=my_wasserstein_distance) - - Parameters - ---------- - u : 1D or 2D numeric array - First set of distribution values. - v : 1D or 2D numeric array - Second set of values of distribution values. - uw : 1D or 2D numeric array - Weights for first distribution. - vw : 1D or 2D numeric array - Weights for second distribution. - - Returns - ------- - u_uw : 1D or 2D numeric array - Horizontally stacked values and weights of first distribution. - v_vw : TYPE - Horizontally stacked values and weights of second distribution. - - """ - U_Uw = np.concatenate((U, Uw), axis=1) - V_Vw = np.concatenate((V, Vw), axis=1) - return U_Uw, V_Vw - - tol = 1e-5 - - # test and production data - pairs = np.array([(0, 1), (1, 2), (2, 3)]) - i, j = pairs[0] - - for testQ in [True, False]: - # large # of rows with testQ == True is slow due to use of - # cdist and non-jitted scipy-wasserstein - if testQ: - rows = 6 - else: - rows = 1000 - print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) - U, V, U_weights, V_weights = test_data(rows, cols) - Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa - - for target in ["cuda"]: # "cpu" not implemented - for metric in ['euclidean', 'wasserstein']: - print('[' + target.upper() + "_" + metric.upper() + ']') - - if testQ: - # compile - dist_matrix(Utest, U_weights=Uwtest, metric=metric) - with Timer("one set"): - one_set = dist_matrix( - U, U_weights=U_weights, metric=metric) # noqa - if verbose_test: - print(one_set, '\n') - - # compile - dist_matrix(Utest, V=Vtest, U_weights=Uwtest, - V_weights=Vwtest, metric=metric) - with Timer("two set"): - two_set = dist_matrix(U, V=V, U_weights=U_weights, - V_weights=V_weights, metric=metric) # noqa - if testQ and verbose_test: - print(two_set, '\n') - - one_set_sparse = dist_matrix( - U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa - if testQ and verbose_test: - print(one_set_sparse, '\n') - - two_set_sparse = dist_matrix( - U, - V=V, - U_weights=U_weights, - V_weights=V_weights, - pairs=pairs, - metric=metric, - ) - if testQ and verbose_test: - print(two_set_sparse, '\n') - - if testQ: - if metric == "euclidean": - with Timer("one set check (cdist)"): - one_set_check = cdist(U, U) - with Timer("two set check (cdist)"): - two_set_check = cdist(U, V) - - one_sparse_check = [ - euclidean(U[i], U[j]) for i, j in pairs] - two_sparse_check = [ - euclidean(U[i], V[j]) for i, j in pairs] - - elif metric == "wasserstein": - U_Uw, V_Vw = join_wasserstein( - U, V, U_weights, V_weights) - - with Timer("one set check (cdist)"): - one_set_check = cdist( - U_Uw, U_Uw, metric=my_wasserstein_distance) - with Timer("two set check (cdist)"): - two_set_check = cdist( - U_Uw, V_Vw, metric=my_wasserstein_distance) - - one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa - for i, j in pairs] - two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa - for i, j in pairs] - - # check results - assert_allclose( - one_set.ravel(), 
one_set_check.ravel(), rtol=tol, - err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set.ravel(), two_set_check.ravel(), rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - one_set_sparse, one_sparse_check, rtol=tol, - err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa - assert_allclose( - two_set_sparse, two_sparse_check, rtol=tol, - err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa - elif metric == "euclidean": - with Timer("two set check (cdist)"): - two_set_check = cdist(U, V) - - -if __name__ == '__main__': - unittest.main() - - -# TF = [ -# np.allclose(one_set, one_set_check), -# np.allclose(two_set, two_set_check), -# np.allclose(one_set_sparse, one_sparse_check), -# np.allclose(two_set_sparse, two_sparse_check) -# ] From f862f168a0cf15ef5b0455c4ff6eb1ef89d7224b Mon Sep 17 00:00:00 2001 From: sgbaird Date: Mon, 6 Sep 2021 20:38:42 -0600 Subject: [PATCH 20/20] Revert "remove scipy-like functionality" This reverts commit 87bba9139b139b4213a00f216412c071ef4c7127. --- numba/cuda/kernels/dist_matrix.py | 777 ++++++++++++++++++++ numba/cuda/tests/cudapy/test_dist_matrix.py | 253 +++++++ 2 files changed, 1030 insertions(+) create mode 100644 numba/cuda/kernels/dist_matrix.py create mode 100644 numba/cuda/tests/cudapy/test_dist_matrix.py diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py new file mode 100644 index 00000000000..df96846c7f6 --- /dev/null +++ b/numba/cuda/kernels/dist_matrix.py @@ -0,0 +1,777 @@ +""" +Compute a distance matrix for various cases. + +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import numba.cuda.kernels.device.helper as hp +import os +import numpy as np +from math import sqrt, ceil + +# CUDA Simulator not working +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit, prange # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +from numba.types import int32, float32, int64, float64 # noqa +# HACK from numba.cuda.kernels.device.myjit import get_myjit +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 +os.environ["MACHINE_BITS"] = str(bits) + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + "via e.g. 
`os.environ[\"COLUMNS\"] = \"100\"`.") + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# HACK +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 +TPB = 16 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +@ jit(inline=inline) +def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. + cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. 
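The presorted/cumweighted/prepended short-circuit described above assumes the caller has already sorted the values, reordered the weights by the same indices, accumulated them, and prepended a zero. A small NumPy sketch of that host-side preprocessing follows; it mirrors what dist_matrix() below does for the Wasserstein metric, and the helper name presort_and_accumulate is illustrative, not part of this patch.

import numpy as np

def presort_and_accumulate(X, X_weights):
    # Illustrative helper mirroring the Wasserstein preprocessing in dist_matrix().
    sorter = np.argsort(X, axis=-1)
    X = np.take_along_axis(X, sorter, axis=-1)             # presorted values
    w = np.take_along_axis(X_weights, sorter, axis=-1)     # weights sorted by the same indices
    w = np.cumsum(w, axis=1)                               # cumulative weights
    zero = np.zeros((w.shape[0], 1))
    w = np.column_stack((zero, w))                         # prepend a column of zeros
    return X, w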
+ """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False") # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True") # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, + u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, + v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@ jit(inline=inline) +def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. 
+ + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa + + +@ jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d + + +@ jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." + + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance( + u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. 
+ mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix(U, + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. 
+ Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + "However, os.environ[\"COLUMNS\"] does not match the " + "number of columns in U. Reset the environment variable " + "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python.") + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? (useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using 
cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray( + U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device(np.asarray( + U_weights, dtype=np_float), stream=stream) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray( + V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray( + pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray( + out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, + cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, + cuda_out, metric_num) + + out = cuda_out.copy_to_host(stream=stream) + + return out + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. 
+ +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py new file mode 100644 index 00000000000..4ad8f948a16 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +"""Test distance matrix calculations using CUDA/Numba.""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os +from importlib import reload + +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" +os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" + +from numba.cuda.kernels.device.helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +import numba.cuda.kernels.dist_matrix # noqa + +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(numba.cuda.kernels.dist_matrix) +dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix + +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. + The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(rows, cols): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4)] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. 
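+
+            Example (hypothetical 1D value arrays u, v and weight arrays
+            uw, vw, where u matches uw and v matches vw in length):
+                u_uw = np.concatenate((u, uw))
+                v_vw = np.concatenate((v, vw))
+                d = my_wasserstein_distance(u_uw, v_vw)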
+ + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0: i] + uw = u_uw[i: n] + v = v_vw[0: i] + vw = v_vw[i: n] + # calculate distance + distance = scipy_wasserstein_distance( + u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + tol = 1e-5 + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 1000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric) # noqa + if verbose_test: + print(one_set, '\n') + + # compile + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric) # noqa + if testQ and verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa + if testQ and verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + assert_allclose( + one_set.ravel(), 
one_set_check.ravel(), rtol=tol,
+                            err_msg="one set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set.ravel(), two_set_check.ravel(), rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            one_set_sparse, one_sparse_check, rtol=tol,
+                            err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set_sparse, two_sparse_check, rtol=tol,
+                            err_msg="two set {} {} sparse distance matrix inaccurate".format(target, metric))  # noqa
+                    elif metric == "euclidean":
+                        # timing-only run for the large-row case; results are
+                        # not asserted because the non-jitted reference is too
+                        # slow to compare exhaustively
+                        with Timer("two set check (cdist)"):
+                            two_set_check = cdist(U, V)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+# TF = [
+#     np.allclose(one_set, one_set_check),
+#     np.allclose(two_set, two_set_check),
+#     np.allclose(one_set_sparse, one_sparse_check),
+#     np.allclose(two_set_sparse, two_sparse_check)
+# ]
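
For reference, the host-side preprocessing in dist_matrix (per-row argsort of the values, cumulative sums of the weights, and a prepended zero column) sets up the CDF-based formulation of the weighted 1-Wasserstein distance, the same formulation used by scipy.stats.wasserstein_distance. The sketch below is not part of the patch and is not the CUDA kernel; it is a plain NumPy reference, with illustrative names and assuming 1D NumPy arrays, for the per-pair quantity the 'wasserstein' metric is expected to reproduce:

    import numpy as np

    def wasserstein_1d(u, v, uw, vw):
        """Weighted 1-Wasserstein distance between two 1D samples."""
        u_sorter, v_sorter = np.argsort(u), np.argsort(v)
        # common support and gaps between consecutive support points
        all_values = np.sort(np.concatenate((u, v)))
        deltas = np.diff(all_values)
        # cumulative weights with a prepended zero, as in dist_matrix above
        u_cum = np.concatenate(([0.0], np.cumsum(uw[u_sorter])))
        v_cum = np.concatenate(([0.0], np.cumsum(vw[v_sorter])))
        # CDFs of u and v evaluated within each gap of the common support
        u_idx = np.searchsorted(u[u_sorter], all_values[:-1], side="right")
        v_idx = np.searchsorted(v[v_sorter], all_values[:-1], side="right")
        u_cdf = u_cum[u_idx] / u_cum[-1]
        v_cdf = v_cum[v_idx] / v_cum[-1]
        # integrate the absolute difference of the CDFs
        return np.sum(np.abs(u_cdf - v_cdf) * deltas)

For 1D inputs this agrees with scipy.stats.wasserstein_distance(u, v, u_weights=uw, v_weights=vw), which is what test_dist_matrix uses as ground truth.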