From af41bb8715c88b2ccd6d934d63e2996a72933ba6 Mon Sep 17 00:00:00 2001 From: tess <48131946+stress-tess@users.noreply.github.com> Date: Tue, 21 May 2024 17:34:19 -0400 Subject: [PATCH] Closes #3167: Add `normal` to random number generators (#3180) * Closes #3167: Add `normal` to random number generators This PR (closes #3167) adds `normal` to our generators by transforming the results of standard normal. It also upddates `standard_normal` to use generators ot make sure we have the same multiple calls of the same function give different output as the other rand funcs. Adds goodness of fit testing against scipy to ensure we are actually drawing from the correct distribution * add docs for random functions and deps to yaml files * leftover latex preventing display in usage --------- Co-authored-by: Tess Hayes --- PROTO_tests/tests/random_test.py | 41 ++++++++++++++ arkouda-env-dev.yml | 96 +++++++++++++++++--------------- arkouda/random/_generator.py | 73 ++++++++++++++++++++++-- pydoc/usage/random.rst | 16 ++++++ src/RandArray.chpl | 3 + src/RandMsg.chpl | 38 ++++++++++++- tests/random_test.py | 41 ++++++++++++++ 7 files changed, 255 insertions(+), 53 deletions(-) diff --git a/PROTO_tests/tests/random_test.py b/PROTO_tests/tests/random_test.py index 8bbe068e40..ce4157684e 100644 --- a/PROTO_tests/tests/random_test.py +++ b/PROTO_tests/tests/random_test.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from scipy import stats as sp_stats import arkouda as ak from arkouda.scipy import chisquare as akchisquare @@ -205,6 +206,46 @@ def test_choice_flags(self): current = rng.choice(a, size, replace, p) assert np.allclose(previous.to_list(), current.to_list()) + def test_normal(self): + rng = ak.random.default_rng(17) + both_scalar = rng.normal(loc=10, scale=2, size=10).to_list() + scale_scalar = rng.normal(loc=ak.array([0, 10, 20]), scale=1, size=3).to_list() + loc_scalar = rng.normal(loc=10, scale=ak.array([1, 2, 3]), size=3).to_list() + both_array = rng.normal(loc=ak.array([0, 10, 20]), scale=ak.array([1, 2, 3]), size=3).to_list() + + # redeclare rng with same seed to test reproducibility + rng = ak.random.default_rng(17) + assert rng.normal(loc=10, scale=2, size=10).to_list() == both_scalar + assert rng.normal(loc=ak.array([0, 10, 20]), scale=1, size=3).to_list() == scale_scalar + assert rng.normal(loc=10, scale=ak.array([1, 2, 3]), size=3).to_list() == loc_scalar + assert ( + rng.normal(loc=ak.array([0, 10, 20]), scale=ak.array([1, 2, 3]), size=3).to_list() + == both_array + ) + + def test_normal_hypothesis_testing(self): + # I tested this many times without a set seed, but with no seed + # it's expected to fail one out of every ~20 runs given a pval limit of 0.05. + rng = ak.random.default_rng(43) + num_samples = 10**4 + + mean = rng.uniform(-10, 10) + deviation = rng.uniform(0, 10) + sample = rng.normal(loc=mean, scale=deviation, size=num_samples) + sample_list = sample.to_list() + + # first test if samples are normal at all + _, pval = sp_stats.normaltest(sample_list) + + # if pval <= 0.05, the difference from the expected distribution is significant + assert pval > 0.05 + + # second goodness of fit test against the distribution with proper mean and std + good_fit_res = sp_stats.goodness_of_fit( + sp_stats.norm, sample_list, known_params={"loc": mean, "scale": deviation} + ) + assert good_fit_res.pvalue > 0.05 + def test_legacy_randint(self): testArray = ak.random.randint(0, 10, 5) assert isinstance(testArray, ak.pdarray) diff --git a/arkouda-env-dev.yml b/arkouda-env-dev.yml index 39368c2f50..7f456b9d26 100644 --- a/arkouda-env-dev.yml +++ b/arkouda-env-dev.yml @@ -1,46 +1,50 @@ -name: arkouda-dev -channels: - - conda-forge - - defaults -dependencies: - - python>=3.8 # minimum 3.8 - - numpy>=1.24.1 - - pandas>=1.4.0,!=2.2.0 - - pyzmq>=20.0.0 - - tabulate - - pyfiglet - - versioneer - - matplotlib>=3.3.2 - - h5py>=3.7.0 - - hdf5==1.12.2 - - pip - - types-tabulate - - pytables>=3.7.0 - - pyarrow - - libiconv - - libidn2 - - jupyter - - scipy - - # Developer dependencies - - pexpect - - pytest>=6.0 - - pytest-env - - Sphinx>=5.1.1 - - sphinx-argparse - - sphinx-autoapi - - typed-ast - - flake8 - - mypy>=0.931,<0.990 - - black - - isort - - pytest-json-report - - pytest-benchmark - - - pip: - # Developer dependencies - - typeguard==2.10.0 - - furo # sphinx theme - - myst-parser - - linkify-it-py - +name: arkouda-dev +channels: + - conda-forge + - defaults +dependencies: + - python>=3.8 # minimum 3.8 + - numpy>=1.24.1 + - pandas>=1.4.0,!=2.2.0 + - pyzmq>=20.0.0 + - tabulate + - pyfiglet + - versioneer + - matplotlib>=3.3.2 + - h5py>=3.7.0 + - hdf5==1.12.2 + - pip + - types-tabulate + - pytables>=3.7.0 + - pyarrow + - libiconv + - libidn2 + - jupyter + - scipy + + # Developer dependencies + - pexpect + - pytest>=6.0 + - pytest-env + - Sphinx>=5.1.1 + - sphinx-argparse + - sphinx-autoapi + - sphinx-design + - sphinx-autodoc-typehints + - typed-ast + - flake8 + - mypy>=0.931,<0.990 + - black + - isort + - pytest-json-report + - pytest-benchmark + - mathjax + + - pip: + # Developer dependencies + - typeguard==2.10.0 + - furo # sphinx theme + - myst-parser + - linkify-it-py + - sphinx-autopackagesummary + diff --git a/arkouda/random/_generator.py b/arkouda/random/_generator.py index 82ce981ebe..72fa3f2ac7 100644 --- a/arkouda/random/_generator.py +++ b/arkouda/random/_generator.py @@ -209,6 +209,55 @@ def integers(self, low, high=None, size=None, dtype=akint64, endpoint=False): self._state += size return create_pdarray(rep_msg) + def normal(self, loc=0.0, scale=1.0, size=None): + r""" + Draw samples from a normal (Gaussian) distribution + + Parameters + ---------- + loc: float or pdarray of floats, optional + Mean of the distribution. Default of 0. + + scale: float or pdarray of floats, optional + Standard deviation of the distribution. Must be non-negative. Default of 1. + + size: numeric_scalars, optional + Output shape. Default is None, in which case a single value is returned. + + Notes + ----- + The probability density for the Gaussian distribution is: + + .. math:: + p(x) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} + + where :math:`\mu` is the mean and :math:`\sigma` the standard deviation. + The square of the standard deviation, :math:`\sigma^2`, is called the variance. + + Returns + ------- + pdarray + Pdarray of floats (unless size=None, in which case a single float is returned). + + See Also + -------- + standard_normal + uniform + + Examples + -------- + >>> ak.random.default_rng(17).normal(3, 2.5, 10) + array([2.3673425816523577 4.0532529435624589 2.0598322696795694]) + """ + if size is None: + # delegate to numpy when return size is 1 + return self._np_generator.standard_normal(loc, scale) + + if (scale < 0).any() if isinstance(scale, pdarray) else scale < 0: + raise TypeError("scale must be non-negative.") + + return loc + scale * self.standard_normal(size=size) + def random(self, size=None): """ Return random floats in the half-open interval [0.0, 1.0). @@ -261,7 +310,7 @@ def random(self, size=None): return create_pdarray(rep_msg) def standard_normal(self, size=None): - """ + r""" Draw samples from a standard Normal distribution (mean=0, stdev=1). Parameters @@ -276,10 +325,14 @@ def standard_normal(self, size=None): Notes ----- - For random samples from :math:`N(\\mu, \\sigma^2)`, use: + For random samples from :math:`N(\mu, \sigma^2)`, either call `normal` or do: - ``(sigma * standard_normal(size)) + mu`` + .. math:: + (\sigma \cdot \texttt{standard_normal}(size)) + \mu + See Also + -------- + normal Examples -------- @@ -289,12 +342,20 @@ def standard_normal(self, size=None): >>> rng.standard_normal(3) array([0.8797352989638163, -0.7085325853376141, 0.021728052940979934]) # random """ - from arkouda.random._legacy import standard_normal - if size is None: # delegate to numpy when return size is 1 return self._np_generator.standard_normal() - return standard_normal(size=size, seed=self._seed) + rep_msg = generic_msg( + cmd="standardNormalGenerator", + args={ + "name": self._name_dict[akfloat64], + "size": size, + "state": self._state, + }, + ) + # since we generate 2*size uniform samples for box-muller transform + self._state += (size * 2) + return create_pdarray(rep_msg) def shuffle(self, x): """ diff --git a/pydoc/usage/random.rst b/pydoc/usage/random.rst index 776cf80060..62feb4fcc0 100644 --- a/pydoc/usage/random.rst +++ b/pydoc/usage/random.rst @@ -17,10 +17,26 @@ To create a ``Generator`` use the ``default_rng`` constructor Features ========== +choice +--------- +.. autofunction:: arkouda.random.Generator.choice + integers --------- .. autofunction:: arkouda.random.Generator.integers +normal +--------- +.. autofunction:: arkouda.random.Generator.normal + +permutation +--------- +.. autofunction:: arkouda.random.Generator.permutation + +shuffle +--------- +.. autofunction:: arkouda.random.Generator.shuffle + random --------- .. autofunction:: arkouda.random.Generator.random diff --git a/src/RandArray.chpl b/src/RandArray.chpl index f790e88c46..b98b32eb16 100644 --- a/src/RandArray.chpl +++ b/src/RandArray.chpl @@ -69,6 +69,9 @@ module RandArray { } proc fillNormal(ref a:[?D] real, const seedStr:string="None") throws { + // uses Box–Muller transform + // generates values drawn from the standard normal distribution using + // 2 uniformly distributed random numbers var u1 = makeDistArray(D, real); var u2 = makeDistArray(D, real); if (seedStr.toLower() == "none") { diff --git a/src/RandMsg.chpl b/src/RandMsg.chpl index 82ce71094d..24053e2b5b 100644 --- a/src/RandMsg.chpl +++ b/src/RandMsg.chpl @@ -3,7 +3,7 @@ module RandMsg use ServerConfig; use ArkoudaTimeCompat as Time; - use Math only; + use Math; use Reflection; use ServerErrors; use ServerConfig; @@ -297,6 +297,41 @@ module RandMsg return new MsgTuple(repMsg, MsgType.NORMAL); } + proc standardNormalGeneratorMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws { + const pn = Reflection.getRoutineName(), + name = msgArgs.getValueOf("name"), // generator name + size = msgArgs.get("size").getIntValue(), // population size + state = msgArgs.get("state").getIntValue(), // rng state + rname = st.nextName(); + + randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "name: %? size %i state %i".doFormat(name, size, state)); + + st.checkTable(name); + + var generatorEntry: borrowed GeneratorSymEntry(real) = toGeneratorSymEntry(st.lookup(name), real); + ref rng = generatorEntry.generator; + if state != 1 { + // you have to skip to one before where you want to be + rng.skipTo(state-1); + } + + // uses Box–Muller transform + // generates values drawn from the standard normal distribution using + // 2 uniformly distributed random numbers + var u1 = makeDistArray(size, real); + var u2 = makeDistArray(size, real); + rng.fill(u1); + rng.fill(u2); + + var standNorm = sqrt(-2*log(u1))*cos(2*pi*u2); + st.addEntry(rname, createSymEntry(standNorm)); + + var repMsg = "created " + st.attrib(rname); + randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); + return new MsgTuple(repMsg, MsgType.NORMAL); + } + proc segmentedSampleMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws { const pn = Reflection.getRoutineName(), genName = msgArgs.getValueOf("genName"), // generator name @@ -567,6 +602,7 @@ module RandMsg registerFunction("randomNormal", randomNormalMsg, getModuleName()); registerFunction("createGenerator", createGeneratorMsg, getModuleName()); registerFunction("uniformGenerator", uniformGeneratorMsg, getModuleName()); + registerFunction("standardNormalGenerator", standardNormalGeneratorMsg, getModuleName()); registerFunction("segmentedSample", segmentedSampleMsg, getModuleName()); registerFunction("choice", choiceMsg, getModuleName()); registerFunction("permutation", permutationMsg, getModuleName()); diff --git a/tests/random_test.py b/tests/random_test.py index 237fcee0c4..ed1f4a39e4 100644 --- a/tests/random_test.py +++ b/tests/random_test.py @@ -1,6 +1,7 @@ import numpy as np from base_test import ArkoudaTest from context import arkouda as ak +from scipy import stats as sp_stats from arkouda.scipy import chisquare as akchisquare @@ -212,6 +213,46 @@ def test_choice_flags(self): print(f"Failure with seed:\n{seed}") self.assertTrue(res) + def test_normal(self): + rng = ak.random.default_rng(17) + both_scalar = rng.normal(loc=10, scale=2, size=10).to_list() + scale_scalar = rng.normal(loc=ak.array([0, 10, 20]), scale=1, size=3).to_list() + loc_scalar = rng.normal(loc=10, scale=ak.array([1, 2, 3]), size=3).to_list() + both_array = rng.normal(loc=ak.array([0, 10, 20]), scale=ak.array([1, 2, 3]), size=3).to_list() + + # redeclare rng with same seed to test reproducibility + rng = ak.random.default_rng(17) + self.assertEqual(rng.normal(loc=10, scale=2, size=10).to_list(), both_scalar) + self.assertEqual(rng.normal(loc=ak.array([0, 10, 20]), scale=1, size=3).to_list(), scale_scalar) + self.assertEqual(rng.normal(loc=10, scale=ak.array([1, 2, 3]), size=3).to_list(), loc_scalar) + self.assertEqual( + rng.normal(loc=ak.array([0, 10, 20]), scale=ak.array([1, 2, 3]), size=3).to_list(), + both_array, + ) + + def test_normal_hypothesis_testing(self): + # I tested this many times without a set seed, but with no seed + # it's expected to fail one out of every ~20 runs given a pval limit of 0.05. + rng = ak.random.default_rng(43) + num_samples = 10**4 + + mean = rng.uniform(-10, 10) + deviation = rng.uniform(0, 10) + sample = rng.normal(loc=mean, scale=deviation, size=num_samples) + sample_list = sample.to_list() + + # first test if samples are normal at all + _, pval = sp_stats.normaltest(sample_list) + + # if pval <= 0.05, the difference from the expected distribution is significant + self.assertTrue(pval > 0.05) + + # second goodness of fit test against the distribution with proper mean and std + good_fit_res = sp_stats.goodness_of_fit( + sp_stats.norm, sample_list, known_params={"loc": mean, "scale": deviation} + ) + self.assertTrue(good_fit_res.pvalue > 0.05) + def test_legacy_randint(self): testArray = ak.random.randint(0, 10, 5) self.assertIsInstance(testArray, ak.pdarray)