From 342535385d8ac90f9af8e6c6563ab29abf8e2442 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 27 Oct 2023 12:47:42 +0200 Subject: [PATCH 01/12] Added reader and help functions to convert scikit-hep histogram to Tables --- examples/reading_scikihep_histograms.ipynb | 374 +++++++++++++++++++++ hepdata_lib/hist_utils.py | 213 ++++++++++++ tests/test_histread.py | 54 +++ 3 files changed, 641 insertions(+) create mode 100644 examples/reading_scikihep_histograms.ipynb create mode 100644 hepdata_lib/hist_utils.py create mode 100644 tests/test_histread.py diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb new file mode 100644 index 00000000..d559eb30 --- /dev/null +++ b/examples/reading_scikihep_histograms.ipynb @@ -0,0 +1,374 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reading histograms\n", + "\n", + "For the new python-based frameworks, another common task would be to translate\n", + "histogram in the [`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/)\n", + "package into the HEPData format. The functions in the `hepdata_lib` will help\n", + "you with that, and this notebook will demonstrate how to do that.\n", + "\n", + "As explained in the [Getting started notebook](Getting_started.ipynb), a\n", + "`Submission` needs to exist or be created. Here, we'll just create one without\n", + "any additional information:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome to JupyROOT 6.28/04\n" + ] + } + ], + "source": [ + "from hepdata_lib import Submission\n", + "\n", + "submission = Submission()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The common use-case `scikit-hep` histograms is to allow for after-the-fact\n", + "slicing and grouping from a main histogram. 
Let us first generate a fake\n", + "histogram that may appear in common histograms, as well as a common slicing\n", + "routine" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Hist(\n", + " StrCategory(['data', 'QCD', 'ttbar'], name='dataset'),\n", + " IntCategory([-1, 0, 4, 5], name='flavor'),\n", + " Regular(60, -3, 3, name='eta'),\n", + " Regular(20, 0, 500, name='pt'),\n", + " storage=Weight()) # Sum: WeightedSum(value=221221, variance=123802) (WeightedSum(value=256973, variance=143935) with flow)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import hist\n", + "import numpy as np\n", + "\n", + "rng = np.random.default_rng(seed=123_456_789)\n", + "\n", + "h = hist.Hist(\n", + " hist.axis.StrCategory([\"data\", \"QCD\", \"ttbar\"], name=\"dataset\"),\n", + " hist.axis.IntCategory([-1, 0, 4, 5], name=\"flavor\"),\n", + " hist.axis.Regular(60, -3, +3, name=\"eta\"),\n", + " hist.axis.Regular(20, 0, 500, name=\"pt\"),\n", + " storage=hist.storage.Weight(),\n", + ")\n", + "\n", + "h.fill( ## For mock data\n", + " dataset=\"data\",\n", + " flavor=-1,\n", + " eta=rng.normal(0, 2.0, size=123_456),\n", + " pt=rng.exponential(100, size=123_456),\n", + ")\n", + "h.fill( ## For Mock QCD\n", + " dataset=\"QCD\",\n", + " flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.8, 0.15, 0.05]),\n", + " eta=rng.normal(0.0, 2.0, size=1_000_000),\n", + " pt=rng.exponential(100, size=1_000_000),\n", + " weight=0.123456 * 2 * rng.random(size=1_000_000),\n", + ")\n", + "h.fill( ## For mock ttbar\n", + " dataset=\"ttbar\",\n", + " flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.45, 0.1, 0.45]),\n", + " eta=rng.normal(0.0, 1.5, size=1_000_000),\n", + " pt=rng.exponential(200, size=1_000_000),\n", + " weight=0.01 * 2 * rng.random(size=1_000_000),\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example of 
manual processing to 1D array\n", + "\n", + "Let use create a simple slicing routine to get the various histograms of\n", + "interest, then use the most versatile function, the\n", + "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n", + "compatible with variable creation." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", + " (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n", + " (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n", + " (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n", + " (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n", + " dtype=[('f0', ' Dict[str, numpy.ndarray]: + """ + Converting the scikit-hep histogram in to a dictionary of numpy arrays that + can be used for hepdata_lib Variable and Uncertainty declaration. + + For all axes define in the histogram, a `hepdata_lib.Variable` with + `is_independent=True` should be declared. The `values` of this variable will + be stored in the return dictionary following the axes names. + + Overflow and underflow bin will be handled using a single flag for all axes, + so be sure to declare/modify histogram axes properties according to your + needs. + + The storage content will be returned as is, so additional uncertainty + processing will need to be handled by the user using the return values. 
+ """ + axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in h.axes] + axes_entries = numpy.meshgrid(*axes_entries) + + ## Getting axes return values + readout = {ax.name: axes_entries[idx].flatten() for idx, ax in enumerate(h.axes)} + + ## Getting the histogram return values + view = h.view(flow=flow) + + _storage_keys = { + hist.storage.Weight: ["value", "variance"], + hist.storage.Mean: ["value", ""], + } + + if h.storage_type is hist.storage.Int64 or h.storage_type is hist.storage.Double: + readout["hist_value"] = view.flatten() + elif h.storage_type in _storage_keys: + for key in _storage_keys[h.storage_type]: + readout["hist_" + key] = view[key].flatten() + else: + raise NotImplementedError( + f"Storage type {h.storage_type} currently not implemented" + ) + + return readout + + +def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: + """ + Given an axis array, return the bin entries and a numpy array. + + For continuous axes, the return will be a Nx2 array of bin edge pairs. For + categorical axes, the return will be a N array of bin content values. If the + flow is set to true, the function will also add the overflow/underflow bins + according to the settings found in axis.traits. For categorical axis, this + will include an extra `__UNDETERMINED__` entry or a +1 entry. 
+ """ + + ## Getting the entries as a simple list + entries = [x for x in axis] + + ## Adding overflow bin + if flow and axis.traits.overflow: + if isinstance(axis, hist.axis.StrCategory): + entries.append("__UNDETERMINED__") + elif isinstance(axis, hist.axis.IntCategory): + entries.append(entries[-1] + 1) + else: + entries.append((axis.edges[-1], numpy.inf)) + + ## Adding underflow bin + if flow and axis.traits.underflow: + entries = [(-numpy.inf, axis.edges[0])] + entries + + ## Converting to numpy array + if axis.traits.continuous: + return numpy.array(entries, dtype="f,f") + else: + return numpy.array(entries) + + +def hist_as_variable( + var_name, + h: hist.Hist, + flow: bool = False, + uncertainty: Optional[Dict[str, Union[str, hist.Hist, numpy.ndarray]]] = None, + **kwargs, +) -> Variable: + """ + Returning this histogram entries as a Variable object, with a simpler + interface for modifying uncertainty + """ + if uncertainty is None: + uncertainty = {} + + readout = read_hist(h, flow=flow) + var = Variable( + var_name, + is_independent=False, + is_binned=False, + **kwargs, + ) + var.values = readout["hist_value"] + + def _make_unc_array(x): + if isinstance(x, float): + return numpy.ones_like(readout["hist_value"]) * x + elif isinstance(x, numpy.ndarray): + return x + elif isinstance(x, hist.Hist): + return read_hist(x, flow=flow)["hist_value"] + else: + raise NotImplementedError(f"Unknown uncertainty format! 
{type(x)}") + + for unc_name, unc_proc in uncertainty.items(): + is_symmetric = None + if isinstance(unc_proc, str): + if unc_proc == "poisson_sym": # Symmetric poisson uncertainty + is_symmetric = True + arr = _make_poisson_unc_array(readout, is_symmetric) + elif unc_proc == "poisson_asym": # Asymmetric poisson uncertainty + is_symmetric = False + arr = _make_poisson_unc_array(readout, is_symmetric) + else: + raise NotImplementedError(f"Unknown uncertainty process {unc_proc}") + elif isinstance(unc_proc, tuple): # Asymmetric uncertainty + is_symmetric = False # + assert len(unc_proc) == 2, "Asymmetric uncertainty can only have 2 entries" + lo, up = _make_unc_array(unc_proc[0]), _make_unc_array(unc_proc[1]) + arr = [x for x in zip(lo, up)] + else: # Assuming symmetric error + is_symmetric = True + arr = _make_unc_array(unc_proc) + + assert len(arr) == len(readout["hist_value"]), "Mismatch array formats" + + unc_var = Uncertainty(unc_name, is_symmetric=is_symmetric) + unc_var.values = arr + var.add_uncertainty(unc_var) + + return var + + +def _make_poisson_unc_array( + readout: Dict[str, numpy.ndarray], symmetric: bool +) -> numpy.ndarray: + if symmetric: + if "hist_variance" not in readout.keys(): + numpy.sqrt(readout["hist_value"]) + else: # Effective number of events + sw, sw2 = readout["hist_value"], readout["hist_variance"] + n_events = numpy.divide( + sw**2, sw2, out=numpy.zeros_like(sw), where=(sw2 != 0) + ) + rel_unc = numpy.divide( + numpy.sqrt(n_events), + n_events, + out=numpy.zeros_like(sw), + where=(n_events != 0), + ) + return sw * rel_unc + return numpy.sqrt(n_events) + else: + sw, sw2 = readout["hist_value"], readout["hist_value"] + if "hist_variance" in readout.keys(): + sw2 = readout["hist_variance"] + lo, up = hist.intervals.poisson_interval(sw, sw2) + lo, up = lo - sw, up - sw + return [x for x in zip(lo, up)] + + +def create_hist_base_table( + table_name: str, + h: hist.Hist, + flow: bool = False, + axes_rename: Optional[Dict[str, str]] = None, 
# Additional axes proces + axes_units: Optional[Dict[str, str]] = None, # Additional axes proces +) -> Table: + """ + Preparing the table based on hist, allows for the additional exclusion of + axes via a list of string names + """ + if axes_rename is None: + axes_rename = {} + if axes_units is None: + axes_units = {} + table = Table(table_name) + + readout = read_hist(h) + + for ax in h.axes: + var_name = ax.name + if ax.name in axes_rename: + var_name = axes_rename[ax.name] + elif ax.label: + var_name = ax.label + var = Variable( + var_name, + is_independent=True, + is_binned=ax.traits.continuous, + units=axes_units.get(ax.name, ""), + ) + var.values = readout[ax.name] + table.add_variable(var) + + return table diff --git a/tests/test_histread.py b/tests/test_histread.py new file mode 100644 index 00000000..2e5e49a0 --- /dev/null +++ b/tests/test_histread.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +"""Test scikit-hep reading""" +from unittest import TestCase +import numpy as np +import hist +from hepdata_lib.hist_utils import * + + +class TestHistUtils(TestCase): + """Test the hist_utils functions.""" + + base_hist = hist.Hist( + hist.axis.StrCategory(["data", "QCD", "ttbar"], name="dataset"), + hist.axis.IntCategory([-1, 0, 4, 5], name="flavor"), + hist.axis.Regular(60, -3, +3, name="eta"), + hist.axis.Regular(50, 0, 500, name="pt"), + storage=hist.storage.Weight(), + ) + rng = np.random.default_rng(seed=123_456_789) + + base_hist.fill( ## For mock data + dataset="data", + flavor=-1, + eta=rng.normal(0, 2.0, size=123_456), + pt=rng.exponential(100, size=123_456), + ) + base_hist.fill( ## For Mock QCD + dataset="QCD", + flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.8, 0.15, 0.05]), + eta=rng.normal(0.0, 2.0, size=1_000_000), + pt=rng.exponential(100, size=1_000_000), + weight=0.123456 * 2 * rng.random(size=1_000_000), + ) + base_hist.fill( ## For mock ttbar + dataset="ttbar", + flavor=rng.choice([0, 4, 5], size=1_000_000, p=[0.45, 0.1, 0.45]), + 
eta=rng.normal(0.0, 1.5, size=1_000_000), + pt=rng.exponential(200, size=1_000_000), + weight=0.01 * 2 * rng.random(size=1_000_000), + ) + + def test_default_read(self): + try: + readout = read_hist(TestHistUtils.base_hist) + except: + self.fail("Histogram reading raised an unexpected exception.") + + # Checking dimension compatibility + self.assertTrue(len(readout["dataset"]) == len(readout["flavor"])) + self.assertTrue(len(readout["dataset"]) == len(readout["eta"])) + self.assertTrue(len(readout["dataset"]) == len(readout["pt"])) + + # Clean up + self.doCleanups() From b6a0d9aa415941cf3105dc81a02b65d964cd6df8 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 27 Oct 2023 16:21:28 +0200 Subject: [PATCH 02/12] Updated documentation and dependencies --- README.md | 3 + examples/reading_scikihep_histograms.ipynb | 79 ++++++++++++---------- hepdata_lib/hist_utils.py | 42 ++++++++++-- requirements.txt | 1 + 4 files changed, 82 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index ce20b896..7fcac52c 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,9 @@ There are a few more examples available that can directly be run using the [bind - [Reading TGraph and TGraphError from '.C' files](https://github.com/HEPData/hepdata_lib/blob/main/examples/read_c_file.ipynb) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/read_c_file.ipynb)

+- [Preparing scikit-hep histograms](https://github.com/HEPData/hepdata_lib/blob/main/examples/reading_scikihep_histograms.ipynb)
+[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/reading_scikihep_histograms.ipynb)
+

## External dependencies diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb index d559eb30..7334b5cd 100644 --- a/examples/reading_scikihep_histograms.ipynb +++ b/examples/reading_scikihep_histograms.ipynb @@ -6,14 +6,15 @@ "source": [ "# Reading histograms\n", "\n", - "For the new python-based frameworks, another common task would be to translate\n", - "histogram in the [`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/)\n", - "package into the HEPData format. The functions in the `hepdata_lib` will help\n", - "you with that, and this notebook will demonstrate how to do that.\n", + "For the new python-based frameworks, another common format would needs\n", + "translation are histogram in the\n", + "[`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/). The functions in\n", + "the `hepdata_lib.hist_utils` will help you with that, and this notebook will\n", + "demonstrate how to do that.\n", "\n", "As explained in the [Getting started notebook](Getting_started.ipynb), a\n", "`Submission` needs to exist or be created. Here, we'll just create one without\n", - "any additional information:" + "any additional information.\n" ] }, { @@ -39,10 +40,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The common use-case `scikit-hep` histograms is to allow for after-the-fact\n", - "slicing and grouping from a main histogram. Let us first generate a fake\n", + "The common use-case for `scikit-hep` histograms is to allow for after-the-fact\n", + "slicing and grouping from a primary histogram. 
Let us first generate a fake\n", "histogram that may appear in common histograms, as well as a common slicing\n", - "routine" + "routine\n" ] }, { @@ -109,9 +110,9 @@ "## Example of manual processing to 1D array\n", "\n", "Let use create a simple slicing routine to get the various histograms of\n", - "interest, then use the most versatile function, the\n", + "interest, then use the most general function, the\n", "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n", - "compatible with variable creation." + "compatible with `hepdata_lib.Variable` declaration.\n" ] }, { @@ -123,27 +124,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", + "{'hist_value': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", + " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", + " 485., 401., 294., 230.]),\n", + " 'hist_variance': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", + " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", + " 485., 401., 294., 230.]),\n", + " 'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", " (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n", " (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n", " (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n", " (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n", - " dtype=[('f0', ' numpy.ndarray: categorical axes, the return will be a N array of bin content values. If the flow is set to true, the function will also add the overflow/underflow bins according to the settings found in axis.traits. For categorical axis, this - will include an extra `__UNDETERMINED__` entry or a +1 entry. + will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or an +1 + entry (for IntCategory). 
""" ## Getting the entries as a simple list @@ -95,7 +96,28 @@ def hist_as_variable( ) -> Variable: """ Returning this histogram entries as a Variable object, with a simpler - interface for modifying uncertainty + interface for automatically generating values uncertainties. + + The `h` and `flow` inputs are passed directly to the `read_hist` method to + extract the value to be used for the variable. + + The `uncertainty` is a dictionary defining how uncertainties should be + defined. Dictionary keys are used as the name of the uncertainty, while the + value defines how the uncertainty should be constructed. This can either be: + + - `str`: either "poisson_asym" or "poisson_sym", indicating to extract + Poisson uncertainty directly from the histogram values. (Either the + asymmetric Garwood interval defined by `hist.intervals` or a simply, + symmetric `sqrt(n)`.) + - `float`: A flat uncertainty to be used on all bins. + - `numpy.ndarray`: An array indicating the uncertainty for each bin. The + array should be compatible with the output of `read_hist['hist_values']` + - `hist.Hist`: The histogram with bin values indicating the uncertainty to + be used for each bin. The histogram should be compatible with the input + histogram. + - `tuple(T,T)` where `T` can either be a `float`, `numpy.ndarray` or + `hist.Hist`. This is used to indicate asymmetric uncertainties, following + the lower/upper ordering convention of hepdata_lib """ if uncertainty is None: uncertainty = {} @@ -151,9 +173,15 @@ def _make_unc_array(x): def _make_poisson_unc_array( readout: Dict[str, numpy.ndarray], symmetric: bool ) -> numpy.ndarray: + """ + Given the results of `read_hist`, extract the Poisson uncertainty using + hist.intervals. 
Automatically detecting the histogram storage type to handle + weighted uncertainties + """ if symmetric: if "hist_variance" not in readout.keys(): numpy.sqrt(readout["hist_value"]) + return numpy.sqrt(n_events) else: # Effective number of events sw, sw2 = readout["hist_value"], readout["hist_variance"] n_events = numpy.divide( @@ -166,7 +194,6 @@ def _make_poisson_unc_array( where=(n_events != 0), ) return sw * rel_unc - return numpy.sqrt(n_events) else: sw, sw2 = readout["hist_value"], readout["hist_value"] if "hist_variance" in readout.keys(): @@ -180,12 +207,13 @@ def create_hist_base_table( table_name: str, h: hist.Hist, flow: bool = False, - axes_rename: Optional[Dict[str, str]] = None, # Additional axes proces - axes_units: Optional[Dict[str, str]] = None, # Additional axes proces + axes_rename: Optional[Dict[str, str]] = None, + axes_units: Optional[Dict[str, str]] = None, ) -> Table: """ - Preparing the table based on hist, allows for the additional exclusion of - axes via a list of string names + Preparing the table based on hist. This constructs just the histogram axis + as the table variable. Histogram entries should be added via the + `hist_as_variable` method. 
""" if axes_rename is None: axes_rename = {} diff --git a/requirements.txt b/requirements.txt index eaf80f96..c1a8d716 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ numpy PyYAML>=4.0 future +hist hepdata-validator>=0.3.5 From 6ab8cb662db411f284a2d44e1756e1153c6a6edf Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 11:42:25 +0100 Subject: [PATCH 03/12] Fixing complaints from pylint --- hepdata_lib/hist_utils.py | 106 ++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py index f5a41d88..e8bbc447 100644 --- a/hepdata_lib/hist_utils.py +++ b/hepdata_lib/hist_utils.py @@ -1,15 +1,15 @@ """hepdata_lib utilities for interacting with scikit-hep hist histograms""" -from typing import Tuple, Optional, Union, List, Dict +from typing import Optional, Union, Dict import numpy -from hepdata_lib import Table, Variable, Uncertainty - # scikit-hep hist package import hist import hist.intervals +from hepdata_lib import Table, Variable, Uncertainty + -def read_hist(h: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: +def read_hist(histo: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: """ Converting the scikit-hep histogram in to a dictionary of numpy arrays that can be used for hepdata_lib Variable and Uncertainty declaration. @@ -25,28 +25,33 @@ def read_hist(h: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: The storage content will be returned as is, so additional uncertainty processing will need to be handled by the user using the return values. 
""" - axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in h.axes] + axes_entries = [_get_histaxis_array(ax, flow=flow) for ax in histo.axes] axes_entries = numpy.meshgrid(*axes_entries) ## Getting axes return values - readout = {ax.name: axes_entries[idx].flatten() for idx, ax in enumerate(h.axes)} + readout = { + ax.name: axes_entries[idx].flatten() for idx, ax in enumerate(histo.axes) + } ## Getting the histogram return values - view = h.view(flow=flow) + view = histo.view(flow=flow) _storage_keys = { hist.storage.Weight: ["value", "variance"], hist.storage.Mean: ["value", ""], } - if h.storage_type is hist.storage.Int64 or h.storage_type is hist.storage.Double: + if ( + histo.storage_type is hist.storage.Int64 + or histo.storage_type is hist.storage.Double + ): readout["hist_value"] = view.flatten() - elif h.storage_type in _storage_keys: - for key in _storage_keys[h.storage_type]: + elif histo.storage_type in _storage_keys: + for key in _storage_keys[histo.storage_type]: readout["hist_" + key] = view[key].flatten() else: raise NotImplementedError( - f"Storage type {h.storage_type} currently not implemented" + f"Storage type {histo.storage_type} currently not implemented" ) return readout @@ -65,7 +70,7 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: """ ## Getting the entries as a simple list - entries = [x for x in axis] + entries = list(axis) ## Adding overflow bin if flow and axis.traits.overflow: @@ -82,14 +87,16 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: ## Converting to numpy array if axis.traits.continuous: - return numpy.array(entries, dtype="f,f") + entries = numpy.array(entries, dtype="f,f") else: - return numpy.array(entries) + entries = numpy.array(entries) + + return entries def hist_as_variable( var_name, - h: hist.Hist, + histo: hist.Hist, flow: bool = False, uncertainty: Optional[Dict[str, Union[str, hist.Hist, numpy.ndarray]]] = None, **kwargs, @@ -122,7 +129,7 @@ def hist_as_variable( if uncertainty 
is None: uncertainty = {} - readout = read_hist(h, flow=flow) + readout = read_hist(histo, flow=flow) var = Variable( var_name, is_independent=False, @@ -131,15 +138,14 @@ def hist_as_variable( ) var.values = readout["hist_value"] - def _make_unc_array(x): - if isinstance(x, float): - return numpy.ones_like(readout["hist_value"]) * x - elif isinstance(x, numpy.ndarray): - return x - elif isinstance(x, hist.Hist): - return read_hist(x, flow=flow)["hist_value"] - else: - raise NotImplementedError(f"Unknown uncertainty format! {type(x)}") + def _make_unc_array(unc_val): + if isinstance(unc_val, float): + return numpy.ones_like(readout["hist_value"]) * unc_val + if isinstance(unc_val, numpy.ndarray): + return unc_val + if isinstance(unc_val, hist.Hist): + return read_hist(unc_val, flow=flow)["hist_value"] + raise NotImplementedError(f"Unknown uncertainty format! {type(unc_val)}") for unc_name, unc_proc in uncertainty.items(): is_symmetric = None @@ -155,8 +161,8 @@ def _make_unc_array(x): elif isinstance(unc_proc, tuple): # Asymmetric uncertainty is_symmetric = False # assert len(unc_proc) == 2, "Asymmetric uncertainty can only have 2 entries" - lo, up = _make_unc_array(unc_proc[0]), _make_unc_array(unc_proc[1]) - arr = [x for x in zip(lo, up)] + _lo, _up = _make_unc_array(unc_proc[0]), _make_unc_array(unc_proc[1]) + arr = list(zip(_lo, _up)) else: # Assuming symmetric error is_symmetric = True arr = _make_unc_array(unc_proc) @@ -180,32 +186,34 @@ def _make_poisson_unc_array( """ if symmetric: if "hist_variance" not in readout.keys(): - numpy.sqrt(readout["hist_value"]) - return numpy.sqrt(n_events) + n_events = numpy.sqrt(readout["hist_value"]) + unc_arr = numpy.sqrt(n_events) else: # Effective number of events - sw, sw2 = readout["hist_value"], readout["hist_variance"] + _sw, _sw2 = readout["hist_value"], readout["hist_variance"] n_events = numpy.divide( - sw**2, sw2, out=numpy.zeros_like(sw), where=(sw2 != 0) + _sw**2, _sw2, out=numpy.zeros_like(_sw), where=(_sw2 != 
0) ) rel_unc = numpy.divide( numpy.sqrt(n_events), n_events, - out=numpy.zeros_like(sw), + out=numpy.zeros_like(_sw), where=(n_events != 0), ) - return sw * rel_unc + unc_arr = _sw * rel_unc else: - sw, sw2 = readout["hist_value"], readout["hist_value"] + _sw, _sw2 = readout["hist_value"], readout["hist_value"] if "hist_variance" in readout.keys(): - sw2 = readout["hist_variance"] - lo, up = hist.intervals.poisson_interval(sw, sw2) - lo, up = lo - sw, up - sw - return [x for x in zip(lo, up)] + _sw2 = readout["hist_variance"] + _lo, _up = hist.intervals.poisson_interval(_sw, _sw2) + _lo, _up = _lo - _sw, _up - _sw + unc_arr = list(zip(_lo, _up)) + + return unc_arr def create_hist_base_table( table_name: str, - h: hist.Hist, + histo: hist.Hist, flow: bool = False, axes_rename: Optional[Dict[str, str]] = None, axes_units: Optional[Dict[str, str]] = None, @@ -221,21 +229,21 @@ def create_hist_base_table( axes_units = {} table = Table(table_name) - readout = read_hist(h) + readout = read_hist(histo, flow=flow) - for ax in h.axes: - var_name = ax.name - if ax.name in axes_rename: - var_name = axes_rename[ax.name] - elif ax.label: - var_name = ax.label + for axis in histo.axes: + var_name = axis.name + if axis.name in axes_rename: + var_name = axes_rename[axis.name] + elif axis.label: + var_name = axis.label var = Variable( var_name, is_independent=True, - is_binned=ax.traits.continuous, - units=axes_units.get(ax.name, ""), + is_binned=axis.traits.continuous, + units=axes_units.get(axis.name, ""), ) - var.values = readout[ax.name] + var.values = readout[axis.name] table.add_variable(var) return table From 7ddb4556bafc255b5726daea5efb8cc93f7c5074 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 13:33:36 +0100 Subject: [PATCH 04/12] Added unit test for hist_utils functions --- tests/test_histread.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tests/test_histread.py b/tests/test_histread.py index 
2e5e49a0..2e6d1742 100644 --- a/tests/test_histread.py +++ b/tests/test_histread.py @@ -3,6 +3,7 @@ from unittest import TestCase import numpy as np import hist +import hist.intervals from hepdata_lib.hist_utils import * @@ -40,6 +41,7 @@ class TestHistUtils(TestCase): ) def test_default_read(self): + # Default read with no projection try: readout = read_hist(TestHistUtils.base_hist) except: @@ -50,5 +52,101 @@ def test_default_read(self): self.assertTrue(len(readout["dataset"]) == len(readout["eta"])) self.assertTrue(len(readout["dataset"]) == len(readout["pt"])) + def test_projection_read(self): + # Default read with simple projection + try: + r1 = read_hist(TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}]) + r2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) + except: + self.fail("Histogram reading raised an unexpected exception.") + # Checking dimension compatibility + self.assertTrue(len(r1["eta"]) == len(r1["pt"])) + self.assertTrue(len(r1["eta"]) == len(r2["pt"])) + self.assertTrue(np.all(r1["eta"] == r2["eta"])) + self.assertTrue(np.all(r1["pt"] == r2["pt"])) + # Clean up self.doCleanups() + + def test_uncertainty_generation(self): + h = TestHistUtils.base_hist[{"dataset": "data"}] + q_h = TestHistUtils.base_hist[{"dataset": "QCD"}] + t_h = TestHistUtils.base_hist[{"dataset": "ttbar"}] + + h_arr = h.view(flow=True)["value"].flatten() + unc_arr = np.ones_like(h_arr) * np.random.random(size=h_arr.shape[0]) + try: + r = hist_as_variable( + "testing", + h, + flow=True, + uncertainty={ + "symmetric stat": "poisson_sym", + "asymmetric stat": "poisson_asym", + "symmetric float": 1.5, + "asymmetric float": (1.5, 2.2), + "symmetric array": unc_arr, + "asymmetric array": (-0.8 * unc_arr, unc_arr), + "symmetric histogram": q_h, + "asymmetric histogram": (q_h, t_h), + }, + ) + except: + self.fail("Unexpected exception of automatic uncertainty generation.") + + def check_val(a, b): + return self.assertTrue(np.all(np.isclose(a, b) | 
np.isnan(a) | np.isnan(b))) + + h_arr = h.view(flow=True)["value"].flatten() + check_val(r.values, h_arr) + # Symmetric Poisson + check_val(r.uncertainties[0].values, np.sqrt(h_arr)) + + # Asymmetric Poisson + l, u = hist.intervals.poisson_interval(h_arr) + l, u = l - h_arr, u - h_arr + check_val(r.uncertainties[1].values, list(zip(l, u))) + + # Flat uncertainties + check_val(r.uncertainties[2].values, 1.5) + check_val(r.uncertainties[3].values, (1.5, 2.2)) + + # Array defined uncertainties + check_val(r.uncertainties[4].values, unc_arr) + check_val(r.uncertainties[5].values, list(zip(-0.8 * unc_arr, unc_arr))) + + # Histogram defined uncertainties + q_arr = q_h.view(flow=True)["value"].flatten() + t_arr = t_h.view(flow=True)["value"].flatten() + check_val(r.uncertainties[6].values, q_arr) + check_val(r.uncertainties[7].values, list(zip(q_arr, t_arr))) + + self.doCleanups() + + def test_table_generation(self): + _rename_ = { + "dataset": "Data set", + "flavor": "Jet flavor", + "eta": "Jet $\eta$", + "pt": "Jet $p_{T}$", + } + _units_ = {"pt": "GeV"} + try: + table = create_hist_base_table( + "my_table", + TestHistUtils.base_hist, + axes_rename=_rename_, + axes_units=_units_, + ) + except: + self.fail("Unexpected exception of table generation.") + + readout = read_hist(TestHistUtils.base_hist) + + for idx, (name, new_name) in enumerate(_rename_.items()): + # Checking rename + self.assertTrue(table.variables[idx].name == new_name) + self.assertTrue(table.variables[idx].units == _units_.get(name, "")) + self.assertTrue(len(table.variables[idx].values) == len(readout[name])) + + self.doCleanups() From ab99f26ec475a8dcfb6c365caffeb05ea31af994 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 13:38:38 +0100 Subject: [PATCH 05/12] Updating dependencies --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c1a8d716..8b7dc410 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 
+1,5 @@ numpy PyYAML>=4.0 future -hist +hist[plot] # Required for intervals hepdata-validator>=0.3.5 From 67a9ad47db173336fb9cfe35c4473c04488b2b92 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 13:39:06 +0100 Subject: [PATCH 06/12] Expliciltly listing all dependencies --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8b7dc410..b4f40231 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ numpy PyYAML>=4.0 future -hist[plot] # Required for intervals +hist +scipy hepdata-validator>=0.3.5 From 6ae8b034102d31c9159c9d1671b99a801aa57657 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 16:27:25 +0100 Subject: [PATCH 07/12] Fixed pylint for test scripts --- tests/test_histread.py | 77 ++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/tests/test_histread.py b/tests/test_histread.py index 2e6d1742..93607563 100644 --- a/tests/test_histread.py +++ b/tests/test_histread.py @@ -4,7 +4,7 @@ import numpy as np import hist import hist.intervals -from hepdata_lib.hist_utils import * +from hepdata_lib.hist_utils import read_hist, hist_as_variable, create_hist_base_table class TestHistUtils(TestCase): @@ -41,7 +41,10 @@ class TestHistUtils(TestCase): ) def test_default_read(self): - # Default read with no projection + """ + Ensure basic readout function generates arrays with compatible + dimensions with given the base-level histogram. 
+ """ try: readout = read_hist(TestHistUtils.base_hist) except: @@ -51,34 +54,46 @@ def test_default_read(self): self.assertTrue(len(readout["dataset"]) == len(readout["flavor"])) self.assertTrue(len(readout["dataset"]) == len(readout["eta"])) self.assertTrue(len(readout["dataset"]) == len(readout["pt"])) + self.doCleanups() def test_projection_read(self): + """ + Ensure basic readout function generates arrays with compatible + dimensions with histogram slicing operations. + """ # Default read with simple projection try: - r1 = read_hist(TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}]) - r2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) + read1 = read_hist( + TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}] + ) + read2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) except: self.fail("Histogram reading raised an unexpected exception.") # Checking dimension compatibility - self.assertTrue(len(r1["eta"]) == len(r1["pt"])) - self.assertTrue(len(r1["eta"]) == len(r2["pt"])) - self.assertTrue(np.all(r1["eta"] == r2["eta"])) - self.assertTrue(np.all(r1["pt"] == r2["pt"])) + self.assertTrue(len(read1["eta"]) == len(read1["pt"])) + self.assertTrue(len(read1["eta"]) == len(read2["pt"])) + self.assertTrue(np.all(read1["eta"] == read2["eta"])) + self.assertTrue(np.all(read1["pt"] == read2["pt"])) # Clean up self.doCleanups() def test_uncertainty_generation(self): - h = TestHistUtils.base_hist[{"dataset": "data"}] + """ + Exhaustively testing automatic variable generation with all defined + uncertainty formats + """ + + d_h = TestHistUtils.base_hist[{"dataset": "data"}] q_h = TestHistUtils.base_hist[{"dataset": "QCD"}] t_h = TestHistUtils.base_hist[{"dataset": "ttbar"}] - h_arr = h.view(flow=True)["value"].flatten() - unc_arr = np.ones_like(h_arr) * np.random.random(size=h_arr.shape[0]) + d_arr = d_h.view(flow=True)["value"].flatten() + unc_arr = np.ones_like(d_arr) * np.random.random(size=d_arr.shape[0]) 
try: - r = hist_as_variable( + auto_var = hist_as_variable( "testing", - h, + d_h, flow=True, uncertainty={ "symmetric stat": "poisson_sym", @@ -94,41 +109,45 @@ def test_uncertainty_generation(self): except: self.fail("Unexpected exception of automatic uncertainty generation.") - def check_val(a, b): - return self.assertTrue(np.all(np.isclose(a, b) | np.isnan(a) | np.isnan(b))) + def check_val(arr1, arr2): + return self.assertTrue( + np.all(np.isclose(arr1, arr2) | np.isnan(arr1) | np.isnan(arr2)) + ) - h_arr = h.view(flow=True)["value"].flatten() - check_val(r.values, h_arr) + check_val(auto_var.values, d_arr) # Symmetric Poisson - check_val(r.uncertainties[0].values, np.sqrt(h_arr)) + check_val(auto_var.uncertainties[0].values, np.sqrt(d_arr)) # Asymmetric Poisson - l, u = hist.intervals.poisson_interval(h_arr) - l, u = l - h_arr, u - h_arr - check_val(r.uncertainties[1].values, list(zip(l, u))) + _l, _u = hist.intervals.poisson_interval(d_arr) + _l, _u = _l - d_arr, _u - d_arr + check_val(auto_var.uncertainties[1].values, list(zip(_l, _u))) # Flat uncertainties - check_val(r.uncertainties[2].values, 1.5) - check_val(r.uncertainties[3].values, (1.5, 2.2)) + check_val(auto_var.uncertainties[2].values, 1.5) + check_val(auto_var.uncertainties[3].values, (1.5, 2.2)) # Array defined uncertainties - check_val(r.uncertainties[4].values, unc_arr) - check_val(r.uncertainties[5].values, list(zip(-0.8 * unc_arr, unc_arr))) + check_val(auto_var.uncertainties[4].values, unc_arr) + check_val(auto_var.uncertainties[5].values, list(zip(-0.8 * unc_arr, unc_arr))) # Histogram defined uncertainties q_arr = q_h.view(flow=True)["value"].flatten() t_arr = t_h.view(flow=True)["value"].flatten() - check_val(r.uncertainties[6].values, q_arr) - check_val(r.uncertainties[7].values, list(zip(q_arr, t_arr))) + check_val(auto_var.uncertainties[6].values, q_arr) + check_val(auto_var.uncertainties[7].values, list(zip(q_arr, t_arr))) self.doCleanups() def test_table_generation(self): + """ + 
Base table generation with base histogram + """ _rename_ = { "dataset": "Data set", "flavor": "Jet flavor", - "eta": "Jet $\eta$", - "pt": "Jet $p_{T}$", + "eta": r"Jet $\eta$", + "pt": r"Jet $p_{T}$", } _units_ = {"pt": "GeV"} try: From a9783ee71aef22aba8d429da16d46a8b24015f3a Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 16:55:11 +0100 Subject: [PATCH 08/12] Disabling specific pylint --- tests/test_histread.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_histread.py b/tests/test_histread.py index 93607563..27de29fa 100644 --- a/tests/test_histread.py +++ b/tests/test_histread.py @@ -47,8 +47,10 @@ def test_default_read(self): """ try: readout = read_hist(TestHistUtils.base_hist) + # pylint: disable=W0702 except: self.fail("Histogram reading raised an unexpected exception.") + # pylint: enable=W0702 # Checking dimension compatibility self.assertTrue(len(readout["dataset"]) == len(readout["flavor"])) @@ -67,8 +69,11 @@ def test_projection_read(self): TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}] ) read2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) + # pylint: disable=W0702 except: self.fail("Histogram reading raised an unexpected exception.") + # pylint: enable=W0702 + # Checking dimension compatibility self.assertTrue(len(read1["eta"]) == len(read1["pt"])) self.assertTrue(len(read1["eta"]) == len(read2["pt"])) @@ -106,8 +111,10 @@ def test_uncertainty_generation(self): "asymmetric histogram": (q_h, t_h), }, ) + # pylint: disable=W0702 except: self.fail("Unexpected exception of automatic uncertainty generation.") + # pylint: enable=W0702 def check_val(arr1, arr2): return self.assertTrue( @@ -157,8 +164,10 @@ def test_table_generation(self): axes_rename=_rename_, axes_units=_units_, ) + # pylint: disable=W0702 except: self.fail("Unexpected exception of table generation.") + # pylint: enable=W0702 readout = read_hist(TestHistUtils.base_hist) From 
bab436477bf20fd3a190c30913889e122516ad42 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Mon, 30 Oct 2023 20:59:50 +0100 Subject: [PATCH 09/12] Support for all standard hist.storage methods. Testing for all storage types and removing exception handling --- hepdata_lib/hist_utils.py | 12 +++- tests/test_histread.py | 129 +++++++++++++++++++++++--------------- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py index e8bbc447..3e28af03 100644 --- a/hepdata_lib/hist_utils.py +++ b/hepdata_lib/hist_utils.py @@ -38,12 +38,20 @@ def read_hist(histo: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: _storage_keys = { hist.storage.Weight: ["value", "variance"], - hist.storage.Mean: ["value", ""], + hist.storage.Mean: ["value", "count", "_sum_of_deltas_squared"], + hist.storage.WeightedMean: [ + "value", + "sum_of_weights", + "sum_of_weights_squared", + "_sum_of_weighted_deltas_squared", + ], } - if ( + if ( # Single value storages histo.storage_type is hist.storage.Int64 or histo.storage_type is hist.storage.Double + or histo.storage_type is hist.storage.AtomicInt64 + or histo.storage_type is hist.storage.Unlimited ): readout["hist_value"] = view.flatten() elif histo.storage_type in _storage_keys: diff --git a/tests/test_histread.py b/tests/test_histread.py index 27de29fa..a02dd5e0 100644 --- a/tests/test_histread.py +++ b/tests/test_histread.py @@ -43,19 +43,55 @@ class TestHistUtils(TestCase): def test_default_read(self): """ Ensure basic readout function generates arrays with compatible - dimensions with given the base-level histogram. + dimensions with given the base-level histogram. Also checking alternate + binning methods. 
""" - try: - readout = read_hist(TestHistUtils.base_hist) - # pylint: disable=W0702 - except: - self.fail("Histogram reading raised an unexpected exception.") - # pylint: enable=W0702 + # Always test the base histogram + readout = read_hist(TestHistUtils.base_hist) # Checking dimension compatibility - self.assertTrue(len(readout["dataset"]) == len(readout["flavor"])) - self.assertTrue(len(readout["dataset"]) == len(readout["eta"])) - self.assertTrue(len(readout["dataset"]) == len(readout["pt"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["hist_variance"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["dataset"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["flavor"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["eta"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["pt"])) + self.assertTrue(len(readout.keys()) == 6) + + ## Histograms with alternate types + # Simple double + h_double = hist.Hist(hist.axis.Regular(10, 0, 1, name="x")) # Double + readout = read_hist(h_double) + self.assertTrue(len(readout["hist_value"]) == len(readout["x"])) + self.assertTrue(len(readout.keys()) == 2) + + h_mean = hist.Hist( + hist.axis.Regular(10, 0, 1, name="x"), storage=hist.storage.Mean() + ) + readout = read_hist(h_mean) + self.assertTrue(len(readout["hist_value"]) == len(readout["x"])) + self.assertTrue(len(readout["hist_value"]) == len(readout["hist_count"])) + self.assertTrue( + len(readout["hist_value"]) == len(readout["hist__sum_of_deltas_squared"]) + ) + self.assertTrue(len(readout.keys()) == 4) + + h_mean = hist.Hist( + hist.axis.Regular(10, 0, 1, name="x"), storage=hist.storage.WeightedMean() + ) + readout = read_hist(h_mean) + self.assertTrue(len(readout["hist_value"]) == len(readout["x"])) + self.assertTrue( + len(readout["hist_value"]) == len(readout["hist_sum_of_weights"]) + ) + self.assertTrue( + len(readout["hist_value"]) == len(readout["hist_sum_of_weights_squared"]) + ) + 
self.assertTrue( + len(readout["hist_value"]) + == len(readout["hist__sum_of_weighted_deltas_squared"]) + ) + self.assertTrue(len(readout.keys()) == 5) + self.doCleanups() def test_projection_read(self): @@ -63,16 +99,9 @@ def test_projection_read(self): Ensure basic readout function generates arrays with compatible dimensions with histogram slicing operations. """ - # Default read with simple projection - try: - read1 = read_hist( - TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}] - ) - read2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) - # pylint: disable=W0702 - except: - self.fail("Histogram reading raised an unexpected exception.") - # pylint: enable=W0702 + # Default read with slicing projection + read1 = read_hist(TestHistUtils.base_hist[{"dataset": "data", "flavor": sum}]) + read2 = read_hist(TestHistUtils.base_hist[{"dataset": "QCD", "flavor": 0j}]) # Checking dimension compatibility self.assertTrue(len(read1["eta"]) == len(read1["pt"])) @@ -80,6 +109,13 @@ def test_projection_read(self): self.assertTrue(np.all(read1["eta"] == read2["eta"])) self.assertTrue(np.all(read1["pt"] == read2["pt"])) + # Profiling histogram conversion + read3 = read_hist( + TestHistUtils.base_hist[{"dataset": sum, "flavor": sum}].profile("eta") + ) + self.assertTrue(len(read3["pt"]) == len(read3["hist_value"])) + self.assertTrue(len(read3.keys()) == 4) + # Clean up self.doCleanups() @@ -95,26 +131,21 @@ def test_uncertainty_generation(self): d_arr = d_h.view(flow=True)["value"].flatten() unc_arr = np.ones_like(d_arr) * np.random.random(size=d_arr.shape[0]) - try: - auto_var = hist_as_variable( - "testing", - d_h, - flow=True, - uncertainty={ - "symmetric stat": "poisson_sym", - "asymmetric stat": "poisson_asym", - "symmetric float": 1.5, - "asymmetric float": (1.5, 2.2), - "symmetric array": unc_arr, - "asymmetric array": (-0.8 * unc_arr, unc_arr), - "symmetric histogram": q_h, - "asymmetric histogram": (q_h, t_h), - }, - ) - # pylint: 
disable=W0702 - except: - self.fail("Unexpected exception of automatic uncertainty generation.") - # pylint: enable=W0702 + auto_var = hist_as_variable( + "testing", + d_h, + flow=True, + uncertainty={ + "symmetric stat": "poisson_sym", + "asymmetric stat": "poisson_asym", + "symmetric float": 1.5, + "asymmetric float": (1.5, 2.2), + "symmetric array": unc_arr, + "asymmetric array": (-0.8 * unc_arr, unc_arr), + "symmetric histogram": q_h, + "asymmetric histogram": (q_h, t_h), + }, + ) def check_val(arr1, arr2): return self.assertTrue( @@ -157,17 +188,13 @@ def test_table_generation(self): "pt": r"Jet $p_{T}$", } _units_ = {"pt": "GeV"} - try: - table = create_hist_base_table( - "my_table", - TestHistUtils.base_hist, - axes_rename=_rename_, - axes_units=_units_, - ) - # pylint: disable=W0702 - except: - self.fail("Unexpected exception of table generation.") - # pylint: enable=W0702 + + table = create_hist_base_table( + "my_table", + TestHistUtils.base_hist, + axes_rename=_rename_, + axes_units=_units_, + ) readout = read_hist(TestHistUtils.base_hist) From da1740a3ebd0454f57009af1f9fdc7c65f7b9e85 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 3 Nov 2023 10:54:26 +0100 Subject: [PATCH 10/12] Additional edge cases handling --- hepdata_lib/hist_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py index 3e28af03..a3a90bed 100644 --- a/hepdata_lib/hist_utils.py +++ b/hepdata_lib/hist_utils.py @@ -150,7 +150,7 @@ def _make_unc_array(unc_val): if isinstance(unc_val, float): return numpy.ones_like(readout["hist_value"]) * unc_val if isinstance(unc_val, numpy.ndarray): - return unc_val + return unc_val.flatten() if isinstance(unc_val, hist.Hist): return read_hist(unc_val, flow=flow)["hist_value"] raise NotImplementedError(f"Unknown uncertainty format! 
{type(unc_val)}") @@ -214,6 +214,9 @@ def _make_poisson_unc_array( _sw2 = readout["hist_variance"] _lo, _up = hist.intervals.poisson_interval(_sw, _sw2) _lo, _up = _lo - _sw, _up - _sw + # Suppressing the NAN for zero-entry events + _lo = numpy.nan_to_num(_lo, nan=0.0) + _up = numpy.nan_to_num(_up, nan=0.0) unc_arr = list(zip(_lo, _up)) return unc_arr From 11ae27513c050a1ef30b963a9cadf7bee940711e Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 3 Nov 2023 13:29:21 +0100 Subject: [PATCH 11/12] Generalizing storage method detector to work for any input (compatibility for hist==2.4.0 for python==3.6) --- hepdata_lib/hist_utils.py | 15 +++------------ tests/test_histread.py | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py index a3a90bed..9b111eaa 100644 --- a/hepdata_lib/hist_utils.py +++ b/hepdata_lib/hist_utils.py @@ -47,20 +47,11 @@ def read_hist(histo: hist.Hist, flow: bool = False) -> Dict[str, numpy.ndarray]: ], } - if ( # Single value storages - histo.storage_type is hist.storage.Int64 - or histo.storage_type is hist.storage.Double - or histo.storage_type is hist.storage.AtomicInt64 - or histo.storage_type is hist.storage.Unlimited - ): + if view.dtype.names is None: # Single value storages readout["hist_value"] = view.flatten() - elif histo.storage_type in _storage_keys: - for key in _storage_keys[histo.storage_type]: - readout["hist_" + key] = view[key].flatten() else: - raise NotImplementedError( - f"Storage type {histo.storage_type} currently not implemented" - ) + for var_name in view.dtype.names: + readout["hist_" + var_name] = view[var_name].flatten() return readout diff --git a/tests/test_histread.py b/tests/test_histread.py index a02dd5e0..44d80233 100644 --- a/tests/test_histread.py +++ b/tests/test_histread.py @@ -157,7 +157,7 @@ def check_val(arr1, arr2): check_val(auto_var.uncertainties[0].values, np.sqrt(d_arr)) # Asymmetric Poisson - _l, _u = 
hist.intervals.poisson_interval(d_arr) + _l, _u = hist.intervals.poisson_interval(d_arr, d_arr) _l, _u = _l - d_arr, _u - d_arr check_val(auto_var.uncertainties[1].values, list(zip(_l, _u))) From f7dcb33090fb327c107b0a49f2eb80456a01a607 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Tue, 7 Nov 2023 12:06:27 +0100 Subject: [PATCH 12/12] Added additional handling for integer bins --- hepdata_lib/hist_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py index 9b111eaa..2ec99608 100644 --- a/hepdata_lib/hist_utils.py +++ b/hepdata_lib/hist_utils.py @@ -77,12 +77,17 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray: entries.append("__UNDETERMINED__") elif isinstance(axis, hist.axis.IntCategory): entries.append(entries[-1] + 1) + elif isinstance(axis, hist.axis.Integer): + entries.append(numpy.inf) else: entries.append((axis.edges[-1], numpy.inf)) ## Adding underflow bin if flow and axis.traits.underflow: - entries = [(-numpy.inf, axis.edges[0])] + entries + if isinstance(axis,hist.axis.Integer): + entries = [-numpy.inf] + entries + else: + entries = [(-numpy.inf, axis.edges[0])] + entries ## Converting to numpy array if axis.traits.continuous: