From b6a0d9aa415941cf3105dc81a02b65d964cd6df8 Mon Sep 17 00:00:00 2001 From: Yi-Mu Chen Date: Fri, 27 Oct 2023 16:21:28 +0200 Subject: [PATCH] Updated documentation and dependencies --- README.md | 3 + examples/reading_scikihep_histograms.ipynb | 79 ++++++++++++---------- hepdata_lib/hist_utils.py | 42 ++++++++++-- requirements.txt | 1 + 4 files changed, 82 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index ce20b896..7fcac52c 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,9 @@ There are a few more examples available that can directly be run using the [bind - [Reading TGraph and TGraphError from '.C' files](https://github.com/HEPData/hepdata_lib/blob/main/examples/read_c_file.ipynb) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/read_c_file.ipynb)

+- [Preparing scikit-hep histograms](https://github.com/HEPData/hepdata_lib/blob/main/examples/reading_scikihep_histograms.ipynb) +[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/reading_scikihep_histograms.ipynb) +

## External dependencies diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb index d559eb30..7334b5cd 100644 --- a/examples/reading_scikihep_histograms.ipynb +++ b/examples/reading_scikihep_histograms.ipynb @@ -6,14 +6,15 @@ "source": [ "# Reading histograms\n", "\n", - "For the new python-based frameworks, another common task would be to translate\n", - "histogram in the [`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/)\n", - "package into the HEPData format. The functions in the `hepdata_lib` will help\n", - "you with that, and this notebook will demonstrate how to do that.\n", + "For the new python-based frameworks, another common format that needs\n", + "translation is histograms from the\n", + "[`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/) package. The functions in\n", + "the `hepdata_lib.hist_utils` will help you with that, and this notebook will\n", + "demonstrate how to do that.\n", "\n", "As explained in the [Getting started notebook](Getting_started.ipynb), a\n", "`Submission` needs to exist or be created. Here, we'll just create one without\n", - "any additional information:" + "any additional information.\n" ] }, { @@ -39,10 +40,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The common use-case `scikit-hep` histograms is to allow for after-the-fact\n", - "slicing and grouping from a main histogram. Let us first generate a fake\n", + "The common use-case for `scikit-hep` histograms is to allow for after-the-fact\n", + "slicing and grouping from a primary histogram. 
Let us first generate a fake\n", "histogram that may appear in common histograms, as well as a common slicing\n", - "routine" + "routine\n" ] }, { @@ -109,9 +110,9 @@ "## Example of manual processing to 1D array\n", "\n", "Let use create a simple slicing routine to get the various histograms of\n", - "interest, then use the most versatile function, the\n", + "interest, then use the most general function, the\n", "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n", - "compatible with variable creation." + "compatible with `hepdata_lib.Variable` declaration.\n" ] }, { @@ -123,27 +124,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", + "{'hist_value': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", + " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", + " 485., 401., 294., 230.]),\n", + " 'hist_variance': array([27405., 21382., 16585., 12740., 10069., 7878., 6007., 4678.,\n", + " 3666., 2903., 2333., 1734., 1352., 1048., 851., 634.,\n", + " 485., 401., 294., 230.]),\n", + " 'pt': array([( 0., 25.), ( 25., 50.), ( 50., 75.), ( 75., 100.),\n", " (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n", " (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n", " (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n", " (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n", - " dtype=[('f0', ' numpy.ndarray: categorical axes, the return will be a N array of bin content values. If the flow is set to true, the function will also add the overflow/underflow bins according to the settings found in axis.traits. For categorical axis, this - will include an extra `__UNDETERMINED__` entry or a +1 entry. + will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or an +1 + entry (for IntCategory). 
""" ## Getting the entries as a simple list @@ -95,7 +96,28 @@ def hist_as_variable( ) -> Variable: """ Returning this histogram entries as a Variable object, with a simpler - interface for modifying uncertainty + interface for automatically generating value uncertainties. + + The `h` and `flow` inputs are passed directly to the `read_hist` method to + extract the value to be used for the variable. + + The `uncertainty` is a dictionary defining how uncertainties should be + defined. Dictionary keys are used as the name of the uncertainty, while the + value defines how the uncertainty should be constructed. This can either be: + + - `str`: either "poisson_asym" or "poisson_sym", indicating to extract + Poisson uncertainty directly from the histogram values. (Either the + asymmetric Garwood interval defined by `hist.intervals` or a simple, + symmetric `sqrt(n)`.) + - `float`: A flat uncertainty to be used on all bins. + - `numpy.ndarray`: An array indicating the uncertainty for each bin. The + array should be compatible with the output of `read_hist(...)['hist_value']`. + - `hist.Hist`: The histogram with bin values indicating the uncertainty to + be used for each bin. The histogram should be compatible with the input + histogram. + - `tuple(T,T)` where `T` can either be a `float`, `numpy.ndarray` or + `hist.Hist`. This is used to indicate asymmetric uncertainties, following + the lower/upper ordering convention of hepdata_lib """ if uncertainty is None: uncertainty = {} @@ -151,9 +173,15 @@ def _make_unc_array(x): def _make_poisson_unc_array( readout: Dict[str, numpy.ndarray], symmetric: bool ) -> numpy.ndarray: + """ + Given the results of `read_hist`, extract the Poisson uncertainty using + hist.intervals. 
Automatically detecting the histogram storage type to handle + weighted uncertainties + """ if symmetric: if "hist_variance" not in readout.keys(): numpy.sqrt(readout["hist_value"]) + return numpy.sqrt(n_events) else: # Effective number of events sw, sw2 = readout["hist_value"], readout["hist_variance"] n_events = numpy.divide( @@ -166,7 +194,6 @@ def _make_poisson_unc_array( where=(n_events != 0), ) return sw * rel_unc - return numpy.sqrt(n_events) else: sw, sw2 = readout["hist_value"], readout["hist_value"] if "hist_variance" in readout.keys(): @@ -180,12 +207,13 @@ def create_hist_base_table( table_name: str, h: hist.Hist, flow: bool = False, - axes_rename: Optional[Dict[str, str]] = None, # Additional axes proces - axes_units: Optional[Dict[str, str]] = None, # Additional axes proces + axes_rename: Optional[Dict[str, str]] = None, + axes_units: Optional[Dict[str, str]] = None, ) -> Table: """ - Preparing the table based on hist, allows for the additional exclusion of - axes via a list of string names + Preparing the table based on hist. This constructs just the histogram axis + as the table variable. Histogram entries should be added via the + `hist_as_variable` method. """ if axes_rename is None: axes_rename = {} diff --git a/requirements.txt b/requirements.txt index eaf80f96..c1a8d716 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ numpy PyYAML>=4.0 future +hist hepdata-validator>=0.3.5