From b6a0d9aa415941cf3105dc81a02b65d964cd6df8 Mon Sep 17 00:00:00 2001
From: Yi-Mu Chen <enochnotsocool@gmail.com>
Date: Fri, 27 Oct 2023 16:21:28 +0200
Subject: [PATCH] Updated documentation and dependencies

---
 README.md                                  |  3 +
 examples/reading_scikihep_histograms.ipynb | 79 ++++++++++++----------
 hepdata_lib/hist_utils.py                  | 42 ++++++++++--
 requirements.txt                           |  1 +
 4 files changed, 82 insertions(+), 43 deletions(-)
diff --git a/README.md b/README.md
index ce20b896..7fcac52c 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,9 @@ There are a few more examples available that can directly be run using the [bind
 - [Reading TGraph and TGraphError from '.C' files](https://github.com/HEPData/hepdata_lib/blob/main/examples/read_c_file.ipynb)
 [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/read_c_file.ipynb)
 <br/><br/>
+- [Preparing scikit-hep histograms](https://github.com/HEPData/hepdata_lib/blob/main/examples/reading_scikihep_histograms.ipynb)
+[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/HEPData/hepdata_lib/main?filepath=examples/reading_scikihep_histograms.ipynb)
+<br/><br/>
 
 ## External dependencies
 
diff --git a/examples/reading_scikihep_histograms.ipynb b/examples/reading_scikihep_histograms.ipynb
index d559eb30..7334b5cd 100644
--- a/examples/reading_scikihep_histograms.ipynb
+++ b/examples/reading_scikihep_histograms.ipynb
@@ -6,14 +6,15 @@
    "source": [
     "# Reading histograms\n",
     "\n",
-    "For the new python-based frameworks, another common task would be to translate\n",
-    "histogram in the [`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/)\n",
-    "package into the HEPData format. The functions in the `hepdata_lib` will help\n",
-    "you with that, and this notebook will demonstrate how to do that.\n",
+    "For the new python-based frameworks, another common format that needs\n",
+    "translation is histograms from the\n",
+    "[`scikit-hep.hist`](https://hist.readthedocs.io/en/latest/) package. The functions in\n",
+    "the `hepdata_lib.hist_utils` will help you with that, and this notebook will\n",
+    "demonstrate how to do that.\n",
     "\n",
     "As explained in the [Getting started notebook](Getting_started.ipynb), a\n",
     "`Submission` needs to exist or be created. Here, we'll just create one without\n",
-    "any additional information:"
+    "any additional information.\n"
    ]
   },
   {
@@ -39,10 +40,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The common use-case `scikit-hep` histograms is to allow for after-the-fact\n",
-    "slicing and grouping from a main histogram. Let us first generate a fake\n",
+    "The common use-case for `scikit-hep` histograms is to allow for after-the-fact\n",
+    "slicing and grouping from a primary histogram. Let us first generate a fake\n",
     "histogram that may appear in common histograms, as well as a common slicing\n",
-    "routine"
+    "routine\n"
    ]
   },
   {
@@ -109,9 +110,9 @@
     "## Example of manual processing to 1D array\n",
     "\n",
     "Let use create a simple slicing routine to get the various histograms of\n",
-    "interest, then use the most versatile function, the\n",
+    "interest, then use the most general function, the\n",
     "`hepdata_lib.hist_utils.read_hist` method, to create arrays that will be\n",
-    "compatible with variable creation."
+    "compatible with `hepdata_lib.Variable` declaration.\n"
    ]
   },
   {
@@ -123,27 +124,33 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'pt': array([(  0.,  25.), ( 25.,  50.), ( 50.,  75.), ( 75., 100.),\n",
+      "{'hist_value': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
+      "        3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
+      "         485.,   401.,   294.,   230.]),\n",
+      " 'hist_variance': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
+      "        3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
+      "         485.,   401.,   294.,   230.]),\n",
+      " 'pt': array([(  0.,  25.), ( 25.,  50.), ( 50.,  75.), ( 75., 100.),\n",
       "       (100., 125.), (125., 150.), (150., 175.), (175., 200.),\n",
       "       (200., 225.), (225., 250.), (250., 275.), (275., 300.),\n",
       "       (300., 325.), (325., 350.), (350., 375.), (375., 400.),\n",
       "       (400., 425.), (425., 450.), (450., 475.), (475., 500.)],\n",
-      "      dtype=[('f0', '<f4'), ('f1', '<f4')]), 'hist_value': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
-      "        3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
-      "         485.,   401.,   294.,   230.]), 'hist_variance': array([27405., 21382., 16585., 12740., 10069.,  7878.,  6007.,  4678.,\n",
-      "        3666.,  2903.,  2333.,  1734.,  1352.,  1048.,   851.,   634.,\n",
-      "         485.,   401.,   294.,   230.])}\n"
+      "      dtype=[('f0', '<f4'), ('f1', '<f4')])}\n"
      ]
     }
    ],
    "source": [
     "from hepdata_lib.hist_utils import read_hist\n",
+    "import pprint\n",
     "\n",
-    "data_hist = h[dict(dataset='data', flavor=sum, eta=sum)]\n",
-    "fqcd_hist = h[dict(dataset='QCD', flavor=sum, eta=slice(1.4j,None,sum))] +  h[dict(dataset='QCD', flavor=sum, eta=slice(None,-1.4j,sum))]\n",
-    "cqcd_hist = h[dict(dataset='QCD', flavor=sum, eta=slice(-1.4j, 1.4j,sum))]\n",
-    "tt_b_hist = h[dict(dataset='ttbar', flavor=4j, eta=sum)]\n",
-    "tt_l_hist = h[dict(dataset='ttbar', flavor=0j, eta=sum)]\n",
+    "data_hist = h[dict(dataset=\"data\", flavor=sum, eta=sum)]\n",
+    "fqcd_hist = (\n",
+    "    h[dict(dataset=\"QCD\", flavor=sum, eta=slice(1.4j, None, sum))]\n",
+    "    + h[dict(dataset=\"QCD\", flavor=sum, eta=slice(None, -1.4j, sum))]\n",
+    ")\n",
+    "cqcd_hist = h[dict(dataset=\"QCD\", flavor=sum, eta=slice(-1.4j, 1.4j, sum))]\n",
+    "tt_b_hist = h[dict(dataset=\"ttbar\", flavor=4j, eta=sum)]\n",
+    "tt_l_hist = h[dict(dataset=\"ttbar\", flavor=0j, eta=sum)]\n",
     "\n",
     "tab_data = read_hist(data_hist)\n",
     "tab_fqcd = read_hist(fqcd_hist)\n",
@@ -151,7 +158,7 @@
     "tab_tt_b = read_hist(tt_b_hist)\n",
     "tab_tt_l = read_hist(tt_l_hist)\n",
     "\n",
-    "print(tab_data)"
+    "pprint.pprint(tab_data)\n"
    ]
   },
   {
@@ -159,10 +166,10 @@
    "metadata": {},
    "source": [
     "All axes remaining will generate a corresponding array that can be used to\n",
-    "declare `Variable` instances. Notice that because the histogram was declared\n",
-    "with `storage=Weight`, entries for `hist_value` (sum of weights) and\n",
-    "`hist_variance` (sum of weight-squared) will be presented to the user. This\n",
-    "information can the be used for the uncertainty generation."
+    "declare `Variable` instances. Because the histogram was declared with\n",
+    "`storage=Weight`, entries for `hist_value` (sum of weights) and `hist_variance`\n",
+    "(sum of weight-squared) will be presented to the user. This information can then\n",
+    "be used for the uncertainty generation. A simple example is shown below:\n"
    ]
   },
   {
@@ -216,7 +223,7 @@
     "the uncertainty should be calculated (`poisson_sym` or `poisson_asym`), a\n",
     "floating point (pair) to indicate flat, a histogram (pair) of the same format as\n",
     "the input histogram representing the uncertainty, or a numpy array (pair) the is\n",
-    "compatible with the final array output."
+    "compatible with the final array output.\n"
    ]
   },
   {
@@ -248,10 +255,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Example using N-dimensional histogram \n",
+    "## Example using N-dimensional histogram\n",
     "\n",
-    "Notice that because the flexibility of the scikit-hep histogram syntax, the same\n",
-    "syntax can be used for the histograms of arbitrary dimensions\n"
+    "Because of the flexibility of the scikit-hep histogram syntax, the same\n",
+    "functions can be used for the histograms of arbitrary dimensions.\n"
    ]
   },
   {
@@ -269,7 +276,7 @@
     "tt_l_hist = h[dict(dataset=\"ttbar\", flavor=0j)]\n",
     "\n",
     "tab_data = read_hist(data_hist)\n",
-    "# tab_qcd = read_hist(qcd_hist)\n",
+    "# tab_qcd = read_hist(qcd_hist) # No-longer required to be declared!\n",
     "# tab_tt_b = read_hist(tt_b_hist)\n",
     "# tab_tt_l = read_hist(tt_l_hist)\n",
     "\n",
@@ -300,7 +307,7 @@
    "source": [
     "To further simply the construction of table from N-dimensional histograms, we\n",
     "provide a `create_hist_base_table` function such that the axes variables are\n",
-    "automatically setup.\n"
+    "automatically set up directly with table declaration.\n"
    ]
   },
   {
@@ -332,14 +339,14 @@
    "source": [
     "## Outputting the submission\n",
     "\n",
-    "Finally, we can add the table to the sumission and create the files. Please\n",
-    "refer to the [Getting started notebook](Getting_started.ipynb) for a complete\n",
-    "example."
+    "Finally, we can add the table to the submission and create the required files.\n",
+    "Please refer to the [Getting started notebook](Getting_started.ipynb) for a\n",
+    "complete example.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/hepdata_lib/hist_utils.py b/hepdata_lib/hist_utils.py
index 0c071b46..f5a41d88 100644
--- a/hepdata_lib/hist_utils.py
+++ b/hepdata_lib/hist_utils.py
@@ -60,7 +60,8 @@ def _get_histaxis_array(axis, flow: bool) -> numpy.ndarray:
     categorical axes, the return will be a N array of bin content values. If the
     flow is set to true, the function will also add the overflow/underflow bins
     according to the settings found in axis.traits. For categorical axis, this
-    will include an extra `__UNDETERMINED__` entry or a +1 entry.
+    will include an extra `"__UNDETERMINED__"` entry (for StrCategory) or a +1
+    entry (for IntCategory).
     """
 
     ## Getting the entries as a simple list
@@ -95,7 +96,28 @@ def hist_as_variable(
 ) -> Variable:
     """
     Returning this histogram entries as a Variable object, with a simpler
-    interface for modifying uncertainty
+    interface for automatically generating value uncertainties.
+
+    The `h` and `flow` inputs are passed directly to the `read_hist` method to
+    extract the value to be used for the variable.
+
+    The `uncertainty` is a dictionary defining how uncertainties should be
+    defined. Dictionary keys are used as the name of the uncertainty, while the
+    value defines how the uncertainty should be constructed. This can either be:
+
+    - `str`: either "poisson_asym" or "poisson_sym", indicating to extract
+      Poisson uncertainty directly from the histogram values. (Either the
+      asymmetric Garwood interval defined by `hist.intervals` or a simple,
+      symmetric `sqrt(n)`.)
+    - `float`: A flat uncertainty to be used on all bins.
+    - `numpy.ndarray`: An array indicating the uncertainty for each bin. The
+      array should be compatible with the output of `read_hist(h)['hist_value']`.
+    - `hist.Hist`: The histogram with bin values indicating the uncertainty to
+      be used for each bin. The histogram should be compatible with the input
+      histogram.
+    - `tuple(T,T)` where `T` can either be a `float`, `numpy.ndarray` or
+      `hist.Hist`. This is used to indicate asymmetric uncertainties, following
+      the lower/upper ordering convention of hepdata_lib
     """
     if uncertainty is None:
         uncertainty = {}
@@ -151,9 +173,15 @@ def _make_unc_array(x):
 def _make_poisson_unc_array(
     readout: Dict[str, numpy.ndarray], symmetric: bool
 ) -> numpy.ndarray:
+    """
+    Given the results of `read_hist`, extract the Poisson uncertainty using
+    hist.intervals. Automatically detecting the histogram storage type to handle
+    weighted uncertainties
+    """
     if symmetric:
         if "hist_variance" not in readout.keys():
             numpy.sqrt(readout["hist_value"])
+            return numpy.sqrt(n_events)
         else:  # Effective number of events
             sw, sw2 = readout["hist_value"], readout["hist_variance"]
             n_events = numpy.divide(
@@ -166,7 +194,6 @@ def _make_poisson_unc_array(
                 where=(n_events != 0),
             )
             return sw * rel_unc
-        return numpy.sqrt(n_events)
     else:
         sw, sw2 = readout["hist_value"], readout["hist_value"]
         if "hist_variance" in readout.keys():
@@ -180,12 +207,13 @@ def create_hist_base_table(
     table_name: str,
     h: hist.Hist,
     flow: bool = False,
-    axes_rename: Optional[Dict[str, str]] = None,  # Additional axes proces
-    axes_units: Optional[Dict[str, str]] = None,  # Additional axes proces
+    axes_rename: Optional[Dict[str, str]] = None,
+    axes_units: Optional[Dict[str, str]] = None,
 ) -> Table:
     """
-    Preparing the table based on hist, allows for the additional exclusion of
-    axes via a list of string names
+    Preparing the table based on hist. This constructs just the histogram axis
+    as the table variable. Histogram entries should be added via the
+    `hist_as_variable` method.
     """
     if axes_rename is None:
         axes_rename = {}
diff --git a/requirements.txt b/requirements.txt
index eaf80f96..c1a8d716 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
 PyYAML>=4.0
 future
+hist
 hepdata-validator>=0.3.5