Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: drop use of awkward in yield uncertainty calculation #408

Merged
merged 19 commits into from
Jul 5, 2023
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 57 additions & 45 deletions src/cabinetry/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
import logging
from typing import Any, DefaultDict, Dict, List, NamedTuple, Optional, Tuple, Union

try:
# use awkward v2
import awkward._v2 as ak
except ModuleNotFoundError: # pragma: no cover
# fallback if the _v2 submodule disappears after the full v2 release
import awkward as ak # pragma: no cover
import numpy as np
import pyhf

Expand Down Expand Up @@ -300,16 +294,23 @@ def yield_stdev(
up_comb = np.vstack((up_comb, np.sum(up_comb, axis=0)))
# turn into list of channels (keep all samples, select correct bins per channel)
# indices: channel, sample, bin
up_yields = [
up_yields_per_channel = [
up_comb[:, model.config.channel_slices[ch]] for ch in model.config.channels
]
# append list of yields summed per channel
up_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in up_yields
]
# indices: variation, channel, sample, bin
up_variations.append(up_yields)
# calculate list of yields summed per channel
up_yields_sum = np.array(
[
np.sum(chan_yields, axis=-1, keepdims=True).tolist()
for chan_yields in up_yields_per_channel
]
)
# reshape to get rid of extra axis
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
up_yields_sum = up_yields_sum.reshape(up_yields_sum.shape[:-1]).T

# concatenate per-channel sum to up_comb
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
up_yields = np.concatenate((up_comb, up_yields_sum), axis=1)
# indices: variation, sample, bin
up_variations.append(up_yields.tolist())

# model distribution per sample with this parameter varied down
down_comb = pyhf.tensorlib.to_numpy(
Expand All @@ -318,68 +319,79 @@ def yield_stdev(
# add total prediction (sum over samples)
down_comb = np.vstack((down_comb, np.sum(down_comb, axis=0)))
# turn into list of channels
down_yields = [
down_yields_per_channel = [
down_comb[:, model.config.channel_slices[ch]]
for ch in model.config.channels
]
# append list of yields summed per channel
down_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in down_yields
]

# calculate list of yields summed per channel
down_yields_sum = np.array(
[
np.sum(chan_yields, axis=-1, keepdims=True).tolist()
for chan_yields in down_yields_per_channel
]
)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
down_yields_sum = down_yields_sum.reshape(down_yields_sum.shape[:-1]).T

down_yields = np.concatenate((down_comb, down_yields_sum), axis=1)
down_variations.append(down_yields)

# convert to awkward arrays for further processing
up_variations_ak = ak.from_iter(up_variations)
down_variations_ak = ak.from_iter(down_variations)
# convert to numpy arrays for further processing
up_variations_np = np.array(up_variations)
down_variations_np = np.array(down_variations)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved

# calculate symmetric uncertainties for all components
# indices: variation, channel (last entries sums), sample (last entry sum), bin
sym_uncs = (up_variations_ak - down_variations_ak) / 2
sym_uncs = (up_variations_np - down_variations_np) / 2

# calculate total variance, indexed by channel, sample, bin (per-channel numbers act
# calculate total variance, indexed by sample, bin (per-channel numbers act
# like additional channels with one bin each)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
if np.count_nonzero(corr_mat - np.diagflat(np.ones_like(parameters))) == 0:
# no off-diagonal contributions from correlation matrix (e.g. pre-fit)
total_variance = ak.sum(np.power(sym_uncs, 2), axis=0)
total_variance = np.sum(np.power(sym_uncs, 2), axis=0)
else:
# full calculation including off-diagonal contributions
# with v as vector of variations (each element contains yields under variation)
# and M as correlation matrix, calculate variance as follows:
# variance = sum_i sum_j v[i] * M[i, j] * v[j]
# where the product between elements of v again is elementwise (multiplying bin
# yields), and the final variance shape is the same as element of v (yield
# uncertainties per bin, sample, channel)
# uncertainties per bin, sample)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved

# possible optimizations that could be considered here:
# - skipping staterror-staterror terms for per-bin calculation (orthogonal)
# - taking advantage of correlation matrix symmetry
# - (optional) skipping combinations with correlations below threshold

# calculate M[i, j] * v[j] first
# indices: pars (i), pars (j), channel, sample, bin
m_times_v = (
corr_mat[..., np.newaxis, np.newaxis, np.newaxis]
* sym_uncs[np.newaxis, ...]
)
# now multiply by v[i] as well, indices: pars(i), pars(j), channel, sample, bin
# indices: pars (i), pars (j), sample, bin
m_times_v = corr_mat[..., np.newaxis, np.newaxis] * sym_uncs[np.newaxis, ...]
# now multiply by v[i] as well, indices: pars(i), pars(j), sample, bin
v_times_m_times_v = sym_uncs[:, np.newaxis, ...] * m_times_v
# finally perform sums over i and j, remaining indices: channel, sample, bin
total_variance = ak.sum(ak.sum(v_times_m_times_v, axis=1), axis=0)
# finally perform sums over i and j, remaining indices: sample, bin
total_variance = np.sum(np.sum(v_times_m_times_v, axis=1), axis=0)

# convert to standard deviations per bin and per channel
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
n_channels = len(model.config.channels)
# indices: (channel, sample, bin)
total_stdev_per_bin = np.sqrt(total_variance[:n_channels])
# calculate separation between per-bin indices and per-channel indices
last_bin_index = model.config.channel_slices[model.config.channels[-1]].stop
# indices: (channel, sample)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
total_stdev_per_channel = ak.sum(np.sqrt(total_variance[n_channels:]), axis=-1)
total_stdev_per_channel = np.sqrt(total_variance[:, last_bin_index:].T).tolist()
# indices: (channel, sample, bin)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
total_stdev_per_bin = [
np.sqrt(
total_variance[:, :last_bin_index][:, model.config.channel_slices[ch]]
).tolist()
for ch in model.config.channels
]
# log total stdev per bin / channel (-1 index for sample sum)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
log.debug(f"total stdev is {total_stdev_per_bin[:, -1, :]}")
log.debug(f"total stdev per channel is {total_stdev_per_channel[:, -1]}")

# convert to lists
total_stdev_per_bin = ak.to_list(total_stdev_per_bin)
total_stdev_per_channel = ak.to_list(total_stdev_per_channel)
total_stdev_bin = [
total_stdev_per_bin[i][-1] for i in range(len(total_stdev_per_bin))
]
log.debug(f"total stdev is {total_stdev_bin}")
total_stdev_chan = [
total_stdev_per_channel[i][-1] for i in range(len(total_stdev_per_channel))
]
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
log.debug(f"total stdev per channel is {total_stdev_chan}")

# save to cache
_YIELD_STDEV_CACHE.update(
Expand Down