Skip to content

Commit

Permalink
Merge branch 'release/3.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
HDembinski committed Jul 25, 2018
2 parents bfc437b + 93493ca commit 31d4204
Show file tree
Hide file tree
Showing 36 changed files with 1,043 additions and 965 deletions.
2 changes: 1 addition & 1 deletion appveyor.yml → .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ environment:
test_script:
# - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1'))
- cd build
- cmake . -DBUILD_PYTHON=OFF -DBUILD_NUMPY_SUPPORT=OFF
- cmake . -DBUILD_PYTHON=OFF -DBUILD_NUMPY=OFF
-DBOOST_ROOT="%BOOST_ROOT%" -DBoost_USE_STATIC_LIBS="ON"
- cmake --build .
- ctest -V
89 changes: 38 additions & 51 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,65 +16,52 @@ branches:
- master
- develop

# addons:
# apt:
# sources: deadsnakes
# packages:
# - python2.7
# - python3.5
# - python3-pip
# - libpython2.7-dev
# - libpython3.5-dev

matrix:
include:
- os: linux # minimum gcc
env:
CC=gcc CXX=g++ PYTHON_VERSION=2.7
BUILD_PYTHON=OFF
BUILD_NUMPY=OFF
BUILD_SERIALIZATION=OFF
- os: linux # maximum gcc
env:
CC=gcc CXX=g++ PYTHON_VERSION=2.7
BUILD_PYTHON=ON
BUILD_NUMPY=ON
BUILD_SERIALIZATION=ON
- os: linux # maximum gcc
env:
CC=gcc CXX=g++ PYTHON_VERSION=3.6
BUILD_PYTHON=ON
BUILD_NUMPY=OFF
BUILD_SERIALIZATION=ON
- os: linux # maximum clang
env:
CC=clang CXX=clang++
BUILD_PYTHON=ON PYTHON_VERSION=2.7
BUILD_NUMPY=ON
BUILD_SERIALIZATION=ON
- os: linux # coverage gcc
env:
CC=gcc CXX=g++ GCOV=gcov PYTHON_VERSION=2.7
CMAKE_BUILD_TYPE=coverage
- os: linux # gcc minimum
env: PYVER=2.7 CC=gcc CXX=g++ PY=OFF NUMPY=OFF SERIAL=OFF
- os: linux # gcc py27 w/o numpy
env: PYVER=2.7 CC=gcc CXX=g++ PY=ON NUMPY=OFF SERIAL=ON
- os: linux # gcc py27
env: PYVER=2.7 CC=gcc CXX=g++ PY=ON NUMPY=ON SERIAL=ON
- os: linux # gcc py36
env: PYVER=3.6 CC=gcc CXX=g++ PY=ON NUMPY=ON SERIAL=ON
- os: linux # clang py36
env: PYVER=3.6 CC=clang CXX=clang++ PY=ON NUMPY=ON SERIAL=ON
- os: linux # coverage py27
env: PYVER=2.7 CC=gcc CXX=g++ GCOV=gcov
- os: osx # minimum osx Xcode 8.3
osx_image: xcode8.3
env: PY=OFF NUMPY=OFF SERIAL=OFF
allow_failures:
- os: osx

git:
depth: 1
depth: 10

# Install packages (pre-installed: pytest numpy)
# Install packages (pre-installed: pytest)
install:
- pyenv versions
- pyenv global ${PYTHON_VERSION}
- if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
export PATH="/usr/local/opt/python/libexec/bin:$PATH";
else
pyenv versions;
pyenv global ${PYVER};
fi

- python --version
- python build/get_python_include.py
- python build/get_python_library.py

- pip install --user numpy
- pip install --upgrade numpy # update numpy to avoid segfaults later
- source build/travis_install_boost.sh

- if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
pip install --user cpp-coveralls urllib3[secure];
- if [ -n "$GCOV" ]; then
pip install cpp-coveralls urllib3[secure];
fi

script:
- cd build
- if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
- if [ -n "$GCOV" ]; then
cmake . -DBOOST_ROOT=${BOOST_DIR}
-DBUILD_PYTHON=OFF
-DBUILD_SERIALIZATION=ON
Expand All @@ -95,17 +82,17 @@ script:
ctest;
else
cmake . -DBOOST_ROOT=${BOOST_DIR}
-DBUILD_PYTHON=${BUILD_PYTHON}
-DBUILD_NUMPY=${BUILD_NUMPY}
-DBUILD_SERIALIZATION=${BUILD_SERIALIZATION}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} &&
-DBUILD_PYTHON=${PY}
-DBUILD_NUMPY=${NUMPY}
-DBUILD_SERIALIZATION=${SERIAL}
-DCMAKE_BUILD_TYPE=Debug &&
make -j2 &&
ctest -V;
fi

# Calculate coverage
after_success:
if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
if [ -n "$GCOV" ]; then
coveralls -r .. -b . --verbose --exclude ${TRAVIS_BUILD_DIR}/deps --gcov=`which ${GCOV}` --gcov-options '\-lpbc';
fi

Expand Down
14 changes: 2 additions & 12 deletions build/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
endif()

if(TRACE_ALLOCS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_compile_options(/DBOOST_HISTOGRAM_TRACE_ALLOCS)
else()
add_compile_options(-DBOOST_HISTOGRAM_TRACE_ALLOCS)
endif()
add_definitions(-DBOOST_HISTOGRAM_TRACE_ALLOCS)
endif()

if(BUILD_PYTHON)
Expand Down Expand Up @@ -109,9 +105,9 @@ else()
# serialization only required for tests
if (BUILD_SERIALIZATION)
find_package(Boost ${MIN_BOOST_VERSION} REQUIRED serialization)
add_definitions(-DHAVE_SERIALIZATION)
else ()
find_package(Boost ${MIN_BOOST_VERSION} REQUIRED)
add_definitions(-DBOOST_HISTOGRAM_NO_SERIALIZATION)
endif()
set(LIBRARIES ${Boost_LIBRARIES})
endif()
Expand Down Expand Up @@ -191,12 +187,6 @@ file(GLOB_RECURSE
../test/*_test.cpp ../include/*.hpp
)

add_custom_target(clf
COMMAND clang-format
-i
${ALL_SOURCE_FILES}
)

get_property(INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
set(TIDY_INCLUDE)
foreach(x ${INCLUDE_DIRS})
Expand Down
44 changes: 31 additions & 13 deletions build/get_python_library.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
from distutils import sysconfig
import os.path
import os
import sys
import glob
from pprint import pprint
from glob import glob
pj = os.path.join

pyver = sysconfig.get_config_var('VERSION')
getvar = sysconfig.get_config_var
LIB_KEYS = ('LIBDEST', 'LIBDIR', 'LIBPL')

libname = "python" + pyver
if sys.platform == "darwin":
so_ext = "dylib"
elif sys.platform.startswith("linux"):
so_ext = "so"
else:
so_ext = "dll"

for libvar in ('LIBDIR', 'LIBPL'):
for ext in ('so', 'dylib', 'dll'):
match = pj(getvar(libvar), "*" + libname + "*." + ext)
lib = glob.glob(match)
if lib:
assert len(lib) == 1
sys.stdout.write(lib[0])
raise SystemExit
config = sysconfig.get_config_vars()

library = "*python%s*%s" % (sysconfig.get_python_version(), so_ext)
for libpath in LIB_KEYS:
p = pj(config[libpath], library)
cand = glob(p)
if cand and len(cand) == 1:
sys.stdout.write(cand[0])
raise SystemExit

pprint("no library found, dumping library pattern, config, and directory contents:")
pprint(library)
pprint(config)

for libpath in LIB_KEYS:
pprint(libpath)
p = config[libpath]
if os.path.exists(p):
pprint(os.listdir(p))

raise SystemExit(1)
9 changes: 9 additions & 0 deletions build/make_user_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Print a Boost.Build ``using python`` configuration line to stdout.

The line describes the interpreter that runs this script (version, install
prefix and header directory), so redirecting the output into a
``user-config.jam`` file points Boost.Build at that interpreter.
"""
import sys
# The stand-alone ``sysconfig`` module replaces ``distutils.sysconfig``:
# distutils is deprecated (PEP 632) and no longer ships with Python >= 3.12.
import sysconfig

# Format: using python : <version> : <prefix> : <include directory> ;
s = "using python : {version} : {prefix} : {inc} ;\n".format(
    version=sysconfig.get_python_version(),
    prefix=sysconfig.get_config_var("prefix"),
    inc=sysconfig.get_path("include"))

sys.stdout.write(s)
12 changes: 7 additions & 5 deletions build/travis_install_boost.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@ if [[ -z "${TRAVIS_BUILD_DIR}" ]]; then
fi
PYVER=$(python -c 'import sys; sys.stdout.write("%i"%sys.version_info.major)')
BOOST_DIR=${TRAVIS_BUILD_DIR}/deps/boost-${BOOST_VERSION}-py${PYVER}
PROJECT_DIR=$(pwd)
echo "Boost: ${BOOST_DIR}"
mkdir -p ${BOOST_DIR}
BOOSTRAP_PATCH_REGEX="s|\( *using python.*\);|\1: $(python build/get_python_include.py) ;|"
if [[ -z "$(ls -A ${BOOST_DIR})" ]]; then
BOOST_URL="http://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/boost_${BOOST_VERSION//\./_}.tar.gz"
{ wget --quiet -O - ${BOOST_URL} | tar --strip-components=1 -xz -C ${BOOST_DIR}; } || exit 1
(cd ${BOOST_DIR} && ./bootstrap.sh > /dev/null && \
sed -i "${BOOSTRAP_PATCH_REGEX}" project-config.jam && \
./b2 install --prefix=${BOOST_DIR} --with-serialization --with-iostreams --with-python | grep -v -e common\.copy -e common\.mkdir)
( cd ${BOOST_DIR}
./bootstrap.sh > /dev/null
python ${PROJECT_DIR}/build/make_user_config.py > $HOME/user-config.jam
cat $HOME/user-config.jam
(./b2 install --prefix=${BOOST_DIR} --with-serialization --with-iostreams --with-python | grep -v -e common\.copy -e common\.mkdir) )
fi
ls ${BOOST_DIR}/lib | grep libboost
ls ${BOOST_DIR}/lib | grep libboost || exit 1
13 changes: 13 additions & 0 deletions doc/changelog.qbk
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

[master]

[heading 3.1 (not in boost)]

* Renamed `bincount` method to `size`
* Support for axes with only overflow and no underflow bin
* category axis now by default has bin for "other" input that does not fall
into the predefined categories, making it consistent with other axes
* NaN is now consistently put into overflow bin for all axes
* Eliminated warnings about safe internal conversions on MSVC
* Established a cpp house style with corresponding .clang-format file
* Better detection of Python library on all systems
* Improved code coverage by testing more input errors
* Raise ValueError instead of generic RuntimeError in Python on input errors

[heading 3.0 (not in boost)]

* Support for efficient adding of multiple histograms and scaling
Expand Down
12 changes: 5 additions & 7 deletions doc/guide.qbk
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,11 @@ This guide covers the basic and more advanced usage of the library. It is design

[section Introduction]

This library provides a templated [@https://en.wikipedia.org/wiki/Histogram histogram] class for multi-dimensional data. A histogram consists a number of non-overlapping cells in the data space, called *bins*. When a value tuple is passed to the histogram, the corresponding bin that envelopes the value tuple is found and a counter associated to the bin is incremented by one. Keeping the bin counts in memory for analysis requires fewer resources than keeping all the original value tuples around. If the bins are small enough[footnote What small enough means has to be decided case by case.], they still represent the original information in the data distribution. A histogram is therefore a useful lossy compression. It is also often used as a simple estimator for the [@https://en.wikipedia.org/wiki/Probability_density_function probability density function] of the input data. More complex density estimators exist, but histograms have the appeal that they are easy to reason about.
This library provides a templated [@https://en.wikipedia.org/wiki/Histogram histogram] class for multi-dimensional data. A histogram consists of a number of non-overlapping consecutive cells in data space, called *bins*. When a value is passed to the histogram, the corresponding bin that envelops the value is found and an associated counter is incremented. In large data sets, keeping the bin counts in memory for analysis requires fewer resources than keeping the original value tuples. If the bins are small enough[footnote What small enough means has to be decided case by case.], they still represent the original information in the data distribution. A histogram is therefore a useful lossy compression. It is also often used as a simple estimator for the [@https://en.wikipedia.org/wiki/Probability_density_function probability density function] of the input data. More complex density estimators exist, but histograms are easy to reason about.

Input for the histogram can be one- or multi-dimensional. In the multi-dimensional case, the input consist of tuples of values which belong together, describing different aspects of the same entity. A point in space is an example. You need three coordinate values to describe a point. The entity here is the point, and to fully characterize a point distribution in space you need three values and therefore a three-dimensional (3d) histogram.
Input for the histogram can be one- or multi-dimensional. In the multi-dimensional case, the input consists of tuples of values which belong together, describing different aspects of the same entity. A point in space is an example. You need three coordinate values to describe a point. The entity here is the point, and to fully characterize a point distribution in space you need three values and therefore a three-dimensional (3d) histogram. The advantage of using a 3d histogram over three separate 1d histograms, one for each coordinate, is that the 3d histogram is able to capture more information. For example, you could have a point distribution that looks like a checker board in three dimensions (a checker cube): high and low densities are alternating along each coordinate. Then the 1d histograms for each separate coordinate would look like flat distributions, completely hiding the complex structure, while the 3d histogram would retain the structure for further analysis.

The advantage of using a 3d histogram over three separate 1d histograms, one for each coordinate, is that the 3d histogram is able to capture more information. For example, you could have a point distribution that looks like a checker board in three dimensions (a checker cube): high and low densities are alternating along each coordinate. Then the 1d histograms for each separate coordinate would look like flat distributions, completely hiding the complex structure, while the 3d histogram would retain the structure for further analysis.

The term /histogram/ is usually strictly used for something with bins over continuous data. The histogram class in this library generalize this concept. It can also process categorical variables and it even allows for non-consecutive bins. There is no restriction to numbers as input. Any type can be fed into the histogram, if there is a specialized axis object that maps values of this type to a bin index. The only remaining restriction is that bins are non-overlapping, since there must be a unique mapping from input value to bin.
The term /histogram/ is usually strictly used for something with bins over discrete or continuous data. The histogram class can also process categorical variables and it even allows for non-consecutive bins if that is desired. There is no restriction to numbers as input. Any type can be fed into the histogram, if the user provides a specialized axis class that maps values of this type to a bin index. The only remaining restriction is that bins are non-overlapping, since there must be a unique mapping from input value to bin. The library is not able to automatically ensure this for user-provided axis classes, so the responsibility is on the implementer.

[endsect]

Expand Down Expand Up @@ -42,7 +40,7 @@ When you work with dynamic histograms, you can also create a sequence of axes at

[funcref boost::histogram::make_static_histogram make_static_histogram] cannot handle this case because a static histogram can only be constructed when the number and types of all axes are known already at compile time. While strictly speaking that is also true in this example, you could have filled the vector also at run-time, based on run-time user input.

[note Memory for bin counters is allocated lazily, because if the default storage policy [classref boost::histogram::adaptive_storage adaptive_storage] is used. Allocation is deferred to the first time, when input values are passed to the histogram. Therefore memory allocation exceptions are not thrown when the histogram is created, but possibly later. This gives you a chance to check how much memory the histogram will allocate and possible give a warning if that amount is excessively large. Use the method `histogram::bincount()` to see how many bins your axis layout requires. At the first fill, that many bytes will be allocated. The allocated amount of memory may grow further later when the capacity of the bin counters needs to grow.]
[note Memory for bin counters is allocated lazily if the default storage policy [classref boost::histogram::adaptive_storage adaptive_storage] is used. Allocation is deferred until the first time input values are passed to the histogram. Therefore memory allocation exceptions are not thrown when the histogram is created, but possibly later. This gives you a chance to check how much memory the histogram will allocate and possibly give a warning if that amount is excessively large. Use the method `histogram::size()` to see how many bins your axis layout requires. At the first fill, that many bytes will be allocated. The allocated amount of memory may grow further later when the capacity of the bin counters needs to grow.]

[endsect]

Expand Down Expand Up @@ -70,7 +68,7 @@ By default, additional under- and overflow bins are added automatically for each

We use an [classref boost::histogram::axis::integer integer axis] here, because the input values are integers and we want one bin for each eye value.

[note The [classref boost::histogram::axis::circular circular axis] never creates under- and overflow bins. The highest bin wraps around to the lowest bin and vice versa, so there is no possibility for overflow. Similarly, the [classref boost::histogram::axis::category category axis] comes without under- and overflow bins, because these terms have no meaning for categorical variables.]
[note The [classref boost::histogram::axis::circular circular axis] never creates under- and overflow bins. The highest bin wraps around to the lowest bin and vice versa, so there is no possibility for overflow. The [classref boost::histogram::axis::category category axis] comes only with an "overflow" bin, which counts all types of categorical input that was not recognized.]

[endsect]

Expand Down
6 changes: 3 additions & 3 deletions doc/rationale.qbk
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,15 @@ In a sense, [classref boost::histogram::adaptive_storage adaptive_storage] is th

[section:uoflow Under- and overflow bins]

Axis instances by default add extra bins that count values which fall below or above the range covered by the axis (for those types where that makes sense). These extra bins are called under- and overflow bins, respectively. The extra bins can be turned off individually for each axis to conserve memory, but it is generally recommended to keep them. The extra bins do not interfere with normal bin counting. On an axis with `n` bins, the first bin has the index `0`, the last bin `n-1`, while the under- and overflow bins are accessible at the indices `-1` and `n`, respectively.
Axis instances by default add extra bins that count values which fall below or above the range covered by the axis (for those types where that makes sense). These extra bins are called under- and overflow bins, respectively. The extra bins can be turned off individually for each axis to conserve memory, but it is generally recommended to have them. The extra bins do not interfere with normal bin counting. On an axis with `n` bins, the first bin has the index `0`, the last bin `n-1`, while the under- and overflow bins are accessible at the indices `-1` and `n`, respectively.

Under- and overflow bins are useful in one-dimensional histograms, and nearly essential in multi-dimensional histograms. Here are the advantages:

* No loss: The total sum over all bin counts is strictly equal to the number of times `fill(...)` was called. Even NaN values are counted, they end up in the underflow bin by convention.
* No loss: The total sum over all bin counts is strictly equal to the number of times the histogram was filled. Even NaN values are counted; they are put in the overflow bin by convention.

* Diagnosis: Unexpected extreme values show up in the extra bins, which otherwise may be overlooked.

* Reducibility: In multi-dimensional histograms, an out-of-range value along one axis may be paired with an in-range value along another axis. If under- and overflow bins are missing, such a value pair is lost completely. If you apply a `reduce` operation on a histogram, which removes somes axes by resummation of the bin counts, this would lead to distortions of the histogram along the remaining axes. When under- and overflow bins are present, the `reduce` operation always produces the same sub-histogram that would have been obtained if it was filled from scratch with the original data.
* Ability to reduce histograms: In multi-dimensional histograms, an out-of-range value along one axis may be paired with an in-range value along another axis. If under- and overflow bins are missing, such a value pair is lost completely. If you apply a `reduce` operation on a histogram, which removes some axes by summing counts over that dimension, this would lead to distortions of the histogram along the remaining axes. When under- and overflow bins are present, the `reduce` operation always produces a sub-histogram identical to the one that would be obtained if it were filled from scratch with the original data.

[endsect]

Expand Down
Loading

0 comments on commit 31d4204

Please sign in to comment.