Merge branch 'release/3.1'

boostorg · Jul 25, 2018 · 31d4204 · 31d4204
2 parents bfc437b + 93493ca
commit 31d4204
Show file tree

Hide file tree

Showing 36 changed files with 1,043 additions and 965 deletions.
diff --git a/appveyor.yml → .appveyor.yml b/appveyor.yml → .appveyor.yml
@@ -20,7 +20,7 @@ environment:
 test_script:
   # - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1'))
   - cd build
-  - cmake . -DBUILD_PYTHON=OFF -DBUILD_NUMPY_SUPPORT=OFF 
+  - cmake . -DBUILD_PYTHON=OFF -DBUILD_NUMPY=OFF
     -DBOOST_ROOT="%BOOST_ROOT%" -DBoost_USE_STATIC_LIBS="ON"
   - cmake --build .
   - ctest -V
diff --git a/.travis.yml b/.travis.yml
@@ -16,65 +16,52 @@ branches:
     - master
     - develop
 
-# addons:
-#   apt:
-#     sources: deadsnakes
-#     packages:
-#       - python2.7
-#       - python3.5
-#       - python3-pip
-#       - libpython2.7-dev
-#       - libpython3.5-dev
-
 matrix:
   include:
-    - os: linux # minimum gcc
-      env:
-        CC=gcc CXX=g++ PYTHON_VERSION=2.7
-        BUILD_PYTHON=OFF
-        BUILD_NUMPY=OFF
-        BUILD_SERIALIZATION=OFF
-    - os: linux # maximum gcc
-      env:
-        CC=gcc CXX=g++ PYTHON_VERSION=2.7
-        BUILD_PYTHON=ON
-        BUILD_NUMPY=ON
-        BUILD_SERIALIZATION=ON
-    - os: linux # maximum gcc
-      env:
-        CC=gcc CXX=g++ PYTHON_VERSION=3.6
-        BUILD_PYTHON=ON
-        BUILD_NUMPY=OFF
-        BUILD_SERIALIZATION=ON
-    - os: linux # maximum clang
-      env:
-        CC=clang CXX=clang++
-        BUILD_PYTHON=ON PYTHON_VERSION=2.7
-        BUILD_NUMPY=ON
-        BUILD_SERIALIZATION=ON
-    - os: linux # coverage gcc
-      env:
-        CC=gcc CXX=g++ GCOV=gcov PYTHON_VERSION=2.7
-        CMAKE_BUILD_TYPE=coverage
+    - os: linux # gcc minimum
+      env: PYVER=2.7 CC=gcc CXX=g++ PY=OFF NUMPY=OFF SERIAL=OFF
+    - os: linux # gcc py27 w/o numpy
+      env: PYVER=2.7 CC=gcc CXX=g++ PY=ON NUMPY=OFF SERIAL=ON
+    - os: linux # gcc py27
+      env: PYVER=2.7 CC=gcc CXX=g++ PY=ON NUMPY=ON SERIAL=ON
+    - os: linux # gcc py36
+      env: PYVER=3.6 CC=gcc CXX=g++ PY=ON NUMPY=ON SERIAL=ON
+    - os: linux # clang py36
+      env: PYVER=3.6 CC=clang CXX=clang++ PY=ON NUMPY=ON SERIAL=ON
+    - os: linux # coverage py27
+      env: PYVER=2.7 CC=gcc CXX=g++ GCOV=gcov
+    - os: osx # minimum osx Xcode 8.3
+      osx_image: xcode8.3
+      env: PY=OFF NUMPY=OFF SERIAL=OFF
+  allow_failures:
+    - os: osx
 
 git:
-  depth: 1
+  depth: 10
 
-# Install packages (pre-installed: pytest numpy)
+# Install packages (pre-installed: pytest)
 install:
-  - pyenv versions
-  - pyenv global ${PYTHON_VERSION}
+  - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
+      export PATH="/usr/local/opt/python/libexec/bin:$PATH";
+    else
+      pyenv versions;
+      pyenv global ${PYVER};
+    fi
+
+  - python --version
+  - python build/get_python_include.py
+  - python build/get_python_library.py
 
-  - pip install --user numpy
+  - pip install --upgrade numpy # update numpy to avoid segfaults later
   - source build/travis_install_boost.sh
 
-  - if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
-      pip install --user cpp-coveralls urllib3[secure];
+  - if [ -n "$GCOV" ]; then
+      pip install cpp-coveralls urllib3[secure];
     fi
 
 script:
   - cd build
-  - if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
+  - if [ -n "$GCOV" ]; then
       cmake . -DBOOST_ROOT=${BOOST_DIR}
               -DBUILD_PYTHON=OFF
               -DBUILD_SERIALIZATION=ON
@@ -95,17 +82,17 @@ script:
       ctest;
     else
       cmake . -DBOOST_ROOT=${BOOST_DIR}
-              -DBUILD_PYTHON=${BUILD_PYTHON}
-              -DBUILD_NUMPY=${BUILD_NUMPY}
-              -DBUILD_SERIALIZATION=${BUILD_SERIALIZATION}
-              -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} &&
+              -DBUILD_PYTHON=${PY}
+              -DBUILD_NUMPY=${NUMPY}
+              -DBUILD_SERIALIZATION=${SERIAL}
+              -DCMAKE_BUILD_TYPE=Debug &&
       make -j2 &&
       ctest -V;
     fi
 
 # Calculate coverage
 after_success:
-  if [ "${CMAKE_BUILD_TYPE}" = "coverage" ]; then
+  if [ -n "$GCOV" ]; then
     coveralls -r .. -b . --verbose --exclude ${TRAVIS_BUILD_DIR}/deps  --gcov=`which ${GCOV}` --gcov-options '\-lpbc';
   fi
 

diff --git a/build/CMakeLists.txt b/build/CMakeLists.txt
@@ -50,11 +50,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
 endif()
 
 if(TRACE_ALLOCS)
-  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    add_compile_options(/DBOOST_HISTOGRAM_TRACE_ALLOCS)
-  else()
-    add_compile_options(-DBOOST_HISTOGRAM_TRACE_ALLOCS)
-  endif()
+  add_definitions(-DBOOST_HISTOGRAM_TRACE_ALLOCS)
 endif()
 
 if(BUILD_PYTHON)
@@ -109,9 +105,9 @@ else()
   # serialization only required for tests
   if (BUILD_SERIALIZATION)
     find_package(Boost ${MIN_BOOST_VERSION} REQUIRED serialization)
-    add_definitions(-DHAVE_SERIALIZATION)
   else ()
     find_package(Boost ${MIN_BOOST_VERSION} REQUIRED)
+    add_definitions(-DBOOST_HISTOGRAM_NO_SERIALIZATION)
   endif()
   set(LIBRARIES ${Boost_LIBRARIES})
 endif()
@@ -191,12 +187,6 @@ file(GLOB_RECURSE
      ../test/*_test.cpp ../include/*.hpp
      )
 
-add_custom_target(clf
-  COMMAND clang-format
-  -i
-  ${ALL_SOURCE_FILES}
-  )
-
 get_property(INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
 set(TIDY_INCLUDE)
 foreach(x ${INCLUDE_DIRS})

diff --git a/build/get_python_library.py b/build/get_python_library.py
@@ -1,19 +1,37 @@
 from distutils import sysconfig
-import os.path
+import os
 import sys
-import glob
+from pprint import pprint
+from glob import glob
 pj = os.path.join
 
-pyver = sysconfig.get_config_var('VERSION')
-getvar = sysconfig.get_config_var
+LIB_KEYS = ('LIBDEST', 'LIBDIR', 'LIBPL')
 
-libname = "python" + pyver
+if sys.platform == "darwin":
+    so_ext = "dylib"
+elif sys.platform.startswith("linux"):
+    so_ext = "so"
+else:
+    so_ext = "dll"
 
-for libvar in ('LIBDIR', 'LIBPL'):
-    for ext in ('so', 'dylib', 'dll'):
-        match = pj(getvar(libvar), "*" + libname + "*." + ext)
-        lib = glob.glob(match)
-        if lib:
-            assert len(lib) == 1
-            sys.stdout.write(lib[0])
-            raise SystemExit
+config = sysconfig.get_config_vars()
+
+library = "*python%s*%s" % (sysconfig.get_python_version(), so_ext) 
+for libpath in LIB_KEYS:
+    p = pj(config[libpath], library)
+    cand = glob(p)
+    if cand and len(cand) == 1:
+        sys.stdout.write(cand[0])
+        raise SystemExit
+
+pprint("no library found, dumping library pattern, config, and directory contents:")
+pprint(library)
+pprint(config)
+
+for libpath in LIB_KEYS:
+    pprint(libpath)
+    p = config[libpath]
+    if os.path.exists(p):
+        pprint(os.listdir(p))
+
+raise SystemExit(1)
diff --git a/build/make_user_config.py b/build/make_user_config.py
@@ -0,0 +1,9 @@
+import sys
+from distutils import sysconfig
+
+s = "using python : {version} : {prefix} : {inc} ;\n".format(
+                 version=sysconfig.get_python_version(),
+                 prefix=sysconfig.get_config_var("prefix"),
+                 inc=sysconfig.get_python_inc())
+
+sys.stdout.write(s)
diff --git a/build/travis_install_boost.sh b/build/travis_install_boost.sh
@@ -7,14 +7,16 @@ if [[ -z "${TRAVIS_BUILD_DIR}" ]]; then
 fi
 PYVER=$(python -c 'import sys; sys.stdout.write("%i"%sys.version_info.major)')
 BOOST_DIR=${TRAVIS_BUILD_DIR}/deps/boost-${BOOST_VERSION}-py${PYVER}
+PROJECT_DIR=$(pwd)
 echo "Boost: ${BOOST_DIR}"
 mkdir -p ${BOOST_DIR}
-BOOSTRAP_PATCH_REGEX="s|\( *using python.*\);|\1: $(python build/get_python_include.py) ;|"
 if [[ -z "$(ls -A ${BOOST_DIR})" ]]; then
   BOOST_URL="http://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/boost_${BOOST_VERSION//\./_}.tar.gz"
   { wget --quiet -O - ${BOOST_URL} | tar --strip-components=1 -xz -C ${BOOST_DIR}; } || exit 1
-  (cd ${BOOST_DIR} && ./bootstrap.sh > /dev/null && \
-   sed -i "${BOOSTRAP_PATCH_REGEX}" project-config.jam && \
-   ./b2 install --prefix=${BOOST_DIR} --with-serialization --with-iostreams --with-python | grep -v -e common\.copy -e common\.mkdir)
+  ( cd ${BOOST_DIR}
+    ./bootstrap.sh > /dev/null
+    python ${PROJECT_DIR}/build/make_user_config.py > $HOME/user-config.jam
+    cat $HOME/user-config.jam
+    (./b2 install --prefix=${BOOST_DIR} --with-serialization --with-iostreams --with-python | grep -v -e common\.copy -e common\.mkdir) )
 fi
-ls ${BOOST_DIR}/lib | grep libboost
+ls ${BOOST_DIR}/lib | grep libboost || exit 1
diff --git a/doc/changelog.qbk b/doc/changelog.qbk
@@ -2,6 +2,19 @@
 
 [master]
 
+[heading 3.1 (not in boost)]
+
+* Renamed `bincount` method to `size`
+* Support for axes with only overflow and no underflow bin
+* category axis now by default has bin for "other" input that does not fall
+  into the predefined categories, making it consistent with other axes
+* NaN is now consistently put into overflow bin for all axes
+* Eliminated warnings about safe internal conversions on MSVC
+* Established a cpp house style with corresponding .clang-format file
+* Better detection of Python library on all systems
+* Improved code coverage by testing more input errors
+* Raise ValueError instead of generic RuntimeError in Python on input errors
+
 [heading 3.0 (not in boost)]
 
 * Support for efficient adding of multiple histograms and scaling

diff --git a/doc/guide.qbk b/doc/guide.qbk
@@ -4,13 +4,11 @@ This guide covers the basic and more advanced usage of the library. It is design
 
 [section Introduction]
 
-This library provides a templated [@https://en.wikipedia.org/wiki/Histogram histogram] class for multi-dimensional data. A histogram consists a number of non-overlapping cells in the data space, called *bins*. When a value tuple is passed to the histogram, the corresponding bin that envelopes the value tuple is found and a counter associated to the bin is incremented by one. Keeping the bin counts in memory for analysis requires fewer resources than keeping all the original value tuples around. If the bins are small enough[footnote What small enough means has to be decided case by case.], they still represent the original information in the data distribution. A histogram is therefore a useful lossy compression. It is also often used as a simple estimator for the [@https://en.wikipedia.org/wiki/Probability_density_function probability density function] of the input data. More complex density estimators exist, but histograms have the appeal that they are easy to reason about.
+This library provides a templated [@https://en.wikipedia.org/wiki/Histogram histogram] class for multi-dimensional data. A histogram consists of a number of non-overlapping consecutive cells in data space, called *bins*. When a value is passed to the histogram, the corresponding bin that envelops the value is found and an associated counter is incremented. In large data sets, keeping the bin counts in memory for analysis requires fewer resources than keeping the original value tuples. If the bins are small enough[footnote What small enough means has to be decided case by case.], they still represent the original information in the data distribution. A histogram is therefore a useful lossy compression. It is also often used as a simple estimator for the [@https://en.wikipedia.org/wiki/Probability_density_function probability density function] of the input data. More complex density estimators exist, but histograms are easy to reason about.
 
-Input for the histogram can be one- or multi-dimensional. In the multi-dimensional case, the input consist of tuples of values which belong together, describing different aspects of the same entity. A point in space is an example. You need three coordinate values to describe a point. The entity here is the point, and to fully characterize a point distribution in space you need three values and therefore a three-dimensional (3d) histogram.
+Input for the histogram can be one- or multi-dimensional. In the multi-dimensional case, the input consists of tuples of values which belong together, describing different aspects of the same entity. A point in space is an example. You need three coordinate values to describe a point. The entity here is the point, and to fully characterize a point distribution in space you need three values and therefore a three-dimensional (3d) histogram. The advantage of using a 3d histogram over three separate 1d histograms, one for each coordinate, is that the 3d histogram is able to capture more information. For example, you could have a point distribution that looks like a checkerboard in three dimensions (a checker cube): high and low densities are alternating along each coordinate. Then the 1d histograms for each separate coordinate would look like flat distributions, completely hiding the complex structure, while the 3d histogram would retain the structure for further analysis.
 
-The advantage of using a 3d histogram over three separate 1d histograms, one for each coordinate, is that the 3d histogram is able to capture more information. For example, you could have a point distribution that looks like a checker board in three dimensions (a checker cube): high and low densities are alternating along each coordinate. Then the 1d histograms for each separate coordinate would look like flat distributions, completely hiding the complex structure, while the 3d histogram would retain the structure for further analysis.
-
-The term /histogram/ is usually strictly used for something with bins over continuous data. The histogram class in this library generalize this concept. It can also process categorical variables and it even allows for non-consecutive bins. There is no restriction to numbers as input. Any type can be fed into the histogram, if there is a specialized axis object that maps values of this type to a bin index. The only remaining restriction is that bins are non-overlapping, since there must be a unique mapping from input value to bin.
+The term /histogram/ is usually strictly used for something with bins over discrete or continuous data. The histogram class can also process categorical variables and it even allows for non-consecutive bins if that is desired. There is no restriction to numbers as input. Any type can be fed into the histogram, if the user provides a specialized axis class that maps values of this type to a bin index. The only remaining restriction is that bins are non-overlapping, since there must be a unique mapping from input value to bin. The library is not able to automatically ensure this for user-provided axis classes, so the responsibility is on the implementer.
 
 [endsect]
 
@@ -42,7 +40,7 @@ When you work with dynamic histograms, you can also create a sequence of axes at
 
 [funcref boost::histogram::make_static_histogram make_static_histogram] cannot handle this case because a static histogram can only be constructed when the number and types of all axes are known already at compile time. While strictly speaking that is also true in this example, you could have filled the vector also at run-time, based on run-time user input.
 
-[note Memory for bin counters is allocated lazily, because if the default storage policy [classref boost::histogram::adaptive_storage adaptive_storage] is used. Allocation is deferred to the first time, when input values are passed to the histogram. Therefore memory allocation exceptions are not thrown when the histogram is created, but possibly later. This gives you a chance to check how much memory the histogram will allocate and possible give a warning if that amount is excessively large. Use the method `histogram::bincount()` to see how many bins your axis layout requires. At the first fill, that many bytes will be allocated. The allocated amount of memory may grow further later when the capacity of the bin counters needs to grow.]
+[note Memory for bin counters is allocated lazily if the default storage policy [classref boost::histogram::adaptive_storage adaptive_storage] is used. Allocation is deferred until the first time input values are passed to the histogram. Therefore memory allocation exceptions are not thrown when the histogram is created, but possibly later. This gives you a chance to check how much memory the histogram will allocate and possibly give a warning if that amount is excessively large. Use the method `histogram::size()` to see how many bins your axis layout requires. At the first fill, that many bytes will be allocated. The allocated amount of memory may grow further later when the capacity of the bin counters needs to grow.]
 
 [endsect]
 
@@ -70,7 +68,7 @@ By default, additional under- and overflow bins are added automatically for each
 
 We use an [classref boost::histogram::axis::integer integer axis] here, because the input values are integers and we want one bin for each eye value.
 
-[note The [classref boost::histogram::axis::circular circular axis] never creates under- and overflow bins. The highest bin wraps around to the lowest bin and vice versa, so there is no possibility for overflow. Similarly, the [classref boost::histogram::axis::category category axis] comes without under- and overflow bins, because these terms have no meaning for categorical variables.]
+[note The [classref boost::histogram::axis::circular circular axis] never creates under- and overflow bins. The highest bin wraps around to the lowest bin and vice versa, so there is no possibility for overflow. The [classref boost::histogram::axis::category category axis] comes only with an "overflow" bin, which counts all types of categorical input that was not recognized.]
 
 [endsect]
 

diff --git a/doc/rationale.qbk b/doc/rationale.qbk
@@ -88,15 +88,15 @@ In a sense, [classref boost::histogram::adaptive_storage adaptive_storage] is th
 
 [section:uoflow Under- and overflow bins]
 
-Axis instances by default add extra bins that count values which fall below or above the range covered by the axis (for those types where that makes sense). These extra bins are called under- and overflow bins, respectively. The extra bins can be turned off individually for each axis to conserve memory, but it is generally recommended to keep them. The extra bins do not interfere with normal bin counting. On an axis with `n` bins, the first bin has the index `0`, the last bin `n-1`, while the under- and overflow bins are accessible at the indices `-1` and `n`, respectively.
+Axis instances by default add extra bins that count values which fall below or above the range covered by the axis (for those types where that makes sense). These extra bins are called under- and overflow bins, respectively. The extra bins can be turned off individually for each axis to conserve memory, but it is generally recommended to have them. The extra bins do not interfere with normal bin counting. On an axis with `n` bins, the first bin has the index `0`, the last bin `n-1`, while the under- and overflow bins are accessible at the indices `-1` and `n`, respectively.
 
 Under- and overflow bins are useful in one-dimensional histograms, and nearly essential in multi-dimensional histograms. Here are the advantages:
 
-* No loss: The total sum over all bin counts is strictly equal to the number of times `fill(...)` was called. Even NaN values are counted, they end up in the underflow bin by convention.
+* No loss: The total sum over all bin counts is strictly equal to the number of times the histogram was filled. Even NaN values are counted; they are put in the overflow bin by convention.
 
 * Diagnosis: Unexpected extreme values show up in the extra bins, which otherwise may be overlooked.
 
-* Reducibility: In multi-dimensional histograms, an out-of-range value along one axis may be paired with an in-range value along another axis. If under- and overflow bins are missing, such a value pair is lost completely. If you apply a `reduce` operation on a histogram, which removes somes axes by resummation of the bin counts, this would lead to distortions of the histogram along the remaining axes. When under- and overflow bins are present, the `reduce` operation always produces the same sub-histogram that would have been obtained if it was filled from scratch with the original data.
+* Ability to reduce histograms: In multi-dimensional histograms, an out-of-range value along one axis may be paired with an in-range value along another axis. If under- and overflow bins are missing, such a value pair is lost completely. If you apply a `reduce` operation on a histogram, which removes some axes by summing counts over those dimensions, this would lead to distortions of the histogram along the remaining axes. When under- and overflow bins are present, the `reduce` operation always produces a sub-histogram identical to the one that would have been obtained if it had been filled from scratch with the original data.
 
 [endsect]