diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 87c0d967a..1d277d431 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -39,6 +40,7 @@ RunParams::RunParams(int argc, char** argv) size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), num_parts(10), + part_type(PartType::Even), gpu_stream(1), gpu_block_sizes(), pf_tol(0.1), @@ -89,6 +91,97 @@ RunParams::~RunParams() } +/* + ******************************************************************************* + * + * Print all run params data to given output stream. + * + ******************************************************************************* + */ +std::vector RunParams::getPartition(Index_type len, Index_type num_parts) const +{ + std::vector parts; + + parts.reserve(num_parts+1); + + parts.emplace_back(0); + + switch ( (len > num_parts && num_parts > 1) + ? part_type : PartType::Even ) { + case PartType::Even: + { + for (Index_type p = 1; p < num_parts; ++p) { + + parts.emplace_back((len/num_parts)*p + + (len%num_parts)*p / num_parts); + } + } break; + + case PartType::Geometric: + { + auto geo_sum = [](double a, double r, double n) { + // sum of geometric series + // for i in [0, n), a*pow(r, i) + return a * (1.0 - std::pow(r, n)) / (1.0 - r); + }; + + auto geo_solve_for_r = [&](double sum, double a, double n) + { + double max_r = std::pow(sum/a, 1.0 / (n-1.0)); + double min_r = 1.0; + + double r = (max_r + min_r) / 2.0; + double diff = geo_sum(a, r, n) - sum; + + constexpr double tolerance = 1.0; + constexpr size_t max_iter = 1000; + + // use bisection to find r + for (size_t iter = 0; + iter < max_iter && (diff < 0.0 || diff > tolerance); + ++iter) { + + if (diff > 0.0) { + max_r = r; + } else { + min_r = r; + } + + r = (max_r + min_r) / 2.0; + diff = geo_sum(a, r, n) - sum; + } + + return r; + }; + + constexpr double a = 1.0; + double r = geo_solve_for_r(len, a, num_parts); + + for (Index_type p = 1; p < 
num_parts; ++p) { + + Index_type val = static_cast(std::floor(geo_sum(a, r, p))); + + if (val > 0 && val < len) { + parts.emplace_back(val); + } else { + getCout() << "RunParams::getPartition: Geometric failed to generate partition" << std::endl; + break; + } + } + + } break; + default: + { + getCout() << "RunParams::getPartition: unknown part_type" << std::endl; + } break; + } + + parts.emplace_back(len); + + return parts; +} + + /* ******************************************************************************* * @@ -118,6 +211,7 @@ void RunParams::print(std::ostream& str) const str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; str << "\n num_parts = " << num_parts; + str << "\n part_type = " << PartTypeToStr(part_type); str << "\n gpu stream = " << ((gpu_stream == 0) ? "0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -428,6 +522,32 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--part_type") ) { + + bool got_someting = false; + i++; + if ( i < argc ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + } else { + for (int ipt = 0; ipt < static_cast(PartType::NumPartTypes); ++ipt) { + PartType pt = static_cast(ipt); + if (PartTypeToStr(pt) == opt) { + got_someting = true; + part_type = pt; + break; + } + } + if (!got_someting) { + getCout() << "\nBad input:" + << " must give a valid partition type" + << std::endl; + input_state = BadInput; + } + } + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -605,6 +725,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } } + } else if ( std::string(argv[i]) == std::string("--tunings") || std::string(argv[i]) == std::string("-t") ) { @@ -984,6 +1105,12 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --num_parts 100 (breaks *_PARTED 
/*!
 * \brief Enumeration indicating how to separate partitions
 */
enum PartType {
  Even,         /*!< parts are sized as evenly as possible (the default) */
  Geometric,    /*!< part sizes follow a geometric series (fixed ratio of sizes) */

  NumPartTypes  /*!< number of partition types; not a valid type, keep last */
};

/*!
 * \brief Translate PartType enum value to string
 *
 * Returns the enumerator's name ("Even", "Geometric"), or "Unknown" for
 * NumPartTypes and any value that is not a named partition type.
 */
static std::string PartTypeToStr(PartType pt)
{
  // No default label on purpose: compilers can then warn when a new
  // enumerator is added without a matching case here.
  switch (pt) {
    case PartType::Even:
      return "Even";
    case PartType::Geometric:
      return "Geometric";
    case PartType::NumPartTypes:
      break;
  }
  return "Unknown";
}
*/ @@ -217,6 +249,7 @@ class RunParams { Size_type data_alignment; Index_type num_parts; /*!< number of parts used in parted kernels (input option) */ + PartType part_type; /*!< how the partition sizes are generated (input option) */ int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ diff --git a/src/stream/TRIAD_PARTED_FUSED.cpp b/src/stream/TRIAD_PARTED_FUSED.cpp index 38b4f204d..5c0011305 100644 --- a/src/stream/TRIAD_PARTED_FUSED.cpp +++ b/src/stream/TRIAD_PARTED_FUSED.cpp @@ -40,14 +40,7 @@ TRIAD_PARTED_FUSED::TRIAD_PARTED_FUSED(const RunParams& params) ( static_cast(getDefaultProblemSize()) / getActualProblemSize() ); - m_parts.reserve(num_parts+1); - m_parts.emplace_back(0); - for (Index_type p = 1; p < num_parts; ++p) { - // use evenly spaced parts for now - m_parts.emplace_back((getActualProblemSize()/num_parts)*p + - (getActualProblemSize()%num_parts)*p / num_parts); - } - m_parts.emplace_back(getActualProblemSize()); + m_parts = params.getPartition(getActualProblemSize(), num_parts); setUsesFeature( Workgroup );