Skip to content

Commit

Permalink
Merge pull request seqan#223 from smehringer/update
Browse files Browse the repository at this point in the history
[FEATURE] update hibf lib and introduce --relaxed-fpr option.
  • Loading branch information
eseiler authored Oct 27, 2023
2 parents 5e5e428 + 5f80522 commit 7823287
Show file tree
Hide file tree
Showing 12 changed files with 102 additions and 71 deletions.
2 changes: 1 addition & 1 deletion lib/hibf
Submodule hibf updated 34 files
+5 −1 .clang-tidy
+1 −1 README.md
+6 −1 include/hibf/build/bin_size_in_bits.hpp
+54 −21 include/hibf/config.hpp
+2 −0 include/hibf/contrib/robin_hood.hpp
+1 −1 include/hibf/hierarchical_interleaved_bloom_filter.hpp
+24 −24 include/hibf/interleaved_bloom_filter.hpp
+5 −0 include/hibf/layout/graph.hpp
+70 −9 include/hibf/layout/hierarchical_binning.hpp
+27 −0 include/hibf/misc/divide_and_ceil.hpp
+2 −2 include/hibf/misc/insert_iterator.hpp
+18 −8 src/build/construct_ibf.cpp
+2 −1 src/build/insert_into_ibf.cpp
+9 −2 src/config.cpp
+10 −6 src/hierarchical_interleaved_bloom_filter.cpp
+2 −1 src/interleaved_bloom_filter.cpp
+2 −2 src/layout/compute_layout.cpp
+10 −10 src/layout/graph.cpp
+25 −32 src/layout/hierarchical_binning.cpp
+9 −4 src/layout/simple_binning.cpp
+5 −4 test/performance/example/example_benchmark.cpp
+3 −2 test/performance/hibf/hierarchical_interleaved_bloom_filter_benchmark.cpp
+2 −2 test/performance/ibf/binning_bitvector_benchmark.cpp
+2 −1 test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
+2 −2 test/snippet/hibf/hibf_construction.cpp
+1 −1 test/snippet/readme.cpp
+5 −0 test/unit/hibf/build/CMakeLists.txt
+28 −0 test/unit/hibf/build/bin_size_in_bits_test.cpp
+34 −13 test/unit/hibf/config_test.cpp
+5 −4 test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp
+12 −1 test/unit/hibf/layout/graph_test.cpp
+6 −6 test/unit/hibf/layout/hierarchical_binning_test.cpp
+2 −1 test/unit/hibf/sketch/hyperloglog_test.cpp
+2 −1 util/fpr_correction_check.cpp
3 changes: 2 additions & 1 deletion src/layout/determine_best_number_of_technical_bins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ determine_best_number_of_technical_bins(chopper::configuration & config)
file_out << "## ### Parameters ###\n"
<< "## number of user bins = " << config.hibf_config.number_of_user_bins << '\n'
<< "## number of hash functions = " << config.hibf_config.number_of_hash_functions << '\n'
<< "## false positive rate = " << config.hibf_config.maximum_false_positive_rate << '\n';
<< "## maximum false positive rate = " << config.hibf_config.maximum_fpr << '\n'
<< "## relaxed false positive rate = " << config.hibf_config.relaxed_fpr << '\n';
hibf_statistics::print_header_to(file_out, config.output_verbose_statistics);

double best_expected_HIBF_query_cost{std::numeric_limits<double>::infinity()};
Expand Down
14 changes: 6 additions & 8 deletions src/layout/hibf_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ hibf_statistics::hibf_statistics(configuration const & config_,
std::vector<size_t> const & kmer_counts) :
config{config_},
fp_correction{
seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_false_positive_rate,
seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_fpr,
.hash_count = config_.hibf_config.number_of_hash_functions,
.t_max = config_.hibf_config.tmax})},
sketches{sketches_},
Expand Down Expand Up @@ -187,9 +187,8 @@ void hibf_statistics::print_summary_to(size_t & t_max_64_memory, std::ostream &
stream /* tmax */ << config.hibf_config.tmax
<< '\t'
/* c_tmax */
<< chopper::layout::ibf_query_cost::interpolated(
config.hibf_config.tmax,
config.hibf_config.maximum_false_positive_rate)
<< chopper::layout::ibf_query_cost::interpolated(config.hibf_config.tmax,
config.hibf_config.maximum_fpr)
<< '\t'
/* l_tmax */
<< expected_HIBF_query_cost
Expand Down Expand Up @@ -260,7 +259,7 @@ size_t hibf_statistics::total_hibf_size_in_byte()
}

size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = total_size});

Expand Down Expand Up @@ -349,7 +348,7 @@ size_t hibf_statistics::total_hibf_size_in_byte()
std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const
{
size_t const size_in_bits =
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_false_positive_rate,
seqan::hibf::build::bin_size_in_bits({.fpr = config.hibf_config.maximum_fpr,
.hash_count = config.hibf_config.number_of_hash_functions,
.elements = number_of_kmers_to_be_stored});
return byte_size_to_formatted_str(size_in_bits / 8);
Expand Down Expand Up @@ -486,8 +485,7 @@ void hibf_statistics::compute_total_query_cost(level & curr_level)

// Add cost of querying the current IBF
// (how costly is querying number_of_tbs (e.g. 128 tbs) compared to 64 tbs given the current FPR)
curr_level.current_query_cost +=
ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_false_positive_rate);
curr_level.current_query_cost += ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_fpr);

// Add costs of querying the HIBF for each kmer in this level.
total_query_cost += curr_level.current_query_cost * level_kmer_count;
Expand Down
12 changes: 11 additions & 1 deletion src/set_up_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,23 @@ void set_up_parser(sharg::parser & parser, configuration & config)
"This parameter is needed to correctly estimate the index size when computing the layout."});

parser.add_option(
config.hibf_config.maximum_false_positive_rate,
config.hibf_config.maximum_fpr,
sharg::config{.short_id = '\0',
.long_id = "fpr",
.description =
"The false positive rate you aim for when building the HIBF from the resulting layout. "
"This parameter is needed to correctly estimate the index size when computing the layout."});

// Registers the `--relaxed-fpr` long option (no short id) and binds it to
// `config.hibf_config.relaxed_fpr`. The description is the user-facing help text.
parser.add_option(
    config.hibf_config.relaxed_fpr,
    sharg::config{.short_id = '\0',
                  .long_id = "relaxed-fpr",
                  .description =
                      "The relaxed false positive rate (fpr) for parts that are not critical for the maximum fpr. "
                      "Choosing a higher relaxed FPR can lower the memory requirement but increases the runtime. "
                      "Experiments show that the decrease in memory is significant while the runtime suffers "
                      "only slightly. We still guarantee that we never exceed the maximum fpr (--fpr)."});

parser.add_option(
config.output_filename,
sharg::config{.short_id = '\0', .long_id = "output", .description = "A file name for the resulting layout."});
Expand Down
23 changes: 17 additions & 6 deletions src/util/display_layout/compute_ibf_size.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,37 @@
#include <hibf/build/build_data.hpp>
#include <hibf/contrib/robin_hood.hpp>
#include <hibf/layout/graph.hpp>
#include <hibf/misc/divide_and_ceil.hpp>

/*!\brief Inserts every k-mer of `kmers` into `parent_kmers`.
 * \param[in,out] parent_kmers The set that receives the k-mers; values already present stay unique.
 * \param[in]     kmers        The k-mers to add; this set is not modified.
 *
 * Declared `inline` because the definition lives in a header: without it, including this header from
 * more than one translation unit of the same binary violates the one-definition rule.
 */
inline void update_parent_kmers(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
                                robin_hood::unordered_flat_set<uint64_t> const & kmers)
{
    parent_kmers.insert(kmers.begin(), kmers.end());
}

// this function is copied from seqan::hibf::build::construct_ibf
// it needs to be held consistent in order to compute the correct sizes
size_t compute_ibf_size(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
robin_hood::unordered_flat_set<uint64_t> & kmers,
size_t const number_of_bins,
seqan::hibf::layout::graph::node const & ibf_node,
seqan::hibf::build::build_data & data,
size_t const current_hibf_level)
{
size_t const kmers_per_bin = std::ceil(static_cast<double>(kmers.size()) / number_of_bins);
size_t const bin_size =
std::ceil(seqan::hibf::build::bin_size_in_bits({.fpr = data.config.maximum_false_positive_rate,
.hash_count = data.config.number_of_hash_functions,
.elements = kmers_per_bin})
* data.fpr_correction[number_of_bins]);
bool const max_bin_is_merged = ibf_node.max_bin_is_merged();
assert(!max_bin_is_merged || number_of_bins == 1u); // merged max bin implies (=>) number of bins == 1

size_t const kmers_per_bin = seqan::hibf::divide_and_ceil(kmers.size(), number_of_bins);
double const fpr = max_bin_is_merged ? data.config.relaxed_fpr : data.config.maximum_fpr;

size_t const bin_bits{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, //
.hash_count = data.config.number_of_hash_functions,
.elements = kmers_per_bin})};
// data.fpr_correction[1] == 1.0, but we can avoid floating point operations with the ternary.
// Check number_of_bins instead of max_bin_is_merged, because split bins can also occupy only one technical bin.
size_t const bin_size{number_of_bins == 1u
? bin_bits
: static_cast<size_t>(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))};

size_t const ibf_size = ibf_node.number_of_technical_bins * bin_size;

Expand Down
6 changes: 2 additions & 4 deletions src/util/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,8 @@ void execute_general_stats(config const & cfg)
seqan::hibf::build::build_data data{.config = hibf_config, .ibf_graph = {hibf_layout}};
seqan::hibf::layout::graph::node const & root_node = data.ibf_graph.root;
size_t const t_max{root_node.number_of_technical_bins};
data.fpr_correction =
seqan::hibf::layout::compute_fpr_correction({.fpr = hibf_config.maximum_false_positive_rate,
.hash_count = hibf_config.number_of_hash_functions,
.t_max = t_max});
data.fpr_correction = seqan::hibf::layout::compute_fpr_correction(
{.fpr = hibf_config.maximum_fpr, .hash_count = hibf_config.number_of_hash_functions, .t_max = t_max});

// Get stats
hierarchical_stats(stats, root_node, data);
Expand Down
43 changes: 23 additions & 20 deletions test/api/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ chopper::configuration generate_config()

config.hibf_config.number_of_user_bins = 123456789;
config.hibf_config.number_of_hash_functions = 4;
config.hibf_config.maximum_false_positive_rate = 0.0001;
config.hibf_config.maximum_fpr = 0.0001;
config.hibf_config.relaxed_fpr = 0.2;
config.hibf_config.threads = 31;
config.hibf_config.sketch_bits = 8;
config.hibf_config.tmax = 128;
Expand All @@ -42,24 +43,25 @@ namespace chopper

bool operator==(chopper::configuration const & lhs, chopper::configuration const & rhs)
{
return lhs.data_file == rhs.data_file && //
lhs.debug == rhs.debug && //
lhs.sketch_directory == rhs.sketch_directory && //
lhs.k == rhs.k && //
lhs.disable_sketch_output == rhs.disable_sketch_output && //
lhs.precomputed_files == rhs.precomputed_files && //
lhs.output_filename == rhs.output_filename && //
lhs.determine_best_tmax == rhs.determine_best_tmax && //
lhs.force_all_binnings == rhs.force_all_binnings && //
lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && //
lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && //
lhs.hibf_config.maximum_false_positive_rate == rhs.hibf_config.maximum_false_positive_rate && //
lhs.hibf_config.threads == rhs.hibf_config.threads && //
lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && //
lhs.hibf_config.tmax == rhs.hibf_config.tmax && //
lhs.hibf_config.alpha == rhs.hibf_config.alpha && //
lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && //
lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && //
return lhs.data_file == rhs.data_file && //
lhs.debug == rhs.debug && //
lhs.sketch_directory == rhs.sketch_directory && //
lhs.k == rhs.k && //
lhs.disable_sketch_output == rhs.disable_sketch_output && //
lhs.precomputed_files == rhs.precomputed_files && //
lhs.output_filename == rhs.output_filename && //
lhs.determine_best_tmax == rhs.determine_best_tmax && //
lhs.force_all_binnings == rhs.force_all_binnings && //
lhs.hibf_config.number_of_user_bins == rhs.hibf_config.number_of_user_bins && //
lhs.hibf_config.number_of_hash_functions == rhs.hibf_config.number_of_hash_functions && //
lhs.hibf_config.maximum_fpr == rhs.hibf_config.maximum_fpr && //
lhs.hibf_config.relaxed_fpr == rhs.hibf_config.relaxed_fpr && //
lhs.hibf_config.threads == rhs.hibf_config.threads && //
lhs.hibf_config.sketch_bits == rhs.hibf_config.sketch_bits && //
lhs.hibf_config.tmax == rhs.hibf_config.tmax && //
lhs.hibf_config.alpha == rhs.hibf_config.alpha && //
lhs.hibf_config.max_rearrangement_ratio == rhs.hibf_config.max_rearrangement_ratio && //
lhs.hibf_config.disable_estimate_union == rhs.hibf_config.disable_estimate_union && //
lhs.hibf_config.disable_rearrangement == rhs.hibf_config.disable_rearrangement;
}

Expand Down Expand Up @@ -93,7 +95,8 @@ static constexpr std::string_view config_string_view{"@CHOPPER_CONFIG\n"
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_false_positive_rate\": 0.0001,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.2,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
Expand Down
4 changes: 2 additions & 2 deletions test/api/display_layout/compute_ibf_size_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TEST(compute_ibf_size_test, merged_bin_is_max_bin)
seqan::hibf::build::build_data data{
.config = {.number_of_user_bins = 123,
.number_of_hash_functions = hash,
.maximum_false_positive_rate = fpr,
.maximum_fpr = fpr,
.threads = 1,
.tmax = tmax},
.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})};
Expand Down Expand Up @@ -79,7 +79,7 @@ TEST(compute_ibf_size_test, split_bin_is_max_bin)
seqan::hibf::build::build_data data{
.config = {.number_of_user_bins = 123,
.number_of_hash_functions = hash,
.maximum_false_positive_rate = fpr,
.maximum_fpr = fpr,
.threads = 1,
.tmax = tmax},
.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = fpr, .hash_count = hash, .t_max = tmax})};
Expand Down
8 changes: 5 additions & 3 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ TEST(execute_test, few_ubs)
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 8,\n"
"@ \"number_of_hash_functions\": 2,\n"
"@ \"maximum_false_positive_rate\": 0.05,\n"
"@ \"maximum_fpr\": 0.05,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 1,\n"
"@ \"sketch_bits\": 12,\n"
"@ \"tmax\": 64,\n"
Expand Down Expand Up @@ -292,7 +293,8 @@ TEST(execute_test, many_ubs)
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 96,\n"
"@ \"number_of_hash_functions\": 2,\n"
"@ \"maximum_false_positive_rate\": 0.05,\n"
"@ \"maximum_fpr\": 0.05,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 1,\n"
"@ \"sketch_bits\": 12,\n"
"@ \"tmax\": 64,\n"
Expand All @@ -303,7 +305,7 @@ TEST(execute_test, many_ubs)
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:26\n"
"#TOP_LEVEL_IBF fullest_technical_bin_idx:48\n"
"#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_1 fullest_technical_bin_idx:14\n"
"#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:14\n"
Expand Down
Loading

0 comments on commit 7823287

Please sign in to comment.