seqan · eseiler · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/cmake/test/config.cmake b/cmake/test/config.cmake
@@ -19,7 +19,7 @@ add_definitions (-DAPPNAME=\"${PROJECT_NAME}\")
 # Add the test interface library.
 if (NOT TARGET ${PROJECT_NAME}_test)
     add_library (${PROJECT_NAME}_test INTERFACE)
-    target_compile_options (${PROJECT_NAME}_lib PUBLIC "-pedantic" "-Wall" "-Wextra" "-Werror")
+    target_compile_options (${PROJECT_NAME}_lib PUBLIC "-pedantic" "-Wall" "-Wextra")
 
     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
         # Disable bogus warnings in GCC12.

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -14,3 +14,18 @@ target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include)
 add_executable ("${PROJECT_NAME}" main.cpp)
 target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib")
 set_target_properties ("${PROJECT_NAME}" PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+
+# https://cmake.org/cmake/help/latest/module/CMakeDependentOption.html
+# We usually want "-Werror" in the CI, but not when developing locally.
+# Virtually all CI providers set the environment variable `CI` to some value.
+# If `CI` is defined in the environment, we use `-Werror`, otherwise not.
+# Caveat: If `CI` is not set, `-Dneedle_WITH_WERROR=ON` has no effect.
+#         The other way around works. If `CI` is set, `-Dneedle_WITH_WERROR=OFF` is honored.
+# Solutions: * `export CI=1` (any value after `=` works, the variable only needs to be not empty)
+#            * `-DCMAKE_CXX_FLAGS="-Werror"`
+include (CMakeDependentOption)
+cmake_dependent_option (${PROJECT_NAME}_WITH_WERROR "Report compiler warnings as errors." ON "DEFINED ENV{CI}" OFF)
+
+if (${PROJECT_NAME}_WITH_WERROR)
+    target_compile_options (${PROJECT_NAME}_lib PUBLIC "-Werror")
+endif ()
diff --git a/src/estimate.cpp b/src/estimate.cpp
@@ -29,76 +29,84 @@ template <class IBFType, bool last_exp, bool normalization, typename exp_t>
 void check_ibf(min_arguments const & args,
                IBFType const & ibf,
                std::vector<uint16_t> & estimations_i,
-               seqan3::dna4_vector const seq,
+               seqan3::dna4_vector const & seq,
                std::vector<uint32_t> & prev_counts,
                exp_t const & expressions,
-               uint16_t const k,
+               uint16_t const level,
                std::vector<double> const fprs,
                std::vector<int> & deleted)
 {
     // Check, if one expression threshold for all or individual thresholds
     static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;
 
     // Count minimisers in ibf of current level
-    std::vector<uint32_t> counter;
-    counter.assign(ibf.bin_count(), 0);
-    uint64_t minimiser_length = 0;
+    std::vector<uint32_t> counter(ibf.bin_count());
+    uint64_t minimiser_count{};
+    auto agent = ibf.membership_agent();
     for (auto minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
     {
-        auto agent = ibf.membership_agent();
-        std::transform(counter.begin(),
-                       counter.end(),
-                       agent.bulk_contains(minHash).begin(),
-                       counter.begin(),
-                       std::plus<int>());
-        ++minimiser_length;
+        std::ranges::transform(counter, agent.bulk_contains(minHash), counter.begin(), std::plus<uint32_t>());
+        ++minimiser_count;
     }
 
     // Defines, where the median should be
-    float minimiser_pos = minimiser_length / 2.0;
+    double const minimiser_pos = minimiser_count / 2.0;
 
     // Check every experiment by going over the number of bins in the ibf.
-    for (size_t j = 0; j < counter.size(); j++)
+    for (size_t bin = 0; bin < counter.size(); ++bin)
     {
-        if (std::find(deleted.begin(), deleted.end(), j) != deleted.end())
+        if (std::ranges::find(deleted, bin) != deleted.end())
             continue;
         // Correction by substracting the expected number of false positives
-        counter[j] = std::max((double)0.0, (double)((counter[j] - (minimiser_length * fprs[j])) / (1.0 - fprs[j])));
-        // Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater
+        auto const expected_false_positives = minimiser_count * fprs[bin];
+        auto const corrected_count = counter[bin] - expected_false_positives;
+        auto const normalized_count = corrected_count / (1.0 - fprs[bin]);
+
+        counter[bin] = std::max<double>(0.0, normalized_count);
+        // Check if considering previously seen minimisers and minimisers found at current level equal to or are greater
         // than the minimiser_pow, which gives the median position.
-        // If ań estimation took already place (estimations_i[j]!=0), a second estimation is not performed.
-        if (((prev_counts[j] + counter[j]) >= minimiser_pos) && (estimations_i[j] == 0))
+        // If an estimation took already place (estimations_i[bin]!=0), a second estimation is not performed.
+        if (estimations_i[bin] == 0 && prev_counts[bin] + counter[bin] >= minimiser_pos)
         {
             // If there was no previous level, because we are looking at the last level.
             if constexpr (last_exp)
             {
                 if constexpr (multiple_expressions)
-                    estimations_i[j] = expressions[k][j];
+                    estimations_i[bin] = expressions[level][bin];
                 else
-                    estimations_i[j] = expressions;
+                    estimations_i[bin] = expressions;
             }
             else
             {
-                // Actually calculate estimation, in the else case k stands for the prev_expression
+                auto const normalized_minimiser_pos = std::abs(minimiser_pos - prev_counts[bin]) / counter[bin];
+
+                // Actually calculate estimation, in the else case level stands for the prev_expression
                 if constexpr (multiple_expressions)
-                    estimations_i[j] = std::max(expressions[k][j] * 1.0,
-                                                expressions[k + 1][j]
-                                                    - ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0))
-                                                       * (expressions[k + 1][j] - expressions[k][j])));
+                {
+                    auto const prev_level_expression = expressions[level + 1][bin];
+                    auto const expression_difference = prev_level_expression - expressions[level][bin];
+                    auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);
+
+                    estimations_i[bin] = std::max<double>(expressions[level][bin], estimate);
+                }
                 else
-                    estimations_i[j] =
-                        std::max(expressions * 1.0,
-                                 k - ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0)) * (k - expressions)));
+                {
+                    auto const prev_level_expression = level;
+                    auto const expression_difference = prev_level_expression - expressions;
+                    auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);
+
+                    estimations_i[bin] = std::max<double>(expressions, estimate);
+                }
             }
 
-            // Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
+            // Perform normalization by dividing through the threshold of the first level. Only works if multiple expressions were used.
             if constexpr (normalization && multiple_expressions)
-                estimations_i[j] = estimations_i[j] / expressions[1][j];
+                estimations_i[bin] /= expressions[1][bin];
         }
         else
         {
             // If not found at this level, add to previous count.
-            prev_counts[j] = prev_counts[j] + counter[j];
+            prev_counts[bin] += counter[bin];
         }
     }
 }

diff --git a/src/ibf.cpp b/src/ibf.cpp
@@ -116,31 +116,48 @@
                      bool const only_include = false,
                      uint8_t cutoff = 0)
 {
+    // Would result in no minimisers being added to hash_table.
+    bool const containment_check_in_empty_include_set = only_include && include_set_table.empty();
+    if (containment_check_in_empty_include_set)
+        return;
+
+    // Would result in no filtering.
+    bool const exclusion_check_in_empty_exclude_set = !only_include && exclude_set_table.empty();
+
+    auto check_value_in_tables = [&](auto && minHash)
+    {
+        return (only_include && include_set_table.contains(minHash))
+            || (!only_include && !exclude_set_table.contains(minHash));
+    };
+
     for (auto & [seq] : fin)
     {
         for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
         {
-            if ((only_include && include_set_table.contains(minHash))
-                || (!only_include && !exclude_set_table.contains(minHash)))
+            if (exclusion_check_in_empty_exclude_set || check_value_in_tables(minHash))
             {
+                // Note that hash_table[minHash] would insert minHash into the map if it does not yet exist.
                 auto it = hash_table.find(minHash);
-                // If minHash is already in hash table, increase count in hash table
+
+                // If minHash is already in hash table, increase count in hash table.
                 if (it != hash_table.end())
                 {
-                    it->second = std::min<uint16_t>(65534u, hash_table[minHash] + 1);
-                }
-                // If minHash equals now the cutoff than add it to the hash table and add plus one for the current
-                // iteration.
-                else if (cutoff_table[minHash] == cutoff)
-                {
-                    hash_table[minHash] = cutoff_table[minHash] + 1;
-                    cutoff_table.erase(minHash);
+                    // Prevent overflow. 65535 is the maximum value for uint16_t.
+                    it->second = std::min<uint16_t>(65534u, it->second + 1);
                 }
+                // If there is no cutoff, add value to hash_table and set count to 1.
                 else if (cutoff == 0)
                 {
                     hash_table[minHash]++;
                 }
-                // If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
+                // If minHash equals the cutoff, add it to the hash table and add plus one for the current
+                // iteration.
+                else if (uint8_t const value = cutoff_table[minHash]; value == cutoff)
+                {
+                    hash_table[minHash] = value + 1u;
+                    cutoff_table.erase(minHash);
+                }
+                // If none of the above, increase count in cutoff table. Cutoff Table reduces RAM usage by storing
                 // minimisers with a low occurence in a smaller hash table.
                 else
                 {
@@ -792,7 +809,8 @@
                                                                          minimiser_files[file_iterator].extension());
             filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i]
                      * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
-            filesize = filesize / ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
+            filesize /= ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
+            // ^^ why divide? --> estimate the number of minimisers
         }
         // If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
         // all files.