Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MISC] Small refactoring #153

Merged
merged 2 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/test/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ add_definitions (-DAPPNAME=\"${PROJECT_NAME}\")
# Add the test interface library.
if (NOT TARGET ${PROJECT_NAME}_test)
add_library (${PROJECT_NAME}_test INTERFACE)
target_compile_options (${PROJECT_NAME}_lib PUBLIC "-pedantic" "-Wall" "-Wextra" "-Werror")
target_compile_options (${PROJECT_NAME}_lib PUBLIC "-pedantic" "-Wall" "-Wextra")

if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# Disable bogus warnings in GCC12.
Expand Down
15 changes: 15 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,18 @@ target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include)
add_executable ("${PROJECT_NAME}" main.cpp)
target_link_libraries ("${PROJECT_NAME}" PRIVATE "${PROJECT_NAME}_lib")
set_target_properties ("${PROJECT_NAME}" PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

# https://cmake.org/cmake/help/latest/module/CMakeDependentOption.html
# We usually want "-Werror" in the CI, but not when developing locally.
# Virtually all CI providers set the environment variable `CI` to some value.
# If `CI` is defined in the environment, we use `-Werror`, otherwise not.
# Caveat: If `CI` is not set, `-Dneedle_WITH_WERROR=ON` has no effect.
# The other way around works. If `CI` is set, `-Dneedle_WITH_WERROR=OFF` is honored.
# Solutions: * `export CI=1` (any value after `=` works, the variable only needs to be not empty)
# * `-DCMAKE_CXX_FLAGS="-Werror"`
include (CMakeDependentOption)
cmake_dependent_option (${PROJECT_NAME}_WITH_WERROR "Report compiler warnings as errors." ON "DEFINED ENV{CI}" OFF)

if (${PROJECT_NAME}_WITH_WERROR)
target_compile_options (${PROJECT_NAME}_lib PUBLIC "-Werror")
endif ()
72 changes: 40 additions & 32 deletions src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,76 +29,84 @@ template <class IBFType, bool last_exp, bool normalization, typename exp_t>
void check_ibf(min_arguments const & args,
IBFType const & ibf,
std::vector<uint16_t> & estimations_i,
seqan3::dna4_vector const seq,
seqan3::dna4_vector const & seq,
std::vector<uint32_t> & prev_counts,
exp_t const & expressions,
uint16_t const k,
uint16_t const level,
std::vector<double> const fprs,
std::vector<int> & deleted)
{
// Check, if one expression threshold for all or individual thresholds
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;

// Count minimisers in ibf of current level
std::vector<uint32_t> counter;
counter.assign(ibf.bin_count(), 0);
uint64_t minimiser_length = 0;
std::vector<uint32_t> counter(ibf.bin_count());
uint64_t minimiser_count{};
auto agent = ibf.membership_agent();
for (auto minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
{
auto agent = ibf.membership_agent();
std::transform(counter.begin(),
counter.end(),
agent.bulk_contains(minHash).begin(),
counter.begin(),
std::plus<int>());
++minimiser_length;
std::ranges::transform(counter, agent.bulk_contains(minHash), counter.begin(), std::plus<uint32_t>());
++minimiser_count;
}

// Defines, where the median should be
float minimiser_pos = minimiser_length / 2.0;
double const minimiser_pos = minimiser_count / 2.0;

// Check every experiment by going over the number of bins in the ibf.
for (size_t j = 0; j < counter.size(); j++)
for (size_t bin = 0; bin < counter.size(); ++bin)
{
if (std::find(deleted.begin(), deleted.end(), j) != deleted.end())
if (std::ranges::find(deleted, bin) != deleted.end())
continue;
// Correction by substracting the expected number of false positives
counter[j] = std::max((double)0.0, (double)((counter[j] - (minimiser_length * fprs[j])) / (1.0 - fprs[j])));
// Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater
auto const expected_false_positives = minimiser_count * fprs[bin];
auto const corrected_count = counter[bin] - expected_false_positives;
auto const normalized_count = corrected_count / (1.0 - fprs[bin]);

counter[bin] = std::max<double>(0.0, normalized_count);
// Check if considering previously seen minimisers and minimisers found at current level equal to or are greater
// than the minimiser_pow, which gives the median position.
// If estimation took already place (estimations_i[j]!=0), a second estimation is not performed.
if (((prev_counts[j] + counter[j]) >= minimiser_pos) && (estimations_i[j] == 0))
// If an estimation took already place (estimations_i[bin]!=0), a second estimation is not performed.
if (estimations_i[bin] == 0 && prev_counts[bin] + counter[bin] >= minimiser_pos)
{
// If there was no previous level, because we are looking at the last level.
if constexpr (last_exp)
{
if constexpr (multiple_expressions)
estimations_i[j] = expressions[k][j];
estimations_i[bin] = expressions[level][bin];
else
estimations_i[j] = expressions;
estimations_i[bin] = expressions;
}
else
{
// Actually calculate estimation, in the else case k stands for the prev_expression
auto const normalized_minimiser_pos = std::abs(minimiser_pos - prev_counts[bin]) / counter[bin];

// Actually calculate estimation, in the else case level stands for the prev_expression
if constexpr (multiple_expressions)
estimations_i[j] = std::max(expressions[k][j] * 1.0,
expressions[k + 1][j]
- ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0))
* (expressions[k + 1][j] - expressions[k][j])));
{
auto const prev_level_expression = expressions[level + 1][bin];
auto const expression_difference = prev_level_expression - expressions[level][bin];
auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);

estimations_i[bin] = std::max<double>(expressions[level][bin], estimate);
}
else
estimations_i[j] =
std::max(expressions * 1.0,
k - ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0)) * (k - expressions)));
{
auto const prev_level_expression = level;
auto const expression_difference = prev_level_expression - expressions;
auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);

estimations_i[bin] = std::max<double>(expressions, estimate);
}
}

// Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
// Perform normalization by dividing through the threshold of the first level. Only works if multiple expressions were used.
if constexpr (normalization && multiple_expressions)
estimations_i[j] = estimations_i[j] / expressions[1][j];
estimations_i[bin] /= expressions[1][bin];
}
else
{
// If not found at this level, add to previous count.
prev_counts[j] = prev_counts[j] + counter[j];
prev_counts[bin] += counter[bin];
}
}
}
Expand Down
44 changes: 31 additions & 13 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,31 +116,48 @@
bool const only_include = false,
uint8_t cutoff = 0)
{
// Would result in no minimisers being added to hash_table.
bool const containment_check_in_empty_include_set = only_include && include_set_table.empty();
if (containment_check_in_empty_include_set)
return;

Check warning on line 122 in src/ibf.cpp

View check run for this annotation

Codecov / codecov/patch

src/ibf.cpp#L122

Added line #L122 was not covered by tests

// Would result in no filtering.
bool const exclusion_check_in_empty_exclude_set = !only_include && exclude_set_table.empty();

auto check_value_in_tables = [&](auto && minHash)
{
return (only_include && include_set_table.contains(minHash))
|| (!only_include && !exclude_set_table.contains(minHash));
};

for (auto & [seq] : fin)
{
for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
{
if ((only_include && include_set_table.contains(minHash))
|| (!only_include && !exclude_set_table.contains(minHash)))
if (exclusion_check_in_empty_exclude_set || check_value_in_tables(minHash))
{
// Note that hash_table[minHash] would insert minHash into the map if it does not yet exist.
auto it = hash_table.find(minHash);
// If minHash is already in hash table, increase count in hash table

// If minHash is already in hash table, increase count in hash table.
if (it != hash_table.end())
{
it->second = std::min<uint16_t>(65534u, hash_table[minHash] + 1);
}
// If minHash equals now the cutoff than add it to the hash table and add plus one for the current
// iteration.
else if (cutoff_table[minHash] == cutoff)
{
hash_table[minHash] = cutoff_table[minHash] + 1;
cutoff_table.erase(minHash);
// Prevent overflow. 65535 is the maximum value for uint16_t.
it->second = std::min<uint16_t>(65534u, it->second + 1);
}
// If there is no cutoff, add value to hash_table and set count to 1.
else if (cutoff == 0)
{
hash_table[minHash]++;
}
// If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
// If minHash equals the cutoff, add it to the hash table and add plus one for the current
// iteration.
else if (uint8_t const value = cutoff_table[minHash]; value == cutoff)
{
hash_table[minHash] = value + 1u;
cutoff_table.erase(minHash);
}
// If none of the above, increase count in cutoff table. Cutoff Table reduces RAM usage by storing
// minimisers with a low occurence in a smaller hash table.
else
{
Expand Down Expand Up @@ -792,7 +809,8 @@
minimiser_files[file_iterator].extension());
filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i]
* (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
filesize = filesize / ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
filesize /= ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
// ^^ why divide? --> estimate the number of minimisers
}
// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
// all files.
Expand Down