From 343351059558d27a7f22d9f6d7ef798033a052f8 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 11 Oct 2023 16:00:19 +0200 Subject: [PATCH 01/15] bm25 only code --- cli/src/FuzzyMatch-cli.cc | 14 ++-- include/fuzzy/index.hh | 4 +- include/fuzzy/index.hxx | 21 ++++++ src/CMakeLists.txt | 2 + src/bm25_matches.cc | 2 +- src/filter.cc | 1 + src/fuzzy_match.cc | 140 ++++++++++++++++++++++++++------------ src/index.cc | 2 + 8 files changed, 136 insertions(+), 50 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 909555d..b472c36 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -129,10 +129,10 @@ std::pair process_stream(const Function& function, if (!res.empty()) count_nonempty++; out << res << std::endl; - if (count_nonempty % 100 == 0) - std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; + // if (count_nonempty % 100 == 0) + // std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; } - std::cerr << std::endl; + // std::cerr << std::endl; return std::make_pair(count_nonempty, count_total); } @@ -163,8 +163,8 @@ std::pair process_stream(const Function& function, count_nonempty++; out << res << std::endl; futures.pop(); - if (count_nonempty % 100 == 0) - std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; + // if (count_nonempty % 100 == 0) + // std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; } }; @@ -187,7 +187,7 @@ std::pair process_stream(const Function& function, if (!futures.empty()) pop_results(/*blocking=*/true); - std::cerr << std::endl; + // std::cerr << std::endl; { std::lock_guard lock(mutex); @@ -437,6 +437,8 @@ int main(int argc, char** argv) fuzzy::IndexType filter_type; if (filter_type_str == "bm25") filter_type = fuzzy::IndexType::BM25; + else if (filter_type_str == "no") + filter_type = fuzzy::IndexType::NO; else filter_type = fuzzy::IndexType::SUFFIX; #ifdef NO_EIGEN diff --git a/include/fuzzy/index.hh b/include/fuzzy/index.hh index 9bad0d6..3df24d6 100644 --- a/include/fuzzy/index.hh +++ b/include/fuzzy/index.hh @@ -12,13 +12,14 @@ #ifdef USE_EIGEN #include #endif +#include #include #include namespace fuzzy { constexpr size_t DEFAULT_MAX_TOKENS_IN_PATTERN = 300; // if you change this value, update README.md - enum class IndexType { SUFFIX, BM25 }; + enum class IndexType { SUFFIX, BM25, NO }; class FilterIndex { public: @@ -62,6 +63,7 @@ namespace fuzzy #ifdef USE_EIGEN inline std::shared_ptr createBM25(const FilterIndexParams ¶ms = FilterIndexParams()) { return std::make_shared(params); } #endif + inline std::shared_ptr createNo() { return std::make_shared(); } std::vector _ids; std::vector _real_tokens; size_t _max_tokens_in_pattern; diff --git a/include/fuzzy/index.hxx b/include/fuzzy/index.hxx index 0156857..ef533d8 100644 --- a/include/fuzzy/index.hxx +++ b/include/fuzzy/index.hxx @@ -64,6 +64,17 @@ namespace fuzzy & _max_tokens_in_pattern; } #endif + else if (_type == IndexType::NO) + { + NoFilter& no_filter = static_cast(*_filter); + ar + & _type + & _vocabIndexer + & no_filter + & _ids + & _real_tokens + & _max_tokens_in_pattern; + } } template @@ -94,6 +105,16 @@ namespace fuzzy & _real_tokens; } #endif + else if (_type == IndexType::NO) + { + _filter = createNo(); + NoFilter& no_filter = static_cast(*_filter); + ar + & _vocabIndexer + & no_filter + & _ids + & _real_tokens; + } if (version >= 1) ar & _max_tokens_in_pattern; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8e140f3..6e6c44c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,6 +12,8 @@ set(FUZZY_SOURCES pattern_coverage.cc filter.cc index.cc + no_filter.cc + no_matches.cc ) if(MSVC) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc index 0881bcc..a10fb2a 100644 --- a/src/bm25_matches.cc +++ b/src/bm25_matches.cc @@ -44,7 +44,7 @@ namespace fuzzy _best_matches.reserve(k_best.size()); while (!k_best.empty()) { - _best_matches.push_back({k_best.top().second, 0}); + _best_matches.push_back({k_best.top().second, (int)(k_best.top().first * 1000)}); k_best.pop(); } std::reverse(_best_matches.begin(), _best_matches.end()); diff --git a/src/filter.cc b/src/filter.cc index 39537c8..94f89c2 100644 --- a/src/filter.cc +++ b/src/filter.cc @@ -10,6 +10,7 @@ namespace fuzzy Filter::add_sentence(const std::vector& sentence) { size_t sidx = _sentence_pos.size(); + std::cerr << sidx << std::endl; _sentence_pos.push_back(_sentence_buffer.size()); /* first token in sentence buffer is the sentence size */ diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 9640250..83629ad 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -14,6 +14,8 @@ #include #include #endif +#include +#include #include #include #include @@ -540,7 +542,7 @@ namespace fuzzy if (range_suffixid.first != range_suffixid.second) { /* do not register unigrams - yet */ - if (subseq_length > 2) + if (subseq_length >= 2) { /* register (n-1) grams */ nGramMatches->register_suffix_range_match(previous_range_suffixid.first, @@ -561,13 +563,12 @@ namespace fuzzy break; } } - if (subseq_length >= 2) + if (subseq_length >= 1) nGramMatches->register_suffix_range_match(previous_range_suffixid.first, previous_range_suffixid.second, subseq_length, edit_costs); } - // filter_matches = &nGramMatches; } #ifdef USE_EIGEN else if (filter_type == IndexType::BM25) @@ -579,6 +580,13 @@ namespace fuzzy bm25Matches.register_pattern(pattern_wids, edit_costs); } #endif + else if (filter_type == IndexType::NO) + { + const NoFilter& no_filter = static_cast(filter); + filter_matches = std::make_shared(fuzzy, p_length, min_subseq_length, no_filter); + NoMatches& no_matches = static_cast(*filter_matches); + no_matches.load_all(); + } /* Consolidation of the results */ /* now explore for the best segments */ @@ -597,51 +605,99 @@ namespace fuzzy lowest_costs.push(std::numeric_limits::max()); unsigned cpt = 0; + // unsigned num_filtered = 0; + + // ONLY N-grams + // for (const auto& pair : filter_matches->get_best_matches()) + // { + // const auto s_id = pair.first; + // const auto longest_match = pair.second; + // size_t s_length = 0; + // const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); + // Match m(sentence_wids, s_length); + // m.score = (float)longest_match / (float)s_length; + // m.max_subseq = longest_match; + // m.s_id = s_id; + // m.id = _filterIndex->id(s_id); + // m.secondary_sort = s_id; + // m.penalty = 0; + // result.push(m); + // cpt++; + // if (cpt > contrast_buffer) + // break; + // } + + // ONLY BM25 for (const auto& pair : filter_matches->get_best_matches()) { const auto s_id = pair.first; - const auto longest_match = pair.second; + const auto bm25_score = pair.second; size_t s_length = 0; const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - const auto num_covered_words = (longest_match < p_length - ? pattern_coverage.count_covered_words(sentence_wids, s_length) - : p_length); - /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ - if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) - { - const Costs costs(p_length, s_length, edit_costs); - - /* let us check the candidates */ - const auto sentence_realtok = _filterIndex->real_tokens(s_id); - const auto cost_upper_bound = lowest_costs.top(); - float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, - pattern_wids.data(), pattern_realtok, p_length, - st, sn, - idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - edit_costs, - costs, cost_upper_bound); - if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - continue; - - float score = int(10000-cost*100)/10000.0; - - - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); - if (score >= fuzzy) { - Match m(sentence_wids, s_length); - m.score = score; - m.max_subseq = longest_match; - m.s_id = s_id; - m.id = _filterIndex->id(s_id); - m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; - m.penalty = 0; - result.push(m); - cpt++; - } - } + Match m(sentence_wids, s_length); + m.score = (float)bm25_score / (float)1000.; + m.max_subseq = 0; + m.s_id = s_id; + m.id = _filterIndex->id(s_id); + m.secondary_sort = s_id; + m.penalty = 0; + result.push(m); + cpt++; + if (cpt > contrast_buffer) + break; } + + + // for (const auto& pair : filter_matches->get_best_matches()) + // { + // // num_filtered++; + // const auto s_id = pair.first; + // const auto longest_match = pair.second; + // size_t s_length = 0; + // const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); + // const auto num_covered_words = (longest_match < p_length + // ? pattern_coverage.count_covered_words(sentence_wids, s_length) + // : p_length); + // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + // // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + // // { + // const Costs costs(p_length, s_length, edit_costs); + + // /* let us check the candidates */ + // const auto sentence_realtok = _filterIndex->real_tokens(s_id); + // const auto cost_upper_bound = lowest_costs.top(); + // float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, + // pattern_wids.data(), pattern_realtok, p_length, + // st, sn, + // idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + // edit_costs, + // costs, cost_upper_bound); + // // float cost = 0.1; + // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + // continue; + + // float score = int(10000-cost*100)/10000.0; + + + // lowest_costs.push(cost); + // if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + // lowest_costs.pop(); + // if (score >= fuzzy) { + // Match m(sentence_wids, s_length); + // m.score = score; + // m.max_subseq = longest_match; + // m.s_id = s_id; + // m.id = _filterIndex->id(s_id); + // m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; + // m.penalty = 0; + // result.push(m); + // cpt++; + // } + // // } + // } + // COUT filter + // std::cerr << num_filtered << std::endl; + // std::cerr << filter_matches->get_best_matches().size() << std::endl; // delete filter_matches; /* Contrastive reranking */ if (contrastive_factor > 0) diff --git a/src/index.cc b/src/index.cc index c994e88..ec4d256 100644 --- a/src/index.cc +++ b/src/index.cc @@ -13,6 +13,8 @@ namespace fuzzy #endif if (_type == IndexType::SUFFIX) _filter = createSuffixArray(); + else if (_type == IndexType::NO) + _filter = createNo(); } int From b16626d232748712af63432a162d6fe9c4f1e94d Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Tue, 2 Jan 2024 15:58:08 +0100 Subject: [PATCH 02/15] no_filter --- include/fuzzy/no_filter.hh | 62 +++++++++++++++++++++++++++++++++++++ include/fuzzy/no_filter.hxx | 30 ++++++++++++++++++ include/fuzzy/no_matches.hh | 27 ++++++++++++++++ src/no_filter.cc | 30 ++++++++++++++++++ src/no_matches.cc | 30 ++++++++++++++++++ 5 files changed, 179 insertions(+) create mode 100644 include/fuzzy/no_filter.hh create mode 100644 include/fuzzy/no_filter.hxx create mode 100644 include/fuzzy/no_matches.hh create mode 100644 src/no_filter.cc create mode 100644 src/no_matches.cc diff --git a/include/fuzzy/no_filter.hh b/include/fuzzy/no_filter.hh new file mode 100644 index 0000000..afcdc76 --- /dev/null +++ b/include/fuzzy/no_filter.hh @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fuzzy +{ + class NoFilter : public Filter + { + public: + NoFilter(const FilterIndexParams ¶ms=FilterIndexParams()); + ~NoFilter(); + // unsigned add_sentence(const std::vector& sentence) override; + using Filter::add_sentence; + + using Filter::dump; + using Filter::num_sentences; + using Filter::get_sentence; + + void prepare(size_t vocab_size); + + std::ostream& dump(std::ostream&) const; + + unsigned get_sentence_length(size_t s_id) const; + + private: + friend class boost::serialization::access; + + template + void save(Archive&, unsigned int version) const; + + template + void load(Archive&, unsigned int version); + + BOOST_SERIALIZATION_SPLIT_MEMBER() + }; +} + +BOOST_CLASS_VERSION(fuzzy::NoFilter, 1) + +#include "fuzzy/no_filter.hxx" \ No newline at end of file diff --git a/include/fuzzy/no_filter.hxx b/include/fuzzy/no_filter.hxx new file mode 100644 index 0000000..5c9e68a --- /dev/null +++ b/include/fuzzy/no_filter.hxx @@ -0,0 +1,30 @@ +#include + +namespace fuzzy +{ + inline unsigned + NoFilter::get_sentence_length(size_t s_id) const + { + if (s_id + 1 == _sentence_pos.size()) + return _sentence_buffer.size() - _sentence_pos[s_id] - 2; + return _sentence_pos[s_id + 1] - _sentence_pos[s_id] - 2; + } + + template + void NoFilter::save(Archive& archive, unsigned int) const + { + archive + & _sentence_buffer + & _sentence_pos + & _quickVocabAccess; + } + + template + void NoFilter::load(Archive& archive, unsigned int) + { + archive + & _sentence_buffer + & _sentence_pos + & _quickVocabAccess; + } +} diff --git a/include/fuzzy/no_matches.hh b/include/fuzzy/no_matches.hh new file mode 100644 index 0000000..a94d564 --- /dev/null +++ b/include/fuzzy/no_matches.hh @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +namespace fuzzy +{ + class NoMatches : public FilterMatches + { + public: + using FilterMatches::FilterMatches; + NoMatches(float fuzzy, + unsigned p_length, + unsigned min_seq_len, + const NoFilter &); + + // Registers a match for this range of suffixes. + using FilterMatches::theoretical_rejection; + using FilterMatches::theoretical_rejection_cover; + + void load_all(); + std::vector> get_best_matches() const override; + + private: + std::vector> _all_matches; + }; +} diff --git a/src/no_filter.cc b/src/no_filter.cc new file mode 100644 index 0000000..a4ed3c4 --- /dev/null +++ b/src/no_filter.cc @@ -0,0 +1,30 @@ +#include + +#include +#include +#include + +namespace fuzzy +{ + NoFilter::NoFilter(const FilterIndexParams& params) + {} + NoFilter::~NoFilter() {} + // unsigned + // NoFilter::add_sentence(const std::vector& sentence) + // { + // size_t sidx = _sentence_pos.size(); + // _sentence_pos.push_back(_sentence_buffer.size()); + + // /* first token in sentence buffer is the sentence size */ + // _sentence_buffer.push_back(sentence.size()); + + // for (size_t i = 0; i < sentence.size(); i++) + // { + // _sentence_buffer.push_back(sentence[i]); + // } + // _sentence_buffer.push_back(fuzzy::VocabIndexer::SENTENCE_SEPARATOR); + // return sidx; + // } + + void NoFilter::prepare(size_t vocab_size) {} +} diff --git a/src/no_matches.cc b/src/no_matches.cc new file mode 100644 index 0000000..277aa01 --- /dev/null +++ b/src/no_matches.cc @@ -0,0 +1,30 @@ +#include + +#include + +namespace fuzzy +{ + NoMatches::NoMatches(float fuzzy, + unsigned p_length, + unsigned min_seq_len, + const NoFilter &no_filter) + /* add a small epsilon to avoid rounding errors counting for an error */ + : FilterMatches(fuzzy, p_length, min_seq_len, no_filter) + { + } + + std::vector> + NoMatches::get_best_matches() const + { + return _all_matches; + } + + void NoMatches::load_all() + { + _all_matches = std::vector>(_filter.num_sentences()); + size_t *length; + + for (unsigned i = 0; i < _filter.num_sentences(); i++) + _all_matches[i] = {i, 0}; + } +} From ebe5dc3cfce2e4ba8bde57d34a0b210e93b62a55 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Tue, 2 Jan 2024 16:06:26 +0100 Subject: [PATCH 03/15] starting functional point --- src/fuzzy_match.cc | 122 ++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 83629ad..a708089 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -628,73 +628,73 @@ namespace fuzzy // } // ONLY BM25 - for (const auto& pair : filter_matches->get_best_matches()) - { - const auto s_id = pair.first; - const auto bm25_score = pair.second; - size_t s_length = 0; - const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - Match m(sentence_wids, s_length); - m.score = (float)bm25_score / (float)1000.; - m.max_subseq = 0; - m.s_id = s_id; - m.id = _filterIndex->id(s_id); - m.secondary_sort = s_id; - m.penalty = 0; - result.push(m); - cpt++; - if (cpt > contrast_buffer) - break; - } - - // for (const auto& pair : filter_matches->get_best_matches()) // { - // // num_filtered++; // const auto s_id = pair.first; - // const auto longest_match = pair.second; + // const auto bm25_score = pair.second; // size_t s_length = 0; // const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - // const auto num_covered_words = (longest_match < p_length - // ? pattern_coverage.count_covered_words(sentence_wids, s_length) - // : p_length); - // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ - // // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) - // // { - // const Costs costs(p_length, s_length, edit_costs); - - // /* let us check the candidates */ - // const auto sentence_realtok = _filterIndex->real_tokens(s_id); - // const auto cost_upper_bound = lowest_costs.top(); - // float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, - // pattern_wids.data(), pattern_realtok, p_length, - // st, sn, - // idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - // edit_costs, - // costs, cost_upper_bound); - // // float cost = 0.1; - // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - // continue; - - // float score = int(10000-cost*100)/10000.0; - - - // lowest_costs.push(cost); - // if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - // lowest_costs.pop(); - // if (score >= fuzzy) { - // Match m(sentence_wids, s_length); - // m.score = score; - // m.max_subseq = longest_match; - // m.s_id = s_id; - // m.id = _filterIndex->id(s_id); - // m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; - // m.penalty = 0; - // result.push(m); - // cpt++; - // } - // // } + // Match m(sentence_wids, s_length); + // m.score = (float)bm25_score / (float)1000.; + // m.max_subseq = 0; + // m.s_id = s_id; + // m.id = _filterIndex->id(s_id); + // m.secondary_sort = s_id; + // m.penalty = 0; + // result.push(m); + // cpt++; + // if (cpt > contrast_buffer) + // break; // } + + + for (const auto& pair : filter_matches->get_best_matches()) + { + // num_filtered++; + const auto s_id = pair.first; + const auto longest_match = pair.second; + size_t s_length = 0; + const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); + const auto num_covered_words = (longest_match < p_length + ? pattern_coverage.count_covered_words(sentence_wids, s_length) + : p_length); + /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + // { + const Costs costs(p_length, s_length, edit_costs); + + /* let us check the candidates */ + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + const auto cost_upper_bound = lowest_costs.top(); + float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, + pattern_wids.data(), pattern_realtok, p_length, + st, sn, + idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + edit_costs, + costs, cost_upper_bound); + // float cost = 0.1; + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + + float score = int(10000-cost*100)/10000.0; + + + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); + if (score >= fuzzy) { + Match m(sentence_wids, s_length); + m.score = score; + m.max_subseq = longest_match; + m.s_id = s_id; + m.id = _filterIndex->id(s_id); + m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; + m.penalty = 0; + result.push(m); + cpt++; + } + // } + } // COUT filter // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; From 2dab6b831e88ee8228507686f5686faedbbf857c Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 10 Jan 2024 11:37:22 +0100 Subject: [PATCH 04/15] bow submodularity working --- cli/src/FuzzyMatch-cli.cc | 12 +- include/fuzzy/bm25.hh | 1 + include/fuzzy/bm25.hxx | 5 + include/fuzzy/bm25_matches.hh | 5 +- include/fuzzy/filter_matches.hh | 2 +- include/fuzzy/fuzzy_match.hh | 12 +- include/fuzzy/ngram_matches.hh | 2 +- include/fuzzy/no_matches.hh | 4 +- include/fuzzy/pattern_coverage.hh | 1 + src/bm25_matches.cc | 22 ++- src/fuzzy_match.cc | 233 ++++++++++++++++++++++++------ src/ngram_matches.cc | 6 +- src/no_matches.cc | 4 +- src/pattern_coverage.cc | 10 ++ 14 files changed, 253 insertions(+), 66 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index b472c36..30d8f18 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -210,7 +210,7 @@ class processor { std::string contrastive_reduce_str, int contrastive_buffer, fuzzy::IndexType filter_type, - int bm25_buffer, float bm25_cutoff, const fuzzy::FilterIndexParams& filter_index_params): + int bm25_buffer, float bm25_cutoff, float submodular_shrinking_factor, const fuzzy::FilterIndexParams& filter_index_params): _fuzzyMatcher(pt, max_tokens_in_pattern, filter_type, filter_index_params), _fuzzy(fuzzy), _contrastive_factor(contrastive_factor), @@ -224,7 +224,8 @@ class processor { _contrastive_buffer(contrastive_buffer), _filter_type(filter_type), _bm25_buffer(bm25_buffer), - _bm25_cutoff(bm25_cutoff) { + _bm25_cutoff(bm25_cutoff), + _submodular_shrinking_factor(submodular_shrinking_factor) { if (contrastive_reduce_str == "max") _contrastive_reduce = fuzzy::ContrastReduce::MAX; else @@ -236,7 +237,7 @@ class processor { _fuzzyMatcher.match(sentence, _fuzzy, _nmatch, _no_perfect, matches, _min_subseq_length, _min_subseq_ratio, _idf_penalty, _cost, _contrastive_factor, _contrastive_reduce, _contrastive_buffer, - _filter_type, _bm25_buffer, _bm25_cutoff); + _filter_type, _bm25_buffer, _bm25_cutoff, _submodular_shrinking_factor); std::string out; for(const fuzzy::FuzzyMatch::Match &m: matches) { @@ -294,6 +295,7 @@ class processor { fuzzy::IndexType _filter_type; int _bm25_buffer; float _bm25_cutoff; + float _submodular_shrinking_factor; }; int main(int argc, char** argv) @@ -326,6 +328,7 @@ int main(int argc, char** argv) float contrastive_factor; float bm25_cutoff; float bm25_ratio_idf; + float submodular_shrinking_factor; int nmatch; int nthreads; int min_subseq_length; @@ -365,6 +368,7 @@ int main(int argc, char** argv) ("bm25-ratio-idf", po::value(&bm25_ratio_idf)->default_value(0.5f), "filter in the reverse index to consider only terms rare enough (close to 0 = ignores a lot : close to 1 = considers a lot)") ("bm25-buffer", po::value(&bm25_buffer)->default_value(10), "number of best BM25 to rerank") ("bm25-cutoff", po::value(&bm25_cutoff)->default_value(0.f), "minimum BM25 score threshold cutoff") + ("submodular-shrinking-factor,lambda", po::value(&submodular_shrinking_factor)->default_value(1.f), "In submodularity coverage, weight shrinking factor of each covered salient aspect of the source") ("nthreads,N", po::value(&nthreads)->default_value(4), "number of thread to use for match") ; @@ -450,7 +454,7 @@ int main(int argc, char** argv) idf_penalty, subseq_idf_weighting, max_tokens_in_pattern, edit_cost, contrastive_reduce, contrastive_buffer, - filter_type, bm25_buffer, bm25_cutoff, filter_index_params); + filter_type, bm25_buffer, bm25_cutoff, submodular_shrinking_factor, filter_index_params); if (index_file.length()) { TICK("Loading index_file: "+index_file); diff --git a/include/fuzzy/bm25.hh b/include/fuzzy/bm25.hh index a62d9fd..6f0735c 100644 --- a/include/fuzzy/bm25.hh +++ b/include/fuzzy/bm25.hh @@ -61,6 +61,7 @@ namespace fuzzy inline int get_vocab_size() const { return _vocab_size; } Eigen::SparseVector compute_product(const Eigen::SparseVector& pattern_voc) const; + Eigen::SparseVector get_cover(const Eigen::SparseVector& pattern_voc, unsigned s_id) const; private: size_t _vocab_size; diff --git a/include/fuzzy/bm25.hxx b/include/fuzzy/bm25.hxx index ce0aec9..04c199e 100644 --- a/include/fuzzy/bm25.hxx +++ b/include/fuzzy/bm25.hxx @@ -45,6 +45,11 @@ namespace fuzzy { return _bm25_inverse_index * pattern_voc; } + inline Eigen::SparseVector BM25::get_cover(const Eigen::SparseVector& pattern_voc, unsigned s_id) const + { + Eigen::SparseVector row_s = _bm25_inverse_index.row(s_id); + return row_s.cwiseProduct(pattern_voc); + } template void BM25::save(Archive& archive, unsigned int) const { diff --git a/include/fuzzy/bm25_matches.hh b/include/fuzzy/bm25_matches.hh index d6d3c49..2e630f8 100644 --- a/include/fuzzy/bm25_matches.hh +++ b/include/fuzzy/bm25_matches.hh @@ -33,12 +33,13 @@ namespace fuzzy using FilterMatches::theoretical_rejection; using FilterMatches::theoretical_rejection_cover; - std::vector> get_best_matches() const override; + std::vector> get_best_matches() const override; + std::vector cover(const std::vector& unique_pattern_wids, const std::vector& count, unsigned s_id) const; private: // Num of sentences to place in the buffer const unsigned _buffer; const float _cutoff_threshold; - std::vector> _best_matches; + std::vector> _best_matches; }; } diff --git a/include/fuzzy/filter_matches.hh b/include/fuzzy/filter_matches.hh index a57880f..7cafa32 100644 --- a/include/fuzzy/filter_matches.hh +++ b/include/fuzzy/filter_matches.hh @@ -32,7 +32,7 @@ namespace fuzzy bool theoretical_rejection(size_t p_length, size_t s_length, const EditCosts& edit_costs) const; bool theoretical_rejection_cover(size_t p_length, size_t s_length, size_t cover, const EditCosts& edit_costs) const; - virtual std::vector> get_best_matches() const = 0; + virtual std::vector> get_best_matches() const = 0; float fuzzy_threshold; // unsigned max_differences_with_pattern; diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index 589e2e9..ac91b09 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -15,6 +15,8 @@ namespace onmt { namespace fuzzy { enum class ContrastReduce { MEAN, MAX }; + enum class SubmodularFunction { BOW, BM25, NGRAM, ED }; + enum class SubmodularNormalization { NO, IDF }; class FuzzyMatch { @@ -44,6 +46,7 @@ namespace fuzzy int max_subseq; unsigned s_id; std::string id; + std::vector cover; int length; const unsigned* s; }; @@ -74,7 +77,8 @@ namespace fuzzy int contrast_buffer=-1, IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, - float bm25_cutoff=0) const; + float bm25_cutoff=0, + float shrinking_factor=1.f) const; bool match(const Sentence& real, const Tokens& pattern, float fuzzy, @@ -90,7 +94,8 @@ namespace fuzzy int contrast_buffer=-1, IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, - float bm25_cutoff=0) const; + float bm25_cutoff=0, + float shrinking_factor=1.f) const; /* simplified, include tokenization */ bool match(const std::string &sentence, float fuzzy, @@ -106,7 +111,8 @@ namespace fuzzy int contrast_buffer=-1, IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, - float bm25_cutoff=0) const; + float bm25_cutoff=0, + float shrinking_factor=1.f) const; bool subsequence(const std::string &sentence, unsigned number_of_matches, bool no_perfect, diff --git a/include/fuzzy/ngram_matches.hh b/include/fuzzy/ngram_matches.hh index f42de9d..eee7b86 100644 --- a/include/fuzzy/ngram_matches.hh +++ b/include/fuzzy/ngram_matches.hh @@ -29,7 +29,7 @@ namespace fuzzy using FilterMatches::theoretical_rejection; using FilterMatches::theoretical_rejection_cover; - std::vector> get_best_matches() const override; + std::vector> get_best_matches() const override; private: LongestMatches _longest_matches; diff --git a/include/fuzzy/no_matches.hh b/include/fuzzy/no_matches.hh index a94d564..e410ec6 100644 --- a/include/fuzzy/no_matches.hh +++ b/include/fuzzy/no_matches.hh @@ -19,9 +19,9 @@ namespace fuzzy using FilterMatches::theoretical_rejection_cover; void load_all(); - std::vector> get_best_matches() const override; + std::vector> get_best_matches() const override; private: - std::vector> _all_matches; + std::vector> _all_matches; }; } diff --git a/include/fuzzy/pattern_coverage.hh b/include/fuzzy/pattern_coverage.hh index e2406b4..66a3203 100644 --- a/include/fuzzy/pattern_coverage.hh +++ b/include/fuzzy/pattern_coverage.hh @@ -19,4 +19,5 @@ namespace fuzzy std::unordered_map _words_count; }; + bool equal_arrays(const size_t s_len, const size_t p_len, const unsigned* s, const unsigned* p); } diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc index a10fb2a..3a2a0c0 100644 --- a/src/bm25_matches.cc +++ b/src/bm25_matches.cc @@ -50,9 +50,29 @@ namespace fuzzy std::reverse(_best_matches.begin(), _best_matches.end()); } - std::vector> + std::vector> BM25Matches::get_best_matches() const { return _best_matches; } + + std::vector + BM25Matches::cover(const std::vector& unique_pattern_wids, const std::vector& count, unsigned s_id) const + { + const BM25& bm25 = static_cast(_filter); + + Eigen::SparseVector pattern_sparse_vec(bm25.get_vocab_size()); + // for (const unsigned& wid : unique_pattern_wids) + // pattern_sparse_vec.coeffRef(wid) += 1.0; + for (unsigned i = 0; i < unique_pattern_wids.size(); i++) + pattern_sparse_vec.coeffRef(unique_pattern_wids[i]) = (float)count[i]; + + Eigen::SparseVector all_coverage = bm25.get_cover(pattern_sparse_vec, s_id); + + std::vector coverage(unique_pattern_wids.size(), 0.f); + for (int i = 0; i < coverage.size(); i++) + coverage[i] = all_coverage.coeff(unique_pattern_wids[i]); + + return coverage; + } } diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index a708089..a1d8368 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #include @@ -404,7 +406,8 @@ namespace fuzzy int contrast_buffer, IndexType filter_type, int bm25_buffer, - float bm25_cutoff) const { + float bm25_cutoff, + float shrinking_factor) const { Sentence real; Tokens norm; @@ -412,7 +415,8 @@ namespace fuzzy return match(real, norm, fuzzy, number_of_matches, no_perfect, matches, min_subseq_length, min_subseq_ratio, vocab_idf_penalty, edit_costs, contrastive_factor, reduce, contrast_buffer, - filter_type, bm25_buffer, bm25_cutoff); + filter_type, bm25_buffer, bm25_cutoff, + shrinking_factor); } /* backward compatibility */ @@ -430,13 +434,15 @@ namespace fuzzy int contrast_buffer, IndexType filter_type, int bm25_buffer, - float bm25_cutoff) const + float bm25_cutoff, + float shrinking_factor) const { const Sentence real(pattern); return match(real, pattern, fuzzy, number_of_matches, false, matches, min_subseq_length, min_subseq_ratio, vocab_idf_penalty, edit_costs, contrastive_factor, reduce, contrast_buffer, - filter_type, bm25_buffer, bm25_cutoff); + filter_type, bm25_buffer, bm25_cutoff, + shrinking_factor); } /* check for the pattern in the suffix-array index SAI */ @@ -456,7 +462,8 @@ namespace fuzzy int contrast_buffer, IndexType filter_type, int bm25_buffer, - float bm25_cutoff) const + float bm25_cutoff, + float shrinking_factor) const { size_t p_length = pattern.size(); if (contrast_buffer == -1) @@ -647,60 +654,180 @@ namespace fuzzy // break; // } - - for (const auto& pair : filter_matches->get_best_matches()) + // case S = x + + ///////////// BM25 (BOW) + assert((filter_type == IndexType::BM25)); + std::vector> best_matches = filter_matches->get_best_matches(); + BM25Matches& bm25Matches = static_cast(*filter_matches); + std::vector sorted_pattern_wids(pattern_wids); + std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); + std::vector sorted_pattern_terms; + std::vector count_terms; + sorted_pattern_terms.reserve(pattern_wids.size()); + count_terms.reserve(pattern_wids.size()); + if (sorted_pattern_wids.size() > 0) + { + unsigned current_term = sorted_pattern_wids[0]; + unsigned current_count = 1; + for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) + { + if (current_term != sorted_pattern_wids[i]) + { + sorted_pattern_terms.push_back(current_term); + count_terms.push_back(current_count); + current_term = sorted_pattern_wids[i]; + current_count = 1; + } + else + current_count++; + } + sorted_pattern_terms.push_back(current_term); + count_terms.push_back(current_count); + } + // std::cerr << "sorted unique terms" << ": "; + // for (const auto& c : sorted_pattern_terms) + // std::cerr << c << ", "; + // std::cerr << std::endl; + // std::cerr << "counts" << ": "; + // for (const auto& c : count_terms) + // std::cerr << c << ", "; + // std::cerr << std::endl; + ///////////// + + // std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush; + + for (const auto& pair : best_matches) { // num_filtered++; const auto s_id = pair.first; - const auto longest_match = pair.second; + const auto score_filter = pair.second; size_t s_length = 0; const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - const auto num_covered_words = (longest_match < p_length - ? pattern_coverage.count_covered_words(sentence_wids, s_length) - : p_length); - /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + // const auto num_covered_words = (score_filter < p_length + // ? pattern_coverage.count_covered_words(sentence_wids, s_length) + // : p_length); + // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); + // TODO: adapt to filter n-gram existence + + std::vector s_cover; + float score; + ///////////// ED + // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) // { - const Costs costs(p_length, s_length, edit_costs); - - /* let us check the candidates */ - const auto sentence_realtok = _filterIndex->real_tokens(s_id); - const auto cost_upper_bound = lowest_costs.top(); - float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, - pattern_wids.data(), pattern_realtok, p_length, - st, sn, - idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - edit_costs, - costs, cost_upper_bound); - // float cost = 0.1; - if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - continue; - - float score = int(10000-cost*100)/10000.0; - - - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); - if (score >= fuzzy) { - Match m(sentence_wids, s_length); - m.score = score; - m.max_subseq = longest_match; - m.s_id = s_id; - m.id = _filterIndex->id(s_id); - m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; - m.penalty = 0; - result.push(m); - cpt++; - } + // const Costs costs(p_length, s_length, edit_costs); + // /* let us check the candidates */ + // const auto sentence_realtok = _filterIndex->real_tokens(s_id); + // const auto cost_upper_bound = lowest_costs.top(); + // float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, + // pattern_wids.data(), pattern_realtok, p_length, + // st, sn, + // idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + // edit_costs, + // costs, cost_upper_bound); + // // float cost = 0.1; + // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + // continue; + // float score = int(10000 - cost * 100) / 10000.0; + + // lowest_costs.push(cost); + // if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + // lowest_costs.pop(); + // } + // else + // { + // continue; // } + ///////////// + + if ((no_perfect && equal_arrays(s_length, p_length, sentence_wids, pattern_wids.data()))) + continue; + + ///////////// BM25 + score = (float)score_filter / 1000.f; + s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); + // s_cover = std::vector(sorted_pattern_terms.size(), 1); + ///////////// + + ///////////// NGRAM + /* TODO */ + ///////////// + + if (score >= fuzzy) { + Match m(sentence_wids, s_length); + m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score; + m.max_subseq = (filter_type == IndexType::BM25) ? 0 : score_filter; + m.s_id = s_id; + m.id = _filterIndex->id(s_id); + m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; + m.penalty = 0; + m.cover = s_cover; + result.push(m); + // std::cerr << m.s_id << ": "; + // for (const auto& c : m.cover) + // std::cerr << c << ", "; + // std::cerr << std::endl; + cpt++; + if (cpt > contrast_buffer) + break; + } } // COUT filter // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; - // delete filter_matches; - /* Contrastive reranking */ - if (contrastive_factor > 0) + + if (shrinking_factor < 1.f) // submodular coverage + { + std::vector cover_weights(sorted_pattern_wids.size(), 1.f); + std::list candidates; + while (!result.empty()) + { + auto match = result.top(); + // match.penalty = match.score; // initialize corr. to weights of 1 + // match.penalty = match.score; // initialize corr. to weights of 1 + candidates.push_back(match); + result.pop(); + } + auto comp = [](const Match& m1, const Match& m2) { + return m1.penalty < m2.penalty; + }; + while (!candidates.empty() && (number_of_matches == 0 || matches.size() < number_of_matches)) + { + // rescore penalties of candidates + for (Match &match : candidates) + { + float rescore = 0.f; + // std::cerr << "rescore " << match.s_id << " : ("; + for (unsigned i = 0; i < cover_weights.size(); i++) + { + rescore += cover_weights[i] * match.cover[i]; + // if (match.cover[i] != 0) + // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; + } + + // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; + match.penalty = rescore; + } + auto it_max = std::max_element(candidates.begin(), candidates.end(), comp); + matches.push_back(*it_max); + // std::cerr << "choose No " << it_max->s_id << std::endl; + // update cover_weights + for (unsigned i = 0; i < cover_weights.size(); i++) + if (it_max->cover[i] > 0) + cover_weights[i] *= shrinking_factor; + candidates.erase(it_max); + if (shrinking_factor < 1e-20f) + { + float sum = 0.f; + for (const float& w : cover_weights) + sum += w; + if (sum < 1e-20f) + cover_weights.assign(cover_weights.size(), 1.f); + } + } + } + else if (contrastive_factor > 0) // MMR { std::list candidates; while (!result.empty()) @@ -766,6 +893,18 @@ namespace fuzzy result.pop(); } } + + // std::cerr << "final matches " << " : "; + // for (unsigned i = 0; i < matches.size(); i++) + // { + // std::cerr << std::endl << " "; + // std::cerr << matches[i].s_id << ": "; + // for (int j = 0; j < matches[i].length; j++) + // std::cerr << matches[i].s[j] << " "; + // std::cerr << std::endl; + // std::cerr << matches[i].id; + // } + // std::cerr << std::endl; return matches.size() > 0; } } diff --git a/src/ngram_matches.cc b/src/ngram_matches.cc index ed0b2a3..f50f130 100644 --- a/src/ngram_matches.cc +++ b/src/ngram_matches.cc @@ -12,13 +12,13 @@ namespace fuzzy : FilterMatches(fuzzy, p_length, min_seq_len, suffixArray) {} - std::vector> + std::vector> NGramMatches::get_best_matches() const { - std::vector> sorted_matches(_longest_matches.begin(), + std::vector> sorted_matches(_longest_matches.begin(), _longest_matches.end()); std::sort(sorted_matches.begin(), sorted_matches.end(), - [](const std::pair& a, const std::pair& b) { + [](const std::pair& a, const std::pair& b) { return a.second > b.second || (a.second == b.second && a.first < b.first); }); return sorted_matches; diff --git a/src/no_matches.cc b/src/no_matches.cc index 277aa01..2e0a9a3 100644 --- a/src/no_matches.cc +++ b/src/no_matches.cc @@ -13,7 +13,7 @@ namespace fuzzy { } - std::vector> + std::vector> NoMatches::get_best_matches() const { return _all_matches; @@ -21,7 +21,7 @@ namespace fuzzy void NoMatches::load_all() { - _all_matches = std::vector>(_filter.num_sentences()); + _all_matches = std::vector>(_filter.num_sentences()); size_t *length; for (unsigned i = 0; i < _filter.num_sentences(); i++) diff --git a/src/pattern_coverage.cc b/src/pattern_coverage.cc index 76e7887..3753377 100644 --- a/src/pattern_coverage.cc +++ b/src/pattern_coverage.cc @@ -27,4 +27,14 @@ namespace fuzzy return num_covered_words; } + bool equal_arrays(const size_t s_len, const size_t p_len, const unsigned* s, const unsigned* p) + { + if (s_len != p_len) + return false; + + for (unsigned i = 0; i < p_len; i++) + if (p[i] != s[i]) + return false; + return true; + } } From c0d8b1d90916e556aa792452452f81acbec1d004 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 10 Jan 2024 12:43:38 +0100 Subject: [PATCH 05/15] cli structure --- cli/src/FuzzyMatch-cli.cc | 34 +++++- include/fuzzy/fuzzy_match.hh | 16 ++- src/fuzzy_match.cc | 195 ++++++++++++++++------------------- 3 files changed, 130 insertions(+), 115 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 30d8f18..b56403a 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -210,7 +210,9 @@ class processor { std::string contrastive_reduce_str, int contrastive_buffer, fuzzy::IndexType filter_type, - int bm25_buffer, float bm25_cutoff, float submodular_shrinking_factor, const fuzzy::FilterIndexParams& filter_index_params): + int bm25_buffer, float bm25_cutoff, + float submodular_shrinking_factor, fuzzy::SubmodularFunction submodular_function, fuzzy::SubmodularNormalization submodular_normalization, + const fuzzy::FilterIndexParams& filter_index_params): _fuzzyMatcher(pt, max_tokens_in_pattern, filter_type, filter_index_params), _fuzzy(fuzzy), _contrastive_factor(contrastive_factor), @@ -225,7 +227,9 @@ class processor { _filter_type(filter_type), _bm25_buffer(bm25_buffer), _bm25_cutoff(bm25_cutoff), - _submodular_shrinking_factor(submodular_shrinking_factor) { + _submodular_shrinking_factor(submodular_shrinking_factor), + _submodular_function(submodular_function), + _submodular_normalization(submodular_normalization) { if (contrastive_reduce_str == "max") _contrastive_reduce = fuzzy::ContrastReduce::MAX; else @@ -237,7 +241,8 @@ class processor { _fuzzyMatcher.match(sentence, _fuzzy, _nmatch, _no_perfect, matches, _min_subseq_length, _min_subseq_ratio, _idf_penalty, _cost, _contrastive_factor, _contrastive_reduce, _contrastive_buffer, - _filter_type, _bm25_buffer, _bm25_cutoff, _submodular_shrinking_factor); + _filter_type, _bm25_buffer, _bm25_cutoff, + _submodular_shrinking_factor, _submodular_function, _submodular_normalization); std::string out; for(const fuzzy::FuzzyMatch::Match &m: matches) { @@ -296,6 +301,8 @@ class processor { int _bm25_buffer; float _bm25_cutoff; float _submodular_shrinking_factor; + fuzzy::SubmodularFunction _submodular_function; + fuzzy::SubmodularNormalization _submodular_normalization; }; int main(int argc, char** argv) @@ -329,6 +336,8 @@ int main(int argc, char** argv) float bm25_cutoff; float bm25_ratio_idf; float submodular_shrinking_factor; + std::string submodular_function_str; + std::string submodular_normalization_str; int nmatch; int nthreads; int min_subseq_length; @@ -369,6 +378,8 @@ int main(int argc, char** argv) ("bm25-buffer", po::value(&bm25_buffer)->default_value(10), "number of best BM25 to rerank") ("bm25-cutoff", po::value(&bm25_cutoff)->default_value(0.f), "minimum BM25 score threshold cutoff") ("submodular-shrinking-factor,lambda", po::value(&submodular_shrinking_factor)->default_value(1.f), "In submodularity coverage, weight shrinking factor of each covered salient aspect of the source") + ("submodular-function", po::value(&submodular_function_str)->default_value("NO"), "submodularity coverage function category (NO|BOW|NGRAM|ED)") + ("submodular-norm", po::value(&submodular_normalization_str)->default_value("NO"), "Normalization in submodular coverage score") ("nthreads,N", po::value(&nthreads)->default_value(4), "number of thread to use for match") ; @@ -445,6 +456,18 @@ int main(int argc, char** argv) filter_type = fuzzy::IndexType::NO; else filter_type = fuzzy::IndexType::SUFFIX; + fuzzy::SubmodularFunction submodular_function; + if (submodular_function_str == "BOW") + submodular_function = fuzzy::SubmodularFunction::BOW; + else if (submodular_function_str == "ED") + submodular_function = fuzzy::SubmodularFunction::ED; + else + submodular_function = fuzzy::SubmodularFunction::NO; + fuzzy::SubmodularNormalization submodular_normalization; + if (submodular_normalization_str == "BM25") + submodular_normalization = fuzzy::SubmodularNormalization::BM25; + else + submodular_normalization = fuzzy::SubmodularNormalization::NO; #ifdef NO_EIGEN assert(filter_type != fuzzy::IndexType::BM25); #endif @@ -454,8 +477,9 @@ int main(int argc, char** argv) idf_penalty, subseq_idf_weighting, max_tokens_in_pattern, edit_cost, contrastive_reduce, contrastive_buffer, - filter_type, bm25_buffer, bm25_cutoff, submodular_shrinking_factor, filter_index_params); - + filter_type, bm25_buffer, bm25_cutoff, + submodular_shrinking_factor, submodular_function, submodular_normalization, + filter_index_params); if (index_file.length()) { TICK("Loading index_file: "+index_file); import_binarized_fuzzy_matcher(index_file, O._fuzzyMatcher); diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index ac91b09..fac23b6 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -15,8 +15,8 @@ namespace onmt { namespace fuzzy { enum class ContrastReduce { MEAN, MAX }; - enum class SubmodularFunction { BOW, BM25, NGRAM, ED }; - enum class SubmodularNormalization { NO, IDF }; + enum class SubmodularFunction { NO, BOW, NGRAM, ED }; + enum class SubmodularNormalization { NO, BM25 }; class FuzzyMatch { @@ -78,7 +78,9 @@ namespace fuzzy IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, float bm25_cutoff=0, - float shrinking_factor=1.f) const; + float shrinking_factor=1.f, + SubmodularFunction submod_fun=SubmodularFunction::NO, + SubmodularNormalization submod_norm=SubmodularNormalization::NO) const; bool match(const Sentence& real, const Tokens& pattern, float fuzzy, @@ -95,7 +97,9 @@ namespace fuzzy IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, float bm25_cutoff=0, - float shrinking_factor=1.f) const; + float shrinking_factor=1.f, + SubmodularFunction submod_fun=SubmodularFunction::NO, + SubmodularNormalization submod_norm=SubmodularNormalization::NO) const; /* simplified, include tokenization */ bool match(const std::string &sentence, float fuzzy, @@ -112,7 +116,9 @@ namespace fuzzy IndexType filter_type=IndexType::SUFFIX, int bm25_buffer=10, float bm25_cutoff=0, - float shrinking_factor=1.f) const; + float shrinking_factor=1.f, + SubmodularFunction submod_fun=SubmodularFunction::NO, + SubmodularNormalization submod_norm=SubmodularNormalization::NO) const; bool subsequence(const std::string &sentence, unsigned number_of_matches, bool no_perfect, diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index a1d8368..12be570 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -407,7 +407,9 @@ namespace fuzzy IndexType filter_type, int bm25_buffer, float bm25_cutoff, - float shrinking_factor) const { + float shrinking_factor, + SubmodularFunction submod_fun, + SubmodularNormalization submod_norm) const { Sentence real; Tokens norm; @@ -416,7 +418,7 @@ namespace fuzzy min_subseq_length, min_subseq_ratio, vocab_idf_penalty, edit_costs, contrastive_factor, reduce, contrast_buffer, filter_type, bm25_buffer, bm25_cutoff, - shrinking_factor); + shrinking_factor, submod_fun, submod_norm); } /* backward compatibility */ @@ -435,14 +437,16 @@ namespace fuzzy IndexType filter_type, int bm25_buffer, float bm25_cutoff, - float shrinking_factor) const + float shrinking_factor, + SubmodularFunction submod_fun, + SubmodularNormalization submod_norm) const { const Sentence real(pattern); return match(real, pattern, fuzzy, number_of_matches, false, matches, min_subseq_length, min_subseq_ratio, vocab_idf_penalty, edit_costs, contrastive_factor, reduce, contrast_buffer, filter_type, bm25_buffer, bm25_cutoff, - shrinking_factor); + shrinking_factor, submod_fun, submod_norm); } /* check for the pattern in the suffix-array index SAI */ @@ -463,7 +467,9 @@ namespace fuzzy IndexType filter_type, int bm25_buffer, float bm25_cutoff, - float shrinking_factor) const + float shrinking_factor, + SubmodularFunction submod_fun, + SubmodularNormalization submod_norm) const { size_t p_length = pattern.size(); if (contrast_buffer == -1) @@ -612,79 +618,45 @@ namespace fuzzy lowest_costs.push(std::numeric_limits::max()); unsigned cpt = 0; - // unsigned num_filtered = 0; - - // ONLY N-grams - // for (const auto& pair : filter_matches->get_best_matches()) - // { - // const auto s_id = pair.first; - // const auto longest_match = pair.second; - // size_t s_length = 0; - // const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - // Match m(sentence_wids, s_length); - // m.score = (float)longest_match / (float)s_length; - // m.max_subseq = longest_match; - // m.s_id = s_id; - // m.id = _filterIndex->id(s_id); - // m.secondary_sort = s_id; - // m.penalty = 0; - // result.push(m); - // cpt++; - // if (cpt > contrast_buffer) - // break; - // } - - // ONLY BM25 - // for (const auto& pair : filter_matches->get_best_matches()) - // { - // const auto s_id = pair.first; - // const auto bm25_score = pair.second; - // size_t s_length = 0; - // const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - // Match m(sentence_wids, s_length); - // m.score = (float)bm25_score / (float)1000.; - // m.max_subseq = 0; - // m.s_id = s_id; - // m.id = _filterIndex->id(s_id); - // m.secondary_sort = s_id; - // m.penalty = 0; - // result.push(m); - // cpt++; - // if (cpt > contrast_buffer) - // break; - // } - - // case S = x - - ///////////// BM25 (BOW) - assert((filter_type == IndexType::BM25)); - std::vector> best_matches = filter_matches->get_best_matches(); - BM25Matches& bm25Matches = static_cast(*filter_matches); - std::vector sorted_pattern_wids(pattern_wids); - std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); + std::vector sorted_pattern_terms; std::vector count_terms; - sorted_pattern_terms.reserve(pattern_wids.size()); - count_terms.reserve(pattern_wids.size()); - if (sorted_pattern_wids.size() > 0) + std::vector> best_matches = filter_matches->get_best_matches(); + + switch(submod_fun) // Salient aspects enumeration { - unsigned current_term = sorted_pattern_wids[0]; - unsigned current_count = 1; - for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) + case SubmodularFunction::BOW: { - if (current_term != sorted_pattern_wids[i]) + std::vector sorted_pattern_wids(pattern_wids); + std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); + + sorted_pattern_terms.reserve(pattern_wids.size()); + count_terms.reserve(pattern_wids.size()); + if (sorted_pattern_wids.size() > 0) { + unsigned current_term = sorted_pattern_wids[0]; + unsigned current_count = 1; + for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) + { + if (current_term != sorted_pattern_wids[i]) + { + sorted_pattern_terms.push_back(current_term); + count_terms.push_back(current_count); + current_term = sorted_pattern_wids[i]; + current_count = 1; + } + else + current_count++; + } sorted_pattern_terms.push_back(current_term); count_terms.push_back(current_count); - current_term = sorted_pattern_wids[i]; - current_count = 1; } - else - current_count++; + break; } - sorted_pattern_terms.push_back(current_term); - count_terms.push_back(current_count); + default: + ; } + // std::cerr << "sorted unique terms" << ": "; // for (const auto& c : sorted_pattern_terms) // std::cerr << c << ", "; @@ -712,47 +684,60 @@ namespace fuzzy std::vector s_cover; float score; - ///////////// ED - // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ - // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) - // { - // const Costs costs(p_length, s_length, edit_costs); - // /* let us check the candidates */ - // const auto sentence_realtok = _filterIndex->real_tokens(s_id); - // const auto cost_upper_bound = lowest_costs.top(); - // float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, - // pattern_wids.data(), pattern_realtok, p_length, - // st, sn, - // idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - // edit_costs, - // costs, cost_upper_bound); - // // float cost = 0.1; - // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - // continue; - // float score = int(10000 - cost * 100) / 10000.0; - - // lowest_costs.push(cost); - // if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - // lowest_costs.pop(); - // } - // else - // { - // continue; - // } - ///////////// if ((no_perfect && equal_arrays(s_length, p_length, sentence_wids, pattern_wids.data()))) continue; - ///////////// BM25 - score = (float)score_filter / 1000.f; - s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); - // s_cover = std::vector(sorted_pattern_terms.size(), 1); - ///////////// + if (submod_norm == SubmodularNormalization::BM25) + { + score = (float)score_filter / 1000.f; + assert((filter_type == IndexType::BM25)); + BM25Matches& bm25Matches = static_cast(*filter_matches); + s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); + break; + } + else if (submod_norm == SubmodularNormalization::NO) + { + // score = 0; + // s_cover; + // TODO: function to compute those + } + + switch(submod_fun) // salient aspect weighted cover + { + case SubmodularFunction::BOW: + ; + + default: + const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); + /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + { + const Costs costs(p_length, s_length, edit_costs); + /* let us check the candidates */ + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + const auto cost_upper_bound = lowest_costs.top(); + float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, + pattern_wids.data(), pattern_realtok, p_length, + st, sn, + idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + edit_costs, + costs, cost_upper_bound); + // float cost = 0.1; + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + float score = int(10000 - cost * 100) / 10000.0; - ///////////// NGRAM - /* TODO */ - ///////////// + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); + } + else + { + continue; + } + + } if (score >= fuzzy) { Match m(sentence_wids, s_length); @@ -779,7 +764,7 @@ namespace fuzzy if (shrinking_factor < 1.f) // submodular coverage { - std::vector cover_weights(sorted_pattern_wids.size(), 1.f); + std::vector cover_weights(sorted_pattern_terms.size(), 1.f); std::list candidates; while (!result.empty()) { From 334fba2ad84a64725f9061afbded5374b6f11da6 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 10 Jan 2024 14:14:15 +0100 Subject: [PATCH 06/15] cli structure --- src/fuzzy_match.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 12be570..a670040 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -698,6 +698,13 @@ namespace fuzzy } else if (submod_norm == SubmodularNormalization::NO) { + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + get_bow_score( + sorted_pattern_terms, + count_terms, + sentence_realtok, + s_length + ) // score = 0; // s_cover; // TODO: function to compute those From 3519f7896ca73b3219e6eda5a25a2dcea48a5478 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 10 Jan 2024 17:37:17 +0100 Subject: [PATCH 07/15] BOW | NO+IDF+BM25 --- cli/src/FuzzyMatch-cli.cc | 3 ++ include/fuzzy/fuzzy_match.hh | 2 +- src/CMakeLists.txt | 1 + src/fuzzy_match.cc | 72 +++++++++++++++++++++++++----------- 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index b56403a..4d4cfab 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -464,8 +464,11 @@ int main(int argc, char** argv) else submodular_function = fuzzy::SubmodularFunction::NO; fuzzy::SubmodularNormalization submodular_normalization; + std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl; if (submodular_normalization_str == "BM25") submodular_normalization = fuzzy::SubmodularNormalization::BM25; + else if (submodular_normalization_str == "IDF") + submodular_normalization = fuzzy::SubmodularNormalization::IDF; else submodular_normalization = fuzzy::SubmodularNormalization::NO; #ifdef NO_EIGEN diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index fac23b6..5e0cb38 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -16,7 +16,7 @@ namespace fuzzy { enum class ContrastReduce { MEAN, MAX }; enum class SubmodularFunction { NO, BOW, NGRAM, ED }; - enum class SubmodularNormalization { NO, BM25 }; + enum class SubmodularNormalization { NO, IDF, BM25 }; class FuzzyMatch { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6e6c44c..b7a96fa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,7 @@ set(FUZZY_SOURCES index.cc no_filter.cc no_matches.cc + submodular.cc ) if(MSVC) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index a670040..9309d4f 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -622,6 +622,7 @@ namespace fuzzy std::vector sorted_pattern_terms; std::vector count_terms; std::vector> best_matches = filter_matches->get_best_matches(); + std::vector norm_weight; switch(submod_fun) // Salient aspects enumeration { @@ -656,18 +657,21 @@ namespace fuzzy default: ; } - - // std::cerr << "sorted unique terms" << ": "; - // for (const auto& c : sorted_pattern_terms) - // std::cerr << c << ", "; - // std::cerr << std::endl; - // std::cerr << "counts" << ": "; - // for (const auto& c : count_terms) - // std::cerr << c << ", "; - // std::cerr << std::endl; + std::vector sorted_pattern_terms_idf; + if (submod_norm == SubmodularNormalization::IDF) + sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); + + std::cerr << "sorted unique terms" << ": "; + for (const auto& c : sorted_pattern_terms) + std::cerr << c << ", "; + std::cerr << std::endl; + std::cerr << "Idf" << ": "; + for (const auto& c : sorted_pattern_terms_idf) + std::cerr << c << ", "; + std::cerr << std::endl; ///////////// - // std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush; + std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush; for (const auto& pair : best_matches) { @@ -680,7 +684,6 @@ namespace fuzzy // ? pattern_coverage.count_covered_words(sentence_wids, s_length) // : p_length); // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); - // TODO: adapt to filter n-gram existence std::vector s_cover; float score; @@ -690,31 +693,58 @@ namespace fuzzy if (submod_norm == SubmodularNormalization::BM25) { + std::cerr << "BM25 norm..." << std::endl << std::flush; score = (float)score_filter / 1000.f; assert((filter_type == IndexType::BM25)); BM25Matches& bm25Matches = static_cast(*filter_matches); s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); - break; } else if (submod_norm == SubmodularNormalization::NO) { - const auto sentence_realtok = _filterIndex->real_tokens(s_id); + std::cerr << "No norm..." << std::endl << std::flush; get_bow_score( sorted_pattern_terms, count_terms, - sentence_realtok, - s_length - ) - // score = 0; - // s_cover; - // TODO: function to compute those + sentence_wids, + s_length, + score, + s_cover); + } + else if (submod_norm == SubmodularNormalization::IDF) + { + std::cerr << "IDF norm..." << std::endl << std::flush; + get_bow_score_idf( + sorted_pattern_terms, + count_terms, + sentence_wids, + s_length, + sorted_pattern_terms_idf, + score, + s_cover); } + // std::cerr << "q: "; + // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) + // std::cerr << sorted_pattern_terms[i] << ","; + // std::cerr << std::endl; + // std::cerr << "q count: "; + // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) + // std::cerr << count_terms[i] << ","; + // std::cerr << std::endl; + std::cerr << "sent: "; + for (unsigned i = 0; i < s_length; i++) + std::cerr << sentence_wids[i] << ","; + std::cerr << std::endl; + std::cerr << "cover: "; + for (unsigned i = 0; i < s_cover.size(); i++) + std::cerr << s_cover[i] << ","; + std::cerr << std::endl; + std::cerr << "score: " << score << std::endl; + std::cerr << "...done" << std::endl << std::flush; switch(submod_fun) // salient aspect weighted cover { case SubmodularFunction::BOW: - ; - + break; default: const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ From d55f369ea83b9b4b90474392d4ca644bad0ae86a Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Wed, 10 Jan 2024 17:37:55 +0100 Subject: [PATCH 08/15] forgotten files --- include/fuzzy/submodular.hh | 26 ++++++++++++++++ src/submodular.cc | 62 +++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 include/fuzzy/submodular.hh create mode 100644 src/submodular.cc diff --git a/include/fuzzy/submodular.hh b/include/fuzzy/submodular.hh new file mode 100644 index 0000000..64a5f7f --- /dev/null +++ b/include/fuzzy/submodular.hh @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + + +namespace fuzzy +{ + void get_bow_score( + std::vector& sorted_pattern_terms, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + float& score, + std::vector& cover); + + void get_bow_score_idf( + std::vector& sorted_pattern_terms, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + std::vector& idf_penalty, + float& score, + std::vector& cover); +} diff --git a/src/submodular.cc b/src/submodular.cc new file mode 100644 index 0000000..81aadab --- /dev/null +++ b/src/submodular.cc @@ -0,0 +1,62 @@ +#include +#include +#include + +namespace fuzzy +{ + void get_bow_score( + std::vector& sorted_pattern_terms, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + float& score, + std::vector& cover) + { + std::vector sorted_sentence_terms(sentence, sentence + sentence_length); + std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); + cover = std::vector(sorted_pattern_terms.size(), 0.f); + score = 0.f; + for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) + { + while ( + (i < sorted_pattern_terms.size()) && + (sorted_pattern_terms[i] < sorted_sentence_terms[j])) + i++; + // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << " (" << i << ", " << j << ")" << std::endl; + if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) + if ((float)count_terms[i] > cover[i] + 1e-6f) + { + cover[i] += 1.f; + score += 1.f; + } + } + } + void get_bow_score_idf( + std::vector& sorted_pattern_terms, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + std::vector& idf_penalty, + float& score, + std::vector& cover) + { + std::vector sorted_sentence_terms(sentence, sentence + sentence_length); + std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); + cover = std::vector(sorted_pattern_terms.size(), 0.f); + score = 0.f; + for (unsigned i, j, k = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) + { + while ( + (i < sorted_pattern_terms.size()) && + (sorted_pattern_terms[i] < sorted_sentence_terms[j])) + i++; + // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << " (" << i << ", " << j << ")" << std::endl; + if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) + if ((float)count_terms[i] > cover[i] / idf_penalty[i] + 1e-6f) + { + cover[i] += idf_penalty[i]; + score += idf_penalty[i]; + } + } + } +} \ No newline at end of file From 83172ace2db7656cb2974f23288cdc7cd6966dc5 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Fri, 12 Jan 2024 13:42:26 +0100 Subject: [PATCH 09/15] ngram implementation --- cli/src/FuzzyMatch-cli.cc | 2 + include/fuzzy/fuzzy_match.hh | 4 + include/fuzzy/submodular.hh | 87 ++++++++++++++---- include/fuzzy/submodular.hxx | 85 ++++++++++++++++++ src/fuzzy_match.cc | 170 ++++++++++++++++++++++------------- src/submodular.cc | 164 ++++++++++++++++++++++++--------- 6 files changed, 396 insertions(+), 116 deletions(-) create mode 100644 include/fuzzy/submodular.hxx diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 4d4cfab..9dd1b23 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -459,6 +459,8 @@ int main(int argc, char** argv) fuzzy::SubmodularFunction submodular_function; if (submodular_function_str == "BOW") submodular_function = fuzzy::SubmodularFunction::BOW; + else if (submodular_function_str == "NGRAM") + submodular_function = fuzzy::SubmodularFunction::NGRAM; else if (submodular_function_str == "ED") submodular_function = fuzzy::SubmodularFunction::ED; else diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index 5e0cb38..3d4560f 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace onmt { @@ -163,6 +164,9 @@ namespace fuzzy std::vector compute_idf_penalty(const std::vector& pattern_wids, float unknown_vocab_word_penalty = 0) const; + std::vector + compute_idf_penalty(const std::vector& pattern_ngrams, + float unknown_vocab_word_penalty = 0) const; /* penalty tokens */ int _pt; diff --git a/include/fuzzy/submodular.hh b/include/fuzzy/submodular.hh index 64a5f7f..9bebe94 100644 --- a/include/fuzzy/submodular.hh +++ b/include/fuzzy/submodular.hh @@ -3,24 +3,79 @@ #include #include #include +#include namespace fuzzy { - void get_bow_score( - std::vector& sorted_pattern_terms, - std::vector& count_terms, - const unsigned* sentence, - const unsigned sentence_length, - float& score, - std::vector& cover); - - void get_bow_score_idf( - std::vector& sorted_pattern_terms, - std::vector& count_terms, - const unsigned* sentence, - const unsigned sentence_length, - std::vector& idf_penalty, - float& score, - std::vector& cover); + class NGram + { + public: + NGram(const unsigned* start, unsigned N); + ~NGram() {} + NGram& operator=(const NGram& other); + bool operator==(NGram& other); + bool operator<(NGram& other); + void print() const; + const unsigned* _start; + unsigned _N; + }; + + inline + std::vector get_sorted_ngrams( + const unsigned N, + const unsigned* sentence, + const unsigned sentence_length); + + template + void get_unique_with_count( + std::vector& sorted_salient, + std::vector& unique, + std::vector& count); + + template + void get_score( + std::vector& sorted_pattern_terms, + std::vector& sorted_sentence_terms, + std::vector& count_terms, + float& score, + std::vector& cover, + std::vector& idf_penalty); + + void get_bow_score( + std::vector& sorted_pattern_terms, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + float& score, + std::vector& cover, + std::vector& idf_penalty); + + void get_ngram_score( + std::vector& sorted_pattern_terms, + const unsigned N, + std::vector& count_terms, + const unsigned* sentence, + const unsigned sentence_length, + float& score, + std::vector& cover, + std::vector& idf_penalty); + + // void get_bow_score_idf( + // std::vector& sorted_pattern_terms, + // std::vector& count_terms, + // const unsigned* sentence, + // const unsigned sentence_length, + // std::vector& idf_penalty, + // float& score, + // std::vector& cover); + + void get_all_ngrams( + const unsigned* sequence, + const unsigned length, + const unsigned N, + std::vector& ngrams, + std::vector& counts); } + +#include "submodular.hxx" diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx new file mode 100644 index 0000000..0353158 --- /dev/null +++ b/include/fuzzy/submodular.hxx @@ -0,0 +1,85 @@ + +namespace fuzzy +{ + inline + std::vector get_sorted_ngrams( + const unsigned N_const, + const unsigned* sentence, + const unsigned sentence_length) + { + std::vector all_ngrams; + const unsigned N = std::min(N_const, sentence_length); + all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2); + for (unsigned n = 1; n <= N; n++) + for (unsigned i = 0; i < sentence_length - n + 1; i++) + all_ngrams.push_back(NGram(sentence + i, n)); + std::sort(all_ngrams.begin(), all_ngrams.end()); + return all_ngrams; + } + + template + void get_score( + std::vector& sorted_pattern_terms, + std::vector& sorted_sentence_terms, + std::vector& count_terms, + float& score, + std::vector& cover, + std::vector& idf_penalty) + { + cover = std::vector(sorted_pattern_terms.size(), 0.f); + score = 0.f; + for ( + unsigned i, j, k = 0; + (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); + j++) + { + while ( + (i < sorted_pattern_terms.size()) && + (sorted_pattern_terms[i] < sorted_sentence_terms[j])) + i++; + if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) + if (idf_penalty.size() > 0) + { + if ((float)count_terms[i] > cover[i] / idf_penalty[i] + 1e-6f) + { + cover[i] += idf_penalty[i]; + score += idf_penalty[i]; + } + } + else if ((float)count_terms[i] > cover[i] + 1e-6f) + { + cover[i] += 1.f; + score += 1.f; + } + } + } + + template + void get_unique_with_count( + std::vector& sorted_salient, + std::vector& unique, + std::vector& count) + { + unique.reserve(sorted_salient.size()); + count.reserve(sorted_salient.size()); + if (sorted_salient.size() > 0) + { + T& current_salient = sorted_salient[0]; + unsigned current_count = 1; + for (unsigned i = 1; i < sorted_salient.size(); i++) + { + if (!(current_salient == sorted_salient[i])) + { + unique.push_back(current_salient); + count.push_back(current_count); + current_salient = sorted_salient[i]; + current_count = 1; + } + else + current_count++; + } + unique.push_back(current_salient); + count.push_back(current_count); + } + } +} \ No newline at end of file diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 9309d4f..3b6a60a 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -391,6 +391,29 @@ namespace fuzzy return idf_penalty; } + std::vector FuzzyMatch::compute_idf_penalty(const std::vector& pattern_ngrams, + float unknown_vocab_word_penalty) const { + std::vector idf_penalty; + idf_penalty.reserve(pattern_ngrams.size()); + + const unsigned num_sentences = _filterIndex->get_Filter().num_sentences(); + + const std::vector& word_frequency_in_sentences = _filterIndex->get_VocabIndexer().getSFreq(); + unsigned n; + float tot_idf; + for (const NGram& ngram : pattern_ngrams) { + for (n = 0, tot_idf = 0.f; n < ngram._N; n++) + { + if (ngram._start[n] != fuzzy::VocabIndexer::VOCAB_UNK) + tot_idf += std::log((float)num_sentences/(float)word_frequency_in_sentences[ngram._start[n]]); + else + tot_idf += unknown_vocab_word_penalty; + } + idf_penalty.push_back(tot_idf / (float)n); + } + return idf_penalty; + } + /* interface with integrated tokenization */ bool FuzzyMatch::match(const std::string &sentence, float fuzzy, @@ -620,46 +643,62 @@ namespace fuzzy unsigned cpt = 0; std::vector sorted_pattern_terms; + std::vector sorted_pattern_ngrams; std::vector count_terms; std::vector> best_matches = filter_matches->get_best_matches(); std::vector norm_weight; + std::vector sorted_pattern_terms_idf; - switch(submod_fun) // Salient aspects enumeration + /* Salient aspects enumeration */ + switch(submod_fun) { case SubmodularFunction::BOW: { std::vector sorted_pattern_wids(pattern_wids); std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); - sorted_pattern_terms.reserve(pattern_wids.size()); - count_terms.reserve(pattern_wids.size()); - if (sorted_pattern_wids.size() > 0) - { - unsigned current_term = sorted_pattern_wids[0]; - unsigned current_count = 1; - for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) - { - if (current_term != sorted_pattern_wids[i]) - { - sorted_pattern_terms.push_back(current_term); - count_terms.push_back(current_count); - current_term = sorted_pattern_wids[i]; - current_count = 1; - } - else - current_count++; - } - sorted_pattern_terms.push_back(current_term); - count_terms.push_back(current_count); - } - break; + get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms); + + if (submod_norm == SubmodularNormalization::IDF) + sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); + + // sorted_pattern_terms.reserve(pattern_wids.size()); + // count_terms.reserve(pattern_wids.size()); + // if (sorted_pattern_wids.size() > 0) + // { + // unsigned current_term = sorted_pattern_wids[0]; + // unsigned current_count = 1; + // for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) + // { + // if (current_term != sorted_pattern_wids[i]) + // { + // sorted_pattern_terms.push_back(current_term); + // count_terms.push_back(current_count); + // current_term = sorted_pattern_wids[i]; + // current_count = 1; + // } + // else + // current_count++; + // } + // sorted_pattern_terms.push_back(current_term); + // count_terms.push_back(current_count); + // } + // break; + } + case SubmodularFunction::NGRAM: + { + get_all_ngrams( + pattern_wids.data(), + p_length, + 4, + sorted_pattern_ngrams, + count_terms); + if (submod_norm == SubmodularNormalization::IDF) + sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_ngrams); } default: ; } - std::vector sorted_pattern_terms_idf; - if (submod_norm == SubmodularNormalization::IDF) - sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); std::cerr << "sorted unique terms" << ": "; for (const auto& c : sorted_pattern_terms) @@ -699,28 +738,41 @@ namespace fuzzy BM25Matches& bm25Matches = static_cast(*filter_matches); s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); } - else if (submod_norm == SubmodularNormalization::NO) + // else if (submod_norm == SubmodularNormalization::NO) + // { + // std::cerr << "No norm..." << std::endl << std::flush; + // get_bow_score( + // sorted_pattern_terms, + // count_terms, + // sentence_wids, + // s_length, + // score, + // s_cover); + // } + else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO) { - std::cerr << "No norm..." << std::endl << std::flush; - get_bow_score( - sorted_pattern_terms, - count_terms, - sentence_wids, - s_length, - score, - s_cover); - } - else if (submod_norm == SubmodularNormalization::IDF) - { - std::cerr << "IDF norm..." << std::endl << std::flush; - get_bow_score_idf( - sorted_pattern_terms, - count_terms, - sentence_wids, - s_length, - sorted_pattern_terms_idf, - score, - s_cover); + std::cerr + << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO") + << " norm..." << std::endl << std::flush; + if (submod_fun == SubmodularFunction::BOW) + get_bow_score( + sorted_pattern_terms, + count_terms, + sentence_wids, + s_length, + score, + s_cover, + sorted_pattern_terms_idf); + else if (submod_fun == SubmodularFunction::NGRAM) + get_ngram_score( + sorted_pattern_ngrams, + 4, + count_terms, + sentence_wids, + s_length, + score, + s_cover, + sorted_pattern_terms_idf); } // std::cerr << "q: "; // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) @@ -744,6 +796,7 @@ namespace fuzzy switch(submod_fun) // salient aspect weighted cover { case SubmodularFunction::BOW: + case SubmodularFunction::NGRAM: break; default: const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); @@ -763,17 +816,16 @@ namespace fuzzy // float cost = 0.1; if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) continue; - float score = int(10000 - cost * 100) / 10000.0; + float score = int(10000 - cost * 100) / 10000.0; - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); } else { continue; } - } if (score >= fuzzy) { @@ -801,13 +853,11 @@ namespace fuzzy if (shrinking_factor < 1.f) // submodular coverage { - std::vector cover_weights(sorted_pattern_terms.size(), 1.f); + std::vector cover_weights(count_terms.size(), 1.f); std::list candidates; while (!result.empty()) { auto match = result.top(); - // match.penalty = match.score; // initialize corr. to weights of 1 - // match.penalty = match.score; // initialize corr. to weights of 1 candidates.push_back(match); result.pop(); } @@ -820,20 +870,20 @@ namespace fuzzy for (Match &match : candidates) { float rescore = 0.f; - // std::cerr << "rescore " << match.s_id << " : ("; + std::cerr << "rescore " << match.s_id << " : ("; for (unsigned i = 0; i < cover_weights.size(); i++) { rescore += cover_weights[i] * match.cover[i]; - // if (match.cover[i] != 0) - // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; + if (match.cover[i] != 0) + std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; } - // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; + std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; match.penalty = rescore; } auto it_max = std::max_element(candidates.begin(), candidates.end(), comp); matches.push_back(*it_max); - // std::cerr << "choose No " << it_max->s_id << std::endl; + std::cerr << "choose No " << it_max->s_id << std::endl; // update cover_weights for (unsigned i = 0; i < cover_weights.size(); i++) if (it_max->cover[i] > 0) diff --git a/src/submodular.cc b/src/submodular.cc index 81aadab..fcaf2a8 100644 --- a/src/submodular.cc +++ b/src/submodular.cc @@ -1,62 +1,146 @@ #include -#include #include +#include namespace fuzzy { + NGram::NGram(const unsigned* start, unsigned N) : + _start(start), + _N(N) {} + NGram& NGram::operator=(const NGram& other) { + if (this != &other) + { + _start = other._start; + _N = other._N; + } + return *this; + } + bool NGram::operator==(NGram& other) + { + bool out = _N == other._N; + for (unsigned i = 0; out && (i < _N); i++) + out = (other._start[i] == _start[i]); + return out; + } + bool NGram::operator<(NGram& other) + { + if (_N != other._N) + return _N < other._N; + return std::lexicographical_compare( + _start, _start + _N, + other._start, other._start + other._N); + } + void NGram::print() const + { + for (unsigned i = 0; i < _N; i++) + std::cerr << _start[i] << ","; + std::cerr << '\t'; + } + void get_bow_score( std::vector& sorted_pattern_terms, std::vector& count_terms, const unsigned* sentence, const unsigned sentence_length, float& score, - std::vector& cover) + std::vector& cover, + std::vector& idf_penalty) { std::vector sorted_sentence_terms(sentence, sentence + sentence_length); std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); - cover = std::vector(sorted_pattern_terms.size(), 0.f); - score = 0.f; - for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) - { - while ( - (i < sorted_pattern_terms.size()) && - (sorted_pattern_terms[i] < sorted_sentence_terms[j])) - i++; - // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << " (" << i << ", " << j << ")" << std::endl; - if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) - if ((float)count_terms[i] > cover[i] + 1e-6f) - { - cover[i] += 1.f; - score += 1.f; - } - } + get_score(sorted_pattern_terms, sorted_sentence_terms, count_terms, score, cover, idf_penalty); } - void get_bow_score_idf( - std::vector& sorted_pattern_terms, + + void get_ngram_score( + std::vector& sorted_pattern_terms, + const unsigned N, std::vector& count_terms, const unsigned* sentence, const unsigned sentence_length, - std::vector& idf_penalty, float& score, - std::vector& cover) + std::vector& cover, + std::vector& idf_penalty) { - std::vector sorted_sentence_terms(sentence, sentence + sentence_length); - std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); - cover = std::vector(sorted_pattern_terms.size(), 0.f); - score = 0.f; - for (unsigned i, j, k = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) - { - while ( - (i < sorted_pattern_terms.size()) && - (sorted_pattern_terms[i] < sorted_sentence_terms[j])) - i++; - // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << " (" << i << ", " << j << ")" << std::endl; - if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) - if ((float)count_terms[i] > cover[i] / idf_penalty[i] + 1e-6f) - { - cover[i] += idf_penalty[i]; - score += idf_penalty[i]; - } - } + std::cerr << "avant " << sentence_length << std::endl; + std::vector all_ngrams = get_sorted_ngrams(N, sentence, sentence_length); + std::cerr << "apres" << std::endl; + // all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2); + // for (unsigned n = 1; n <= N; n++) + // for (unsigned i = 0; i < sentence_length - n + 1; i++) + // all_ngrams.push_back(NGram(sentence + i, n)); + + // std::sort(all_ngrams.begin(), all_ngrams.end()); + + for (const NGram& ngram : all_ngrams) + ngram.print(); + std::cerr << std::endl; + + get_score(sorted_pattern_terms, all_ngrams, count_terms, score, cover, idf_penalty); + std::cerr << "xxx" << std::endl; + } + + // void get_bow_score( + // std::vector& sorted_pattern_terms, + // std::vector& count_terms, + // const unsigned* sentence, + // const unsigned sentence_length, + // float& score, + // std::vector& cover) + // { + // std::vector sorted_sentence_terms(sentence, sentence + sentence_length); + // std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); + // cover = std::vector(sorted_pattern_terms.size(), 0.f); + // score = 0.f; + // for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) + // { + // while ( + // (i < sorted_pattern_terms.size()) && + // (sorted_pattern_terms[i] < sorted_sentence_terms[j])) + // i++; + // if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) + // if ((float)count_terms[i] > cover[i] + 1e-6f) + // { + // cover[i] += 1.f; + // score += 1.f; + // } + // } + // } + + void get_all_ngrams( + const unsigned* sequence, + const unsigned length, + const unsigned N, + std::vector& ngrams, + std::vector& counts) + { + std::vector all_ngrams = get_sorted_ngrams(N, sequence, length); + // std::vector all_ngrams; + // all_ngrams.reserve(N * length - N * (N - 1) / 2); + // for (unsigned n = 1; n <= N; n++) + // for (unsigned i = 0; i < length - n + 1; i++) + // all_ngrams.push_back(NGram(sequence + i, n)); + // // std::cerr << std::endl; + // std::sort(all_ngrams.begin(), all_ngrams.end()); + for (const NGram& ngram : all_ngrams) + ngram.print(); + std::cerr << std::endl; + + get_unique_with_count(all_ngrams, ngrams, counts); + // std::cerr << "ALL" << std::endl; + // for (unsigned i = 0; i < ngrams.size(); i++) + // { + // ngrams[i].print(); + // std::cerr << counts[i] << std::endl; + // } + // std::cerr << std::endl; + // std::cerr << "ONLY > 1" << std::endl; + // for (unsigned i = 0; i < ngrams.size(); i++) + // if (counts[i] > 1) + // { + // ngrams[i].print(); + // std::cerr << counts[i] << std::endl; + // } + // std::cerr << std::endl; + } } \ No newline at end of file From 6021f02c486b9c2d1c80dbb1976c6608b86ad695 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Fri, 12 Jan 2024 16:32:24 +0100 Subject: [PATCH 10/15] ngram implementation --- src/fuzzy_match.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 3b6a60a..27dea52 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -403,12 +403,10 @@ namespace fuzzy float tot_idf; for (const NGram& ngram : pattern_ngrams) { for (n = 0, tot_idf = 0.f; n < ngram._N; n++) - { if (ngram._start[n] != fuzzy::VocabIndexer::VOCAB_UNK) tot_idf += std::log((float)num_sentences/(float)word_frequency_in_sentences[ngram._start[n]]); else tot_idf += unknown_vocab_word_penalty; - } idf_penalty.push_back(tot_idf / (float)n); } return idf_penalty; From f1f92e5c2a08b1f7623e22daf7e80c88432cc729 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Fri, 12 Jan 2024 18:09:25 +0100 Subject: [PATCH 11/15] ed implementation --- include/fuzzy/edit_distance.hh | 12 +++ src/edit_distance.cc | 109 ++++++++++++++++++++++ src/fuzzy_match.cc | 163 ++++++++++++++++++--------------- src/submodular.cc | 68 -------------- 4 files changed, 211 insertions(+), 141 deletions(-) diff --git a/include/fuzzy/edit_distance.hh b/include/fuzzy/edit_distance.hh index eff7ab9..aa88717 100644 --- a/include/fuzzy/edit_distance.hh +++ b/include/fuzzy/edit_distance.hh @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include @@ -21,6 +23,16 @@ namespace fuzzy const EditCosts& edit_costs, const Costs& costs, float max_fuzzyness = std::numeric_limits::max()); + + float _edit_distance_cover(const unsigned* thes, const Sentence &reals, int slen, + const unsigned* thep, const Tokens &realptok, int plen, + const std::vector& st, const std::vector& sn, + const std::vector &idf_penalty, float idf_weight, + const EditCosts&, + const Costs&, + std::vector& cover, + const bool idf_cover = false, + float max_fuzziness = std::numeric_limits::max()); } #include diff --git a/src/edit_distance.cc b/src/edit_distance.cc index 2d72d41..30c3976 100644 --- a/src/edit_distance.cc +++ b/src/edit_distance.cc @@ -120,4 +120,113 @@ namespace fuzzy } return arr[n1][n2]; } + + float + _edit_distance_cover(const unsigned* s1, const Sentence &real1, int n1, + const unsigned* s2, const Tokens &real2tok, int n2, + const std::vector& st2, const std::vector& sn2, + const std::vector &idf_penalty, float idf_weight, + const EditCosts& edit_costs, + const Costs& costs, + std::vector& cover, + const bool idf_cover, + float max_fuzzyness) + { + boost::multi_array arr(boost::extents[n1+1][n2+1]); + boost::multi_array traceback(boost::extents[n1+1][n2+1]); + boost::multi_array cost_tag(boost::extents[n1+1][n2+1]); + /* idf_penalty(w) = log(nbre seqs / nbre occ w) */ + /* idf_weight = weight * costs.diff_word / log(nbre seqs) */ + + std::vector st1(n1+1, nullptr); + std::vector sn1(n1+1, 0); + real1.get_itoks(st1, sn1); + Tokens real1tok = (Tokens)real1; + + /* we have a fixed cost corresponding to trailing penalty_tokens */ + arr[0][0] = _edit_distance_char(st1[n1], sn1[n1], st2[n2], sn2[n2]); + cost_tag[0][0] = _edit_distance_char(st1[0], sn1[0], st2[0], sn2[0]); + + for (int i = 1; i < n1 + 1; i++) { + /* initialize distance source side (real1) */ + arr[i][0] = arr[i-1][0] + costs.diff_word * edit_costs.delete_cost + sn1[i]; + traceback[i][0] = 0; + cost_tag[i][0] = _edit_distance_char(st1[i], sn1[i], st2[0], sn2[0]); + } + for (int j = 1; j < n2 + 1; j++) { + /* initialize distance target side (real2tok) */ + arr[0][j] = arr[0][j-1] + costs.diff_word * edit_costs.insert_cost + sn2[j]; + traceback[0][j] = 1; + if (idf_weight) + arr[0][j] += idf_penalty[j-1] * idf_weight; + cost_tag[0][j] = _edit_distance_char(st1[0], sn1[0], st2[j], sn2[j]); + } + + for (int i = 1; i < n1 + 1; i++) + { + float min = std::numeric_limits::max(); + for (int j = 1; j < n2 + 1; j++) + { + float diff = 0.f; + float penalty_j1 = 0.f; + if (idf_weight) + penalty_j1 = idf_penalty[j-1] * idf_weight; + if (s1[i-1] != s2[j-1]) { + diff = edit_costs.replace_cost * costs.diff_word + penalty_j1; + } + else if (real1tok[i-1] != real2tok[j-1]) { + /* is difference only a case difference */ + if (strchr("LUMC", real1tok[i-1][0])) + diff = edit_costs.replace_cost * costs.diff_case; + else { + diff = edit_costs.replace_cost * costs.diff_real; + } + } + + cost_tag[i][j] = _edit_distance_char(st1[i], sn1[i], st2[j], sn2[j]); + const auto previous = { + arr[i - 1][j] + edit_costs.delete_cost * costs.diff_word + cost_tag[i - 1][j], + arr[i][j - 1] + edit_costs.insert_cost * costs.diff_word + cost_tag[i][j - 1] + penalty_j1, + arr[i - 1][j - 1] + diff + cost_tag[i - 1][j - 1] + }; + const auto min_it = std::min_element(std::begin(previous), std::end(previous)); + unsigned argmin = std::distance(std::begin(previous), min_it); + const auto distance = *min_it; + if (argmin == 2 && diff == 0.f) + argmin = 3; + traceback[i][j] = argmin; + + arr[i][j] = distance; + min = std::min(min, distance); + } + if (min > max_fuzzyness) + return min; + } + int i = n1; + int j = n2; + while (i != 0 || j != 0) + switch(traceback[i][j]) + { + case 0: + i--; + break; + case 1: + j--; + break; + case 2: + i--; + j--; + break; + case 3: + i--; + j--; + std::cerr << "idf cover " << j << " " << idf_penalty.size() << std::endl << std::flush; + cover[j] = idf_cover ? idf_penalty[j] : 1.f; + break; + default: + ; + }; + + return arr[n1][n2]; + } } diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 27dea52..f1d394d 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -652,6 +652,7 @@ namespace fuzzy { case SubmodularFunction::BOW: { + // all terms bow std::vector sorted_pattern_wids(pattern_wids); std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); @@ -659,32 +660,11 @@ namespace fuzzy if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); - - // sorted_pattern_terms.reserve(pattern_wids.size()); - // count_terms.reserve(pattern_wids.size()); - // if (sorted_pattern_wids.size() > 0) - // { - // unsigned current_term = sorted_pattern_wids[0]; - // unsigned current_count = 1; - // for (unsigned i = 1; i < sorted_pattern_wids.size(); i++) - // { - // if (current_term != sorted_pattern_wids[i]) - // { - // sorted_pattern_terms.push_back(current_term); - // count_terms.push_back(current_count); - // current_term = sorted_pattern_wids[i]; - // current_count = 1; - // } - // else - // current_count++; - // } - // sorted_pattern_terms.push_back(current_term); - // count_terms.push_back(current_count); - // } - // break; + break; } case SubmodularFunction::NGRAM: { + // all ngrams bow get_all_ngrams( pattern_wids.data(), p_length, @@ -693,6 +673,15 @@ namespace fuzzy count_terms); if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_ngrams); + break; + } + case SubmodularFunction::ED: + { + // sentence indices + sorted_pattern_terms = std::vector(pattern_wids); + if (submod_norm == SubmodularNormalization::IDF) + sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); + break; } default: ; @@ -708,8 +697,6 @@ namespace fuzzy std::cerr << std::endl; ///////////// - std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush; - for (const auto& pair : best_matches) { // num_filtered++; @@ -717,10 +704,6 @@ namespace fuzzy const auto score_filter = pair.second; size_t s_length = 0; const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length); - // const auto num_covered_words = (score_filter < p_length - // ? pattern_coverage.count_covered_words(sentence_wids, s_length) - // : p_length); - // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); std::vector s_cover; float score; @@ -736,17 +719,6 @@ namespace fuzzy BM25Matches& bm25Matches = static_cast(*filter_matches); s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id); } - // else if (submod_norm == SubmodularNormalization::NO) - // { - // std::cerr << "No norm..." << std::endl << std::flush; - // get_bow_score( - // sorted_pattern_terms, - // count_terms, - // sentence_wids, - // s_length, - // score, - // s_cover); - // } else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO) { std::cerr @@ -771,32 +743,43 @@ namespace fuzzy score, s_cover, sorted_pattern_terms_idf); - } - // std::cerr << "q: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << sorted_pattern_terms[i] << ","; - // std::cerr << std::endl; - // std::cerr << "q count: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << count_terms[i] << ","; - // std::cerr << std::endl; - std::cerr << "sent: "; - for (unsigned i = 0; i < s_length; i++) - std::cerr << sentence_wids[i] << ","; - std::cerr << std::endl; - std::cerr << "cover: "; - for (unsigned i = 0; i < s_cover.size(); i++) - std::cerr << s_cover[i] << ","; - std::cerr << std::endl; - std::cerr << "score: " << score << std::endl; - std::cerr << "...done" << std::endl << std::flush; - - switch(submod_fun) // salient aspect weighted cover - { - case SubmodularFunction::BOW: - case SubmodularFunction::NGRAM: - break; - default: + else if (submod_fun == SubmodularFunction::ED) + { + const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); + /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + { + const Costs costs(p_length, s_length, edit_costs); + /* let us check the candidates */ + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + const auto cost_upper_bound = lowest_costs.top(); + s_cover = std::vector(p_length, 0.f); + if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF) + idf_penalty = compute_idf_penalty(pattern_wids); + float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length, + pattern_wids.data(), pattern_realtok, p_length, + st, sn, + idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + edit_costs, + costs, + s_cover, + submod_norm == SubmodularNormalization::IDF, + cost_upper_bound); + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + float score = int(10000 - cost * 100) / 10000.0; + + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); + } + else + { + continue; + } + } + else + { const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) @@ -812,20 +795,52 @@ namespace fuzzy edit_costs, costs, cost_upper_bound); // float cost = 0.1; - if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - continue; - float score = int(10000 - cost * 100) / 10000.0; + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + score = int(10000 - cost * 100) / 10000.0; - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); } else { continue; } + } } + // switch(submod_fun) + // { + // case SubmodularFunction::BOW: + // case SubmodularFunction::NGRAM: + // break; + // case SubmodularFunction::ED: + + // default: + + // } + + // std::cerr << "q: "; + // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) + // std::cerr << sorted_pattern_terms[i] << ","; + // std::cerr << std::endl; + // std::cerr << "q count: "; + // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) + // std::cerr << count_terms[i] << ","; + // std::cerr << std::endl; + std::cerr << "sent: "; + for (unsigned i = 0; i < s_length; i++) + std::cerr << sentence_wids[i] << ","; + std::cerr << std::endl; + std::cerr << "cover: "; + for (unsigned i = 0; i < s_cover.size(); i++) + std::cerr << s_cover[i] << ","; + std::cerr << std::endl; + std::cerr << "score: " << score << std::endl; + std::cerr << "...done" << std::endl << std::flush; + + if (score >= fuzzy) { Match m(sentence_wids, s_length); m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score; @@ -851,7 +866,8 @@ namespace fuzzy if (shrinking_factor < 1.f) // submodular coverage { - std::vector cover_weights(count_terms.size(), 1.f); + const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size(); + std::vector cover_weights(cover_length, 1.f); std::list candidates; while (!result.empty()) { @@ -864,6 +880,7 @@ namespace fuzzy }; while (!candidates.empty() && (number_of_matches == 0 || matches.size() < number_of_matches)) { + // TODO: not compute it the first iteration // rescore penalties of candidates for (Match &match : candidates) { diff --git a/src/submodular.cc b/src/submodular.cc index fcaf2a8..626453c 100644 --- a/src/submodular.cc +++ b/src/submodular.cc @@ -61,51 +61,10 @@ namespace fuzzy std::vector& cover, std::vector& idf_penalty) { - std::cerr << "avant " << sentence_length << std::endl; std::vector all_ngrams = get_sorted_ngrams(N, sentence, sentence_length); - std::cerr << "apres" << std::endl; - // all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2); - // for (unsigned n = 1; n <= N; n++) - // for (unsigned i = 0; i < sentence_length - n + 1; i++) - // all_ngrams.push_back(NGram(sentence + i, n)); - - // std::sort(all_ngrams.begin(), all_ngrams.end()); - - for (const NGram& ngram : all_ngrams) - ngram.print(); - std::cerr << std::endl; - get_score(sorted_pattern_terms, all_ngrams, count_terms, score, cover, idf_penalty); - std::cerr << "xxx" << std::endl; } - // void get_bow_score( - // std::vector& sorted_pattern_terms, - // std::vector& count_terms, - // const unsigned* sentence, - // const unsigned sentence_length, - // float& score, - // std::vector& cover) - // { - // std::vector sorted_sentence_terms(sentence, sentence + sentence_length); - // std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end()); - // cover = std::vector(sorted_pattern_terms.size(), 0.f); - // score = 0.f; - // for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) - // { - // while ( - // (i < sorted_pattern_terms.size()) && - // (sorted_pattern_terms[i] < sorted_sentence_terms[j])) - // i++; - // if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) - // if ((float)count_terms[i] > cover[i] + 1e-6f) - // { - // cover[i] += 1.f; - // score += 1.f; - // } - // } - // } - void get_all_ngrams( const unsigned* sequence, const unsigned length, @@ -114,33 +73,6 @@ namespace fuzzy std::vector& counts) { std::vector all_ngrams = get_sorted_ngrams(N, sequence, length); - // std::vector all_ngrams; - // all_ngrams.reserve(N * length - N * (N - 1) / 2); - // for (unsigned n = 1; n <= N; n++) - // for (unsigned i = 0; i < length - n + 1; i++) - // all_ngrams.push_back(NGram(sequence + i, n)); - // // std::cerr << std::endl; - // std::sort(all_ngrams.begin(), all_ngrams.end()); - for (const NGram& ngram : all_ngrams) - ngram.print(); - std::cerr << std::endl; - get_unique_with_count(all_ngrams, ngrams, counts); - // std::cerr << "ALL" << std::endl; - // for (unsigned i = 0; i < ngrams.size(); i++) - // { - // ngrams[i].print(); - // std::cerr << counts[i] << std::endl; - // } - // std::cerr << std::endl; - // std::cerr << "ONLY > 1" << std::endl; - // for (unsigned i = 0; i < ngrams.size(); i++) - // if (counts[i] > 1) - // { - // ngrams[i].print(); - // std::cerr << counts[i] << std::endl; - // } - // std::cerr << std::endl; - } } \ No newline at end of file From 7a2c13f8ff4890f5810cf09261679e8170d1fc8f Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Mon, 15 Jan 2024 17:47:18 +0100 Subject: [PATCH 12/15] working but unclean --- cli/src/FuzzyMatch-cli.cc | 2 +- include/fuzzy/fuzzy_match.hh | 1 + include/fuzzy/submodular.hxx | 2 +- src/edit_distance.cc | 2 +- src/fuzzy_match.cc | 245 ++++++++++++++++++----------------- src/submodular.cc | 6 + 6 files changed, 136 insertions(+), 122 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 9dd1b23..97d257b 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -466,7 +466,7 @@ int main(int argc, char** argv) else submodular_function = fuzzy::SubmodularFunction::NO; fuzzy::SubmodularNormalization submodular_normalization; - std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl; + // std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl; if (submodular_normalization_str == "BM25") submodular_normalization = fuzzy::SubmodularNormalization::BM25; else if (submodular_normalization_str == "IDF") diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index 3d4560f..bfca226 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -8,6 +8,7 @@ #include #include #include +#include namespace onmt { class Tokenizer; diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx index 0353158..858217e 100644 --- a/include/fuzzy/submodular.hxx +++ b/include/fuzzy/submodular.hxx @@ -29,7 +29,7 @@ namespace fuzzy cover = std::vector(sorted_pattern_terms.size(), 0.f); score = 0.f; for ( - unsigned i, j, k = 0; + unsigned i = 0, j = 0, k = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) { diff --git a/src/edit_distance.cc b/src/edit_distance.cc index 30c3976..170b4d1 100644 --- a/src/edit_distance.cc +++ b/src/edit_distance.cc @@ -220,7 +220,7 @@ namespace fuzzy case 3: i--; j--; - std::cerr << "idf cover " << j << " " << idf_penalty.size() << std::endl << std::flush; + // std::cerr << "idf cover " << j << " " << idf_penalty.size() << std::endl << std::flush; cover[j] = idf_cover ? idf_penalty[j] : 1.f; break; default: diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index f1d394d..09d80ce 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -657,6 +657,13 @@ namespace fuzzy std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms); + // std::cerr << "### " + // << sorted_pattern_wids.size() + // << ", " + // << sorted_pattern_terms.size() + // << ", " + // << count_terms.size() + // << std::endl; if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); @@ -687,14 +694,14 @@ namespace fuzzy ; } - std::cerr << "sorted unique terms" << ": "; - for (const auto& c : sorted_pattern_terms) - std::cerr << c << ", "; - std::cerr << std::endl; - std::cerr << "Idf" << ": "; - for (const auto& c : sorted_pattern_terms_idf) - std::cerr << c << ", "; - std::cerr << std::endl; + // std::cerr << "sorted unique terms" << ": "; + // for (const auto& c : sorted_pattern_terms) + // std::cerr << c << ", "; + // std::cerr << std::endl; + // std::cerr << "Idf" << ": "; + // for (const auto& c : sorted_pattern_terms_idf) + // std::cerr << c << ", "; + // std::cerr << std::endl; ///////////// for (const auto& pair : best_matches) @@ -713,7 +720,7 @@ namespace fuzzy if (submod_norm == SubmodularNormalization::BM25) { - std::cerr << "BM25 norm..." << std::endl << std::flush; + // std::cerr << "BM25 norm..." << std::endl << std::flush; score = (float)score_filter / 1000.f; assert((filter_type == IndexType::BM25)); BM25Matches& bm25Matches = static_cast(*filter_matches); @@ -721,106 +728,104 @@ namespace fuzzy } else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO) { - std::cerr - << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO") - << " norm..." << std::endl << std::flush; - if (submod_fun == SubmodularFunction::BOW) - get_bow_score( - sorted_pattern_terms, - count_terms, - sentence_wids, - s_length, - score, - s_cover, - sorted_pattern_terms_idf); - else if (submod_fun == SubmodularFunction::NGRAM) - get_ngram_score( - sorted_pattern_ngrams, - 4, - count_terms, - sentence_wids, - s_length, - score, - s_cover, - sorted_pattern_terms_idf); - else if (submod_fun == SubmodularFunction::ED) - { - const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); - /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ - if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) - { - const Costs costs(p_length, s_length, edit_costs); - /* let us check the candidates */ - const auto sentence_realtok = _filterIndex->real_tokens(s_id); - const auto cost_upper_bound = lowest_costs.top(); - s_cover = std::vector(p_length, 0.f); - if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF) - idf_penalty = compute_idf_penalty(pattern_wids); - float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length, - pattern_wids.data(), pattern_realtok, p_length, - st, sn, - idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - edit_costs, - costs, - s_cover, - submod_norm == SubmodularNormalization::IDF, - cost_upper_bound); - if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) - continue; - float score = int(10000 - cost * 100) / 10000.0; - - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); - } - else - { - continue; - } - } - else + // std::cerr + // << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO") + // << " norm..." << std::endl << std::flush; + + switch(submod_fun) { - const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); - /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ - if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) - { - const Costs costs(p_length, s_length, edit_costs); - /* let us check the candidates */ - const auto sentence_realtok = _filterIndex->real_tokens(s_id); - const auto cost_upper_bound = lowest_costs.top(); - float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, - pattern_wids.data(), pattern_realtok, p_length, - st, sn, - idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, - edit_costs, - costs, cost_upper_bound); - // float cost = 0.1; - if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + case SubmodularFunction::BOW: + get_bow_score( + sorted_pattern_terms, + count_terms, + sentence_wids, + s_length, + score, + s_cover, + sorted_pattern_terms_idf); + score /= (float)std::accumulate(count_terms.begin(), count_terms.end(), 0); + break; + case SubmodularFunction::NGRAM: + get_ngram_score( + sorted_pattern_ngrams, + 4, + count_terms, + sentence_wids, + s_length, + score, + s_cover, + sorted_pattern_terms_idf); + score /= (float)std::accumulate(count_terms.begin(), count_terms.end(), 0); + break; + case SubmodularFunction::ED: + { + const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); + /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + { + const Costs costs(p_length, s_length, edit_costs); + /* let us check the candidates */ + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + const auto cost_upper_bound = lowest_costs.top(); + s_cover = std::vector(p_length, 0.f); + if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF) + idf_penalty = compute_idf_penalty(pattern_wids); + float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length, + pattern_wids.data(), pattern_realtok, p_length, + st, sn, + idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + edit_costs, + costs, + s_cover, + submod_norm == SubmodularNormalization::IDF, + cost_upper_bound); + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + score = int(10000 - cost * 100) / 10000.0; + + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); + } + else + { continue; - score = int(10000 - cost * 100) / 10000.0; - - lowest_costs.push(cost); - if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) - lowest_costs.pop(); + } + break; } - else + default: { - continue; + const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length); + /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */ + if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs)) + { + const Costs costs(p_length, s_length, edit_costs); + /* let us check the candidates */ + const auto sentence_realtok = _filterIndex->real_tokens(s_id); + const auto cost_upper_bound = lowest_costs.top(); + float cost = _edit_distance(sentence_wids, sentence_realtok, s_length, + pattern_wids.data(), pattern_realtok, p_length, + st, sn, + idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max, + edit_costs, + costs, cost_upper_bound); + // float cost = 0.1; + if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound) + continue; + score = int(10000 - cost * 100) / 10000.0; + + lowest_costs.push(cost); + if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer)) + lowest_costs.pop(); + } + else + { + continue; + } } } } - // switch(submod_fun) - // { - // case SubmodularFunction::BOW: - // case SubmodularFunction::NGRAM: - // break; - // case SubmodularFunction::ED: - - // default: - - // } - // std::cerr << "q: "; // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) // std::cerr << sorted_pattern_terms[i] << ","; @@ -829,21 +834,23 @@ namespace fuzzy // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) // std::cerr << count_terms[i] << ","; // std::cerr << std::endl; - std::cerr << "sent: "; - for (unsigned i = 0; i < s_length; i++) - std::cerr << sentence_wids[i] << ","; - std::cerr << std::endl; - std::cerr << "cover: "; - for (unsigned i = 0; i < s_cover.size(); i++) - std::cerr << s_cover[i] << ","; - std::cerr << std::endl; - std::cerr << "score: " << score << std::endl; - std::cerr << "...done" << std::endl << std::flush; + // std::cerr << "sent: "; + // for (unsigned i = 0; i < s_length; i++) + // std::cerr << sentence_wids[i] << ","; + // std::cerr << std::endl; + // std::cerr << "cover: "; + // for (unsigned i = 0; i < s_cover.size(); i++) + // std::cerr << s_cover[i] << ","; + // std::cerr << std::endl; + // std::cerr << "score: " << score << std::endl; + // std::cerr << "...done" << std::endl << std::flush; if (score >= fuzzy) { Match m(sentence_wids, s_length); - m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score; + + // m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score; + m.score = score; m.max_subseq = (filter_type == IndexType::BM25) ? 0 : score_filter; m.s_id = s_id; m.id = _filterIndex->id(s_id); @@ -864,9 +871,10 @@ namespace fuzzy // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; - if (shrinking_factor < 1.f) // submodular coverage + if (submod_fun != SubmodularFunction::NO && shrinking_factor < 1.f) // submodular coverage { const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size(); + // std::cerr << ">> " << cover_length << std::endl; std::vector cover_weights(cover_length, 1.f); std::list candidates; while (!result.empty()) @@ -885,20 +893,19 @@ namespace fuzzy for (Match &match : candidates) { float rescore = 0.f; - std::cerr << "rescore " << match.s_id << " : ("; + // std::cerr << "rescore " << match.s_id << " : ("; for (unsigned i = 0; i < cover_weights.size(); i++) { rescore += cover_weights[i] * match.cover[i]; - if (match.cover[i] != 0) - std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; + // if (match.cover[i] != 0) + // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; } - - std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; + // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; match.penalty = rescore; } auto it_max = std::max_element(candidates.begin(), candidates.end(), comp); matches.push_back(*it_max); - std::cerr << "choose No " << it_max->s_id << std::endl; + // std::cerr << "choose No " << it_max->s_id << std::endl; // update cover_weights for (unsigned i = 0; i < cover_weights.size(); i++) if (it_max->cover[i] > 0) diff --git a/src/submodular.cc b/src/submodular.cc index 626453c..a5b89ae 100644 --- a/src/submodular.cc +++ b/src/submodular.cc @@ -37,6 +37,12 @@ namespace fuzzy std::cerr << '\t'; } + std::ostream& operator<<(std::ostream &s, const NGram &ngram) { + for (unsigned i = 0; i < ngram._N; i++) + s << ngram._start[i] << ","; + return s << '\t'; + } + void get_bow_score( std::vector& sorted_pattern_terms, std::vector& count_terms, From e22bee158ca6c38bf4c6caaa78dde5f54682dfad Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Sun, 21 Jan 2024 00:52:51 +0100 Subject: [PATCH 13/15] cerr everywhere --- cli/src/FuzzyMatch-cli.cc | 3 +++ include/fuzzy/fuzzy_match.hh | 3 ++- src/bm25_matches.cc | 4 ++++ src/fuzzy_match.cc | 31 +++++++++++++++++++++++++------ 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 97d257b..8a18e34 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -125,12 +125,15 @@ std::pair process_stream(const Function& function, if (num_threads <= 1) // Fast path for sequential processing. { while (std::getline(in, line)) { + std::cerr << "#" << std::flush; std::string res = function(line); + std::cerr << "+" << std::flush; if (!res.empty()) count_nonempty++; out << res << std::endl; // if (count_nonempty % 100 == 0) // std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; + std::cerr << "+" << std::endl << std::flush; } // std::cerr << std::endl; return std::make_pair(count_nonempty, count_total); diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index bfca226..08debfc 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -42,13 +42,14 @@ namespace fuzzy int length ) : length(length), s(seq) {} Match() {} + ~Match() {} float score; float secondary_sort; float penalty; int max_subseq; unsigned s_id; std::string id; - std::vector cover; + float* cover; int length; const unsigned* s; }; diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc index 3a2a0c0..9d55492 100644 --- a/src/bm25_matches.cc +++ b/src/bm25_matches.cc @@ -24,12 +24,15 @@ namespace fuzzy std::priority_queue, std::vector>, ComparePairs> k_best; + std::cerr << "1" << std::flush; Eigen::SparseVector pattern_sparse_vec(bm25.get_vocab_size()); for (const unsigned& wid : pattern_wids) pattern_sparse_vec.coeffRef(wid) += 1.0; + std::cerr << "2" << std::flush; Eigen::SparseVector scores = bm25.compute_product(pattern_sparse_vec); + std::cerr << "3" << std::flush; for (Eigen::SparseVector::InnerIterator it(scores); it; ++it) { int s_id = it.index(); float bm25_score = it.value(); @@ -41,6 +44,7 @@ namespace fuzzy } } + std::cerr << "4" << std::flush; _best_matches.reserve(k_best.size()); while (!k_best.empty()) { diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 09d80ce..f507c76 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -505,6 +505,8 @@ namespace fuzzy if (!p_length) return false; + std::cerr << "[" << std::flush; + if ((std::size_t)(min_subseq_length) > pattern.size()) min_subseq_length = pattern.size(); @@ -527,6 +529,8 @@ namespace fuzzy // FilterMatches* filter_matches = nullptr; // std::unique_ptr filter_matches; std::shared_ptr filter_matches; + + std::cerr << "$" << std::flush; if (filter_type == IndexType::SUFFIX) { const SuffixArray& suffix_array = static_cast(filter); // filter_matches = new NGramMatches(fuzzy, p_length, min_subseq_length, suffix_array); @@ -611,6 +615,7 @@ namespace fuzzy filter_matches = std::make_shared(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); // filter_matches = new BM25Matches(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); BM25Matches& bm25Matches = static_cast(*filter_matches); + std::cerr << "!" << std::flush; bm25Matches.register_pattern(pattern_wids, edit_costs); } #endif @@ -622,7 +627,7 @@ namespace fuzzy no_matches.load_all(); } /* Consolidation of the results */ - + std::cerr << "~" << std::flush; /* now explore for the best segments */ PatternCoverage pattern_coverage(pattern_wids); @@ -647,6 +652,8 @@ namespace fuzzy std::vector norm_weight; std::vector sorted_pattern_terms_idf; + std::cerr << "|" << std::flush; + /* Salient aspects enumeration */ switch(submod_fun) { @@ -706,6 +713,7 @@ namespace fuzzy for (const auto& pair : best_matches) { + // std::cerr << "-" << std::flush; // num_filtered++; const auto s_id = pair.first; const auto score_filter = pair.second; @@ -856,7 +864,10 @@ namespace fuzzy m.id = _filterIndex->id(s_id); m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; m.penalty = 0; - m.cover = s_cover; + // m.cover = s_cover; + // m.cover = std::vector(s_cover); + // m.cover = std::vector(s_cover.size()); + // std::copy(s_cover.begin(), s_cover.end(), m.cover.begin()); result.push(m); // std::cerr << m.s_id << ": "; // for (const auto& c : m.cover) @@ -868,6 +879,7 @@ namespace fuzzy } } // COUT filter + std::cerr << "]" << std::flush; // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; @@ -896,7 +908,9 @@ namespace fuzzy // std::cerr << "rescore " << match.s_id << " : ("; for (unsigned i = 0; i < cover_weights.size(); i++) { - rescore += cover_weights[i] * match.cover[i]; + ///////////////////////////////// TODO: uncomment + // rescore += cover_weights[i] * match.cover[i]; + rescore += cover_weights[i]; // if (match.cover[i] != 0) // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; } @@ -907,9 +921,10 @@ namespace fuzzy matches.push_back(*it_max); // std::cerr << "choose No " << it_max->s_id << std::endl; // update cover_weights - for (unsigned i = 0; i < cover_weights.size(); i++) - if (it_max->cover[i] > 0) - cover_weights[i] *= shrinking_factor; + ///////////////////////////////// TODO: uncomment + // for (unsigned i = 0; i < cover_weights.size(); i++) + // if (it_max->cover[i] > 0) + // cover_weights[i] *= shrinking_factor; candidates.erase(it_max); if (shrinking_factor < 1e-20f) { @@ -987,6 +1002,7 @@ namespace fuzzy result.pop(); } } + std::cerr << "|" << std::flush; // std::cerr << "final matches " << " : "; // for (unsigned i = 0; i < matches.size(); i++) @@ -999,6 +1015,9 @@ namespace fuzzy // std::cerr << matches[i].id; // } // std::cerr << std::endl; + + //// Attempts to free memory which is corrupted + //// Probably from vector return matches.size() > 0; } } From 30f56d22a10fab1a9c101775eed4a97e5c58d8d6 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Sun, 21 Jan 2024 17:50:00 +0100 Subject: [PATCH 14/15] bug fixed --- cli/src/FuzzyMatch-cli.cc | 5 -- include/fuzzy/fuzzy_match.hh | 3 +- include/fuzzy/submodular.hxx | 10 +++- src/bm25_matches.cc | 4 -- src/fuzzy_match.cc | 95 +++++------------------------------- 5 files changed, 20 insertions(+), 97 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 8a18e34..94c54dc 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -125,15 +125,12 @@ std::pair process_stream(const Function& function, if (num_threads <= 1) // Fast path for sequential processing. { while (std::getline(in, line)) { - std::cerr << "#" << std::flush; std::string res = function(line); - std::cerr << "+" << std::flush; if (!res.empty()) count_nonempty++; out << res << std::endl; // if (count_nonempty % 100 == 0) // std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; - std::cerr << "+" << std::endl << std::flush; } // std::cerr << std::endl; return std::make_pair(count_nonempty, count_total); @@ -190,8 +187,6 @@ std::pair process_stream(const Function& function, if (!futures.empty()) pop_results(/*blocking=*/true); - // std::cerr << std::endl; - { std::lock_guard lock(mutex); request_end = true; diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index 08debfc..bfca226 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -42,14 +42,13 @@ namespace fuzzy int length ) : length(length), s(seq) {} Match() {} - ~Match() {} float score; float secondary_sort; float penalty; int max_subseq; unsigned s_id; std::string id; - float* cover; + std::vector cover; int length; const unsigned* s; }; diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx index 858217e..c31c8cc 100644 --- a/include/fuzzy/submodular.hxx +++ b/include/fuzzy/submodular.hxx @@ -18,6 +18,7 @@ namespace fuzzy } template + inline void get_score( std::vector& sorted_pattern_terms, std::vector& sorted_sentence_terms, @@ -27,16 +28,20 @@ namespace fuzzy std::vector& idf_penalty) { cover = std::vector(sorted_pattern_terms.size(), 0.f); + // std::cerr << sorted_pattern_terms.size() << "|" + // << sorted_sentence_terms.size() << ">" + // << std::flush; score = 0.f; for ( - unsigned i = 0, j = 0, k = 0; + unsigned i = 0, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) { while ( - (i < sorted_pattern_terms.size()) && + (i < sorted_pattern_terms.size() - 1) && (sorted_pattern_terms[i] < sorted_sentence_terms[j])) i++; + if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) if (idf_penalty.size() > 0) { @@ -55,6 +60,7 @@ namespace fuzzy } template + inline void get_unique_with_count( std::vector& sorted_salient, std::vector& unique, diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc index 9d55492..3a2a0c0 100644 --- a/src/bm25_matches.cc +++ b/src/bm25_matches.cc @@ -24,15 +24,12 @@ namespace fuzzy std::priority_queue, std::vector>, ComparePairs> k_best; - std::cerr << "1" << std::flush; Eigen::SparseVector pattern_sparse_vec(bm25.get_vocab_size()); for (const unsigned& wid : pattern_wids) pattern_sparse_vec.coeffRef(wid) += 1.0; - std::cerr << "2" << std::flush; Eigen::SparseVector scores = bm25.compute_product(pattern_sparse_vec); - std::cerr << "3" << std::flush; for (Eigen::SparseVector::InnerIterator it(scores); it; ++it) { int s_id = it.index(); float bm25_score = it.value(); @@ -44,7 +41,6 @@ namespace fuzzy } } - std::cerr << "4" << std::flush; _best_matches.reserve(k_best.size()); while (!k_best.empty()) { diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index f507c76..7398291 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -505,8 +505,6 @@ namespace fuzzy if (!p_length) return false; - std::cerr << "[" << std::flush; - if ((std::size_t)(min_subseq_length) > pattern.size()) min_subseq_length = pattern.size(); @@ -530,7 +528,6 @@ namespace fuzzy // std::unique_ptr filter_matches; std::shared_ptr filter_matches; - std::cerr << "$" << std::flush; if (filter_type == IndexType::SUFFIX) { const SuffixArray& suffix_array = static_cast(filter); // filter_matches = new NGramMatches(fuzzy, p_length, min_subseq_length, suffix_array); @@ -615,7 +612,6 @@ namespace fuzzy filter_matches = std::make_shared(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); // filter_matches = new BM25Matches(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); BM25Matches& bm25Matches = static_cast(*filter_matches); - std::cerr << "!" << std::flush; bm25Matches.register_pattern(pattern_wids, edit_costs); } #endif @@ -627,7 +623,6 @@ namespace fuzzy no_matches.load_all(); } /* Consolidation of the results */ - std::cerr << "~" << std::flush; /* now explore for the best segments */ PatternCoverage pattern_coverage(pattern_wids); @@ -652,25 +647,17 @@ namespace fuzzy std::vector norm_weight; std::vector sorted_pattern_terms_idf; - std::cerr << "|" << std::flush; - /* Salient aspects enumeration */ switch(submod_fun) { case SubmodularFunction::BOW: { // all terms bow - std::vector sorted_pattern_wids(pattern_wids); + std::vector sorted_pattern_wids = pattern_wids; std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms); - // std::cerr << "### " - // << sorted_pattern_wids.size() - // << ", " - // << sorted_pattern_terms.size() - // << ", " - // << count_terms.size() - // << std::endl; + if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); @@ -692,7 +679,8 @@ namespace fuzzy case SubmodularFunction::ED: { // sentence indices - sorted_pattern_terms = std::vector(pattern_wids); + // sorted_pattern_terms = std::vector(pattern_wids); + sorted_pattern_terms = pattern_wids; if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); break; @@ -701,19 +689,8 @@ namespace fuzzy ; } - // std::cerr << "sorted unique terms" << ": "; - // for (const auto& c : sorted_pattern_terms) - // std::cerr << c << ", "; - // std::cerr << std::endl; - // std::cerr << "Idf" << ": "; - // for (const auto& c : sorted_pattern_terms_idf) - // std::cerr << c << ", "; - // std::cerr << std::endl; - ///////////// - for (const auto& pair : best_matches) { - // std::cerr << "-" << std::flush; // num_filtered++; const auto s_id = pair.first; const auto score_filter = pair.second; @@ -728,7 +705,6 @@ namespace fuzzy if (submod_norm == SubmodularNormalization::BM25) { - // std::cerr << "BM25 norm..." << std::endl << std::flush; score = (float)score_filter / 1000.f; assert((filter_type == IndexType::BM25)); BM25Matches& bm25Matches = static_cast(*filter_matches); @@ -834,25 +810,6 @@ namespace fuzzy } } - // std::cerr << "q: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << sorted_pattern_terms[i] << ","; - // std::cerr << std::endl; - // std::cerr << "q count: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << count_terms[i] << ","; - // std::cerr << std::endl; - // std::cerr << "sent: "; - // for (unsigned i = 0; i < s_length; i++) - // std::cerr << sentence_wids[i] << ","; - // std::cerr << std::endl; - // std::cerr << "cover: "; - // for (unsigned i = 0; i < s_cover.size(); i++) - // std::cerr << s_cover[i] << ","; - // std::cerr << std::endl; - // std::cerr << "score: " << score << std::endl; - // std::cerr << "...done" << std::endl << std::flush; - if (score >= fuzzy) { Match m(sentence_wids, s_length); @@ -864,29 +821,20 @@ namespace fuzzy m.id = _filterIndex->id(s_id); m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; m.penalty = 0; - // m.cover = s_cover; - // m.cover = std::vector(s_cover); - // m.cover = std::vector(s_cover.size()); - // std::copy(s_cover.begin(), s_cover.end(), m.cover.begin()); + m.cover = s_cover; result.push(m); - // std::cerr << m.s_id << ": "; - // for (const auto& c : m.cover) - // std::cerr << c << ", "; - // std::cerr << std::endl; cpt++; if (cpt > contrast_buffer) break; } } // COUT filter - std::cerr << "]" << std::flush; // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; if (submod_fun != SubmodularFunction::NO && shrinking_factor < 1.f) // submodular coverage { const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size(); - // std::cerr << ">> " << cover_length << std::endl; std::vector cover_weights(cover_length, 1.f); std::list candidates; while (!result.empty()) @@ -908,23 +856,17 @@ namespace fuzzy // std::cerr << "rescore " << match.s_id << " : ("; for (unsigned i = 0; i < cover_weights.size(); i++) { - ///////////////////////////////// TODO: uncomment - // rescore += cover_weights[i] * match.cover[i]; - rescore += cover_weights[i]; - // if (match.cover[i] != 0) - // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; + rescore += cover_weights[i] * match.cover[i]; + // rescore += cover_weights[i]; } - // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; match.penalty = rescore; } auto it_max = std::max_element(candidates.begin(), candidates.end(), comp); matches.push_back(*it_max); - // std::cerr << "choose No " << it_max->s_id << std::endl; // update cover_weights - ///////////////////////////////// TODO: uncomment - // for (unsigned i = 0; i < cover_weights.size(); i++) - // if (it_max->cover[i] > 0) - // cover_weights[i] *= shrinking_factor; + for (unsigned i = 0; i < cover_weights.size(); i++) + if (it_max->cover[i] > 0) + cover_weights[i] *= shrinking_factor; candidates.erase(it_max); if (shrinking_factor < 1e-20f) { @@ -1002,22 +944,7 @@ namespace fuzzy result.pop(); } } - std::cerr << "|" << std::flush; - - // std::cerr << "final matches " << " : "; - // for (unsigned i = 0; i < matches.size(); i++) - // { - // std::cerr << std::endl << " "; - // std::cerr << matches[i].s_id << ": "; - // for (int j = 0; j < matches[i].length; j++) - // std::cerr << matches[i].s[j] << " "; - // std::cerr << std::endl; - // std::cerr << matches[i].id; - // } - // std::cerr << std::endl; - - //// Attempts to free memory which is corrupted - //// Probably from vector + return matches.size() > 0; } } From 0b6187f61430bb4eec9fa58a7a5b7f72ca6a0816 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Mon, 14 Oct 2024 16:09:38 +0200 Subject: [PATCH 15/15] fixed problem of best match buffer --- src/CMakeLists.txt | 2 +- src/filter.cc | 2 +- src/fuzzy_match.cc | 25 ++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b7a96fa..b80565a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,7 +55,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC target_link_libraries(${PROJECT_NAME} ${OPENNMT_TOKENIZER_LIB} - ${ICU_LIBRARIES} + # ${ICU_LIBRARIES} ${Boost_LIBRARIES} Threads::Threads ) diff --git a/src/filter.cc b/src/filter.cc index 94f89c2..e2a5e6d 100644 --- a/src/filter.cc +++ b/src/filter.cc @@ -10,7 +10,7 @@ namespace fuzzy Filter::add_sentence(const std::vector& sentence) { size_t sidx = _sentence_pos.size(); - std::cerr << sidx << std::endl; + // std::cerr << sidx << std::endl; _sentence_pos.push_back(_sentence_buffer.size()); /* first token in sentence buffer is the sentence size */ diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 7398291..b3fc964 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -41,6 +41,16 @@ namespace fuzzy } }; + class CompareMatchInverse + { + public: + bool operator()(const FuzzyMatch::Match &x, const FuzzyMatch::Match &y) + { + return x.score > y.score || + (x.score == y.score && x.secondary_sort < y.secondary_sort); + } + }; + static std::string normalize(const std::string& text_utf8) { UErrorCode error_code = U_ZERO_ERROR; const auto* normalizer = icu::Normalizer2::getNFCInstance(error_code); @@ -522,6 +532,7 @@ namespace fuzzy /* result map - normalized error => sentence */ std::priority_queue, CompareMatch> result; + std::priority_queue, CompareMatchInverse> result_best; const Filter& filter = _filterIndex->get_Filter(); // FilterMatches* filter_matches = nullptr; @@ -822,12 +833,20 @@ namespace fuzzy m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; m.penalty = 0; m.cover = s_cover; - result.push(m); + // result.push(m); + result_best.push(m); + if (contrast_buffer > 0 && (int)result_best.size() > contrast_buffer) + result_best.pop(); cpt++; - if (cpt > contrast_buffer) - break; + // if (cpt > contrast_buffer) + // break } } + while (result_best.size() > 0) + { + result.push(result_best.top()); + result_best.pop(); + } // COUT filter // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl;