From 30f56d22a10fab1a9c101775eed4a97e5c58d8d6 Mon Sep 17 00:00:00 2001 From: Maxwell1447 Date: Sun, 21 Jan 2024 17:50:00 +0100 Subject: [PATCH] bug fixed --- cli/src/FuzzyMatch-cli.cc | 5 -- include/fuzzy/fuzzy_match.hh | 3 +- include/fuzzy/submodular.hxx | 10 +++- src/bm25_matches.cc | 4 -- src/fuzzy_match.cc | 95 +++++------------------------------- 5 files changed, 20 insertions(+), 97 deletions(-) diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc index 8a18e34..94c54dc 100644 --- a/cli/src/FuzzyMatch-cli.cc +++ b/cli/src/FuzzyMatch-cli.cc @@ -125,15 +125,12 @@ std::pair process_stream(const Function& function, if (num_threads <= 1) // Fast path for sequential processing. { while (std::getline(in, line)) { - std::cerr << "#" << std::flush; std::string res = function(line); - std::cerr << "+" << std::flush; if (!res.empty()) count_nonempty++; out << res << std::endl; // if (count_nonempty % 100 == 0) // std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush; - std::cerr << "+" << std::endl << std::flush; } // std::cerr << std::endl; return std::make_pair(count_nonempty, count_total); @@ -190,8 +187,6 @@ std::pair process_stream(const Function& function, if (!futures.empty()) pop_results(/*blocking=*/true); - // std::cerr << std::endl; - { std::lock_guard lock(mutex); request_end = true; diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh index 08debfc..bfca226 100644 --- a/include/fuzzy/fuzzy_match.hh +++ b/include/fuzzy/fuzzy_match.hh @@ -42,14 +42,13 @@ namespace fuzzy int length ) : length(length), s(seq) {} Match() {} - ~Match() {} float score; float secondary_sort; float penalty; int max_subseq; unsigned s_id; std::string id; - float* cover; + std::vector cover; int length; const unsigned* s; }; diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx index 858217e..c31c8cc 100644 --- a/include/fuzzy/submodular.hxx +++ b/include/fuzzy/submodular.hxx @@ -18,6 +18,7 @@ namespace fuzzy } template + inline void get_score( std::vector& sorted_pattern_terms, std::vector& sorted_sentence_terms, @@ -27,16 +28,20 @@ namespace fuzzy std::vector& idf_penalty) { cover = std::vector(sorted_pattern_terms.size(), 0.f); + // std::cerr << sorted_pattern_terms.size() << "|" + // << sorted_sentence_terms.size() << ">" + // << std::flush; score = 0.f; for ( - unsigned i = 0, j = 0, k = 0; + unsigned i = 0, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++) { while ( - (i < sorted_pattern_terms.size()) && + (i < sorted_pattern_terms.size() - 1) && (sorted_pattern_terms[i] < sorted_sentence_terms[j])) i++; + if (sorted_pattern_terms[i] == sorted_sentence_terms[j]) if (idf_penalty.size() > 0) { @@ -55,6 +60,7 @@ namespace fuzzy } template + inline void get_unique_with_count( std::vector& sorted_salient, std::vector& unique, diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc index 9d55492..3a2a0c0 100644 --- a/src/bm25_matches.cc +++ b/src/bm25_matches.cc @@ -24,15 +24,12 @@ namespace fuzzy std::priority_queue, std::vector>, ComparePairs> k_best; - std::cerr << "1" << std::flush; Eigen::SparseVector pattern_sparse_vec(bm25.get_vocab_size()); for (const unsigned& wid : pattern_wids) pattern_sparse_vec.coeffRef(wid) += 1.0; - std::cerr << "2" << std::flush; Eigen::SparseVector scores = bm25.compute_product(pattern_sparse_vec); - std::cerr << "3" << std::flush; for (Eigen::SparseVector::InnerIterator it(scores); it; ++it) { int s_id = it.index(); float bm25_score = it.value(); @@ -44,7 +41,6 @@ namespace fuzzy } } - std::cerr << "4" << std::flush; _best_matches.reserve(k_best.size()); while (!k_best.empty()) { diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index f507c76..7398291 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -505,8 +505,6 @@ namespace fuzzy if (!p_length) return false; - std::cerr << "[" << std::flush; - if ((std::size_t)(min_subseq_length) > pattern.size()) min_subseq_length = pattern.size(); @@ -530,7 +528,6 @@ namespace fuzzy // std::unique_ptr filter_matches; std::shared_ptr filter_matches; - std::cerr << "$" << std::flush; if (filter_type == IndexType::SUFFIX) { const SuffixArray& suffix_array = static_cast(filter); // filter_matches = new NGramMatches(fuzzy, p_length, min_subseq_length, suffix_array); @@ -615,7 +612,6 @@ namespace fuzzy filter_matches = std::make_shared(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); // filter_matches = new BM25Matches(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff); BM25Matches& bm25Matches = static_cast(*filter_matches); - std::cerr << "!" << std::flush; bm25Matches.register_pattern(pattern_wids, edit_costs); } #endif @@ -627,7 +623,6 @@ namespace fuzzy no_matches.load_all(); } /* Consolidation of the results */ - std::cerr << "~" << std::flush; /* now explore for the best segments */ PatternCoverage pattern_coverage(pattern_wids); @@ -652,25 +647,17 @@ namespace fuzzy std::vector norm_weight; std::vector sorted_pattern_terms_idf; - std::cerr << "|" << std::flush; - /* Salient aspects enumeration */ switch(submod_fun) { case SubmodularFunction::BOW: { // all terms bow - std::vector sorted_pattern_wids(pattern_wids); + std::vector sorted_pattern_wids = pattern_wids; std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end()); get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms); - // std::cerr << "### " - // << sorted_pattern_wids.size() - // << ", " - // << sorted_pattern_terms.size() - // << ", " - // << count_terms.size() - // << std::endl; + if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); @@ -692,7 +679,8 @@ namespace fuzzy case SubmodularFunction::ED: { // sentence indices - sorted_pattern_terms = std::vector(pattern_wids); + // sorted_pattern_terms = std::vector(pattern_wids); + sorted_pattern_terms = pattern_wids; if (submod_norm == SubmodularNormalization::IDF) sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms); break; @@ -701,19 +689,8 @@ namespace fuzzy ; } - // std::cerr << "sorted unique terms" << ": "; - // for (const auto& c : sorted_pattern_terms) - // std::cerr << c << ", "; - // std::cerr << std::endl; - // std::cerr << "Idf" << ": "; - // for (const auto& c : sorted_pattern_terms_idf) - // std::cerr << c << ", "; - // std::cerr << std::endl; - ///////////// - for (const auto& pair : best_matches) { - // std::cerr << "-" << std::flush; // num_filtered++; const auto s_id = pair.first; const auto score_filter = pair.second; @@ -728,7 +705,6 @@ namespace fuzzy if (submod_norm == SubmodularNormalization::BM25) { - // std::cerr << "BM25 norm..." << std::endl << std::flush; score = (float)score_filter / 1000.f; assert((filter_type == IndexType::BM25)); BM25Matches& bm25Matches = static_cast(*filter_matches); @@ -834,25 +810,6 @@ namespace fuzzy } } - // std::cerr << "q: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << sorted_pattern_terms[i] << ","; - // std::cerr << std::endl; - // std::cerr << "q count: "; - // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++) - // std::cerr << count_terms[i] << ","; - // std::cerr << std::endl; - // std::cerr << "sent: "; - // for (unsigned i = 0; i < s_length; i++) - // std::cerr << sentence_wids[i] << ","; - // std::cerr << std::endl; - // std::cerr << "cover: "; - // for (unsigned i = 0; i < s_cover.size(); i++) - // std::cerr << s_cover[i] << ","; - // std::cerr << std::endl; - // std::cerr << "score: " << score << std::endl; - // std::cerr << "...done" << std::endl << std::flush; - if (score >= fuzzy) { Match m(sentence_wids, s_length); @@ -864,29 +821,20 @@ namespace fuzzy m.id = _filterIndex->id(s_id); m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt; m.penalty = 0; - // m.cover = s_cover; - // m.cover = std::vector(s_cover); - // m.cover = std::vector(s_cover.size()); - // std::copy(s_cover.begin(), s_cover.end(), m.cover.begin()); + m.cover = s_cover; result.push(m); - // std::cerr << m.s_id << ": "; - // for (const auto& c : m.cover) - // std::cerr << c << ", "; - // std::cerr << std::endl; cpt++; if (cpt > contrast_buffer) break; } } // COUT filter - std::cerr << "]" << std::flush; // std::cerr << num_filtered << std::endl; // std::cerr << filter_matches->get_best_matches().size() << std::endl; if (submod_fun != SubmodularFunction::NO && shrinking_factor < 1.f) // submodular coverage { const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size(); - // std::cerr << ">> " << cover_length << std::endl; std::vector cover_weights(cover_length, 1.f); std::list candidates; while (!result.empty()) @@ -908,23 +856,17 @@ namespace fuzzy // std::cerr << "rescore " << match.s_id << " : ("; for (unsigned i = 0; i < cover_weights.size(); i++) { - ///////////////////////////////// TODO: uncomment - // rescore += cover_weights[i] * match.cover[i]; - rescore += cover_weights[i]; - // if (match.cover[i] != 0) - // std::cerr << cover_weights[i] << "*" << match.cover[i] << "+"; + rescore += cover_weights[i] * match.cover[i]; + // rescore += cover_weights[i]; } - // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl; match.penalty = rescore; } auto it_max = std::max_element(candidates.begin(), candidates.end(), comp); matches.push_back(*it_max); - // std::cerr << "choose No " << it_max->s_id << std::endl; // update cover_weights - ///////////////////////////////// TODO: uncomment - // for (unsigned i = 0; i < cover_weights.size(); i++) - // if (it_max->cover[i] > 0) - // cover_weights[i] *= shrinking_factor; + for (unsigned i = 0; i < cover_weights.size(); i++) + if (it_max->cover[i] > 0) + cover_weights[i] *= shrinking_factor; candidates.erase(it_max); if (shrinking_factor < 1e-20f) { @@ -1002,22 +944,7 @@ namespace fuzzy result.pop(); } } - std::cerr << "|" << std::flush; - - // std::cerr << "final matches " << " : "; - // for (unsigned i = 0; i < matches.size(); i++) - // { - // std::cerr << std::endl << " "; - // std::cerr << matches[i].s_id << ": "; - // for (int j = 0; j < matches[i].length; j++) - // std::cerr << matches[i].s[j] << " "; - // std::cerr << std::endl; - // std::cerr << matches[i].id; - // } - // std::cerr << std::endl; - - //// Attempts to free memory which is corrupted - //// Probably from vector + return matches.size() > 0; } }