From 343351059558d27a7f22d9f6d7ef798033a052f8 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 11 Oct 2023 16:00:19 +0200
Subject: [PATCH 01/15] Rank matches by BM25 score only; wire up NO filter type

---
 cli/src/FuzzyMatch-cli.cc |  14 ++--
 include/fuzzy/index.hh    |   4 +-
 include/fuzzy/index.hxx   |  21 ++++++
 src/CMakeLists.txt        |   2 +
 src/bm25_matches.cc       |   2 +-
 src/filter.cc             |   1 +
 src/fuzzy_match.cc        | 140 ++++++++++++++++++++++++++------------
 src/index.cc              |   2 +
 8 files changed, 136 insertions(+), 50 deletions(-)
diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 909555d..b472c36 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -129,10 +129,10 @@ std::pair<int, int> process_stream(const Function& function,
       if (!res.empty())
         count_nonempty++;
       out << res << std::endl;
-      if (count_nonempty % 100 == 0)
-        std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
+      // if (count_nonempty % 100 == 0)
+      //   std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
     }
-    std::cerr << std::endl;
+    // std::cerr << std::endl;
     return std::make_pair(count_nonempty, count_total);
   }
 
@@ -163,8 +163,8 @@ std::pair<int, int> process_stream(const Function& function,
         count_nonempty++;
       out << res << std::endl;
       futures.pop();
-      if (count_nonempty % 100 == 0)
-        std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
+      // if (count_nonempty % 100 == 0)
+      //   std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
     }
   };
 
@@ -187,7 +187,7 @@ std::pair<int, int> process_stream(const Function& function,
   if (!futures.empty())
     pop_results(/*blocking=*/true);
   
-  std::cerr << std::endl;
+  // std::cerr << std::endl;
 
   {
     std::lock_guard<std::mutex> lock(mutex);
@@ -437,6 +437,8 @@ int main(int argc, char** argv)
   fuzzy::IndexType filter_type;
   if (filter_type_str == "bm25")
     filter_type = fuzzy::IndexType::BM25;
+  else if (filter_type_str == "no")
+    filter_type = fuzzy::IndexType::NO;
   else
     filter_type = fuzzy::IndexType::SUFFIX;
 #ifdef NO_EIGEN
diff --git a/include/fuzzy/index.hh b/include/fuzzy/index.hh
index 9bad0d6..3df24d6 100644
--- a/include/fuzzy/index.hh
+++ b/include/fuzzy/index.hh
@@ -12,13 +12,14 @@
 #ifdef USE_EIGEN
   #include <fuzzy/bm25.hh>
 #endif
+#include <fuzzy/no_filter.hh>
 #include <fuzzy/vocab_indexer.hh>
 #include <fuzzy/sentence.hh>
 
 namespace fuzzy
 {
   constexpr size_t DEFAULT_MAX_TOKENS_IN_PATTERN = 300; // if you change this value, update README.md
-  enum class IndexType { SUFFIX, BM25 };
+  enum class IndexType { SUFFIX, BM25, NO };
   class FilterIndex
   {
   public:
@@ -62,6 +63,7 @@ namespace fuzzy
 #ifdef USE_EIGEN
     inline std::shared_ptr<Filter> createBM25(const FilterIndexParams &params = FilterIndexParams()) { return std::make_shared<BM25>(params); }
 #endif
+    inline std::shared_ptr<Filter> createNo() { return std::make_shared<NoFilter>(); }
     std::vector<std::string> _ids;
     std::vector<Sentence>    _real_tokens;
     size_t _max_tokens_in_pattern;
diff --git a/include/fuzzy/index.hxx b/include/fuzzy/index.hxx
index 0156857..ef533d8 100644
--- a/include/fuzzy/index.hxx
+++ b/include/fuzzy/index.hxx
@@ -64,6 +64,17 @@ namespace fuzzy
         & _max_tokens_in_pattern;
     }
 #endif
+    else if (_type == IndexType::NO)
+    {
+      NoFilter& no_filter = static_cast<NoFilter&>(*_filter);
+        ar
+          & _type
+          & _vocabIndexer
+          & no_filter
+          & _ids
+          & _real_tokens
+          & _max_tokens_in_pattern;
+    }
   }
 
   template<class Archive>
@@ -94,6 +105,16 @@ namespace fuzzy
         & _real_tokens;
     }
 #endif
+    else if (_type == IndexType::NO)
+    {
+      _filter = createNo();
+      NoFilter& no_filter = static_cast<NoFilter&>(*_filter);
+      ar
+        & _vocabIndexer
+        & no_filter
+        & _ids
+        & _real_tokens;
+    }
     if (version >= 1)
       ar & _max_tokens_in_pattern;
   }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8e140f3..6e6c44c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -12,6 +12,8 @@ set(FUZZY_SOURCES
   pattern_coverage.cc
   filter.cc
   index.cc
+  no_filter.cc
+  no_matches.cc
 )
 if(MSVC)
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc
index 0881bcc..a10fb2a 100644
--- a/src/bm25_matches.cc
+++ b/src/bm25_matches.cc
@@ -44,7 +44,7 @@ namespace fuzzy
     _best_matches.reserve(k_best.size());
     while (!k_best.empty())
     {
-      _best_matches.push_back({k_best.top().second, 0});
+      _best_matches.push_back({k_best.top().second, (int)(k_best.top().first * 1000)});
       k_best.pop();
     }
     std::reverse(_best_matches.begin(), _best_matches.end()); 
diff --git a/src/filter.cc b/src/filter.cc
index 39537c8..94f89c2 100644
--- a/src/filter.cc
+++ b/src/filter.cc
@@ -10,6 +10,7 @@ namespace fuzzy
   Filter::add_sentence(const std::vector<unsigned>& sentence)
   {
     size_t sidx = _sentence_pos.size();
+    std::cerr << sidx << std::endl;
     _sentence_pos.push_back(_sentence_buffer.size());
 
     /* first token in sentence buffer is the sentence size */
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 9640250..83629ad 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -14,6 +14,8 @@
   #include <fuzzy/bm25.hh>
   #include <fuzzy/bm25_matches.hh>
 #endif
+#include <fuzzy/no_filter.hh>
+#include <fuzzy/no_matches.hh>
 #include <fuzzy/costs.hh>
 #include <fuzzy/ngram_matches.hh>
 #include <fuzzy/edit_distance.hh>
@@ -540,7 +542,7 @@ namespace fuzzy
           if (range_suffixid.first != range_suffixid.second)
           {
             /* do not register unigrams - yet */
-            if (subseq_length > 2)
+            if (subseq_length >= 2)
             {
               /* register (n-1) grams */
               nGramMatches->register_suffix_range_match(previous_range_suffixid.first,
@@ -561,13 +563,12 @@ namespace fuzzy
             break;
           }
         }
-        if (subseq_length >= 2)
+        if (subseq_length >= 1)
           nGramMatches->register_suffix_range_match(previous_range_suffixid.first,
                                                   previous_range_suffixid.second,
                                                   subseq_length,
                                                   edit_costs);
       }
-      // filter_matches = &nGramMatches;
     }
 #ifdef USE_EIGEN
     else if (filter_type == IndexType::BM25)
@@ -579,6 +580,13 @@ namespace fuzzy
       bm25Matches.register_pattern(pattern_wids, edit_costs);
     }
 #endif
+    else if (filter_type == IndexType::NO)
+    {
+      const NoFilter& no_filter = static_cast<const NoFilter&>(filter);
+      filter_matches = std::make_shared<NoMatches>(fuzzy, p_length, min_subseq_length, no_filter);
+      NoMatches& no_matches = static_cast<NoMatches&>(*filter_matches);
+      no_matches.load_all();
+    }
     /* Consolidation of the results */
 
     /* now explore for the best segments */
@@ -597,51 +605,99 @@ namespace fuzzy
     lowest_costs.push(std::numeric_limits<float>::max());
 
     unsigned cpt = 0;
+    // unsigned num_filtered = 0;
+
+    // ONLY N-grams
+    // for (const auto& pair : filter_matches->get_best_matches())
+    // {
+    //   const auto s_id = pair.first;
+    //   const auto longest_match = pair.second;
+    //   size_t s_length = 0;
+    //   const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
+    //   Match m(sentence_wids, s_length);
+    //   m.score = (float)longest_match / (float)s_length;
+    //   m.max_subseq = longest_match;
+    //   m.s_id = s_id;
+    //   m.id = _filterIndex->id(s_id);
+    //   m.secondary_sort = s_id;
+    //   m.penalty = 0;
+    //   result.push(m);
+    //   cpt++;
+    //   if (cpt > contrast_buffer)
+    //     break;
+    // }
+
+    // ONLY BM25
     for (const auto& pair : filter_matches->get_best_matches())
     {
       const auto s_id = pair.first;
-      const auto longest_match = pair.second;
+      const auto bm25_score = pair.second;
       size_t s_length = 0;
       const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-      const auto num_covered_words = (longest_match < p_length
-                                      ? pattern_coverage.count_covered_words(sentence_wids, s_length)
-                                      : p_length);
-      /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
-      if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
-      {
-        const Costs costs(p_length, s_length, edit_costs);
-
-        /* let us check the candidates */
-        const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-        const auto cost_upper_bound = lowest_costs.top();
-        float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
-                                    pattern_wids.data(), pattern_realtok, p_length,
-                                    st, sn,
-                                    idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-                                    edit_costs,
-                                    costs, cost_upper_bound);
-        if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-          continue;
-
-        float score = int(10000-cost*100)/10000.0;
-
-
-        lowest_costs.push(cost);
-        if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-          lowest_costs.pop();
-        if (score >= fuzzy) {
-          Match m(sentence_wids, s_length);
-          m.score = score;
-          m.max_subseq = longest_match;
-          m.s_id = s_id;
-          m.id = _filterIndex->id(s_id);
-          m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
-          m.penalty = 0;
-          result.push(m);
-          cpt++;
-        }
-      }
+      Match m(sentence_wids, s_length);
+      m.score = (float)bm25_score / (float)1000.;
+      m.max_subseq = 0;
+      m.s_id = s_id;
+      m.id = _filterIndex->id(s_id);
+      m.secondary_sort = s_id;
+      m.penalty = 0;
+      result.push(m);
+      cpt++;
+      if (cpt > contrast_buffer)
+        break;
     }
+
+
+    // for (const auto& pair : filter_matches->get_best_matches())
+    // {
+    //   // num_filtered++;
+    //   const auto s_id = pair.first;
+    //   const auto longest_match = pair.second;
+    //   size_t s_length = 0;
+    //   const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
+    //   const auto num_covered_words = (longest_match < p_length
+    //                                   ? pattern_coverage.count_covered_words(sentence_wids, s_length)
+    //                                   : p_length);
+    //   /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+    //   // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+    //   // {
+    //     const Costs costs(p_length, s_length, edit_costs);
+
+    //     /* let us check the candidates */
+    //     const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+    //     const auto cost_upper_bound = lowest_costs.top();
+    //     float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
+    //                                 pattern_wids.data(), pattern_realtok, p_length,
+    //                                 st, sn,
+    //                                 idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+    //                                 edit_costs,
+    //                                 costs, cost_upper_bound);
+    //     // float cost = 0.1;
+    //     if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+    //       continue;
+
+    //     float score = int(10000-cost*100)/10000.0;
+
+
+    //     lowest_costs.push(cost);
+    //     if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+    //       lowest_costs.pop();
+    //     if (score >= fuzzy) {
+    //       Match m(sentence_wids, s_length);
+    //       m.score = score;
+    //       m.max_subseq = longest_match;
+    //       m.s_id = s_id;
+    //       m.id = _filterIndex->id(s_id);
+    //       m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
+    //       m.penalty = 0;
+    //       result.push(m);
+    //       cpt++;
+    //     }
+    //   // }
+    // }
+    // COUT filter
+    // std::cerr << num_filtered << std::endl;
+    // std::cerr << filter_matches->get_best_matches().size() << std::endl;
     // delete filter_matches;
     /* Contrastive reranking */
     if (contrastive_factor > 0)
diff --git a/src/index.cc b/src/index.cc
index c994e88..ec4d256 100644
--- a/src/index.cc
+++ b/src/index.cc
@@ -13,6 +13,8 @@ namespace fuzzy
 #endif
     if (_type == IndexType::SUFFIX)
       _filter = createSuffixArray();
+    else if (_type == IndexType::NO)
+      _filter = createNo();
   }
 
   int

From b16626d232748712af63432a162d6fe9c4f1e94d Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Tue, 2 Jan 2024 15:58:08 +0100
Subject: [PATCH 02/15] Add NoFilter and NoMatches classes

---
 include/fuzzy/no_filter.hh  | 62 +++++++++++++++++++++++++++++++++++++
 include/fuzzy/no_filter.hxx | 30 ++++++++++++++++++
 include/fuzzy/no_matches.hh | 27 ++++++++++++++++
 src/no_filter.cc            | 30 ++++++++++++++++++
 src/no_matches.cc           | 30 ++++++++++++++++++
 5 files changed, 179 insertions(+)
 create mode 100644 include/fuzzy/no_filter.hh
 create mode 100644 include/fuzzy/no_filter.hxx
 create mode 100644 include/fuzzy/no_matches.hh
 create mode 100644 src/no_filter.cc
 create mode 100644 src/no_matches.cc

diff --git a/include/fuzzy/no_filter.hh b/include/fuzzy/no_filter.hh
new file mode 100644
index 0000000..afcdc76
--- /dev/null
+++ b/include/fuzzy/no_filter.hh
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <ostream>
+#include <algorithm>
+#include <unordered_set>
+#include <unordered_map>
+#include <math.h> 
+#include <Eigen/Sparse>
+
+#include <fuzzy/utils.hh>
+#include <fuzzy/filter.hh>
+
+#include <boost/multi_array.hpp>
+#include <boost/format.hpp>
+#include <boost/container/vector.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/serialization/unordered_map.hpp>
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/vector.hpp>
+#include <boost/serialization/split_member.hpp>
+#include <boost/serialization/version.hpp>
+#include <boost/serialization/array.hpp>
+
+namespace fuzzy
+{
+  class NoFilter : public Filter
+  {
+  public:
+    NoFilter(const FilterIndexParams &params=FilterIndexParams());
+    ~NoFilter();
+    // unsigned add_sentence(const std::vector<unsigned>& sentence) override;
+    using Filter::add_sentence;
+
+    using Filter::dump;
+    using Filter::num_sentences;
+    using Filter::get_sentence;
+
+    void prepare(size_t vocab_size);
+
+    std::ostream& dump(std::ostream&) const;
+
+    unsigned get_sentence_length(size_t s_id) const;
+
+  private:
+    friend class boost::serialization::access;
+
+    template<class Archive>
+    void save(Archive&, unsigned int version) const;
+
+    template<class Archive>
+    void load(Archive&, unsigned int version);
+
+    BOOST_SERIALIZATION_SPLIT_MEMBER()
+  };
+}
+
+BOOST_CLASS_VERSION(fuzzy::NoFilter, 1)
+
+#include "fuzzy/no_filter.hxx"
\ No newline at end of file
diff --git a/include/fuzzy/no_filter.hxx b/include/fuzzy/no_filter.hxx
new file mode 100644
index 0000000..5c9e68a
--- /dev/null
+++ b/include/fuzzy/no_filter.hxx
@@ -0,0 +1,30 @@
+#include <stdexcept>
+
+namespace fuzzy
+{
+  inline unsigned
+  NoFilter::get_sentence_length(size_t s_id) const
+  {
+    if (s_id + 1 == _sentence_pos.size())
+      return _sentence_buffer.size() - _sentence_pos[s_id] -  2;
+    return _sentence_pos[s_id + 1] - _sentence_pos[s_id] - 2;
+  }
+  
+  template<class Archive>
+  void NoFilter::save(Archive& archive, unsigned int) const
+  {
+    archive
+    & _sentence_buffer
+    & _sentence_pos
+    & _quickVocabAccess;
+  }
+
+  template<class Archive>
+  void NoFilter::load(Archive& archive, unsigned int)
+  {
+    archive
+    & _sentence_buffer
+    & _sentence_pos
+    & _quickVocabAccess;
+  }
+}
diff --git a/include/fuzzy/no_matches.hh b/include/fuzzy/no_matches.hh
new file mode 100644
index 0000000..a94d564
--- /dev/null
+++ b/include/fuzzy/no_matches.hh
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <fuzzy/filter_matches.hh>
+#include <fuzzy/no_filter.hh>
+
+namespace fuzzy
+{
+    class NoMatches : public FilterMatches
+    {
+    public:
+        using FilterMatches::FilterMatches;
+        NoMatches(float fuzzy,
+                  unsigned p_length,
+                  unsigned min_seq_len,
+                  const NoFilter &);
+
+        // Registers a match for this range of suffixes.
+        using FilterMatches::theoretical_rejection;
+        using FilterMatches::theoretical_rejection_cover;
+
+        void load_all();
+        std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
+
+    private:
+        std::vector<std::pair<unsigned, unsigned>> _all_matches;
+    };
+}
diff --git a/src/no_filter.cc b/src/no_filter.cc
new file mode 100644
index 0000000..a4ed3c4
--- /dev/null
+++ b/src/no_filter.cc
@@ -0,0 +1,30 @@
+#include <fuzzy/no_filter.hh>
+
+#include <fuzzy/ngram_matches.hh>
+#include <fuzzy/vocab_indexer.hh>
+#include <cassert>
+
+namespace fuzzy
+{
+  NoFilter::NoFilter(const FilterIndexParams& params)
+  {}
+  NoFilter::~NoFilter() {}
+  // unsigned
+  // NoFilter::add_sentence(const std::vector<unsigned>& sentence)
+  // {
+  //   size_t sidx = _sentence_pos.size();
+  //   _sentence_pos.push_back(_sentence_buffer.size());
+
+  //   /* first token in sentence buffer is the sentence size */
+  //   _sentence_buffer.push_back(sentence.size());
+
+  //   for (size_t i = 0; i < sentence.size(); i++)
+  //   {
+  //     _sentence_buffer.push_back(sentence[i]);
+  //   }
+  //   _sentence_buffer.push_back(fuzzy::VocabIndexer::SENTENCE_SEPARATOR);
+  //   return sidx;
+  // }
+
+  void NoFilter::prepare(size_t vocab_size) {}
+}
diff --git a/src/no_matches.cc b/src/no_matches.cc
new file mode 100644
index 0000000..277aa01
--- /dev/null
+++ b/src/no_matches.cc
@@ -0,0 +1,30 @@
+#include <fuzzy/no_matches.hh>
+
+#include <cmath>
+
+namespace fuzzy
+{
+    NoMatches::NoMatches(float fuzzy,
+                         unsigned p_length,
+                         unsigned min_seq_len,
+                         const NoFilter &no_filter)
+        /* add a small epsilon to avoid rounding errors counting for an error */
+        : FilterMatches(fuzzy, p_length, min_seq_len, no_filter)
+    {
+    }
+
+    std::vector<std::pair<unsigned, unsigned>>
+    NoMatches::get_best_matches() const
+    {
+        return _all_matches;
+    }
+
+    void NoMatches::load_all()
+    {
+        _all_matches = std::vector<std::pair<unsigned, unsigned>>(_filter.num_sentences());
+        size_t *length;
+
+        for (unsigned i = 0; i < _filter.num_sentences(); i++)
+            _all_matches[i] = {i, 0};
+    }
+}

From ebe5dc3cfce2e4ba8bde57d34a0b210e93b62a55 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Tue, 2 Jan 2024 16:06:26 +0100
Subject: [PATCH 03/15] Restore edit-distance scoring loop (functional starting point)

---
 src/fuzzy_match.cc | 122 ++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 61 deletions(-)

diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 83629ad..a708089 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -628,73 +628,73 @@ namespace fuzzy
     // }
 
     // ONLY BM25
-    for (const auto& pair : filter_matches->get_best_matches())
-    {
-      const auto s_id = pair.first;
-      const auto bm25_score = pair.second;
-      size_t s_length = 0;
-      const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-      Match m(sentence_wids, s_length);
-      m.score = (float)bm25_score / (float)1000.;
-      m.max_subseq = 0;
-      m.s_id = s_id;
-      m.id = _filterIndex->id(s_id);
-      m.secondary_sort = s_id;
-      m.penalty = 0;
-      result.push(m);
-      cpt++;
-      if (cpt > contrast_buffer)
-        break;
-    }
-
-
     // for (const auto& pair : filter_matches->get_best_matches())
     // {
-    //   // num_filtered++;
     //   const auto s_id = pair.first;
-    //   const auto longest_match = pair.second;
+    //   const auto bm25_score = pair.second;
     //   size_t s_length = 0;
     //   const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-    //   const auto num_covered_words = (longest_match < p_length
-    //                                   ? pattern_coverage.count_covered_words(sentence_wids, s_length)
-    //                                   : p_length);
-    //   /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
-    //   // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
-    //   // {
-    //     const Costs costs(p_length, s_length, edit_costs);
-
-    //     /* let us check the candidates */
-    //     const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-    //     const auto cost_upper_bound = lowest_costs.top();
-    //     float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
-    //                                 pattern_wids.data(), pattern_realtok, p_length,
-    //                                 st, sn,
-    //                                 idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-    //                                 edit_costs,
-    //                                 costs, cost_upper_bound);
-    //     // float cost = 0.1;
-    //     if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-    //       continue;
-
-    //     float score = int(10000-cost*100)/10000.0;
-
-
-    //     lowest_costs.push(cost);
-    //     if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-    //       lowest_costs.pop();
-    //     if (score >= fuzzy) {
-    //       Match m(sentence_wids, s_length);
-    //       m.score = score;
-    //       m.max_subseq = longest_match;
-    //       m.s_id = s_id;
-    //       m.id = _filterIndex->id(s_id);
-    //       m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
-    //       m.penalty = 0;
-    //       result.push(m);
-    //       cpt++;
-    //     }
-    //   // }
+    //   Match m(sentence_wids, s_length);
+    //   m.score = (float)bm25_score / (float)1000.;
+    //   m.max_subseq = 0;
+    //   m.s_id = s_id;
+    //   m.id = _filterIndex->id(s_id);
+    //   m.secondary_sort = s_id;
+    //   m.penalty = 0;
+    //   result.push(m);
+    //   cpt++;
+    //   if (cpt > contrast_buffer)
+    //     break;
     // }
+
+
+    for (const auto& pair : filter_matches->get_best_matches())
+    {
+      // num_filtered++;
+      const auto s_id = pair.first;
+      const auto longest_match = pair.second;
+      size_t s_length = 0;
+      const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
+      const auto num_covered_words = (longest_match < p_length
+                                      ? pattern_coverage.count_covered_words(sentence_wids, s_length)
+                                      : p_length);
+      /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+      // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+      // {
+        const Costs costs(p_length, s_length, edit_costs);
+
+        /* let us check the candidates */
+        const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+        const auto cost_upper_bound = lowest_costs.top();
+        float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
+                                    pattern_wids.data(), pattern_realtok, p_length,
+                                    st, sn,
+                                    idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+                                    edit_costs,
+                                    costs, cost_upper_bound);
+        // float cost = 0.1;
+        if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+          continue;
+
+        float score = int(10000-cost*100)/10000.0;
+
+
+        lowest_costs.push(cost);
+        if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+          lowest_costs.pop();
+        if (score >= fuzzy) {
+          Match m(sentence_wids, s_length);
+          m.score = score;
+          m.max_subseq = longest_match;
+          m.s_id = s_id;
+          m.id = _filterIndex->id(s_id);
+          m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
+          m.penalty = 0;
+          result.push(m);
+          cpt++;
+        }
+      // }
+    }
     // COUT filter
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;

From 2dab6b831e88ee8228507686f5686faedbbf857c Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 10 Jan 2024 11:37:22 +0100
Subject: [PATCH 04/15] bow submodularity working

---
 cli/src/FuzzyMatch-cli.cc         |  12 +-
 include/fuzzy/bm25.hh             |   1 +
 include/fuzzy/bm25.hxx            |   5 +
 include/fuzzy/bm25_matches.hh     |   5 +-
 include/fuzzy/filter_matches.hh   |   2 +-
 include/fuzzy/fuzzy_match.hh      |  12 +-
 include/fuzzy/ngram_matches.hh    |   2 +-
 include/fuzzy/no_matches.hh       |   4 +-
 include/fuzzy/pattern_coverage.hh |   1 +
 src/bm25_matches.cc               |  22 ++-
 src/fuzzy_match.cc                | 233 ++++++++++++++++++++++++------
 src/ngram_matches.cc              |   6 +-
 src/no_matches.cc                 |   4 +-
 src/pattern_coverage.cc           |  10 ++
 14 files changed, 253 insertions(+), 66 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index b472c36..30d8f18 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -210,7 +210,7 @@ class processor {
             std::string contrastive_reduce_str,
             int contrastive_buffer,
             fuzzy::IndexType filter_type,
-            int bm25_buffer, float bm25_cutoff, const fuzzy::FilterIndexParams& filter_index_params):
+            int bm25_buffer, float bm25_cutoff, float submodular_shrinking_factor, const fuzzy::FilterIndexParams& filter_index_params):
              _fuzzyMatcher(pt, max_tokens_in_pattern, filter_type, filter_index_params),
              _fuzzy(fuzzy),
              _contrastive_factor(contrastive_factor),
@@ -224,7 +224,8 @@ class processor {
              _contrastive_buffer(contrastive_buffer),
              _filter_type(filter_type),
              _bm25_buffer(bm25_buffer),
-             _bm25_cutoff(bm25_cutoff) {
+             _bm25_cutoff(bm25_cutoff),
+             _submodular_shrinking_factor(submodular_shrinking_factor) {
     if (contrastive_reduce_str == "max")
       _contrastive_reduce = fuzzy::ContrastReduce::MAX;
     else
@@ -236,7 +237,7 @@ class processor {
     _fuzzyMatcher.match(sentence, _fuzzy, _nmatch, _no_perfect, matches,
                         _min_subseq_length, _min_subseq_ratio, _idf_penalty, _cost,
                         _contrastive_factor, _contrastive_reduce, _contrastive_buffer,
-                        _filter_type, _bm25_buffer, _bm25_cutoff);
+                        _filter_type, _bm25_buffer, _bm25_cutoff, _submodular_shrinking_factor);
 
     std::string   out;
     for(const fuzzy::FuzzyMatch::Match &m: matches) {
@@ -294,6 +295,7 @@ class processor {
   fuzzy::IndexType _filter_type;
   int _bm25_buffer;
   float _bm25_cutoff;
+  float _submodular_shrinking_factor;
 };
 
 int main(int argc, char** argv)
@@ -326,6 +328,7 @@ int main(int argc, char** argv)
   float contrastive_factor;
   float bm25_cutoff;
   float bm25_ratio_idf;
+  float submodular_shrinking_factor;
   int nmatch;
   int nthreads;
   int min_subseq_length;
@@ -365,6 +368,7 @@ int main(int argc, char** argv)
     ("bm25-ratio-idf", po::value(&bm25_ratio_idf)->default_value(0.5f), "filter in the reverse index to consider only terms rare enough (close to 0 = ignores a lot : close to 1 = considers a lot)")
     ("bm25-buffer", po::value(&bm25_buffer)->default_value(10), "number of best BM25 to rerank")
     ("bm25-cutoff", po::value(&bm25_cutoff)->default_value(0.f), "minimum BM25 score threshold cutoff")
+    ("submodular-shrinking-factor,lambda", po::value(&submodular_shrinking_factor)->default_value(1.f), "In submodularity coverage, weight shrinking factor of each covered salient aspect of the source") 
     ("nthreads,N", po::value(&nthreads)->default_value(4), "number of thread to use for match")
     ;
 
@@ -450,7 +454,7 @@ int main(int argc, char** argv)
               idf_penalty, subseq_idf_weighting,
               max_tokens_in_pattern, edit_cost,
               contrastive_reduce, contrastive_buffer,
-              filter_type, bm25_buffer, bm25_cutoff, filter_index_params);
+              filter_type, bm25_buffer, bm25_cutoff, submodular_shrinking_factor, filter_index_params);
 
   if (index_file.length()) {
     TICK("Loading index_file: "+index_file);
diff --git a/include/fuzzy/bm25.hh b/include/fuzzy/bm25.hh
index a62d9fd..6f0735c 100644
--- a/include/fuzzy/bm25.hh
+++ b/include/fuzzy/bm25.hh
@@ -61,6 +61,7 @@ namespace fuzzy
 
     inline int get_vocab_size() const { return _vocab_size; }
     Eigen::SparseVector<float> compute_product(const Eigen::SparseVector<float>& pattern_voc) const;
+    Eigen::SparseVector<float> get_cover(const Eigen::SparseVector<float>& pattern_voc, unsigned s_id) const;
 
   private:
     size_t _vocab_size;
diff --git a/include/fuzzy/bm25.hxx b/include/fuzzy/bm25.hxx
index ce0aec9..04c199e 100644
--- a/include/fuzzy/bm25.hxx
+++ b/include/fuzzy/bm25.hxx
@@ -45,6 +45,11 @@ namespace fuzzy
   {
     return _bm25_inverse_index * pattern_voc;
   }
+  inline Eigen::SparseVector<float> BM25::get_cover(const Eigen::SparseVector<float>& pattern_voc, unsigned s_id) const
+  {
+    Eigen::SparseVector<float> row_s = _bm25_inverse_index.row(s_id);
+    return row_s.cwiseProduct(pattern_voc);
+  }
   template<class Archive>
   void BM25::save(Archive& archive, unsigned int) const
   {
diff --git a/include/fuzzy/bm25_matches.hh b/include/fuzzy/bm25_matches.hh
index d6d3c49..2e630f8 100644
--- a/include/fuzzy/bm25_matches.hh
+++ b/include/fuzzy/bm25_matches.hh
@@ -33,12 +33,13 @@ namespace fuzzy
     using FilterMatches::theoretical_rejection;
     using FilterMatches::theoretical_rejection_cover;
 
-    std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
+    std::vector<std::pair<unsigned, int>> get_best_matches() const override;
+    std::vector<float> cover(const std::vector<unsigned>& unique_pattern_wids, const std::vector<unsigned>& count, unsigned s_id) const;
 
   private:
     // Num of sentences to place in the buffer
     const unsigned _buffer;
     const float _cutoff_threshold;
-    std::vector<std::pair<unsigned, unsigned>> _best_matches;
+    std::vector<std::pair<unsigned, int>> _best_matches;
   };
 }
diff --git a/include/fuzzy/filter_matches.hh b/include/fuzzy/filter_matches.hh
index a57880f..7cafa32 100644
--- a/include/fuzzy/filter_matches.hh
+++ b/include/fuzzy/filter_matches.hh
@@ -32,7 +32,7 @@ namespace fuzzy
     bool theoretical_rejection(size_t p_length, size_t s_length, const EditCosts& edit_costs) const;
     bool theoretical_rejection_cover(size_t p_length, size_t s_length, size_t cover, const EditCosts& edit_costs) const;
 
-    virtual std::vector<std::pair<unsigned, unsigned>> get_best_matches() const = 0;
+    virtual std::vector<std::pair<unsigned, int>> get_best_matches() const = 0;
 
     float fuzzy_threshold;
     // unsigned max_differences_with_pattern;
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index 589e2e9..ac91b09 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -15,6 +15,8 @@ namespace onmt {
 namespace fuzzy
 {
   enum class ContrastReduce { MEAN, MAX };
+  enum class SubmodularFunction { BOW, BM25, NGRAM, ED };
+  enum class SubmodularNormalization { NO, IDF };
 
   class FuzzyMatch
   {
@@ -44,6 +46,7 @@ namespace fuzzy
       int         max_subseq;
       unsigned    s_id;
       std::string id;
+      std::vector<float> cover;
       int length;
       const unsigned* s;
     };
@@ -74,7 +77,8 @@ namespace fuzzy
                int contrast_buffer=-1,
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
-               float bm25_cutoff=0) const;
+               float bm25_cutoff=0,
+               float shrinking_factor=1.f) const;
     bool match(const Sentence& real,
                const Tokens& pattern,
                float fuzzy,
@@ -90,7 +94,8 @@ namespace fuzzy
                int contrast_buffer=-1,
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
-               float bm25_cutoff=0) const;
+               float bm25_cutoff=0,
+               float shrinking_factor=1.f) const;
     /* simplified, include tokenization */
     bool match(const std::string &sentence,
                float fuzzy,
@@ -106,7 +111,8 @@ namespace fuzzy
                int contrast_buffer=-1,
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
-               float bm25_cutoff=0) const;
+               float bm25_cutoff=0,
+               float shrinking_factor=1.f) const;
     bool subsequence(const std::string &sentence,
                unsigned number_of_matches,
                bool no_perfect,
diff --git a/include/fuzzy/ngram_matches.hh b/include/fuzzy/ngram_matches.hh
index f42de9d..eee7b86 100644
--- a/include/fuzzy/ngram_matches.hh
+++ b/include/fuzzy/ngram_matches.hh
@@ -29,7 +29,7 @@ namespace fuzzy
     using FilterMatches::theoretical_rejection;
     using FilterMatches::theoretical_rejection_cover;
 
-    std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
+    std::vector<std::pair<unsigned, int>> get_best_matches() const override;
 
   private:
     LongestMatches _longest_matches;
diff --git a/include/fuzzy/no_matches.hh b/include/fuzzy/no_matches.hh
index a94d564..e410ec6 100644
--- a/include/fuzzy/no_matches.hh
+++ b/include/fuzzy/no_matches.hh
@@ -19,9 +19,9 @@ namespace fuzzy
         using FilterMatches::theoretical_rejection_cover;
 
         void load_all();
-        std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
+        std::vector<std::pair<unsigned, int>> get_best_matches() const override;
 
     private:
-        std::vector<std::pair<unsigned, unsigned>> _all_matches;
+        std::vector<std::pair<unsigned, int>> _all_matches;
     };
 }
diff --git a/include/fuzzy/pattern_coverage.hh b/include/fuzzy/pattern_coverage.hh
index e2406b4..66a3203 100644
--- a/include/fuzzy/pattern_coverage.hh
+++ b/include/fuzzy/pattern_coverage.hh
@@ -19,4 +19,5 @@ namespace fuzzy
     std::unordered_map<unsigned, unsigned> _words_count;
   };
 
+  bool equal_arrays(const size_t s_len, const size_t p_len, const unsigned* s, const unsigned* p);
 }
diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc
index a10fb2a..3a2a0c0 100644
--- a/src/bm25_matches.cc
+++ b/src/bm25_matches.cc
@@ -50,9 +50,29 @@ namespace fuzzy
     std::reverse(_best_matches.begin(), _best_matches.end()); 
   }
 
-  std::vector<std::pair<unsigned, unsigned>>
+  std::vector<std::pair<unsigned, int>>
   BM25Matches::get_best_matches() const
   {
     return _best_matches;
   }
+
+  // Per-term coverage of the pattern by sentence s_id: entry i is count[i] times the
+  // coefficient of row s_id of the BM25 index for term unique_pattern_wids[i].
+  std::vector<float>
+  BM25Matches::cover(const std::vector<unsigned>& unique_pattern_wids, const std::vector<unsigned>& count, unsigned s_id) const
+  {
+    const BM25& bm25 = static_cast<const BM25&>(_filter);
+    const size_t num_terms = unique_pattern_wids.size();
+    Eigen::SparseVector<float> pattern_sparse_vec(bm25.get_vocab_size());
+    // A term without a matching count entry keeps weight 0 instead of reading past `count`.
+    for (size_t i = 0; i < num_terms && i < count.size(); i++)
+      pattern_sparse_vec.coeffRef(unique_pattern_wids[i]) = static_cast<float>(count[i]);
+
+    const Eigen::SparseVector<float> all_coverage = bm25.get_cover(pattern_sparse_vec, s_id);
+    std::vector<float> coverage(num_terms, 0.f);
+    for (size_t i = 0; i < num_terms; i++)
+      coverage[i] = all_coverage.coeff(unique_pattern_wids[i]);
+
+    return coverage;
+  }
 }
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index a708089..a1d8368 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -7,6 +7,7 @@
 #include <set>
 #include <numeric>
 #include <algorithm>
+#include <cassert>
 
 #include <unicode/normalizer2.h>
 #include <fuzzy/suffix_array.hh>
@@ -20,6 +21,7 @@
 #include <fuzzy/ngram_matches.hh>
 #include <fuzzy/edit_distance.hh>
 #include <fuzzy/pattern_coverage.hh>
+#include <fuzzy/submodular.hh>
 
 #include <onmt/Tokenizer.h>
 #include <onmt/unicode/Unicode.h>
@@ -404,7 +406,8 @@ namespace fuzzy
                          int contrast_buffer,
                          IndexType filter_type,
                          int bm25_buffer,
-                         float bm25_cutoff) const {
+                         float bm25_cutoff,
+                         float shrinking_factor) const {
 
     Sentence real;
     Tokens norm;
@@ -412,7 +415,8 @@ namespace fuzzy
     return match(real, norm, fuzzy, number_of_matches, no_perfect, matches,
                  min_subseq_length, min_subseq_ratio, vocab_idf_penalty,
                  edit_costs, contrastive_factor, reduce, contrast_buffer,
-                 filter_type, bm25_buffer, bm25_cutoff);
+                 filter_type, bm25_buffer, bm25_cutoff,
+                 shrinking_factor);
   }
 
   /* backward compatibility */
@@ -430,13 +434,15 @@ namespace fuzzy
                     int contrast_buffer,
                     IndexType filter_type,
                     int bm25_buffer,
-                    float bm25_cutoff) const
+                    float bm25_cutoff,
+                    float shrinking_factor) const
   {
     const Sentence real(pattern);
     return match(real, pattern, fuzzy, number_of_matches, false, matches,
                  min_subseq_length, min_subseq_ratio, vocab_idf_penalty,
                  edit_costs, contrastive_factor, reduce, contrast_buffer,
-                 filter_type, bm25_buffer, bm25_cutoff);
+                 filter_type, bm25_buffer, bm25_cutoff,
+                 shrinking_factor);
   }
 
   /* check for the pattern in the suffix-array index SAI */ 
@@ -456,7 +462,8 @@ namespace fuzzy
                     int contrast_buffer,
                     IndexType filter_type,
                     int bm25_buffer,
-                    float bm25_cutoff) const
+                    float bm25_cutoff,
+                    float shrinking_factor) const
   {
     size_t p_length = pattern.size();
     if (contrast_buffer == -1)
@@ -647,60 +654,180 @@ namespace fuzzy
     //     break;
     // }
 
-
-    for (const auto& pair : filter_matches->get_best_matches())
+    // case S = x
+
+    ///////////// BM25 (BOW)
+    assert((filter_type == IndexType::BM25));
+    std::vector<std::pair<unsigned, int>> best_matches = filter_matches->get_best_matches();
+    BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
+    std::vector<unsigned> sorted_pattern_wids(pattern_wids);
+    std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
+    std::vector<unsigned> sorted_pattern_terms;
+    std::vector<unsigned> count_terms;
+    sorted_pattern_terms.reserve(pattern_wids.size());
+    count_terms.reserve(pattern_wids.size());
+    if (sorted_pattern_wids.size() > 0)
+    {
+      unsigned current_term = sorted_pattern_wids[0];
+      unsigned current_count = 1;
+      for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
+      {
+        if (current_term != sorted_pattern_wids[i])
+        {
+          sorted_pattern_terms.push_back(current_term);
+          count_terms.push_back(current_count);
+          current_term = sorted_pattern_wids[i];
+          current_count = 1;
+        }
+        else
+          current_count++;
+      }
+      sorted_pattern_terms.push_back(current_term);
+      count_terms.push_back(current_count);
+    }
+    // std::cerr << "sorted unique terms" << ": ";
+    // for (const auto& c : sorted_pattern_terms)
+    //   std::cerr << c << ", ";
+    // std::cerr << std::endl;
+    // std::cerr << "counts" << ": ";
+    // for (const auto& c : count_terms)
+    //   std::cerr << c << ", ";
+    // std::cerr << std::endl;
+    /////////////
+
+    // std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush;
+
+    for (const auto& pair : best_matches)
     {
       // num_filtered++;
       const auto s_id = pair.first;
-      const auto longest_match = pair.second;
+      const auto score_filter = pair.second;
       size_t s_length = 0;
       const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-      const auto num_covered_words = (longest_match < p_length
-                                      ? pattern_coverage.count_covered_words(sentence_wids, s_length)
-                                      : p_length);
-      /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+      // const auto num_covered_words = (score_filter < p_length
+      //                                 ? pattern_coverage.count_covered_words(sentence_wids, s_length)
+      //                                 : p_length);
+      // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
+      // TODO: adapt to filter n-gram existence
+
+      std::vector<float> s_cover;
+      float score;
+      ///////////// ED
+      // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
       // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
       // {
-        const Costs costs(p_length, s_length, edit_costs);
-
-        /* let us check the candidates */
-        const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-        const auto cost_upper_bound = lowest_costs.top();
-        float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
-                                    pattern_wids.data(), pattern_realtok, p_length,
-                                    st, sn,
-                                    idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-                                    edit_costs,
-                                    costs, cost_upper_bound);
-        // float cost = 0.1;
-        if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-          continue;
-
-        float score = int(10000-cost*100)/10000.0;
-
-
-        lowest_costs.push(cost);
-        if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-          lowest_costs.pop();
-        if (score >= fuzzy) {
-          Match m(sentence_wids, s_length);
-          m.score = score;
-          m.max_subseq = longest_match;
-          m.s_id = s_id;
-          m.id = _filterIndex->id(s_id);
-          m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
-          m.penalty = 0;
-          result.push(m);
-          cpt++;
-        }
+      //   const Costs costs(p_length, s_length, edit_costs);
+      //   /* let us check the candidates */
+      //   const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+      //   const auto cost_upper_bound = lowest_costs.top();
+      //   float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
+      //                               pattern_wids.data(), pattern_realtok, p_length,
+      //                               st, sn,
+      //                               idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+      //                               edit_costs,
+      //                               costs, cost_upper_bound);
+      //   // float cost = 0.1;
+      // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+      //   continue;
+      //   float score = int(10000 - cost * 100) / 10000.0;
+
+      //   lowest_costs.push(cost);
+      //   if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+      //     lowest_costs.pop();
+      // }
+      // else 
+      // {
+      //   continue;
       // }
+      /////////////
+
+      if ((no_perfect && equal_arrays(s_length, p_length, sentence_wids, pattern_wids.data())))
+        continue;
+
+      ///////////// BM25
+      score = (float)score_filter / 1000.f;
+      s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
+      // s_cover = std::vector<float>(sorted_pattern_terms.size(), 1);
+      /////////////
+
+      ///////////// NGRAM
+      /* TODO */
+      /////////////
+
+      if (score >= fuzzy) {
+        Match m(sentence_wids, s_length);
+        m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score;
+        m.max_subseq = (filter_type == IndexType::BM25) ? 0 : score_filter;
+        m.s_id = s_id;
+        m.id = _filterIndex->id(s_id);
+        m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
+        m.penalty = 0;
+        m.cover = s_cover;
+        result.push(m);
+        // std::cerr << m.s_id << ": ";
+        // for (const auto& c : m.cover)
+        //   std::cerr << c << ", ";
+        // std::cerr << std::endl;
+        cpt++;
+        if (cpt > contrast_buffer)
+          break;
+      }
     }
     // COUT filter
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;
-    // delete filter_matches;
-    /* Contrastive reranking */
-    if (contrastive_factor > 0)
+
+    if (shrinking_factor < 1.f) // submodular coverage
+    {
+      // Greedy selection with one weight per unique pattern term. Match::cover has exactly
+      // sorted_pattern_terms.size() entries, so the weights must not be sized by the raw token count.
+      std::vector<float> cover_weights(sorted_pattern_terms.size(), 1.f);
+      std::list<Match> candidates;
+      while (!result.empty())
+      {
+        auto match = result.top();
+        candidates.push_back(match);
+        result.pop();
+      }
+      auto comp = [](const Match& m1, const Match& m2) {
+          return m1.penalty < m2.penalty;
+      };
+      while (!candidates.empty() && (number_of_matches == 0 || matches.size() < number_of_matches))
+      {
+        // rescore penalties of candidates
+        for (Match &match : candidates)
+        {
+          float rescore = 0.f;
+          // std::cerr << "rescore " << match.s_id << " : (";
+          for (unsigned i = 0; i < cover_weights.size(); i++)
+          {
+            rescore += cover_weights[i] * match.cover[i];
+            // if (match.cover[i] != 0)
+            //   std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
+          }
+
+          // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
+          match.penalty = rescore;
+        }
+        auto it_max = std::max_element(candidates.begin(), candidates.end(), comp);
+        matches.push_back(*it_max);
+        // shrink the weight of every term covered by the candidate that was just selected
+        for (unsigned i = 0; i < cover_weights.size(); i++)
+          if (it_max->cover[i] > 0)
+            cover_weights[i] *= shrinking_factor;
+        candidates.erase(it_max);
+        // with a (near) zero factor, start over from uniform weights once every weight has vanished
+        if (shrinking_factor < 1e-20f)
+        {
+          float sum = 0.f;
+          for (const float& w : cover_weights)
+            sum += w;
+          if (sum < 1e-20f)
+            cover_weights.assign(cover_weights.size(), 1.f);
+        }
+      }
+    }
+    else if (contrastive_factor > 0) // MMR
     {
       std::list<Match> candidates;
       while (!result.empty())
@@ -766,6 +893,18 @@ namespace fuzzy
         result.pop();
       }
     }
+
+    // std::cerr << "final matches " << " : ";
+    // for (unsigned i = 0; i < matches.size(); i++)
+    // { 
+    //   std::cerr << std::endl << "   ";
+    //   std::cerr << matches[i].s_id << ": ";
+    //   for (int j = 0; j < matches[i].length; j++)
+    //     std::cerr << matches[i].s[j] << " ";
+    //   std::cerr << std::endl;
+    //   std::cerr << matches[i].id;
+    // }
+    // std::cerr << std::endl;
     return matches.size() > 0;
   }
 }
diff --git a/src/ngram_matches.cc b/src/ngram_matches.cc
index ed0b2a3..f50f130 100644
--- a/src/ngram_matches.cc
+++ b/src/ngram_matches.cc
@@ -12,13 +12,13 @@ namespace fuzzy
       : FilterMatches(fuzzy, p_length, min_seq_len, suffixArray)
   {}
 
-  std::vector<std::pair<unsigned, unsigned>>
+  std::vector<std::pair<unsigned, int>>
   NGramMatches::get_best_matches() const
   {
-    std::vector<std::pair<unsigned, unsigned>> sorted_matches(_longest_matches.begin(),
+    std::vector<std::pair<unsigned, int>> sorted_matches(_longest_matches.begin(),
                                                               _longest_matches.end());
     std::sort(sorted_matches.begin(), sorted_matches.end(),
-              [](const std::pair<unsigned, unsigned>& a, const std::pair<unsigned, unsigned>& b) {
+              [](const std::pair<unsigned, int>& a, const std::pair<unsigned, int>& b) {
                 return a.second > b.second || (a.second == b.second && a.first < b.first);
               });
     return sorted_matches;
diff --git a/src/no_matches.cc b/src/no_matches.cc
index 277aa01..2e0a9a3 100644
--- a/src/no_matches.cc
+++ b/src/no_matches.cc
@@ -13,7 +13,7 @@ namespace fuzzy
     {
     }
 
-    std::vector<std::pair<unsigned, unsigned>>
+    std::vector<std::pair<unsigned, int>>
     NoMatches::get_best_matches() const
     {
         return _all_matches;
@@ -21,7 +21,7 @@ namespace fuzzy
 
     void NoMatches::load_all()
     {
-        _all_matches = std::vector<std::pair<unsigned, unsigned>>(_filter.num_sentences());
+        _all_matches = std::vector<std::pair<unsigned, int>>(_filter.num_sentences());
         size_t *length;
 
         for (unsigned i = 0; i < _filter.num_sentences(); i++)
diff --git a/src/pattern_coverage.cc b/src/pattern_coverage.cc
index 76e7887..3753377 100644
--- a/src/pattern_coverage.cc
+++ b/src/pattern_coverage.cc
@@ -27,4 +27,14 @@ namespace fuzzy
     return num_covered_words;
   }
 
+  bool equal_arrays(const size_t s_len, const size_t p_len, const unsigned* s, const unsigned* p)
+  {
+    // True iff both arrays have the same length and identical ids; the index is size_t like the lengths.
+    if (s_len != p_len)
+      return false;
+    for (size_t i = 0; i < p_len; i++)
+      if (p[i] != s[i])
+        return false;
+    return true;
+  }
 }

From c0d8b1d90916e556aa792452452f81acbec1d004 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 10 Jan 2024 12:43:38 +0100
Subject: [PATCH 05/15] cli structure

---
 cli/src/FuzzyMatch-cli.cc    |  34 +++++-
 include/fuzzy/fuzzy_match.hh |  16 ++-
 src/fuzzy_match.cc           | 195 ++++++++++++++++-------------------
 3 files changed, 130 insertions(+), 115 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 30d8f18..b56403a 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -210,7 +210,9 @@ class processor {
             std::string contrastive_reduce_str,
             int contrastive_buffer,
             fuzzy::IndexType filter_type,
-            int bm25_buffer, float bm25_cutoff, float submodular_shrinking_factor, const fuzzy::FilterIndexParams& filter_index_params):
+            int bm25_buffer, float bm25_cutoff,
+            float submodular_shrinking_factor, fuzzy::SubmodularFunction submodular_function, fuzzy::SubmodularNormalization submodular_normalization,
+            const fuzzy::FilterIndexParams& filter_index_params):
              _fuzzyMatcher(pt, max_tokens_in_pattern, filter_type, filter_index_params),
              _fuzzy(fuzzy),
              _contrastive_factor(contrastive_factor),
@@ -225,7 +227,9 @@ class processor {
              _filter_type(filter_type),
              _bm25_buffer(bm25_buffer),
              _bm25_cutoff(bm25_cutoff),
-             _submodular_shrinking_factor(submodular_shrinking_factor) {
+             _submodular_shrinking_factor(submodular_shrinking_factor),
+             _submodular_function(submodular_function),
+             _submodular_normalization(submodular_normalization) {
     if (contrastive_reduce_str == "max")
       _contrastive_reduce = fuzzy::ContrastReduce::MAX;
     else
@@ -237,7 +241,8 @@ class processor {
     _fuzzyMatcher.match(sentence, _fuzzy, _nmatch, _no_perfect, matches,
                         _min_subseq_length, _min_subseq_ratio, _idf_penalty, _cost,
                         _contrastive_factor, _contrastive_reduce, _contrastive_buffer,
-                        _filter_type, _bm25_buffer, _bm25_cutoff, _submodular_shrinking_factor);
+                        _filter_type, _bm25_buffer, _bm25_cutoff,
+                        _submodular_shrinking_factor, _submodular_function, _submodular_normalization);
 
     std::string   out;
     for(const fuzzy::FuzzyMatch::Match &m: matches) {
@@ -296,6 +301,8 @@ class processor {
   int _bm25_buffer;
   float _bm25_cutoff;
   float _submodular_shrinking_factor;
+  fuzzy::SubmodularFunction _submodular_function;
+  fuzzy::SubmodularNormalization _submodular_normalization;
 };
 
 int main(int argc, char** argv)
@@ -329,6 +336,8 @@ int main(int argc, char** argv)
   float bm25_cutoff;
   float bm25_ratio_idf;
   float submodular_shrinking_factor;
+  std::string submodular_function_str;
+  std::string submodular_normalization_str;
   int nmatch;
   int nthreads;
   int min_subseq_length;
@@ -369,6 +378,8 @@ int main(int argc, char** argv)
     ("bm25-buffer", po::value(&bm25_buffer)->default_value(10), "number of best BM25 to rerank")
     ("bm25-cutoff", po::value(&bm25_cutoff)->default_value(0.f), "minimum BM25 score threshold cutoff")
     ("submodular-shrinking-factor,lambda", po::value(&submodular_shrinking_factor)->default_value(1.f), "In submodularity coverage, weight shrinking factor of each covered salient aspect of the source") 
+    ("submodular-function", po::value(&submodular_function_str)->default_value("NO"), "submodularity coverage function category (NO|BOW|NGRAM|ED)") 
+    ("submodular-norm", po::value(&submodular_normalization_str)->default_value("NO"), "Normalization in submodular coverage score") 
     ("nthreads,N", po::value(&nthreads)->default_value(4), "number of thread to use for match")
     ;
 
@@ -445,6 +456,18 @@ int main(int argc, char** argv)
     filter_type = fuzzy::IndexType::NO;
   else
     filter_type = fuzzy::IndexType::SUFFIX;
+  fuzzy::SubmodularFunction submodular_function;
+  if (submodular_function_str == "BOW")
+    submodular_function = fuzzy::SubmodularFunction::BOW;
+  else if (submodular_function_str == "ED")
+    submodular_function = fuzzy::SubmodularFunction::ED;
+  else
+    submodular_function = fuzzy::SubmodularFunction::NO;
+  fuzzy::SubmodularNormalization submodular_normalization;
+  if (submodular_normalization_str == "BM25")
+    submodular_normalization = fuzzy::SubmodularNormalization::BM25;
+  else
+    submodular_normalization = fuzzy::SubmodularNormalization::NO;
 #ifdef NO_EIGEN
   assert(filter_type != fuzzy::IndexType::BM25);
 #endif
@@ -454,8 +477,9 @@ int main(int argc, char** argv)
               idf_penalty, subseq_idf_weighting,
               max_tokens_in_pattern, edit_cost,
               contrastive_reduce, contrastive_buffer,
-              filter_type, bm25_buffer, bm25_cutoff, submodular_shrinking_factor, filter_index_params);
-
+              filter_type, bm25_buffer, bm25_cutoff,
+              submodular_shrinking_factor, submodular_function, submodular_normalization,
+              filter_index_params);
   if (index_file.length()) {
     TICK("Loading index_file: "+index_file);
     import_binarized_fuzzy_matcher(index_file, O._fuzzyMatcher);
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index ac91b09..fac23b6 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -15,8 +15,8 @@ namespace onmt {
 namespace fuzzy
 {
   enum class ContrastReduce { MEAN, MAX };
-  enum class SubmodularFunction { BOW, BM25, NGRAM, ED };
-  enum class SubmodularNormalization { NO, IDF };
+  enum class SubmodularFunction { NO, BOW, NGRAM, ED };
+  enum class SubmodularNormalization { NO, BM25 };
 
   class FuzzyMatch
   {
@@ -78,7 +78,9 @@ namespace fuzzy
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
                float bm25_cutoff=0,
-               float shrinking_factor=1.f) const;
+               float shrinking_factor=1.f,
+               SubmodularFunction submod_fun=SubmodularFunction::NO,
+               SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
     bool match(const Sentence& real,
                const Tokens& pattern,
                float fuzzy,
@@ -95,7 +97,9 @@ namespace fuzzy
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
                float bm25_cutoff=0,
-               float shrinking_factor=1.f) const;
+               float shrinking_factor=1.f,
+               SubmodularFunction submod_fun=SubmodularFunction::NO,
+               SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
     /* simplified, include tokenization */
     bool match(const std::string &sentence,
                float fuzzy,
@@ -112,7 +116,9 @@ namespace fuzzy
                IndexType filter_type=IndexType::SUFFIX,
                int bm25_buffer=10,
                float bm25_cutoff=0,
-               float shrinking_factor=1.f) const;
+               float shrinking_factor=1.f,
+               SubmodularFunction submod_fun=SubmodularFunction::NO,
+               SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
     bool subsequence(const std::string &sentence,
                unsigned number_of_matches,
                bool no_perfect,
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index a1d8368..12be570 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -407,7 +407,9 @@ namespace fuzzy
                          IndexType filter_type,
                          int bm25_buffer,
                          float bm25_cutoff,
-                         float shrinking_factor) const {
+                         float shrinking_factor,
+                         SubmodularFunction submod_fun,
+                         SubmodularNormalization submod_norm) const {
 
     Sentence real;
     Tokens norm;
@@ -416,7 +418,7 @@ namespace fuzzy
                  min_subseq_length, min_subseq_ratio, vocab_idf_penalty,
                  edit_costs, contrastive_factor, reduce, contrast_buffer,
                  filter_type, bm25_buffer, bm25_cutoff,
-                 shrinking_factor);
+                 shrinking_factor, submod_fun, submod_norm);
   }
 
   /* backward compatibility */
@@ -435,14 +437,16 @@ namespace fuzzy
                     IndexType filter_type,
                     int bm25_buffer,
                     float bm25_cutoff,
-                    float shrinking_factor) const
+                    float shrinking_factor,
+                    SubmodularFunction submod_fun,
+                    SubmodularNormalization submod_norm) const
   {
     const Sentence real(pattern);
     return match(real, pattern, fuzzy, number_of_matches, false, matches,
                  min_subseq_length, min_subseq_ratio, vocab_idf_penalty,
                  edit_costs, contrastive_factor, reduce, contrast_buffer,
                  filter_type, bm25_buffer, bm25_cutoff,
-                 shrinking_factor);
+                 shrinking_factor, submod_fun, submod_norm);
   }
 
   /* check for the pattern in the suffix-array index SAI */ 
@@ -463,7 +467,9 @@ namespace fuzzy
                     IndexType filter_type,
                     int bm25_buffer,
                     float bm25_cutoff,
-                    float shrinking_factor) const
+                    float shrinking_factor,
+                    SubmodularFunction submod_fun,
+                    SubmodularNormalization submod_norm) const
   {
     size_t p_length = pattern.size();
     if (contrast_buffer == -1)
@@ -612,79 +618,45 @@ namespace fuzzy
     lowest_costs.push(std::numeric_limits<float>::max());
 
     unsigned cpt = 0;
-    // unsigned num_filtered = 0;
-
-    // ONLY N-grams
-    // for (const auto& pair : filter_matches->get_best_matches())
-    // {
-    //   const auto s_id = pair.first;
-    //   const auto longest_match = pair.second;
-    //   size_t s_length = 0;
-    //   const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-    //   Match m(sentence_wids, s_length);
-    //   m.score = (float)longest_match / (float)s_length;
-    //   m.max_subseq = longest_match;
-    //   m.s_id = s_id;
-    //   m.id = _filterIndex->id(s_id);
-    //   m.secondary_sort = s_id;
-    //   m.penalty = 0;
-    //   result.push(m);
-    //   cpt++;
-    //   if (cpt > contrast_buffer)
-    //     break;
-    // }
-
-    // ONLY BM25
-    // for (const auto& pair : filter_matches->get_best_matches())
-    // {
-    //   const auto s_id = pair.first;
-    //   const auto bm25_score = pair.second;
-    //   size_t s_length = 0;
-    //   const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-    //   Match m(sentence_wids, s_length);
-    //   m.score = (float)bm25_score / (float)1000.;
-    //   m.max_subseq = 0;
-    //   m.s_id = s_id;
-    //   m.id = _filterIndex->id(s_id);
-    //   m.secondary_sort = s_id;
-    //   m.penalty = 0;
-    //   result.push(m);
-    //   cpt++;
-    //   if (cpt > contrast_buffer)
-    //     break;
-    // }
-
-    // case S = x
-
-    ///////////// BM25 (BOW)
-    assert((filter_type == IndexType::BM25));
-    std::vector<std::pair<unsigned, int>> best_matches = filter_matches->get_best_matches();
-    BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
-    std::vector<unsigned> sorted_pattern_wids(pattern_wids);
-    std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
+      
     std::vector<unsigned> sorted_pattern_terms;
     std::vector<unsigned> count_terms;
-    sorted_pattern_terms.reserve(pattern_wids.size());
-    count_terms.reserve(pattern_wids.size());
-    if (sorted_pattern_wids.size() > 0)
+    std::vector<std::pair<unsigned, int>> best_matches = filter_matches->get_best_matches();
+
+    switch(submod_fun) // Salient aspects enumeration
     {
-      unsigned current_term = sorted_pattern_wids[0];
-      unsigned current_count = 1;
-      for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
+      case SubmodularFunction::BOW:
       {
-        if (current_term != sorted_pattern_wids[i])
+        std::vector<unsigned> sorted_pattern_wids(pattern_wids);
+        std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
+
+        sorted_pattern_terms.reserve(pattern_wids.size());
+        count_terms.reserve(pattern_wids.size());
+        if (sorted_pattern_wids.size() > 0)
         {
+          unsigned current_term = sorted_pattern_wids[0];
+          unsigned current_count = 1;
+          for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
+          {
+            if (current_term != sorted_pattern_wids[i])
+            {
+              sorted_pattern_terms.push_back(current_term);
+              count_terms.push_back(current_count);
+              current_term = sorted_pattern_wids[i];
+              current_count = 1;
+            }
+            else
+              current_count++;
+          }
           sorted_pattern_terms.push_back(current_term);
           count_terms.push_back(current_count);
-          current_term = sorted_pattern_wids[i];
-          current_count = 1;
         }
-        else
-          current_count++;
+        break;
       }
-      sorted_pattern_terms.push_back(current_term);
-      count_terms.push_back(current_count);
+      default:
+        ;
     }
+
     // std::cerr << "sorted unique terms" << ": ";
     // for (const auto& c : sorted_pattern_terms)
     //   std::cerr << c << ", ";
@@ -712,47 +684,60 @@ namespace fuzzy
 
       std::vector<float> s_cover;
       float score;
-      ///////////// ED
-      // /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
-      // if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
-      // {
-      //   const Costs costs(p_length, s_length, edit_costs);
-      //   /* let us check the candidates */
-      //   const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-      //   const auto cost_upper_bound = lowest_costs.top();
-      //   float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
-      //                               pattern_wids.data(), pattern_realtok, p_length,
-      //                               st, sn,
-      //                               idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-      //                               edit_costs,
-      //                               costs, cost_upper_bound);
-      //   // float cost = 0.1;
-      // if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-      //   continue;
-      //   float score = int(10000 - cost * 100) / 10000.0;
-
-      //   lowest_costs.push(cost);
-      //   if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-      //     lowest_costs.pop();
-      // }
-      // else 
-      // {
-      //   continue;
-      // }
-      /////////////
 
       if ((no_perfect && equal_arrays(s_length, p_length, sentence_wids, pattern_wids.data())))
         continue;
 
-      ///////////// BM25
-      score = (float)score_filter / 1000.f;
-      s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
-      // s_cover = std::vector<float>(sorted_pattern_terms.size(), 1);
-      /////////////
+      if (submod_norm == SubmodularNormalization::BM25)
+      {
+        score = (float)score_filter / 1000.f;
+        assert((filter_type == IndexType::BM25));
+        BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
+        s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
+        break;
+      }
+      else if (submod_norm == SubmodularNormalization::NO)
+      {
+        // score = 0;
+        // s_cover;
+        // TODO: function to compute those
+      }
+
+      switch(submod_fun) // salient aspect weighted cover
+      {
+        case SubmodularFunction::BOW:
+          ;
+
+        default:
+          const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
+          /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+          if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+          {
+            const Costs costs(p_length, s_length, edit_costs);
+            /* let us check the candidates */
+            const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+            const auto cost_upper_bound = lowest_costs.top();
+            float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
+                                        pattern_wids.data(), pattern_realtok, p_length,
+                                        st, sn,
+                                        idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+                                        edit_costs,
+                                        costs, cost_upper_bound);
+            // float cost = 0.1;
+          if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+            continue;
+            float score = int(10000 - cost * 100) / 10000.0;
 
-      ///////////// NGRAM
-      /* TODO */
-      /////////////
+            lowest_costs.push(cost);
+            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+              lowest_costs.pop();
+          }
+          else 
+          {
+            continue;
+          }
+
+      }
 
       if (score >= fuzzy) {
         Match m(sentence_wids, s_length);
@@ -779,7 +764,7 @@ namespace fuzzy
 
     if (shrinking_factor < 1.f) // submodular coverage
     {
-      std::vector<float> cover_weights(sorted_pattern_wids.size(), 1.f);
+      std::vector<float> cover_weights(sorted_pattern_terms.size(), 1.f);
       std::list<Match> candidates;
       while (!result.empty())
       {

From 334fba2ad84a64725f9061afbded5374b6f11da6 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 10 Jan 2024 14:14:15 +0100
Subject: [PATCH 06/15] cli structure

---
 src/fuzzy_match.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 12be570..a670040 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -698,6 +698,13 @@ namespace fuzzy
       }
       else if (submod_norm == SubmodularNormalization::NO)
       {
+        const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+        get_bow_score(
+          sorted_pattern_terms,
+          count_terms,
+          sentence_realtok,
+          s_length
+        )
         // score = 0;
         // s_cover;
         // TODO: function to compute those

From 3519f7896ca73b3219e6eda5a25a2dcea48a5478 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 10 Jan 2024 17:37:17 +0100
Subject: [PATCH 07/15] BOW | NO+IDF+BM25

---
 cli/src/FuzzyMatch-cli.cc    |  3 ++
 include/fuzzy/fuzzy_match.hh |  2 +-
 src/CMakeLists.txt           |  1 +
 src/fuzzy_match.cc           | 72 +++++++++++++++++++++++++-----------
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index b56403a..4d4cfab 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -464,8 +464,11 @@ int main(int argc, char** argv)
   else
     submodular_function = fuzzy::SubmodularFunction::NO;
   fuzzy::SubmodularNormalization submodular_normalization;
+  std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl;
   if (submodular_normalization_str == "BM25")
     submodular_normalization = fuzzy::SubmodularNormalization::BM25;
+  else if (submodular_normalization_str == "IDF")
+    submodular_normalization = fuzzy::SubmodularNormalization::IDF;
   else
     submodular_normalization = fuzzy::SubmodularNormalization::NO;
 #ifdef NO_EIGEN
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index fac23b6..5e0cb38 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -16,7 +16,7 @@ namespace fuzzy
 {
   enum class ContrastReduce { MEAN, MAX };
   enum class SubmodularFunction { NO, BOW, NGRAM, ED };
-  enum class SubmodularNormalization { NO, BM25 };
+  enum class SubmodularNormalization { NO, IDF, BM25 };
 
   class FuzzyMatch
   {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6e6c44c..b7a96fa 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,6 +14,7 @@ set(FUZZY_SOURCES
   index.cc
   no_filter.cc
   no_matches.cc
+  submodular.cc
 )
 if(MSVC)
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index a670040..9309d4f 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -622,6 +622,7 @@ namespace fuzzy
     std::vector<unsigned> sorted_pattern_terms;
     std::vector<unsigned> count_terms;
     std::vector<std::pair<unsigned, int>> best_matches = filter_matches->get_best_matches();
+    std::vector<float> norm_weight;
 
     switch(submod_fun) // Salient aspects enumeration
     {
@@ -656,18 +657,21 @@ namespace fuzzy
       default:
         ;
     }
-
-    // std::cerr << "sorted unique terms" << ": ";
-    // for (const auto& c : sorted_pattern_terms)
-    //   std::cerr << c << ", ";
-    // std::cerr << std::endl;
-    // std::cerr << "counts" << ": ";
-    // for (const auto& c : count_terms)
-    //   std::cerr << c << ", ";
-    // std::cerr << std::endl;
+    std::vector<float> sorted_pattern_terms_idf;
+    if (submod_norm == SubmodularNormalization::IDF)
+      sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
+
+    std::cerr << "sorted unique terms" << ": ";
+    for (const auto& c : sorted_pattern_terms)
+      std::cerr << c << ", ";
+    std::cerr << std::endl;
+    std::cerr << "Idf" << ": ";
+    for (const auto& c : sorted_pattern_terms_idf)
+      std::cerr << c << ", ";
+    std::cerr << std::endl;
     /////////////
 
-    // std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush;
+    std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush;
 
     for (const auto& pair : best_matches)
     {
@@ -680,7 +684,6 @@ namespace fuzzy
       //                                 ? pattern_coverage.count_covered_words(sentence_wids, s_length)
       //                                 : p_length);
       // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
-      // TODO: adapt to filter n-gram existence
 
       std::vector<float> s_cover;
       float score;
@@ -690,31 +693,58 @@ namespace fuzzy
 
       if (submod_norm == SubmodularNormalization::BM25)
       {
+        std::cerr << "BM25 norm..." << std::endl << std::flush;
         score = (float)score_filter / 1000.f;
         assert((filter_type == IndexType::BM25));
         BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
         s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
-        break;
       }
       else if (submod_norm == SubmodularNormalization::NO)
       {
-        const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+        std::cerr << "No norm..." << std::endl << std::flush;
         get_bow_score(
           sorted_pattern_terms,
           count_terms,
-          sentence_realtok,
-          s_length
-        )
-        // score = 0;
-        // s_cover;
-        // TODO: function to compute those
+          sentence_wids,
+          s_length,
+          score,
+          s_cover);
+      }
+      else if (submod_norm == SubmodularNormalization::IDF)
+      {
+        std::cerr << "IDF norm..." << std::endl << std::flush;
+        get_bow_score_idf(
+          sorted_pattern_terms,
+          count_terms,
+          sentence_wids,
+          s_length,
+          sorted_pattern_terms_idf,
+          score,
+          s_cover);
       }
+      // std::cerr << "q:       ";
+      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
+      //   std::cerr << sorted_pattern_terms[i] << ",";
+      // std::cerr << std::endl;
+      // std::cerr << "q count: ";
+      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
+      //   std::cerr << count_terms[i] << ",";
+      // std::cerr << std::endl;
+      std::cerr << "sent:    ";
+      for (unsigned i = 0; i < s_length; i++)
+        std::cerr << sentence_wids[i] << ",";
+      std::cerr << std::endl;
+      std::cerr << "cover:   ";
+      for (unsigned i = 0; i < s_cover.size(); i++)
+        std::cerr << s_cover[i] << ",";
+      std::cerr << std::endl;
+      std::cerr << "score:   " << score << std::endl;
+      std::cerr << "...done" << std::endl << std::flush;
 
       switch(submod_fun) // salient aspect weighted cover
       {
         case SubmodularFunction::BOW:
-          ;
-
+          break;
         default:
           const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
           /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */

From d55f369ea83b9b4b90474392d4ca644bad0ae86a Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Wed, 10 Jan 2024 17:37:55 +0100
Subject: [PATCH 08/15] forgotten files

---
 include/fuzzy/submodular.hh | 26 ++++++++++++++++
 src/submodular.cc           | 62 +++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 include/fuzzy/submodular.hh
 create mode 100644 src/submodular.cc

diff --git a/include/fuzzy/submodular.hh b/include/fuzzy/submodular.hh
new file mode 100644
index 0000000..64a5f7f
--- /dev/null
+++ b/include/fuzzy/submodular.hh
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <vector>
+#include <iostream>
+#include <memory>
+
+
+namespace fuzzy
+{
+   void get_bow_score(
+    std::vector<unsigned>& sorted_pattern_terms,
+    std::vector<unsigned>& count_terms,
+    const unsigned* sentence,
+    const unsigned sentence_length,
+    float& score,
+    std::vector<float>& cover);
+
+   void get_bow_score_idf(
+    std::vector<unsigned>& sorted_pattern_terms,
+    std::vector<unsigned>& count_terms,
+    const unsigned* sentence,
+    const unsigned sentence_length,
+    std::vector<float>& idf_penalty,
+    float& score,
+    std::vector<float>& cover);
+}
diff --git a/src/submodular.cc b/src/submodular.cc
new file mode 100644
index 0000000..81aadab
--- /dev/null
+++ b/src/submodular.cc
@@ -0,0 +1,62 @@
+#include <fuzzy/submodular.hh>
+#include <algorithm>
+#include <cmath>
+
+namespace fuzzy
+{
+    void get_bow_score(
+        std::vector<unsigned>& sorted_pattern_terms,
+        std::vector<unsigned>& count_terms,
+        const unsigned* sentence,
+        const unsigned sentence_length,
+        float& score,
+        std::vector<float>& cover)
+    {
+        std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
+        std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
+        cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
+        score = 0.f;
+        for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
+        {
+            while (
+                (i < sorted_pattern_terms.size()) && 
+                (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
+                i++;
+            // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << "  (" << i << ", " << j << ")" << std::endl;
+            if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
+                if ((float)count_terms[i] > cover[i] + 1e-6f)
+                {
+                    cover[i] += 1.f;
+                    score += 1.f;
+                }
+        }
+    }
+    void get_bow_score_idf(
+        std::vector<unsigned>& sorted_pattern_terms,
+        std::vector<unsigned>& count_terms,
+        const unsigned* sentence,
+        const unsigned sentence_length,
+        std::vector<float>& idf_penalty,
+        float& score,
+        std::vector<float>& cover)
+    {
+        std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
+        std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
+        cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
+        score = 0.f;
+        for (unsigned i, j, k = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
+        {
+            while (
+                (i < sorted_pattern_terms.size()) && 
+                (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
+                i++;
+            // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << "  (" << i << ", " << j << ")" << std::endl;
+            if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
+                if ((float)count_terms[i] > cover[i] / idf_penalty[i] + 1e-6f)
+                {
+                    cover[i] += idf_penalty[i];
+                    score += idf_penalty[i];
+                }
+        }
+    }
+}
\ No newline at end of file

From 83172ace2db7656cb2974f23288cdc7cd6966dc5 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Fri, 12 Jan 2024 13:42:26 +0100
Subject: [PATCH 09/15] ngram implementation

---
 cli/src/FuzzyMatch-cli.cc    |   2 +
 include/fuzzy/fuzzy_match.hh |   4 +
 include/fuzzy/submodular.hh  |  87 ++++++++++++++----
 include/fuzzy/submodular.hxx |  85 ++++++++++++++++++
 src/fuzzy_match.cc           | 170 ++++++++++++++++++++++-------------
 src/submodular.cc            | 164 ++++++++++++++++++++++++---------
 6 files changed, 396 insertions(+), 116 deletions(-)
 create mode 100644 include/fuzzy/submodular.hxx

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 4d4cfab..9dd1b23 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -459,6 +459,8 @@ int main(int argc, char** argv)
   fuzzy::SubmodularFunction submodular_function;
   if (submodular_function_str == "BOW")
     submodular_function = fuzzy::SubmodularFunction::BOW;
+  else if (submodular_function_str == "NGRAM")
+    submodular_function = fuzzy::SubmodularFunction::NGRAM;
   else if (submodular_function_str == "ED")
     submodular_function = fuzzy::SubmodularFunction::ED;
   else
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index 5e0cb38..3d4560f 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -6,6 +6,7 @@
 #include <fuzzy/index.hh>
 #include <fuzzy/sentence.hh>
 #include <fuzzy/edit_distance.hh>
+#include <fuzzy/submodular.hh>
 #include <memory>
 
 namespace onmt {
@@ -163,6 +164,9 @@ namespace fuzzy
     std::vector<float>
     compute_idf_penalty(const std::vector<unsigned int>& pattern_wids,
                         float unknown_vocab_word_penalty = 0) const;
+    std::vector<float>
+    compute_idf_penalty(const std::vector<NGram>& pattern_ngrams,
+                        float unknown_vocab_word_penalty = 0) const;
 
     /* penalty tokens */
     int                    _pt;
diff --git a/include/fuzzy/submodular.hh b/include/fuzzy/submodular.hh
index 64a5f7f..9bebe94 100644
--- a/include/fuzzy/submodular.hh
+++ b/include/fuzzy/submodular.hh
@@ -3,24 +3,79 @@
 #include <vector>
 #include <iostream>
 #include <memory>
+#include <algorithm>
 
 
 namespace fuzzy
 {
-   void get_bow_score(
-    std::vector<unsigned>& sorted_pattern_terms,
-    std::vector<unsigned>& count_terms,
-    const unsigned* sentence,
-    const unsigned sentence_length,
-    float& score,
-    std::vector<float>& cover);
-
-   void get_bow_score_idf(
-    std::vector<unsigned>& sorted_pattern_terms,
-    std::vector<unsigned>& count_terms,
-    const unsigned* sentence,
-    const unsigned sentence_length,
-    std::vector<float>& idf_penalty,
-    float& score,
-    std::vector<float>& cover);
+    class NGram // A run of _N consecutive token ids that begins at _start.
+    {
+        public:
+            NGram(const unsigned* start, unsigned N);
+            ~NGram() {} // empty body: the memory behind _start is not released here
+            NGram& operator=(const NGram& other);
+            bool operator==(NGram& other); // NOTE(review): non-const member taking a non-const reference, so it cannot be used on const NGrams
+            bool operator<(NGram& other); // NOTE(review): same const limitation as operator==; this is the ordering std::sort picks up
+            void print() const;
+            const unsigned* _start; // first token id; callers read _start[0] .. _start[_N - 1]
+            unsigned _N; // number of token ids in the n-gram
+    };
+
+    inline
+    std::vector<NGram> get_sorted_ngrams(
+        const unsigned N,
+        const unsigned* sentence,
+        const unsigned sentence_length);
+
+    template <typename T>
+    void get_unique_with_count(
+        std::vector<T>& sorted_salient,
+        std::vector<T>& unique,
+        std::vector<unsigned>& count);
+
+    template <typename T>
+    void get_score(
+        std::vector<T>& sorted_pattern_terms,
+        std::vector<T>& sorted_sentence_terms,
+        std::vector<unsigned>& count_terms,
+        float& score,
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty);
+
+    void get_bow_score(
+        std::vector<unsigned>& sorted_pattern_terms,
+        std::vector<unsigned>& count_terms,
+        const unsigned* sentence,
+        const unsigned sentence_length,
+        float& score,
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty);
+
+    void get_ngram_score(
+        std::vector<NGram>& sorted_pattern_terms,
+        const unsigned N,
+        std::vector<unsigned>& count_terms,
+        const unsigned* sentence,
+        const unsigned sentence_length,
+        float& score,
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty);
+
+    // void get_bow_score_idf(
+    //     std::vector<unsigned>& sorted_pattern_terms,
+    //     std::vector<unsigned>& count_terms,
+    //     const unsigned* sentence,
+    //     const unsigned sentence_length,
+    //     std::vector<float>& idf_penalty,
+    //     float& score,
+    //     std::vector<float>& cover);
+
+    void get_all_ngrams(
+        const unsigned* sequence,
+        const unsigned length,
+        const unsigned N,
+        std::vector<NGram>& ngrams,
+        std::vector<unsigned>& counts);
 }
+
+#include "submodular.hxx"
diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx
new file mode 100644
index 0000000..0353158
--- /dev/null
+++ b/include/fuzzy/submodular.hxx
@@ -0,0 +1,85 @@
+
+namespace fuzzy
+{
+    inline // Lists every n-gram of order 1..N_const found in sentence, sorted with NGram::operator<.
+    std::vector<NGram> get_sorted_ngrams(
+        const unsigned N_const,
+        const unsigned* sentence,
+        const unsigned sentence_length)
+    {
+        std::vector<NGram> all_ngrams;
+        const unsigned N = std::min(N_const, sentence_length); // no n-gram can be longer than the sentence
+        all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2); // = sum over n in 1..N of (sentence_length - n + 1)
+        for (unsigned n = 1; n <= N; n++)
+            for (unsigned i = 0; i < sentence_length - n + 1; i++) // n <= sentence_length here, so the bound cannot wrap
+                all_ngrams.push_back(NGram(sentence + i, n)); // presumably keeps the pointer rather than copying tokens - confirm in NGram's constructor
+        std::sort(all_ngrams.begin(), all_ngrams.end());
+        return all_ngrams;
+    }
+
+    // Scores a sentence against a pattern by merge-walking their term lists.
+    // Both lists are expected in ascending operator< order. On return cover[i]
+    // is the weight credited to pattern term i (at most count_terms[i] hits, each
+    // worth idf_penalty[i], or 1 when idf_penalty is empty); score is their sum.
+    template <typename T>
+    void get_score(
+        std::vector<T>& sorted_pattern_terms,
+        std::vector<T>& sorted_sentence_terms,
+        std::vector<unsigned>& count_terms,
+        float& score,
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty)
+    {
+        cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
+        score = 0.f;
+        const bool weighted = idf_penalty.size() > 0;
+        // Both cursors are explicitly zeroed; reading them uninitialized is UB.
+        for (unsigned i = 0, j = 0; j < sorted_sentence_terms.size(); j++)
+        {
+            while ((i < sorted_pattern_terms.size()) &&
+                   (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
+                i++;
+            if (i >= sorted_pattern_terms.size())
+                break; // pattern exhausted: reading element i would be out of bounds
+            if (!(sorted_pattern_terms[i] == sorted_sentence_terms[j]))
+                continue;
+            const float weight = weighted ? idf_penalty[i] : 1.f;
+            // cover[i] / weight is the number of hits already credited to term i.
+            const float hits = weighted ? cover[i] / idf_penalty[i] : cover[i];
+            if ((float)count_terms[i] > hits + 1e-6f)
+            {
+                cover[i] += weight;
+                score += weight;
+            }
+        }
+    }
+
+    // Collapses an ascending list into its distinct values and multiplicities:
+    // appends each distinct element of sorted_salient to unique and the length
+    // of its run of equal elements to count (equality is tested with operator==).
+    template <typename T>
+    void get_unique_with_count(
+        std::vector<T>& sorted_salient,
+        std::vector<T>& unique,
+        std::vector<unsigned>& count)
+    {
+        unique.reserve(sorted_salient.size());
+        count.reserve(sorted_salient.size());
+        if (sorted_salient.empty())
+          return;
+        // Track the current run by index rather than through a T& bound to
+        // sorted_salient[0]: assigning through such a reference overwrites the
+        // caller's first element.
+        unsigned run_start = 0;
+        for (unsigned i = 1; i < sorted_salient.size(); i++)
+        {
+          if (sorted_salient[run_start] == sorted_salient[i])
+            continue;
+          unique.push_back(sorted_salient[run_start]);
+          count.push_back(i - run_start);
+          run_start = i;
+        }
+        unique.push_back(sorted_salient[run_start]);
+        count.push_back((unsigned)sorted_salient.size() - run_start);
+    }
+}
\ No newline at end of file
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 9309d4f..3b6a60a 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -391,6 +391,29 @@ namespace fuzzy
     return idf_penalty;
   }
 
+  std::vector<float> FuzzyMatch::compute_idf_penalty(const std::vector<NGram>& pattern_ngrams,
+                                                     float unknown_vocab_word_penalty) const { // one penalty per n-gram: the mean of its tokens' log(num_sentences / sentence frequency)
+    std::vector<float> idf_penalty;
+    idf_penalty.reserve(pattern_ngrams.size());
+
+    const unsigned num_sentences = _filterIndex->get_Filter().num_sentences();
+
+    const std::vector<unsigned>& word_frequency_in_sentences = _filterIndex->get_VocabIndexer().getSFreq(); // indexed by token id below; presumably per-token sentence counts - confirm in VocabIndexer
+    unsigned n;
+    float tot_idf;
+    for (const NGram& ngram : pattern_ngrams) {
+      for (n = 0, tot_idf = 0.f; n < ngram._N; n++)
+      {
+        if (ngram._start[n] != fuzzy::VocabIndexer::VOCAB_UNK)
+          tot_idf += std::log((float)num_sentences/(float)word_frequency_in_sentences[ngram._start[n]]);
+        else
+          tot_idf += unknown_vocab_word_penalty; // unknown tokens contribute the caller-supplied penalty instead of an IDF term
+      }
+      idf_penalty.push_back(tot_idf / (float)n); // n == ngram._N here; NOTE(review): an n-gram with _N == 0 would compute 0 / 0
+    }
+    return idf_penalty;
+  } 
+
   /* interface with integrated tokenization */
   bool FuzzyMatch::match(const std::string &sentence,
                          float fuzzy,
@@ -620,46 +643,62 @@ namespace fuzzy
     unsigned cpt = 0;
       
     std::vector<unsigned> sorted_pattern_terms;
+    std::vector<NGram> sorted_pattern_ngrams;
     std::vector<unsigned> count_terms;
     std::vector<std::pair<unsigned, int>> best_matches = filter_matches->get_best_matches();
     std::vector<float> norm_weight;
+    std::vector<float> sorted_pattern_terms_idf;
 
-    switch(submod_fun) // Salient aspects enumeration
+    /* Salient aspects enumeration */
+    switch(submod_fun)
     {
       case SubmodularFunction::BOW:
       {
         std::vector<unsigned> sorted_pattern_wids(pattern_wids);
         std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
 
-        sorted_pattern_terms.reserve(pattern_wids.size());
-        count_terms.reserve(pattern_wids.size());
-        if (sorted_pattern_wids.size() > 0)
-        {
-          unsigned current_term = sorted_pattern_wids[0];
-          unsigned current_count = 1;
-          for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
-          {
-            if (current_term != sorted_pattern_wids[i])
-            {
-              sorted_pattern_terms.push_back(current_term);
-              count_terms.push_back(current_count);
-              current_term = sorted_pattern_wids[i];
-              current_count = 1;
-            }
-            else
-              current_count++;
-          }
-          sorted_pattern_terms.push_back(current_term);
-          count_terms.push_back(current_count);
-        }
-        break;
+        get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms);
+
+        if (submod_norm == SubmodularNormalization::IDF)
+          sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
+
+        // sorted_pattern_terms.reserve(pattern_wids.size());
+        // count_terms.reserve(pattern_wids.size());
+        // if (sorted_pattern_wids.size() > 0)
+        // {
+        //   unsigned current_term = sorted_pattern_wids[0];
+        //   unsigned current_count = 1;
+        //   for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
+        //   {
+        //     if (current_term != sorted_pattern_wids[i])
+        //     {
+        //       sorted_pattern_terms.push_back(current_term);
+        //       count_terms.push_back(current_count);
+        //       current_term = sorted_pattern_wids[i];
+        //       current_count = 1;
+        //     }
+        //     else
+        //       current_count++;
+        //   }
+        //   sorted_pattern_terms.push_back(current_term);
+        //   count_terms.push_back(current_count);
+        // }
+        break; // BOW setup ends here; without this the case falls through into the NGRAM case below
+      }
+      case SubmodularFunction::NGRAM:
+      {
+        get_all_ngrams(
+          pattern_wids.data(),
+          p_length,
+          4,
+          sorted_pattern_ngrams,
+          count_terms);
+        if (submod_norm == SubmodularNormalization::IDF)
+          sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_ngrams);
       }
       default:
         ;
     }
-    std::vector<float> sorted_pattern_terms_idf;
-    if (submod_norm == SubmodularNormalization::IDF)
-      sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
 
     std::cerr << "sorted unique terms" << ": ";
     for (const auto& c : sorted_pattern_terms)
@@ -699,28 +738,41 @@ namespace fuzzy
         BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
         s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
       }
-      else if (submod_norm == SubmodularNormalization::NO)
+      // else if (submod_norm == SubmodularNormalization::NO)
+      // {
+      //   std::cerr << "No norm..." << std::endl << std::flush;
+      //   get_bow_score(
+      //     sorted_pattern_terms,
+      //     count_terms,
+      //     sentence_wids,
+      //     s_length,
+      //     score,
+      //     s_cover);
+      // }
+      else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO)
       {
-        std::cerr << "No norm..." << std::endl << std::flush;
-        get_bow_score(
-          sorted_pattern_terms,
-          count_terms,
-          sentence_wids,
-          s_length,
-          score,
-          s_cover);
-      }
-      else if (submod_norm == SubmodularNormalization::IDF)
-      {
-        std::cerr << "IDF norm..." << std::endl << std::flush;
-        get_bow_score_idf(
-          sorted_pattern_terms,
-          count_terms,
-          sentence_wids,
-          s_length,
-          sorted_pattern_terms_idf,
-          score,
-          s_cover);
+        std::cerr 
+          << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO")
+          << " norm..." << std::endl << std::flush;
+        if (submod_fun == SubmodularFunction::BOW)
+          get_bow_score(
+            sorted_pattern_terms,
+            count_terms,
+            sentence_wids,
+            s_length,
+            score,
+            s_cover,
+            sorted_pattern_terms_idf);
+        else if (submod_fun == SubmodularFunction::NGRAM)
+          get_ngram_score(
+            sorted_pattern_ngrams,
+            4,
+            count_terms,
+            sentence_wids,
+            s_length,
+            score,
+            s_cover,
+            sorted_pattern_terms_idf);
       }
       // std::cerr << "q:       ";
       // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
@@ -744,6 +796,7 @@ namespace fuzzy
       switch(submod_fun) // salient aspect weighted cover
       {
         case SubmodularFunction::BOW:
+        case SubmodularFunction::NGRAM:
           break;
         default:
           const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
@@ -763,17 +816,16 @@ namespace fuzzy
             // float cost = 0.1;
           if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
             continue;
-            float score = int(10000 - cost * 100) / 10000.0;
+          score = int(10000 - cost * 100) / 10000.0; // assign the loop-level score: redeclaring it here shadowed the variable tested by 'if (score >= fuzzy)' after the switch
 
-            lowest_costs.push(cost);
-            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-              lowest_costs.pop();
+          lowest_costs.push(cost);
+          if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+            lowest_costs.pop();
           }
           else 
           {
             continue;
           }
-
       }
 
       if (score >= fuzzy) {
@@ -801,13 +853,11 @@ namespace fuzzy
 
     if (shrinking_factor < 1.f) // submodular coverage
     {
-      std::vector<float> cover_weights(sorted_pattern_terms.size(), 1.f);
+      std::vector<float> cover_weights(count_terms.size(), 1.f);
       std::list<Match> candidates;
       while (!result.empty())
       {
         auto match = result.top();
-        // match.penalty = match.score; // initialize corr. to weights of 1
-        // match.penalty = match.score; // initialize corr. to weights of 1
         candidates.push_back(match);
         result.pop();
       }
@@ -820,20 +870,20 @@ namespace fuzzy
         for (Match &match : candidates)
         {
           float rescore = 0.f;
-          // std::cerr << "rescore " << match.s_id << " : (";
+          std::cerr << "rescore " << match.s_id << " : (";
           for (unsigned i = 0; i < cover_weights.size(); i++)
           {  
             rescore += cover_weights[i] * match.cover[i];
-            // if (match.cover[i] != 0) 
-            //   std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
+            if (match.cover[i] != 0) 
+              std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
           }
           
-          // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
+          std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
           match.penalty = rescore;
         }
         auto it_max = std::max_element(candidates.begin(), candidates.end(), comp);
         matches.push_back(*it_max);
-        // std::cerr << "choose No " << it_max->s_id << std::endl;
+        std::cerr << "choose No " << it_max->s_id << std::endl;
         // update cover_weights
         for (unsigned i = 0; i < cover_weights.size(); i++)
           if (it_max->cover[i] > 0)
diff --git a/src/submodular.cc b/src/submodular.cc
index 81aadab..fcaf2a8 100644
--- a/src/submodular.cc
+++ b/src/submodular.cc
@@ -1,62 +1,146 @@
 #include <fuzzy/submodular.hh>
-#include <algorithm>
 #include <cmath>
+#include <iostream>
 
 namespace fuzzy
 {
+    NGram::NGram(const unsigned* start, unsigned N) : // stores the pointer and the length; no ids are copied here
+        _start(start),
+        _N(N) {}
+    NGram& NGram::operator=(const NGram& other) { // copies the pointer and the length of other
+        if (this != &other)
+        {
+            _start = other._start;
+            _N = other._N;
+        }
+        return *this;
+    }
+    bool NGram::operator==(NGram& other) // true when both lengths match and all _N ids are pairwise equal
+    {
+        bool out = _N == other._N;
+        for (unsigned i = 0; out && (i < _N); i++) // stops at the first mismatch (out becomes false)
+            out = (other._start[i] == _start[i]);
+        return out;
+    }
+    bool NGram::operator<(NGram& other) // ordering: shorter n-grams first, then lexicographic order of the ids
+    {
+        if (_N != other._N)
+            return _N < other._N;
+        return std::lexicographical_compare(
+            _start, _start + _N, 
+            other._start, other._start + other._N);
+    }
+    void NGram::print() const // debug helper: writes each id followed by ',' and then a tab to std::cerr
+    {
+        for (unsigned i = 0; i < _N; i++)
+            std::cerr << _start[i] << ",";
+        std::cerr << '\t';
+    }
+
     void get_bow_score(
         std::vector<unsigned>& sorted_pattern_terms,
         std::vector<unsigned>& count_terms,
         const unsigned* sentence,
         const unsigned sentence_length,
         float& score,
-        std::vector<float>& cover)
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty)
     {
         std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
         std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
-        cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
-        score = 0.f;
-        for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
-        {
-            while (
-                (i < sorted_pattern_terms.size()) && 
-                (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
-                i++;
-            // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << "  (" << i << ", " << j << ")" << std::endl;
-            if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
-                if ((float)count_terms[i] > cover[i] + 1e-6f)
-                {
-                    cover[i] += 1.f;
-                    score += 1.f;
-                }
-        }
+        get_score(sorted_pattern_terms, sorted_sentence_terms, count_terms, score, cover, idf_penalty);
     }
-    void get_bow_score_idf(
-        std::vector<unsigned>& sorted_pattern_terms,
+
+    void get_ngram_score(
+        std::vector<NGram>& sorted_pattern_terms,
+        const unsigned N,
         std::vector<unsigned>& count_terms,
         const unsigned* sentence,
         const unsigned sentence_length,
-        std::vector<float>& idf_penalty,
         float& score,
-        std::vector<float>& cover)
+        std::vector<float>& cover,
+        std::vector<float>& idf_penalty)
     {
-        std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
-        std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
-        cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
-        score = 0.f;
-        for (unsigned i, j, k = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
-        {
-            while (
-                (i < sorted_pattern_terms.size()) && 
-                (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
-                i++;
-            // std::cerr << sorted_pattern_terms[i] << " ?= " << sorted_sentence_terms[j] << "  (" << i << ", " << j << ")" << std::endl;
-            if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
-                if ((float)count_terms[i] > cover[i] / idf_penalty[i] + 1e-6f)
-                {
-                    cover[i] += idf_penalty[i];
-                    score += idf_penalty[i];
-                }
-        }
+        std::cerr << "avant " << sentence_length << std::endl;
+        std::vector<NGram> all_ngrams = get_sorted_ngrams(N, sentence, sentence_length);
+        std::cerr << "apres" << std::endl;
+        // all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2);
+        // for (unsigned n = 1; n <= N; n++)
+        //     for (unsigned i = 0; i < sentence_length - n + 1; i++)
+        //         all_ngrams.push_back(NGram(sentence + i, n));
+
+        // std::sort(all_ngrams.begin(), all_ngrams.end());
+
+        for (const NGram& ngram : all_ngrams)
+            ngram.print();
+        std::cerr << std::endl;
+
+        get_score(sorted_pattern_terms, all_ngrams, count_terms, score, cover, idf_penalty);
+        std::cerr << "xxx" << std::endl;
+    }
+
+    // void get_bow_score(
+    //     std::vector<unsigned>& sorted_pattern_terms,
+    //     std::vector<unsigned>& count_terms,
+    //     const unsigned* sentence,
+    //     const unsigned sentence_length,
+    //     float& score,
+    //     std::vector<float>& cover)
+    // {
+    //     std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
+    //     std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
+    //     cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
+    //     score = 0.f;
+    //     for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
+    //     {
+    //         while (
+    //             (i < sorted_pattern_terms.size()) && 
+    //             (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
+    //             i++;
+    //         if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
+    //             if ((float)count_terms[i] > cover[i] + 1e-6f)
+    //             {
+    //                 cover[i] += 1.f;
+    //                 score += 1.f;
+    //             }
+    //     }
+    // }
+
+    void get_all_ngrams(
+        const unsigned* sequence,
+        const unsigned length,
+        const unsigned N,
+        std::vector<NGram>& ngrams,
+        std::vector<unsigned>& counts)
+    {
+        std::vector<NGram> all_ngrams = get_sorted_ngrams(N, sequence, length);
+        // std::vector<NGram> all_ngrams;
+        // all_ngrams.reserve(N * length - N * (N - 1) / 2);
+        // for (unsigned n = 1; n <= N; n++)
+        //     for (unsigned i = 0; i < length - n + 1; i++)
+        //         all_ngrams.push_back(NGram(sequence + i, n));
+        // // std::cerr << std::endl;
+        // std::sort(all_ngrams.begin(), all_ngrams.end());
+        for (const NGram& ngram : all_ngrams)
+            ngram.print();
+        std::cerr << std::endl;
+
+        get_unique_with_count(all_ngrams, ngrams, counts);
+        // std::cerr << "ALL" << std::endl;
+        // for (unsigned i = 0; i < ngrams.size(); i++)
+        // {
+        //     ngrams[i].print();
+        //     std::cerr << counts[i] << std::endl;
+        // }
+        // std::cerr << std::endl;
+        // std::cerr << "ONLY > 1" << std::endl;
+        // for (unsigned i = 0; i < ngrams.size(); i++)
+        //     if (counts[i] > 1)
+        //     {
+        //         ngrams[i].print();
+        //         std::cerr << counts[i] << std::endl;
+        //     }
+        // std::cerr << std::endl;
+        
     }
 }
\ No newline at end of file

From 6021f02c486b9c2d1c80dbb1976c6608b86ad695 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Fri, 12 Jan 2024 16:32:24 +0100
Subject: [PATCH 10/15] ngram implementation

---
 src/fuzzy_match.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 3b6a60a..27dea52 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -403,12 +403,10 @@ namespace fuzzy
     float tot_idf;
     for (const NGram& ngram : pattern_ngrams) {
       for (n = 0, tot_idf = 0.f; n < ngram._N; n++)
-      {
         if (ngram._start[n] != fuzzy::VocabIndexer::VOCAB_UNK)
           tot_idf += std::log((float)num_sentences/(float)word_frequency_in_sentences[ngram._start[n]]);
         else
           tot_idf += unknown_vocab_word_penalty;
-      }
       idf_penalty.push_back(tot_idf / (float)n);
     }
     return idf_penalty;

From f1f92e5c2a08b1f7623e22daf7e80c88432cc729 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Fri, 12 Jan 2024 18:09:25 +0100
Subject: [PATCH 11/15] ed implementation

---
 include/fuzzy/edit_distance.hh |  12 +++
 src/edit_distance.cc           | 109 ++++++++++++++++++++++
 src/fuzzy_match.cc             | 163 ++++++++++++++++++---------------
 src/submodular.cc              |  68 --------------
 4 files changed, 211 insertions(+), 141 deletions(-)

diff --git a/include/fuzzy/edit_distance.hh b/include/fuzzy/edit_distance.hh
index eff7ab9..aa88717 100644
--- a/include/fuzzy/edit_distance.hh
+++ b/include/fuzzy/edit_distance.hh
@@ -1,6 +1,8 @@
 #pragma once
 
 #include <limits>
+#include <algorithm>
+#include <iostream>
 
 #include <fuzzy/sentence.hh>
 #include <fuzzy/costs.hh>
@@ -21,6 +23,16 @@ namespace fuzzy
                        const EditCosts& edit_costs,
                        const Costs& costs,
                        float max_fuzzyness = std::numeric_limits<float>::max());
+
+  float _edit_distance_cover(const unsigned* thes, const Sentence &reals, int slen, // first sequence: word ids, Sentence, number of words
+                             const unsigned* thep, const Tokens &realptok, int plen, // second sequence: word ids, real tokens, number of words
+                             const std::vector<const char*>& st, const std::vector<int>& sn, // per-position itok data of the second sequence, as consumed by _edit_distance_char
+                             const std::vector<float> &idf_penalty, float idf_weight, // idf_penalty is indexed by second-sequence position; idf_weight == 0 disables the IDF term
+                             const EditCosts&,
+                             const Costs&,
+                             std::vector<float>& cover, // out: written at second-sequence positions matched at zero diff cost; not resized by the function
+                             const bool idf_cover = false, // true: matched positions receive idf_penalty[j]; false: they receive 1.f
+                             float max_fuzziness = std::numeric_limits<float>::max()); // early-exit threshold on the per-row minimum distance
 }
 
 #include <fuzzy/edit_distance.hxx>
diff --git a/src/edit_distance.cc b/src/edit_distance.cc
index 2d72d41..30c3976 100644
--- a/src/edit_distance.cc
+++ b/src/edit_distance.cc
@@ -120,4 +120,113 @@ namespace fuzzy
     }
     return arr[n1][n2];
   }
+
+  float // returns arr[n1][n2], or the minimum of the first row whose minimum exceeds max_fuzzyness
+  _edit_distance_cover(const unsigned* s1, const Sentence &real1, int n1, // sequence 1: word ids, Sentence, number of words
+                       const unsigned* s2, const Tokens &real2tok, int n2, // sequence 2: word ids, real tokens, number of words
+                       const std::vector<const char*>& st2, const std::vector<int>& sn2, // itok data of sequence 2, indexed 0..n2
+                       const std::vector<float> &idf_penalty, float idf_weight, // idf_penalty is read at [j-1] for sequence-2 word j
+                       const EditCosts& edit_costs,
+                       const Costs& costs,
+                       std::vector<float>& cover, // out: indexed by sequence-2 position (0..n2-1); never resized here
+                       const bool idf_cover, // true: matched positions receive idf_penalty[j]; false: 1.f
+                       float max_fuzzyness) // early-exit threshold compared against each row minimum
+  {
+    boost::multi_array<float, 2> arr(boost::extents[n1+1][n2+1]); // arr[i][j]: DP cost for the first i words of s1 vs. the first j words of s2
+    boost::multi_array<unsigned, 2> traceback(boost::extents[n1+1][n2+1]); // move stored per cell: 0 = from [i-1][j], 1 = from [i][j-1], 2 = diagonal, 3 = diagonal with diff == 0
+    boost::multi_array<int, 2> cost_tag(boost::extents[n1+1][n2+1]); // _edit_distance_char result for the itoks at positions i and j
+    /* idf_penalty(w) = log(number of sequences / number of occurrences of w) */
+    /* idf_weight = weight * costs.diff_word / log(number of sequences); a value of 0 disables the IDF term */
+
+    std::vector<const char*> st1(n1+1, nullptr);
+    std::vector<int> sn1(n1+1, 0);
+    real1.get_itoks(st1, sn1); // fills the itok arrays of sequence 1 (counterpart of st2/sn2)
+    Tokens real1tok = (Tokens)real1;
+
+    /* we have a fixed cost corresponding to trailing penalty_tokens */
+    arr[0][0] = _edit_distance_char(st1[n1], sn1[n1], st2[n2], sn2[n2]);
+    cost_tag[0][0] = _edit_distance_char(st1[0], sn1[0], st2[0], sn2[0]);
+
+    for (int i = 1; i < n1 + 1; i++) {
+      /* initialize distance source side (real1) */
+      arr[i][0] = arr[i-1][0] + costs.diff_word * edit_costs.delete_cost + sn1[i];
+      traceback[i][0] = 0; // first column: the traceback can only move up
+      cost_tag[i][0] = _edit_distance_char(st1[i], sn1[i], st2[0], sn2[0]);
+    }
+    for (int j = 1; j < n2 + 1; j++) {
+      /* initialize distance target side (real2tok) */
+      arr[0][j] = arr[0][j-1] + costs.diff_word * edit_costs.insert_cost + sn2[j];
+      traceback[0][j] = 1; // first row: the traceback can only move left
+      if (idf_weight)
+        arr[0][j] += idf_penalty[j-1] * idf_weight;
+      cost_tag[0][j] = _edit_distance_char(st1[0], sn1[0], st2[j], sn2[j]);
+    }
+
+    for (int i = 1; i < n1 + 1; i++)
+    {
+      float min = std::numeric_limits<float>::max(); // smallest distance seen in row i
+      for (int j = 1; j < n2 + 1; j++)
+      {
+        float diff = 0.f; // substitution cost; stays 0 when word ids and real tokens are both equal
+        float penalty_j1 = 0.f; // IDF term for sequence-2 word j (0 when idf_weight is 0)
+        if (idf_weight)
+          penalty_j1 = idf_penalty[j-1] * idf_weight;
+        if (s1[i-1] != s2[j-1]) {
+          diff = edit_costs.replace_cost * costs.diff_word + penalty_j1;
+        }
+        else if (real1tok[i-1] != real2tok[j-1]) {
+          /* is difference only a case difference */
+          if (strchr("LUMC", real1tok[i-1][0]))
+            diff = edit_costs.replace_cost * costs.diff_case;
+          else {
+            diff = edit_costs.replace_cost * costs.diff_real;
+          }
+        }
+
+        cost_tag[i][j] = _edit_distance_char(st1[i], sn1[i], st2[j], sn2[j]);
+        const auto previous = { // candidate costs, in the order of the traceback codes 0, 1, 2
+          arr[i - 1][j] + edit_costs.delete_cost * costs.diff_word + cost_tag[i - 1][j],
+          arr[i][j - 1] + edit_costs.insert_cost * costs.diff_word + cost_tag[i][j - 1] + penalty_j1,
+          arr[i - 1][j - 1] + diff + cost_tag[i - 1][j - 1]
+        };
+        const auto min_it = std::min_element(std::begin(previous), std::end(previous)); // on ties the earliest candidate wins
+        unsigned argmin = std::distance(std::begin(previous), min_it);
+        const auto distance = *min_it;
+        if (argmin == 2 && diff == 0.f) // diagonal move with no substitution cost: recorded as code 3
+          argmin = 3;
+        traceback[i][j] = argmin; // remembered for the cover traceback after the table is filled
+
+        arr[i][j] = distance;
+        min = std::min(min, distance);
+      }
+      if (min > max_fuzzyness) // the whole row exceeds the threshold: stop early
+        return min; // cover is not written on this path
+    }
+    int i = n1; // traceback starts at the bottom-right cell
+    int j = n2;
+    while (i != 0 || j != 0) // walk back to [0][0]; only code 3 writes into cover
+      switch(traceback[i][j])
+      {
+        case 0: // came from [i-1][j]
+          i--;
+          break;
+        case 1: // came from [i][j-1]
+          j--;
+          break;
+        case 2: // diagonal move with a non-zero diff: no cover entry is written
+          i--;
+          j--;
+          break;
+        case 3:
+          i--;
+          j--;
+          std::cerr << "idf cover " << j << "  " << idf_penalty.size() << std::endl << std::flush;
+          cover[j] = idf_cover ? idf_penalty[j] : 1.f;
+          break;
+        default:
+          ; // no other code is stored in traceback by the loops above
+      };
+
+    return arr[n1][n2]; // total cost for the two full sequences
+  }
 }
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 27dea52..f1d394d 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -652,6 +652,7 @@ namespace fuzzy
     {
       case SubmodularFunction::BOW:
       {
+        // all terms bow
         std::vector<unsigned> sorted_pattern_wids(pattern_wids);
         std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
 
@@ -659,32 +660,11 @@ namespace fuzzy
 
         if (submod_norm == SubmodularNormalization::IDF)
           sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
-
-        // sorted_pattern_terms.reserve(pattern_wids.size());
-        // count_terms.reserve(pattern_wids.size());
-        // if (sorted_pattern_wids.size() > 0)
-        // {
-        //   unsigned current_term = sorted_pattern_wids[0];
-        //   unsigned current_count = 1;
-        //   for (unsigned i = 1; i < sorted_pattern_wids.size(); i++)
-        //   {
-        //     if (current_term != sorted_pattern_wids[i])
-        //     {
-        //       sorted_pattern_terms.push_back(current_term);
-        //       count_terms.push_back(current_count);
-        //       current_term = sorted_pattern_wids[i];
-        //       current_count = 1;
-        //     }
-        //     else
-        //       current_count++;
-        //   }
-        //   sorted_pattern_terms.push_back(current_term);
-        //   count_terms.push_back(current_count);
-        // }
-        // break;
+        break;
       }
       case SubmodularFunction::NGRAM:
       {
+        // all ngrams bow
         get_all_ngrams(
           pattern_wids.data(),
           p_length,
@@ -693,6 +673,15 @@ namespace fuzzy
           count_terms);
         if (submod_norm == SubmodularNormalization::IDF)
           sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_ngrams);
+        break;
+      }
+      case SubmodularFunction::ED:
+      {
+        // sentence indices
+        sorted_pattern_terms = std::vector<unsigned>(pattern_wids);
+        if (submod_norm == SubmodularNormalization::IDF)
+          sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
+        break;
       }
       default:
         ;
@@ -708,8 +697,6 @@ namespace fuzzy
     std::cerr << std::endl;
     /////////////
 
-    std::cerr << std::endl << "num best match after bm25 = " << best_matches.size() << std::endl << std::flush;
-
     for (const auto& pair : best_matches)
     {
       // num_filtered++;
@@ -717,10 +704,6 @@ namespace fuzzy
       const auto score_filter = pair.second;
       size_t s_length = 0;
       const auto* sentence_wids = _filterIndex->get_Filter().get_sentence(s_id, &s_length);
-      // const auto num_covered_words = (score_filter < p_length
-      //                                 ? pattern_coverage.count_covered_words(sentence_wids, s_length)
-      //                                 : p_length);
-      // const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
 
       std::vector<float> s_cover;
       float score;
@@ -736,17 +719,6 @@ namespace fuzzy
         BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
         s_cover = bm25Matches.cover(sorted_pattern_terms, count_terms, s_id);
       }
-      // else if (submod_norm == SubmodularNormalization::NO)
-      // {
-      //   std::cerr << "No norm..." << std::endl << std::flush;
-      //   get_bow_score(
-      //     sorted_pattern_terms,
-      //     count_terms,
-      //     sentence_wids,
-      //     s_length,
-      //     score,
-      //     s_cover);
-      // }
       else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO)
       {
         std::cerr 
@@ -771,32 +743,43 @@ namespace fuzzy
             score,
             s_cover,
             sorted_pattern_terms_idf);
-      }
-      // std::cerr << "q:       ";
-      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
-      //   std::cerr << sorted_pattern_terms[i] << ",";
-      // std::cerr << std::endl;
-      // std::cerr << "q count: ";
-      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
-      //   std::cerr << count_terms[i] << ",";
-      // std::cerr << std::endl;
-      std::cerr << "sent:    ";
-      for (unsigned i = 0; i < s_length; i++)
-        std::cerr << sentence_wids[i] << ",";
-      std::cerr << std::endl;
-      std::cerr << "cover:   ";
-      for (unsigned i = 0; i < s_cover.size(); i++)
-        std::cerr << s_cover[i] << ",";
-      std::cerr << std::endl;
-      std::cerr << "score:   " << score << std::endl;
-      std::cerr << "...done" << std::endl << std::flush;
-
-      switch(submod_fun) // salient aspect weighted cover
-      {
-        case SubmodularFunction::BOW:
-        case SubmodularFunction::NGRAM:
-          break;
-        default:
+        else if (submod_fun == SubmodularFunction::ED)
+        { 
+          const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
+          /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+          if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+          {
+            const Costs costs(p_length, s_length, edit_costs);
+            /* let us check the candidates */
+            const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+            const auto cost_upper_bound = lowest_costs.top();
+            s_cover = std::vector<float>(p_length, 0.f);
+            if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF)
+              idf_penalty = compute_idf_penalty(pattern_wids);
+            float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length,
+                                              pattern_wids.data(), pattern_realtok, p_length,
+                                              st, sn,
+                                              idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+                                              edit_costs,
+                                              costs, 
+                                              s_cover,
+                                              submod_norm == SubmodularNormalization::IDF,
+                                              cost_upper_bound);
+            if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+              continue;
+            float score = int(10000 - cost * 100) / 10000.0;
+
+            lowest_costs.push(cost);
+            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+              lowest_costs.pop();
+          }
+          else 
+          {
+            continue;
+          }
+        }
+        else
+        {
           const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
           /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
           if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
@@ -812,20 +795,52 @@ namespace fuzzy
                                         edit_costs,
                                         costs, cost_upper_bound);
             // float cost = 0.1;
-          if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-            continue;
-          float score = int(10000 - cost * 100) / 10000.0;
+            if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+              continue;
+            score = int(10000 - cost * 100) / 10000.0;
 
-          lowest_costs.push(cost);
-          if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-            lowest_costs.pop();
+            lowest_costs.push(cost);
+            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+              lowest_costs.pop();
           }
           else 
           {
             continue;
           }
+        }
       }
 
+      // switch(submod_fun) 
+      // {
+      //   case SubmodularFunction::BOW:
+      //   case SubmodularFunction::NGRAM:
+      //     break;
+      //   case SubmodularFunction::ED:
+        
+      //   default:
+          
+      // }
+
+      // std::cerr << "q:       ";
+      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
+      //   std::cerr << sorted_pattern_terms[i] << ",";
+      // std::cerr << std::endl;
+      // std::cerr << "q count: ";
+      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
+      //   std::cerr << count_terms[i] << ",";
+      // std::cerr << std::endl;
+      std::cerr << "sent:    ";
+      for (unsigned i = 0; i < s_length; i++)
+        std::cerr << sentence_wids[i] << ",";
+      std::cerr << std::endl;
+      std::cerr << "cover:   ";
+      for (unsigned i = 0; i < s_cover.size(); i++)
+        std::cerr << s_cover[i] << ",";
+      std::cerr << std::endl;
+      std::cerr << "score:   " << score << std::endl;
+      std::cerr << "...done" << std::endl << std::flush;
+
+
       if (score >= fuzzy) {
         Match m(sentence_wids, s_length);
         m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score;
@@ -851,7 +866,8 @@ namespace fuzzy
 
     if (shrinking_factor < 1.f) // submodular coverage
     {
-      std::vector<float> cover_weights(count_terms.size(), 1.f);
+      const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size();
+      std::vector<float> cover_weights(cover_length, 1.f);
       std::list<Match> candidates;
       while (!result.empty())
       {
@@ -864,6 +880,7 @@ namespace fuzzy
       };
       while (!candidates.empty() && (number_of_matches == 0 || matches.size() < number_of_matches))
       {
+        // TODO: not compute it the first iteration
         // rescore penalties of candidates
         for (Match &match : candidates)
         {
diff --git a/src/submodular.cc b/src/submodular.cc
index fcaf2a8..626453c 100644
--- a/src/submodular.cc
+++ b/src/submodular.cc
@@ -61,51 +61,10 @@ namespace fuzzy
         std::vector<float>& cover,
         std::vector<float>& idf_penalty)
     {
-        std::cerr << "avant " << sentence_length << std::endl;
         std::vector<NGram> all_ngrams = get_sorted_ngrams(N, sentence, sentence_length);
-        std::cerr << "apres" << std::endl;
-        // all_ngrams.reserve(N * sentence_length - N * (N - 1) / 2);
-        // for (unsigned n = 1; n <= N; n++)
-        //     for (unsigned i = 0; i < sentence_length - n + 1; i++)
-        //         all_ngrams.push_back(NGram(sentence + i, n));
-
-        // std::sort(all_ngrams.begin(), all_ngrams.end());
-
-        for (const NGram& ngram : all_ngrams)
-            ngram.print();
-        std::cerr << std::endl;
-
         get_score(sorted_pattern_terms, all_ngrams, count_terms, score, cover, idf_penalty);
-        std::cerr << "xxx" << std::endl;
     }
 
-    // void get_bow_score(
-    //     std::vector<unsigned>& sorted_pattern_terms,
-    //     std::vector<unsigned>& count_terms,
-    //     const unsigned* sentence,
-    //     const unsigned sentence_length,
-    //     float& score,
-    //     std::vector<float>& cover)
-    // {
-    //     std::vector<unsigned> sorted_sentence_terms(sentence, sentence + sentence_length);
-    //     std::sort(sorted_sentence_terms.begin(), sorted_sentence_terms.end());
-    //     cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
-    //     score = 0.f;
-    //     for (unsigned i, j = 0; (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size()); j++)
-    //     {
-    //         while (
-    //             (i < sorted_pattern_terms.size()) && 
-    //             (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
-    //             i++;
-    //         if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
-    //             if ((float)count_terms[i] > cover[i] + 1e-6f)
-    //             {
-    //                 cover[i] += 1.f;
-    //                 score += 1.f;
-    //             }
-    //     }
-    // }
-
     void get_all_ngrams(
         const unsigned* sequence,
         const unsigned length,
@@ -114,33 +73,6 @@ namespace fuzzy
         std::vector<unsigned>& counts)
     {
         std::vector<NGram> all_ngrams = get_sorted_ngrams(N, sequence, length);
-        // std::vector<NGram> all_ngrams;
-        // all_ngrams.reserve(N * length - N * (N - 1) / 2);
-        // for (unsigned n = 1; n <= N; n++)
-        //     for (unsigned i = 0; i < length - n + 1; i++)
-        //         all_ngrams.push_back(NGram(sequence + i, n));
-        // // std::cerr << std::endl;
-        // std::sort(all_ngrams.begin(), all_ngrams.end());
-        for (const NGram& ngram : all_ngrams)
-            ngram.print();
-        std::cerr << std::endl;
-
         get_unique_with_count(all_ngrams, ngrams, counts);
-        // std::cerr << "ALL" << std::endl;
-        // for (unsigned i = 0; i < ngrams.size(); i++)
-        // {
-        //     ngrams[i].print();
-        //     std::cerr << counts[i] << std::endl;
-        // }
-        // std::cerr << std::endl;
-        // std::cerr << "ONLY > 1" << std::endl;
-        // for (unsigned i = 0; i < ngrams.size(); i++)
-        //     if (counts[i] > 1)
-        //     {
-        //         ngrams[i].print();
-        //         std::cerr << counts[i] << std::endl;
-        //     }
-        // std::cerr << std::endl;
-        
     }
 }
\ No newline at end of file

From 7a2c13f8ff4890f5810cf09261679e8170d1fc8f Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Mon, 15 Jan 2024 17:47:18 +0100
Subject: [PATCH 12/15] working but unclean

---
 cli/src/FuzzyMatch-cli.cc    |   2 +-
 include/fuzzy/fuzzy_match.hh |   1 +
 include/fuzzy/submodular.hxx |   2 +-
 src/edit_distance.cc         |   2 +-
 src/fuzzy_match.cc           | 245 ++++++++++++++++++-----------------
 src/submodular.cc            |   6 +
 6 files changed, 136 insertions(+), 122 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 9dd1b23..97d257b 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -466,7 +466,7 @@ int main(int argc, char** argv)
   else
     submodular_function = fuzzy::SubmodularFunction::NO;
   fuzzy::SubmodularNormalization submodular_normalization;
-  std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl;
+  // std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl;
   if (submodular_normalization_str == "BM25")
     submodular_normalization = fuzzy::SubmodularNormalization::BM25;
   else if (submodular_normalization_str == "IDF")
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index 3d4560f..bfca226 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -8,6 +8,7 @@
 #include <fuzzy/edit_distance.hh>
 #include <fuzzy/submodular.hh>
 #include <memory>
+#include <numeric>
 
 namespace onmt {
   class Tokenizer;
diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx
index 0353158..858217e 100644
--- a/include/fuzzy/submodular.hxx
+++ b/include/fuzzy/submodular.hxx
@@ -29,7 +29,7 @@ namespace fuzzy
         cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
         score = 0.f;
         for (
-            unsigned i, j, k = 0;
+            unsigned i = 0, j = 0, k = 0;
             (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size());
             j++)
         {
diff --git a/src/edit_distance.cc b/src/edit_distance.cc
index 30c3976..170b4d1 100644
--- a/src/edit_distance.cc
+++ b/src/edit_distance.cc
@@ -220,7 +220,7 @@ namespace fuzzy
         case 3:
           i--;
           j--;
-          std::cerr << "idf cover " << j << "  " << idf_penalty.size() << std::endl << std::flush;
+          // std::cerr << "idf cover " << j << "  " << idf_penalty.size() << std::endl << std::flush;
           cover[j] = idf_cover ? idf_penalty[j] : 1.f;
           break;
         default:
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index f1d394d..09d80ce 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -657,6 +657,13 @@ namespace fuzzy
         std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
 
         get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms);
+        // std::cerr << "### "
+        //           << sorted_pattern_wids.size() 
+        //           << ", "
+        //           << sorted_pattern_terms.size()
+        //           << ", "
+        //           << count_terms.size()
+        //           << std::endl;
 
         if (submod_norm == SubmodularNormalization::IDF)
           sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
@@ -687,14 +694,14 @@ namespace fuzzy
         ;
     }
 
-    std::cerr << "sorted unique terms" << ": ";
-    for (const auto& c : sorted_pattern_terms)
-      std::cerr << c << ", ";
-    std::cerr << std::endl;
-    std::cerr << "Idf" << ": ";
-    for (const auto& c : sorted_pattern_terms_idf)
-      std::cerr << c << ", ";
-    std::cerr << std::endl;
+    // std::cerr << "sorted unique terms" << ": ";
+    // for (const auto& c : sorted_pattern_terms)
+    //   std::cerr << c << ", ";
+    // std::cerr << std::endl;
+    // std::cerr << "Idf" << ": ";
+    // for (const auto& c : sorted_pattern_terms_idf)
+    //   std::cerr << c << ", ";
+    // std::cerr << std::endl;
     /////////////
 
     for (const auto& pair : best_matches)
@@ -713,7 +720,7 @@ namespace fuzzy
 
       if (submod_norm == SubmodularNormalization::BM25)
       {
-        std::cerr << "BM25 norm..." << std::endl << std::flush;
+        // std::cerr << "BM25 norm..." << std::endl << std::flush;
         score = (float)score_filter / 1000.f;
         assert((filter_type == IndexType::BM25));
         BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
@@ -721,106 +728,104 @@ namespace fuzzy
       }
       else if (submod_norm == SubmodularNormalization::IDF || submod_norm == SubmodularNormalization::NO)
       {
-        std::cerr 
-          << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO")
-          << " norm..." << std::endl << std::flush;
-        if (submod_fun == SubmodularFunction::BOW)
-          get_bow_score(
-            sorted_pattern_terms,
-            count_terms,
-            sentence_wids,
-            s_length,
-            score,
-            s_cover,
-            sorted_pattern_terms_idf);
-        else if (submod_fun == SubmodularFunction::NGRAM)
-          get_ngram_score(
-            sorted_pattern_ngrams,
-            4,
-            count_terms,
-            sentence_wids,
-            s_length,
-            score,
-            s_cover,
-            sorted_pattern_terms_idf);
-        else if (submod_fun == SubmodularFunction::ED)
-        { 
-          const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
-          /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
-          if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
-          {
-            const Costs costs(p_length, s_length, edit_costs);
-            /* let us check the candidates */
-            const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-            const auto cost_upper_bound = lowest_costs.top();
-            s_cover = std::vector<float>(p_length, 0.f);
-            if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF)
-              idf_penalty = compute_idf_penalty(pattern_wids);
-            float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length,
-                                              pattern_wids.data(), pattern_realtok, p_length,
-                                              st, sn,
-                                              idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-                                              edit_costs,
-                                              costs, 
-                                              s_cover,
-                                              submod_norm == SubmodularNormalization::IDF,
-                                              cost_upper_bound);
-            if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
-              continue;
-            float score = int(10000 - cost * 100) / 10000.0;
-
-            lowest_costs.push(cost);
-            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-              lowest_costs.pop();
-          }
-          else 
-          {
-            continue;
-          }
-        }
-        else
+        // std::cerr 
+        //   << (submod_norm == SubmodularNormalization::IDF ? "IDF" : "NO")
+        //   << " norm..." << std::endl << std::flush;
+        
+        switch(submod_fun) 
         {
-          const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
-          /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
-          if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
-          {
-            const Costs costs(p_length, s_length, edit_costs);
-            /* let us check the candidates */
-            const auto sentence_realtok = _filterIndex->real_tokens(s_id);
-            const auto cost_upper_bound = lowest_costs.top();
-            float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
-                                        pattern_wids.data(), pattern_realtok, p_length,
-                                        st, sn,
-                                        idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
-                                        edit_costs,
-                                        costs, cost_upper_bound);
-            // float cost = 0.1;
-            if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+          case SubmodularFunction::BOW:
+            get_bow_score(
+              sorted_pattern_terms,
+              count_terms,
+              sentence_wids,
+              s_length,
+              score,
+              s_cover,
+              sorted_pattern_terms_idf);
+            score /= (float)std::accumulate(count_terms.begin(), count_terms.end(), 0);
+            break;
+          case SubmodularFunction::NGRAM:
+            get_ngram_score(
+              sorted_pattern_ngrams,
+              4,
+              count_terms,
+              sentence_wids,
+              s_length,
+              score,
+              s_cover,
+              sorted_pattern_terms_idf);
+            score /= (float)std::accumulate(count_terms.begin(), count_terms.end(), 0);
+            break;
+          case SubmodularFunction::ED:
+          { 
+            const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
+            /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+            if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+            {
+              const Costs costs(p_length, s_length, edit_costs);
+              /* let us check the candidates */
+              const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+              const auto cost_upper_bound = lowest_costs.top();
+              s_cover = std::vector<float>(p_length, 0.f);
+              if (idf_penalty.size() == 0 && submod_norm == SubmodularNormalization::IDF)
+                idf_penalty = compute_idf_penalty(pattern_wids);
+              float cost = _edit_distance_cover(sentence_wids, sentence_realtok, s_length,
+                                                pattern_wids.data(), pattern_realtok, p_length,
+                                                st, sn,
+                                                idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+                                                edit_costs,
+                                                costs, 
+                                                s_cover,
+                                                submod_norm == SubmodularNormalization::IDF,
+                                                cost_upper_bound);
+              if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+                continue;
+              score = int(10000 - cost * 100) / 10000.0;
+
+              lowest_costs.push(cost);
+              if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+                lowest_costs.pop();
+            }
+            else 
+            {
               continue;
-            score = int(10000 - cost * 100) / 10000.0;
-
-            lowest_costs.push(cost);
-            if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
-              lowest_costs.pop();
+            }
+            break;
           }
-          else 
+          default:
           {
-            continue;
+            const auto num_covered_words = pattern_coverage.count_covered_words(sentence_wids, s_length);
+            /* do not care checking sentences that do not have enough ngram matches for the fuzzy threshold */
+            if (!filter_matches->theoretical_rejection_cover(p_length, s_length, num_covered_words, edit_costs))
+            {
+              const Costs costs(p_length, s_length, edit_costs);
+              /* let us check the candidates */
+              const auto sentence_realtok = _filterIndex->real_tokens(s_id);
+              const auto cost_upper_bound = lowest_costs.top();
+              float cost = _edit_distance(sentence_wids, sentence_realtok, s_length,
+                                          pattern_wids.data(), pattern_realtok, p_length,
+                                          st, sn,
+                                          idf_penalty, costs.diff_word*vocab_idf_penalty/idf_max,
+                                          edit_costs,
+                                          costs, cost_upper_bound);
+              // float cost = 0.1;
+              if ((no_perfect && cost == 0 && (s_length == p_length)) || cost > cost_upper_bound)
+                continue;
+              score = int(10000 - cost * 100) / 10000.0;
+
+              lowest_costs.push(cost);
+              if (score < fuzzy || (contrast_buffer > 0 && (int)lowest_costs.size() > contrast_buffer))
+                lowest_costs.pop();
+            }
+            else 
+            {
+              continue;
+            }
           }
         }
       }
 
-      // switch(submod_fun) 
-      // {
-      //   case SubmodularFunction::BOW:
-      //   case SubmodularFunction::NGRAM:
-      //     break;
-      //   case SubmodularFunction::ED:
-        
-      //   default:
-          
-      // }
-
       // std::cerr << "q:       ";
       // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
       //   std::cerr << sorted_pattern_terms[i] << ",";
@@ -829,21 +834,23 @@ namespace fuzzy
       // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
       //   std::cerr << count_terms[i] << ",";
       // std::cerr << std::endl;
-      std::cerr << "sent:    ";
-      for (unsigned i = 0; i < s_length; i++)
-        std::cerr << sentence_wids[i] << ",";
-      std::cerr << std::endl;
-      std::cerr << "cover:   ";
-      for (unsigned i = 0; i < s_cover.size(); i++)
-        std::cerr << s_cover[i] << ",";
-      std::cerr << std::endl;
-      std::cerr << "score:   " << score << std::endl;
-      std::cerr << "...done" << std::endl << std::flush;
+      // std::cerr << "sent:    ";
+      // for (unsigned i = 0; i < s_length; i++)
+      //   std::cerr << sentence_wids[i] << ",";
+      // std::cerr << std::endl;
+      // std::cerr << "cover:   ";
+      // for (unsigned i = 0; i < s_cover.size(); i++)
+      //   std::cerr << s_cover[i] << ",";
+      // std::cerr << std::endl;
+      // std::cerr << "score:   " << score << std::endl;
+      // std::cerr << "...done" << std::endl << std::flush;
 
 
       if (score >= fuzzy) {
         Match m(sentence_wids, s_length);
-        m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score;
+        
+        // m.score = (filter_type == IndexType::BM25) ? (float)score_filter / (float)1000. : score;
+        m.score = score;
         m.max_subseq = (filter_type == IndexType::BM25) ? 0 : score_filter;
         m.s_id = s_id;
         m.id = _filterIndex->id(s_id);
@@ -864,9 +871,10 @@ namespace fuzzy
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;
 
-    if (shrinking_factor < 1.f) // submodular coverage
+    if (submod_fun != SubmodularFunction::NO && shrinking_factor < 1.f) // submodular coverage
     {
       const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size();
+      // std::cerr << ">> " << cover_length << std::endl;
       std::vector<float> cover_weights(cover_length, 1.f);
       std::list<Match> candidates;
       while (!result.empty())
@@ -885,20 +893,19 @@ namespace fuzzy
         for (Match &match : candidates)
         {
           float rescore = 0.f;
-          std::cerr << "rescore " << match.s_id << " : (";
+          // std::cerr << "rescore " << match.s_id << " : (";
           for (unsigned i = 0; i < cover_weights.size(); i++)
           {  
             rescore += cover_weights[i] * match.cover[i];
-            if (match.cover[i] != 0) 
-              std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
+            // if (match.cover[i] != 0) 
+            //   std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
           }
-          
-          std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
+          // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
           match.penalty = rescore;
         }
         auto it_max = std::max_element(candidates.begin(), candidates.end(), comp);
         matches.push_back(*it_max);
-        std::cerr << "choose No " << it_max->s_id << std::endl;
+        // std::cerr << "choose No " << it_max->s_id << std::endl;
         // update cover_weights
         for (unsigned i = 0; i < cover_weights.size(); i++)
           if (it_max->cover[i] > 0)
diff --git a/src/submodular.cc b/src/submodular.cc
index 626453c..a5b89ae 100644
--- a/src/submodular.cc
+++ b/src/submodular.cc
@@ -37,6 +37,12 @@ namespace fuzzy
         std::cerr << '\t';
     }
 
+    std::ostream& operator<<(std::ostream &s, const NGram &ngram) {
+        for (unsigned i = 0; i < ngram._N; i++)
+            s << ngram._start[i] << ",";
+        return s << '\t';
+    }
+
     void get_bow_score(
         std::vector<unsigned>& sorted_pattern_terms,
         std::vector<unsigned>& count_terms,

From e22bee158ca6c38bf4c6caaa78dde5f54682dfad Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Sun, 21 Jan 2024 00:52:51 +0100
Subject: [PATCH 13/15] Add cerr tracing and stub out Match::cover to debug memory corruption

---
 cli/src/FuzzyMatch-cli.cc    |  3 +++
 include/fuzzy/fuzzy_match.hh |  3 ++-
 src/bm25_matches.cc          |  4 ++++
 src/fuzzy_match.cc           | 31 +++++++++++++++++++++++++------
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 97d257b..8a18e34 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -125,12 +125,15 @@ std::pair<int, int> process_stream(const Function& function,
   if (num_threads <= 1) // Fast path for sequential processing.
   {
     while (std::getline(in, line)) {
+      std::cerr << "#" << std::flush;
       std::string res = function(line);
+      std::cerr << "+" << std::flush;
       if (!res.empty())
         count_nonempty++;
       out << res << std::endl;
       // if (count_nonempty % 100 == 0)
       //   std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
+      std::cerr << "+" << std::endl << std::flush;
     }
     // std::cerr << std::endl;
     return std::make_pair(count_nonempty, count_total);
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index bfca226..08debfc 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -42,13 +42,14 @@ namespace fuzzy
         int length
       ) : length(length), s(seq) {}
       Match() {}
+      ~Match() {}
       float       score;
       float       secondary_sort;
       float       penalty;
       int         max_subseq;
       unsigned    s_id;
       std::string id;
-      std::vector<float> cover;
+      float* cover;
       int length;
       const unsigned* s;
     };
diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc
index 3a2a0c0..9d55492 100644
--- a/src/bm25_matches.cc
+++ b/src/bm25_matches.cc
@@ -24,12 +24,15 @@ namespace fuzzy
 
     std::priority_queue<std::pair<float, unsigned>, std::vector<std::pair<float, unsigned>>, ComparePairs> k_best;
 
+    std::cerr << "1" << std::flush;
     Eigen::SparseVector<float> pattern_sparse_vec(bm25.get_vocab_size());
     for (const unsigned& wid : pattern_wids)
       pattern_sparse_vec.coeffRef(wid) += 1.0;
 
+    std::cerr << "2" << std::flush;
     Eigen::SparseVector<float> scores = bm25.compute_product(pattern_sparse_vec);
 
+    std::cerr << "3" << std::flush;
     for (Eigen::SparseVector<float>::InnerIterator it(scores); it; ++it) {
       int s_id = it.index();
       float bm25_score = it.value();
@@ -41,6 +44,7 @@ namespace fuzzy
       }
     }
 
+    std::cerr << "4" << std::flush;
     _best_matches.reserve(k_best.size());
     while (!k_best.empty())
     {
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 09d80ce..f507c76 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -505,6 +505,8 @@ namespace fuzzy
     if (!p_length)
       return false;
 
+    std::cerr << "[" << std::flush;
+
     if ((std::size_t)(min_subseq_length) > pattern.size())
       min_subseq_length = pattern.size();
 
@@ -527,6 +529,8 @@ namespace fuzzy
     // FilterMatches* filter_matches = nullptr;
     // std::unique_ptr<FilterMatches> filter_matches;
     std::shared_ptr<FilterMatches> filter_matches;
+
+    std::cerr << "$" << std::flush;
     if (filter_type == IndexType::SUFFIX) {
       const SuffixArray& suffix_array = static_cast<const SuffixArray&>(filter);
       // filter_matches = new NGramMatches(fuzzy, p_length, min_subseq_length, suffix_array);
@@ -611,6 +615,7 @@ namespace fuzzy
       filter_matches = std::make_shared<BM25Matches>(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff);
       // filter_matches = new BM25Matches(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff);
       BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
+      std::cerr << "!" << std::flush;
       bm25Matches.register_pattern(pattern_wids, edit_costs);
     }
 #endif
@@ -622,7 +627,7 @@ namespace fuzzy
       no_matches.load_all();
     }
     /* Consolidation of the results */
-
+    std::cerr << "~" << std::flush;
     /* now explore for the best segments */
 
     PatternCoverage pattern_coverage(pattern_wids);
@@ -647,6 +652,8 @@ namespace fuzzy
     std::vector<float> norm_weight;
     std::vector<float> sorted_pattern_terms_idf;
 
+    std::cerr << "|" << std::flush;
+
     /* Salient aspects enumeration */
     switch(submod_fun)
     {
@@ -706,6 +713,7 @@ namespace fuzzy
 
     for (const auto& pair : best_matches)
     {
+      // std::cerr << "-" << std::flush;
       // num_filtered++;
       const auto s_id = pair.first;
       const auto score_filter = pair.second;
@@ -856,7 +864,10 @@ namespace fuzzy
         m.id = _filterIndex->id(s_id);
         m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
         m.penalty = 0;
-        m.cover = s_cover;
+        // m.cover = s_cover;
+        // m.cover = std::vector<float>(s_cover);
+        // m.cover = std::vector<float>(s_cover.size());
+        // std::copy(s_cover.begin(), s_cover.end(), m.cover.begin());
         result.push(m);
         // std::cerr << m.s_id << ": ";
         // for (const auto& c : m.cover)
@@ -868,6 +879,7 @@ namespace fuzzy
       }
     }
     // COUT filter
+    std::cerr << "]" << std::flush;
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;
 
@@ -896,7 +908,9 @@ namespace fuzzy
           // std::cerr << "rescore " << match.s_id << " : (";
           for (unsigned i = 0; i < cover_weights.size(); i++)
           {  
-            rescore += cover_weights[i] * match.cover[i];
+            ///////////////////////////////// TODO: uncomment
+            // rescore += cover_weights[i] * match.cover[i];
+            rescore += cover_weights[i];
             // if (match.cover[i] != 0) 
             //   std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
           }
@@ -907,9 +921,10 @@ namespace fuzzy
         matches.push_back(*it_max);
         // std::cerr << "choose No " << it_max->s_id << std::endl;
         // update cover_weights
-        for (unsigned i = 0; i < cover_weights.size(); i++)
-          if (it_max->cover[i] > 0)
-            cover_weights[i] *= shrinking_factor;
+        ///////////////////////////////// TODO: uncomment
+        // for (unsigned i = 0; i < cover_weights.size(); i++)
+        //   if (it_max->cover[i] > 0)
+        //     cover_weights[i] *= shrinking_factor;
         candidates.erase(it_max);
         if (shrinking_factor < 1e-20f)
         {
@@ -987,6 +1002,7 @@ namespace fuzzy
         result.pop();
       }
     }
+    std::cerr << "|" << std::flush;
 
     // std::cerr << "final matches " << " : ";
     // for (unsigned i = 0; i < matches.size(); i++)
@@ -999,6 +1015,9 @@ namespace fuzzy
     //   std::cerr << matches[i].id;
     // }
     // std::cerr << std::endl;
+
+    //// Attempts to free memory which is corrupted
+    //// Probably from vector
     return matches.size() > 0;
   }
 }

From 30f56d22a10fab1a9c101775eed4a97e5c58d8d6 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Sun, 21 Jan 2024 17:50:00 +0100
Subject: [PATCH 14/15] Fix out-of-bounds read in get_score; restore Match::cover and remove debug tracing

---
 cli/src/FuzzyMatch-cli.cc    |  5 --
 include/fuzzy/fuzzy_match.hh |  3 +-
 include/fuzzy/submodular.hxx | 10 +++-
 src/bm25_matches.cc          |  4 --
 src/fuzzy_match.cc           | 95 +++++-------------------------------
 5 files changed, 20 insertions(+), 97 deletions(-)

diff --git a/cli/src/FuzzyMatch-cli.cc b/cli/src/FuzzyMatch-cli.cc
index 8a18e34..94c54dc 100644
--- a/cli/src/FuzzyMatch-cli.cc
+++ b/cli/src/FuzzyMatch-cli.cc
@@ -125,15 +125,12 @@ std::pair<int, int> process_stream(const Function& function,
   if (num_threads <= 1) // Fast path for sequential processing.
   {
     while (std::getline(in, line)) {
-      std::cerr << "#" << std::flush;
       std::string res = function(line);
-      std::cerr << "+" << std::flush;
       if (!res.empty())
         count_nonempty++;
       out << res << std::endl;
       // if (count_nonempty % 100 == 0)
       //   std::cerr << "\rPROGRESS: " << count_nonempty << "  " << std::flush;
-      std::cerr << "+" << std::endl << std::flush;
     }
     // std::cerr << std::endl;
     return std::make_pair(count_nonempty, count_total);
@@ -190,8 +187,6 @@ std::pair<int, int> process_stream(const Function& function,
   if (!futures.empty())
     pop_results(/*blocking=*/true);
   
-  // std::cerr << std::endl;
-
   {
     std::lock_guard<std::mutex> lock(mutex);
     request_end = true;
diff --git a/include/fuzzy/fuzzy_match.hh b/include/fuzzy/fuzzy_match.hh
index 08debfc..bfca226 100644
--- a/include/fuzzy/fuzzy_match.hh
+++ b/include/fuzzy/fuzzy_match.hh
@@ -42,14 +42,13 @@ namespace fuzzy
         int length
       ) : length(length), s(seq) {}
       Match() {}
-      ~Match() {}
       float       score;
       float       secondary_sort;
       float       penalty;
       int         max_subseq;
       unsigned    s_id;
       std::string id;
-      float* cover;
+      std::vector<float> cover;
       int length;
       const unsigned* s;
     };
diff --git a/include/fuzzy/submodular.hxx b/include/fuzzy/submodular.hxx
index 858217e..c31c8cc 100644
--- a/include/fuzzy/submodular.hxx
+++ b/include/fuzzy/submodular.hxx
@@ -18,6 +18,7 @@ namespace fuzzy
     }
 
     template <typename T>
+    inline
     void get_score(
         std::vector<T>& sorted_pattern_terms,
         std::vector<T>& sorted_sentence_terms,
@@ -27,16 +28,20 @@ namespace fuzzy
         std::vector<float>& idf_penalty)
     {
         cover = std::vector<float>(sorted_pattern_terms.size(), 0.f);
+        // std::cerr << sorted_pattern_terms.size() << "|"
+        //           << sorted_sentence_terms.size() << ">" 
+        //           << std::flush;
         score = 0.f;
         for (
-            unsigned i = 0, j = 0, k = 0;
+            unsigned i = 0, j = 0;
             (i < sorted_pattern_terms.size()) && (j < sorted_sentence_terms.size());
             j++)
         {
             while (
-                (i < sorted_pattern_terms.size()) && 
+                (i < sorted_pattern_terms.size() - 1) && 
                 (sorted_pattern_terms[i] < sorted_sentence_terms[j]))
                 i++;
+
             if (sorted_pattern_terms[i] == sorted_sentence_terms[j])
                 if (idf_penalty.size() > 0)
                 {
@@ -55,6 +60,7 @@ namespace fuzzy
     }
 
     template <typename T>
+    inline
     void get_unique_with_count(
         std::vector<T>& sorted_salient,
         std::vector<T>& unique,
diff --git a/src/bm25_matches.cc b/src/bm25_matches.cc
index 9d55492..3a2a0c0 100644
--- a/src/bm25_matches.cc
+++ b/src/bm25_matches.cc
@@ -24,15 +24,12 @@ namespace fuzzy
 
     std::priority_queue<std::pair<float, unsigned>, std::vector<std::pair<float, unsigned>>, ComparePairs> k_best;
 
-    std::cerr << "1" << std::flush;
     Eigen::SparseVector<float> pattern_sparse_vec(bm25.get_vocab_size());
     for (const unsigned& wid : pattern_wids)
       pattern_sparse_vec.coeffRef(wid) += 1.0;
 
-    std::cerr << "2" << std::flush;
     Eigen::SparseVector<float> scores = bm25.compute_product(pattern_sparse_vec);
 
-    std::cerr << "3" << std::flush;
     for (Eigen::SparseVector<float>::InnerIterator it(scores); it; ++it) {
       int s_id = it.index();
       float bm25_score = it.value();
@@ -44,7 +41,6 @@ namespace fuzzy
       }
     }
 
-    std::cerr << "4" << std::flush;
     _best_matches.reserve(k_best.size());
     while (!k_best.empty())
     {
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index f507c76..7398291 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -505,8 +505,6 @@ namespace fuzzy
     if (!p_length)
       return false;
 
-    std::cerr << "[" << std::flush;
-
     if ((std::size_t)(min_subseq_length) > pattern.size())
       min_subseq_length = pattern.size();
 
@@ -530,7 +528,6 @@ namespace fuzzy
     // std::unique_ptr<FilterMatches> filter_matches;
     std::shared_ptr<FilterMatches> filter_matches;
 
-    std::cerr << "$" << std::flush;
     if (filter_type == IndexType::SUFFIX) {
       const SuffixArray& suffix_array = static_cast<const SuffixArray&>(filter);
       // filter_matches = new NGramMatches(fuzzy, p_length, min_subseq_length, suffix_array);
@@ -615,7 +612,6 @@ namespace fuzzy
       filter_matches = std::make_shared<BM25Matches>(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff);
       // filter_matches = new BM25Matches(fuzzy, p_length, min_subseq_length, bm25, bm25_buffer, bm25_cutoff);
       BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
-      std::cerr << "!" << std::flush;
       bm25Matches.register_pattern(pattern_wids, edit_costs);
     }
 #endif
@@ -627,7 +623,6 @@ namespace fuzzy
       no_matches.load_all();
     }
     /* Consolidation of the results */
-    std::cerr << "~" << std::flush;
     /* now explore for the best segments */
 
     PatternCoverage pattern_coverage(pattern_wids);
@@ -652,25 +647,17 @@ namespace fuzzy
     std::vector<float> norm_weight;
     std::vector<float> sorted_pattern_terms_idf;
 
-    std::cerr << "|" << std::flush;
-
     /* Salient aspects enumeration */
     switch(submod_fun)
     {
       case SubmodularFunction::BOW:
       {
         // all terms bow
-        std::vector<unsigned> sorted_pattern_wids(pattern_wids);
+        std::vector<unsigned> sorted_pattern_wids = pattern_wids;
         std::sort(sorted_pattern_wids.begin(), sorted_pattern_wids.end());
 
         get_unique_with_count(sorted_pattern_wids, sorted_pattern_terms, count_terms);
-        // std::cerr << "### "
-        //           << sorted_pattern_wids.size() 
-        //           << ", "
-        //           << sorted_pattern_terms.size()
-        //           << ", "
-        //           << count_terms.size()
-        //           << std::endl;
+
 
         if (submod_norm == SubmodularNormalization::IDF)
           sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
@@ -692,7 +679,8 @@ namespace fuzzy
       case SubmodularFunction::ED:
       {
         // sentence indices
-        sorted_pattern_terms = std::vector<unsigned>(pattern_wids);
+        // sorted_pattern_terms = std::vector<unsigned>(pattern_wids);
+        sorted_pattern_terms = pattern_wids;
         if (submod_norm == SubmodularNormalization::IDF)
           sorted_pattern_terms_idf = compute_idf_penalty(sorted_pattern_terms);
         break;
@@ -701,19 +689,8 @@ namespace fuzzy
         ;
     }
 
-    // std::cerr << "sorted unique terms" << ": ";
-    // for (const auto& c : sorted_pattern_terms)
-    //   std::cerr << c << ", ";
-    // std::cerr << std::endl;
-    // std::cerr << "Idf" << ": ";
-    // for (const auto& c : sorted_pattern_terms_idf)
-    //   std::cerr << c << ", ";
-    // std::cerr << std::endl;
-    /////////////
-
     for (const auto& pair : best_matches)
     {
-      // std::cerr << "-" << std::flush;
       // num_filtered++;
       const auto s_id = pair.first;
       const auto score_filter = pair.second;
@@ -728,7 +705,6 @@ namespace fuzzy
 
       if (submod_norm == SubmodularNormalization::BM25)
       {
-        // std::cerr << "BM25 norm..." << std::endl << std::flush;
         score = (float)score_filter / 1000.f;
         assert((filter_type == IndexType::BM25));
         BM25Matches& bm25Matches = static_cast<BM25Matches&>(*filter_matches);
@@ -834,25 +810,6 @@ namespace fuzzy
         }
       }
 
-      // std::cerr << "q:       ";
-      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
-      //   std::cerr << sorted_pattern_terms[i] << ",";
-      // std::cerr << std::endl;
-      // std::cerr << "q count: ";
-      // for (unsigned i = 0; i < sorted_pattern_terms.size(); i++)
-      //   std::cerr << count_terms[i] << ",";
-      // std::cerr << std::endl;
-      // std::cerr << "sent:    ";
-      // for (unsigned i = 0; i < s_length; i++)
-      //   std::cerr << sentence_wids[i] << ",";
-      // std::cerr << std::endl;
-      // std::cerr << "cover:   ";
-      // for (unsigned i = 0; i < s_cover.size(); i++)
-      //   std::cerr << s_cover[i] << ",";
-      // std::cerr << std::endl;
-      // std::cerr << "score:   " << score << std::endl;
-      // std::cerr << "...done" << std::endl << std::flush;
-
 
       if (score >= fuzzy) {
         Match m(sentence_wids, s_length);
@@ -864,29 +821,20 @@ namespace fuzzy
         m.id = _filterIndex->id(s_id);
         m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
         m.penalty = 0;
-        // m.cover = s_cover;
-        // m.cover = std::vector<float>(s_cover);
-        // m.cover = std::vector<float>(s_cover.size());
-        // std::copy(s_cover.begin(), s_cover.end(), m.cover.begin());
+        m.cover = s_cover;
         result.push(m);
-        // std::cerr << m.s_id << ": ";
-        // for (const auto& c : m.cover)
-        //   std::cerr << c << ", ";
-        // std::cerr << std::endl;
         cpt++;
         if (cpt > contrast_buffer)
           break;
       }
     }
     // COUT filter
-    std::cerr << "]" << std::flush;
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;
 
     if (submod_fun != SubmodularFunction::NO && shrinking_factor < 1.f) // submodular coverage
     {
       const unsigned cover_length = (submod_fun == SubmodularFunction::ED) ? p_length : count_terms.size();
-      // std::cerr << ">> " << cover_length << std::endl;
       std::vector<float> cover_weights(cover_length, 1.f);
       std::list<Match> candidates;
       while (!result.empty())
@@ -908,23 +856,17 @@ namespace fuzzy
           // std::cerr << "rescore " << match.s_id << " : (";
           for (unsigned i = 0; i < cover_weights.size(); i++)
           {  
-            ///////////////////////////////// TODO: uncomment
-            // rescore += cover_weights[i] * match.cover[i];
-            rescore += cover_weights[i];
-            // if (match.cover[i] != 0) 
-            //   std::cerr << cover_weights[i] << "*" << match.cover[i] << "+";
+            rescore += cover_weights[i] * match.cover[i];
+            // rescore += cover_weights[i];
           }
-          // std::cerr << ") " << match.penalty << " -> " << rescore << std::endl;
           match.penalty = rescore;
         }
         auto it_max = std::max_element(candidates.begin(), candidates.end(), comp);
         matches.push_back(*it_max);
-        // std::cerr << "choose No " << it_max->s_id << std::endl;
         // update cover_weights
-        ///////////////////////////////// TODO: uncomment
-        // for (unsigned i = 0; i < cover_weights.size(); i++)
-        //   if (it_max->cover[i] > 0)
-        //     cover_weights[i] *= shrinking_factor;
+        for (unsigned i = 0; i < cover_weights.size(); i++)
+          if (it_max->cover[i] > 0)
+            cover_weights[i] *= shrinking_factor;
         candidates.erase(it_max);
         if (shrinking_factor < 1e-20f)
         {
@@ -1002,22 +944,7 @@ namespace fuzzy
         result.pop();
       }
     }
-    std::cerr << "|" << std::flush;
-
-    // std::cerr << "final matches " << " : ";
-    // for (unsigned i = 0; i < matches.size(); i++)
-    // { 
-    //   std::cerr << std::endl << "   ";
-    //   std::cerr << matches[i].s_id << ": ";
-    //   for (int j = 0; j < matches[i].length; j++)
-    //     std::cerr << matches[i].s[j] << " ";
-    //   std::cerr << std::endl;
-    //   std::cerr << matches[i].id;
-    // }
-    // std::cerr << std::endl;
-
-    //// Attempts to free memory which is corrupted
-    //// Probably from vector
+
     return matches.size() > 0;
   }
 }

From 0b6187f61430bb4eec9fa58a7a5b7f72ca6a0816 Mon Sep 17 00:00:00 2001
From: Maxwell1447 <maxbouthors@gmail.com>
Date: Mon, 14 Oct 2024 16:09:38 +0200
Subject: [PATCH 15/15] fixed problem of best match buffer

---
 src/CMakeLists.txt |  2 +-
 src/filter.cc      |  2 +-
 src/fuzzy_match.cc | 25 ++++++++++++++++++++++---
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b7a96fa..b80565a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,7 +55,7 @@ target_include_directories(${PROJECT_NAME} PUBLIC
 
 target_link_libraries(${PROJECT_NAME}
   ${OPENNMT_TOKENIZER_LIB}
-  ${ICU_LIBRARIES}
+  # ${ICU_LIBRARIES}
   ${Boost_LIBRARIES}
   Threads::Threads
   )
diff --git a/src/filter.cc b/src/filter.cc
index 94f89c2..e2a5e6d 100644
--- a/src/filter.cc
+++ b/src/filter.cc
@@ -10,7 +10,7 @@ namespace fuzzy
   Filter::add_sentence(const std::vector<unsigned>& sentence)
   {
     size_t sidx = _sentence_pos.size();
-    std::cerr << sidx << std::endl;
+    // std::cerr << sidx << std::endl;
     _sentence_pos.push_back(_sentence_buffer.size());
 
     /* first token in sentence buffer is the sentence size */
diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc
index 7398291..b3fc964 100644
--- a/src/fuzzy_match.cc
+++ b/src/fuzzy_match.cc
@@ -41,6 +41,16 @@ namespace fuzzy
     }
   };
 
+  class CompareMatchInverse  // Match comparator: higher score orders first; as a std::priority_queue comparator it leaves the lowest-scoring Match at top().
+  {
+  public:
+    bool operator()(const FuzzyMatch::Match &x, const FuzzyMatch::Match &y)  // returns true when x must be ordered before y
+    {
+      return x.score > y.score || // a strictly higher score always orders first
+             (x.score == y.score && x.secondary_sort < y.secondary_sort);  // equal scores: the smaller secondary_sort orders first (NOTE(review): presumably the mirror of CompareMatch - confirm)
+    }
+  };
+
   static std::string normalize(const std::string& text_utf8) {
     UErrorCode error_code = U_ZERO_ERROR;
     const auto* normalizer = icu::Normalizer2::getNFCInstance(error_code);
@@ -522,6 +532,7 @@ namespace fuzzy
 
     /* result map - normalized error => sentence */
     std::priority_queue<Match, std::vector<Match>, CompareMatch> result;
+    std::priority_queue<Match, std::vector<Match>, CompareMatchInverse> result_best;
 
     const Filter& filter = _filterIndex->get_Filter();
     // FilterMatches* filter_matches = nullptr;
@@ -822,12 +833,20 @@ namespace fuzzy
         m.secondary_sort = (filter_type == IndexType::SUFFIX) ? s_id : cpt;
         m.penalty = 0;
         m.cover = s_cover;
-        result.push(m);
+        // result.push(m);
+        result_best.push(m);
+        if (contrast_buffer > 0 && (int)result_best.size() > contrast_buffer)
+          result_best.pop();
         cpt++;
-        if (cpt > contrast_buffer)
-          break;
+        // if (cpt > contrast_buffer)
+        //   break
       }
     }
+    while (result_best.size() > 0)
+    {
+      result.push(result_best.top());
+      result_best.pop();
+    }
     // COUT filter
     // std::cerr << num_filtered << std::endl;
     // std::cerr << filter_matches->get_best_matches().size() << std::endl;