Skip to content

Commit

Permalink
Merge pull request #71 from SYSTRAN/BM25
Browse files Browse the repository at this point in the history
Bm25
  • Loading branch information
Maxwell1447 authored Dec 23, 2024
2 parents 636ff9e + 0b6187f commit 56ff429
Show file tree
Hide file tree
Showing 27 changed files with 1,002 additions and 75 deletions.
57 changes: 45 additions & 12 deletions cli/src/FuzzyMatch-cli.cc
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ std::pair<int, int> process_stream(const Function& function,
if (!res.empty())
count_nonempty++;
out << res << std::endl;
if (count_nonempty % 100 == 0)
std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush;
// if (count_nonempty % 100 == 0)
// std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush;
}
std::cerr << std::endl;
// std::cerr << std::endl;
return std::make_pair(count_nonempty, count_total);
}

Expand Down Expand Up @@ -163,8 +163,8 @@ std::pair<int, int> process_stream(const Function& function,
count_nonempty++;
out << res << std::endl;
futures.pop();
if (count_nonempty % 100 == 0)
std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush;
// if (count_nonempty % 100 == 0)
// std::cerr << "\rPROGRESS: " << count_nonempty << " " << std::flush;
}
};

Expand All @@ -187,8 +187,6 @@ std::pair<int, int> process_stream(const Function& function,
if (!futures.empty())
pop_results(/*blocking=*/true);

std::cerr << std::endl;

{
std::lock_guard<std::mutex> lock(mutex);
request_end = true;
Expand All @@ -210,7 +208,9 @@ class processor {
std::string contrastive_reduce_str,
int contrastive_buffer,
fuzzy::IndexType filter_type,
int bm25_buffer, float bm25_cutoff, const fuzzy::FilterIndexParams& filter_index_params):
int bm25_buffer, float bm25_cutoff,
float submodular_shrinking_factor, fuzzy::SubmodularFunction submodular_function, fuzzy::SubmodularNormalization submodular_normalization,
const fuzzy::FilterIndexParams& filter_index_params):
_fuzzyMatcher(pt, max_tokens_in_pattern, filter_type, filter_index_params),
_fuzzy(fuzzy),
_contrastive_factor(contrastive_factor),
Expand All @@ -224,7 +224,10 @@ class processor {
_contrastive_buffer(contrastive_buffer),
_filter_type(filter_type),
_bm25_buffer(bm25_buffer),
_bm25_cutoff(bm25_cutoff) {
_bm25_cutoff(bm25_cutoff),
_submodular_shrinking_factor(submodular_shrinking_factor),
_submodular_function(submodular_function),
_submodular_normalization(submodular_normalization) {
if (contrastive_reduce_str == "max")
_contrastive_reduce = fuzzy::ContrastReduce::MAX;
else
Expand All @@ -236,7 +239,8 @@ class processor {
_fuzzyMatcher.match(sentence, _fuzzy, _nmatch, _no_perfect, matches,
_min_subseq_length, _min_subseq_ratio, _idf_penalty, _cost,
_contrastive_factor, _contrastive_reduce, _contrastive_buffer,
_filter_type, _bm25_buffer, _bm25_cutoff);
_filter_type, _bm25_buffer, _bm25_cutoff,
_submodular_shrinking_factor, _submodular_function, _submodular_normalization);

std::string out;
for(const fuzzy::FuzzyMatch::Match &m: matches) {
Expand Down Expand Up @@ -294,6 +298,9 @@ class processor {
fuzzy::IndexType _filter_type;
int _bm25_buffer;
float _bm25_cutoff;
float _submodular_shrinking_factor;
fuzzy::SubmodularFunction _submodular_function;
fuzzy::SubmodularNormalization _submodular_normalization;
};

int main(int argc, char** argv)
Expand Down Expand Up @@ -326,6 +333,9 @@ int main(int argc, char** argv)
float contrastive_factor;
float bm25_cutoff;
float bm25_ratio_idf;
float submodular_shrinking_factor;
std::string submodular_function_str;
std::string submodular_normalization_str;
int nmatch;
int nthreads;
int min_subseq_length;
Expand Down Expand Up @@ -365,6 +375,9 @@ int main(int argc, char** argv)
("bm25-ratio-idf", po::value(&bm25_ratio_idf)->default_value(0.5f), "filter in the reverse index to consider only terms rare enough (close to 0 = ignores a lot : close to 1 = considers a lot)")
("bm25-buffer", po::value(&bm25_buffer)->default_value(10), "number of best BM25 to rerank")
("bm25-cutoff", po::value(&bm25_cutoff)->default_value(0.f), "minimum BM25 score threshold cutoff")
("submodular-shrinking-factor,lambda", po::value(&submodular_shrinking_factor)->default_value(1.f), "In submodularity coverage, weight shrinking factor of each covered salient aspect of the source")
("submodular-function", po::value(&submodular_function_str)->default_value("NO"), "submodularity coverage function category (NO|BOW|NGRAM|ED)")
("submodular-norm", po::value(&submodular_normalization_str)->default_value("NO"), "Normalization in submodular coverage score")
("nthreads,N", po::value(&nthreads)->default_value(4), "number of thread to use for match")
;

Expand Down Expand Up @@ -437,8 +450,27 @@ int main(int argc, char** argv)
fuzzy::IndexType filter_type;
if (filter_type_str == "bm25")
filter_type = fuzzy::IndexType::BM25;
else if (filter_type_str == "no")
filter_type = fuzzy::IndexType::NO;
else
filter_type = fuzzy::IndexType::SUFFIX;
fuzzy::SubmodularFunction submodular_function;
if (submodular_function_str == "BOW")
submodular_function = fuzzy::SubmodularFunction::BOW;
else if (submodular_function_str == "NGRAM")
submodular_function = fuzzy::SubmodularFunction::NGRAM;
else if (submodular_function_str == "ED")
submodular_function = fuzzy::SubmodularFunction::ED;
else
submodular_function = fuzzy::SubmodularFunction::NO;
fuzzy::SubmodularNormalization submodular_normalization;
// std::cerr << "submodular_normalization_str = " << submodular_normalization_str << std::endl;
if (submodular_normalization_str == "BM25")
submodular_normalization = fuzzy::SubmodularNormalization::BM25;
else if (submodular_normalization_str == "IDF")
submodular_normalization = fuzzy::SubmodularNormalization::IDF;
else
submodular_normalization = fuzzy::SubmodularNormalization::NO;
#ifdef NO_EIGEN
assert(filter_type != fuzzy::IndexType::BM25);
#endif
Expand All @@ -448,8 +480,9 @@ int main(int argc, char** argv)
idf_penalty, subseq_idf_weighting,
max_tokens_in_pattern, edit_cost,
contrastive_reduce, contrastive_buffer,
filter_type, bm25_buffer, bm25_cutoff, filter_index_params);

filter_type, bm25_buffer, bm25_cutoff,
submodular_shrinking_factor, submodular_function, submodular_normalization,
filter_index_params);
if (index_file.length()) {
TICK("Loading index_file: "+index_file);
import_binarized_fuzzy_matcher(index_file, O._fuzzyMatcher);
Expand Down
1 change: 1 addition & 0 deletions include/fuzzy/bm25.hh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ namespace fuzzy

inline int get_vocab_size() const { return _vocab_size; }
Eigen::SparseVector<float> compute_product(const Eigen::SparseVector<float>& pattern_voc) const;
Eigen::SparseVector<float> get_cover(const Eigen::SparseVector<float>& pattern_voc, unsigned s_id) const;

private:
size_t _vocab_size;
Expand Down
5 changes: 5 additions & 0 deletions include/fuzzy/bm25.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ namespace fuzzy
{
return _bm25_inverse_index * pattern_voc;
}
// Returns the element-wise (coefficient-wise) product of row `s_id` of
// `_bm25_inverse_index` with `pattern_voc`.
// NOTE(review): presumably row `s_id` holds the per-term BM25 weights of
// sentence `s_id`, so the result is that sentence's per-term contribution to
// the score computed by compute_product() — confirm against the index builder.
// NOTE(review): there is no explicit range check on `s_id` in this function;
// callers are presumably expected to pass a valid row index — confirm.
inline Eigen::SparseVector<float> BM25::get_cover(const Eigen::SparseVector<float>& pattern_voc, unsigned s_id) const
{
// Materialize the selected row as a standalone sparse vector before multiplying.
Eigen::SparseVector<float> row_s = _bm25_inverse_index.row(s_id);
return row_s.cwiseProduct(pattern_voc);
}
template<class Archive>
void BM25::save(Archive& archive, unsigned int) const
{
Expand Down
5 changes: 3 additions & 2 deletions include/fuzzy/bm25_matches.hh
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@ namespace fuzzy
using FilterMatches::theoretical_rejection;
using FilterMatches::theoretical_rejection_cover;

std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
std::vector<std::pair<unsigned, int>> get_best_matches() const override;
std::vector<float> cover(const std::vector<unsigned>& unique_pattern_wids, const std::vector<unsigned>& count, unsigned s_id) const;

private:
// Num of sentences to place in the buffer
const unsigned _buffer;
const float _cutoff_threshold;
std::vector<std::pair<unsigned, unsigned>> _best_matches;
std::vector<std::pair<unsigned, int>> _best_matches;
};
}
12 changes: 12 additions & 0 deletions include/fuzzy/edit_distance.hh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

#include <limits>
#include <algorithm>
#include <iostream>

#include <fuzzy/sentence.hh>
#include <fuzzy/costs.hh>
Expand All @@ -21,6 +23,16 @@ namespace fuzzy
const EditCosts& edit_costs,
const Costs& costs,
float max_fuzzyness = std::numeric_limits<float>::max());

// Edit-distance entry point that additionally takes a caller-provided `cover`
// vector by non-const reference.
// NOTE(review): `cover` is presumably filled with per-pattern-token coverage
// values as a side effect — confirm against the definition in edit_distance.hxx.
// NOTE(review): `idf_cover` presumably switches the cover values to be
// IDF-weighted — confirm.
// Defaults: `idf_cover` is false; `max_fuzziness` is the largest finite float.
float _edit_distance_cover(const unsigned* thes, const Sentence &reals, int slen,
const unsigned* thep, const Tokens &realptok, int plen,
const std::vector<const char*>& st, const std::vector<int>& sn,
const std::vector<float> &idf_penalty, float idf_weight,
const EditCosts&,
const Costs&,
std::vector<float>& cover,
const bool idf_cover = false,
float max_fuzziness = std::numeric_limits<float>::max());
}

#include <fuzzy/edit_distance.hxx>
2 changes: 1 addition & 1 deletion include/fuzzy/filter_matches.hh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace fuzzy
bool theoretical_rejection(size_t p_length, size_t s_length, const EditCosts& edit_costs) const;
bool theoretical_rejection_cover(size_t p_length, size_t s_length, size_t cover, const EditCosts& edit_costs) const;

virtual std::vector<std::pair<unsigned, unsigned>> get_best_matches() const = 0;
virtual std::vector<std::pair<unsigned, int>> get_best_matches() const = 0;

float fuzzy_threshold;
// unsigned max_differences_with_pattern;
Expand Down
23 changes: 20 additions & 3 deletions include/fuzzy/fuzzy_match.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
#include <fuzzy/index.hh>
#include <fuzzy/sentence.hh>
#include <fuzzy/edit_distance.hh>
#include <fuzzy/submodular.hh>
#include <memory>
#include <numeric>

namespace onmt {
class Tokenizer;
Expand All @@ -15,6 +17,8 @@ namespace onmt {
namespace fuzzy
{
enum class ContrastReduce { MEAN, MAX };
// Category of the submodular coverage function; NO is the default used by match().
// NOTE(review): BOW/NGRAM/ED presumably stand for bag-of-words, n-gram and
// edit-distance based coverage, and NO presumably disables submodular
// selection — confirm against the matcher implementation.
enum class SubmodularFunction { NO, BOW, NGRAM, ED };
// Normalization applied to the submodular coverage score; NO is the default used by match().
enum class SubmodularNormalization { NO, IDF, BM25 };

class FuzzyMatch
{
Expand Down Expand Up @@ -44,6 +48,7 @@ namespace fuzzy
int max_subseq;
unsigned s_id;
std::string id;
std::vector<float> cover;
int length;
const unsigned* s;
};
Expand Down Expand Up @@ -74,7 +79,10 @@ namespace fuzzy
int contrast_buffer=-1,
IndexType filter_type=IndexType::SUFFIX,
int bm25_buffer=10,
float bm25_cutoff=0) const;
float bm25_cutoff=0,
float shrinking_factor=1.f,
SubmodularFunction submod_fun=SubmodularFunction::NO,
SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
bool match(const Sentence& real,
const Tokens& pattern,
float fuzzy,
Expand All @@ -90,7 +98,10 @@ namespace fuzzy
int contrast_buffer=-1,
IndexType filter_type=IndexType::SUFFIX,
int bm25_buffer=10,
float bm25_cutoff=0) const;
float bm25_cutoff=0,
float shrinking_factor=1.f,
SubmodularFunction submod_fun=SubmodularFunction::NO,
SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
/* simplified, include tokenization */
bool match(const std::string &sentence,
float fuzzy,
Expand All @@ -106,7 +117,10 @@ namespace fuzzy
int contrast_buffer=-1,
IndexType filter_type=IndexType::SUFFIX,
int bm25_buffer=10,
float bm25_cutoff=0) const;
float bm25_cutoff=0,
float shrinking_factor=1.f,
SubmodularFunction submod_fun=SubmodularFunction::NO,
SubmodularNormalization submod_norm=SubmodularNormalization::NO) const;
bool subsequence(const std::string &sentence,
unsigned number_of_matches,
bool no_perfect,
Expand Down Expand Up @@ -151,6 +165,9 @@ namespace fuzzy
std::vector<float>
compute_idf_penalty(const std::vector<unsigned int>& pattern_wids,
float unknown_vocab_word_penalty = 0) const;
std::vector<float>
compute_idf_penalty(const std::vector<NGram>& pattern_ngrams,
float unknown_vocab_word_penalty = 0) const;

/* penalty tokens */
int _pt;
Expand Down
4 changes: 3 additions & 1 deletion include/fuzzy/index.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
#ifdef USE_EIGEN
#include <fuzzy/bm25.hh>
#endif
#include <fuzzy/no_filter.hh>
#include <fuzzy/vocab_indexer.hh>
#include <fuzzy/sentence.hh>

namespace fuzzy
{
constexpr size_t DEFAULT_MAX_TOKENS_IN_PATTERN = 300; // if you change this value, update README.md
enum class IndexType { SUFFIX, BM25 };
enum class IndexType { SUFFIX, BM25, NO };
class FilterIndex
{
public:
Expand Down Expand Up @@ -62,6 +63,7 @@ namespace fuzzy
#ifdef USE_EIGEN
inline std::shared_ptr<Filter> createBM25(const FilterIndexParams &params = FilterIndexParams()) { return std::make_shared<BM25>(params); }
#endif
// Creates a default-constructed NoFilter and returns it as a shared_ptr<Filter>.
inline std::shared_ptr<Filter> createNo() { return std::make_shared<NoFilter>(); }
std::vector<std::string> _ids;
std::vector<Sentence> _real_tokens;
size_t _max_tokens_in_pattern;
Expand Down
21 changes: 21 additions & 0 deletions include/fuzzy/index.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ namespace fuzzy
& _max_tokens_in_pattern;
}
#endif
else if (_type == IndexType::NO)
{
NoFilter& no_filter = static_cast<NoFilter&>(*_filter);
ar
& _type
& _vocabIndexer
& no_filter
& _ids
& _real_tokens
& _max_tokens_in_pattern;
}
}

template<class Archive>
Expand Down Expand Up @@ -94,6 +105,16 @@ namespace fuzzy
& _real_tokens;
}
#endif
else if (_type == IndexType::NO)
{
_filter = createNo();
NoFilter& no_filter = static_cast<NoFilter&>(*_filter);
ar
& _vocabIndexer
& no_filter
& _ids
& _real_tokens;
}
if (version >= 1)
ar & _max_tokens_in_pattern;
}
Expand Down
2 changes: 1 addition & 1 deletion include/fuzzy/ngram_matches.hh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace fuzzy
using FilterMatches::theoretical_rejection;
using FilterMatches::theoretical_rejection_cover;

std::vector<std::pair<unsigned, unsigned>> get_best_matches() const override;
std::vector<std::pair<unsigned, int>> get_best_matches() const override;

private:
LongestMatches _longest_matches;
Expand Down
Loading

0 comments on commit 56ff429

Please sign in to comment.