From 8a5da07e3d59dfbb76cf40d933d90a5b805a6a95 Mon Sep 17 00:00:00 2001 From: The-Dunadan Date: Sun, 23 Jan 2022 14:18:33 +0000 Subject: [PATCH] Add Ukrainian stemmer https://github.com/Tapkomet/UAStemming --- algorithms/index.tt | 1 + algorithms/ukrainian/stemmer.tt | 18 ++ js/ukrainian-stemmer.js | 419 ++++++++++++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 algorithms/ukrainian/stemmer.tt create mode 100644 js/ukrainian-stemmer.js diff --git a/algorithms/index.tt b/algorithms/index.tt index 0b0133c..c0d4749 100644 --- a/algorithms/index.tt +++ b/algorithms/index.tt @@ -33,6 +33,7 @@ following languages:
  • Danish
  • Russian +
  • Ukrainian
  • Finnish
  • Basque diff --git a/algorithms/ukrainian/stemmer.tt b/algorithms/ukrainian/stemmer.tt new file mode 100644 index 0000000..ceb56df --- /dev/null +++ b/algorithms/ukrainian/stemmer.tt @@ -0,0 +1,18 @@ +[% header('Ukrainian stemming algorithm') %] + +

    Links to resources

    + + + +[% algorithm_vocab([60, 'в', 'п']) %] + + +

    The algorithm in Snowball (Unicode version)

    /* Patch context retained verbatim from the original diff (tail of algorithms/ukrainian/stemmer.tt and the header of the new file):
       + +Source here. + +[% highlight_file('ukrainian') %] + +[% footer %] diff --git a/js/ukrainian-stemmer.js b/js/ukrainian-stemmer.js new file mode 100644 index 0000000..65964a3 --- /dev/null +++ b/js/ukrainian-stemmer.js @@ -0,0 +1,419 @@ */
// Generated by Snowball 2.1.0 - https://snowballstem.org/

/**
 * Ukrainian stemmer.
 *
 * Strips one optional reflexive postfix, then one adjective, verb or noun
 * ending, then applies a final tidy-up pass. A short list of whole words is
 * handled as exceptions before any suffix stripping.
 *
 * All string handling is delegated to a `BaseStemmer` instance, which is
 * defined outside this file. NOTE(review): the comments below assume the usual
 * Snowball runtime contract (`find_among*` returns 0 on no match, otherwise the
 * third element of the matched entry, and moves the cursor past the match) --
 * confirm against the BaseStemmer in use.
 *
 * Each among-table entry is `[string, index, result]`. In the tables below,
 * `index` points at a shorter entry of the same table that the string extends
 * (for example "овий" -> "ий"), or is -1 when there is none.
 *
 * @constructor
 */
function UkrainianStemmer() {
    var base = new BaseStemmer();

    // Whole-word exceptions. Only result 1 triggers a rewrite; the entries
    // with result -1 are matched so that the word is left unchanged.
    /** @const */ var a_0 = [
        ["\u0430\u0434\u0436\u0435", -1, -1],                   // адже
        ["\u0430\u0442\u043E\u043C", -1, -1],                   // атом
        ["\u0432\u0456\u0441\u044C", -1, -1],                   // вісь
        ["\u0434\u0435\u0441\u044C", -1, -1],                   // десь
        ["\u0437\u0434\u043E\u0440\u043E\u0432'\u044F", -1, 1], // здоров'я
        ["\u043A\u0440\u043E\u043A", -1, -1],                   // крок
        ["\u043A\u0440\u0456\u043C", -1, -1]                    // крім
    ];

    // Adjective endings; every entry is simply deleted.
    /** @const */ var a_1 = [
        ["\u043E\u0432\u0430", -1, 1],             // ова
        ["\u043E\u0432\u0435", -1, 1],             // ове
        ["\u0438\u043C\u0438", -1, 1],             // ими
        ["\u0435\u0439", -1, 1],                   // ей
        ["\u0438\u0439", -1, 1],                   // ий
        ["\u043E\u0432\u0438\u0439", 4, 1],        // овий
        ["\u0456\u0439", -1, 1],                   // ій
        ["\u043E\u0432\u0456\u0439", 6, 1],        // овій
        ["\u0435\u043C", -1, 1],                   // ем
        ["\u0438\u043C", -1, 1],                   // им
        ["\u043E\u0432\u0438\u043C", 9, 1],        // овим
        ["\u043E\u043C", -1, 1],                   // ом
        ["\u0456\u043C", -1, 1],                   // ім
        ["\u043E\u0432\u043E", -1, 1],             // ово
        ["\u043E\u0433\u043E", -1, 1],             // ого
        ["\u043E\u0432\u043E\u0433\u043E", 14, 1], // ового
        ["\u0435\u043C\u0443", -1, 1],             // ему
        ["\u043E\u043C\u0443", -1, 1],             // ому
        ["\u043E\u0432\u043E\u043C\u0443", 17, 1], // овому
        ["\u0438\u0445", -1, 1],                   // их
        ["\u043E\u0432\u0438\u0445", 19, 1],       // ових
        ["\u0456\u0445", -1, 1],                   // іх
        ["\u0435\u044E", -1, 1],                   // ею
        ["\u043E\u044E", -1, 1],                   // ою
        ["\u043E\u0432\u043E\u044E", 23, 1],       // овою
        ["\u0443\u044E", -1, 1],                   // ую
        ["\u044E\u044E", -1, 1],                   // юю
        ["\u0430\u044F", -1, 1],                   // ая
        ["\u043E\u0457", -1, 1],                   // ої
        ["\u043E\u0432\u043E\u0457", 28, 1]        // ової
    ];

    // Reflexive postfixes; deleted before the other ending classes are tried.
    /** @const */ var a_2 = [
        ["\u0441\u044C", -1, 1], // сь
        ["\u0441\u044F", -1, 1]  // ся
    ];

    // Verb endings. Result 1: replace the ending with "т"; result 2: delete it.
    /** @const */ var a_3 = [
        ["\u0430\u043B\u0430", -1, 2],                   // ала
        ["\u0443\u0432\u0430\u043B\u0430", 0, 2],        // увала
        ["\u0438\u043B\u0430", -1, 2],                   // ила
        ["\u0448\u043B\u0430", -1, 1],                   // шла
        ["\u0456\u043B\u0430", -1, 2],                   // іла
        ["\u0435\u043D\u0430", -1, 2],                   // ена
        ["\u0438\u0442\u0430", -1, 2],                   // ита
        ["\u0430\u0432", -1, 2],                         // ав
        ["\u0443\u0432\u0430\u0432", 7, 2],              // ував
        ["\u0438\u0432", -1, 2],                         // ив
        ["\u0448\u043E\u0432", -1, 1],                   // шов
        ["\u0443\u0439\u0442\u0435", -1, 2],             // уйте
        ["\u0430\u043B\u0438", -1, 2],                   // али
        ["\u0443\u0432\u0430\u043B\u0438", 12, 2],       // ували
        ["\u0438\u043B\u0438", -1, 2],                   // или
        ["\u0448\u043B\u0438", -1, 1],                   // шли
        ["\u0430\u043D\u0438\u043C\u0438", -1, 2],       // аними
        ["\u0443\u0432\u0430\u0442\u0438", -1, 2],       // увати
        ["\u0438\u0432\u0448\u0438", -1, 2],             // ивши
        ["\u0443\u0439", -1, 2],                         // уй
        ["\u0430\u043B\u043E", -1, 2],                   // ало
        ["\u0443\u0432\u0430\u043B\u043E", 20, 2],       // увало
        ["\u0438\u043B\u043E", -1, 2],                   // ило
        ["\u0448\u043B\u043E", -1, 1],                   // шло
        ["\u0456\u043B\u043E", -1, 2],                   // іло
        ["\u0435\u043D\u043E", -1, 2],                   // ено
        ["\u0430\u043D\u0438\u0445", -1, 2],             // аних
        ["\u0438\u0442\u044C", -1, 2],                   // ить
        ["\u0430\u044E\u0442\u044C", -1, 2],             // ають
        ["\u0443\u044E\u0442\u044C", -1, 2],             // ують
        ["\u0456\u044E\u0442\u044C", -1, 2],             // іють
        ["\u0456\u0442\u044C", -1, 2],                   // іть
        ["\u0443\u0432\u0430\u043D\u043D\u044F", -1, 2], // ування
        ["\u0430\u0454", -1, 2],                         // ає
        ["\u0438\u0454", -1, 2],                         // иє
        ["\u0443\u0454", -1, 2],                         // ує
        ["\u044E\u0454", -1, 2],                         // ює
        ["\u044F\u0454", -1, 2],                         // яє
        ["\u0456\u0454", -1, 2],                         // іє
        ["\u0438\u043B\u0456", -1, 2],                   // илі
        ["\u0430\u043D\u0456", -1, 2]                    // ані
    ];

    // Noun endings. Result 1: delete only when preceded by "н";
    // result 2: replace with "к"; result 3: delete.
    /** @const */ var a_4 = [
        ["\u0430", -1, 3],                        // а
        ["\u044F\u0442\u0430", 0, 1],             // ята
        ["\u043E\u0432", -1, 3],                  // ов
        ["\u0456\u0432", -1, 3],                  // ів
        ["\u0457\u0432", -1, 3],                  // їв
        ["\u043E\u0457\u0432", 4, 3],             // оїв
        ["\u0435", -1, 3],                        // е
        ["\u0438", -1, 3],                        // и
        ["\u0430\u043C\u0438", 7, 3],             // ами
        ["\u044F\u0442\u0430\u043C\u0438", 8, 1], // ятами
        ["\u044F\u043C\u0438", 7, 3],             // ями
        ["\u0456\u044F\u043C\u0438", 10, 3],      // іями
        ["\u0439", -1, 3],                        // й
        ["\u0435\u0439", 12, 3],                  // ей
        ["\u043E\u0439", 12, 3],                  // ой
        ["\u0456\u0439", 12, 3],                  // ій
        ["\u043E\u043A", -1, 2],                  // ок
        ["\u0438\u043B", -1, 3],                  // ил
        ["\u0456\u043B", -1, 3],                  // іл
        ["\u0430\u043C", -1, 3],                  // ам
        ["\u044F\u0442\u0430\u043C", 19, 1],      // ятам
        ["\u0435\u043C", -1, 3],                  // ем
        ["\u043E\u043C", -1, 3],                  // ом
        ["\u044F\u043C", -1, 3],                  // ям
        ["\u0456\u044F\u043C", 23, 3],            // іям
        ["\u043E\u0454\u043C", -1, 3],            // оєм
        ["\u0435\u043D", -1, 3],                  // ен
        ["\u043E", -1, 3],                        // о
        ["\u044F\u0442", -1, 3],                  // ят
        ["\u0443", -1, 3],                        // у
        ["\u0430\u0445", -1, 3],                  // ах
        ["\u044F\u0445", -1, 3],                  // ях
        ["\u043E\u044F\u0445", 31, 3],            // оях
        ["\u0456\u044F\u0445", 31, 3],            // іях
        ["\u044C", -1, 3],                        // ь
        ["\u044E", -1, 3],                        // ю
        ["\u0443\u044E", 35, 3],                  // ую
        ["\u0456\u0454\u044E", 35, 3],            // ією
        ["\u0456\u044E", 35, 3],                  // ію
        ["\u044F", -1, 3],                        // я
        ["\u043E\u044F", 39, 3],                  // оя
        ["\u0456\u044F", 39, 3],                  // ія
        ["\u0456", -1, 3],                        // і
        ["\u043E\u0432\u0456", 42, 3],            // ові
        ["\u0435\u0457", -1, 3],                  // еї
        ["\u0456\u0457", -1, 3]                   // ії
    ];

    // Tidy-up endings. Result 1: "нн" -> "н"; result 2: "тт" -> "т";
    // result 3: delete; result 4: replace with "іст".
    /** @const */ var a_5 = [
        ["'", -1, 3],                             // '
        ["\u0441\u044C\u043A", -1, 3],            // ськ
        ["\u0456\u0439\u0441\u044C\u043A", 1, 3], // ійськ
        ["\u043D", -1, 1],                        // н
        ["\u0430\u043D", 3, 3],                   // ан
        ["\u0435\u043D", 3, 3],                   // ен
        ["\u0456\u0447\u043D", 3, 3],             // ічн
        ["\u044C\u043D", 3, 3],                   // ьн
        ["\u0442", -1, 2],                        // т
        ["\u0438\u0442", 8, 3],                   // ит
        ["\u043E\u0441\u0442", 8, 4],             // ост
        ["\u044E\u044E\u0442", 8, 3],             // юют
        ["\u0430\u0454\u0442", 8, 3],             // аєт
        ["\u0443\u0454\u0442", 8, 3],             // уєт
        ["\u044E\u0454\u0442", 8, 3],             // юєт
        ["\u044F\u0454\u0442", 8, 3],             // яєт
        ["\u044C", -1, 3]                         // ь
    ];

    /**
     * Matches the entries of a_0 forward from the cursor and succeeds only when
     * the match ends exactly at the limit. "здоров'я" is rewritten to "здор";
     * every other listed word is left as it is.
     *
     * @return {boolean} true when an exception matched up to the limit.
     */
    function r_exception1() {
        var /** number */ among_var;
        base.bra = base.cursor;
        among_var = base.find_among(a_0);
        if (among_var === 0) {
            return false;
        }
        base.ket = base.cursor;
        // A listed word followed by further characters is not an exception.
        if (base.cursor < base.limit) {
            return false;
        }
        if (among_var === 1) {
            if (!base.slice_from("\u0437\u0434\u043E\u0440")) {
                return false;
            }
        }
        return true;
    }

    /**
     * Deletes an adjective ending (a_1) that ends at the cursor.
     *
     * @return {boolean} true when an ending was found and removed.
     */
    function r_adjective() {
        base.ket = base.cursor;
        if (base.find_among_b(a_1) === 0) {
            return false;
        }
        base.bra = base.cursor;
        if (!base.slice_del()) {
            return false;
        }
        return true;
    }

    /**
     * Deletes a reflexive postfix "сь" / "ся" (a_2) that ends at the cursor.
     *
     * @return {boolean} true when a postfix was found and removed.
     */
    function r_postfix() {
        base.ket = base.cursor;
        if (base.find_among_b(a_2) === 0) {
            return false;
        }
        base.bra = base.cursor;
        if (!base.slice_del()) {
            return false;
        }
        return true;
    }

    /**
     * Handles a verb ending (a_3) that ends at the cursor: the "шла", "шов",
     * "шли" and "шло" endings are replaced by "т", all others are deleted.
     *
     * @return {boolean} true when an ending was found and processed.
     */
    function r_verb() {
        var /** number */ among_var;
        base.ket = base.cursor;
        among_var = base.find_among_b(a_3);
        if (among_var === 0) {
            return false;
        }
        base.bra = base.cursor;
        switch (among_var) {
            case 1:
                if (!base.slice_from("\u0442")) {
                    return false;
                }
                break;
            case 2:
                if (!base.slice_del()) {
                    return false;
                }
                break;
        }
        return true;
    }

    /**
     * Handles a noun ending (a_4) that ends at the cursor.
     *
     * @return {boolean} true when an ending was found and processed; false
     *     when nothing matched or a "ята"-type ending is not preceded by "н".
     */
    function r_noun() {
        var /** number */ among_var;
        base.ket = base.cursor;
        among_var = base.find_among_b(a_4);
        if (among_var === 0) {
            return false;
        }
        base.bra = base.cursor;
        switch (among_var) {
            case 1:
                // "ята", "ятами", "ятам": only removed directly after "н".
                // bra was fixed before this check, so the "н" itself is kept.
                if (!(base.eq_s_b("\u043D"))) {
                    return false;
                }
                if (!base.slice_del()) {
                    return false;
                }
                break;
            case 2:
                // "ок" -> "к"
                if (!base.slice_from("\u043A")) {
                    return false;
                }
                break;
            case 3:
                if (!base.slice_del()) {
                    return false;
                }
                break;
        }
        return true;
    }

    /**
     * Final clean-up of the stem using a_5: collapses a doubled "н" or "т",
     * rewrites "ост" to "іст", and deletes the remaining listed endings.
     *
     * @return {boolean} true when an ending was found and processed.
     */
    function r_tidy_up() {
        var /** number */ among_var;
        base.ket = base.cursor;
        among_var = base.find_among_b(a_5);
        if (among_var === 0) {
            return false;
        }
        base.bra = base.cursor;
        switch (among_var) {
            case 1:
                // Final "н" is dropped only when another "н" precedes it.
                if (!(base.eq_s_b("\u043D"))) {
                    return false;
                }
                if (!base.slice_del()) {
                    return false;
                }
                break;
            case 2:
                // Final "т" is dropped only when another "т" precedes it.
                if (!(base.eq_s_b("\u0442"))) {
                    return false;
                }
                if (!base.slice_del()) {
                    return false;
                }
                break;
            case 3:
                if (!base.slice_del()) {
                    return false;
                }
                break;
            case 4:
                // "ост" -> "іст"
                if (!base.slice_from("\u0456\u0441\u0442")) {
                    return false;
                }
                break;
        }
        return true;
    }

    /**
     * Stems the word currently held by the underlying BaseStemmer in place.
     *
     * @return {boolean} always true.
     */
    this.stem = /** @return {boolean} */ function() {
        // Fewer than four positions between cursor and limit: leave untouched.
        if (base.cursor + 4 > base.limit) {
            return true;
        }

        // Whole-word exceptions take precedence over any suffix stripping.
        var /** number */ start = base.cursor;
        if (r_exception1()) {
            return true;
        }
        base.cursor = start;

        // From here on the word is processed backwards from its end.
        base.limit_backward = base.cursor;
        base.cursor = base.limit;

        // Optional reflexive postfix; on failure rewind to the end of the word.
        if (!r_postfix()) {
            base.cursor = base.limit;
        }

        // Try adjective, then verb, then noun endings from the same position;
        // the first routine that succeeds wins. A failing r_noun is ignored.
        var /** number */ fromEnd = base.limit - base.cursor;
        if (!r_adjective()) {
            base.cursor = base.limit - fromEnd;
            if (!r_verb()) {
                base.cursor = base.limit - fromEnd;
                r_noun();
            }
        }

        // Tidy-up always starts again from the (possibly shortened) word end.
        base.cursor = base.limit;
        r_tidy_up();
        base.cursor = base.limit_backward;
        return true;
    };

    /**
     * Convenience wrapper: loads `word`, runs stem() and returns the result.
     *
     * @param {string} word Word to stem.
     * @return {string} The stemmed word.
     */
    this['stemWord'] = function(/**string*/word) {
        base.setCurrent(word);
        this.stem();
        return base.getCurrent();
    };
}

// Publish the constructor. `typeof` is used so that loading this file where
// `window` is not defined (Node, classic workers) does not throw a
// ReferenceError.
if (typeof window !== 'undefined') {
    window['UkrainianStemmer'] = UkrainianStemmer;
}
if (typeof module === 'object' && module !== null && module.exports) {
    module.exports = UkrainianStemmer;
}