diff --git a/README.md b/README.md index 5880dc1..789b804 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Lunr Languages is a [Lunr](http://lunrjs.com/) addon that helps you search in do * ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Tamil * ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/KR.png) Korean * ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/AM.png) Armenian +* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IL.png) Hebrew * [Contribute with a new language](CONTRIBUTING.md) Lunr Languages is compatible with Lunr version `0.6`, `0.7`, `1.0` and `2.X`. diff --git a/build/build.js b/build/build.js index 185618d..139cf39 100644 --- a/build/build.js +++ b/build/build.js @@ -9,7 +9,7 @@ var UglifyJS = require("uglify-js"); // shortcut for minifying a piece of code function compress(orig_code) { - return UglifyJS.minify(orig_code, {fromString: true, comments: true}).code; + return UglifyJS.minify(orig_code, { fromString: true, comments: true }).code; } // take some of the stop words list from the stopwords-filter repo @@ -25,7 +25,7 @@ function wordCharacters(script) { // Now from /[a-z]/ get "a-z" var regexString = charRegex.toString() // Format sanity check - if (regexString.slice(0,2) !== '/[' || regexString.slice(-2) != ']/') { + if (regexString.slice(0, 2) !== '/[' || regexString.slice(-2) != ']/') { console.error('Unexpected regex structure, aborting: ' + regexString); throw Error; } @@ -34,115 +34,117 @@ function wordCharacters(script) { // list mapping between locale, stemmer file, stopwords file, and char pattern var list = [ -{ - locale: 'ar', -}, { - locale: 'hi' -}, { - locale: 'da', - file: 'DanishStemmer.js', - stopwords: stopwordsRepoFolder + 'da.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'nl', - file: 'DutchStemmer.js', - stopwords: stopwordsRepoFolder + 'nl.csv', - wordCharacters: wordCharacters('Latin') -}, { - /* - Kept here to prevent breaking changes. - The correct code for Dutch is NL. - Please do not use "du" anymore, start using "nl". - I will remove "du" next time I'll build a major, backward incompatible package - */ - locale: 'du', - file: 'DutchStemmer.js', - stopwords: stopwordsRepoFolder + 'nl.csv', - wordCharacters: wordCharacters('Latin'), - warningMessage: '[Lunr Languages] Please use the "nl" instead of the "du". The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.' -}, { - locale: 'fi', - file: 'FinnishStemmer.js', - stopwords: stopwordsRepoFolder + 'fn.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'fr', - file: 'FrenchStemmer.js', - stopwords: stopwordsRepoFolder + 'fr.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'de', - file: 'GermanStemmer.js', - stopwords: stopwordsRepoFolder + 'de.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'hu', - file: 'HungarianStemmer.js', - stopwords: stopwordsRepoFolder + 'hu.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'it', - file: 'ItalianStemmer.js', - stopwords: stopwordsRepoFolder + 'it.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'ja' -}, { - locale: 'jp' -}, { - locale: 'kn' -},{ - locale: 'no', - file: 'NorwegianStemmer.js', - stopwords: stopwordsCustomFolder + 'no.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'pt', - file: 'PortugueseStemmer.js', - stopwords: stopwordsRepoFolder + 'pt.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'ro', - file: 'RomanianStemmer.js', - stopwords: stopwordsCustomFolder + 'ro.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'ru', - file: 'RussianStemmer.js', - stopwords: stopwordsCustomFolder + 'ru.csv', - wordCharacters: wordCharacters('Cyrillic') -}, { - locale: 'es', - file: 'SpanishStemmer.js', - stopwords: stopwordsRepoFolder + 'es.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'sa' -},{ - locale: 'sv', - file: 'SwedishStemmer.js', - stopwords: stopwordsCustomFolder + 'sv.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'ta', -},{ - locale: 'te' -},{ - locale: 'tr', - file: 'TurkishStemmer.js', - stopwords: stopwordsCustomFolder + 'tr.csv', - wordCharacters: wordCharacters('Latin') -}, { - locale: 'th', -}, { - locale: 'vi', -}, { - locale: 'zh', -}, { - locale: 'ko', -}, { - locale: 'hy', -} + { + locale: 'ar', + }, { + locale: 'hi' + }, { + locale: 'da', + file: 'DanishStemmer.js', + stopwords: stopwordsRepoFolder + 'da.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'nl', + file: 'DutchStemmer.js', + stopwords: stopwordsRepoFolder + 'nl.csv', + wordCharacters: wordCharacters('Latin') + }, { + /* + Kept here to prevent breaking changes. + The correct code for Dutch is NL. + Please do not use "du" anymore, start using "nl". + I will remove "du" next time I'll build a major, backward incompatible package + */ + locale: 'du', + file: 'DutchStemmer.js', + stopwords: stopwordsRepoFolder + 'nl.csv', + wordCharacters: wordCharacters('Latin'), + warningMessage: '[Lunr Languages] Please use the "nl" instead of the "du". The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.' + }, { + locale: 'fi', + file: 'FinnishStemmer.js', + stopwords: stopwordsRepoFolder + 'fn.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'fr', + file: 'FrenchStemmer.js', + stopwords: stopwordsRepoFolder + 'fr.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'de', + file: 'GermanStemmer.js', + stopwords: stopwordsRepoFolder + 'de.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'hu', + file: 'HungarianStemmer.js', + stopwords: stopwordsRepoFolder + 'hu.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'it', + file: 'ItalianStemmer.js', + stopwords: stopwordsRepoFolder + 'it.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'ja' + }, { + locale: 'jp' + }, { + locale: 'kn' + }, { + locale: 'no', + file: 'NorwegianStemmer.js', + stopwords: stopwordsCustomFolder + 'no.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'pt', + file: 'PortugueseStemmer.js', + stopwords: stopwordsRepoFolder + 'pt.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'ro', + file: 'RomanianStemmer.js', + stopwords: stopwordsCustomFolder + 'ro.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'ru', + file: 'RussianStemmer.js', + stopwords: stopwordsCustomFolder + 'ru.csv', + wordCharacters: wordCharacters('Cyrillic') + }, { + locale: 'es', + file: 'SpanishStemmer.js', + stopwords: stopwordsRepoFolder + 'es.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'sa' + }, { + locale: 'sv', + file: 'SwedishStemmer.js', + stopwords: stopwordsCustomFolder + 'sv.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'ta', + }, { + locale: 'te' + }, { + locale: 'tr', + file: 'TurkishStemmer.js', + stopwords: stopwordsCustomFolder + 'tr.csv', + wordCharacters: wordCharacters('Latin') + }, { + locale: 'th', + }, { + locale: 'vi', + }, { + locale: 'zh', + }, { + locale: 'ko', + }, { + locale: 'hy', + }, { + locale: 'he', + } ]; console.log('Starting building lunr-languages ...'); @@ -151,7 +153,7 @@ var tpl = fs.readFileSync('build/lunr.template', 'utf8'); var cm = fs.readFileSync('build/lunr.comments', 'utf8'); // for each language, start building -for(var i = 0; i < list.length; i++) { +for (var i = 0; i < list.length; i++) { console.log('Building for "' + list[i].locale + '"'); var data; var stopWords; diff --git a/lunr.he.js b/lunr.he.js new file mode 100644 index 0000000..0a1efec --- /dev/null +++ b/lunr.he.js @@ -0,0 +1,144 @@ +/*! + * Lunr languages, `Hebrew` language + * https://github.com/avisaradir/lunr-languages-he + * + * Copyright 2023, Adir Avisar + * http://www.mozilla.org/MPL/ + */ +/*! + * based on + * Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) + * Meryeme Hadni, Abdelmonaime Lachkar, and S. Alaoui Ouatik (2012) + * + * Snowball JavaScript Library v0.3 + * http://code.google.com/p/urim/ + * http://snowball.tartarus.org/ + * + * Copyright 2010, Oleg Mazko + * http://www.mozilla.org/MPL/ + */ + +/** + * export the module via AMD, CommonJS or as a browser global + * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js + */ +; +(function(root, factory) { + if (typeof define === 'function' && define.amd) { + // AMD. Register as an anonymous module. + define(factory) + } else if (typeof exports === 'object') { + /** + * Node. Does not work with strict CommonJS, but + * only CommonJS-like environments that support module.exports, + * like Node. + */ + module.exports = factory() + } else { + // Browser globals (root is window) + factory()(root.lunr); + } +}(this, function() { + /** + * Just return a value to define the module export. + * This example returns an object, but the module + * can return a function as the exported value. + */ + return function(lunr) { + /* throw error if lunr is not yet included */ + if ('undefined' === typeof lunr) { + throw new Error('Lunr is not present. Please include / require Lunr before this script.'); + } + + /* throw error if lunr stemmer support is not yet included */ + if ('undefined' === typeof lunr.stemmerSupport) { + throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.'); + } + + /* register specific locale function */ + lunr.he = function() { + this.pipeline.reset(); + this.pipeline.add( + lunr.he.trimmer, + lunr.he.stopWordFilter, + lunr.he.stemmer + ); + + // for lunr version 2 + // this is necessary so that every searched word is also stemmed before + // in lunr <= 1 this is not needed, as it is done using the normal pipeline + if (this.searchPipeline) { + this.searchPipeline.reset(); + this.searchPipeline.add(lunr.he.stemmer) + } + }; + + /* lunr trimmer function */ + lunr.he.wordCharacters = "\u0590-\u05FF\u05D0-\u05EAa-zA-Za-zA-Z0-90-9"; + lunr.he.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.he.wordCharacters); + + lunr.Pipeline.registerFunction(lunr.he.trimmer, 'trimmer-he'); + + /* lunr stemmer function */ + lunr.he.stemmer = (function() { + var self = this; + var word = ''; + self.result = false; + + self.execArray = [ + 'cleanWord' + ]; + + self.stem = function() { + var counter = 0; + self.result = false; + while (counter < self.execArray.length && self.result != true) { + self.result = self[self.execArray[counter]](); + counter++; + } + } + + self.setCurrent = function(word) { + self.word = word; + } + + self.getCurrent = function() { + return self.word + } + + /*remove elongating character and test that the word does not contain non-hebrew characters. + If the word contains special characters, don't stem. */ + self.cleanWord = function() { + var wordCharacters = "\u0591-\u05F4\u05D0-\u05EA"; + var testRegex = new RegExp("[^" + wordCharacters + "]"); + if (testRegex.test(word)) { + return true; + } + return false; + } + + /* and return a function that stems a word for the current locale */ + return function(token) { + // for lunr version 2 + if (typeof token.update === "function") { + return token.update(function(word) { + self.setCurrent(word); + self.stem(); + return self.getCurrent(); + }) + } else { // for lunr version <= 1 + self.setCurrent(token); + self.stem(); + return self.getCurrent(); + } + + } + })(); + + lunr.Pipeline.registerFunction(lunr.he.stemmer, 'stemmer-he'); + + lunr.he.stopWordFilter = lunr.generateStopWordFilter('אבל או אולי אותו אותי אותך אותם אותן אותנו אז אחר אחרות אחרי אחריכן אחרים אחרת אי איזה איך אין איפה אל אלה אלו אם אנחנו אני אף אפשר את אתה אתכם אתכן אתם אתן באיזה באיזו בגלל בין בלבד בעבור בעזרת בכל בכן בלי במידה במקום שבו ברוב בשביל בשעה ש בתוך גם דרך הוא היא היה היי היכן היתה היתי הם הן הנה הסיבה שבגללה הרי ואילו ואת זאת זה זות יהיה יוכל יוכלו יותר מדי יכול יכולה יכולות יכולים יכל יכלה יכלו יש כאן כאשר כולם כולן כזה כי כיצד כך כל כלל כמו כן כפי כש לא לאו לאיזותך לאן לבין לה להיות להם להן לו לזה לזות לי לך לכם לכן למה למעלה למעלה מ למטה למטה מ למעט למקום שבו למרות לנו לעבר לעיכן לפיכך לפני מאד מאחורי מאיזו סיבה מאין מאיפה מבלי מבעד מדוע מה מהיכן מול מחוץ מי מידע מכאן מכל מכן מלבד מן מנין מסוגל מעט מעטים מעל מצד מקום בו מתחת מתי נגד נגר נו עד עז על עלי עליו עליה עליהם עליך עלינו עם עצמה עצמהם עצמהן עצמו עצמי עצמם עצמן עצמנו פה רק שוב של שלה שלהם שלהן שלו שלי שלך שלכה שלכם שלכן שלנו שם תהיה תחת'.split(' ')); + + lunr.Pipeline.registerFunction(lunr.he.stopWordFilter, 'stopWordFilter-he'); + }; +})) \ No newline at end of file diff --git a/min/lunr.he.min.js b/min/lunr.he.min.js new file mode 100644 index 0000000..b863d3e --- /dev/null +++ b/min/lunr.he.min.js @@ -0,0 +1 @@ +!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.he=function(){this.pipeline.reset(),this.pipeline.add(e.he.trimmer,e.he.stopWordFilter,e.he.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.he.stemmer))},e.he.wordCharacters="֑-״א-תa-zA-Za-zA-Z0-90-9",e.he.trimmer=e.trimmerSupport.generateTrimmer(e.he.wordCharacters),e.Pipeline.registerFunction(e.he.trimmer,"trimmer-he"),e.he.stemmer=function(){var e=this;return e.result=!1,e.preRemoved=!1,e.sufRemoved=!1,e.pre={pre1:"ה ו י ת",pre2:"ב כ ל מ ש כש",pre3:"הב הכ הל המ הש בש לכ",pre4:"וב וכ ול ומ וש",pre5:"מה שה כל",pre6:"מב מכ מל ממ מש",pre7:"בה בו בי בת כה כו כי כת לה לו לי לת",pre8:"ובה ובו ובי ובת וכה וכו וכי וכת ולה ולו ולי ולת"},e.suf={suf1:"ך כ ם ן נ",suf2:"ים ות וך וכ ום ון ונ הם הן יכ יך ינ ים",suf3:"תי תך תכ תם תן תנ",suf4:"ותי ותך ותכ ותם ותן ותנ",suf5:"נו כם כן הם הן",suf6:"ונו וכם וכן והם והן",suf7:"תכם תכן תנו תהם תהן",suf8:"הוא היא הם הן אני אתה את אנו אתם אתן",suf9:"ני נו כי כו כם כן תי תך תכ תם תן",suf10:"י ך כ ם ן נ ת"},e.patterns=JSON.parse('{"hebrewPatterns": [{"pt1": [{"c": "ה", "l": 0}]}, {"pt2": [{"c": "ו", "l": 0}]}, {"pt3": [{"c": "י", "l": 0}]}, {"pt4": [{"c": "ת", "l": 0}]}, {"pt5": [{"c": "מ", "l": 0}]}, {"pt6": [{"c": "ל", "l": 0}]}, {"pt7": [{"c": "ב", "l": 0}]}, {"pt8": [{"c": "כ", "l": 0}]}, {"pt9": [{"c": "ש", "l": 0}]}, {"pt10": [{"c": "כש", "l": 0}]}, {"pt11": [{"c": "בה", "l": 0}]}, {"pt12": [{"c": "וב", "l": 0}]}, {"pt13": [{"c": "וכ", "l": 0}]}, {"pt14": [{"c": "ול", "l": 0}]}, {"pt15": [{"c": "ומ", "l": 0}]}, {"pt16": [{"c": "וש", "l": 0}]}, {"pt17": [{"c": "הב", "l": 0}]}, {"pt18": [{"c": "הכ", "l": 0}]}, {"pt19": [{"c": "הל", "l": 0}]}, {"pt20": [{"c": "המ", "l": 0}]}, {"pt21": [{"c": "הש", "l": 0}]}, {"pt22": [{"c": "מה", "l": 0}]}, {"pt23": [{"c": "שה", "l": 0}]}, {"pt24": [{"c": "כל", "l": 0}]}]}'),e.execArray=["cleanWord","removeDiacritics","removeStopWords","normalizeHebrewCharacters"],e.stem=function(){var r=0;for(e.result=!1,e.preRemoved=!1,e.sufRemoved=!1;r=0)return!0},e.normalizeHebrewCharacters=function(){return e.word=e.word.replace("ך","כ"),e.word=e.word.replace("ם","מ"),e.word=e.word.replace("ן","נ"),e.word=e.word.replace("ף","פ"),e.word=e.word.replace("ץ","צ"),!1},function(r){return"function"==typeof r.update?r.update(function(r){return e.setCurrent(r),e.stem(),e.getCurrent()}):(e.setCurrent(r),e.stem(),e.getCurrent())}}(),e.Pipeline.registerFunction(e.he.stemmer,"stemmer-he"),e.he.stopWordFilter=e.generateStopWordFilter("אבל או אולי אותו אותי אותך אותם אותן אותנו אז אחר אחרות אחרי אחריכן אחרים אחרת אי איזה איך אין איפה אל אלה אלו אם אנחנו אני אף אפשר את אתה אתכם אתכן אתם אתן באיזה באיזו בגלל בין בלבד בעבור בעזרת בכל בכן בלי במידה במקום שבו ברוב בשביל בשעה ש בתוך גם דרך הוא היא היה היי היכן היתה היתי הם הן הנה הסיבה שבגללה הרי ואילו ואת זאת זה זות יהיה יוכל יוכלו יותר מדי יכול יכולה יכולות יכולים יכל יכלה יכלו יש כאן כאשר כולם כולן כזה כי כיצד כך כל כלל כמו כן כפי כש לא לאו לאיזותך לאן לבין לה להיות להם להן לו לזה לזות לי לך לכם לכן למה למעלה למעלה מ למטה למטה מ למעט למקום שבו למרות לנו לעבר לעיכן לפיכך לפני מאד מאחורי מאיזו סיבה מאין מאיפה מבלי מבעד מדוע מה מהיכן מול מחוץ מי מידע מכאן מכל מכן מלבד מן מנין מסוגל מעט מעטים מעל מצד מקום בו מתחת מתי נגד נגר נו עד עז על עלי עליו עליה עליהם עליך עלינו עם עצמה עצמהם עצמהן עצמו עצמי עצמם עצמן עצמנו פה רק שוב של שלה שלהם שלהן שלו שלי שלך שלכה שלכם שלכן שלנו שם תהיה תחת".split(" ")),e.Pipeline.registerFunction(e.he.stopWordFilter,"stopWordFilter-he")}}); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 7c61fca..f85616c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "lunr-languages", - "version": "1.11.0", + "version": "1.12.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "lunr-languages", - "version": "1.11.0", + "version": "1.12.0", "license": "MPL-1.1", "devDependencies": { "@node-rs/jieba": "^1.6.1", diff --git a/test/VersionsAndLanguagesTest.js b/test/VersionsAndLanguagesTest.js index 7517265..65056e6 100644 --- a/test/VersionsAndLanguagesTest.js +++ b/test/VersionsAndLanguagesTest.js @@ -17,10 +17,11 @@ var lunrVersions = [ version: "2.3.5", lunr: "lunr-2.3.5" } - + ]; var testDocuments = { + he: require('./testdata/he'), ar: require('./testdata/ar'), de: require('./testdata/de'), da: require('./testdata/da'), @@ -50,9 +51,9 @@ var testDocuments = { zh: require('./testdata/zh') }; -lunrVersions.forEach(function(lunrVersion) { - describe("Testing Lunr-Languages & Lunr version " + lunrVersion.version, function() { - describe("should be able to correctly identify words in multi-documents scenarios (eg: en + ru)", function() { +lunrVersions.forEach(function (lunrVersion) { + describe("Testing Lunr-Languages & Lunr version " + lunrVersion.version, function () { + describe("should be able to correctly identify words in multi-documents scenarios (eg: en + ru)", function () { delete require.cache[require.resolve('./lunr/' + lunrVersion.lunr)] var lunr = require('./lunr/' + lunrVersion.lunr); require('../lunr.stemmer.support.js')(lunr); @@ -61,50 +62,50 @@ lunrVersions.forEach(function(lunrVersion) { var idxEn = lunr(function () { this.field('body'); - this.add({"body": "Этот текст написан на русском.", "id": 1}); - this.add({"body": "This text is written in the English language.", "id": 2}); + this.add({ "body": "Этот текст написан на русском.", "id": 1 }); + this.add({ "body": "This text is written in the English language.", "id": 2 }); }); var idxRu = lunr(function () { this.use(lunr.ru); this.field('body'); - this.add({"body": "Этот текст написан на русском.", "id": 1}); - this.add({"body": "This text is written in the English language.", "id": 2}); + this.add({ "body": "Этот текст написан на русском.", "id": 1 }); + this.add({ "body": "This text is written in the English language.", "id": 2 }); }); var idxMulti = lunr(function () { this.use(lunr.multiLanguage('en', 'ru')); this.field('body'); - this.add({"body": "Этот текст написан на русском.", "id": 1}); - this.add({"body": "This text is written in the English language.", "id": 2}); + this.add({ "body": "Этот текст написан на русском.", "id": 1 }); + this.add({ "body": "This text is written in the English language.", "id": 2 }); }); - it("should not stem and find 'Русских' in english documents", function() { + it("should not stem and find 'Русских' in english documents", function () { assert.equal(idxEn.search('Русских').length, 0) }); - it("should stem and find 'languages' in english documents", function() { + it("should stem and find 'languages' in english documents", function () { assert.equal(idxEn.search('languages').length, 1) }); - it("should stem and find 'Русских' in russian documents", function() { + it("should stem and find 'Русских' in russian documents", function () { assert.equal(idxRu.search('Русских').length, 1) }); - it("should not stem and find 'languages' in russian documents", function() { + it("should not stem and find 'languages' in russian documents", function () { assert.equal(idxRu.search('languages').length, 0) }); - it("should stem and find 'Русских' in russian+english documents", function() { + it("should stem and find 'Русских' in russian+english documents", function () { assert.equal(idxMulti.search('Русских').length, 1) }); - it("should stem and find 'languages' in russian+english documents", function() { + it("should stem and find 'languages' in russian+english documents", function () { assert.equal(idxMulti.search('languages').length, 1) }); }); - Object.keys(testDocuments).forEach(function(language) { - describe("should be able to correctly find terms in " + language.toUpperCase() + " correctly", function() { + Object.keys(testDocuments).forEach(function (language) { + describe("should be able to correctly find terms in " + language.toUpperCase() + " correctly", function () { // because these tests are asynchronous, we must ensure every load of lunr is fresh // so we do not get the previous used languages on it. // if we don't do this, when we'll run the test for jp, we'll also have da, de, fr, it languages used @@ -122,17 +123,17 @@ lunrVersions.forEach(function(lunrVersion) { var idx = lunr(function () { this.use(lunr[language]); - testDocuments[language].fields.forEach(function(field) { + testDocuments[language].fields.forEach(function (field) { this.field(field.name, field.config) }.bind(this)); - testDocuments[language].documents.forEach(function(doc) { + testDocuments[language].documents.forEach(function (doc) { this.add(doc) }.bind(this)); }); - testDocuments[language].tests.forEach(function(test) { - it("should " + test.what.replace('%w', '"' + test.search + '"'), function() { + testDocuments[language].tests.forEach(function (test) { + it("should " + test.what.replace('%w', '"' + test.search + '"'), function () { assert.equal(idx.search(test.search).length, test.found) }); }.bind(this)); diff --git a/test/testdata/he.js b/test/testdata/he.js new file mode 100644 index 0000000..eb34ef9 --- /dev/null +++ b/test/testdata/he.js @@ -0,0 +1,49 @@ +module.exports = { + fields: [ + { + name: 'title', + config: { boost: 10 } + }, { + name: 'body' + } + ], + documents: [ + { + "title": "בקשה צנועה", + "body": "יש לי בקשה צנועה שתמשיכו להדריך בזה אותנו בעתיד עם ההצעות והתגובות שלכם. תודה רבה", + "id": 1 + } + ], + tests: [ + { + what: "find the word %w", + search: "בקשה", + found: 1 + }, + { + what: "find the word %w", + search: "בזה", + found: 1 + }, + { + what: "never find a word that does not exist, like %w", + search: "לא קיים", + found: 0 + }, + { + what: "find the word %w", + search: "בעתיד", + found: 1 + }, + { + what: "find the word %w", + search: "להדריך", + found: 1 + }, + { + what: "find a phrase that contains both %w", + search: "בקשה צנועה", + found: 1 + } + ] +}