Skip to content

Commit

Permalink
feat: add variations (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
awxiaoxian2020 committed Sep 13, 2023
1 parent 67dfedd commit 0680529
Show file tree
Hide file tree
Showing 12 changed files with 17,013 additions and 11,176 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

高频词汇的释义经过了人工初步校对,其他单词选取使用频率总和大于 50% 的释义(数据来自 [the little dict](http://louischeung.top:225/mdict%E8%AF%8D%E5%85%B8%E5%8C%85/The%20Little%20Dict/)),可以保证一定的准确性。减轻不必要的机械记忆负担。

每个单词有异形词(即考纲当中有多种写法的单词)的,计划将其在后面列出,以保证原始数据的准确性。[这个 PR](https://github.com/awxiaoxian2020/NETEMVocabulary/pull/14)
每个单词有异形词(即考纲当中有多种写法的单词)的,一并列出,以保证原始数据的准确性。目前根据[这个数据](https://github.com/awxiaoxian2020/spelling-variations/blob/dev/src/bydictionary.json)进行了初步填充,后续有空再与考纲校对。

目前正在开发对应的跨端小程序,见 [develop 分支](https://github.com/awxiaoxian2020/NETEMVocabulary/tree/develop)

Expand Down
1 change: 1 addition & 0 deletions scripts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**/node_modules/
66 changes: 66 additions & 0 deletions scripts/spelling-variations/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import SpellingVariations from "./lib/index.js";
import mysql from "mysql2";

// Settings for the MySQL database that stores the vocabulary table.
const dbConfig = {
  host: "127.0.0.1",
  user: "root",
  password: "root",
  database: "netem",
};

// Create the database connection used by the rest of this script.
const connection = mysql.createConnection(dbConfig);

// Connect to the database and report the outcome on the console.
connection.connect((connectError) => {
  if (!connectError) {
    console.log("已成功连接到数据库");
    return;
  }
  console.error("无法连接到数据库:", connectError);
});

/**
 * Closes the shared database connection and logs whether closing succeeded.
 */
function closeConnection() {
  connection.end((endErr) => {
    if (endErr) {
      console.error("关闭数据库连接时出错:", endErr);
    } else {
      console.log("已成功关闭数据库连接");
    }
  });
}

// Read every word from the `vocabulary` table, compute its spelling variants
// and store them (comma-separated) in the `variant` column of the same row.
connection.query("SELECT word FROM vocabulary", (err, records) => {
  if (err) {
    console.error("查询数据库时出错:", err);
    // Close the connection on the failure path too; otherwise the open
    // socket keeps the script from exiting.
    closeConnection();
    return;
  }

  // Each row is expected to expose the selected `word` column.
  for (const { word } of records) {
    const result = new SpellingVariations(word).analyze();
    if (!result.hasVariations) continue;

    // De-duplicate the variants and drop the word itself before joining.
    const uniqueVariants = [
      ...new Set(result.variations.filter((variant) => variant !== word)),
    ].join(", ");

    const updateQuery = `UPDATE \`vocabulary\` SET \`variant\` = ? WHERE \`word\` = ?`;
    const query = connection.query(
      updateQuery,
      [uniqueVariants, word],
      (updateErr) => {
        if (updateErr) {
          console.error(`更新单词 ${word} 的变体时出错: ${updateErr}`);
        }
      }
    );
    console.log('sql是', query.sql);
  }

  // Requested after all UPDATEs were issued; NOTE(review): relies on the
  // driver running end() behind the already-queued queries, as the
  // mysql/mysql2 documentation describes — confirm for the installed version.
  closeConnection();
});
14 changes: 7 additions & 7 deletions scripts/spelling-variations/lib/bypattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,12 @@ const patterns = [
},
];

/**
 * Derives spelling-variation slots for a word from the first pattern whose
 * regex matches it.
 *
 * The returned array is sparse: the original word is written at every index
 * listed in the pattern's `originalIndex`, and the rewritten word (the word
 * with `regex` replaced by `replacementString`) at every index listed in
 * `replacementIndex`. Replacement slots are written last, so they win if an
 * index appears in both lists.
 *
 * @param {string} word - the word to test against the patterns
 * @returns {Array<string>|null} the sparse slot array, or null when no pattern matches
 */
export default function bypattern(word) {
  const pattern = patterns.find(({ regex }) => regex.test(word));
  if (!pattern) return null;

  const replacement = word.replace(pattern.regex, pattern.replacementString);
  const result = [];
  for (const index of pattern.originalIndex) {
    result[index] = word;
  }
  for (const index of pattern.replacementIndex) {
    result[index] = replacement;
  }
  return result;
}
259 changes: 157 additions & 102 deletions scripts/spelling-variations/lib/index.js
Original file line number Diff line number Diff line change
@@ -1,106 +1,161 @@
const bydictionary = require('./bydictionary.json');
const bypattern = require('./bypattern.js');

// Constructor: analyses the word once and keeps the result on `this.data`.
const spellingVariations = function (word) {
  this.data = analyse(word);
};

// Plain accessors. Each method returns the same-named field of `this.data`:
//   scoreUK         {Number}  how common this variation is in UK texts (1-0)
//   scoreUS         {Number}  how common this variation is in US texts (1-0)
//   hasVariations   {Boolean} the word has variations
//   USVariations    {Array}   US variations of the word
//   UKVariations    {Array}   UK variations of the word
//   UKPrefered      {String}  UK's preferred variation
//   USPrefered      {String}  US's preferred variation
//   variations      {Array}   all of the word's variations
//   commonVariation {String}  UK and US common variation
const accessorFields = [
  "scoreUK",
  "scoreUS",
  "hasVariations",
  "USVariations",
  "UKVariations",
  "UKPrefered",
  "USPrefered",
  "variations",
  "commonVariation",
];
for (const field of accessorFields) {
  spellingVariations.prototype[field] = function () {
    return this.data[field];
  };
}

// @return {String} the word converted to its UK spelling (the word itself if none is preferred)
spellingVariations.prototype.toUK = function () {
  const preferred = this.data.UKPrefered;
  return preferred ? preferred : this.data.word;
};

// @return {String} the word converted to its US spelling (the word itself if none is preferred)
spellingVariations.prototype.toUS = function () {
  const preferred = this.data.USPrefered;
  return preferred ? preferred : this.data.word;
};

// @return {Object} the complete analysis result
spellingVariations.prototype.analyse = function () {
  return this.data;
};

// US-spelling alias of analyse()
spellingVariations.prototype.analyze = function () {
  return this.data;
};


/**
*
* This little guy here is actually the one who does all the heavy
* lifting of finding the variations and the class and such..
*
**/
function analyse(word) {

word = (word || "").toLowerCase();

const result = {
word,
scoreUK:-1,
scoreUS:-1,
hasVariations:false,
UKPrefered:word,
USPrefered:word,
commonVariation:word,
UKVariations:[],
USVariations:[],
variations:[],
analyse:analyse,
analyze:analyse
};

var resultArr = [];
var dictionaryEntry = bydictionary[word];
var patternEntry = bypattern(word);
if(dictionaryEntry) resultArr = dictionaryEntry.split("|");
else if(patternEntry) resultArr = patternEntry;
else return result;

// resultArr reference:
// 0: UK1 4: US1
// 1: UK2 5: US2
// 2: UK3 6: US3
// 3: UK4 7: US4 8:UKUS


result.hasVariations = true;
result.variations = filterOut(resultArr,word);
result.UKPrefered = resultArr[0];
result.USPrefered = resultArr[4];
result.commonVariation = resultArr[8] || "";
result.UKVariations = resultArr.filter((e,i)=>e&&(i<4||i===8)&&e!==word);
result.USVariations = resultArr.filter((e,i)=>e&&(i>3||i===8)&&e!==word);

if(resultArr.indexOf(word) === 8) {
result.scoreUK = 0.87;
result.scoreUS = 0.87;
}

else {
var UKi = resultArr.slice(0,4).indexOf(word);
var USi = resultArr.slice(4,8).indexOf(word);

if(UKi === -1) result.scoreUK = 0;
else result.scoreUK = (4-UKi)*0.25;

if(USi === -1) result.scoreUS = 0;
else result.scoreUS = (4-USi)*0.25;
}

return result;
import bypattern from './bypattern.js';
import fs from 'fs';

import path from 'path';
import { exit } from 'process';
import { fileURLToPath } from 'url';

// Absolute file path of this module, derived from its import.meta.url.
const __filename = fileURLToPath(import.meta.url);

// Directory containing this module; used below to locate bydictionary.json.
const __dirname = path.dirname(__filename);

/**
 * Synchronously loads a UTF-8 JSON file and returns the parsed value.
 * If the file cannot be read or parsed, the error is logged and the
 * process is terminated via exit(-1).
 *
 * @param {string} filePath - path of the JSON file to load
 * @returns {*} the parsed JSON content
 */
function readJsonFile(filePath) {
  let parsed;
  try {
    const rawText = fs.readFileSync(filePath, 'utf8');
    parsed = JSON.parse(rawText);
  } catch (error) {
    console.error(`Error reading JSON file: ${error}`);
    exit(-1);
  }
  return parsed;
}

function filterOut(arr,word){
return arr.filter((x)=>x&&x!==word);
// Dictionary data parsed once at module load from bydictionary.json next to this file;
// the class below looks words up in it and splits each entry on "|".
const bydictionary = readJsonFile(path.join(__dirname,'bydictionary.json'));

/**
 * Analyses the UK/US spelling variations of a single word.
 *
 * The analysis runs once in the constructor and is stored on `this.data`;
 * the instance methods are accessors over that result.
 */
class SpellingVariations {
  /**
   * @param {string} word - the word to analyse (lower-cased before lookup)
   */
  constructor(word) {
    this.data = SpellingVariations.analyseWord(word);
  }

  // @return {Number} how common this variation is in the UK's texts (1-0)
  scoreUK() {
    return this.data.scoreUK;
  }

  // @return {Number} how common this variation is in the US's texts (1-0)
  scoreUS() {
    return this.data.scoreUS;
  }

  // @return {Boolean} the word has variations
  hasVariations() {
    return this.data.hasVariations;
  }

  // @return {Array} US variations of the word
  USVariations() {
    return this.data.USVariations;
  }

  // @return {Array} UK variations of the word
  UKVariations() {
    return this.data.UKVariations;
  }

  // @return {String} UK's preferred variation
  UKPreferred() {
    return this.data.UKPreferred;
  }

  // @return {String} US's preferred variation
  USPreferred() {
    return this.data.USPreferred;
  }

  // @return {Array} All of the word's variations
  variations() {
    return this.data.variations;
  }

  // @return {String} UK and US common variation
  commonVariation() {
    return this.data.commonVariation;
  }

  // @return {String} converts the word spelling to its UK variant
  toUK() {
    return this.data.UKPreferred || this.data.word;
  }

  // @return {String} converts the word spelling to its US variant
  toUS() {
    return this.data.USPreferred || this.data.word;
  }

  /**
   * Without an argument, returns the analysis stored for this instance.
   * With a word, returns a fresh analysis of that word.
   *
   * Previously the class declared `analyse` twice, so the worker version
   * silently replaced this getter and `analyse()` re-analysed `undefined`.
   *
   * @param {string} [word] - optional word to analyse instead of the stored one
   * @return {Object} all the info above
   */
  analyse(word) {
    if (word === undefined && this.data !== undefined) return this.data;
    return SpellingVariations.analyseWord(word);
  }

  // a US alias for reading the stored analysis :)
  analyze() {
    return this.data;
  }

  /**
   * Does the heavy lifting: finds the variations of a word and scores it.
   * Uses no instance state, so it can also be called detached (the result
   * object exposes it as `analyse` / `analyze`).
   *
   * @param {string} word - the word to analyse; falsy values are treated as ""
   * @return {Object} the analysis result
   */
  static analyseWord(word) {
    const key = (word || "").toLowerCase();

    const result = {
      word: key,
      scoreUK: -1,
      scoreUS: -1,
      hasVariations: false,
      UKPreferred: key,
      USPreferred: key,
      commonVariation: key,
      UKVariations: [],
      USVariations: [],
      variations: [],
      analyse: SpellingVariations.analyseWord,
      analyze: SpellingVariations.analyseWord,
    };

    // Own-property lookup only: a word such as "constructor" must not pick up
    // members inherited from Object.prototype.
    const dictionaryEntry = Object.prototype.hasOwnProperty.call(bydictionary, key)
      ? bydictionary[key]
      : undefined;

    let resultArr;
    if (typeof dictionaryEntry === "string" && dictionaryEntry) {
      resultArr = dictionaryEntry.split("|");
    } else {
      // Fall back to pattern-based detection when the dictionary has no entry.
      resultArr = bypattern(key);
      if (!resultArr) return result;
    }

    // resultArr reference:
    // 0: UK1     4: US1
    // 1: UK2     5: US2
    // 2: UK3     6: US3
    // 3: UK4     7: US4     8: UKUS

    result.hasVariations = true;
    result.variations = resultArr.filter((entry) => entry && entry !== key);
    result.UKPreferred = resultArr[0];
    result.USPreferred = resultArr[4];
    result.commonVariation = resultArr[8] || "";
    result.UKVariations = resultArr.filter((e, i) => e && (i < 4 || i === 8) && e !== key);
    result.USVariations = resultArr.filter((e, i) => e && (i > 3 || i === 8) && e !== key);

    if (resultArr.indexOf(key) === 8) {
      // The word is the shared UK/US form.
      result.scoreUK = 0.87;
      result.scoreUS = 0.87;
    } else {
      // Score by rank within each region's four slots: 1, 0.75, 0.5, 0.25, or 0 if absent.
      const UKi = resultArr.slice(0, 4).indexOf(key);
      const USi = resultArr.slice(4, 8).indexOf(key);
      result.scoreUK = UKi === -1 ? 0 : (4 - UKi) * 0.25;
      result.scoreUS = USi === -1 ? 0 : (4 - USi) * 0.25;
    }

    return result;
  }

  /**
   * @param {Array} arr - candidate entries
   * @param {string} word - entry to exclude
   * @return {Array} the truthy entries of arr that differ from word
   */
  filterOut(arr, word) {
    return arr.filter((x) => x && x !== word);
  }
}

module.exports = spellingVariations;
export default SpellingVariations;
Loading

0 comments on commit 0680529

Please sign in to comment.