From d03b4b4438a0e1ab9ce8ba595be0d46d5e047b3c Mon Sep 17 00:00:00 2001 From: spencer kelly Date: Mon, 29 Oct 2018 15:57:41 -0400 Subject: [PATCH] move encoding to wtf --- README.md | 4 ++- package-lock.json | 30 ++++++++++---------- package.json | 4 +-- scratch.js | 8 ++++-- src/01-prepwork.js | 3 ++ src/worker/02-parseWiki.js | 1 - src/worker/_encode.js | 56 -------------------------------------- 7 files changed, 28 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index a238bc5..f9b7c17 100644 --- a/README.md +++ b/README.md @@ -179,13 +179,15 @@ let obj={ return { _id: doc.title(), //for duplicate-detection title: doc.title(), //for the logger.. - sections: doc.sections().map(i => i.json()), + sections: doc.sections().map(i => i.json({encode:true})), categories: doc.categories() //whatever you want! } } } dumpster(obj, () => console.log('custom wikipedia!') ) ``` +if you're using any `.json()` methods, pass a `{encode:true}` in to avoid mongo complaints about key-names. + * **non-main namespaces:** do you want to parse all the navboxes? change `namespace` in ./config.js to [another number](https://en.wikipedia.org/wiki/Wikipedia:Namespace) diff --git a/package-lock.json b/package-lock.json index abd65ef..e30ac50 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "dumpster-dive", - "version": "4.0.0", + "version": "4.0.1", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -100,9 +100,9 @@ "dev": true }, "cross-fetch": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-2.2.2.tgz", - "integrity": "sha1-pH/09/xxLauo9qaVoRyUhEDUVyM=", + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-2.2.3.tgz", + "integrity": "sha512-PrWWNH3yL2NYIb/7WF/5vFG3DCQiXDOVf8k3ijatbrtnwNuhMWLC7YF7uqf53tbTFDzHIUD8oITw4Bxt8ST3Nw==", "requires": { "node-fetch": "2.1.2", "whatwg-fetch": "2.0.4" @@ -470,11 +470,11 @@ "dev": true }, "mongodb": { - "version": "3.1.6", - "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-3.1.6.tgz", - "integrity": "sha512-E5QJuXQoMlT7KyCYqNNMfAkhfQD79AT4F8Xd+6x37OX+8BL17GyXyWvfm6wuyx4wnzCCPoCSLeMeUN2S7dU9yw==", + "version": "3.1.8", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-3.1.8.tgz", + "integrity": "sha512-yNKwYxQ6m00NV6+pMoWoheFTHSQVv1KkSrfOhRDYMILGWDYtUtQRqHrFqU75rmPIY8hMozVft8zdC4KYMWaM3Q==", "requires": { - "mongodb-core": "3.1.5", + "mongodb-core": "3.1.7", "safe-buffer": "^5.1.2" }, "dependencies": { @@ -486,9 +486,9 @@ } }, "mongodb-core": { - "version": "3.1.5", - "resolved": "https://registry.npmjs.org/mongodb-core/-/mongodb-core-3.1.5.tgz", - "integrity": "sha512-emT/tM4ZBinqd6RZok+EzDdtN4LjYJIckv71qQVOEFmvXgT5cperZegVmTgox/1cx4XQu6LJ5ZuIwipP/eKdQg==", + "version": "3.1.7", + "resolved": "https://registry.npmjs.org/mongodb-core/-/mongodb-core-3.1.7.tgz", + "integrity": "sha512-YffpSrLmgFNmrvkGx+yX00KyBNk64C0BalfEn6vHHkXtcMUGXw8nxrMmhq5eXPLLlYeBpD/CsgNxE2Chf0o4zQ==", "requires": { "bson": "^1.1.0", "require_optional": "^1.0.1", @@ -1065,11 +1065,11 @@ "dev": true }, "wtf_wikipedia": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/wtf_wikipedia/-/wtf_wikipedia-6.1.0.tgz", - "integrity": "sha512-KsGXmsqwxcGgeWcIsojTLJhExx3pYkY2TPVIRxbNF/3hmH7u5bV4C8AN5aiCVPzmr0pI0sABSdt77neFydrbRA==", + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/wtf_wikipedia/-/wtf_wikipedia-6.2.0.tgz", + "integrity": "sha512-flTL95xVC7myhDA5TaLXmxg9CMkzjpEMUUmOnALoaSZIzGPfWECGVt/l/NlmB87SDfvn+QvBxGqaZEBLnkf9rA==", "requires": { - "cross-fetch": "2.2.2" + "cross-fetch": "2.2.3" } }, "xregexp": { diff --git a/package.json b/package.json index 903ff1e..922daf2 100644 --- a/package.json +++ b/package.json @@ -22,11 +22,11 @@ "dependencies": { "chalk": "2.4.1", "jsonfn": "^0.31.0", - "mongodb": "3.1.6", + "mongodb": "3.1.8", "prettysize": "1.1.0", "sunday-driver": "1.0.1", "worker-nodes": "1.6.1", - "wtf_wikipedia": "6.1.0", + "wtf_wikipedia": "6.2.0", "yargs": "12.0.2" }, "devDependencies": { diff --git a/scratch.js b/scratch.js index f1872f3..56b5874 100644 --- a/scratch.js +++ b/scratch.js @@ -3,7 +3,8 @@ const drop = require('./src/lib/drop-db'); //144mb → 2.5 minutes = 57mb per worker per minute // const path = '/home/spencer/mountain/dumpster-dive/tests/tinywiki-latest-pages-articles.xml'; -const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/enwiki-latest-pages-articles.xml'; +// const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/enwiki-latest-pages-articles.xml'; +const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml' // const path = './tests/smallwiki-latest-pages-articles.xml'; //3s // const path = './tests/tinywiki-latest-pages-articles.xml'; //2s const dbName = path.match(/\/([a-z-]+)-latest-pages/)[1]; @@ -17,8 +18,9 @@ let options = { // verbose: true, // verbose_skip: true, // batch_size: 1 - skip_redirects: true, - skip_disambig: true, + // skip_redirects: true, + // skip_disambig: true, + missing_templates: true // workers: 1 // custom: function(doc) { // return { diff --git a/src/01-prepwork.js b/src/01-prepwork.js index c5bb5b4..aa20d7c 100644 --- a/src/01-prepwork.js +++ b/src/01-prepwork.js @@ -32,6 +32,9 @@ const prepWork = function(options) { options.dbName = options.db options.workers = options.workers || cpuCount options.batch_size = options.batch_size || config.batch_size + if (options.encode === undefined) { + options.encode = true + } //some top-level logging process.on('unhandledRejection', function(up) { console.log(chalk.red('--uncaught top-process error--')) diff --git a/src/worker/02-parseWiki.js b/src/worker/02-parseWiki.js index aae265e..0272f62 100644 --- a/src/worker/02-parseWiki.js +++ b/src/worker/02-parseWiki.js @@ -39,7 +39,6 @@ const parseWiki = function(page, options, worker) { } else { //DIY format data = options.custom(doc); } - data = encode.encodeData(data); //use the title/pageID from the xml data.title = page.title || data.title; data.pageID = page.pageID || data.pageID; diff --git a/src/worker/_encode.js b/src/worker/_encode.js index 7339cc5..7cd5b83 100644 --- a/src/worker/_encode.js +++ b/src/worker/_encode.js @@ -1,11 +1,5 @@ // mongo has some opinions about what characters are allowed as keys and ids. //https://stackoverflow.com/questions/12397118/mongodb-dot-in-key-name/30254815#30254815 -const specialChar = /[\\\.$]/; - -const isObject = function(x) { - return (typeof x === 'object') && (x !== null); -}; - const encodeStr = function(str) { if (typeof str !== 'string') { str = ''; @@ -16,56 +10,6 @@ const encodeStr = function(str) { .replace(/\./g, '\\u002e'); }; -const encodeObj = function( obj = {} ) { - let keys = Object.keys(obj); - for(let i = 0; i < keys.length; i += 1) { - if (specialChar.test(keys[i]) === true) { - let str = encodeStr(keys[i]); - if (str !== keys[i]) { - obj[str] = obj[keys[i]]; - delete obj[keys[i]]; - } - } - } - return obj; -}; - -//tables & infoboxes & citations could potentially have unsafe keys -const encodeData = function(data) { - data = data || {}; - //cleanup forbidden object key names in mongo - if (data.sections && data.sections.length > 0) { - data.sections.forEach(s => { - //encode keys in templates - if (s.templates) { - s.templates = s.templates.map(tmpl => { - tmpl = encodeObj(tmpl); - //try encoding these, too - if (tmpl.data && isObject(tmpl.data)) { - tmpl.data = encodeObj(tmpl.data); - } - return tmpl; - }); - } - //infoboxes have their stuff here - if (s.infoboxes) { - s.infoboxes = s.infoboxes.map(info => { - info = encodeObj(info); - return info; - }); - } - //encode keys in tables - if (s.tables && s.tables.length > 0) { - s.tables = s.tables.map(table => { - return table.map((row) => encodeObj(row)); - }); - } - }); - } - return data; -}; - module.exports = { - encodeData: encodeData, encodeStr: encodeStr };