Skip to content

Commit

Permalink
move encoding to wtf
Browse files Browse the repository at this point in the history
  • Loading branch information
spencermountain committed Oct 29, 2018
1 parent c3f4222 commit d03b4b4
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 78 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,15 @@ let obj={
return {
_id: doc.title(), //for duplicate-detection
title: doc.title(), //for the logger..
sections: doc.sections().map(i => i.json()),
sections: doc.sections().map(i => i.json({encode:true})),
categories: doc.categories() //whatever you want!
}
}
}
dumpster(obj, () => console.log('custom wikipedia!') )
```
If you're using any `.json()` methods, pass in `{encode:true}` to avoid MongoDB complaints about key names.
* **non-main namespaces:**
Do you want to parse all the navboxes? Change `namespace` in ./config.js to [another number](https://en.wikipedia.org/wiki/Wikipedia:Namespace).
Expand Down
30 changes: 15 additions & 15 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
"dependencies": {
"chalk": "2.4.1",
"jsonfn": "^0.31.0",
"mongodb": "3.1.6",
"mongodb": "3.1.8",
"prettysize": "1.1.0",
"sunday-driver": "1.0.1",
"worker-nodes": "1.6.1",
"wtf_wikipedia": "6.1.0",
"wtf_wikipedia": "6.2.0",
"yargs": "12.0.2"
},
"devDependencies": {
Expand Down
8 changes: 5 additions & 3 deletions scratch.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ const drop = require('./src/lib/drop-db');

//144mb → 2.5 minutes = 57mb per worker per minute
// const path = '/home/spencer/mountain/dumpster-dive/tests/tinywiki-latest-pages-articles.xml';
const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/enwiki-latest-pages-articles.xml';
// const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/enwiki-latest-pages-articles.xml';
const path = '/Users/spencer/data/wikipedia/enwiki-latest-pages-articles.xml'
// const path = './tests/smallwiki-latest-pages-articles.xml'; //3s
// const path = './tests/tinywiki-latest-pages-articles.xml'; //2s
const dbName = path.match(/\/([a-z-]+)-latest-pages/)[1];
Expand All @@ -17,8 +18,9 @@ let options = {
// verbose: true,
// verbose_skip: true,
// batch_size: 1
skip_redirects: true,
skip_disambig: true,
// skip_redirects: true,
// skip_disambig: true,
missing_templates: true
// workers: 1
// custom: function(doc) {
// return {
Expand Down
3 changes: 3 additions & 0 deletions src/01-prepwork.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ const prepWork = function(options) {
options.dbName = options.db
options.workers = options.workers || cpuCount
options.batch_size = options.batch_size || config.batch_size
if (options.encode === undefined) {
options.encode = true
}
//some top-level logging
process.on('unhandledRejection', function(up) {
console.log(chalk.red('--uncaught top-process error--'))
Expand Down
1 change: 0 additions & 1 deletion src/worker/02-parseWiki.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ const parseWiki = function(page, options, worker) {
} else { //DIY format
data = options.custom(doc);
}
data = encode.encodeData(data);
//use the title/pageID from the xml
data.title = page.title || data.title;
data.pageID = page.pageID || data.pageID;
Expand Down
56 changes: 0 additions & 56 deletions src/worker/_encode.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
// mongo has some opinions about what characters are allowed as keys and ids.
//https://stackoverflow.com/questions/12397118/mongodb-dot-in-key-name/30254815#30254815
// matches any string containing a backslash, a dot, or a dollar sign.
// (no `g` flag, so repeated `.test()` calls do not carry `lastIndex` state)
const specialChar = /[\\\.$]/;

/**
 * Tell whether a value is a non-null object (arrays included).
 * @param {*} x - any value
 * @returns {boolean} true when `typeof x` is 'object' and x is not null
 */
const isObject = function(x) {
  if (x === null) {
    return false;
  }
  return typeof x === 'object';
};

const encodeStr = function(str) {
if (typeof str !== 'string') {
str = '';
Expand All @@ -16,56 +10,6 @@ const encodeStr = function(str) {
.replace(/\./g, '\\u002e');
};

/**
 * Rename, in place, every own enumerable key of `obj` that matches
 * `specialChar`, using `encodeStr` to produce the replacement key.
 * Only the top level of the object is touched; values are left as-is.
 * @param {Object} [obj={}] - object whose keys should be made safe
 * @returns {Object} the same object that was passed in (mutated)
 */
const encodeObj = function(obj = {}) {
  // Object.keys() is a snapshot, so adding/removing keys below is safe
  Object.keys(obj).forEach((key) => {
    if (!specialChar.test(key)) {
      return;
    }
    const safeKey = encodeStr(key);
    if (safeKey === key) {
      return;
    }
    // move the value over to the encoded key, then drop the unsafe one
    obj[safeKey] = obj[key];
    delete obj[key];
  });
  return obj;
};

/**
 * Clean up object key names in a parsed document, section by section.
 * Tables, infoboxes and templates could potentially have unsafe keys, so
 * each of their objects is passed through `encodeObj`. The document is
 * modified in place.
 * @param {Object} [data] - document with an optional `sections` array
 * @returns {Object} the same document (or a fresh `{}` when `data` is falsy)
 */
const encodeData = function(data) {
  const doc = data || {};
  const sections = doc.sections;
  // nothing to encode without at least one section
  if (!sections || !(sections.length > 0)) {
    return doc;
  }

  // a template's own keys, plus the keys of its `data` object when present
  const encodeTemplate = (tmpl) => {
    const safe = encodeObj(tmpl);
    if (safe.data && isObject(safe.data)) {
      safe.data = encodeObj(safe.data);
    }
    return safe;
  };
  // infoboxes have their stuff at the top level
  const encodeInfobox = (info) => encodeObj(info);
  // a table is a list of row objects
  const encodeTable = (table) => table.map((row) => encodeObj(row));

  sections.forEach((section) => {
    if (section.templates) {
      section.templates = section.templates.map((tmpl) => encodeTemplate(tmpl));
    }
    if (section.infoboxes) {
      section.infoboxes = section.infoboxes.map((info) => encodeInfobox(info));
    }
    if (section.tables && section.tables.length > 0) {
      section.tables = section.tables.map((table) => encodeTable(table));
    }
  });
  return doc;
};

// exposes the document-level key encoder and the single-string encoder
module.exports = {
encodeData: encodeData,
encodeStr: encodeStr
};

0 comments on commit d03b4b4

Please sign in to comment.