From cfe890ef24047ad41004aac4cd5e58cb2739d5e5 Mon Sep 17 00:00:00 2001 From: "modesty.zhang" Date: Thu, 17 Mar 2016 21:53:46 -0700 Subject: [PATCH] v1.1.4: complete transform stream --- base/core/worker.js | 5 +- lib/p2jcmd.js | 140 ++++++++++++++++++++++++++++++++++++-------- lib/pdf.js | 8 +-- package.json | 2 +- pdfparser.js | 33 ++++++++--- readme.md | 38 +++++++++--- 6 files changed, 175 insertions(+), 51 deletions(-) diff --git a/base/core/worker.js b/base/core/worker.js index e1b22535..d98ba770 100755 --- a/base/core/worker.js +++ b/base/core/worker.js @@ -375,8 +375,9 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = { var start = Date.now(); page.extractTextContent().then(function(textContent) { promise.resolve(textContent); - log('text indexing: page=%d - time=%dms', pageNum, - Date.now() - start); + //MQZ 03/17/2016 comment out log + //log('text indexing: page=%d - time=%dms', pageNum, + // Date.now() - start); }, function (e) { // Skip errored pages promise.reject(e); diff --git a/lib/p2jcmd.js b/lib/p2jcmd.js index 03475fd9..130eb659 100644 --- a/lib/p2jcmd.js +++ b/lib/p2jcmd.js @@ -1,9 +1,10 @@ 'use strict'; let nodeUtil = require("util"), + stream = require('stream'), fs = require('fs'), path = require('path'), - _ = require('underscore'), + _ = require('underscore'), PDFParser = require("../pdfparser"), pkInfo = require('../package.json'), async = require("async"); @@ -27,10 +28,17 @@ let optimist = require('optimist') .alias('c', 'content') .describe('c', '(optional) when specified, will generate .content.txt that includes text content from PDF.\n') .alias('m', 'merge') - .describe('m', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF (Experimental).\n'); + .describe('m', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF (Experimental).\n') + .alias('r', 'stream') + .describe('r', '(optional) when specified, will process 
and parse with buffer/object transform stream rather than file system (Experimental).\n'); +const argv = optimist.argv; +const VERBOSITY_LEVEL = (_.has(argv, 's') ? 0 : 5); -let argv = optimist.argv; +const PROCESS_RAW_TEXT_CONTENT = _.has(argv, 'c'); +const PROCESS_FIELDS_CONTENT = _.has(argv, 't'); +const PROCESS_MERGE_BROKEN_TEXT_BLOCKS = _.has(argv, 'm'); +const PROCESS_WITH_STREAM = _.has(argv, 'r'); let PDF2JSONUtil = (function () { @@ -71,10 +79,8 @@ let PDF2JSONUtil = (function () { }; - let _writeOneJSON = function(data, callback) { - let pJSON = JSON.stringify({"formImage":data}); - - fs.writeFile(this.outputPath, pJSON, err => { + let _writeOneJSON = function(pJSON, callback) { + fs.writeFile(this.outputPath, JSON.stringify(pJSON), err => { if(err) { console.warn(this.inputFile + " => " + this.outputFile + " Exception: " + err); this.curProcessor.failedCount++; @@ -88,12 +94,11 @@ let PDF2JSONUtil = (function () { }; let _writeOneJSONWithMergedTextBlocks = function(data, callback) { - data.Pages = this.pdfParser.getMergedTextBlocksIfNeeded(); - let pJSON = JSON.stringify({"formImage":data}); + let pJSON = this.pdfParser.getMergedTextBlocksIfNeeded(); let outputPath = this.outputPath.replace(".json", ".merged.json"); let contentFile = this.outputFile.replace(".json", ".merged.json"); - fs.writeFile(outputPath, pJSON, err => { + fs.writeFile(outputPath, JSON.stringify(pJSON), err => { if (err) { console.warn(err); } else { @@ -104,20 +109,19 @@ let PDF2JSONUtil = (function () { }; let _parseOnePDF = function(callback) { - let processRawTextContent = _.has(argv, 'c'); - this.pdfParser = new PDFParser(null, processRawTextContent); + this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT); this.pdfParser.on("pdfParser_dataReady", evtData => { - if ((!!evtData) && (!!evtData.data)) { - let outputTasks = [cbFunc => _writeOneJSON.call(this, evtData.data, cbFunc)]; - if (_.has(argv, 't')) {//needs to generate fields.json file - outputTasks.push(cbFunc => 
_generateFieldsTypesFile.call(this, evtData.data, cbFunc)); + if ((!!evtData) && (!!evtData.formImage)) { + let outputTasks = [cbFunc => _writeOneJSON.call(this, evtData, cbFunc)]; + if (PROCESS_FIELDS_CONTENT) {//needs to generate fields.json file + outputTasks.push(cbFunc => _generateFieldsTypesFile.call(this, evtData, cbFunc)); } - if (processRawTextContent) {//needs to generate content.txt file - outputTasks.push(cbFunc => _generateRawTextContentFile.call(this, evtData.data, cbFunc)); + if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file + outputTasks.push(cbFunc => _generateRawTextContentFile.call(this, evtData, cbFunc)); } - if (_.has(argv, 'm')) {//needs to generate json file with merged broken text blocks - outputTasks.push(cbFunc => _writeOneJSONWithMergedTextBlocks.call(this, evtData.data, cbFunc)); + if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks + outputTasks.push(cbFunc => _writeOneJSONWithMergedTextBlocks.call(this, evtData, cbFunc)); } async.series(outputTasks, function(err, results){ @@ -143,10 +147,95 @@ let PDF2JSONUtil = (function () { }); console.log("\nTranscoding " + this.inputFile + " to - " + this.outputPath); - this.pdfParser.loadPDF(this.inputPath, (_.has(argv, 's') ? 
0 : 5)); + this.pdfParser.loadPDF(this.inputPath, VERBOSITY_LEVEL); }; - // constructor + function StringifyStream(){ + stream.Transform.call(this); + + this._readableState.objectMode = false; + this._writableState.objectMode = true; + } + nodeUtil.inherits(StringifyStream, stream.Transform); + + StringifyStream.prototype._transform = function(obj, encoding, cb){ + this.push(JSON.stringify(obj)); + cb(); + }; + + let _createOutputStream = function(outputPath, callback) { + let outputStream = fs.createWriteStream(outputPath); + outputStream.on('finish', () => { + callback(null, outputPath); + }); + outputStream.on('error', err => { + callback({"streamError": err}, outputPath); + }); + + return outputStream; + }; + + let _generateMergedTextBlocksStream = function(callback) { + let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".merged.json"), callback); + this.pdfParser.getMergedTextBlocksStream().pipe(new StringifyStream()).pipe(outputStream); + }; + + let _generateRawTextContentStream = function(callback) { + let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".content.txt"), callback); + this.pdfParser.getRawTextContentStream().pipe(outputStream); + }; + + let _generateFieldsTypesStream = function(callback) { + let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".fields.json"), callback); + this.pdfParser.getAllFieldsTypesStream().pipe(new StringifyStream()).pipe(outputStream); + }; + + let _parseOnePDFStream = function(callback) { + this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT); + + this.pdfParser.on("pdfParser_dataError", evtData => { + this.curProcessor.failedCount++; + let errMsg = "Exception: " + evtData.data; + _continue.call(this, callback, errMsg); + }); + + let outputStream = fs.createWriteStream(this.outputPath); + outputStream.on('finish', () => { + console.log("Primary stream OK: [" + this.inputPath + "] => [" + this.outputPath + "]"); + 
this.curProcessor.successCount++; + + let outputTasks = []; + if (PROCESS_FIELDS_CONTENT) {//needs to generate fields.json file + outputTasks.push(cbFunc => _generateFieldsTypesStream.call(this, cbFunc)); + } + if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file + outputTasks.push(cbFunc => _generateRawTextContentStream.call(this, cbFunc)); + } + if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks + outputTasks.push(cbFunc => _generateMergedTextBlocksStream.call(this, cbFunc)); + } + + if (outputTasks.length > 0) { + async.series(outputTasks, function (err, results) { + if (err) { + console.error("Additional streams Error: " + err); + } else { + console.log("Additional streams OK: \n", results); + } + _continue.call(this, callback); + }); + } + else { + _continue.call(this, callback); + } + }); + + console.log("\nTranscoding " + this.inputFile + " to - " + this.outputPath); + let inputStream = fs.createReadStream(this.inputPath, {bufferSize: 64 * 1024}); + inputStream.pipe(this.pdfParser).pipe(new StringifyStream()).pipe(outputStream); + }; + + // constructor let cls = function (inputDir, inputFile, curProcessor) { // public, this instance copies this.inputDir = path.normalize(inputDir); @@ -217,8 +306,11 @@ let PDF2JSONUtil = (function () { if (!!validateMsg) { _continue.call(this, callback, validateMsg); } - else { - _parseOnePDF.call(this, callback); + else if (PROCESS_WITH_STREAM) { + _parseOnePDFStream.call(this, callback); + } + else { + _parseOnePDF.call(this, callback); } }; diff --git a/lib/pdf.js b/lib/pdf.js index 5b704573..f2b989d0 100644 --- a/lib/pdf.js +++ b/lib/pdf.js @@ -456,12 +456,6 @@ let PDFJSClass = (function () { let preT = decodeURIComponent(prevText.R[0].T); let curT = decodeURIComponent(text.R[0].T); - //let distance = Math.abs(text.x - prevText.x - prevText.w); - //let textSize = PDFFont.getFontSize(prevText); - //let spaceWidth = text.sw; - //let threshHold = 
PDFFont.getSpaceThreshHold(prevText); - //console.log(`\ndistance=${distance}\tthreshHold=${threshHold}\ttextSize=${textSize}\tspaceWidth=${spaceWidth}`); - prevText.R[0].T += text.R[0].T; prevText.w += text.w; text.merged = true; @@ -481,7 +475,7 @@ let PDFJSClass = (function () { page.Texts = page.Texts.filter( t => !t.merged); } - return this.pages; + return {Pages:this.pages, Width: this.pageWidth}; }; cls.prototype.destroy = function() { diff --git a/package.json b/package.json index 091abd48..b2c5eb09 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdf2json", - "version": "1.1.3", + "version": "1.1.4", "description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js", "keywords": [ "pdf", diff --git a/pdfparser.js b/pdfparser.js index e97b2597..79e83818 100644 --- a/pdfparser.js +++ b/pdfparser.js @@ -19,9 +19,10 @@ let PDFParser = (function () { let _onPDFJSParseDataReady = function(data) { if (!data) { //v1.1.2: data===null means end of parsed data nodeUtil.p2jinfo("PDF parsing completed."); - this.emit("pdfParser_dataReady", this); + let output = {"formImage": this.data}; + this.emit("pdfParser_dataReady", output); if (typeof this.flushCallback === 'function') { - this.push(this); + this.push(output); this.flushCallback(); this.flushCallback = null; } @@ -32,8 +33,8 @@ let PDFParser = (function () { }; let _onPDFJSParserDataError = function(data) { - this.data = data; - this.emit("pdfParser_dataError", this); + this.data = null; + this.emit("pdfParser_dataError", {"parserError": data}); }; let _startParsingPDF = function(buffer) { @@ -76,10 +77,17 @@ let PDFParser = (function () { } }; + let _createContentStream = function(jsonObj) { + let rStream = new stream.Readable({objectMode: true}); + rStream.push(jsonObj); + rStream.push(null); + return rStream; + }; + // constructor function PdfParser(context, needRawText) { //call constructor for super class - 
stream.Transform.call(this, {objectMode: true}); + stream.Transform.call(this, {objectMode: true, bufferSize: 64 * 1024}); // private let _id = _nextId++; @@ -118,8 +126,12 @@ let PDFParser = (function () { }, 100); //public APIs + PdfParser.prototype.setVerbosity = function(verbosity) { + nodeUtil.verbosity(verbosity || 0); + }; + PdfParser.prototype.loadPDF = function(pdfFilePath, verbosity) { - nodeUtil.verbosity(verbosity); + this.setVerbosity(verbosity); nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath); this.pdfFilePath = pdfFilePath; @@ -139,8 +151,13 @@ let PDFParser = (function () { }; PdfParser.prototype.getRawTextContent = function() { return this.PDFJS.getRawTextContent(); }; - PdfParser.prototype.getAllFieldsTypes = function() { this.PDFJS.getAllFieldsTypes(this.data); }; - PdfParser.prototype.getMergedTextBlocksIfNeeded = function() { this.PDFJS.getMergedTextBlocksIfNeeded(); }; + PdfParser.prototype.getRawTextContentStream = function() { return _createContentStream(this.getRawTextContent()); }; + + PdfParser.prototype.getAllFieldsTypes = function() { return this.PDFJS.getAllFieldsTypes(); }; + PdfParser.prototype.getAllFieldsTypesStream = function() { return _createContentStream(this.getAllFieldsTypes()); }; + + PdfParser.prototype.getMergedTextBlocksIfNeeded = function() { return {"formImage": this.PDFJS.getMergedTextBlocksIfNeeded()}; }; + PdfParser.prototype.getMergedTextBlocksStream = function() { return _createContentStream(this.getMergedTextBlocksIfNeeded()); }; PdfParser.prototype.destroy = function() { this.removeAllListeners(); diff --git a/readme.md b/readme.md index f88d877f..cd2a0ea6 100644 --- a/readme.md +++ b/readme.md @@ -30,7 +30,7 @@ To Run in RESTful Web Service or as Commandline Utility pdfParser.on("pdfParser_dataError", errData => console.error(errData) ); pdfParser.on("pdfParser_dataReady", pdfData => { - let pJSON = JSON.stringify({"formImage": pdfData.data}); + let pJSON = JSON.stringify(pdfData); 
fs.writeFile("./pdf2json/test/F1040EZ.json", pJSON, (err) => { if(err) { @@ -63,7 +63,7 @@ To Run in RESTful Web Service or as Commandline Utility pdfParser.on("pdfParser_dataError", errData => console.error(errData) ); pdfParser.on("pdfParser_dataReady", pdfDta => { - fs.writeFile("./pdf2json/test/F1040EZ.content.txt", pdfParser.getRawTextContent(), (err) => { + fs.writeFile("./pdf2json/test/F1040EZ.content.txt", pdfParser.getRawTextContent(), err => { if(err) { console.error("parsing error: ", err); } @@ -101,6 +101,22 @@ To Run in RESTful Web Service or as Commandline Utility pdfParser.loadPDF("./pdf2json/test/pdf/fd/form/F1040EZ.pdf"); ```` + +Alternatively, you can pipe input and output streams: (requires v1.1.4) + +````javascript + let fs = require('fs'), + PDFParser = require("./pdf2json/PDFParser"); + + let pdfParser = new PDFParser(); + + let inputStream = fs.createReadStream("./pdf2json/test/pdf/fd/form/F1040EZ.pdf", {bufferSize: 64 * 1024}); + let outputStream = fs.createWriteStream("./pdf2json/test/target/fd/form/F1040EZ.json"); + + inputStream.pipe(pdfParser).pipe(new StringifyStream()).pipe(outputStream); +```` +See [p2jcmd.js](https://github.com/modesty/pdf2json/blob/master/lib/p2jcmd.js) for more details. + ## API Reference @@ -112,8 +128,8 @@ To Run in RESTful Web Service or as Commandline Utility ````javascript function loadPDF(pdfFilePath); ```` -If failed, event "pdfParser_dataError" will be raised with error object; -If success, event "pdfParser_dataReady" will be raised with output data object, which can be saved as json file (in command line) or serialized to json when running in web service. +If failed, event "pdfParser_dataError" will be raised with error object: {"parserError": errObj}; +If success, event "pdfParser_dataReady" will be raised with output data object: {"formImage": parseOutput}, which can be saved as json file (in command line) or serialized to json when running in web service. 
* Get all textual content from "pdfParser_dataReady" event handler: ````javascript @@ -759,11 +775,11 @@ Some testing PDFs are provided by bug reporters, like the "unsupported encryptio ```` -## Upgrading to ~v1.0.x +## Upgrade to ~v1.x.x If you have an early version of pdf2json, please remove your local `node_modules` directory and re-run `npm install` to upgrade to pdf2json@1.0.x. -v1.0.x upgraded dependency packages, removed some unnecessary dependencies, started to assumes ES6 / ES2015 with node ~v4.x. More PDFs are added for unit testing. +v1.x.x upgraded dependency packages, removed some unnecessary dependencies, started to assume ES6 / ES2015 with node ~v4.x. More PDFs are added for unit testing. **Note:** pdf2json has been in production for over 3 years, it's pretty reliable and solid when parsing hundreds (sometimes tens of thousands) of PDF forms every day, thanks to everybody's help. @@ -778,16 +794,20 @@ In order to support this auto merging capability, text block objects have an add **Breaking Changes:** -v1.0.8 fixed [issue 27](https://github.com/modesty/pdf2json/issues/27), it converts x coordinate with the same ratio as y, which is 24 (96/4), rather than 8.7 (96/11), please adjust client renderer accordingly when position all elements' x coordinate. +* v1.0.8 fixed [issue 27](https://github.com/modesty/pdf2json/issues/27), it converts x coordinate with the same ratio as y, which is 24 (96/4), rather than 8.7 (96/11), please adjust client renderer accordingly when positioning all elements' x coordinates. +* v1.1.4 unified event data structure: **this only affects code that handles these top-level events; nothing changes if you use the command line** + * event "pdfParser_dataError": {"parserError": errObj} + * event "pdfParser_dataReady": {"formImage": parseOutput} + ### Install on Ubuntu * Make sure nodejs is installed. Detailed installation steps can be found at http://stackoverflow.com/a/16303380/433814. 
-``` +```` $ nodejs --version v0.10.22 -``` +```` * Create a symbolic link from node to nodejs