Skip to content

Commit

Permalink
v1.1.3: refactor: start to implement transform stream
Browse files Browse the repository at this point in the history
  • Loading branch information
modestysn committed Mar 17, 2016
1 parent 29e70c1 commit 7a18a6d
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 55 deletions.
4 changes: 1 addition & 3 deletions lib/p2jcmd.js
Original file line number Diff line number Diff line change
Expand Up @@ -335,9 +335,7 @@ let PDFProcessor = (function () {

fs.readdir(inputDir, (err, files) => {
let _iChars = "!@#$%^&*()+=[]\\\';,/{}|\":<>?~`.-_ ";
let pdfFiles = files.filter(function(file) {
return file.substr(-4).toLowerCase() === '.pdf' && _iChars.indexOf(file.substr(0,1)) < 0;
});
let pdfFiles = files.filter( file => file.substr(-4).toLowerCase() === '.pdf' && _iChars.indexOf(file.substr(0,1)) < 0 );

this.inputCount = pdfFiles.length;
if (this.inputCount > 0) {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf2json",
"version": "1.1.2",
"version": "1.1.3",
"description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down
104 changes: 62 additions & 42 deletions pdfparser.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
'use strict';

let nodeUtil = require("util"),
nodeEvents = require("events"),
let fs = require('fs'),
stream = require('stream'),
nodeUtil = require("util"),
_ = require("underscore"),
fs = require('fs'),
async = require("async"),
PDFJS = require("./lib/pdf.js");

Expand All @@ -18,8 +18,13 @@ let PDFParser = (function () {
//private methods, need to be invoked by [funcName].call(this, ...)
let _onPDFJSParseDataReady = function(data) {
if (!data) { //v1.1.2: data===null means end of parsed data
this.emit("pdfParser_dataReady", this);
nodeUtil.p2jinfo("PDF parsing completed.");
this.emit("pdfParser_dataReady", this);
if (typeof this.flushCallback === 'function') {
this.push(this);
this.flushCallback();
this.flushCallback = null;
}
}
else {
Object.assign(this.data, data);
Expand Down Expand Up @@ -74,7 +79,7 @@ let PDFParser = (function () {
// constructor
function PdfParser(context, needRawText) {
//call constructor for super class
nodeEvents.EventEmitter.call(this);
stream.Transform.call(this, {objectMode: true});

// private
let _id = _nextId++;
Expand All @@ -91,53 +96,68 @@ let PDFParser = (function () {
this.PDFJS = new PDFJS(needRawText);
this.processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging

this.fq = async.queue( (task, callback) => {
fs.readFile(task.path, callback);
}, 100);
this.chunks = [];
this.flushCallback = null;
}
// inherit from stream.Transform (which is itself an event emitter)
nodeUtil.inherits(PdfParser, stream.Transform);

//public APIs
this.loadPDF = (pdfFilePath, verbosity) => {
nodeUtil.verbosity(verbosity);
nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);
//implements transform stream
/**
 * Transform hook: collects every incoming chunk in `this.chunks`.
 * Nothing is pushed downstream from here; the collected chunks are
 * concatenated and parsed in `_flush`.
 *
 * @param {Buffer|string} chunk - piece of the PDF binary written to the stream
 * @param {string} enc - encoding of `chunk` when it is a string
 * @param {Function} callback - called with no argument once the chunk is stored,
 *                              or with an error if the chunk cannot be converted
 */
PdfParser.prototype._transform = function (chunk, enc, callback) {
    let piece;
    try {
        // Buffer.from replaces the deprecated `new Buffer(...)`: given a number it
        // throws instead of allocating a buffer of that size.
        piece = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk, enc);
    } catch (err) {
        // Report unusable chunks through the stream's error channel.
        callback(err);
        return;
    }
    this.chunks.push(piece);
    callback();
};

this.pdfFilePath = pdfFilePath;
if (this.processFieldInfoXML) {
this.PDFJS.tryLoadFieldInfoXML(pdfFilePath);
}
/**
 * Flush hook: runs once all input has been written. Stores the completion
 * callback on the instance, then hands the concatenated chunks to
 * `parseBuffer`.
 *
 * @param {Function} callback - stream completion callback, kept in `this.flushCallback`
 */
PdfParser.prototype._flush = function (callback) {
    // Store the callback before parsing starts so it is available when parsing ends.
    this.flushCallback = callback;

    const wholeDocument = Buffer.concat(this.chunks);
    this.parseBuffer(wholeDocument);
};

if (_processBinaryCache.call(this))
return;
// File-read queue. It lives on the prototype, so every parser instance shares it.
// Each task carries a `path`; the queue is created with a concurrency argument of 100.
PdfParser.prototype.fq = async.queue(function readPdfFile(task, callback) {
    fs.readFile(task.path, callback);
}, 100);

this.fq.push({path: pdfFilePath}, _processPDFContent.bind(this));
};
//public APIs
PdfParser.prototype.loadPDF = function(pdfFilePath, verbosity) {
nodeUtil.verbosity(verbosity);
nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);

// Introduce a way to directly process buffers without the need to write it to a temporary file
this.parseBuffer = (pdfBuffer) => {
_startParsingPDF.call(this, pdfBuffer);
};
this.pdfFilePath = pdfFilePath;
if (this.processFieldInfoXML) {
this.PDFJS.tryLoadFieldInfoXML(pdfFilePath);
}

this.getRawTextContent = () => this.PDFJS.getRawTextContent();
this.getAllFieldsTypes = () => this.PDFJS.getAllFieldsTypes(this.data);
this.getMergedTextBlocksIfNeeded = () => this.PDFJS.getMergedTextBlocksIfNeeded();
if (_processBinaryCache.call(this))
return;

this.destroy = () => {
this.removeAllListeners();
this.fq.push({path: pdfFilePath}, _processPDFContent.bind(this));
};

//context object will be set in Web Service project, but not in command line utility
if (this.context) {
this.context.destroy();
this.context = null;
}
/**
 * Parses a PDF that is already in memory, so callers do not need to write
 * it to a temporary file first.
 *
 * @param {Buffer} pdfBuffer - PDF binary content
 */
PdfParser.prototype.parseBuffer = function (pdfBuffer) {
    // _startParsingPDF is a private helper that expects the parser as `this`.
    _startParsingPDF.apply(this, [pdfBuffer]);
};

this.pdfFilePath = null;
this.data = null;
/**
 * Accessors that delegate to the underlying PDFJS instance.
 * Each one returns whatever the matching PDFJS method returns. Two of them
 * lost their `return` when they were moved from arrow functions onto the
 * prototype, so they always yielded `undefined`.
 */

/** @returns {*} result of `PDFJS.getRawTextContent()` */
PdfParser.prototype.getRawTextContent = function () {
    return this.PDFJS.getRawTextContent();
};

/** @returns {*} result of `PDFJS.getAllFieldsTypes(this.data)` */
PdfParser.prototype.getAllFieldsTypes = function () {
    return this.PDFJS.getAllFieldsTypes(this.data);
};

/** @returns {*} result of `PDFJS.getMergedTextBlocksIfNeeded()` */
PdfParser.prototype.getMergedTextBlocksIfNeeded = function () {
    return this.PDFJS.getMergedTextBlocksIfNeeded();
};

this.PDFJS.destroy();
this.PDFJS = null;
};
}
// inherit from event emitter
nodeUtil.inherits(PdfParser, nodeEvents.EventEmitter);
/**
 * Releases everything the parser holds: event listeners, the optional
 * context object, buffered chunks, parsed data and the PDFJS instance.
 * Safe to call more than once; a repeated call no longer throws on the
 * already-cleared PDFJS reference.
 */
PdfParser.prototype.destroy = function () {
    this.removeAllListeners();

    //context object will be set in Web Service project, but not in command line utility
    if (this.context) {
        this.context.destroy();
        this.context = null;
    }

    this.pdfFilePath = null;
    this.data = null;
    this.chunks = null;

    // PDFJS is null after the first destroy(); guard so a second call is a no-op.
    if (this.PDFJS) {
        this.PDFJS.destroy();
        this.PDFJS = null;
    }
};

return PdfParser;
})();
Expand Down
18 changes: 9 additions & 9 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ To Run in RESTful Web Service or as Commandline Utility

## Code Example

* Parse a PDF then write to a JSON file:
* Parse a PDF file then write to a JSON file:

```javascript

Expand Down Expand Up @@ -105,26 +105,26 @@ To Run in RESTful Web Service or as Commandline Utility
## API Reference

* events:
** pdfParser_dataError: will be raised when parsing failed
** pdfParser_dataReady: when parsing successfully completed with data
* pdfParser_dataError: will be raised when parsing failed
* pdfParser_dataReady: when parsing succeeded

* start to parse PDF file from specified file path asynchronously:

````javascript
function loadPDF(pdfFilePath);

````
If parsing fails, the "pdfParser_dataError" event is raised with an error object;
If parsing succeeds, the "pdfParser_dataReady" event is raised with the output data object, which can be saved as a JSON file (in command line) or serialized to JSON when running in a web service.

* Get all textual content from "pdfParser_dataReady" event handler:

````javascript
function getRawTextContent();

````
Returns the text as a string.

* Get all input fields information from "pdfParser_dataReady" event handler:

````javascript
function getAllFieldsTypes();
````
returns an array of field objects.

## Output format Reference
Expand Down

0 comments on commit 7a18a6d

Please sign in to comment.