diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index aa68ef4..defda12 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -23,16 +23,10 @@ jobs: strategy: fail-fast: false matrix: - php-version: ["7.2", "7.3", "7.4", "8.0", "8.1", "8.2"] + php-version: ["8.0", "8.1", "8.2"] experimental: [false] os: [ubuntu-latest] coverage-extension: [pcov] - include: - #- { php-version: '5.3', experimental: false, os: ubuntu-latest, coverage-extension: 'xdebug' } - #- { php-version: '5.4', experimental: false, os: ubuntu-latest, coverage-extension: 'xdebug' } - - { php-version: '5.5', experimental: false, os: ubuntu-latest, coverage-extension: 'xdebug' } - - { php-version: '5.6', experimental: false, os: ubuntu-latest, coverage-extension: 'xdebug' } - - { php-version: '7.1', experimental: false, os: ubuntu-latest, coverage-extension: 'xdebug' } steps: - uses: actions/checkout@v4 - name: Use php ${{ matrix.php-version }} diff --git a/.gitignore b/.gitignore index 291bb86..4dd7d5e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,10 +7,14 @@ **/.vagrant **/auth.json **/nbproject +**/temp.php +**/test.php .phpdoc .phpunit.cache .phpunit.result.cache composer.lock +ecs.php phpunit.xml +rector.php target vendor diff --git a/README.md b/README.md index cd256bc..2ade32a 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ All artifacts are generated in the target directory. Examples are located in the `example` directory. 
-Start a development server (requires PHP 5.4) using the command: +Start a development server (requires PHP 8.0+) using the command: ``` make server @@ -78,7 +78,7 @@ Create a composer.json in your projects root-directory: ```json { "require": { - "tecnickcom/tc-lib-pdf-parser": "^2.3" + "tecnickcom/tc-lib-pdf-parser": "^3.0.0" } } ``` @@ -86,7 +86,7 @@ Create a composer.json in your projects root-directory: Or add to an existing project with: ```bash -composer require tecnickcom/tc-lib-pdf-parser ^2.3 +composer require tecnickcom/tc-lib-pdf-parser ^3.0.0 ``` diff --git a/VERSION b/VERSION index bd7d922..75a22a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.4.33 +3.0.3 diff --git a/composer.json b/composer.json index f77e1a9..a20e13f 100644 --- a/composer.json +++ b/composer.json @@ -18,15 +18,15 @@ } ], "require": { - "php": ">=5.4", + "php": ">=8.0", "ext-pcre": "*", - "tecnickcom/tc-lib-pdf-filter": "^1.4" + "tecnickcom/tc-lib-pdf-filter": "^2.0" }, "require-dev": { "pdepend/pdepend": "2.13.0", "phpmd/phpmd": "2.13.0", - "phpunit/phpunit": "10.1.2 || 9.6.7 || 8.5.31 || 7.5.20 || 6.5.14 || 5.7.27 || 4.8.36", - "squizlabs/php_codesniffer": "3.7.2 || 2.9.2" + "phpunit/phpunit": "10.1.2 || 9.6.13", + "squizlabs/php_codesniffer": "3.7.2" }, "autoload": { "psr-4": { diff --git a/example/index.php b/example/index.php index d81fff8..a256057 100644 --- a/example/index.php +++ b/example/index.php @@ -14,7 +14,7 @@ */ // autoloader when using Composer -require ('../vendor/autoload.php'); +require(__DIR__ . '/../vendor/autoload.php'); // autoloader when using RPM or DEB package installation //require ('/usr/share/php/Com/Tecnick/Pdf/Parser/autoload.php'); @@ -22,10 +22,13 @@ $filename = '../resources/test/example_036.pdf'; $rawdata = file_get_contents($filename); if ($rawdata === false) { - die('Unable to get the content of the file: '.$filename); + die('Unable to get the content of the file: ' . 
$filename); } + // configuration parameters for parser -$cfg = array('ignore_filter_errors' => true); +$cfg = [ + 'ignore_filter_errors' => true, +]; // parse PDF data $pdf = new \Com\Tecnick\Pdf\Parser\Parser($cfg); diff --git a/phpstan.neon b/phpstan.neon index c42b364..bf592ce 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -1,5 +1,5 @@ parameters: - level: 5 + level: max paths: - src - test diff --git a/resources/debian/control b/resources/debian/control index c2dae97..b945776 100644 --- a/resources/debian/control +++ b/resources/debian/control @@ -10,6 +10,6 @@ Vcs-Git: https://github.com/~#VENDOR#~/~#PROJECT#~.git Package: ~#PKGNAME#~ Provides: php-~#PROJECT#~ Architecture: all -Depends: php (>= 5.4.0), php-tecnickcom-tc-lib-pdf-filter (<< 2.0.0), php-tecnickcom-tc-lib-pdf-filter (>= 1.4.37), ${misc:Depends} +Depends: php (>= 8.0.0), php-tecnickcom-tc-lib-pdf-filter (<< 3.0.0), php-tecnickcom-tc-lib-pdf-filter (>= 2.0.6), ${misc:Depends} Description: PHP PDF Parser Library PHP library to parse PDF documents. 
diff --git a/resources/rpm/rpm.spec b/resources/rpm/rpm.spec index 9c73116..c7cc9b1 100644 --- a/resources/rpm/rpm.spec +++ b/resources/rpm/rpm.spec @@ -16,9 +16,9 @@ URL: https://github.com/%{gh_owner}/%{gh_project} BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-%(%{__id_u} -n) BuildArch: noarch -Requires: php(language) >= 5.4.0 +Requires: php(language) >= 8.0.0 -Requires: php-composer(%{c_vendor}/tc-lib-pdf-filter) < 2.0.0 +Requires: php-composer(%{c_vendor}/tc-lib-pdf-filter) < 3.0.0 -Requires: php-composer(%{c_vendor}/tc-lib-pdf-filter) >= 1.4.37 +Requires: php-composer(%{c_vendor}/tc-lib-pdf-filter) >= 2.0.6 Requires: php-pcre Provides: php-composer(%{c_vendor}/%{gh_project}) = %{version} diff --git a/src/Exception.php b/src/Exception.php index 87b33d2..fd1b123 100644 --- a/src/Exception.php +++ b/src/Exception.php @@ -3,13 +3,13 @@ /** * Exception.php * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. 
*/ @@ -21,13 +21,13 @@ * * Custom Exception class * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser */ class Exception extends \Exception { diff --git a/src/Parser.php b/src/Parser.php index 1ea82db..cc2e68f 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -3,19 +3,20 @@ /** * Parser.php * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. */ namespace Com\Tecnick\Pdf\Parser; +use Com\Tecnick\Pdf\Filter\Filter; use Com\Tecnick\Pdf\Parser\Exception as PPException; /** @@ -23,36 +24,42 @@ * * PHP class for parsing PDF documents. 
* - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * + * @SuppressWarnings(PHPMD.ExcessiveClassComplexity) */ class Parser extends \Com\Tecnick\Pdf\Parser\Process\Xref { /** * Array of configuration parameters. * - * @var array + * @var array */ - private $cfg = array( - 'ignore_filter_errors' => false, - ); + private array $cfg = [ + 'ignore_filter_errors' => false, + ]; /** * Initialize the PDF parser * - * @param array $cfg Array of configuration parameters: - * 'ignore_filter_decoding_errors' : if true ignore filter decoding errors; - * 'ignore_missing_filter_decoders' : if true ignore missing filter decoding errors. + * @param array $cfg Array of configuration parameters: + * 'ignore_filter_errors' : + * if true, exceptions raised + * by the stream filters are + * not rethrown and the + * filters are returned + * along with the stream. */ - public function __construct($cfg = array()) + public function __construct(array $cfg = []) { if (isset($cfg['ignore_filter_errors'])) { - $this->cfg['ignore_filter_errors'] = (bool)$cfg['ignore_filter_errors']; + $this->cfg['ignore_filter_errors'] = $cfg['ignore_filter_errors']; } } @@ -60,31 +67,84 @@ public function __construct($cfg = array()) * Parse a PDF document into an array of objects * * @param string $data PDF data to parse. 
+ * + * @return array{ + * 0: array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * }, + * 1: array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>>, + * } */ - public function parse($data) + public function parse(string $data): array { - if (empty($data)) { + if ($data === '') { throw new PPException('Empty PDF data.'); } + // find the pdf header starting position if (($trimpos = strpos($data, '%PDF-')) === false) { throw new PPException('Invalid PDF data: missing %PDF header.'); } + // get PDF content string $this->pdfdata = substr($data, $trimpos); // get xref and trailer data $this->xref = $this->getXrefData(); // parse all document objects - $this->objects = array(); + $this->objects = []; foreach ($this->xref['xref'] as $obj => $offset) { - if (!isset($this->objects[$obj]) && ($offset > 0)) { - // decode objects with positive offset - $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true); + if (isset($this->objects[$obj])) { + continue; } + + if ($offset <= 0) { + continue; + } + + // decode objects with positive offset + $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true); } + // release some memory unset($this->pdfdata); - return array($this->xref, $this->objects); + return [$this->xref, $this->objects]; } /** @@ -94,24 +154,56 @@ public function parse($data) * @param int $offset Object offset. * @param bool $decoding If true decode streams. * - * @return array Object data. 
+ * @return array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> Object data. */ - protected function getIndirectObject($obj_ref, $offset = 0, $decoding = true) + protected function getIndirectObject(string $obj_ref, int $offset = 0, bool $decoding = true): array { $obj = explode('_', $obj_ref); if (($obj == false) || (count($obj) != 2)) { throw new PPException('Invalid object reference: ' . serialize($obj)); } + $objref = $obj[0] . ' ' . $obj[1] . ' obj'; // ignore leading zeros $offset += strspn($this->pdfdata, '0', $offset); if (strpos($this->pdfdata, $objref, $offset) != $offset) { - $offset++; + ++$offset; if (strpos($this->pdfdata, $objref, $offset) != $offset) { // an indirect reference to an undefined object shall be considered a reference to the null object - return array('null', 'null', $offset); + return [['null', 'null', $offset]]; } } + // starting position of object content $offset += strlen($objref); // return raw object content @@ -121,19 +213,49 @@ protected function getIndirectObject($obj_ref, $offset = 0, $decoding = true) /** * Get content of indirect object. * - * @param int $offset Object offset. - * @param bool $decoding If true decode streams. + * @param int $offset Object offset. + * @param bool $decoding If true decode streams. * - * @return array Object data. + * @return array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> Object data. 
*/ - protected function getRawIndirectObject($offset, $decoding) + protected function getRawIndirectObject(int $offset, bool $decoding): array { // get array of object content - $objdata = array(); + $objdata = []; $idx = 0; // object main index do { $oldoffset = $offset; - // get element + $element = $this->getRawObject($offset); $offset = $element[2]; // decode stream using stream's dictionary information @@ -142,14 +264,19 @@ protected function getRawIndirectObject($offset, $decoding) && ($element[0] == 'stream') && (isset($objdata[($idx - 1)][0])) && ($objdata[($idx - 1)][0] == '<<') + && (is_array($objdata[($idx - 1)][1])) + && (is_string($element[1])) ) { $element[3] = $this->decodeStream($objdata[($idx - 1)][1], $element[1]); } + $objdata[$idx] = $element; ++$idx; } while (($element[0] != 'endobj') && ($offset != $oldoffset)); + // remove closing delimiter array_pop($objdata); + // return raw object content return $objdata; } @@ -157,43 +284,146 @@ protected function getRawIndirectObject($offset, $decoding) /** * Get the content of object, resolving indect object reference if necessary. * - * @param array $obj Object value. + * @param array{ + * 0: string, + * 1: string|array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * } $obj Object value. * - * @return array Object data. + * @return array{ + * 0: string, + * 1: string|array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * } Object data. 
*/ - protected function getObjectVal($obj) + protected function getObjectVal(array $obj): array { - if ($obj[0] == 'objref') { + if (($obj[0] == 'objref') && is_string($obj[1])) { // reference to indirect object - if (isset($this->objects[$obj[1]])) { + if (isset($this->objects[$obj[1]][0])) { // this object has been already parsed - return $this->objects[$obj[1]]; - } elseif (isset($this->xref[$obj[1]])) { + return $this->objects[$obj[1]][0]; + } + + if (isset($this->xref['xref'][$obj[1]])) { // parse new object - $this->objects[$obj[1]] = $this->getIndirectObject($obj[1], $this->xref[$obj[1]], false); - return $this->objects[$obj[1]]; + $this->objects[$obj[1]] = $this->getIndirectObject($obj[1], $this->xref['xref'][$obj[1]], false); + return $this->objects[$obj[1]][0]; } } + return $obj; } /** * Decode the specified stream. * - * @param array $sdic Stream's dictionary array. - * @param string $stream Stream to decode. + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sdic Stream's dictionary array. + * @param string $stream Stream to decode. + * + * @return array{ + * 0: string, + * 1: array, + * } Decoded stream data and remaining filters. * - * @return array Decoded stream data and remaining filters. + * @SuppressWarnings(PHPMD.CyclomaticComplexity) */ - protected function decodeStream($sdic, $stream) + protected function decodeStream(array $sdic, string $stream): array { // get stream length and filters $slength = strlen($stream); if ($slength <= 0) { - return array('', array()); + return ['', []]; } - $filters = array(); + + $filters = []; foreach ($sdic as $key => $val) { + if (! 
is_string($val[1])) { + continue; + } + if ($val[0] == '/') { if (($val[1] == 'Length') && (isset($sdic[($key + 1)])) && ($sdic[($key + 1)][0] == 'numeric')) { // get declared stream length @@ -203,21 +433,52 @@ protected function decodeStream($sdic, $stream) } } } + return $this->getDecodedStream($filters, $stream); } /** * Get Filters * - * @param string $stream Stream - * @param int $slength Stream length - * @param array $sdic Stream's dictionary array. - * @param int $key Index + * @param string $stream Stream + * @param int $slength Stream length + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sdic Stream's dictionary array. + * @param int $key Index */ - protected function getDeclaredStreamLength(&$stream, &$slength, $sdic, $key) + protected function getDeclaredStreamLength(string &$stream, int &$slength, array $sdic, int $key): void { // get declared stream length - $declength = intval($sdic[($key + 1)][1]); + $declength = (int) $sdic[($key + 1)][1]; if ($declength < $slength) { $stream = substr($stream, 0, $declength); $slength = $declength; @@ -227,52 +488,108 @@ protected function getDeclaredStreamLength(&$stream, &$slength, $sdic, $key) /** * Get Filters * - * @param array $filters Array of Filters - * @param array $sdic Stream's dictionary array. - * @param int $key Index + * @param array $filters Array of Filters + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sdic Stream's dictionary array. 
+ * @param int $key Index * - * @return array Array of filters + * @return array Array of filters */ - protected function getFilters($filters, $sdic, $key) + protected function getFilters(array $filters, array $sdic, int $key): array { // resolve indirect object $objval = $this->getObjectVal($sdic[($key + 1)]); - if ($objval[0] == '/') { - // single filter - $filters[] = $objval[1]; - } elseif ($objval[0] == '[') { - // array of filters - foreach ($objval[1] as $flt) { - if ($flt[0] == '/') { + + switch ($objval[0]) { + case '/': + // single filter + if (is_string($objval[1])) { + $filters[] = $objval[1]; + } + + break; + case '[': + if (! is_array($objval[1])) { + break; + } + + foreach ($objval[1] as $flt) { + if (! is_array($flt)) { + continue; + } + + if ($flt[0] != '/') { + continue; + } + + if (! is_string($flt[1])) { + continue; + } + $filters[] = $flt[1]; } - } + + break; } + return $filters; } /** * Decode the specified stream. * - * @param array $filters Array of decoding filters to apply - * @param string $stream Stream to decode. + * @param array $filters Array of decoding filters to apply + * @param string $stream Stream to decode. * - * @return array Decoded stream data and remaining filters. + * @return array{ + * 0: string, + * 1: array, + * } Decoded stream data and remaining filters. 
*/ - protected function getDecodedStream($filters, $stream) + protected function getDecodedStream(array $filters, string $stream): array { // decode the stream - $errorfilters = array(); + $errorfilters = []; try { - $filter = new \Com\Tecnick\Pdf\Filter\Filter(); + $filter = new Filter(); $stream = $filter->decodeAll($filters, $stream); - } catch (\Com\Tecnick\Pdf\Filter\Exception $e) { + } catch (\Com\Tecnick\Pdf\Filter\Exception $exception) { if ($this->cfg['ignore_filter_errors']) { $errorfilters = $filters; } else { - throw new PPException($e->getMessage()); + throw new PPException($exception->getMessage()); } } - return array($stream, $errorfilters); + + return [$stream, $errorfilters]; } } diff --git a/src/Process/RawObject.php b/src/Process/RawObject.php index 7bdd433..9e15462 100644 --- a/src/Process/RawObject.php +++ b/src/Process/RawObject.php @@ -3,61 +3,137 @@ /** * RawObject.php * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. 
*/ namespace Com\Tecnick\Pdf\Parser\Process; -use Com\Tecnick\Pdf\Parser\Exception as PPException; - /** * Com\Tecnick\Pdf\Parser\Process\RawObject * * Process Raw Objects * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser */ abstract class RawObject { /** * Raw content of the PDF document. - * - * @var string */ - protected $pdfdata = ''; + protected string $pdfdata = ''; /** * Array of PDF objects. * - * @var array + * @var array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>> */ - protected $objects = array(); + protected array $objects = []; + + /** + * Map symbols with corresponding processing methods. + * + * @var array + */ + protected const SYMBOLMETHOD = [ + // \x2F SOLIDUS + '/' => 'Solidus', + // \x28 LEFT PARENTHESIS + '(' => 'Parenthesis', + // \x29 RIGHT PARENTHESIS + ')' => 'Parenthesis', + // \x5B LEFT SQUARE BRACKET + '[' => 'Bracket', + // \x5D RIGHT SQUARE BRACKET + ']' => 'Bracket', + // \x3C LESS-THAN SIGN + '<' => 'Angular', + // \x3E GREATER-THAN SIGN + '>' => 'Angular', + ]; /** * Get object type, raw value and offset to next object * * @param int $offset Object offset. 
* - * @return array Array containing object type, raw value and offset to next object + * @return array{ + * 0: string, + * 1: string|array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * } Array containing: object type, raw value and offset to next object */ - protected function getRawObject($offset = 0) + protected function getRawObject(int $offset = 0): array { - $objtype = ''; // object type to be returned - $objval = ''; // object value to be returned // skip initial white space chars: // \x00 null (NUL) // \x09 horizontal tab (HT) @@ -76,37 +152,38 @@ protected function getRawObject($offset = 0) return $this->getRawObject($offset); } } + + $objtype = ''; + $objval = ''; // map symbols with corresponding processing methods - $map = array( - '/' => 'Solidus', // \x2F SOLIDUS - '(' => 'Parenthesis', // \x28 LEFT PARENTHESIS - ')' => 'Parenthesis', // \x29 RIGHT PARENTHESIS - '[' => 'Bracket', // \x5B LEFT SQUARE BRACKET - ']' => 'Bracket', // \x5D RIGHT SQUARE BRACKET - '<' => 'Angular', // \x3C LESS-THAN SIGN - '>' => 'Angular', // \x3E GREATER-THAN SIGN - ); - if (isset($map[$char])) { - $method = 'process' . $map[$char]; + if (isset(self::SYMBOLMETHOD[$char])) { + $method = 'process' . 
self::SYMBOLMETHOD[$char]; $this->$method($char, $offset, $objtype, $objval); - } else { - if ($this->processDefaultName($offset, $objtype, $objval) === false) { - $this->processDefault($offset, $objtype, $objval); - } + } elseif ($this->processDefaultName($offset, $objtype, $objval) === false) { + $this->processDefault($offset, $objtype, $objval); } - return array($objtype, $objval, $offset); + + return [$objtype, $objval, $offset]; } /** * Process name object * \x2F SOLIDUS * - * @param string $char Symbol to process - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param string $char Symbol to process + * @param int $offset Offset + * @param string $objtype Object type + * @param string|array, + * 2: int, + * }> $objval Object content */ - protected function processSolidus($char, &$offset, &$objtype, &$objval) + protected function processSolidus(string $char, int &$offset, string &$objtype, string|array &$objval): void { $objtype = $char; ++$offset; @@ -126,12 +203,20 @@ protected function processSolidus($char, &$offset, &$objtype, &$objval) * Process literal string object * \x28 LEFT PARENTHESIS and \x29 RIGHT PARENTHESIS * - * @param string $char Symbol to process - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param string $char Symbol to process + * @param int $offset Offset + * @param string $objtype Object type + * @param string|array, + * 2: int, + * }> $objval Object content */ - protected function processParenthesis($char, &$offset, &$objtype, &$objval) + protected function processParenthesis(string $char, int &$offset, string &$objtype, string|array &$objval): void { $objtype = $char; ++$offset; @@ -139,9 +224,10 @@ protected function processParenthesis($char, &$offset, &$objtype, &$objval) if ($char == '(') { $open_bracket = 1; while ($open_bracket > 0) { - if (!isset($this->pdfdata[$strpos])) { + if (! 
isset($this->pdfdata[$strpos])) { break; } + $chr = $this->pdfdata[$strpos]; switch ($chr) { case '\\': @@ -158,8 +244,10 @@ protected function processParenthesis($char, &$offset, &$objtype, &$objval) --$open_bracket; break; } + ++$strpos; } + $objval = substr($this->pdfdata, $offset, ($strpos - $offset - 1)); $offset = $strpos; } @@ -169,25 +257,33 @@ protected function processParenthesis($char, &$offset, &$objtype, &$objval) * Process array content * \x5B LEFT SQUARE BRACKET and \x5D RIGHT SQUARE BRACKET * - * @param string $char Symbol to process - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param string $char Symbol to process + * @param int $offset Offset + * @param string $objtype Object type + * @param array, + * 2: int, + * }> $objval Object content */ - protected function processBracket($char, &$offset, &$objtype, &$objval) + protected function processBracket(string $char, int &$offset, string &$objtype, string|array &$objval): void { // array object $objtype = $char; ++$offset; if ($char == '[') { // get array content - $objval = array(); + $objval = []; do { - // get element $element = $this->getRawObject($offset); $offset = $element[2]; $objval[] = $element; } while ($element[0] != ']'); + // remove closing delimiter array_pop($objval); } @@ -196,26 +292,34 @@ protected function processBracket($char, &$offset, &$objtype, &$objval) /** * Process \x3C LESS-THAN SIGN and \x3E GREATER-THAN SIGN * - * @param string $char Symbol to process - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param string $char Symbol to process + * @param int $offset Offset + * @param string $objtype Object type + * @param string|array, + * 2: int, + * }> $objval Object content */ - protected function processAngular($char, &$offset, &$objtype, &$objval) + protected function processAngular(string $char, int &$offset, string &$objtype, string|array 
&$objval): void { - if (isset($this->pdfdata[($offset + 1)]) && ($this->pdfdata[($offset + 1)] == $char)) { + if (isset($this->pdfdata[($offset + 1)]) && ($this->pdfdata[($offset + 1)] === $char)) { // dictionary object $objtype = $char . $char; $offset += 2; if ($char == '<') { // get array content - $objval = array(); + $objval = []; do { - // get element $element = $this->getRawObject($offset); $offset = $element[2]; $objval[] = $element; } while ($element[0] != '>>'); + // remove closing delimiter array_pop($objval); } @@ -243,13 +347,21 @@ protected function processAngular($char, &$offset, &$objtype, &$objval) /** * Process default * - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param int $offset Offset + * @param string $objtype Object type + * @param string|array, + * 2: int, + * }> $objval Object content * * @return bool True in case of match, flase otherwise */ - protected function processDefaultName(&$offset, &$objtype, &$objval) + protected function processDefaultName(int &$offset, string &$objtype, string|array &$objval): bool { $status = false; if (substr($this->pdfdata, $offset, 6) == 'endobj') { @@ -293,6 +405,7 @@ protected function processDefaultName(&$offset, &$objtype, &$objval) $offset += $matches[1][1]; } } + $status = true; } elseif (substr($this->pdfdata, $offset, 9) == 'endstream') { // end stream object @@ -300,17 +413,26 @@ protected function processDefaultName(&$offset, &$objtype, &$objval) $offset += 9; $status = true; } + return $status; } /** * Process default * - * @param int $offset Offset - * @param string $objtype Object type - * @param string $objval Object content + * @param int $offset Offset + * @param string $objtype Object type + * @param string|array, + * 2: int, + * }> $objval Object content */ - protected function processDefault(&$offset, &$objtype, &$objval) + protected function processDefault(int &$offset, string &$objtype, string|array &$objval): void 
{ if ( preg_match( @@ -322,7 +444,7 @@ protected function processDefault(&$offset, &$objtype, &$objval) // indirect object reference $objtype = 'objref'; $offset += strlen($matches[0]); - $objval = intval($matches[1]) . '_' . intval($matches[2]); + $objval = (int) $matches[1] . '_' . (int) $matches[2]; } elseif ( preg_match( '/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', @@ -332,7 +454,7 @@ protected function processDefault(&$offset, &$objtype, &$objval) ) { // object start $objtype = 'obj'; - $objval = intval($matches[1]) . '_' . intval($matches[2]); + $objval = (int) $matches[1] . '_' . (int) $matches[2]; $offset += strlen($matches[0]); } elseif (($numlen = strspn($this->pdfdata, '+-.0123456789', $offset)) > 0) { // numeric object diff --git a/src/Process/Xref.php b/src/Process/Xref.php index bc8dc72..22c2876 100644 --- a/src/Process/Xref.php +++ b/src/Process/Xref.php @@ -3,13 +3,13 @@ /** * Xref.php * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. 
*/ @@ -23,46 +23,98 @@ * * Process XREF * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser */ abstract class Xref extends \Com\Tecnick\Pdf\Parser\Process\XrefStream { /** * XREF data. * - * @var array + * @var array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } */ - protected $xref = array(); + protected array $xref = [ + 'trailer' => [ + 'encrypt' => '', + 'id' => [], + 'info' => '', + 'root' => '', + 'size' => 0, + ], + 'xref' => [], + ]; /** * Store the processed offsets * - * @var array + * @var array */ - protected $mrkoff = array(); + protected $mrkoff = []; - - abstract protected function getIndirectObject($obj_ref, $offset = 0, $decoding = true); + /** + * Get content of indirect object. + * + * @param string $obj_ref Object number and generation number separated by underscore character. + * @param int $offset Object offset. + * @param bool $decoding If true decode streams. + * + * @return array< int, array{ + * 0: string, + * 1: string, + * 2: int, + * 3?: array{string, array}, + * }> Object data. + */ + abstract protected function getIndirectObject(string $obj_ref, int $offset = 0, bool $decoding = true): array; /** * Get Cross-Reference (xref) table and trailer data from PDF document data. * - * @param int $offset Xref offset (if know). - * @param array $xref Previous xref array (if any). 
+ * @param int $offset Xref offset (if know). + * @param array{ + * 'trailer'?: array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref'?: array, + * } $xref Previous xref array (if any). + * + * @return array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } Xref and trailer data. * - * @return array Xref and trailer data. + * @SuppressWarnings(PHPMD.CyclomaticComplexity) */ - protected function getXrefData($offset = 0, $xref = array()) + protected function getXrefData(int $offset = 0, array $xref = []): array { if (in_array($offset, $this->mrkoff)) { throw new PPException('LOOP: this XRef offset has been already processed'); } + $this->mrkoff[] = $offset; if ($offset == 0) { // find last startxref @@ -75,16 +127,21 @@ protected function getXrefData($offset = 0, $xref = array()) $offset ) == 0 ) { - throw new PPException('Unable to find startxref'); + throw new PPException('Unable to find startxref (1)'); } + $matches = array_pop($matches); - $startxref = $matches[1]; + if ($matches === null) { + throw new PPException('Unable to find startxref (2)'); + } + + $startxref = (int) $matches[1]; } elseif (($pos = strpos($this->pdfdata, 'xref', $offset)) <= ($offset + 4)) { // Already pointing at the xref table - $startxref = $pos; + $startxref = (int) $pos; } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset)) { // Cross-Reference Stream object - $startxref = $offset; + $startxref = (int) $offset; } elseif ( preg_match( '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', @@ -95,10 +152,15 @@ protected function getXrefData($offset = 0, $xref = array()) ) ) { // startxref found - $startxref = $matches[1][0]; + $startxref = (int) $matches[1][0]; } else { - throw new PPException('Unable to find startxref'); + throw new PPException('Unable to find 
startxref (3)'); } + + if (! isset($xref['xref'])) { + $xref['xref'] = []; + } + // check xref position if (strpos($this->pdfdata, 'xref', $startxref) == $startxref) { // Cross-Reference @@ -107,21 +169,41 @@ protected function getXrefData($offset = 0, $xref = array()) // Cross-Reference Stream $xref = $this->decodeXrefStream($startxref, $xref); } - if (empty($xref)) { - throw new PPException('Unable to find xref'); + + if (empty($xref['xref'])) { + throw new PPException('Unable to find xref (4)'); } + return $xref; } /** * Decode the Cross-Reference section * - * @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword). - * @param array $xref Previous xref array (if any). + * @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword). + * @param array{ + * 'trailer'?: array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref Previous xref array (if any). * - * @return array Xref and trailer data. + * @return array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } Xref and trailer data. 
*/ - protected function decodeXref($startxref, $xref = array()) + protected function decodeXref(int $startxref, array $xref): array { $startxref += 4; // 4 is the length of the word 'xref' // skip initial white space chars: @@ -137,7 +219,7 @@ protected function decodeXref($startxref, $xref = array()) // search for cross-reference entries or subsection while ( preg_match( - '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', + '/(\d+)[\x20](\d+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, @@ -148,152 +230,219 @@ protected function decodeXref($startxref, $xref = array()) // we are on another section break; } + $offset += strlen($matches[0][0]); if ($matches[3][0] == 'n') { // create unique object index: [object number]_[generation number] - $index = $obj_num . '_' . intval($matches[2][0]); + $index = $obj_num . '_' . (int) $matches[2][0]; // check if object already exist - if (!isset($xref['xref'][$index])) { + if (! isset($xref['xref'][$index])) { // store object offset position - $xref['xref'][$index] = intval($matches[1][0]); + $xref['xref'][$index] = (int) $matches[1][0]; } + ++$obj_num; } elseif ($matches[3][0] == 'f') { ++$obj_num; } else { // object number (index) - $obj_num = intval($matches[1][0]); + $obj_num = (int) $matches[1][0]; } } + // get trailer data - if (!preg_match('/trailer[\s]*<<(.*)>>/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) { + $trl = preg_match('/trailer[\s]*<<(.*)>>/isU', $this->pdfdata, $trmatches, PREG_OFFSET_CAPTURE, $offset); + if ($trl !== 1) { throw new PPException('Unable to find trailer'); } - return $this->getTrailerData($xref, $matches); + + return $this->getTrailerData($xref, $trmatches); } /** * Decode the Cross-Reference section * - * @param array $xref Previous xref array (if any). 
- * @param array $matches Matches containing traile sections + * @param array{ + * 'trailer'?: array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref Previous xref array (if any). + * @param array|string>> $matches Matches containing trailer sections * - * @return array Xref and trailer data. + * @return array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } Xref and trailer data. */ - protected function getTrailerData($xref, $matches) + protected function getTrailerData(array $xref, array $matches): array { - $trailer_data = $matches[1][0]; - if (!isset($xref['trailer']) || empty($xref['trailer'])) { + $trailer_data = (string) $matches[1][0]; + if (! isset($xref['trailer']) || empty($xref['trailer'])) { // get only the last updated version - $xref['trailer'] = array(); + $xref['trailer'] = [ + 'id' => [], + 'info' => '', + 'root' => '', + 'size' => 0, + ]; + // parse trailer_data if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { - $xref['trailer']['size'] = intval($matches[1]); + $xref['trailer']['size'] = (int) $matches[1]; } + if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { - $xref['trailer']['root'] = intval($matches[1]) . '_' . intval($matches[2]); + $xref['trailer']['root'] = (int) $matches[1] . '_' . (int) $matches[2]; } + if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { - $xref['trailer']['encrypt'] = intval($matches[1]) . '_' . intval($matches[2]); + $xref['trailer']['encrypt'] = (int) $matches[1] . '_' . (int) $matches[2]; } + if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { - $xref['trailer']['info'] = intval($matches[1]) . '_' . intval($matches[2]); + $xref['trailer']['info'] = (int) $matches[1] . '_' . 
(int) $matches[2]; } + if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { - $xref['trailer']['id'] = array(); + $xref['trailer']['id'] = []; $xref['trailer']['id'][0] = $matches[1]; $xref['trailer']['id'][1] = $matches[2]; } } + if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { // get previous xref - $xref = $this->getXrefData(intval($matches[1]), $xref); + return $this->getXrefData((int) $matches[1], $xref); } + return $xref; } /** * Decode the Cross-Reference Stream section * - * @param int $startxref Offset at which the xref section starts. - * @param array $xref Previous xref array (if any). + * @param int $startxref Offset at which the xref section starts. + * @param array{ + * 'trailer'?: array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref Previous xref array (if any). + * + * @return array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } Xref and trailer data. * - * @return array Xref and trailer data. + * @SuppressWarnings(PHPMD.CyclomaticComplexity) */ - protected function decodeXrefStream($startxref, $xref = array()) + protected function decodeXrefStream(int $startxref, array $xref): array { // try to read Cross-Reference Stream $xrefobj = $this->getRawObject($startxref); + if (! is_string($xrefobj[1])) { + throw new PPException('Unable to find xref stream'); + } + $xrefcrs = $this->getIndirectObject($xrefobj[1], $startxref, true); - if (!isset($xref['trailer']) || empty($xref['trailer'])) { + if (! isset($xref['trailer']) || empty($xref['trailer'])) { // get only the last updated version - $xref['trailer'] = array(); + $xref['trailer'] = []; $filltrailer = true; } else { $filltrailer = false; } - if (!isset($xref['xref'])) { - $xref['xref'] = array(); + + if (! 
isset($xref['xref'])) { + $xref['xref'] = []; } + $valid_crs = false; $columns = 0; $sarr = $xrefcrs[0][1]; - if (!is_array($sarr)) { - $sarr = array(); + if (! is_array($sarr)) { + $sarr = []; } - $wbt = array(); + + $wbt = []; $index_first = null; $prevxref = null; $this->processXrefType($sarr, $xref, $wbt, $index_first, $prevxref, $columns, $valid_crs, $filltrailer); // decode data if ($valid_crs && isset($xrefcrs[1][3][0])) { // number of bytes in a row - $rowlen = ($columns + 1); + $rowlen = (int) ($columns + 1); // convert the stream into an array of integers $sdata = unpack('C*', $xrefcrs[1][3][0]); + if ($sdata === false) { + throw new PPException('Unable to unpack xref stream data'); + } + // split the rows - $sdata = array_chunk($sdata, $rowlen); + $sdata = array_chunk($sdata, max(1, $rowlen), false); // initialize decoded array - $ddata = array(); + $ddata = []; // initialize first row with zeros $prev_row = array_fill(0, $rowlen, 0); $this->pngUnpredictor($sdata, $ddata, $columns, $prev_row); // complete decoding - $sdata = array(); + $sdata = []; $this->processDdata($sdata, $ddata, $wbt); - $ddata = array(); + $ddata = []; // fill xref - if ($index_first !== null) { - $obj_num = $index_first; - } else { - $obj_num = 0; - } + $obj_num = $index_first ?? 
0; + $this->processObjIndexes($xref, $obj_num, $sdata); - } // end decoding data - if ($prevxref != null) { + } + + // end decoding data + if ($prevxref !== []) { // get previous xref - $xref = $this->getXrefData($prevxref, $xref); + return $this->getXrefData($prevxref, $xref); } + return $xref; } /** * Process ddata * - * @param array $sdata - * @param array $ddata - * @param array $wbt + * @param array> $sdata + * @param array> $ddata + * @param array $wbt */ - protected function processDdata(&$sdata, $ddata, $wbt) + protected function processDdata(array &$sdata, array $ddata, array $wbt): void { // for every row foreach ($ddata as $key => $row) { // initialize new row - $sdata[$key] = array(0, 0, 0); + $sdata[$key] = [0, 0, 0]; if ($wbt[0] == 0) { // default type field $sdata[$key][0] = 1; } + $idx = 0; // count bytes in the row // for every column for ($col = 0; $col < 3; ++$col) { @@ -302,6 +451,7 @@ protected function processDdata(&$sdata, $ddata, $wbt) if (isset($row[$idx])) { $sdata[$key][$col] += ($row[$idx] << (($wbt[$col] - 1 - $byte) * 8)); } + ++$idx; } } diff --git a/src/Process/XrefStream.php b/src/Process/XrefStream.php index 726bc88..fa7019d 100644 --- a/src/Process/XrefStream.php +++ b/src/Process/XrefStream.php @@ -3,13 +3,13 @@ /** * XrefStream.php * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. 
*/ @@ -23,51 +23,62 @@ * * Process XREF * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser */ abstract class XrefStream extends \Com\Tecnick\Pdf\Parser\Process\RawObject { /** * Process object indexes * - * @param array $xref - * @param int $obj_num - * @param array $sdata + * @param array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref XREF data + * @param int $obj_num Object number + * @param array> $sdata Stream data */ - protected function processObjIndexes(&$xref, &$obj_num, $sdata) + protected function processObjIndexes(array &$xref, int &$obj_num, array $sdata): void { - foreach ($sdata as $row) { - switch ($row[0]) { + foreach ($sdata as $sdatum) { + switch ($sdatum[0]) { case 0: // (f) linked list of free objects break; case 1: // (n) objects that are in use but are not compressed // create unique object index: [object number]_[generation number] - $index = $obj_num . '_' . $row[2]; + $index = $obj_num . '_' . $sdatum[2]; // check if object already exist - if (!isset($xref['xref'][$index])) { + if (! 
isset($xref['xref'][$index])) { // store object offset position - $xref['xref'][$index] = $row[1]; + $xref['xref'][$index] = $sdatum[1]; } + break; case 2: // compressed objects // $row[1] = object number of the object stream in which this object is stored // $row[2] = index of this object within the object stream - $index = $row[1] . '_0_' . $row[2]; + $index = $sdatum[1] . '_0_' . $sdatum[2]; $xref['xref'][$index] = -1; break; default: // null objects break; } + ++$obj_num; } } @@ -75,17 +86,17 @@ protected function processObjIndexes(&$xref, &$obj_num, $sdata) /** * PNG Unpredictor * - * @param array $sdata - * @param array $ddata - * @param int $columns - * @param array $prev_row + * @param array> $sdata Stream data + * @param array> $ddata Decoded data + * @param int $columns Number of columns + * @param array $prev_row Previous row */ - protected function pngUnpredictor($sdata, &$ddata, $columns, $prev_row) + protected function pngUnpredictor(array $sdata, array &$ddata, int $columns, array $prev_row): void { // for each row apply PNG unpredictor foreach ($sdata as $key => $row) { // initialize new row - $ddata[$key] = array(); + $ddata[$key] = []; // get PNG predictor value $predictor = (10 + $row[0]); // for each byte on the row @@ -100,6 +111,7 @@ protected function pngUnpredictor($sdata, &$ddata, $columns, $prev_row) $row_left = $row[($idx - 1)]; $row_upleft = $prev_row[($jdx - 1)]; } + switch ($predictor) { case 10: // PNG prediction (on encoding, PNG None on all rows) @@ -126,6 +138,7 @@ protected function pngUnpredictor($sdata, &$ddata, $columns, $prev_row) throw new PPException('Unknown PNG predictor'); } } + $prev_row = $ddata[$key]; } // end for each row } @@ -133,16 +146,25 @@ protected function pngUnpredictor($sdata, &$ddata, $columns, $prev_row) /** * Return minimum distance for PNG unpredictor * - * @param array $ddata - * @param array $row - * @param int $idx - * @param int $jdx - * @param int $row_left - * @param int $row_up - * @param int 
$row_upleft + * @param array> $ddata Decoded data + * @param int $key Key + * @param array $row Row + * @param int $idx Index + * @param int $jdx Jdx + * @param int $row_left Row left + * @param int $row_up Row up + * @param int $row_upleft Row upleft */ - protected function minDistance(&$ddata, $key, $row, $idx, $jdx, $row_left, $row_up, $row_upleft) - { + protected function minDistance( + array &$ddata, + int $key, + array $row, + int $idx, + int $jdx, + int $row_left, + int $row_up, + int $row_upleft, + ): void { // initial estimate $pos = ($row_left + $row_up - $row_upleft); // distances @@ -166,36 +188,82 @@ protected function minDistance(&$ddata, $key, $row, $idx, $jdx, $row_left, $row_ /** * Process XREF types * - * @param array $sarr - * @param array $xref - * @param array $wbt - * @param int $index_first - * @param int $prevxref - * @param int $columns - * @param bool $valid_crs - * @param bool $filltrailer + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sarr Stream data + * @param array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref XREF data + * @param array $wbt WBT data + * @param int $index_first Index first + * @param int $prevxref Previous XREF + * @param int $columns Number of columns + * @param int $valid_crs Valid CRS + * @param bool $filltrailer Fill trailer + * + * @SuppressWarnings(PHPMD.CyclomaticComplexity) */ protected function processXrefType( - $sarr, - &$xref, - &$wbt, - &$index_first, - &$prevxref, - &$columns, - &$valid_crs, - $filltrailer - ) { + array $sarr, + array &$xref, + array &$wbt, + int &$index_first, + int &$prevxref, + int &$columns, + int &$valid_crs, + bool $filltrailer + ): void { 
foreach ($sarr as $key => $val) { if ($val[0] !== '/') { continue; } + + if (! is_string($val[1])) { + continue; + } + switch ($val[1]) { case 'Type': $valid_crs = (($sarr[($key + 1)][0] == '/') && ($sarr[($key + 1)][1] == 'XRef')); break; case 'Index': // first object number in the subsection - $index_first = intval($sarr[($key + 1)][1][0][1]); + $index_first = (int) $sarr[($key + 1)][1][0][1]; // number of entries in the subsection // $index_entries = intval($sarr[($key + 1)][1][1][1]); break; @@ -204,14 +272,15 @@ protected function processXrefType( break; case 'W': // number of bytes (in the decoded stream) of the corresponding field - $wbt[0] = intval($sarr[($key + 1)][1][0][1]); - $wbt[1] = intval($sarr[($key + 1)][1][1][1]); - $wbt[2] = intval($sarr[($key + 1)][1][2][1]); + $wbt[0] = (int) $sarr[($key + 1)][1][0][1]; + $wbt[1] = (int) $sarr[($key + 1)][1][1][1]; + $wbt[2] = (int) $sarr[($key + 1)][1][2][1]; break; case 'DecodeParms': $this->processXrefDecodeParms($sarr, $key, $columns); break; } + $this->processXrefTypeFt($val[1], $sarr, $key, $xref, $filltrailer); } } @@ -219,58 +288,165 @@ protected function processXrefType( /** * Process XREF type Prev * - * @param array $sarr - * @param int $key - * @param int $prevxref - + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sarr Stream data + * @param int $key Key + * @param int $prevxref Previous XREF */ - protected function processXrefPrev($sarr, $key, &$prevxref) + protected function processXrefPrev(array $sarr, int $key, int &$prevxref): void { if ($sarr[($key + 1)][0] == 'numeric') { // get previous xref offset - $prevxref = intval($sarr[($key + 1)][1]); + $prevxref = (int) $sarr[($key + 1)][1]; } } /** * Process XREF type DecodeParms * - * @param array $sarr - * @param int $key 
- * @param int $columns + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sarr Stream data + * @param int $key Key + * @param int $columns Number of columns */ - protected function processXrefDecodeParms($sarr, $key, &$columns) + protected function processXrefDecodeParms(array $sarr, int $key, int &$columns): void { $decpar = $sarr[($key + 1)][1]; + if (! is_array($decpar)) { + return; + } + foreach ($decpar as $kdc => $vdc) { if (($vdc[0] == '/') && ($vdc[1] == 'Columns') && ($decpar[($kdc + 1)][0] == 'numeric')) { - $columns = intval($decpar[($kdc + 1)][1]); + $columns = (int) $decpar[($kdc + 1)][1]; break; } } + + $columns = max(0, $columns); } /** * Process XREF type * - * @param string $type - * @param array $sarr - * @param array $xref - * @param bool $filltrailer + * @param string $type Type + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sarr Stream data + * @param int $key Key + * @param array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref XREF data + * @param bool $filltrailer Fill trailer */ - protected function processXrefTypeFt($type, $sarr, $key, &$xref, $filltrailer) + protected function processXrefTypeFt(string $type, array $sarr, int $key, array &$xref, bool $filltrailer): void { - if (!$filltrailer) { + if (! 
$filltrailer) { return; } + switch ($type) { case 'Size': if ($sarr[($key + 1)][0] == 'numeric') { $xref['trailer']['size'] = $sarr[($key + 1)][1]; } + break; case 'ID': - $xref['trailer']['id'] = array(); + $xref['trailer']['id'] = []; $xref['trailer']['id'][0] = $sarr[($key + 1)][1][0][1]; $xref['trailer']['id'][1] = $sarr[($key + 1)][1][1][1]; break; @@ -283,15 +459,56 @@ protected function processXrefTypeFt($type, $sarr, $key, &$xref, $filltrailer) /** * Process XREF type Objref * - * @param string $type - * @param array $sarr - * @param array $xref + * @param string $type Type + * @param array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }>, + * 2: int, + * 3?: array{string, array}, + * }> $sarr Stream data + * @param int $key Key + * @param array{ + * 'trailer': array{ + * 'encrypt'?: string, + * 'id': array, + * 'info': string, + * 'root': string, + * 'size': int, + * }, + * 'xref': array, + * } $xref XREF data */ - protected function processXrefObjref($type, $sarr, $key, &$xref) + protected function processXrefObjref(string $type, array $sarr, int $key, array &$xref): void { - if (!isset($sarr[($key + 1)]) || ($sarr[($key + 1)][0] !== 'objref')) { + if (! 
isset($sarr[($key + 1)]) || ($sarr[($key + 1)][0] !== 'objref')) { return; } + switch ($type) { case 'Root': $xref['trailer']['root'] = $sarr[($key + 1)][1]; diff --git a/test/ParserTest.php b/test/ParserTest.php index 81deac9..761b776 100644 --- a/test/ParserTest.php +++ b/test/ParserTest.php @@ -3,52 +3,59 @@ /** * ParserTest.php * - * @since 2011-05-23 - * @category Library - * @package Pdfparser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package Pdfparser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser * * This file is part of tc-lib-pdf-parser software library. */ namespace Test; +use Com\Tecnick\Pdf\Parser\Parser; use PHPUnit\Framework\TestCase; /** * Filter Test * - * @since 2011-05-23 - * @category Library - * @package PdfParser - * @author Nicola Asuni - * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD - * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) - * @link https://github.com/tecnickcom/tc-lib-pdf-parser + * @since 2011-05-23 + * @category Library + * @package PdfParser + * @author Nicola Asuni + * @copyright 2011-2023 Nicola Asuni - Tecnick.com LTD + * @license http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT) + * @link https://github.com/tecnickcom/tc-lib-pdf-parser */ class ParserTest extends TestCase { /** * @dataProvider getParseProvider */ - public function testParse($filename, $hash) + public function testParse(string $filename, string $hash): void { - $cfg = array('ignore_filter_errors' => true); + $cfg = [ + 'ignore_filter_errors' => true, + ]; $rawdata = file_get_contents($filename); - 
$testObj = new \Com\Tecnick\Pdf\Parser\Parser($cfg); - $data = $testObj->parse($rawdata); + $this->assertNotFalse($rawdata); + $parser = new Parser($cfg); + $data = $parser->parse($rawdata); $this->assertEquals($hash, md5(serialize($data))); } - public static function getParseProvider() + /** + * @return array + */ + public static function getParseProvider(): array { - return array( - array('resources/test/example_005.pdf', 'b65259e9c2864e707b10495e64c71363'), - array('resources/test/example_036.pdf', 'f707a4503fba04b79a1c3905af9d4fbc'), - array('resources/test/example_046.pdf', '3b65bf473a50da304cc9549d18bfec73'), - ); + return [ + ['resources/test/example_005.pdf', 'b1c58b8f34df2974a339f8fe2909cf59'], + ['resources/test/example_036.pdf', '78cc03b354588660ccc1ec6453b4fdba'], + ['resources/test/example_046.pdf', 'ba410ddc927da4b636d749b503b96252'], + ]; } }