diff --git a/samples/bugs/Issue727.pdf b/samples/bugs/Issue727.pdf new file mode 100644 index 00000000..1d2f89ae Binary files /dev/null and b/samples/bugs/Issue727.pdf differ diff --git a/src/Smalot/PdfParser/RawData/FilterHelper.php b/src/Smalot/PdfParser/RawData/FilterHelper.php index a6f11b30..4bab847e 100644 --- a/src/Smalot/PdfParser/RawData/FilterHelper.php +++ b/src/Smalot/PdfParser/RawData/FilterHelper.php @@ -282,7 +282,7 @@ protected function decodeFilterLZWDecode(string $data): string // convert string to binary string $bitstring = ''; for ($i = 0; $i < $data_length; ++$i) { - $bitstring .= sprintf('%08b', \ord($data[$i])); + $bitstring .= \sprintf('%08b', \ord($data[$i])); } // get the number of bits $data_length = \strlen($bitstring); diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 5e17083a..6441646b 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -214,8 +214,11 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = []) } } if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { - // get previous xref - $xref = $this->getXrefData($pdfData, (int) $matches[1], $xref); + $offset = (int) $matches[1]; + if (0 != $offset) { + // get previous xref + $xref = $this->getXrefData($pdfData, $offset, $xref); + } } } else { throw new \Exception('Unable to find trailer'); diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 7a586932..c0b7cf9f 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -194,4 +194,23 @@ public function testGetXrefDataIssue673(): void self::assertStringContainsString('6 rue des Goutais', $text); } + + /** + * Handle self referencing xref + * + * It seems that some PDF creators output `Prev 0` when there is no previous xref. + * + * @see https://github.com/smalot/pdfparser/pull/727 + */ + public function testDecodeXrefIssue727(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue727.pdf'; + + // Parsing this document would previously cause an infinite loop + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('', $text); + } }