Skip to content

Commit

Permalink
Merge pull request #2 from jahtz/dev
Browse files Browse the repository at this point in the history
v2.0: Merge PageXML and Page class
  • Loading branch information
jahtz authored Oct 18, 2024
2 parents 94afbd2 + f86e291 commit d0b8327
Show file tree
Hide file tree
Showing 13 changed files with 533 additions and 660 deletions.
3 changes: 1 addition & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
recursive-include src/pypxml/resources *
recursive-include src/pypxml *.py
recursive-include src/pypxml *
recursive-include src/cli *.py

# Include the README and LICENSE files
Expand Down
93 changes: 7 additions & 86 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,101 +15,22 @@ pip install pypxml
```
pypxml [OPTIONS] COMMAND [ARGS]...
```
Coming in version 2.x

## API
PyPXML provides a feature-rich Python API for working with PageXML files.

### Basics
### Example: Edit existing PageXML
```python
from pypxml import PageXML, Page, Element, XMLType
from pypxml import PageXML, PageType

pxml = PageXML.from_xml('path_to_pagexml.xml')
page1 = pxml.create_page(imageFilename='0001.png',
imageWidth=1000,
imageHeight=2500)
page1.create_element(XMLType.TextRegion, id='ir01')
pxml.to_xml('path_to_output.xml')
```

### PageXML class
```python
from pypxml import PageXML

# open file
pxml = PageXML.from_xml('path_to.xml')
# or create new PageXML
pxml = PageXML.new()

# edit metadata
pxml.creator = 'yourname'
...

# create a page
page = pxml.create_page(imageFilename='0001.png',
imageWidth=1000,
imageHeight='2500')
# or add existing page
pxml.add_page(page) # see below

# iterate over pages
for page in pxml:
...

# delete or modify pages
pxml[0] = ...
pxml.remove_page(pxml[1])

# save object to file
pxml.to_xml('output.xml')
...
```
text_region = pxml.create_element(PageType.TextRegion, type='paragraph', id='tr_001')
text_region.create_element(PageType.Coords, points='1,2 3,4 5,6 ...')

### Page class
```python
from pypxml import Page, XMLType

# create a page
page = Page.new(imageFilename='0001.png',
imageWidth=1000,
imageHeight=2500)

# modify attributes
page['imageFilename'] = '0002.png'
# or get element by index
element = page[3]

# add elements (automatically added to reading order if it is a region)
text_region = page.create_element(XMLType.TextRegion, id='tr1')
# or add existing element
page.add_element(element)

# iterate over regions
for region in page:
...
...
```
for region in pxml.regions:
print(region.type)

### Element class
```python
from pypxml import Element, XMLType

# create an element
coords = Element.new(XMLType.Coords,
points='1,2 3,4 5,6 7,8')
# modify attributes
coords['points'] = 'some other coords'
# or get element by index
baseline = text_region[2]

# check if element is a region
if text_region.is_region():
...

# get coords and baseline, if they exist
coords = text_line.get_coords()
baseline = text_line.get_baseline()
...
pxml.to_xml('path_to_output.xml')
```

## ZPD
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ build-backend = "setuptools.build_meta"
name = "pypxml"
description = "A Python library for parsing, converting and modifying PageXML files."
keywords = ["PageXML", "XML", "OCR", "optical character recognition"]
version = "1.0"
version = "2.0"
readme = "README.md"
license = { file = "LICENSE" }
license = { text = "MIT License" }
authors = [
{ name="Janik Haitz", email="[email protected]" },
]
Expand Down
4 changes: 2 additions & 2 deletions src/cli/pypxml_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

@click.group()
@click.help_option('--help')
@click.version_option('1.0', '--version',
prog_name='PyPXML',
@click.version_option('2.0', '--version',
prog_name='pypxml',
message='%(prog)s v%(version)s - Developed at Centre for Philology and Digitality (ZPD), '
'University of Würzburg')
def cli():
Expand Down
9 changes: 4 additions & 5 deletions src/pypxml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
# Copyright (c) 2024 Janik Haitz
# See the LICENSE file in the root directory for more details.

from .pxml import PageXML
from .page import Page
from .element import Element
from .resources.xml_schema import XMLSchema
from .resources.xml_types import XMLType
from .page_xml import PageXML
from .page_element import PageElement
from .page_types import PageType
from .page_schema import PageSchema
244 changes: 0 additions & 244 deletions src/pypxml/page.py

This file was deleted.

Loading

0 comments on commit d0b8327

Please sign in to comment.