Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve usability #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,10 @@ where:

## Usage

```py
from pubtator_loader import PubTatorCorpusReader
dataset_reader = PubTatorCorpusReader('./sample_pubator_input.txt')
```python
from pubtator_loader import from_path, PubTatorDocument

corpus = dataset_reader.load_corpus()
# corpus will be a List[PubtatorDocuments]
corpus: list[PubTatorDocument] = from_path('./sample_pubator_input.txt')

for doc in corpus:
print(doc)
Expand Down
4 changes: 2 additions & 2 deletions pubtator_loader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .models import PubTatorEntity, PubTatorDocument # noqa
from .pubtator_corpus_reader import PubTatorCorpusReader
from .models import PubTatorDocument, PubTatorEntity # noqa
from .pubtator_corpus_reader import PubTatorCorpusReader, from_gz, from_lines, from_path # noqa
17 changes: 10 additions & 7 deletions pubtator_loader/models/pubtator_document.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from spacy.language import Language
from spacy.training import offsets_to_biluo_tags
from .pubtator_entities import PubTatorEntity
from typing import List
from typing import List, TYPE_CHECKING
import re
import json
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex

if TYPE_CHECKING:
import spacy.language
import spacy.tokenizer

class PubTatorDocument:
def __init__(self, id):
Expand Down Expand Up @@ -83,7 +82,10 @@ def __replace_overlapping_entities(self, span_replacement_fn):

self.entities = processed_entities

def __get_custom_tokenizer(self, nlp: Language) -> Tokenizer:
def __get_custom_tokenizer(self, nlp: 'spacy.language.Language') -> 'spacy.tokenizer.Tokenizer':
from spacy.util import compile_prefix_regex, compile_suffix_regex
from spacy.tokenizer import Tokenizer

infix_re = re.compile(
r'''[!\"\#\$\%\&\'\(\)\*\+\,\-\.\/
\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]'''
Expand All @@ -97,7 +99,7 @@ def __get_custom_tokenizer(self, nlp: Language) -> Tokenizer:
infix_finditer=infix_re.finditer,
token_match=None)

def tokenize_and_convert_to_bilou(self, nlp: Language):
def tokenize_and_convert_to_bilou(self, nlp: 'spacy.language.Language'):
self.replace_overlapping_entities_w_longest()
text = self.get_space_separated_title_and_abstract()
# we need to use a custom tokenizer to avoid the alignment issues
Expand All @@ -113,6 +115,7 @@ def tokenize_and_convert_to_bilou(self, nlp: Language):

results = []
sentences_started = 0
from spacy.training import offsets_to_biluo_tags
for token, semantic_type_id, entity_id in zip(
document,
offsets_to_biluo_tags(document,
Expand Down
24 changes: 21 additions & 3 deletions pubtator_loader/pubtator_corpus_reader.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,36 @@
from enum import Enum
import re
import gzip

from . import PubTatorDocument
from . import PubTatorEntity


def from_gz(path, mode='rt', *, encoding=None):
    """Parse a PubTator corpus from a gzip-compressed file at the given path.

    Args:
        path: Location of the gzip file; handed straight to ``gzip.open``.
        mode: Mode handed to ``gzip.open``. Defaults to ``'rt'`` so that the
            parser is fed ``str`` lines rather than ``bytes``.
        encoding: Text encoding used to decode the file in text mode, e.g.
            ``'utf-8'``. ``None`` (the default) leaves the choice to
            ``gzip.open``, exactly as before this parameter existed. Leave
            it as ``None`` when requesting a binary mode.

    Returns:
        Whatever ``from_lines`` returns for the decompressed lines.
    """
    # Parse while the handle is still open: the lines are pulled from it
    # during the ``from_lines`` call.
    with gzip.open(path, mode=mode, encoding=encoding) as handle:
        return from_lines(handle)

def from_path(path, *, encoding=None):
    """Parse a PubTator corpus from a plain-text file at the given path.

    Args:
        path: Location of the file; handed straight to ``open``.
        encoding: Text encoding used to decode the file, e.g. ``'utf-8'``.
            ``None`` (the default) keeps ``open``'s platform-dependent
            default, exactly as before this parameter existed. Pass an
            explicit value to get the same result on every platform.

    Returns:
        Whatever ``from_lines`` returns for the file's lines.
    """
    # Parse while the handle is still open: the lines are pulled from it
    # during the ``from_lines`` call.
    with open(path, encoding=encoding) as handle:
        return from_lines(handle)


def from_lines(lines):
    """Parse a PubTator corpus from an iterable of lines.

    A new ``PubTatorCorpusReader`` is created on every call (no file path
    is given to it) and the lines are handed to its ``parse_lines`` method.

    Args:
        lines: Iterable yielding the corpus content one line at a time,
            such as an open text file or a list of strings.

    Returns:
        The result of ``PubTatorCorpusReader.parse_lines`` for ``lines``.
    """
    return PubTatorCorpusReader().parse_lines(lines)


class PubTatorCorpusReader:
class LineType(Enum):
TITLE = 'TITLE'
ABSTRACT = 'ABSTRACT'
MENTION = 'MENTION'
DOC_SEP = 'DOCUMENT SEPARATOR'

def __init__(self, file_path):
def __init__(self, file_path=None):
self.file_path = file_path
self.__document_being_read = None
self.corpus = []
Expand All @@ -37,9 +55,9 @@ def __init__(self, file_path):
def load_corpus(self):
with open(self.file_path, 'r') as file:
lines = file.readlines()
return self.__parse_lines(lines)
return self.parse_lines(lines)

def __parse_lines(self, content_lines):
def parse_lines(self, content_lines):
prev_line_type = None
for line_number, line in enumerate(content_lines):
try:
Expand Down