From efbbb68d4dbe4312c0b232de6f9785880090c2ed Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Tue, 31 Oct 2023 11:53:20 +0100
Subject: [PATCH] Using spaCy `nlp.pipe` now processes texts sentence-wise, just like for `nlp(...)`. (#41)

* `pipe` now does sentence-wise predictions just like __call__

* Update changelog
---
 CHANGELOG.md                     |  1 +
 span_marker/spacy_integration.py | 37 ++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8043356f..0ce6c407 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ Types of changes
 ### Changed
 
 - Changed the error when an empty sentence is provided to the tokenizer.
+- Using spaCy `nlp.pipe` now processes texts sentence-wise, just like for `nlp(...)`.
 
 ### Fixed
 
diff --git a/span_marker/spacy_integration.py b/span_marker/spacy_integration.py
index 8abb7c18..31ba61e5 100644
--- a/span_marker/spacy_integration.py
+++ b/span_marker/spacy_integration.py
@@ -126,22 +126,45 @@ def pipe(self, stream, batch_size=128):
             stream = self.nlp.pipe(stream, batch_size=batch_size)
 
         for docs in minibatch(stream, size=batch_size):
-            inputs = [[token.text if not token.is_space else "" for token in doc] for doc in docs]
+            inputs = [
+                [[token.text if not token.is_space else "" for token in sent] for sent in doc.sents] for doc in docs
+            ]
+            tokens = [tokens for sentences in inputs for tokens in sentences]
+            document_id = [idx for idx, sentences in enumerate(inputs) for _ in sentences]
+            sentence_id = [idx for sentences in inputs for idx in range(len(sentences))]
 
             # use document-level context in the inference if the model was also trained that way
             if self.model.config.trained_with_document_context:
-                inputs = self.convert_inputs_to_dataset(inputs)
+                inputs = Dataset.from_dict(
+                    {
+                        "tokens": tokens,
+                        "document_id": document_id,
+                        "sentence_id": sentence_id,
+                    }
+                )
+            else:
+                inputs = tokens
 
             entities_list = self.model.predict(inputs, batch_size=self.batch_size)
-            for doc, entities in zip(docs, entities_list):
+
+            ents_list = []
+            for idx, entities in enumerate(entities_list):
+                doc_id = document_id[idx]
+                num_prior_sentences = sentence_id[idx]
+                offset = len(sum(tokens[idx - num_prior_sentences : idx], start=[]))
                 ents = []
                 for entity in entities:
-                    start = entity["word_start_index"]
-                    end = entity["word_end_index"]
-                    span = doc[start:end]
+                    start = entity["word_start_index"] + offset
+                    end = entity["word_end_index"] + offset
+                    span = docs[doc_id][start:end]
                     span.label_ = entity["label"]
                     ents.append(span)
+                if doc_id == len(ents_list):
+                    ents_list.append(ents)
+                else:
+                    ents_list[-1].extend(ents)
+            for doc, ents in zip(docs, ents_list):
                 self.set_ents(doc, ents)
-                yield doc
+            yield from docs
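
Usage sketch (not part of the patch): the snippet below shows how the sentence-wise `nlp.pipe` path changed above would be exercised from user code. It assumes the component is registered under the "span_marker" factory name with a "model" config key, and the model identifier is only an example; adjust both to match the installed span_marker version.

    import spacy

    # Load a spaCy pipeline that provides sentence boundaries (the parser here),
    # and drop its built-in NER so only SpanMarker writes doc.ents.
    # The model name below is an example; any SpanMarker model should work.
    nlp = spacy.load("en_core_web_sm", exclude=["ner"])
    nlp.add_pipe("span_marker", config={"model": "tomaarsen/span-marker-roberta-large-ontonotes5"})

    texts = [
        "Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.",
        "Cleopatra VII was the last active ruler of the Ptolemaic Kingdom of Egypt.",
    ]

    # With this patch, nlp.pipe splits each Doc into sentences before prediction,
    # so multi-sentence texts behave the same as calling nlp(text) directly.
    for doc in nlp.pipe(texts, batch_size=2):
        print([(ent.text, ent.label_) for ent in doc.ents])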