From 2ac47d79c45ef53273856474d955054e3ad0b631 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 30 Apr 2024 15:35:17 +0200 Subject: [PATCH] Feature/streamline feature import parsing 20231130 (#48) * version -> 2.16.1 * added streamlined feature import and composite cog_cat to all importers --- gffquant/__init__.py | 2 +- gffquant/db/importers/annstr_db_importer.py | 15 +++++++---- .../db/importers/custom_database_importer.py | 11 ++++---- gffquant/db/importers/database_importer.py | 26 ++++++++++++------- .../db/importers/gene_database_importer.py | 23 ++++++++++------ .../small_genome_database_importer.py | 2 +- 6 files changed, 49 insertions(+), 30 deletions(-) diff --git a/gffquant/__init__.py b/gffquant/__init__.py index 498c8973..102fbb30 100644 --- a/gffquant/__init__.py +++ b/gffquant/__init__.py @@ -5,7 +5,7 @@ from enum import Enum, auto, unique -__version__ = "2.16.0" +__version__ = "2.16.1" __tool__ = "gffquant" diff --git a/gffquant/db/importers/annstr_db_importer.py b/gffquant/db/importers/annstr_db_importer.py index 5830f3ba..fc80f7a9 100644 --- a/gffquant/db/importers/annstr_db_importer.py +++ b/gffquant/db/importers/annstr_db_importer.py @@ -77,16 +77,21 @@ def get_ann_hash(s): logger.info("\tProcessed %s records", self.nseqs) # line = line.decode() line = line.strip().split(self.delimiter) - line_d = { + columns = { colname: value.strip() for colname, value in zip(header_line + [self.seq_column], line) if colname in category_cols or colname in (self.seq_column, self.seqid_column) } + # annotation = tuple( + # (category, tuple(set(sorted(features.split(","))))) + # for category, features in line_d.items() + # if features != self.na_char and features and category not in (self.seq_column, self.seqid_column) + # ) annotation = tuple( - (category, tuple(set(sorted(features.split(","))))) - for category, features in line_d.items() - if features != self.na_char and features and category not in (self.seq_column, self.seqid_column) + (category, features) + for category, features in self.extract_features(columns) + if category not in (self.seq_column, self.seqid_column) ) ann_str = ";".join( @@ -100,7 +105,7 @@ def get_ann_hash(s): yield db.AnnotationString(annotation_hash=ann_sfx), annotation print( - f">{line_d[self.seqid_column]}.{ann_sfx}", line_d[self.seq_column], + f">{columns[self.seqid_column]}.{ann_sfx}", columns[self.seq_column], sep="\n", file=seq_out ) diff --git a/gffquant/db/importers/custom_database_importer.py b/gffquant/db/importers/custom_database_importer.py index 184990a9..a7eb3dd8 100644 --- a/gffquant/db/importers/custom_database_importer.py +++ b/gffquant/db/importers/custom_database_importer.py @@ -74,11 +74,12 @@ def parse_annotations(self, input_data, input_data2=None): if colname in category_cols } - annotation = [ - (category, tuple(features.split(","))) - for category, features in columns.items() - if features and features != self.na_char - ] + # annotation = [ + # (category, tuple(features.split(","))) + # for category, features in columns.items() + # if features and features != self.na_char + # ] + annotation = self.extract_features(columns) if annotation: seq_feature = db.AnnotatedSequence( diff --git a/gffquant/db/importers/database_importer.py b/gffquant/db/importers/database_importer.py index 58116fa0..fb6bb49b 100644 --- a/gffquant/db/importers/database_importer.py +++ b/gffquant/db/importers/database_importer.py @@ -28,13 +28,17 @@ def __init__(self, db_path=None, db_session=None, na_char="-"): self.features = {} self.na_char = na_char - @staticmethod - def extract_features(columns): - annotation = [ - (category, tuple(features.split(","))) - for category, features in columns.items() - if features and features != "-" and category != "COG_category" - ] + def extract_features(self, columns): + + for category, features in columns.items(): + if features and features != self.na_char and category != "COG_category": + yield category, tuple(set(sorted(features.split(",")))) + + # annotation = [ + # (category, tuple(features.split(","))) + # for category, features in columns.items() + # if features and features != "-" and category != "COG_category" + # ] # COG_categories are single letters, but genes can have composite annotations # we profile both, the single letters (split composites into individual categories), @@ -45,10 +49,12 @@ def extract_features(columns): if len(cog_category) > 1: # composites need to be passed as 1-tuples, # otherwise downstream ops with iterate over the string! - annotation.append(("COG_category_composite", (cog_category,))) - annotation.append(("COG_category", tuple(cog_category))) + # annotation.append(("COG_category_composite", (cog_category,))) + yield "COG_category_composite", (cog_category,) + # annotation.append(("COG_category", tuple(cog_category))) + yield "COG_category", tuple(cog_category) - return annotation + # return annotation @staticmethod def get_open_function(f): diff --git a/gffquant/db/importers/gene_database_importer.py b/gffquant/db/importers/gene_database_importer.py index cba74ace..f670feb8 100644 --- a/gffquant/db/importers/gene_database_importer.py +++ b/gffquant/db/importers/gene_database_importer.py @@ -44,16 +44,23 @@ def parse_annotations(self, input_data, input_data2=None): featureid=None, strand=int(strand == "+") if strand is not None else None, ) - annotation = ( - (category, set(features).difference({"-"})) + + region_ann_d = { + category: ",".join(features) for category, features in region_annotation[1:] - ) + } + + # annotation = ( + # (category, set(features).difference({"-"})) + # for category, features in region_annotation[1:] + # ) - annotation = [ - (category, features) - for category, features in annotation - if features - ] + # annotation = [ + # (category, features) + # for category, features in annotation + # if features + # ] + annotation = tuple(self.extract_features(region_ann_d)) if annotation: yield seq_feature, annotation diff --git a/gffquant/db/importers/small_genome_database_importer.py b/gffquant/db/importers/small_genome_database_importer.py index 5370a744..f89e1bf6 100644 --- a/gffquant/db/importers/small_genome_database_importer.py +++ b/gffquant/db/importers/small_genome_database_importer.py @@ -106,7 +106,7 @@ def parse_annotations(self, input_data, input_data2=None): # if len(cog_category) > 1: # annotation.append(("COG_category_composite", (cog_category,))) # annotation.append(("COG_category", tuple(cog_category))) - annotation = GqDatabaseImporter.extract_features(columns) + annotation = tuple(self.extract_features(columns)) if annotation: yield seq_feature, annotation