From 2ac47d79c45ef53273856474d955054e3ad0b631 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <cschu1981@gmail.com>
Date: Tue, 30 Apr 2024 15:35:17 +0200
Subject: [PATCH] Feature/streamline feature import parsing 20231130 (#48)

* version -> 2.16.1
* added streamlined feature import and composite cog_cat to all importers
---
 gffquant/__init__.py                          |  2 +-
 gffquant/db/importers/annstr_db_importer.py   | 15 +++++++----
 .../db/importers/custom_database_importer.py  | 11 ++++----
 gffquant/db/importers/database_importer.py    | 26 ++++++++++++-------
 .../db/importers/gene_database_importer.py    | 23 ++++++++++------
 .../small_genome_database_importer.py         |  2 +-
 6 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/gffquant/__init__.py b/gffquant/__init__.py
index 498c8973..102fbb30 100644
--- a/gffquant/__init__.py
+++ b/gffquant/__init__.py
@@ -5,7 +5,7 @@
 from enum import Enum, auto, unique
 
 
-__version__ = "2.16.0"
+__version__ = "2.16.1"
 __tool__ = "gffquant"
 
 
diff --git a/gffquant/db/importers/annstr_db_importer.py b/gffquant/db/importers/annstr_db_importer.py
index 5830f3ba..fc80f7a9 100644
--- a/gffquant/db/importers/annstr_db_importer.py
+++ b/gffquant/db/importers/annstr_db_importer.py
@@ -77,16 +77,21 @@ def get_ann_hash(s):
                     logger.info("\tProcessed %s records", self.nseqs)
                 # line = line.decode()
                 line = line.strip().split(self.delimiter)
-                line_d = {
+                columns = {
                     colname: value.strip()
                     for colname, value in zip(header_line + [self.seq_column], line)
                     if colname in category_cols or colname in (self.seq_column, self.seqid_column)
                 }
 
+                # annotation = tuple(
+                #     (category, tuple(set(sorted(features.split(",")))))
+                #     for category, features in line_d.items()
+                #     if features != self.na_char and features and category not in (self.seq_column, self.seqid_column)
+                # )
                 annotation = tuple(
-                    (category, tuple(set(sorted(features.split(",")))))
-                    for category, features in line_d.items()
-                    if features != self.na_char and features and category not in (self.seq_column, self.seqid_column)
+                    (category, features)
+                    for category, features in self.extract_features(columns)
+                    if category not in (self.seq_column, self.seqid_column)
                 )
 
                 ann_str = ";".join(
@@ -100,7 +105,7 @@ def get_ann_hash(s):
                     yield db.AnnotationString(annotation_hash=ann_sfx), annotation
 
                 print(
-                    f">{line_d[self.seqid_column]}.{ann_sfx}", line_d[self.seq_column],
+                    f">{columns[self.seqid_column]}.{ann_sfx}", columns[self.seq_column],
                     sep="\n",
                     file=seq_out
                 )
diff --git a/gffquant/db/importers/custom_database_importer.py b/gffquant/db/importers/custom_database_importer.py
index 184990a9..a7eb3dd8 100644
--- a/gffquant/db/importers/custom_database_importer.py
+++ b/gffquant/db/importers/custom_database_importer.py
@@ -74,11 +74,12 @@ def parse_annotations(self, input_data, input_data2=None):
                     if colname in category_cols
                 }
 
-                annotation = [
-                    (category, tuple(features.split(",")))
-                    for category, features in columns.items()
-                    if features and features != self.na_char
-                ]
+                # annotation = [
+                #     (category, tuple(features.split(",")))
+                #     for category, features in columns.items()
+                #     if features and features != self.na_char
+                # ]
+                annotation = list(self.extract_features(columns))  # materialise: a generator object is always truthy
 
                 if annotation:
                     seq_feature = db.AnnotatedSequence(
diff --git a/gffquant/db/importers/database_importer.py b/gffquant/db/importers/database_importer.py
index 58116fa0..fb6bb49b 100644
--- a/gffquant/db/importers/database_importer.py
+++ b/gffquant/db/importers/database_importer.py
@@ -28,13 +28,17 @@ def __init__(self, db_path=None, db_session=None, na_char="-"):
         self.features = {}
         self.na_char = na_char
 
-    @staticmethod
-    def extract_features(columns):
-        annotation = [
-            (category, tuple(features.split(",")))
-            for category, features in columns.items()
-            if features and features != "-" and category != "COG_category"
-        ]
+    def extract_features(self, columns):
+
+        for category, features in columns.items():
+            if features and features != self.na_char and category != "COG_category":
+                yield category, tuple(sorted(set(features.split(","))))  # dedupe first, then sort for a stable order
+
+        # annotation = [
+        #     (category, tuple(features.split(",")))
+        #     for category, features in columns.items()
+        #     if features and features != "-" and category != "COG_category"
+        # ]
 
         # COG_categories are single letters, but genes can have composite annotations
         # we profile both, the single letters (split composites into individual categories),
@@ -45,10 +49,12 @@ def extract_features(columns):
             if len(cog_category) > 1:
                 # composites need to be passed as 1-tuples,
                 # otherwise downstream ops with iterate over the string!
-                annotation.append(("COG_category_composite", (cog_category,)))
-            annotation.append(("COG_category", tuple(cog_category)))
+                # annotation.append(("COG_category_composite", (cog_category,)))
+                yield "COG_category_composite", (cog_category,)
+            # annotation.append(("COG_category", tuple(cog_category)))
+            yield "COG_category", tuple(cog_category)
 
-        return annotation
+        # return annotation
     
     @staticmethod
     def get_open_function(f):
diff --git a/gffquant/db/importers/gene_database_importer.py b/gffquant/db/importers/gene_database_importer.py
index cba74ace..f670feb8 100644
--- a/gffquant/db/importers/gene_database_importer.py
+++ b/gffquant/db/importers/gene_database_importer.py
@@ -44,16 +44,23 @@ def parse_annotations(self, input_data, input_data2=None):
                 featureid=None,
                 strand=int(strand == "+") if strand is not None else None,
             )
-            annotation = (
-                (category, set(features).difference({"-"}))
+
+            region_ann_d = {
+                category: ",".join(f for f in features if f != self.na_char)  # drop NA placeholders
                 for category, features in region_annotation[1:]
-            )
+            }
+
+            # annotation = (
+            #     (category, set(features).difference({"-"}))
+            #     for category, features in region_annotation[1:]
+            # )
 
-            annotation = [
-                (category, features)
-                for category, features in annotation
-                if features
-            ]
+            # annotation = [
+            #     (category, features)
+            #     for category, features in annotation
+            #     if features
+            # ]
+            annotation = tuple(self.extract_features(region_ann_d))
 
             if annotation:
                 yield seq_feature, annotation
diff --git a/gffquant/db/importers/small_genome_database_importer.py b/gffquant/db/importers/small_genome_database_importer.py
index 5370a744..f89e1bf6 100644
--- a/gffquant/db/importers/small_genome_database_importer.py
+++ b/gffquant/db/importers/small_genome_database_importer.py
@@ -106,7 +106,7 @@ def parse_annotations(self, input_data, input_data2=None):
                 #     if len(cog_category) > 1:
                 #         annotation.append(("COG_category_composite", (cog_category,)))
                 #     annotation.append(("COG_category", tuple(cog_category)))
-                annotation = GqDatabaseImporter.extract_features(columns)
+                annotation = tuple(self.extract_features(columns))
 
                 if annotation:
                     yield seq_feature, annotation