Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
cschu committed Apr 30, 2024
2 parents 7a394de + 2ac47d7 commit 0f81966
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 30 deletions.
2 changes: 1 addition & 1 deletion gffquant/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from enum import Enum, auto, unique


__version__ = "2.16.0"
__version__ = "2.16.1"
__tool__ = "gffquant"


Expand Down
15 changes: 10 additions & 5 deletions gffquant/db/importers/annstr_db_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,21 @@ def get_ann_hash(s):
logger.info("\tProcessed %s records", self.nseqs)
# line = line.decode()
line = line.strip().split(self.delimiter)
line_d = {
columns = {
colname: value.strip()
for colname, value in zip(header_line + [self.seq_column], line)
if colname in category_cols or colname in (self.seq_column, self.seqid_column)
}

# annotation = tuple(
# (category, tuple(set(sorted(features.split(",")))))
# for category, features in line_d.items()
# if features != self.na_char and features and category not in (self.seq_column, self.seqid_column)
# )
annotation = tuple(
(category, tuple(set(sorted(features.split(",")))))
for category, features in line_d.items()
if features != self.na_char and features and category not in (self.seq_column, self.seqid_column)
(category, features)
for category, features in self.extract_features(columns)
if category not in (self.seq_column, self.seqid_column)
)

ann_str = ";".join(
Expand All @@ -100,7 +105,7 @@ def get_ann_hash(s):
yield db.AnnotationString(annotation_hash=ann_sfx), annotation

print(
f">{line_d[self.seqid_column]}.{ann_sfx}", line_d[self.seq_column],
f">{columns[self.seqid_column]}.{ann_sfx}", columns[self.seq_column],
sep="\n",
file=seq_out
)
Expand Down
11 changes: 6 additions & 5 deletions gffquant/db/importers/custom_database_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,12 @@ def parse_annotations(self, input_data, input_data2=None):
if colname in category_cols
}

annotation = [
(category, tuple(features.split(",")))
for category, features in columns.items()
if features and features != self.na_char
]
# annotation = [
# (category, tuple(features.split(",")))
# for category, features in columns.items()
# if features and features != self.na_char
# ]
annotation = self.extract_features(columns)

if annotation:
seq_feature = db.AnnotatedSequence(
Expand Down
26 changes: 16 additions & 10 deletions gffquant/db/importers/database_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ def __init__(self, db_path=None, db_session=None, na_char="-"):
self.features = {}
self.na_char = na_char

@staticmethod
def extract_features(columns):
annotation = [
(category, tuple(features.split(",")))
for category, features in columns.items()
if features and features != "-" and category != "COG_category"
]
def extract_features(self, columns):

for category, features in columns.items():
if features and features != self.na_char and category != "COG_category":
yield category, tuple(set(sorted(features.split(","))))

# annotation = [
# (category, tuple(features.split(",")))
# for category, features in columns.items()
# if features and features != "-" and category != "COG_category"
# ]

# COG_categories are single letters, but genes can have composite annotations
# we profile both, the single letters (split composites into individual categories),
Expand All @@ -45,10 +49,12 @@ def extract_features(columns):
if len(cog_category) > 1:
# composites need to be passed as 1-tuples,
# otherwise downstream ops will iterate over the string!
annotation.append(("COG_category_composite", (cog_category,)))
annotation.append(("COG_category", tuple(cog_category)))
# annotation.append(("COG_category_composite", (cog_category,)))
yield "COG_category_composite", (cog_category,)
# annotation.append(("COG_category", tuple(cog_category)))
yield "COG_category", tuple(cog_category)

return annotation
# return annotation

@staticmethod
def get_open_function(f):
Expand Down
23 changes: 15 additions & 8 deletions gffquant/db/importers/gene_database_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,23 @@ def parse_annotations(self, input_data, input_data2=None):
featureid=None,
strand=int(strand == "+") if strand is not None else None,
)
annotation = (
(category, set(features).difference({"-"}))

region_ann_d = {
category: ",".join(features)
for category, features in region_annotation[1:]
)
}

# annotation = (
# (category, set(features).difference({"-"}))
# for category, features in region_annotation[1:]
# )

annotation = [
(category, features)
for category, features in annotation
if features
]
# annotation = [
# (category, features)
# for category, features in annotation
# if features
# ]
annotation = tuple(self.extract_features(region_ann_d))

if annotation:
yield seq_feature, annotation
2 changes: 1 addition & 1 deletion gffquant/db/importers/small_genome_database_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def parse_annotations(self, input_data, input_data2=None):
# if len(cog_category) > 1:
# annotation.append(("COG_category_composite", (cog_category,)))
# annotation.append(("COG_category", tuple(cog_category)))
annotation = GqDatabaseImporter.extract_features(columns)
annotation = tuple(self.extract_features(columns))

if annotation:
yield seq_feature, annotation

0 comments on commit 0f81966

Please sign in to comment.