Skip to content

Commit

Permalink
Update seqid_by_taxa.py to read data report as jsonlines (format chan…
Browse files Browse the repository at this point in the history
…ged from yaml)
  • Loading branch information
Bob Falk committed Nov 6, 2020
1 parent 0571dc0 commit 20a78df
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 18 deletions.
16 changes: 7 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
pandas
jsonlines
numpy
matplotlib
biopython
ncbi-datasets-pylib

virtualenv -p python3 jup
source ./jup/bin/activate.csh
pip install jupyter
pip install jupyterlab
pip install ipywidgets
pip install ipykernel
pip install jupyter_contrib_nbextensions
pip install jupyter_nbextensions_configurator
jupyter
jupyterlab
ipywidgets
ipykernel
jupyter_contrib_nbextensions
jupyter_nbextensions_configurator
22 changes: 13 additions & 9 deletions scripts/seqids_by_taxa.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import logging
import os
import jsonlines
from tax_tree import TaxTree
from report_reader import DatasetsReportReader

Expand Down Expand Up @@ -50,11 +51,11 @@ def main():
# Go over all transcripts and assign the seqids (based on their taxid) to the group
# to which they belong, or the UNASSIGNED group
count = 0
for gene in gene_data_report.genes:
taxgroup = taxmap.get(str(gene.tax_id), "")
for protein in gene.proteins:
for gene in gene_data_report:
taxgroup = taxmap.get(str(gene['taxId']), "")
for protein in gene['proteins']:
if taxgroup:
tax_to_seqid[taxgroup].append(protein.accession_version)
tax_to_seqid[taxgroup].append(protein['accessionVersion'])

# write out the seqids with their corresponding tax group
with open(args.output, "w") as f:
Expand All @@ -67,19 +68,22 @@ def main():

def get_data_report(bdbag):
report_reader = DatasetsReportReader()
report_file = os.path.join(bdbag, "data/data_report.yaml")
report_file = os.path.join(bdbag, "data/data_report.jsonl")
if not os.path.isfile(report_file):
logger.error(f'Error opening report file. File not found: {report_file}')
return 1

return report_reader.gene_report_from_file(report_file)
genes = []
with jsonlines.open(report_file) as report_reader:
genes = [gene for gene in report_reader]
return genes


def add_missing_taxids(taxtree, gene_data_report, email):
missing_taxa = set()
for gene in gene_data_report.genes:
if not taxtree.get_org_if_exists(str(gene.tax_id)):
missing_taxa.add(str(gene.tax_id))
for gene in gene_data_report:
if not taxtree.get_org_if_exists(str(gene['taxId'])):
missing_taxa.add(str(gene['taxId']))
uids = ",".join(missing_taxa)
if len(uids):
taxtree.add_entrez_taxa(uids, email)
Expand Down

0 comments on commit 20a78df

Please sign in to comment.