From 921c47b2cd31a52d74e7dd42a4cdbf2e756e2209 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 24 Oct 2023 13:37:24 +0200 Subject: [PATCH 1/3] Reformat code using black + zimports. Fix file endings. Remove execute bit. --- LICENSE | 0 README.md | 82 +- lib/EDNAFULL | 0 lib/Lineages_UPDATER.py | 113 +- lib/__init__.py | 0 lib/db.sqlite | 0 lib/doctest_b117.fna | 0 lib/doctest_b117.pickle | Bin lib/migrate/4.sql | 2 +- lib/ref.fna | 0 lib/ref.gff3 | 0 lib/sonardb.py | 7491 ++++++++++++++++++++------------------ lib/sonartoVCF.py | 315 +- lib/sonartoVCF_v2.bak.py | 378 +- lib/sonartoVCF_v2.py | 623 ++-- logo.png | Bin sonar.env.yml | 0 sonar.py | 1762 ++++++--- 18 files changed, 6017 insertions(+), 4749 deletions(-) mode change 100755 => 100644 LICENSE mode change 100755 => 100644 README.md mode change 100755 => 100644 lib/EDNAFULL mode change 100755 => 100644 lib/Lineages_UPDATER.py mode change 100755 => 100644 lib/__init__.py mode change 100755 => 100644 lib/db.sqlite mode change 100755 => 100644 lib/doctest_b117.fna mode change 100755 => 100644 lib/doctest_b117.pickle mode change 100755 => 100644 lib/migrate/4.sql mode change 100755 => 100644 lib/ref.fna mode change 100755 => 100644 lib/ref.gff3 mode change 100755 => 100644 lib/sonardb.py mode change 100755 => 100644 lib/sonartoVCF.py mode change 100755 => 100644 lib/sonartoVCF_v2.bak.py mode change 100755 => 100644 lib/sonartoVCF_v2.py mode change 100755 => 100644 logo.png mode change 100755 => 100644 sonar.env.yml diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 index da56ab3..a90a7e9 --- a/README.md +++ b/README.md @@ -25,11 +25,11 @@ covSonar has some software-environmental requirements that can most easily be me ## 2. Setup Proceed as follows to install covSonar: ```sh -# download the repository to the current working directory using git +# download the repository to the current working directory using git git clone https://github.com/rki-mf1/covsonar.git # build the custom software environment using conda [recommended] conda env create -n sonar -f covsonar/sonar.env.yml -# activate the conda evironment if built +# activate the conda evironment if built conda activate sonar # testing ./covsonar/test.sh @@ -38,13 +38,13 @@ conda activate sonar ## 3. Usage -In covSonar there are several tools that can be called via subcommands. +In covSonar there are several tools that can be called via subcommands. | subcommand | purpose | |------------|---------------------------------------------------------------------| | add | to add genome sequences to the database | | update | to import and replace meta information | -| match | to query genome sequences sharing a defined profile | +| match | to query genome sequences sharing a defined profile | | restore | to restore genome sequence(s) from the database | | info | show detailed informations about the used sonarversion and database | | optimize | optimize the given database | @@ -57,13 +57,13 @@ Each tool provides a help page that can be accessed with the `-h` option. # activating conda environment if built and not active yet (see section 2) conda activate sonar # display help page for adding genomes -path/to/covsonar/sonar.py add -h +path/to/covsonar/sonar.py add -h ``` ### 3.1 Adding genomes to the database -Genome sequences of SARS-COV-2 can be added to the database in the form of FASTA files. Intermediate data is stored in a cache directory, which is temporary by default and deleted after import. 
The SQLite database is stored in a single file. If the defined database file does not exist, a new database is created. +Genome sequences of SARS-COV-2 can be added to the database in the form of FASTA files. Intermediate data is stored in a cache directory, which is temporary by default and deleted after import. The SQLite database is stored in a single file. If the defined database file does not exist, a new database is created. The import process can be divided into three stages: 1. caching of the sequences to be imported and calculation of sequence hashes. @@ -85,7 +85,7 @@ conda activate sonar # adding all sequences from 'genomes.fasta' to database 'mydb' # using eight cpus (the database file will be created if it does not exist) path/to/covsonar/sonar.py add -f genomes.fasta --db mydb --cpus 8 -# as before, but using a permanent cache directory to store +# as before, but using a permanent cache directory to store # intermediate files path/to/covsonar/sonar.py add -f genomes.fasta --db mydb --cpus 8 --cache mycache ``` @@ -98,25 +98,25 @@ Additional meta-information can be added for each genome sequence, namely lab, d | expression | description | |------------------------------|---------------------------------------------------------------------------------| | accession=_colname1_ | genome accessions are listed in column _colname1_ | -| lineage=_colname2_ | lineage information is listed in column _colname2_ | +| lineage=_colname2_ | lineage information is listed in column _colname2_ | | zip=_colname3_ | zip codes are listed in column _colname3_ | | date=_colname4_ | sampling dates are listed in column _colname4_ (needed date format: YYYY-MM-DD) | -| lab=_colname5_ | lab information is listed in column _colname5_ | -| source=_colname6_ | data source is listed in column _colname6_ | -| collection=_colname7_ | data collection is listed in column _colname7_ | -| technology=_colname8_ | used sequencing technology is listed in column _colname8_ | -| platform=_colname9_ | used sequencing platform is listed in column _colname9_ | -| chemistry=_colname10_ | used sequencing chemistry is listed in column _colname10_ | -| software=_colname11_ | software used for genome reconstruction is listed in column _colname11_ | -| software_version=_colname12_ | software version used for genome reconstruction is listed in column _colname12_ | -| material=_colname13_ | sampling material is listed in column _colname13_ | -| ct=_colname14_ | ct values are listed in column _colname14_ | +| lab=_colname5_ | lab information is listed in column _colname5_ | +| source=_colname6_ | data source is listed in column _colname6_ | +| collection=_colname7_ | data collection is listed in column _colname7_ | +| technology=_colname8_ | used sequencing technology is listed in column _colname8_ | +| platform=_colname9_ | used sequencing platform is listed in column _colname9_ | +| chemistry=_colname10_ | used sequencing chemistry is listed in column _colname10_ | +| software=_colname11_ | software used for genome reconstruction is listed in column _colname11_ | +| software_version=_colname12_ | software version used for genome reconstruction is listed in column _colname12_ | +| material=_colname13_ | sampling material is listed in column _colname13_ | +| ct=_colname14_ | ct values are listed in column _colname14_ | ```sh # activating conda environment if built and not active yet (see section 2) conda activate sonar -# importing lineage information from pangolin output file +# importing lineage information from pangolin output 
file # to database 'mydb' path/to/covsonar/sonar.py update --pangolin pangolin.csv --db mydb # importing zip codes and sampling dates from a custom CSV file @@ -124,29 +124,29 @@ path/to/covsonar/sonar.py update --pangolin pangolin.csv --db mydb path/to/covsonar/sonar.py update --csv custom.csv --fields accession=acc zip=zip_codes date=sampling --db mydb ``` -*More add-on feild to support custom scenario +*More add-on feild to support custom scenario | expression | description | |------------------------------|------------------------------------------------------------------------------------------------------------| -| submission_date=_colname_ | This one can be used when the sample is submitted for processing or prcoessing date after sampling date. | +| submission_date=_colname_ | This one can be used when the sample is submitted for processing or prcoessing date after sampling date. | -### 3.3 Query genome sequences based on profiles +### 3.3 Query genome sequences based on profiles Genomic profiles can be defined to align genomes. For this purpose, the variants related to the complete genome of the SARS-CoV-2 isolate Wuhan-Hu-1 (NC_045512.2) must be expressed as follows: | typ | nucleotide level | amino acid level | |-----------|-------------------------------------------------------------------|-------------------------------| | SNP | ref_nuc _followed by_ ref_pos _followed by_ alt_nuc (e.g. A3451T) | protein_symbol:ref_aa _followed by_ ref_pos _followed by_ alt_aa (e.g. S:N501Y) | -| deletion | del:ref_pos:length_in_bp (e.g. del:3001:8) | protein_symbol:del:ref_pos:length_in_aa (e.g. ORF1ab:del:3001:21) | -| insertion | ref_nuc _followed by_ ref_pos _followed by_ alt_nucs (e.g. A3451TGAT) | protein_symbol:ref_aa _followed by_ ref_pos _followed by_ alt_aas (e.g. N:A34AK) | +| deletion | del:ref_pos:length_in_bp (e.g. del:3001:8) | protein_symbol:del:ref_pos:length_in_aa (e.g. ORF1ab:del:3001:21) | +| insertion | ref_nuc _followed by_ ref_pos _followed by_ alt_nucs (e.g. A3451TGAT) | protein_symbol:ref_aa _followed by_ ref_pos _followed by_ alt_aas (e.g. N:A34AK) | -The positions refer to the reference (first nucleotide in the genome is position 1). Using the option `-i` multiple variant definitions can be combined into a nucleotide, amino acid or mixed profile, which means that matching genomes must have all those variations in common. In contrast, alternative variations can be defined by multiple `-i` options. As an example, `-i S:N501Y S:E484K` matches genomes sharing the _Nelly_ **AND** _Erik_ variation while `-i S:N501Y -i S:E484K` matches to genomes that share either the _Nelly_ **OR** _Erik_ variation **OR** both. Accordingly, using the option `-e` profiles can be defined that have not to be present in the matched genomes. +The positions refer to the reference (first nucleotide in the genome is position 1). Using the option `-i` multiple variant definitions can be combined into a nucleotide, amino acid or mixed profile, which means that matching genomes must have all those variations in common. In contrast, alternative variations can be defined by multiple `-i` options. As an example, `-i S:N501Y S:E484K` matches genomes sharing the _Nelly_ **AND** _Erik_ variation while `-i S:N501Y -i S:E484K` matches to genomes that share either the _Nelly_ **OR** _Erik_ variation **OR** both. Accordingly, using the option `-e` profiles can be defined that have not to be present in the matched genomes. -To filter genomes based on metadata specific options can be used (see table below). 
Only genomes linked to the respective metadata are then considered. Metadata values are negated when introduced by ^ (e.g. `--acc ^ID1` matches all genomes accessions but ID1). Metadata filtering is case-insensitive. To see the amount of available metadata in your database use the info tool (see section 3.5). +To filter genomes based on metadata specific options can be used (see table below). Only genomes linked to the respective metadata are then considered. Metadata values are negated when introduced by ^ (e.g. `--acc ^ID1` matches all genomes accessions but ID1). Metadata filtering is case-insensitive. To see the amount of available metadata in your database use the info tool (see section 3.5). | option | value(s) | note | -|---------------------|-----------------------------------------------------------------------|------| +|---------------------|-----------------------------------------------------------------------|------| | --acc | one or more genome accessions (e.g. NC_045512.2) | | | --lineage | one or more pangolin lineages (e.g. B.1.1.7) | | | --zip | one or more zip codes (e.g. 10627) | zip codes are dynamically extended to the right side, e.g. 033 matches to all zip codes starting with 033| @@ -163,7 +163,7 @@ To filter genomes based on metadata specific options can be used (see table belo | --material | one or more sample materials (e.g. 'nasal swap') | | | --min_ct | minimal ct value (e.g. 20) | | | --max_ct | maximal ct value (e.g. 20) | | - + There are additional options to adjust the matching. @@ -180,7 +180,7 @@ By default, genome matching produces a comma-separated output (csv). Using the o ```sh # activating conda environment if built and not active yet (see section 2) conda activate sonar -# matching B.1.1.7 genomes in DB 'mydb' that share an additional "Erik" mutation +# matching B.1.1.7 genomes in DB 'mydb' that share an additional "Erik" mutation path/to/covsonar/sonar.py match -i S:E484K --lineage B.1.1.7 --db mydb # as before but matching genomes are counted only path/to/covsonar/sonar.py match -i S:E484K --lineage B.1.1.7 --count --db mydb @@ -242,11 +242,11 @@ The restored sequences are combined with their original FASTA header and shown ```sh # activating conda environment if built and not active yet (see section 2) conda activate sonar -# Restore genome sequences linked to accessions 'mygenome1' and 'mygenome2' from the +# Restore genome sequences linked to accessions 'mygenome1' and 'mygenome2' from the # database 'mydb' and write these to a fasta file named 'restored.fasta' path/to/covsonar/sonar.py restore --acc mygenome1 mygenome2 --db mydb > restored.fasta # as before, but consider all accessions from 'accessions.txt' (the file has to -# contain one accession per line) +# contain one accession per line) path/to/covsonar/sonar.py restore --file accessions.txt --db mydb > restored.fasta ``` @@ -289,11 +289,11 @@ path/to/covsonar/sonar.py var2vcf --db mydb -f acc.10.txt -o merge.vcf # To speed up the query, we can use --cpus tag to aid a query performance. path/to/covsonar/sonar.py var2vcf --db mydb --date 2021-08-01:2021-08-10 -o merge.vcf --cpus 20 -# Another solution, we can use --betaV2 tag (x3-5 times faster), +# Another solution, we can use --betaV2 tag (x3-5 times faster), # The current version is under development, so if you found any bug please report it to us. 
path/to/covsonar/sonar.py var2vcf --db mydb --date 2021-08-01:2021-08-10 -o merge.vcf --cpus 20 --betaV2 ``` -:warning:**Note:** The current performance of this feature still does not perform well (e.g., memory usage and runtime) when trying to export many accessions. However, we are constantly working on improving the performance. :monkey: +:warning:**Note:** The current performance of this feature still does not perform well (e.g., memory usage and runtime) when trying to export many accessions. However, we are constantly working on improving the performance. :monkey: ## 4 How to contribute @@ -306,9 +306,9 @@ Your feedback is very welcome! **Q:** How can I screen for genomes with any SNP or amino acid substitution at a specific position at the genome or gene product? -**A:** covSonar accepts and interpretes the IUPAC nucleotide and amino acid code. Accordingly, you can screen for any nucleotide or amino acid at a certain position using N and X, respectively. Since covSonar stores only sites different from the reference, the reference nucleotide will be not considered when searching for N or X. As an example, use the following command to screen for all genome encoding for any amino acid substitution at position 484 within the Spike protein (reference allele is E at this position). +**A:** covSonar accepts and interpretes the IUPAC nucleotide and amino acid code. Accordingly, you can screen for any nucleotide or amino acid at a certain position using N and X, respectively. Since covSonar stores only sites different from the reference, the reference nucleotide will be not considered when searching for N or X. As an example, use the following command to screen for all genome encoding for any amino acid substitution at position 484 within the Spike protein (reference allele is E at this position). -```bash +```bash # screen for all genomes encoding for any amino acid substitution at position 484 within the Spike protein # using the database 'mydb" path/to/sonar.py -i S:E484X --db mydb @@ -320,8 +320,8 @@ path/to/sonar.py -i S:E484X --db mydb **A:** This happens, when the sqlite temprory directory that might be located on another disk has not enough space to store the intermediate files. You can easily change the temporary directory e.g. to the current working directory using the following Shell command (the changes will only apply to the current Shell session): -```bash -# changing the sqlite temporary directory to the current working directory +```bash +# changing the sqlite temporary directory to the current working directory # replace . by the path to the location you want to use export SQLITE_TMPDIR="." ``` @@ -337,9 +337,9 @@ Please run 'sonar.py db-upgrade' to upgrade database **A:** This happens, when you use the newest version of covSonar with old database version. -Please use our database upgrade assistant to solve the problem. -```bash -# RUN +Please use our database upgrade assistant to solve the problem. +```bash +# RUN python sonar.py db-upgrade --db mydb.db # Output @@ -354,5 +354,3 @@ Success: Database upgrade was successfully completed ``` :warning: Warning: Backup the db file before upgrade. 
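
The variant notation documented in section 3.3 is regular enough to be parsed mechanically. The sketch below is purely illustrative (it is not covSonar's internal parser; the function name and output fields are invented for this example) and only reflects the notation table given above:

```python
#!/usr/bin/env python3
# Illustrative parser for the covSonar profile notation described in section 3.3.
# Not covSonar's internal code; names and output fields are invented for this sketch.
# Requires Python 3.8+ (uses the walrus operator).
import re

NT_CHANGE = re.compile(r"^([A-Z])(\d+)([A-Z]+)$")                    # A3451T (SNP) or A3451TGAT (insertion)
NT_DELETION = re.compile(r"^del:(\d+):(\d+)$")                       # del:3001:8
AA_CHANGE = re.compile(r"^([A-Za-z0-9]+):([A-Z*])(\d+)([A-Z*]+)$")   # S:N501Y (substitution) or N:A34AK (insertion)
AA_DELETION = re.compile(r"^([A-Za-z0-9]+):del:(\d+):(\d+)$")        # ORF1ab:del:3001:21


def parse_variant(label):
    """Split a single profile element into its level, type, position and alleles."""
    if m := NT_DELETION.match(label):
        return {"level": "nt", "type": "del", "pos": int(m.group(1)), "length": int(m.group(2))}
    if m := NT_CHANGE.match(label):
        kind = "snp" if len(m.group(3)) == 1 else "ins"
        return {"level": "nt", "type": kind, "ref": m.group(1), "pos": int(m.group(2)), "alt": m.group(3)}
    if m := AA_DELETION.match(label):
        return {"level": "aa", "protein": m.group(1), "type": "del", "pos": int(m.group(2)), "length": int(m.group(3))}
    if m := AA_CHANGE.match(label):
        kind = "sub" if len(m.group(4)) == 1 else "ins"
        return {"level": "aa", "protein": m.group(1), "type": kind, "ref": m.group(2), "pos": int(m.group(3)), "alt": m.group(4)}
    raise ValueError("unrecognised variant label: " + label)


if __name__ == "__main__":
    for label in ("A3451T", "del:3001:8", "S:N501Y", "ORF1ab:del:3001:21", "N:A34AK"):
        print(label, "->", parse_variant(label))
```

As stated in section 3.3, the positions in these labels refer to the Wuhan-Hu-1 reference (NC_045512.2) and are 1-based.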
- - diff --git a/lib/EDNAFULL b/lib/EDNAFULL old mode 100755 new mode 100644 diff --git a/lib/Lineages_UPDATER.py b/lib/Lineages_UPDATER.py old mode 100755 new mode 100644 index 7e6c4e1..9055952 --- a/lib/Lineages_UPDATER.py +++ b/lib/Lineages_UPDATER.py @@ -1,15 +1,14 @@ #!/usr/bin/python # Maintainer: KongkitimanonK -# The method originally came from +# The method originally came from # https://github.com/cov-lineages/pango-designation. # We just adapt and change some parts to be used in covsonar, vocal etc. -import os -import pandas as pd -from tempfile import mkstemp, mkdtemp import json +import os + +import pandas as pd import requests -import warnings # warnings.simplefilter(action='ignore', category=FutureWarning) # Due to the https://github.com/cov-lineages/pango-designation/issues/853 @@ -18,7 +17,7 @@ # from pango_aliasor.aliasor import Aliasor # except ModuleNotFoundError as e: # print("Dependency `pango_aliasor` missing, please install using `pip install pango_aliasor`") -# raise e +# raise e # In covSonar 1 we will copy the class from https://github.com/corneliusroemer/pango_aliasor/blob/main/src/pango_aliasor/aliasor.py # but for V.2 we will switch using the package to "pip install pango_aliasor" @@ -29,7 +28,9 @@ def __init__(self, alias_file=None): if alias_file is None: import importlib.resources - with importlib.resources.open_text("pango_designation", "alias_key.json") as file: + with importlib.resources.open_text( + "pango_designation", "alias_key.json" + ) as file: file = json.load(file) else: @@ -45,21 +46,21 @@ def __init__(self, alias_file=None): self.realias_dict = {v: k for k, v in self.alias_dict.items()} - def compress(self,name): - name_split = name.split('.') + def compress(self, name): + name_split = name.split(".") levels = len(name_split) - 1 - num_indirections = (levels -1) // 3 + num_indirections = (levels - 1) // 3 if num_indirections <= 0: return name - alias = ".".join(name_split[0:(3*num_indirections + 1)]) - ending = ".".join(name_split[(3*num_indirections + 1):]) - return self.realias_dict[alias] + '.' + ending + alias = ".".join(name_split[0 : (3 * num_indirections + 1)]) + ending = ".".join(name_split[(3 * num_indirections + 1) :]) + return self.realias_dict[alias] + "." + ending - def uncompress(self,name): + def uncompress(self, name): # A function that prints the output to the screen. - if(pd.isna(name)): + if pd.isna(name): return "" - name_split = name.split('.') + name_split = name.split(".") letter = name_split[0] try: unaliased = self.alias_dict[letter] @@ -68,49 +69,52 @@ def uncompress(self,name): if len(name_split) == 1: return name if len(name_split) == 2: - return unaliased + '.' + name_split[1] + return unaliased + "." + name_split[1] else: - return unaliased + '.' + ".".join(name_split[1:]) + return unaliased + "." 
+ ".".join(name_split[1:]) + def lts(lineage): items = [] for item in lineage.split("."): item_string = str(item) - items.append((5-len(item))*"0" + item_string) + items.append((5 - len(item)) * "0" + item_string) return "".join(items) + def download_source(tmp_dir): - lineages_url='https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv' - alias_key_url='https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json' - lineag=os.path.join(tmp_dir,'lineags.csv') - alias_key=os.path.join(tmp_dir,'alias_key.json') - print('Download lineages') + lineages_url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/lineages.csv" + alias_key_url = "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/pango_designation/alias_key.json" + lineag = os.path.join(tmp_dir, "lineags.csv") + alias_key = os.path.join(tmp_dir, "alias_key.json") + print("Download lineages") url_content = requests.get(lineages_url).content - csv_file = open(lineag, 'wb') + csv_file = open(lineag, "wb") csv_file.write(url_content) csv_file.close() - print('Download alias_key') + print("Download alias_key") items = requests.get(alias_key_url) data = items.json() - with open( alias_key , 'w') as f: + with open(alias_key, "w") as f: json.dump(data, f) - return alias_key,lineag + return alias_key, lineag + def process_lineage(alias_key_path, lineages_path, output): """ # handle duplicate values with open(alias_key_path) as f: - # load json objects to dictionaries + # load json objects to dictionaries data_dict = json.load(f) for k, v in data_dict.items(): if type(v) is list: data_dict[k] = list(set(v)) - # rewrite the json + # rewrite the json with open(alias_key_path ,'w') as nf: json.dump(data_dict, nf) """ - print('Create all lineages') + print("Create all lineages") aliasor = Aliasor(alias_key_path) df_lineages = pd.read_csv(lineages_path) lineages = df_lineages.lineage.unique() @@ -122,36 +126,37 @@ def process_lineage(alias_key_path, lineages_path, output): uncompressed_lineages.sort(key=lts) sorted_lineages = list(map(aliasor.compress, uncompressed_lineages)) - print('Calculate parent-child relationship') + print("Calculate parent-child relationship") # -- Fill the sub child -- # the current method is working, but it is not good in term of performance # the algoritm use 3 loops - _final_list=[] - for _id in sorted_lineages: + _final_list = [] + for _id in sorted_lineages: alias_lineage_char = aliasor.uncompress(_id) sub_lineage_list = [] row_dict = {} - #print(_id, '=',alias_lineage_char) # check alias name - for name_ in uncompressed_lineages: # fetch all lineage again - for index, letter in enumerate(name_.split('.')): - if index!=0: - letter=root+'.'+letter - root=letter + # print(_id, '=',alias_lineage_char) # check alias name + for name_ in uncompressed_lineages: # fetch all lineage again + for index, letter in enumerate(name_.split(".")): + if index != 0: + letter = root + "." 
+ letter + root = letter else: - root=letter + root = letter if letter == alias_lineage_char: sub_lineage_list.append(aliasor.compress(name_)) - - sub_lineage_list.remove(aliasor.compress(alias_lineage_char)) # remove root lineage - if(len(sub_lineage_list)>0): - row_dict['lineage']= _id - row_dict['sublineage']= ",".join(sub_lineage_list) - else: - row_dict['lineage']= _id - row_dict['sublineage']= "none" - _final_list.append(row_dict) - print('Write output:',output) - df = pd.DataFrame.from_dict(_final_list, orient='columns') - df = df[df.lineage != ''] - df.sort_values(by=['lineage']).to_csv(output, sep="\t", index=False) + sub_lineage_list.remove( + aliasor.compress(alias_lineage_char) + ) # remove root lineage + if len(sub_lineage_list) > 0: + row_dict["lineage"] = _id + row_dict["sublineage"] = ",".join(sub_lineage_list) + else: + row_dict["lineage"] = _id + row_dict["sublineage"] = "none" + _final_list.append(row_dict) + print("Write output:", output) + df = pd.DataFrame.from_dict(_final_list, orient="columns") + df = df[df.lineage != ""] + df.sort_values(by=["lineage"]).to_csv(output, sep="\t", index=False) diff --git a/lib/__init__.py b/lib/__init__.py old mode 100755 new mode 100644 diff --git a/lib/db.sqlite b/lib/db.sqlite old mode 100755 new mode 100644 diff --git a/lib/doctest_b117.fna b/lib/doctest_b117.fna old mode 100755 new mode 100644 diff --git a/lib/doctest_b117.pickle b/lib/doctest_b117.pickle old mode 100755 new mode 100644 diff --git a/lib/migrate/4.sql b/lib/migrate/4.sql old mode 100755 new mode 100644 index 3b63701..1f530c9 --- a/lib/migrate/4.sql +++ b/lib/migrate/4.sql @@ -46,4 +46,4 @@ LEFT JOIN sequence USING (seqhash) LEFT JOIN sequence2prot USING (seqhash) LEFT JOIN prot USING (varid); -COMMIT; \ No newline at end of file +COMMIT; diff --git a/lib/ref.fna b/lib/ref.fna old mode 100755 new mode 100644 diff --git a/lib/ref.gff3 b/lib/ref.gff3 old mode 100755 new mode 100644 diff --git a/lib/sonardb.py b/lib/sonardb.py old mode 100755 new mode 100644 index 41c9157..936d0da --- a/lib/sonardb.py +++ b/lib/sonardb.py @@ -1,3044 +1,3418 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -#author: Stephan Fuchs (Robert Koch Institute, MF1, fuchss@rki.de) +# author: Stephan Fuchs (Robert Koch Institute, MF1, fuchss@rki.de) +import base64 +from collections import defaultdict +from collections import OrderedDict +from contextlib import ExitStack +import csv +import itertools import os +import pickle import re -import sys -import argparse +import shutil +import signal import sqlite3 -from sqlite3 import Error -from Bio.SeqUtils.CheckSum import seguid -from Bio.Seq import Seq +import sys +from tempfile import mkdtemp +from tempfile import mkstemp +from tempfile import TemporaryDirectory +import traceback +from urllib.parse import quote as urlquote + from Bio import SeqIO from Bio.Emboss.Applications import StretcherCommandline -from packaging import version -import shutil -import base64 -from collections import OrderedDict, defaultdict -import pickle -from tqdm import tqdm -from urllib.parse import quote as urlquote -from math import floor, ceil -from tempfile import mkstemp, TemporaryDirectory, mkdtemp -import traceback -import itertools -import signal -import csv -from time import sleep -from contextlib import ExitStack -from more_itertools import consecutive_groups, split_when +from Bio.Seq import Seq +from Bio.SeqUtils.CheckSum import seguid import pandas as pd +from tqdm import tqdm # COMPATIBILITY SUPPORTED_DB_VERSION = 4 -class sonarTimeout(): - """ - this class 
is a helper class raising a TimeoutError within a defined context - - Example - -------- - - >>> with sonarTimeout(1) as _: - ... sleep(60) - Traceback (most recent call last): - ... - TimeoutError: Timeout - - Parameters - ---------- - - seconds : int - define time in seconds until TimeoutError is raised - values below 1 will not raise any TimeoutError - error_message: str - define error message to be shown [ 'Timeout' ] - - Attributes - ---------- - - seconds : int - time in seconds when a TimeoutError is raised - error_message : str [ 'Timeout' ] - error message to be shown - """ - def __init__(self, seconds, error_message='Timeout'): - self.seconds = seconds - self.error_message = error_message - - def __enter__(self): - if self.seconds > 0: - signal.signal(signal.SIGALRM, self.handle_timeout) - signal.alarm(self.seconds) - - def __exit__(self, type, value, traceback): - if self.seconds > 0: - signal.alarm(0) - - def handle_timeout(self, signum, frame): - raise TimeoutError(self.error_message) - - -class sonarFiler(): - """ - this class is a helper class providing a (temporary) file handler for - writing in a given context - - Notes - ----- - Please consider, that an existing file will be overwritten. - - Examples - -------- - - >>> with sonarFiler() as handle: - ... fname = handle.name - ... os.path.isfile(fname) - True - >>> os.path.isfile(fname) - False - - Parameters - ---------- - fname : str [ None ] - define designated file name to open. If None, a temporary file is - created and deleted after use. - - Attributes - ---------- - name : str - stores the given file name - basename : str - stores file basename - path : str - stores absolute file path - tmp : bool - stores True if it is a temporary file else False - handle: file handler - opened file handler - """ - def __init__(self, fname = None): - self.fname = fname - self.tmp = True if fname is None else False - - def __enter__(self): - if self.fname is None: - self.handle, path = mkstemp() - else: - self.handle = open(self.fname, "w") - path = self.fname - self.name = os.path.abspath(path) - self.basename = os.path.basename(path) - self.path = os.path.dirname(path) - return self - - def __exit__(self, type, value, traceback): - if self.tmp: - os.remove(self.name) + +class sonarTimeout: + """ + this class is a helper class raising a TimeoutError within a defined context + + Example + -------- + + >>> with sonarTimeout(1) as _: + ... sleep(60) + Traceback (most recent call last): + ... + TimeoutError: Timeout + + Parameters + ---------- + + seconds : int + define time in seconds until TimeoutError is raised + values below 1 will not raise any TimeoutError + error_message: str + define error message to be shown [ 'Timeout' ] + + Attributes + ---------- + + seconds : int + time in seconds when a TimeoutError is raised + error_message : str [ 'Timeout' ] + error message to be shown + """ + + def __init__(self, seconds, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def __enter__(self): + if self.seconds > 0: + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + if self.seconds > 0: + signal.alarm(0) + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + +class sonarFiler: + """ + this class is a helper class providing a (temporary) file handler for + writing in a given context + + Notes + ----- + Please consider, that an existing file will be overwritten. 
+ + Examples + -------- + + >>> with sonarFiler() as handle: + ... fname = handle.name + ... os.path.isfile(fname) + True + >>> os.path.isfile(fname) + False + + Parameters + ---------- + fname : str [ None ] + define designated file name to open. If None, a temporary file is + created and deleted after use. + + Attributes + ---------- + name : str + stores the given file name + basename : str + stores file basename + path : str + stores absolute file path + tmp : bool + stores True if it is a temporary file else False + handle: file handler + opened file handler + """ + + def __init__(self, fname=None): + self.fname = fname + self.tmp = True if fname is None else False + + def __enter__(self): + if self.fname is None: + self.handle, path = mkstemp() + else: + self.handle = open(self.fname, "w") + path = self.fname + self.name = os.path.abspath(path) + self.basename = os.path.basename(path) + self.path = os.path.dirname(path) + return self + + def __exit__(self, type, value, traceback): + if self.tmp: + os.remove(self.name) + class sonarCDS(object): - """ - this object stores information about a coding sequence (CDS) - - Notes - ----- - Please note, that genomic coordinates are processed and returned 0-based - by this object. While start or single coordinates are inclusive, - end coordinates of ranges are exclusive, expressed in a mathematical - notation: [start, end) - - Examples - -------- - - Initiating an sonarCDS object: - - >>> cds = sonarCDS("Loc1", "ORF1", [(155, 170)], ["ATGTTATGAATGGCC"], "+") - - Accessing amino acid sequence (genetic code 1): - - >>> cds.aa - 'ML*MA' - - Accessing CDS coordinates or genome range: - - >>> cds.coords - (155, 170) - >>> cds.range - range(155, 170) - - Parameters - ---------- - locus: str [ None ] - define the gene locus accession - symbol : str - define the gene/protein symbol - (e.g. ORF1b) - coords : int - define a sorted list of (start, end) tuples describing all exons of the - gene (coordinates are 0-based, starts inclusive, ends exclusive, - start always lower than end) - seqs : list - define a sorted list of exon nucleotide sequences (alswas forward strand - sequence) - strand : {'+', '-'} - define the genomic strand the gene is encoded on (+ or -) - translation_table : int [ 1 ] - define the genetic code table used for in silico translation (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) - - Attributes - ---------- - locus: str [ None ] - sores the gene locus accession - symbol : str - stores the gene/protein symbol - start : int - stores the genomic start coordinate (0-based, inclusive). The start - coordinate is always lower than the end coordinate. - end : int - stores the genomic end coordinate (0-based, exclusive). The end - coordinate is always greater than the start coordinate. 
- coordlist : list - stores a list of (start, end) tuples of of all exons (coordinates are - 0-based, starts inclusive, ends exclusive, start always lower than end) - ranges : list - stores a list of exon ranges - nuc : str - stores the coding nucleotide sequence (joined exons) - aa : str - stores the in silico translated amino acid sequence - coding_positions: list - stores all genomic positions part of the coding sequences of this gene - as ordered list (positions might be redundant in case of ribosomal slippage) - coding_positions_set: set - stores all genomic positions part of the coding sequences of this gene - as set - translation_table : int [ 1 ] - stores the genetic code table used for in silico translation (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) - """ - - def __init__(self, locus, symbol, coords, seqs, strand, translation_table=1): - self.symbol = symbol - self.locus = locus - self.start = coords[0][0] # inclusive - self.end = coords[-1][1] # exclusive - self.strand = strand - self.seqs = seqs - self.coordlist = coords - self.ranges = [range(s, e) for s, e in coords] - self.translation_table = translation_table - self.__aa = None - self.__nuc = None - self.__coding_positions = None - self.__coding_positions_set = None - - @property - def nuc(self): - if self.__nuc is None: - self.__nuc = "".join(self.seqs) - return self.__nuc - - @property - def aa(self): - if self.__aa is None: - nuc = self.nuc if self.strand == "+" else str(Seq(self.nuc).reverse_complement()) - l = len(nuc) - if l%3 == 1: - l = -1 - elif l%3 == 2: - l = -2 - self.__aa = str(Seq.translate(nuc[:l], table=self.translation_table)) - return self.__aa - - @property - def coords(self): - return self.start, self.end - - @property - def range(self): - return range(self.start, self.end) - - @property - def coding_positions(self): - if self.__coding_positions is None: - self.__coding_positions = [] - for r in self.ranges: - self.__coding_positions.extend(list(r)) - return self.__coding_positions - - @property - def coding_positions_set(self): - if self.__coding_positions_set is None: - self.__coding_positions_set = set(self.coding_positions) - return self.__coding_positions_set - - def aa_to_nuc_pos(self, x): - """ - function to return the respective lower-bound genome position for - a given protein position - - Examples - -------- - - >>> cds=sonarCDS("loc1", "prot1", [(10, 24)], ['ATGTTATCCTGAAA'], "+") - >>> cds.aa_to_nuc_pos(0) - 10 - >>> cds.aa_to_nuc_pos(2) - 16 - - Parameters - ---------- - x : int - define the protein position to convert (0-based, inclusive) - - Returns - ------- - int - respective lower-bound genome position - """ - return self.coding_positions[3*x] - - def iter_coords(self): - """ - function to iterate over genomic coordinate of the coding part of an - annotated coding sequence (CDS). - - Examples - -------- - - >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (14, 16)], ['ATGTG', 'CTAATGA'], "+") - >>> for i in cds.iter_coords(): - ... print(i) - 10 - 11 - 12 - 13 - 14 - 14 - 15 - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int - genomic end coordniate (0-based, exclusive, has to be greater than x) - [ None ] - - Returns - ------- - bool - True if coordinate(s) within coding part of CDS, False otherwise. - - """ - for i in self.coding_positions: - yield i - - def is_exon(self, x, y=None): - """ - function to check if a given genomic coordinate (range) overlaps with the - coding part of this coding sequence (CDS). 
- - Examples - -------- - - >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") - >>> cds.is_exon(10) - True - >>> cds.is_exon(16) - False - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int [ None ] - genomic end coordniate (0-based, exclusive) - - Returns - ------- - bool - True if coordinate(s) are within or overlapping with this exons of - the CDS, False otherwise. - - Dev Note - -------- - Working with intersection of ranges is for long sequences less performant - - """ - if y is None: - y = x + 1 - for start, end in self.coordlist: - if y >= start and end >= x: - return True - return False - - def is_cds(self, x, y=None): - """ - function to check if a given genomic coordinate (range) overlaps with - this coding sequence (CDS). - - Examples - -------- - - >>> gff=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") - >>> gff.is_cds(10) - True - >>> gff.is_cds(16) - True - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int [ None ] - genomic end coordinate (0-based, exclusive, greater than x) - - Returns - ------- - bool - True if coordinate(s) are within or overlapping with this CDS, False otherwise. - - Dev Note - -------- - Working with intersection of ranges is for long sequences less performant - - """ - if y is None: - y = x + 1 - return y >= self.start and self.end >= x - - def is_frameshift_del(self, x, y): - """ - function to check if a deletion of a given genomic range (x to y) - leads to an frameshift within this CDS. - - Examples - -------- - - >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") - >>> cds.is_frameshift_del(11, 13) - True - >>> cds.is_frameshift_del(14,16) - True - >>> cds.is_frameshift_del(15,17) - False - >>> cds.is_frameshift_del(27,30) - False - - >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (15, 16), (15,20)], ['ATGTG', 'G', 'GATC'], "+") - >>> cds.is_frameshift_del(15, 16) - False - >>> cds.is_frameshift_del(13, 16) - True - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int - genomic end coordinate (0-based, exclusive, greater than x) - - Returns - ------- - bool - True if deletion of the given genomic range causes a frameshift mutation - within the CDS, False otherwise. - """ - if self.is_cds(x, y) and len([True for z in self.coding_positions if z < x or z >= y])%3 != 0: - return True - return False - - def is_frameshift_in(self, x, l): - """ - function to check if a insertion at given genomic position (x) with a given - insertion length (l) leads to an frameshift within this CDS. - - Examples - -------- - - >>> cds=sonarCDS("loc1", "prot1", [(10, 16), (15, 21)], ['ATGTGC', 'GATNTC'], "+") - >>> cds.is_frameshift_in(12, 3) - False - >>> cds.is_frameshift_in(12, 7) - True - >>> cds.is_frameshift_in(15, 4) - True - >>> cds.is_frameshift_in(15, 3) - False - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - l : int - length of insertion (excluding anchor base) - - Returns - ------- - bool - True if deletion of the given genomic range causes a frameshift mutation - within the CDS, False otherwise. - """ - if l%3 != 0 and x in self.coding_positions_set: - return True - return False + """ + this object stores information about a coding sequence (CDS) + + Notes + ----- + Please note, that genomic coordinates are processed and returned 0-based + by this object. 
While start or single coordinates are inclusive, + end coordinates of ranges are exclusive, expressed in a mathematical + notation: [start, end) + + Examples + -------- + + Initiating an sonarCDS object: + + >>> cds = sonarCDS("Loc1", "ORF1", [(155, 170)], ["ATGTTATGAATGGCC"], "+") + + Accessing amino acid sequence (genetic code 1): + + >>> cds.aa + 'ML*MA' + + Accessing CDS coordinates or genome range: + + >>> cds.coords + (155, 170) + >>> cds.range + range(155, 170) + + Parameters + ---------- + locus: str [ None ] + define the gene locus accession + symbol : str + define the gene/protein symbol + (e.g. ORF1b) + coords : int + define a sorted list of (start, end) tuples describing all exons of the + gene (coordinates are 0-based, starts inclusive, ends exclusive, + start always lower than end) + seqs : list + define a sorted list of exon nucleotide sequences (alswas forward strand + sequence) + strand : {'+', '-'} + define the genomic strand the gene is encoded on (+ or -) + translation_table : int [ 1 ] + define the genetic code table used for in silico translation (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) + + Attributes + ---------- + locus: str [ None ] + sores the gene locus accession + symbol : str + stores the gene/protein symbol + start : int + stores the genomic start coordinate (0-based, inclusive). The start + coordinate is always lower than the end coordinate. + end : int + stores the genomic end coordinate (0-based, exclusive). The end + coordinate is always greater than the start coordinate. + coordlist : list + stores a list of (start, end) tuples of of all exons (coordinates are + 0-based, starts inclusive, ends exclusive, start always lower than end) + ranges : list + stores a list of exon ranges + nuc : str + stores the coding nucleotide sequence (joined exons) + aa : str + stores the in silico translated amino acid sequence + coding_positions: list + stores all genomic positions part of the coding sequences of this gene + as ordered list (positions might be redundant in case of ribosomal slippage) + coding_positions_set: set + stores all genomic positions part of the coding sequences of this gene + as set + translation_table : int [ 1 ] + stores the genetic code table used for in silico translation (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) + """ + + def __init__(self, locus, symbol, coords, seqs, strand, translation_table=1): + self.symbol = symbol + self.locus = locus + self.start = coords[0][0] # inclusive + self.end = coords[-1][1] # exclusive + self.strand = strand + self.seqs = seqs + self.coordlist = coords + self.ranges = [range(s, e) for s, e in coords] + self.translation_table = translation_table + self.__aa = None + self.__nuc = None + self.__coding_positions = None + self.__coding_positions_set = None + + @property + def nuc(self): + if self.__nuc is None: + self.__nuc = "".join(self.seqs) + return self.__nuc + + @property + def aa(self): + if self.__aa is None: + nuc = ( + self.nuc + if self.strand == "+" + else str(Seq(self.nuc).reverse_complement()) + ) + l = len(nuc) + if l % 3 == 1: + l = -1 + elif l % 3 == 2: + l = -2 + self.__aa = str(Seq.translate(nuc[:l], table=self.translation_table)) + return self.__aa + + @property + def coords(self): + return self.start, self.end + + @property + def range(self): + return range(self.start, self.end) + + @property + def coding_positions(self): + if self.__coding_positions is None: + self.__coding_positions = [] + for r in self.ranges: + 
self.__coding_positions.extend(list(r)) + return self.__coding_positions + + @property + def coding_positions_set(self): + if self.__coding_positions_set is None: + self.__coding_positions_set = set(self.coding_positions) + return self.__coding_positions_set + + def aa_to_nuc_pos(self, x): + """ + function to return the respective lower-bound genome position for + a given protein position + + Examples + -------- + + >>> cds=sonarCDS("loc1", "prot1", [(10, 24)], ['ATGTTATCCTGAAA'], "+") + >>> cds.aa_to_nuc_pos(0) + 10 + >>> cds.aa_to_nuc_pos(2) + 16 + + Parameters + ---------- + x : int + define the protein position to convert (0-based, inclusive) + + Returns + ------- + int + respective lower-bound genome position + """ + return self.coding_positions[3 * x] + + def iter_coords(self): + """ + function to iterate over genomic coordinate of the coding part of an + annotated coding sequence (CDS). + + Examples + -------- + + >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (14, 16)], ['ATGTG', 'CTAATGA'], "+") + >>> for i in cds.iter_coords(): + ... print(i) + 10 + 11 + 12 + 13 + 14 + 14 + 15 + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int + genomic end coordniate (0-based, exclusive, has to be greater than x) + [ None ] + + Returns + ------- + bool + True if coordinate(s) within coding part of CDS, False otherwise. + + """ + for i in self.coding_positions: + yield i + + def is_exon(self, x, y=None): + """ + function to check if a given genomic coordinate (range) overlaps with the + coding part of this coding sequence (CDS). + + Examples + -------- + + >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") + >>> cds.is_exon(10) + True + >>> cds.is_exon(16) + False + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int [ None ] + genomic end coordniate (0-based, exclusive) + + Returns + ------- + bool + True if coordinate(s) are within or overlapping with this exons of + the CDS, False otherwise. + + Dev Note + -------- + Working with intersection of ranges is for long sequences less performant + + """ + if y is None: + y = x + 1 + for start, end in self.coordlist: + if y >= start and end >= x: + return True + return False + + def is_cds(self, x, y=None): + """ + function to check if a given genomic coordinate (range) overlaps with + this coding sequence (CDS). + + Examples + -------- + + >>> gff=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") + >>> gff.is_cds(10) + True + >>> gff.is_cds(16) + True + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int [ None ] + genomic end coordinate (0-based, exclusive, greater than x) + + Returns + ------- + bool + True if coordinate(s) are within or overlapping with this CDS, False otherwise. + + Dev Note + -------- + Working with intersection of ranges is for long sequences less performant + + """ + if y is None: + y = x + 1 + return y >= self.start and self.end >= x + + def is_frameshift_del(self, x, y): + """ + function to check if a deletion of a given genomic range (x to y) + leads to an frameshift within this CDS. 
+ + Examples + -------- + + >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (25, 32)], ['ATGTG', 'CTAATGA'], "+") + >>> cds.is_frameshift_del(11, 13) + True + >>> cds.is_frameshift_del(14,16) + True + >>> cds.is_frameshift_del(15,17) + False + >>> cds.is_frameshift_del(27,30) + False + + >>> cds=sonarCDS("loc1", "prot1", [(10, 15), (15, 16), (15,20)], ['ATGTG', 'G', 'GATC'], "+") + >>> cds.is_frameshift_del(15, 16) + False + >>> cds.is_frameshift_del(13, 16) + True + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int + genomic end coordinate (0-based, exclusive, greater than x) + + Returns + ------- + bool + True if deletion of the given genomic range causes a frameshift mutation + within the CDS, False otherwise. + """ + if ( + self.is_cds(x, y) + and len([True for z in self.coding_positions if z < x or z >= y]) % 3 != 0 + ): + return True + return False + + def is_frameshift_in(self, x, l): + """ + function to check if a insertion at given genomic position (x) with a given + insertion length (l) leads to an frameshift within this CDS. + + Examples + -------- + + >>> cds=sonarCDS("loc1", "prot1", [(10, 16), (15, 21)], ['ATGTGC', 'GATNTC'], "+") + >>> cds.is_frameshift_in(12, 3) + False + >>> cds.is_frameshift_in(12, 7) + True + >>> cds.is_frameshift_in(15, 4) + True + >>> cds.is_frameshift_in(15, 3) + False + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + l : int + length of insertion (excluding anchor base) + + Returns + ------- + bool + True if deletion of the given genomic range causes a frameshift mutation + within the CDS, False otherwise. + """ + if l % 3 != 0 and x in self.coding_positions_set: + return True + return False + class sonarGFF(object): - """ - this object stores CDS objects based on a GFF3 file. - - Notes - ----- - Please note, that genomic coordinates are processed and returned 0-based - by this object. While start or single coordinates are inclusive, - end coordinates of ranges are exclusive, expressed in a mathematical - notation: [start, end) - - Please note, that only single molecule genome annotations can be handled - by this object. - - Examples - -------- - - Initiating an sonarGFF object. In this example the REF_GFF_FILE and REF_FASTA_FILE - variable stores the path of an GFF3 and FASTA file containing the annotation - and genomic sequence of the SARS-COV-2 NC_045512.2, respectively. 
- - >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) - - Parameters - ---------- - gff3 : str - define a path to a valid GFF3 file storing genome annotation - fna : str - define a path to a valid FASTA file storing the nucleotide - sequence of the annotated genome - translation_table : int - define the genetic code table used for in silico translation of CDS (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] - - Attributes - ---------- - translation_table : int - stores the genetic code table used for in silico translation of CDS (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) - cds : list - stores a list of sonarCDS objects (one per CDS of the given annotation) - coords : dict - stores a dictionary with protein symbol as keys and respective 0-based - genomic coordinate tuples (start always lower than end coordinate, - start coordinate inclusive, end coordinate exclusive) - ranges : dict - stores a dictionary with protein symbol as keys and respective list of - coding ranges - cds_positions : set - stores a set of all genomic positions annotated within a coding gene - (includes exons and introns). - exon_positions : set - stores a set of all genomic positions within annotated exons. - symbols : list - stores a list of protein symbols - - """ - - def __init__(self, gff3, fna, translation_table=1): - self.translation_table = translation_table - self.cds = self.process_gff3(gff3, fna) - self.coords = { x.symbol: (x.start, x.end) for x in self.cds } - self.ranges = { x.symbol: x.ranges for x in self.cds } - self.symbols = [ x.symbol for x in self.cds ] - self.__cds_positions = None - self.__exon_positions = None - - @property - def cds_positions(self): - if self.__cds_positions is None: - positions = set() - for ranges in self.ranges.values(): - for r in ranges: - positions.update(r) - self.__cds_positions = positions - return self.__cds_positions - - @property - def exon_positions(self): - if self.__exon_positions is None: - positions = set() - for ranges in self.ranges.values(): - for r in ranges: - positions.update(r) - self.__exon_positions = positions - return self.__exon_positions - - def in_any_exon(self, x, y=None): - """ - function to check if a given genomic coordinate (range) overlaps with the - coding region of any annotated coding sequence (CDS). - - Examples - -------- - - >>> gff=sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) - >>> gff.in_any_exon(21562) - True - >>> gff.in_any_exon(25384) - False - >>> gff.in_any_exon(25380, 25384) - True - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int [ None ] - genomic end coordniate (0-based, exclusive, greater than x) - - Returns - ------- - bool - True if coordinate(s) within CDS, False otherwise. - - """ - x = {x, } if y is None else range(x, y) - if self.exon_positions.intersection(x): - return True - else: - return False - - def in_any_cds(self, x, y=None): - """ - function to check if a given genomic coordinate (range) overlaps with an - annotated coding sequence (CDS). - - Examples - -------- - - >>> gff=sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) - >>> gff.in_any_cds(21562) - True - >>> gff.in_any_cds(25384) - False - >>> gff.in_any_cds(25380, 25384) - True - - Parameters - ---------- - x : int - genomic (start) coordinate (0-based, inclusive) - y : int - genomic end coordniate (0-based, exclusive) [ None ] - - Returns - ------- - bool - True if coordinate(s) within CDS, False otherwise. 
- - """ - x = {x, } if y is None else range(x, y) - if self.cds_positions.intersection(x): - return True - else: - return False - - def process_gff3(self, gff, fna): - """ - function to parse CDS from a given GFF3 file - - Examples - -------- - - >>> os.chdir(os.path.dirname(os.path.realpath(__file__))) - >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) - >>> gff.coords == {'ORF1a': (265, 13483), 'ORF1b': (265, 21555), 'S': (21562, 25384), 'ORF3a': (25392, 26220), 'E': (26244, 26472), 'M': (26522, 27191), 'ORF6': (27201, 27387), 'ORF7a': (27393, 27759), 'ORF7b': (27755, 27887), 'ORF8': (27893, 28259),'N': (28273, 29533), 'ORF10': (29557, 29674)} - True - - Parameters - ---------- - gff : str - path to a valid GFF3 file storing the genome annotation - fna : str - path to a valid FASTA file storing the respective genome sequence - - Returns - ------- - list - list of sonarCDS objects for CDS annotated in the given GFF3 file - sorted by CDS start (lower genomic coordinate). - - """ - - symbol_regex = re.compile("gene=([^;]+)(?:;|$)") - locus_regex = re.compile("locus_tag=([^;]+)(?:;|$)") - id_regex = re.compile("ID=([^;]+)(?:;|$)") - - record = SeqIO.read(fna, "fasta") - gseq = str(record.seq).upper() - - with open(gff, "r") as handle: - cds = {} - for line in handle: - fields = line.rstrip("\r\n").split("\t") - if line.startswith("#") or len(fields) < 7: - continue - if fields[2] == "CDS": - id = id_regex.search(fields[-1]).groups(1)[0] - symbol = symbol_regex.search(fields[-1]).groups(1)[0] - locus = locus_regex.search(fields[-1]).groups(1)[0] - strand = fields[6] - s = int(fields[3])-1 - e = int(fields[4]) - if id not in cds: - cds[id] = { - 'locus': locus, - 'symbol': symbol, - 'coords': [(s, e)], - 'strand': strand - } - elif id in cds: - if symbol != cds[id]['symbol']: - sys.exit("gff3 error: multiple symbols for locus " + locus) - if strand != cds[id]['strand']: - sys.exit("gff3 error: different strands for locus " + locus) - cds[id]['coords'].append((s, e)) - - cdsobjs = [] - for locus, data in cds.items(): - seqs = [] - for s, e in data['coords']: - if data['strand'] == "+": - seqs.append(gseq[s:e]) - else: - seqs.append(str(Seq.reverse_complement(gseq[s:e]))) - cdsobjs.append(sonarCDS(data['locus'], data['symbol'], data['coords'], seqs, data['strand'], self.translation_table)) - - return sorted(cdsobjs, key=lambda x: x.start) + """ + this object stores CDS objects based on a GFF3 file. + + Notes + ----- + Please note, that genomic coordinates are processed and returned 0-based + by this object. While start or single coordinates are inclusive, + end coordinates of ranges are exclusive, expressed in a mathematical + notation: [start, end) + + Please note, that only single molecule genome annotations can be handled + by this object. + + Examples + -------- + + Initiating an sonarGFF object. In this example the REF_GFF_FILE and REF_FASTA_FILE + variable stores the path of an GFF3 and FASTA file containing the annotation + and genomic sequence of the SARS-COV-2 NC_045512.2, respectively. 
+ + >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) + + Parameters + ---------- + gff3 : str + define a path to a valid GFF3 file storing genome annotation + fna : str + define a path to a valid FASTA file storing the nucleotide + sequence of the annotated genome + translation_table : int + define the genetic code table used for in silico translation of CDS (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] + + Attributes + ---------- + translation_table : int + stores the genetic code table used for in silico translation of CDS (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) + cds : list + stores a list of sonarCDS objects (one per CDS of the given annotation) + coords : dict + stores a dictionary with protein symbol as keys and respective 0-based + genomic coordinate tuples (start always lower than end coordinate, + start coordinate inclusive, end coordinate exclusive) + ranges : dict + stores a dictionary with protein symbol as keys and respective list of + coding ranges + cds_positions : set + stores a set of all genomic positions annotated within a coding gene + (includes exons and introns). + exon_positions : set + stores a set of all genomic positions within annotated exons. + symbols : list + stores a list of protein symbols + + """ + + def __init__(self, gff3, fna, translation_table=1): + self.translation_table = translation_table + self.cds = self.process_gff3(gff3, fna) + self.coords = {x.symbol: (x.start, x.end) for x in self.cds} + self.ranges = {x.symbol: x.ranges for x in self.cds} + self.symbols = [x.symbol for x in self.cds] + self.__cds_positions = None + self.__exon_positions = None + + @property + def cds_positions(self): + if self.__cds_positions is None: + positions = set() + for ranges in self.ranges.values(): + for r in ranges: + positions.update(r) + self.__cds_positions = positions + return self.__cds_positions + + @property + def exon_positions(self): + if self.__exon_positions is None: + positions = set() + for ranges in self.ranges.values(): + for r in ranges: + positions.update(r) + self.__exon_positions = positions + return self.__exon_positions + + def in_any_exon(self, x, y=None): + """ + function to check if a given genomic coordinate (range) overlaps with the + coding region of any annotated coding sequence (CDS). + + Examples + -------- + + >>> gff=sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) + >>> gff.in_any_exon(21562) + True + >>> gff.in_any_exon(25384) + False + >>> gff.in_any_exon(25380, 25384) + True + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int [ None ] + genomic end coordniate (0-based, exclusive, greater than x) + + Returns + ------- + bool + True if coordinate(s) within CDS, False otherwise. + + """ + x = ( + { + x, + } + if y is None + else range(x, y) + ) + if self.exon_positions.intersection(x): + return True + else: + return False + + def in_any_cds(self, x, y=None): + """ + function to check if a given genomic coordinate (range) overlaps with an + annotated coding sequence (CDS). + + Examples + -------- + + >>> gff=sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) + >>> gff.in_any_cds(21562) + True + >>> gff.in_any_cds(25384) + False + >>> gff.in_any_cds(25380, 25384) + True + + Parameters + ---------- + x : int + genomic (start) coordinate (0-based, inclusive) + y : int + genomic end coordniate (0-based, exclusive) [ None ] + + Returns + ------- + bool + True if coordinate(s) within CDS, False otherwise. 
+ + """ + x = ( + { + x, + } + if y is None + else range(x, y) + ) + if self.cds_positions.intersection(x): + return True + else: + return False + + def process_gff3(self, gff, fna): + """ + function to parse CDS from a given GFF3 file + + Examples + -------- + + >>> os.chdir(os.path.dirname(os.path.realpath(__file__))) + >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) + >>> gff.coords == {'ORF1a': (265, 13483), 'ORF1b': (265, 21555), 'S': (21562, 25384), 'ORF3a': (25392, 26220), 'E': (26244, 26472), 'M': (26522, 27191), 'ORF6': (27201, 27387), 'ORF7a': (27393, 27759), 'ORF7b': (27755, 27887), 'ORF8': (27893, 28259),'N': (28273, 29533), 'ORF10': (29557, 29674)} + True + + Parameters + ---------- + gff : str + path to a valid GFF3 file storing the genome annotation + fna : str + path to a valid FASTA file storing the respective genome sequence + + Returns + ------- + list + list of sonarCDS objects for CDS annotated in the given GFF3 file + sorted by CDS start (lower genomic coordinate). + + """ + + symbol_regex = re.compile("gene=([^;]+)(?:;|$)") + locus_regex = re.compile("locus_tag=([^;]+)(?:;|$)") + id_regex = re.compile("ID=([^;]+)(?:;|$)") + + record = SeqIO.read(fna, "fasta") + gseq = str(record.seq).upper() + + with open(gff, "r") as handle: + cds = {} + for line in handle: + fields = line.rstrip("\r\n").split("\t") + if line.startswith("#") or len(fields) < 7: + continue + if fields[2] == "CDS": + id = id_regex.search(fields[-1]).groups(1)[0] + symbol = symbol_regex.search(fields[-1]).groups(1)[0] + locus = locus_regex.search(fields[-1]).groups(1)[0] + strand = fields[6] + s = int(fields[3]) - 1 + e = int(fields[4]) + if id not in cds: + cds[id] = { + "locus": locus, + "symbol": symbol, + "coords": [(s, e)], + "strand": strand, + } + elif id in cds: + if symbol != cds[id]["symbol"]: + sys.exit("gff3 error: multiple symbols for locus " + locus) + if strand != cds[id]["strand"]: + sys.exit("gff3 error: different strands for locus " + locus) + cds[id]["coords"].append((s, e)) + + cdsobjs = [] + for locus, data in cds.items(): + seqs = [] + for s, e in data["coords"]: + if data["strand"] == "+": + seqs.append(gseq[s:e]) + else: + seqs.append(str(Seq.reverse_complement(gseq[s:e]))) + cdsobjs.append( + sonarCDS( + data["locus"], + data["symbol"], + data["coords"], + seqs, + data["strand"], + self.translation_table, + ) + ) + + return sorted(cdsobjs, key=lambda x: x.start) + class sonarALIGN(object): - """ - this object performs a pairwise sequence alignment and provides/stores selected - alignment functionalities/statistics. - - Notes - ----- - Please note, that genomic coordinates are processed and returned 0-based - by this object. While start or single coordinates are inclusive, - end coordinates are exclusive, expressed as mathematical notation: - [start, end) - - Please note, alignment is based on EMBOSS Stretcher. - - Example - ------- - - In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store - the path of FASTA files containing the query and reference genome sequences, - respectively. - - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) - - Parameters - ---------- - query_file : str - define a path to a valid FASTA file storing the query genome sequence - target_file : str - define a path to a valid FASTA file storing the target genome sequence - (= reference) - out_file : str [ None ] - define a path to an output file that will store the FASTA formatted - alignment. Please consider, that an existing file will be overwritten! 
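process_gff3() extracts the feature ID, gene symbol and locus tag from the GFF3 attribute column with plain regular expressions. A small self-contained example of that extraction step; the attribute string below is made up for illustration:

```python
import re

# The same attribute patterns compiled in process_gff3(); only the input string
# is invented here.
symbol_regex = re.compile("gene=([^;]+)(?:;|$)")
locus_regex = re.compile("locus_tag=([^;]+)(?:;|$)")
id_regex = re.compile("ID=([^;]+)(?:;|$)")

attributes = "ID=cds-1;gene=S;locus_tag=GU280_gp02"   # toy GFF3 column 9 content

print(id_regex.search(attributes).group(1))       # cds-1
print(symbol_regex.search(attributes).group(1))   # S
print(locus_regex.search(attributes).group(1))    # GU280_gp02
```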
- If None, a temporary file is used and deleted after processing. - sonarGFFObj : object [ None ] - define a sonarGFF object based on the reference genome annotation - - Attributes - ---------- - aligned_query : str - stores the aligned upper-case query sequence (U replaced by T) - aligned_target : str - stores the aligned upper-case target or reference sequence (U replaced by T) - gff : object - stores the sonarGFF object if provided, otherwise None - dnadiff : list - stores a list of tuples for each genomic variation (based on the alignment). - Each tuple consists of: - - reference base (or bases in case of deletions) - - query base (or bases in case of insertions) - - genomic coordinate (0-based, inclusive) - - genomic end coordinate (in case of InDels 0-based and exlusive otherwise None) - - None - - None - Accordingly to the VCF format, insertions are expressed considering the upstream - base as anchor. As a special case, an insertion at the start of the sequence - has no anchor and a genomic coordinate of -1. Deletions are expressed for each - position they occur and not fused. The last two tuple elements - are always None to keep the length according to tuples stored in aadiff. - aadiff : list - stores a list of tuples for each amino acid variation in an annotated protein. - Each tuple consists of: - - reference amino acid (or amino acids in case of deletions) - - query amino acid (or amino acids in case of insertions) - - protein position (0-based, inclusive) - - protein end position (in case of InDels 0-based and exlusive otherwise None) - - protein symbol - - gene locus - Accordingly to the VCF format, insertions are expressed considering the upstream - base as anchor. As a special case, an insertion at the start of the sequence - has no anchor and a genomic coordinate of -1. Deletions are expressed for each - position they occur and not fused. The last two tuple elements - are always None to keep the length according to tuples stored in aadiff. - """ - - def __init__(self, query_file, target_file, out_file = None, sonarGFFObj = None): - self.aligned_query, self.aligned_target = self.align_dna(query_file, target_file, out_file) - self.gff = sonarGFFObj if sonarGFFObj else None - self._insert_regex = re.compile("[^-]-+") - self._del_regex = re.compile("-+") - self._codon_regex = re.compile("[^-]-*[^-]-*[^-]-*") - self._leading_gap_regex = re.compile("^-+") - self._tailing_gap_regex = re.compile("-+$") - self._dnadiff = None - self._aadiff = None - self.__target_coords_matrix = None - - @property - def dnadiff(self): - if self._dnadiff is None: - self._dnadiff = [ x for x in self.iter_dna_vars() ] - return self._dnadiff - - @property - def aadiff(self): - if self._aadiff is None: - self._aadiff = [ x for x in self.iter_aa_vars() ] - return self._aadiff - - @property - def _target_coords_matrix(self): - if self.__target_coords_matrix is None: - self.__target_coords_matrix = [len(x.group()) for x in re.finditer(".-*", self.aligned_target)] - return self.__target_coords_matrix - - def use_stretcher(self, query_file, target_file, out_file, gapopen= 16, gapextend = 4, right_align = True): - """ - function to perform a pairwise aligment using EMBOSS Stretcher - - Parameters - ---------- - query_file : str - define a path to a valid FASTA file storing the query sequence - target_file : str - define a path to a valid FASTA file storing the target sequence - (= reference) - out_file : str - define a path to a file that will store the alignment. 
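To make the dnadiff tuple layout documented above concrete, here is a purely illustrative formatter; the textual notation it prints is invented for readability and is not covSonar's own profile syntax:

```python
# Illustrative only: render dnadiff-style tuples (ref, alt, start, end, None, None)
# as human-readable strings. The wording below is invented, not covSonar notation.
def describe_dna_var(ref, alt, start, end, *_):
    if ref and not alt:          # deletions are reported per affected position
        return "deletion of {} at reference position {}".format(ref, start)
    if len(alt) > len(ref):      # insertions are anchored on the upstream base
        return "insertion {} -> {} anchored at position {}".format(ref, alt, start)
    return "substitution {}{}{}".format(ref, start, alt)

# tuples taken from the iter_dna_vars() doctest below
print(describe_dna_var("C", "T", 3266, None, None, None))
print(describe_dna_var("T", "", 11287, None, None, None))
```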
Please consider, - that an existing file will be overwritten. - gapopen : int [ 16 ] - define penalty for gap opening - gapextend : int [ 4 ] - define penalty for gap extension - - Returns - ------- - list - list of aligned query and target sequence, in that order - """ - temp = True if not out_file else False - if temp: - handle, out_file = mkstemp() - cline = StretcherCommandline(asequence=query_file, bsequence=target_file, gapopen=gapopen, gapextend=gapextend, outfile=out_file, aformat="fasta") - stdout, stderr = cline() - alignment = [str(x.seq) for x in SeqIO.parse(out_file, "fasta")] - if temp: - os.remove(out_file) - if right_align: - alignment = self.left_align_gaps(*alignment) - return alignment - - def left_align_gaps(self, query, target): - """ - function to align gaps to the left in two aligned sequences - - Parameters - ---------- - query : str - define the query sequence in aligned form - target : str - define the target sequence (reference) in aligned form - - Returns - ------- - list - aligned query and target sequence strings with left-aligned gaps, - in that order. - """ - l = len(query)-1 - for match in re.finditer("-+", query): - s = match.start()-1 - e = match.end()-1 - g = "-" * (e-s) - while s >= 0 and e < l and query[s] == target[e]: - query = query[:s] + g + query[s] + query[e+1:] - s -= 1 - e -= 1 - for match in re.finditer("-+", target): - s = match.start()-1 - e = match.end()-1 - g = "-" * (e-s) - while s >= 0 and e < l and target[s] == query[e]: - target = target[:s] + g + target[s] + target[e+1:] - s -= 1 - e -= 1 - return query, target - - def align_dna(self, query_file, target_file, out_file=None, gapopen = 16, gapextend = 4, right_align = True): - """ - function to perform the default pairwise nucleotide aligment - - Parameters - ---------- - query_file : str - define a path to a valid FASTA file storing the query sequence - target_file : str - define a path to a valid FASTA file storing the target sequence - (= reference) - out_file : str - define a path to a file that will store the alignment. Please consider, - that an existing file will be overwritten. - gapopen : int [ 16 ] - define penalty for gap opening - gapextend : int [ 4 ] - define penalty for gap extension - - Returns - ------- - list - list of aligned query and target sequence - """ - return self.use_stretcher(query_file, target_file, out_file, gapopen, gapextend, right_align) - - def real_pos(self, x): - """ - function to convert an alignment position to the position in the - unaligned target sequence (= reference). - - Example - ------- - In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store - the path of FASTA files containing the query and reference genome sequences, - respectively. - - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) - >>> algn.real_pos(29282) - 29282 - - Parameters - ---------- - x : int - define a position within the alignment (0-based) - - Returns - ------- - int - corresponding position (0-based) in the unaligned target/reference - sequence - """ - return x - self.aligned_target[:x+1].count("-") - - def align_pos(self, x): - """ - function to convert an target/reference position to the corresponding - position in the alignment. 
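real_pos(), defined above, maps an alignment column back to the ungapped reference coordinate by subtracting the gap characters seen in the aligned reference up to and including that column. A stand-alone sketch with a toy alignment:

```python
# Stand-alone version of the real_pos() idea; the alignment is a toy example.
aligned_target = "AC-GT--A"      # reference with gaps (insertions in the query)

def real_pos(aligned_target, x):
    # subtract every gap up to and including alignment column x (0-based)
    return x - aligned_target[: x + 1].count("-")

print(real_pos(aligned_target, 3))   # 2 -> column 3 ('G') is reference position 2
print(real_pos(aligned_target, 7))   # 4 -> final 'A' is reference position 4
```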
- - Example - ------- - - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) - >>> algn.align_pos(29282) - 29282 - - Parameters - ---------- - x : int - define a reference position (0-based) - - Returns - ------- - int - corresponding position of the sequence alignment - """ - return sum(self._target_coords_matrix[:x]) - - def iter_dna_vars(self): - """ - function to iterate variations on nucleotide level. - - Example - ------- - - In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store - the path of FASTA files containing the query and reference genome sequences, - respectively. The reference is NC_045512.2 while the query is a B.1.1.7 - prototype sequence. - - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) - >>> for x in algn.iter_dna_vars(): - ... print(x) - ('C', 'T', 3266, None, None, None) - ('C', 'A', 5387, None, None, None) - ('T', 'C', 6953, None, None, None) - ('T', '', 11287, None, None, None) - ('C', '', 11288, None, None, None) - ('T', '', 11289, None, None, None) - ('G', '', 11290, None, None, None) - ('G', '', 11291, None, None, None) - ('T', '', 11292, None, None, None) - ('T', '', 11293, None, None, None) - ('T', '', 11294, None, None, None) - ('T', '', 11295, None, None, None) - ('T', '', 21764, None, None, None) - ('A', '', 21765, None, None, None) - ('C', '', 21766, None, None, None) - ('A', '', 21767, None, None, None) - ('T', '', 21768, None, None, None) - ('G', '', 21769, None, None, None) - ('T', '', 21990, None, None, None) - ('T', '', 21991, None, None, None) - ('A', '', 21992, None, None, None) - ('A', 'T', 23062, None, None, None) - ('C', 'A', 23270, None, None, None) - ('C', 'A', 23603, None, None, None) - ('C', 'T', 23708, None, None, None) - ('T', 'G', 24505, None, None, None) - ('G', 'C', 24913, None, None, None) - ('C', 'T', 27971, None, None, None) - ('G', 'T', 28047, None, None, None) - ('A', 'G', 28110, None, None, None) - ('G', 'C', 28279, None, None, None) - ('A', 'T', 28280, None, None, None) - ('T', 'A', 28281, None, None, None) - ('C', 'T', 28976, None, None, None) - - Returns - ------- - iterator of tuples - each tuple represents a nucleotide level variation and consists of: - - target nucleotide - - query nucleotide(s) - - target or reference start position (0-based - - target or reference end position (0-based) - - None - - None - Accordingly to the VCF format, insertions are expressed considering the upstream - base as anchor. As a special case, an insertion at the start of the sequence - has no anchor and a genomic coordinate of -1. Deletions are are expressed for - each position they occur and not fused. The last two tuple elements - are always None to keep the length according to tuples stored in aadiff. - """ - target = self.aligned_target - query = self.aligned_query - - # query overhead in front - match = self._leading_gap_regex.match(target) - if match: - yield "", query[:match.end()], -1, None, None, None - - # insertions - isites = set() - for match in self._insert_regex.finditer(target): - isites.add(match.start()) - s = self.real_pos(match.start()) - yield match.group()[0], query[match.start():match.end()], s, None, None, None - - # deletions and snps - for i, pair in enumerate(zip(target, query)): - if pair[0] != "-" and pair[0] != pair[1] and i not in isites: - s = self.real_pos(i) - l = len(pair[1]) - e = None if l == 1 else s + l - yield pair[0], pair[1].replace("-", ""), s, e, None, None - - def iter_aa_vars(self): - """ - function to iterate variations on amino acid level. 
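iter_dna_vars(), shown above, walks the two aligned sequences column by column: columns where the reference base differs from the query become substitutions or per-position deletions, while gap runs in the reference are reported separately as anchored insertions. A minimal sketch of the column walk on a toy alignment (insertions are skipped here for brevity):

```python
# Minimal sketch of the SNP/deletion part of iter_dna_vars(); toy alignment only.
aligned_target = "ACGT-ACGT"   # '-' marks an insertion in the query
aligned_query  = "ATG-AACGT"   # one substitution (ref pos 1), one deletion (ref pos 3)

def iter_snps_and_deletions(target, query):
    ref_pos = -1
    for t, q in zip(target, query):
        if t == "-":             # insertion site -> handled elsewhere
            continue
        ref_pos += 1             # advance the ungapped reference coordinate
        if t != q:
            yield (t, q.replace("-", ""), ref_pos)   # '' as alt means deletion

print(list(iter_snps_and_deletions(aligned_target, aligned_query)))
# [('C', 'T', 1), ('T', '', 3)]
```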
- - Example - ------- - - In this example the QRY_FASTA_FILE, REF_FASTA_FILE, and REF_GFF_FILE - variables store the path of FASTA files containing the query and - reference genome sequences as well as the reference genome annotation, - in that order. The reference is NC_045512.2 while the query is a B.1.1.7 - prototype sequence. - - Please consider, that a sonarGFF is needed to consider annotation and - deduce amino acid level profiles. - - >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE, sonarGFFObj=gff) - >>> for x in algn.iter_aa_vars(): - ... print(x) - ('T', 'I', 1000, None, 'ORF1b', 'GU280_gp01') - ('A', 'D', 1707, None, 'ORF1b', 'GU280_gp01') - ('I', 'T', 2229, None, 'ORF1b', 'GU280_gp01') - ('S', '', 3674, 3675, 'ORF1b', 'GU280_gp01') - ('G', '', 3675, 3676, 'ORF1b', 'GU280_gp01') - ('F', '', 3676, 3677, 'ORF1b', 'GU280_gp01') - ('T', 'I', 1000, None, 'ORF1a', 'GU280_gp01') - ('A', 'D', 1707, None, 'ORF1a', 'GU280_gp01') - ('I', 'T', 2229, None, 'ORF1a', 'GU280_gp01') - ('S', '', 3674, 3675, 'ORF1a', 'GU280_gp01') - ('G', '', 3675, 3676, 'ORF1a', 'GU280_gp01') - ('F', '', 3676, 3677, 'ORF1a', 'GU280_gp01') - ('I', '', 67, 68, 'S', 'GU280_gp02') - ('H', '', 68, 69, 'S', 'GU280_gp02') - ('V', '', 69, 70, 'S', 'GU280_gp02') - ('V', '', 142, 143, 'S', 'GU280_gp02') - ('Y', '', 143, 144, 'S', 'GU280_gp02') - ('N', 'Y', 500, None, 'S', 'GU280_gp02') - ('A', 'D', 569, None, 'S', 'GU280_gp02') - ('P', 'H', 680, None, 'S', 'GU280_gp02') - ('T', 'I', 715, None, 'S', 'GU280_gp02') - ('S', 'A', 981, None, 'S', 'GU280_gp02') - ('D', 'H', 1117, None, 'S', 'GU280_gp02') - ('Q', '*', 26, None, 'ORF8', 'GU280_gp09') - ('R', 'I', 51, None, 'ORF8', 'GU280_gp09') - ('Y', 'C', 72, None, 'ORF8', 'GU280_gp09') - ('D', 'L', 2, None, 'N', 'GU280_gp10') - ('S', 'F', 234, None, 'N', 'GU280_gp10') - - Returns - ------- - iterator of tuples - each tuple represents a amino acid level variation and consists of: - - target nucleotide - - query nucleotide(s) - - target or reference start position (0-based - - target or reference end position (0-based) - - protein symbol - - gene locus - Accordingly to the VCF format, insertions are expressed considering the upstream - base as anchor. As a special case, an insertion at the start of the sequence - has no anchor and a genomic coordinate of -1. Deletions are are expressed for - each position they occur and not fused. - """ - if self.gff: - for cds in self.gff.cds: - query = [] - target = [] - for s, e in cds.coordlist: - s = self.align_pos(s) - e = self.align_pos(e) - query.append(self.aligned_query[s:e]) - target.append(self.aligned_target[s:e]) - query = "".join(query) - target = "".join(target) - - if cds.strand == "-": - query.append(str(Seq.reverse_complement(query))) - target.append(str(Seq.reverse_complement(target))) - - for match in self._codon_regex.finditer(target): - s = match.start() - e = match.end() - start = int((s-target[:match.start()].count("-"))/3) - tcodon = match.group().replace("-", "") - qcodon = query[s:e].replace("-", "") - taa = self.translate(tcodon, cds.translation_table) - qaa = self.translate(qcodon, cds.translation_table) - if qaa == "": - yield taa, "", start, start+1, cds.symbol, cds.locus - else: - if qaa != taa: - e = None if len(qaa) == 1 else start + len(qaa) - yield (taa, qaa, start, e, cds.symbol, cds.locus) - - @staticmethod - def translate(seq, translation_table=1): - """ - function to translate a nucleotide sequence. 
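iter_aa_vars(), shown above, cuts each aligned CDS into reference codons with a regular expression that keeps any alignment gaps attached to the codon they interrupt, so reference and query codons can be translated and compared position by position. A short demonstration of that codon pattern on a toy aligned CDS:

```python
import re

# The codon pattern used above: three non-gap reference bases plus any gaps the
# alignment placed between them. Both aligned sequences below are toy examples.
codon_regex = re.compile("[^-]-*[^-]-*[^-]-*")

aligned_target = "ATG--AAATAG"   # reference CDS with a two-base insertion gap
aligned_query  = "ATGGGAACTAG"   # query carrying the insertion and one SNP

for match in codon_regex.finditer(aligned_target):
    ref_codon = match.group().replace("-", "")
    qry_codon = aligned_query[match.start():match.end()].replace("-", "")
    print(ref_codon, qry_codon)
# ATG ATGGG
# AAA AAC
# TAG TAG
```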
- - Notes - ----- - If necessary, the given nucleotide sequence is shortened that its - length is a multiple of 3. - - Example - ------- - - >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) - >>> algn.translate("ATGTGAAA") - 'M*' - - Parameters - ---------- - seq : str - define the nucleotide sequence to translate - translation_table : int - define the genetic code table used for in silico translation (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] - - Returns - ------- - str - translated amino acid sequence - """ - l = len(seq) - if l%3 == 1: - l = -1 - elif l%3 == 2: - l = -2 - return str(Seq.translate(seq[:l], table=translation_table)) - -class sonarDBManager(): - """ - This object provides a sonarDB SQLite manager handler managing connections and - providing context-safe transaction control. - - Notes - ----- - This object should be called using a context manager to ensure rollbacks - on abnormal termination. - - Example - ------- - - In this example the DOCTESTDB variable store the path to a database file - - >>> with sonarDBManager(DOCTESTDB) as dbm: - ... pass - - Parameters - ---------- - - dbfile : str - define a path to a non-existent or valid SONAR database file. If the - file does not exist, a SONAR database is created. - timeout : int [ -1 ] - define busy timeout. Use -1 for no timeout. - readonly : bool [ False ] - define if the connection should be read-only - debug : bool [ False ] - debug mode (print selected sql queries) - - Attributes - ---------- - dbfile : str - stores the path to the used SONAR database file. - connection : object - stores the SQLite3 connection - cursor : method - stores the SQLite3 cursor - - Dev Note - -------- - A database row is returned as dictionary with column name as keys. Multiple - rows are returned as list of those dictionaries. 
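translate(), shown above, silently trims the input so its length is a multiple of three before translating. A stand-alone sketch of that behaviour, assuming Biopython is installed (the module already depends on it):

```python
from Bio.Seq import Seq

# Stand-alone sketch of the trimming performed by sonarALIGN.translate():
# drop one or two trailing bases so the length becomes a multiple of three,
# then translate using the requested genetic code table.
def translate(seq, translation_table=1):
    trim = len(seq) % 3
    if trim:
        seq = seq[:-trim]
    return str(Seq(seq).translate(table=translation_table))

print(translate("ATGTGAAA"))   # 'M*' (the trailing 'AA' is dropped)
```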
- - """ - - def __init__(self, dbfile, timeout=-1, readonly=False, debug=False): - self.dbfile = os.path.abspath(dbfile) - self.connection = None - self.cursor = None - self.__timeout = timeout - self.__mode = "ro" if readonly else "rwc" - self.__uri = "file:" + urlquote(self.dbfile) - self.debug = debug - - def __enter__(self): - if not os.path.isfile(self.dbfile) or os.stat(self.dbfile).st_size == 0: - self.create_tables() - self.connection, self.cursor = self.connect() - self.start_transaction() - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - if [exc_type, exc_value, exc_traceback].count(None) != 3: - print("warning:", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) - if self.__mode == "rwc": - print("rollback", file=sys.stderr) - self.rollback() - elif self.__mode == "rwc": - self.commit() - self.close() - - def __del__(self): - if self.connection: - self.close() - - def connect(self): - con = sqlite3.connect(self.__uri + "?mode=" + self.__mode, self.__timeout, isolation_level = None, uri = True) - con.row_factory = self.dict_factory - cur = con.cursor() - return con, cur - - def start_transaction(self): - self.cursor.execute("BEGIN DEFERRED") - - def commit(self): - self.connection.commit() - - def rollback(self): - self.connection.rollback() - - def close(self): - self.connection.close() - - def create_tables(self): - with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "db.sqlite"), 'r') as handle: - sql = handle.read() - with sqlite3.connect(self.__uri + "?mode=rwc", uri = True) as con: - con.executescript(sql) - - def get_db_version(self): - return self.cursor.execute('pragma user_version').fetchone()['user_version'] - - def check_db_compatibility(self): - dbver = self.get_db_version() - if dbver != SUPPORTED_DB_VERSION: - sys.exit("Compatibility error: the given database is not compatible with this version of sonar (Current database version: " + str(dbver) + "; Supported database version: " + str(SUPPORTED_DB_VERSION) +") \nPlease run 'sonar.py db-upgrade' to upgrade database") - - @staticmethod - def upgrade_db(dbfile): - try: - with sqlite3.connect(dbfile) as con: - cur = con.cursor() - current_version= cur.execute('pragma user_version').fetchone()[0] - - print('Current version:', current_version, ' Upgrade to:', SUPPORTED_DB_VERSION) - uri = "file:" + urlquote(dbfile) - print('Perform the Upgrade:',uri) - while(current_version < SUPPORTED_DB_VERSION): - next_version = current_version + 1 - with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "migrate/"+str(next_version)+".sql"), 'r') as handle: - sql = handle.read() - with sqlite3.connect(uri + "?mode=rwc", uri = True) as con: - con.executescript(sql) - - current_version = next_version - - except sqlite3.Error as er: - con.executescript('ROLLBACK') - raise er - finally: - print('Database now version:', current_version) - if(current_version==SUPPORTED_DB_VERSION): - print("Success: Database upgrade was successfully completed") - else: - print("Error: Upgrade was not completed") - - - - # INSERTING DATA - - def insert_genome(self, acc, descr, seqhash): - sql = "INSERT INTO genome (accession, description, seqhash) VALUES(?, ?, ?);" - self.cursor.execute(sql, [acc, descr, seqhash]) - return acc - - def insert_sequence(self, seqhash): - sql = "INSERT OR IGNORE INTO sequence (seqhash) VALUES(?);" - self.cursor.execute(sql, [seqhash]) - return seqhash - - def insert_profile(self, seqhash, dna_profile, aa_profile, fs_profile): - dna_profile = " " + 
dna_profile.strip() + " " - aa_profile = " " + aa_profile.strip() + " " - sql = "INSERT OR IGNORE INTO profile (seqhash, dna_profile, aa_profile, fs_profile) VALUES(?, ?, ?, ?);" - self.cursor.execute(sql, [seqhash, dna_profile, aa_profile, fs_profile]) - return seqhash - - def insert_dna_var(self, seqhash, ref, alt, start, end=None): - if end is None: - end = start + 1 - sql = "INSERT OR IGNORE INTO dna (varid, start, end, ref, alt) VALUES(?, ?, ?, ?, ?);" - self.cursor.execute(sql, [None, start, end, ref, alt]) - sql = "SELECT varid FROM dna WHERE start = ? AND end = ? AND alt = ? AND ref = ?;" - varid = self.cursor.execute(sql, [start, end, alt, ref]).fetchone()['varid'] - sql = "INSERT OR IGNORE INTO sequence2dna (seqhash, varid) VALUES(?, ?);" - self.cursor.execute(sql, [seqhash, varid]) - return varid - - def insert_prot_var(self, seqhash, protein, locus, ref, alt, start, end=None): - if end is None: - end = start + 1 - sql = "INSERT OR IGNORE INTO prot (varid, protein, locus, start, end, ref, alt) VALUES(?, ?, ?, ?, ?, ?, ?);" - self.cursor.execute(sql, [None, protein, locus, start, end, ref, alt]) - sql = "SELECT varid FROM prot WHERE protein = ? AND locus = ? AND start = ? AND end = ? AND alt = ? AND ref = ?;" - varid = self.cursor.execute(sql, [protein, locus, start, end, alt, ref]).fetchone()['varid'] - sql = "INSERT OR IGNORE INTO sequence2prot (seqhash, varid) VALUES(?, ?);" - self.cursor.execute(sql, [seqhash, varid]) - return varid - - # DELETING DATA - - def delete_genome(self, acc): - """ - we currently did not add CONSTRAINT, ON DELETE CASCADE to .sql file. - so we manually delete each table. - - Attributes - ---------- - acc: String - accession id - """ - sql = "SELECT seqhash FROM genome WHERE accession = ?;" - row = self.cursor.execute(sql, [acc]).fetchone() - # If there is no matching accession, return immediately - if not row: - return - # If the accession exists, delete it - sql = "DELETE FROM genome WHERE accession = ?;" - self.cursor.execute(sql, [acc]) - # Check to see if there are any remaining sequences with the same - # seqhash as the sequence we just deleted. If not, clean up other tables - # with that seqhash's data. 
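insert_dna_var() and insert_prot_var() follow the same idiom: INSERT OR IGNORE the variant, then select its varid so the sequence can be linked to it whether or not the row already existed. A self-contained sketch of that idiom against a throw-away in-memory table (the table definition below is a stand-in, not the real schema):

```python
import sqlite3

# Throw-away in-memory table to illustrate the INSERT OR IGNORE + lookup idiom.
con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute(
    "CREATE TABLE dna (varid INTEGER PRIMARY KEY, start INTEGER, end INTEGER,"
    " ref TEXT, alt TEXT, UNIQUE(start, end, ref, alt));"
)

def insert_dna_var(ref, alt, start, end=None):
    if end is None:
        end = start + 1
    # first call creates the row, identical later calls are ignored
    cur.execute(
        "INSERT OR IGNORE INTO dna (start, end, ref, alt) VALUES (?, ?, ?, ?);",
        [start, end, ref, alt],
    )
    # in both cases the variant id can be looked up and linked to a sequence
    cur.execute(
        "SELECT varid FROM dna WHERE start = ? AND end = ? AND ref = ? AND alt = ?;",
        [start, end, ref, alt],
    )
    return cur.fetchone()[0]

print(insert_dna_var("C", "T", 3266))   # 1
print(insert_dna_var("C", "T", 3266))   # 1 again, no duplicate row
```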
- sql = "SELECT COUNT(*) FROM genome WHERE seqhash = ?;" - selected_seqhash = row["seqhash"] - row = self.cursor.execute(sql, [selected_seqhash]).fetchone() - if row['COUNT(*)'] == 0: - # delete profile - sql = "DELETE FROM profile WHERE seqhash = ?;" - self.cursor.execute(sql, [selected_seqhash]) - # delete seq seq2dna 2prpt - sql = "DELETE FROM sequence WHERE seqhash = ?;" - self.cursor.execute(sql, [selected_seqhash]) - sql = "DELETE FROM sequence2dna WHERE seqhash = ?;" - self.cursor.execute(sql, [selected_seqhash]) - sql = "DELETE FROM sequence2prot WHERE seqhash = ?;" - self.cursor.execute(sql, [selected_seqhash]) - - - # SELECTING DATA - - def genome_exists(self, acc, descr=None, seqhash=None): - sql = ["SELECT COUNT(*) FROM genome WHERE accession = ?"] - vals = [acc] - if descr: - sql.append("AND descr = ?") - vals.append(descr) - if seqhash: - sql.append(" AND seqhash = ?") - vals.append(seqhash) - return self.cursor.execute(" ".join(sql) + ";", vals).fetchone()['COUNT(*)'] > 0 - - def seq_exists(self, seqhash): - sql = "SELECT COUNT(*) FROM sequence WHERE seqhash = ?;" - return self.cursor.execute(sql, [seqhash]).fetchone()['COUNT(*)'] > 0 - - def get_genomes(self, acc): - sql = "SELECT * FROM genome WHERE accession = ?;" - return self.cursor.execute(sql, [acc]).fetchone() - - def get_dna_varid(self, ref, alt, pos): - sql = "SELECT varid FROM dna WHERE pos = ? AND alt = ? AND ref = ?;" - row = self.cursor.execute(sql, [pos, alt, ref]).fetchone() - if row: - return row['varid'] - return None - - def get_prot_varid(self, protein, locus, ref, alt, pos): - sql = "SELECT varid FROM prot WHERE protein = ? AND locus = ? AND pos = ? AND alt = ? AND ref = ?;" - row = self.cursor.execute(sql, [protein, locus, ref, alt, pos]).fetchone() - if row: - return row['varid'] - return None - - def get_dna_vars(self, acc): - sql = "SELECT description, start, end, alt, ref FROM dna_view WHERE accession = ?;" - return self.cursor.execute(sql, [acc]).fetchall() - - def get_dna_profile(self, acc): - sql = "SELECT dna_profile FROM essence WHERE accession = ?;" - row = self.cursor.execute(sql, [acc]).fetchone() - if not row: - return None - return row['dna_profile'] - - def count_genomes(self): - sql = "SELECT COUNT(accession) FROM genome;" - row = self.cursor.execute(sql).fetchone() - return int(row['COUNT(accession)']) - - def count_sequences(self): - sql = "SELECT COUNT(seqhash) FROM sequence;" - row = self.cursor.execute(sql).fetchone() - return int(row['COUNT(seqhash)']) - - def count_labs(self): - sql = "SELECT COUNT(DISTINCT lab) as count FROM genome WHERE lab != '';" - row = self.cursor.execute(sql).fetchone() - return int(row['count']) - - def info_data_types(self): - sql = "SELECT source, collection, COUNT(accession) as genome_count FROM genome GROUP BY source, collection ORDER BY source, collection;" - return self.cursor.execute(sql).fetchall() - - def get_earliest_import(self): - sql = "SELECT MIN(imported) as import FROM genome;" - return self.cursor.execute(sql).fetchone()['import'] - - def get_latest_import(self): - sql = "SELECT MAX(imported) as import FROM genome;" - return self.cursor.execute(sql).fetchone()['import'] - - def get_earliest_date(self): - sql = "SELECT MIN(date) as date FROM genome WHERE date IS NOT NULL;" - return self.cursor.execute(sql).fetchone()['date'] - - def get_latest_date(self): - sql = "SELECT MAX(date) as date FROM genome WHERE date IS NOT NULL;" - return self.cursor.execute(sql).fetchone()['date'] - - def count_metadata(self, field): - sql = "SELECT 
COUNT(accession) as counts FROM genome WHERE " + field + " IS NOT NULL AND " + field + " != '';" - return self.cursor.execute(sql).fetchone()['counts'] - - def iter_table(self, table, batch_size=1000): - sql = "SELECT * FROM " + table + ";" - c = self.cursor.execute(sql) - while True: - rows = c.fetchmany(batch_size) - if not rows: - break - for row in rows: - yield row - - ## extra features - def get_list_of_lineages(self, lineage): - - sql = "SELECT DISTINCT lineage FROM genome WHERE lineage LIKE '"+lineage+"';" - rows = self.cursor.execute(sql).fetchall() - result = [i['lineage'] for i in rows] - return result - - def get_dna_vars_for_vcf(self, ): - return None - - # MATCHING PROFILES - - def get_profile_condition(self, field, *profiles, negate=False): - op = " NOT " if negate else " " - search_all = False - if(field == 'dna_profile'): - for x in profiles: - if "N" == x[-1]: - search_all = True - elif(field == 'aa_profile'): - for x in profiles: - if "X" == x[-1]: - search_all = True - - clause = [field + op + "LIKE '% " + x + " %'" for x in profiles] - if(search_all): - return " OR ".join(clause) - - return " AND ".join(clause) - - def get_metadata_in_condition(self, field, *vals, negate=False): - op = " NOT " if negate else " " - # clause = [field + op + "LIKE '" + x + "'" for x in vals] - #return " AND ".join(clause) - return field + op + "IN (" + ", ".join(['?'] * len(vals)) + ")" - - def get_metadata_equal_condition(self, field, val, negate=False): - op = " != " if negate else " = " - return field + op + "?" - - def get_metadata_numeric_condition(self, field, min=False, max=True): - condition = [] - if min: - condition.append(field + " >= ?") - if max: - condition.append(field + " <= ?") - return " AND ".join(condition) - - def get_metadata_leading_string_condition(self, field, *vals, negate=False): - op = " NOT " if negate else " " - logic = " AND " if negate else " OR " - clause = [field + op + "LIKE '" + x + "%'" for x in vals] - if not negate and len(clause) > 1: - return "(" + logic.join(clause) + ")" - return logic.join(clause) - - def get_metadata_date_condition(self, field, *dates, negate=False): - op = " NOT " if negate else " " - op2 = " != " if negate else " = " - logic = " AND " if negate else " OR " - clause = [] - for date in dates: - if ":" in date: - x, y = date.split(":") - clause.append("(" + field + op + "BETWEEN '" + x + "' AND '" + y + "')") - else: - clause.append(field + op2 + "'" + date+ "'") - if not negate and len(clause) > 1: - return "(" + logic.join(clause) + ")" - return logic.join(clause) - - def match(self, - include_profiles=[], - exclude_profiles=[], - include_acc=[], - exclude_acc=[], - include_lin=[], - exclude_lin=[], - include_zip=[], - exclude_zip=[], - include_dates=[], - exclude_dates=[], - include_submission_dates=[], - exclude_submission_dates=[], - include_lab=[], - exclude_lab=[], - include_source=[], - exclude_source=[], - include_collection=[], - exclude_collection=[], - include_technology=[], - exclude_technology=[], - include_platform=[], - exclude_platform=[], - include_chemistry=[], - exclude_chemistry=[], - include_material=[], - exclude_material=[], - include_software=None, - exclude_software=None, - include_software_version=None, - exclude_software_version=None, - min_ct=None, - max_ct=None, - include_seqhash=[], - exclude_seqhash=[], - count = False, - frameshifts = 0): - - # creating where condition - where_clause = [] - where_vals = [] - - ## accessions - if include_acc: - 
where_clause.append(self.get_metadata_in_condition("accession", *include_acc)) - where_vals.extend(include_acc) - if exclude_acc: - where_clause.append(self.get_metadata_in_condition("accession", *exclude_acc, negate=True)) - where_vals.extend(exclude_acc) - - ## lineage - if include_lin: - where_clause.append(self.get_metadata_in_condition("lineage", *include_lin)) - where_vals.extend(include_lin) - if exclude_lin: - where_clause.append(self.get_metadata_in_condition("lineage", *exclude_lin, negate=True)) - where_vals.extend(exclude_lin) - - ## lab - if include_lab: - where_clause.append(self.get_metadata_in_condition("lab", *include_lab)) - where_vals.extend(include_lab) - if exclude_lab: - where_clause.append(self.get_metadata_in_condition("lab", *exclude_lab, negate=True)) - where_vals.extend(exclude_lab) - - ## source - if include_source: - where_clause.append(self.get_metadata_in_condition("source", *include_source)) - where_vals.extend(include_source) - if exclude_source: - where_clause.append(self.get_metadata_in_condition("source", *exclude_source, negate=True)) - where_vals.extend(exclude_source) - - ## collection - if include_collection: - where_clause.append(self.get_metadata_in_condition("collection", *include_collection)) - where_vals.extend(include_collection) - if exclude_collection: - where_clause.append(self.get_metadata_in_condition("collection", *exclude_collection, negate=True)) - where_vals.extend(exclude_collection) - - ## technology - if include_technology: - where_clause.append(self.get_metadata_in_condition("technology", *include_technology)) - where_vals.extend(include_technology) - if exclude_technology: - where_clause.append(self.get_metadata_in_condition("technology", *exclude_technology, negate=True)) - where_vals.extend(exclude_technology) - - ## platform - if include_platform: - where_clause.append(self.get_metadata_in_condition("platform", *include_platform)) - where_vals.extend(include_platform) - if exclude_platform: - where_clause.append(self.get_metadata_in_condition("platform", *exclude_platform, negate=True)) - where_vals.extend(exclude_platform) - - ## chemistry - if include_chemistry: - where_clause.append(self.get_metadata_in_condition("chemistry", *include_chemistry)) - where_vals.extend(include_chemistry) - if exclude_chemistry: - where_clause.append(self.get_metadata_in_condition("chemistry", *exclude_chemistry, negate=True)) - where_vals.extend(exclude_chemistry) - - ## software - if include_software: - where_clause.append(self.get_metadata_equal_condition("software", include_software)) - where_vals.append(include_software) - if exclude_software: - where_clause.append(self.get_metadata_equal_condition("software", exclude_software, negate=True)) - where_vals.append(exclude_software) - - ## software version - if include_software_version: - where_clause.append(self.get_metadata_equal_condition("software_version", include_software_version)) - where_vals.append(include_software_version) - if exclude_software_version: - where_clause.append(self.get_metadata_equal_condition("software_version", exclude_software_version, negate=True)) - where_vals.append(exclude_software_version) - - ## material - if include_material: - where_clause.append(self.get_metadata_in_condition("material", *include_material)) - where_vals.extend(include_material) - if exclude_material: - where_clause.append(self.get_metadata_in_condition("material", *exclude_material, negate=True)) - where_vals.extend(exclude_material) - - ## ct - if min_ct or max_ct: - 
where_clause.append(self.get_metadata_numeric_condition("ct", min_ct, max_ct)) - if min_ct: - where_vals.append(min_ct) - if max_ct: - where_vals.append(max_ct) - - ## zip - if include_zip: - where_clause.append(self.get_metadata_leading_string_condition("zip", *include_zip)) - if exclude_zip: - where_clause.append(self.get_metadata_leading_string_condition("zip", *exclude_zip, negate=True)) - - ## date - if include_dates: - where_clause.append(self.get_metadata_date_condition("date", *include_dates)) - if exclude_dates: - where_clause.append(self.get_metadata_date_condition("date", *exclude_dates, negate=True)) - ## submission_date - if include_submission_dates: - where_clause.append(self.get_metadata_date_condition("submission_date", *include_submission_dates)) - if exclude_submission_dates: - where_clause.append(self.get_metadata_date_condition("submission_date", *exclude_submission_dates, negate=True)) - - ## seqhash - if include_seqhash: - where_clause.append(self.get_metadata_in_condition("seqhash", *include_seqhash)) - where_vals.extend(include_seqhash) - if exclude_seqhash: - where_clause.append(self.get_metadata_in_condition("seqhash", *exclude_seqhash, negate=True)) - where_vals.extend(exclude_seqhash) - - ## profiles - if include_profiles: - profile_clause = [] - for profile in include_profiles: - if not profile['dna'] and not profile['aa']: - continue - profile_clause.append([]) - - if len(profile['dna']) > 0: - profile_clause[-1].append(self.get_profile_condition('dna_profile', *profile['dna'])) - if len(profile['aa']) > 0: - profile_clause[-1].append(self.get_profile_condition('aa_profile', *profile['aa'])) - if len(profile_clause[-1]) > 1: - profile_clause[-1] = "(" + " AND ".join(profile_clause[-1]) + ")" - else: - profile_clause[-1] = profile_clause[-1][0] - if len(profile_clause) > 1: - where_clause.append("(" + " OR ".join(profile_clause) + ")") - else: - where_clause.append(profile_clause[0]) - - if exclude_profiles: - profile_clause = [] - for profile in exclude_profiles: - if not profile['dna'] and not profile['aa']: - continue - profile_clause.append([]) - if profile['dna']: - profile_clause[-1].append(self.get_profile_condition('dna_profile', *profile['dna'], negate=True)) - if profile['aa']: - profile_clause[-1].append(self.get_profile_condition('aa_profile', *profile['aa'], negate=True)) - if len(profile_clause[-1]) > 1: - profile_clause[-1] = "(" + " AND ".join(profile_clause) + ")" - else: - profile_clause[-1] = profile_clause[-1][0] - if len(profile_clause) > 1: - where_clause.append("(" + " OR ".join(profile_clause) + ")") - else: - where_clause.append(profile_clause[0]) - - ## frameshifts - if frameshifts == -1: - where_clause.append("fs_profile = ''") - elif frameshifts == 1: - where_clause.append("fs_profile != ''") - # count or not - fields = "*" if not count else "COUNT(*) as count" - - # create sql query - if where_clause: - sql = "SELECT " + fields + " FROM essence WHERE " + " AND ".join(where_clause) + ";" - else: - sql = "SELECT " + fields + " FROM essence;" - - if self.debug: - print("query: " + sql) - print("vals: ", where_vals) - - return self.cursor.execute(sql, where_vals).fetchall() - - # UPDATE DATA - - def update_genome(self, acc, description = None, lineage = None, zip = None, date = None, submission_date= None, gisaid = None, ena = None, collection = None, source = None, lab = None, technology = None, platform = None, chemistry = None, software = None, version = None, material = None, ct = None): - expr = [] - vals = [] - if description is 
not None: - expr.append("description") - vals.append(description) - if lineage is not None: - expr.append("lineage") - vals.append(lineage) - if zip is not None: - expr.append("zip") - vals.append(zip) - if gisaid is not None: - expr.append("gisaid") - vals.append(gisaid) - if date is not None: - expr.append("date") - vals.append(date) - if submission_date is not None: - expr.append("submission_date") - vals.append(submission_date) - if ena is not None: - expr.append("ena") - vals.append(ena) - if collection is not None: - expr.append("collection") - vals.append(collection) - if source is not None: - expr.append("source") - vals.append(source) - if lab is not None: - expr.append("lab") - vals.append(lab) - if technology is not None: - expr.append("technology") - vals.append(technology) - if platform is not None: - expr.append("platform") - vals.append(platform) - if chemistry is not None: - expr.append("chemistry") - vals.append(chemistry) - if software is not None: - expr.append("software") - vals.append(software) - if version is not None: - expr.append("software_version") - vals.append(version) - if material is not None: - expr.append("material") - vals.append(material) - if ct is not None: - expr.append("ct") - vals.append(ct) - vals.append(acc) - setexpr = ", ".join([x + " = ?" for x in expr]) - sql = "UPDATE genome SET "+ setexpr + " WHERE accession = ?;" - self.cursor.execute(sql, vals) - - # MISC - @staticmethod - def optimize(dbfile): - with sqlite3.connect(dbfile) as con: - con.executescript("VACUUM") - - @staticmethod - def dict_factory(cursor, row): - d = OrderedDict() - for idx, col in enumerate(cursor.description): - d[col[0]] = row[idx] - return d + """ + this object performs a pairwise sequence alignment and provides/stores selected + alignment functionalities/statistics. + + Notes + ----- + Please note, that genomic coordinates are processed and returned 0-based + by this object. While start or single coordinates are inclusive, + end coordinates are exclusive, expressed as mathematical notation: + [start, end) + + Please note, alignment is based on EMBOSS Stretcher. + + Example + ------- + + In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store + the path of FASTA files containing the query and reference genome sequences, + respectively. + + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) + + Parameters + ---------- + query_file : str + define a path to a valid FASTA file storing the query genome sequence + target_file : str + define a path to a valid FASTA file storing the target genome sequence + (= reference) + out_file : str [ None ] + define a path to an output file that will store the FASTA formatted + alignment. Please consider, that an existing file will be overwritten! + If None, a temporary file is used and deleted after processing. + sonarGFFObj : object [ None ] + define a sonarGFF object based on the reference genome annotation + + Attributes + ---------- + aligned_query : str + stores the aligned upper-case query sequence (U replaced by T) + aligned_target : str + stores the aligned upper-case target or reference sequence (U replaced by T) + gff : object + stores the sonarGFF object if provided, otherwise None + dnadiff : list + stores a list of tuples for each genomic variation (based on the alignment). 
+ Each tuple consists of: + - reference base (or bases in case of deletions) + - query base (or bases in case of insertions) + - genomic coordinate (0-based, inclusive) + - genomic end coordinate (in case of InDels 0-based and exlusive otherwise None) + - None + - None + Accordingly to the VCF format, insertions are expressed considering the upstream + base as anchor. As a special case, an insertion at the start of the sequence + has no anchor and a genomic coordinate of -1. Deletions are expressed for each + position they occur and not fused. The last two tuple elements + are always None to keep the length according to tuples stored in aadiff. + aadiff : list + stores a list of tuples for each amino acid variation in an annotated protein. + Each tuple consists of: + - reference amino acid (or amino acids in case of deletions) + - query amino acid (or amino acids in case of insertions) + - protein position (0-based, inclusive) + - protein end position (in case of InDels 0-based and exlusive otherwise None) + - protein symbol + - gene locus + Accordingly to the VCF format, insertions are expressed considering the upstream + base as anchor. As a special case, an insertion at the start of the sequence + has no anchor and a genomic coordinate of -1. Deletions are expressed for each + position they occur and not fused. The last two tuple elements + are always None to keep the length according to tuples stored in aadiff. + """ + + def __init__(self, query_file, target_file, out_file=None, sonarGFFObj=None): + self.aligned_query, self.aligned_target = self.align_dna( + query_file, target_file, out_file + ) + self.gff = sonarGFFObj if sonarGFFObj else None + self._insert_regex = re.compile("[^-]-+") + self._del_regex = re.compile("-+") + self._codon_regex = re.compile("[^-]-*[^-]-*[^-]-*") + self._leading_gap_regex = re.compile("^-+") + self._tailing_gap_regex = re.compile("-+$") + self._dnadiff = None + self._aadiff = None + self.__target_coords_matrix = None + + @property + def dnadiff(self): + if self._dnadiff is None: + self._dnadiff = [x for x in self.iter_dna_vars()] + return self._dnadiff + + @property + def aadiff(self): + if self._aadiff is None: + self._aadiff = [x for x in self.iter_aa_vars()] + return self._aadiff + + @property + def _target_coords_matrix(self): + if self.__target_coords_matrix is None: + self.__target_coords_matrix = [ + len(x.group()) for x in re.finditer(".-*", self.aligned_target) + ] + return self.__target_coords_matrix + + def use_stretcher( + self, + query_file, + target_file, + out_file, + gapopen=16, + gapextend=4, + right_align=True, + ): + """ + function to perform a pairwise aligment using EMBOSS Stretcher + + Parameters + ---------- + query_file : str + define a path to a valid FASTA file storing the query sequence + target_file : str + define a path to a valid FASTA file storing the target sequence + (= reference) + out_file : str + define a path to a file that will store the alignment. Please consider, + that an existing file will be overwritten. 
+ gapopen : int [ 16 ] + define penalty for gap opening + gapextend : int [ 4 ] + define penalty for gap extension + + Returns + ------- + list + list of aligned query and target sequence, in that order + """ + temp = True if not out_file else False + if temp: + handle, out_file = mkstemp() + cline = StretcherCommandline( + asequence=query_file, + bsequence=target_file, + gapopen=gapopen, + gapextend=gapextend, + outfile=out_file, + aformat="fasta", + ) + stdout, stderr = cline() + alignment = [str(x.seq) for x in SeqIO.parse(out_file, "fasta")] + if temp: + os.remove(out_file) + if right_align: + alignment = self.left_align_gaps(*alignment) + return alignment + + def left_align_gaps(self, query, target): + """ + function to align gaps to the left in two aligned sequences + + Parameters + ---------- + query : str + define the query sequence in aligned form + target : str + define the target sequence (reference) in aligned form + + Returns + ------- + list + aligned query and target sequence strings with left-aligned gaps, + in that order. + """ + l = len(query) - 1 + for match in re.finditer("-+", query): + s = match.start() - 1 + e = match.end() - 1 + g = "-" * (e - s) + while s >= 0 and e < l and query[s] == target[e]: + query = query[:s] + g + query[s] + query[e + 1 :] + s -= 1 + e -= 1 + for match in re.finditer("-+", target): + s = match.start() - 1 + e = match.end() - 1 + g = "-" * (e - s) + while s >= 0 and e < l and target[s] == query[e]: + target = target[:s] + g + target[s] + target[e + 1 :] + s -= 1 + e -= 1 + return query, target + + def align_dna( + self, + query_file, + target_file, + out_file=None, + gapopen=16, + gapextend=4, + right_align=True, + ): + """ + function to perform the default pairwise nucleotide aligment + + Parameters + ---------- + query_file : str + define a path to a valid FASTA file storing the query sequence + target_file : str + define a path to a valid FASTA file storing the target sequence + (= reference) + out_file : str + define a path to a file that will store the alignment. Please consider, + that an existing file will be overwritten. + gapopen : int [ 16 ] + define penalty for gap opening + gapextend : int [ 4 ] + define penalty for gap extension + + Returns + ------- + list + list of aligned query and target sequence + """ + return self.use_stretcher( + query_file, target_file, out_file, gapopen, gapextend, right_align + ) + + def real_pos(self, x): + """ + function to convert an alignment position to the position in the + unaligned target sequence (= reference). + + Example + ------- + In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store + the path of FASTA files containing the query and reference genome sequences, + respectively. + + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) + >>> algn.real_pos(29282) + 29282 + + Parameters + ---------- + x : int + define a position within the alignment (0-based) + + Returns + ------- + int + corresponding position (0-based) in the unaligned target/reference + sequence + """ + return x - self.aligned_target[: x + 1].count("-") + + def align_pos(self, x): + """ + function to convert an target/reference position to the corresponding + position in the alignment. 
+ + Example + ------- + + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) + >>> algn.align_pos(29282) + 29282 + + Parameters + ---------- + x : int + define a reference position (0-based) + + Returns + ------- + int + corresponding position of the sequence alignment + """ + return sum(self._target_coords_matrix[:x]) + + def iter_dna_vars(self): + """ + function to iterate variations on nucleotide level. + + Example + ------- + + In this example the QRY_FASTA_FILE and REF_FASTA_FILE variables store + the path of FASTA files containing the query and reference genome sequences, + respectively. The reference is NC_045512.2 while the query is a B.1.1.7 + prototype sequence. + + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) + >>> for x in algn.iter_dna_vars(): + ... print(x) + ('C', 'T', 3266, None, None, None) + ('C', 'A', 5387, None, None, None) + ('T', 'C', 6953, None, None, None) + ('T', '', 11287, None, None, None) + ('C', '', 11288, None, None, None) + ('T', '', 11289, None, None, None) + ('G', '', 11290, None, None, None) + ('G', '', 11291, None, None, None) + ('T', '', 11292, None, None, None) + ('T', '', 11293, None, None, None) + ('T', '', 11294, None, None, None) + ('T', '', 11295, None, None, None) + ('T', '', 21764, None, None, None) + ('A', '', 21765, None, None, None) + ('C', '', 21766, None, None, None) + ('A', '', 21767, None, None, None) + ('T', '', 21768, None, None, None) + ('G', '', 21769, None, None, None) + ('T', '', 21990, None, None, None) + ('T', '', 21991, None, None, None) + ('A', '', 21992, None, None, None) + ('A', 'T', 23062, None, None, None) + ('C', 'A', 23270, None, None, None) + ('C', 'A', 23603, None, None, None) + ('C', 'T', 23708, None, None, None) + ('T', 'G', 24505, None, None, None) + ('G', 'C', 24913, None, None, None) + ('C', 'T', 27971, None, None, None) + ('G', 'T', 28047, None, None, None) + ('A', 'G', 28110, None, None, None) + ('G', 'C', 28279, None, None, None) + ('A', 'T', 28280, None, None, None) + ('T', 'A', 28281, None, None, None) + ('C', 'T', 28976, None, None, None) + + Returns + ------- + iterator of tuples + each tuple represents a nucleotide level variation and consists of: + - target nucleotide + - query nucleotide(s) + - target or reference start position (0-based + - target or reference end position (0-based) + - None + - None + Accordingly to the VCF format, insertions are expressed considering the upstream + base as anchor. As a special case, an insertion at the start of the sequence + has no anchor and a genomic coordinate of -1. Deletions are are expressed for + each position they occur and not fused. The last two tuple elements + are always None to keep the length according to tuples stored in aadiff. + """ + target = self.aligned_target + query = self.aligned_query + + # query overhead in front + match = self._leading_gap_regex.match(target) + if match: + yield "", query[: match.end()], -1, None, None, None + + # insertions + isites = set() + for match in self._insert_regex.finditer(target): + isites.add(match.start()) + s = self.real_pos(match.start()) + yield match.group()[0], query[ + match.start() : match.end() + ], s, None, None, None + + # deletions and snps + for i, pair in enumerate(zip(target, query)): + if pair[0] != "-" and pair[0] != pair[1] and i not in isites: + s = self.real_pos(i) + l = len(pair[1]) + e = None if l == 1 else s + l + yield pair[0], pair[1].replace("-", ""), s, e, None, None + + def iter_aa_vars(self): + """ + function to iterate variations on amino acid level. 
+ + Example + ------- + + In this example the QRY_FASTA_FILE, REF_FASTA_FILE, and REF_GFF_FILE + variables store the path of FASTA files containing the query and + reference genome sequences as well as the reference genome annotation, + in that order. The reference is NC_045512.2 while the query is a B.1.1.7 + prototype sequence. + + Please consider, that a sonarGFF is needed to consider annotation and + deduce amino acid level profiles. + + >>> gff = sonarGFF(REF_GFF_FILE, REF_FASTA_FILE) + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE, sonarGFFObj=gff) + >>> for x in algn.iter_aa_vars(): + ... print(x) + ('T', 'I', 1000, None, 'ORF1b', 'GU280_gp01') + ('A', 'D', 1707, None, 'ORF1b', 'GU280_gp01') + ('I', 'T', 2229, None, 'ORF1b', 'GU280_gp01') + ('S', '', 3674, 3675, 'ORF1b', 'GU280_gp01') + ('G', '', 3675, 3676, 'ORF1b', 'GU280_gp01') + ('F', '', 3676, 3677, 'ORF1b', 'GU280_gp01') + ('T', 'I', 1000, None, 'ORF1a', 'GU280_gp01') + ('A', 'D', 1707, None, 'ORF1a', 'GU280_gp01') + ('I', 'T', 2229, None, 'ORF1a', 'GU280_gp01') + ('S', '', 3674, 3675, 'ORF1a', 'GU280_gp01') + ('G', '', 3675, 3676, 'ORF1a', 'GU280_gp01') + ('F', '', 3676, 3677, 'ORF1a', 'GU280_gp01') + ('I', '', 67, 68, 'S', 'GU280_gp02') + ('H', '', 68, 69, 'S', 'GU280_gp02') + ('V', '', 69, 70, 'S', 'GU280_gp02') + ('V', '', 142, 143, 'S', 'GU280_gp02') + ('Y', '', 143, 144, 'S', 'GU280_gp02') + ('N', 'Y', 500, None, 'S', 'GU280_gp02') + ('A', 'D', 569, None, 'S', 'GU280_gp02') + ('P', 'H', 680, None, 'S', 'GU280_gp02') + ('T', 'I', 715, None, 'S', 'GU280_gp02') + ('S', 'A', 981, None, 'S', 'GU280_gp02') + ('D', 'H', 1117, None, 'S', 'GU280_gp02') + ('Q', '*', 26, None, 'ORF8', 'GU280_gp09') + ('R', 'I', 51, None, 'ORF8', 'GU280_gp09') + ('Y', 'C', 72, None, 'ORF8', 'GU280_gp09') + ('D', 'L', 2, None, 'N', 'GU280_gp10') + ('S', 'F', 234, None, 'N', 'GU280_gp10') + + Returns + ------- + iterator of tuples + each tuple represents a amino acid level variation and consists of: + - target nucleotide + - query nucleotide(s) + - target or reference start position (0-based + - target or reference end position (0-based) + - protein symbol + - gene locus + Accordingly to the VCF format, insertions are expressed considering the upstream + base as anchor. As a special case, an insertion at the start of the sequence + has no anchor and a genomic coordinate of -1. Deletions are are expressed for + each position they occur and not fused. + """ + if self.gff: + for cds in self.gff.cds: + query = [] + target = [] + for s, e in cds.coordlist: + s = self.align_pos(s) + e = self.align_pos(e) + query.append(self.aligned_query[s:e]) + target.append(self.aligned_target[s:e]) + query = "".join(query) + target = "".join(target) + + if cds.strand == "-": + query.append(str(Seq.reverse_complement(query))) + target.append(str(Seq.reverse_complement(target))) + + for match in self._codon_regex.finditer(target): + s = match.start() + e = match.end() + start = int((s - target[: match.start()].count("-")) / 3) + tcodon = match.group().replace("-", "") + qcodon = query[s:e].replace("-", "") + taa = self.translate(tcodon, cds.translation_table) + qaa = self.translate(qcodon, cds.translation_table) + if qaa == "": + yield taa, "", start, start + 1, cds.symbol, cds.locus + else: + if qaa != taa: + e = None if len(qaa) == 1 else start + len(qaa) + yield (taa, qaa, start, e, cds.symbol, cds.locus) + + @staticmethod + def translate(seq, translation_table=1): + """ + function to translate a nucleotide sequence. 
+ + Notes + ----- + If necessary, the given nucleotide sequence is shortened that its + length is a multiple of 3. + + Example + ------- + + >>> algn = sonarALIGN(QRY_FASTA_FILE, REF_FASTA_FILE) + >>> algn.translate("ATGTGAAA") + 'M*' + + Parameters + ---------- + seq : str + define the nucleotide sequence to translate + translation_table : int + define the genetic code table used for in silico translation (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] + + Returns + ------- + str + translated amino acid sequence + """ + l = len(seq) + if l % 3 == 1: + l = -1 + elif l % 3 == 2: + l = -2 + return str(Seq.translate(seq[:l], table=translation_table)) + + +class sonarDBManager: + """ + This object provides a sonarDB SQLite manager handler managing connections and + providing context-safe transaction control. + + Notes + ----- + This object should be called using a context manager to ensure rollbacks + on abnormal termination. + + Example + ------- + + In this example the DOCTESTDB variable store the path to a database file + + >>> with sonarDBManager(DOCTESTDB) as dbm: + ... pass + + Parameters + ---------- + + dbfile : str + define a path to a non-existent or valid SONAR database file. If the + file does not exist, a SONAR database is created. + timeout : int [ -1 ] + define busy timeout. Use -1 for no timeout. + readonly : bool [ False ] + define if the connection should be read-only + debug : bool [ False ] + debug mode (print selected sql queries) + + Attributes + ---------- + dbfile : str + stores the path to the used SONAR database file. + connection : object + stores the SQLite3 connection + cursor : method + stores the SQLite3 cursor + + Dev Note + -------- + A database row is returned as dictionary with column name as keys. Multiple + rows are returned as list of those dictionaries. 
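sonarDBManager is intended to be used through its context-manager protocol: entering the block opens the connection and starts a deferred transaction, an uncaught exception triggers a rollback, and a clean exit commits. A usage sketch; the import path and the database filename are assumptions made for illustration:

```python
# Usage sketch only: assumes the repository root is on sys.path so the module is
# importable as lib.sonardb, and "mydb.sqlite" is a placeholder database path
# (a new database is created if the file does not exist).
from lib.sonardb import sonarDBManager

with sonarDBManager("mydb.sqlite") as dbm:
    # statements executed here run inside one deferred transaction;
    # an uncaught exception rolls the transaction back, a clean exit commits it
    print(dbm.count_genomes())
```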
+ + """ + + def __init__(self, dbfile, timeout=-1, readonly=False, debug=False): + self.dbfile = os.path.abspath(dbfile) + self.connection = None + self.cursor = None + self.__timeout = timeout + self.__mode = "ro" if readonly else "rwc" + self.__uri = "file:" + urlquote(self.dbfile) + self.debug = debug + + def __enter__(self): + if not os.path.isfile(self.dbfile) or os.stat(self.dbfile).st_size == 0: + self.create_tables() + self.connection, self.cursor = self.connect() + self.start_transaction() + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + if [exc_type, exc_value, exc_traceback].count(None) != 3: + print("warning:", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + if self.__mode == "rwc": + print("rollback", file=sys.stderr) + self.rollback() + elif self.__mode == "rwc": + self.commit() + self.close() + + def __del__(self): + if self.connection: + self.close() + + def connect(self): + con = sqlite3.connect( + self.__uri + "?mode=" + self.__mode, + self.__timeout, + isolation_level=None, + uri=True, + ) + con.row_factory = self.dict_factory + cur = con.cursor() + return con, cur + + def start_transaction(self): + self.cursor.execute("BEGIN DEFERRED") + + def commit(self): + self.connection.commit() + + def rollback(self): + self.connection.rollback() + + def close(self): + self.connection.close() + + def create_tables(self): + with open( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "db.sqlite"), "r" + ) as handle: + sql = handle.read() + with sqlite3.connect(self.__uri + "?mode=rwc", uri=True) as con: + con.executescript(sql) + + def get_db_version(self): + return self.cursor.execute("pragma user_version").fetchone()["user_version"] + + def check_db_compatibility(self): + dbver = self.get_db_version() + if dbver != SUPPORTED_DB_VERSION: + sys.exit( + "Compatibility error: the given database is not compatible with this version of sonar (Current database version: " + + str(dbver) + + "; Supported database version: " + + str(SUPPORTED_DB_VERSION) + + ") \nPlease run 'sonar.py db-upgrade' to upgrade database" + ) + + @staticmethod + def upgrade_db(dbfile): + try: + with sqlite3.connect(dbfile) as con: + cur = con.cursor() + current_version = cur.execute("pragma user_version").fetchone()[0] + + print( + "Current version:", + current_version, + " Upgrade to:", + SUPPORTED_DB_VERSION, + ) + uri = "file:" + urlquote(dbfile) + print("Perform the Upgrade:", uri) + while current_version < SUPPORTED_DB_VERSION: + next_version = current_version + 1 + with open( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "migrate/" + str(next_version) + ".sql", + ), + "r", + ) as handle: + sql = handle.read() + with sqlite3.connect(uri + "?mode=rwc", uri=True) as con: + con.executescript(sql) + + current_version = next_version + + except sqlite3.Error as er: + con.executescript("ROLLBACK") + raise er + finally: + print("Database now version:", current_version) + if current_version == SUPPORTED_DB_VERSION: + print("Success: Database upgrade was successfully completed") + else: + print("Error: Upgrade was not completed") + + # INSERTING DATA + + def insert_genome(self, acc, descr, seqhash): + sql = "INSERT INTO genome (accession, description, seqhash) VALUES(?, ?, ?);" + self.cursor.execute(sql, [acc, descr, seqhash]) + return acc + + def insert_sequence(self, seqhash): + sql = "INSERT OR IGNORE INTO sequence (seqhash) VALUES(?);" + self.cursor.execute(sql, [seqhash]) + return seqhash + + def insert_profile(self, seqhash, dna_profile, 
aa_profile, fs_profile):
+        dna_profile = " " + dna_profile.strip() + " "
+        aa_profile = " " + aa_profile.strip() + " "
+        sql = "INSERT OR IGNORE INTO profile (seqhash, dna_profile, aa_profile, fs_profile) VALUES(?, ?, ?, ?);"
+        self.cursor.execute(sql, [seqhash, dna_profile, aa_profile, fs_profile])
+        return seqhash
+
+    def insert_dna_var(self, seqhash, ref, alt, start, end=None):
+        if end is None:
+            end = start + 1
+        sql = "INSERT OR IGNORE INTO dna (varid, start, end, ref, alt) VALUES(?, ?, ?, ?, ?);"
+        self.cursor.execute(sql, [None, start, end, ref, alt])
+        sql = (
+            "SELECT varid FROM dna WHERE start = ? AND end = ? AND alt = ? AND ref = ?;"
+        )
+        varid = self.cursor.execute(sql, [start, end, alt, ref]).fetchone()["varid"]
+        sql = "INSERT OR IGNORE INTO sequence2dna (seqhash, varid) VALUES(?, ?);"
+        self.cursor.execute(sql, [seqhash, varid])
+        return varid
+
+    def insert_prot_var(self, seqhash, protein, locus, ref, alt, start, end=None):
+        if end is None:
+            end = start + 1
+        sql = "INSERT OR IGNORE INTO prot (varid, protein, locus, start, end, ref, alt) VALUES(?, ?, ?, ?, ?, ?, ?);"
+        self.cursor.execute(sql, [None, protein, locus, start, end, ref, alt])
+        sql = "SELECT varid FROM prot WHERE protein = ? AND locus = ? AND start = ? AND end = ? AND alt = ? AND ref = ?;"
+        varid = self.cursor.execute(
+            sql, [protein, locus, start, end, alt, ref]
+        ).fetchone()["varid"]
+        sql = "INSERT OR IGNORE INTO sequence2prot (seqhash, varid) VALUES(?, ?);"
+        self.cursor.execute(sql, [seqhash, varid])
+        return varid
+
+    # DELETING DATA
+
+    def delete_genome(self, acc):
+        """
+        Deletes a genome entry and, if no other genome references the same
+        sequence afterwards, all data associated with that sequence.
+
+        The schema currently defines no ON DELETE CASCADE constraints, so the
+        related tables are cleaned up manually.
+
+        Parameters
+        ----------
+        acc : str
+            accession of the genome to delete
+        """
+        sql = "SELECT seqhash FROM genome WHERE accession = ?;"
+        row = self.cursor.execute(sql, [acc]).fetchone()
+        # If there is no matching accession, return immediately
+        if not row:
+            return
+        # If the accession exists, delete it
+        sql = "DELETE FROM genome WHERE accession = ?;"
+        self.cursor.execute(sql, [acc])
+        # Check to see if there are any remaining genome entries with the same
+        # seqhash as the sequence we just deleted. If not, clean up other tables
+        # with that seqhash's data.
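Editor's note (not part of the patch): the docstring above points out that the shipped schema has no ON DELETE CASCADE constraints, which is why orphaned rows are removed by hand here. For context only, a minimal sketch of the schema-level alternative in SQLite; the two tables are made-up stand-ins and do not reflect the real db.sqlite layout:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("PRAGMA foreign_keys = ON")  # SQLite enforces FKs only when enabled per connection
con.executescript(
    """
    CREATE TABLE sequence (seqhash TEXT PRIMARY KEY);
    CREATE TABLE profile (
        seqhash TEXT,
        dna_profile TEXT,
        FOREIGN KEY (seqhash) REFERENCES sequence (seqhash) ON DELETE CASCADE
    );
    """
)
con.execute("INSERT INTO sequence VALUES ('h1')")
con.execute("INSERT INTO profile VALUES ('h1', ' A3451T ')")
con.execute("DELETE FROM sequence WHERE seqhash = 'h1'")
# the dependent profile row is gone without any manual cleanup
print(con.execute("SELECT COUNT(*) FROM profile").fetchone())  # (0,)
```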
+ sql = "SELECT COUNT(*) FROM genome WHERE seqhash = ?;" + selected_seqhash = row["seqhash"] + row = self.cursor.execute(sql, [selected_seqhash]).fetchone() + if row["COUNT(*)"] == 0: + # delete profile + sql = "DELETE FROM profile WHERE seqhash = ?;" + self.cursor.execute(sql, [selected_seqhash]) + # delete seq seq2dna 2prpt + sql = "DELETE FROM sequence WHERE seqhash = ?;" + self.cursor.execute(sql, [selected_seqhash]) + sql = "DELETE FROM sequence2dna WHERE seqhash = ?;" + self.cursor.execute(sql, [selected_seqhash]) + sql = "DELETE FROM sequence2prot WHERE seqhash = ?;" + self.cursor.execute(sql, [selected_seqhash]) + + # SELECTING DATA + + def genome_exists(self, acc, descr=None, seqhash=None): + sql = ["SELECT COUNT(*) FROM genome WHERE accession = ?"] + vals = [acc] + if descr: + sql.append("AND descr = ?") + vals.append(descr) + if seqhash: + sql.append(" AND seqhash = ?") + vals.append(seqhash) + return self.cursor.execute(" ".join(sql) + ";", vals).fetchone()["COUNT(*)"] > 0 + + def seq_exists(self, seqhash): + sql = "SELECT COUNT(*) FROM sequence WHERE seqhash = ?;" + return self.cursor.execute(sql, [seqhash]).fetchone()["COUNT(*)"] > 0 + + def get_genomes(self, acc): + sql = "SELECT * FROM genome WHERE accession = ?;" + return self.cursor.execute(sql, [acc]).fetchone() + + def get_dna_varid(self, ref, alt, pos): + sql = "SELECT varid FROM dna WHERE pos = ? AND alt = ? AND ref = ?;" + row = self.cursor.execute(sql, [pos, alt, ref]).fetchone() + if row: + return row["varid"] + return None + + def get_prot_varid(self, protein, locus, ref, alt, pos): + sql = "SELECT varid FROM prot WHERE protein = ? AND locus = ? AND pos = ? AND alt = ? AND ref = ?;" + row = self.cursor.execute(sql, [protein, locus, ref, alt, pos]).fetchone() + if row: + return row["varid"] + return None + + def get_dna_vars(self, acc): + sql = "SELECT description, start, end, alt, ref FROM dna_view WHERE accession = ?;" + return self.cursor.execute(sql, [acc]).fetchall() + + def get_dna_profile(self, acc): + sql = "SELECT dna_profile FROM essence WHERE accession = ?;" + row = self.cursor.execute(sql, [acc]).fetchone() + if not row: + return None + return row["dna_profile"] + + def count_genomes(self): + sql = "SELECT COUNT(accession) FROM genome;" + row = self.cursor.execute(sql).fetchone() + return int(row["COUNT(accession)"]) + + def count_sequences(self): + sql = "SELECT COUNT(seqhash) FROM sequence;" + row = self.cursor.execute(sql).fetchone() + return int(row["COUNT(seqhash)"]) + + def count_labs(self): + sql = "SELECT COUNT(DISTINCT lab) as count FROM genome WHERE lab != '';" + row = self.cursor.execute(sql).fetchone() + return int(row["count"]) + + def info_data_types(self): + sql = "SELECT source, collection, COUNT(accession) as genome_count FROM genome GROUP BY source, collection ORDER BY source, collection;" + return self.cursor.execute(sql).fetchall() + + def get_earliest_import(self): + sql = "SELECT MIN(imported) as import FROM genome;" + return self.cursor.execute(sql).fetchone()["import"] + + def get_latest_import(self): + sql = "SELECT MAX(imported) as import FROM genome;" + return self.cursor.execute(sql).fetchone()["import"] + + def get_earliest_date(self): + sql = "SELECT MIN(date) as date FROM genome WHERE date IS NOT NULL;" + return self.cursor.execute(sql).fetchone()["date"] + + def get_latest_date(self): + sql = "SELECT MAX(date) as date FROM genome WHERE date IS NOT NULL;" + return self.cursor.execute(sql).fetchone()["date"] + + def count_metadata(self, field): + sql = ( + "SELECT 
COUNT(accession) as counts FROM genome WHERE " + + field + + " IS NOT NULL AND " + + field + + " != '';" + ) + return self.cursor.execute(sql).fetchone()["counts"] + + def iter_table(self, table, batch_size=1000): + sql = "SELECT * FROM " + table + ";" + c = self.cursor.execute(sql) + while True: + rows = c.fetchmany(batch_size) + if not rows: + break + for row in rows: + yield row + + ## extra features + def get_list_of_lineages(self, lineage): + + sql = ( + "SELECT DISTINCT lineage FROM genome WHERE lineage LIKE '" + lineage + "';" + ) + rows = self.cursor.execute(sql).fetchall() + result = [i["lineage"] for i in rows] + return result + + def get_dna_vars_for_vcf( + self, + ): + return None + + # MATCHING PROFILES + + def get_profile_condition(self, field, *profiles, negate=False): + op = " NOT " if negate else " " + search_all = False + if field == "dna_profile": + for x in profiles: + if "N" == x[-1]: + search_all = True + elif field == "aa_profile": + for x in profiles: + if "X" == x[-1]: + search_all = True + + clause = [field + op + "LIKE '% " + x + " %'" for x in profiles] + if search_all: + return " OR ".join(clause) + + return " AND ".join(clause) + + def get_metadata_in_condition(self, field, *vals, negate=False): + op = " NOT " if negate else " " + # clause = [field + op + "LIKE '" + x + "'" for x in vals] + # return " AND ".join(clause) + return field + op + "IN (" + ", ".join(["?"] * len(vals)) + ")" + + def get_metadata_equal_condition(self, field, val, negate=False): + op = " != " if negate else " = " + return field + op + "?" + + def get_metadata_numeric_condition(self, field, min=False, max=True): + condition = [] + if min: + condition.append(field + " >= ?") + if max: + condition.append(field + " <= ?") + return " AND ".join(condition) + + def get_metadata_leading_string_condition(self, field, *vals, negate=False): + op = " NOT " if negate else " " + logic = " AND " if negate else " OR " + clause = [field + op + "LIKE '" + x + "%'" for x in vals] + if not negate and len(clause) > 1: + return "(" + logic.join(clause) + ")" + return logic.join(clause) + + def get_metadata_date_condition(self, field, *dates, negate=False): + op = " NOT " if negate else " " + op2 = " != " if negate else " = " + logic = " AND " if negate else " OR " + clause = [] + for date in dates: + if ":" in date: + x, y = date.split(":") + clause.append("(" + field + op + "BETWEEN '" + x + "' AND '" + y + "')") + else: + clause.append(field + op2 + "'" + date + "'") + if not negate and len(clause) > 1: + return "(" + logic.join(clause) + ")" + return logic.join(clause) + + def match( + self, + include_profiles=[], + exclude_profiles=[], + include_acc=[], + exclude_acc=[], + include_lin=[], + exclude_lin=[], + include_zip=[], + exclude_zip=[], + include_dates=[], + exclude_dates=[], + include_submission_dates=[], + exclude_submission_dates=[], + include_lab=[], + exclude_lab=[], + include_source=[], + exclude_source=[], + include_collection=[], + exclude_collection=[], + include_technology=[], + exclude_technology=[], + include_platform=[], + exclude_platform=[], + include_chemistry=[], + exclude_chemistry=[], + include_material=[], + exclude_material=[], + include_software=None, + exclude_software=None, + include_software_version=None, + exclude_software_version=None, + min_ct=None, + max_ct=None, + include_seqhash=[], + exclude_seqhash=[], + count=False, + frameshifts=0, + ): + + # creating where condition + where_clause = [] + where_vals = [] + + ## accessions + if include_acc: + where_clause.append( + 
self.get_metadata_in_condition("accession", *include_acc) + ) + where_vals.extend(include_acc) + if exclude_acc: + where_clause.append( + self.get_metadata_in_condition("accession", *exclude_acc, negate=True) + ) + where_vals.extend(exclude_acc) + + ## lineage + if include_lin: + where_clause.append(self.get_metadata_in_condition("lineage", *include_lin)) + where_vals.extend(include_lin) + if exclude_lin: + where_clause.append( + self.get_metadata_in_condition("lineage", *exclude_lin, negate=True) + ) + where_vals.extend(exclude_lin) + + ## lab + if include_lab: + where_clause.append(self.get_metadata_in_condition("lab", *include_lab)) + where_vals.extend(include_lab) + if exclude_lab: + where_clause.append( + self.get_metadata_in_condition("lab", *exclude_lab, negate=True) + ) + where_vals.extend(exclude_lab) + + ## source + if include_source: + where_clause.append( + self.get_metadata_in_condition("source", *include_source) + ) + where_vals.extend(include_source) + if exclude_source: + where_clause.append( + self.get_metadata_in_condition("source", *exclude_source, negate=True) + ) + where_vals.extend(exclude_source) + + ## collection + if include_collection: + where_clause.append( + self.get_metadata_in_condition("collection", *include_collection) + ) + where_vals.extend(include_collection) + if exclude_collection: + where_clause.append( + self.get_metadata_in_condition( + "collection", *exclude_collection, negate=True + ) + ) + where_vals.extend(exclude_collection) + + ## technology + if include_technology: + where_clause.append( + self.get_metadata_in_condition("technology", *include_technology) + ) + where_vals.extend(include_technology) + if exclude_technology: + where_clause.append( + self.get_metadata_in_condition( + "technology", *exclude_technology, negate=True + ) + ) + where_vals.extend(exclude_technology) + + ## platform + if include_platform: + where_clause.append( + self.get_metadata_in_condition("platform", *include_platform) + ) + where_vals.extend(include_platform) + if exclude_platform: + where_clause.append( + self.get_metadata_in_condition( + "platform", *exclude_platform, negate=True + ) + ) + where_vals.extend(exclude_platform) + + ## chemistry + if include_chemistry: + where_clause.append( + self.get_metadata_in_condition("chemistry", *include_chemistry) + ) + where_vals.extend(include_chemistry) + if exclude_chemistry: + where_clause.append( + self.get_metadata_in_condition( + "chemistry", *exclude_chemistry, negate=True + ) + ) + where_vals.extend(exclude_chemistry) + + ## software + if include_software: + where_clause.append( + self.get_metadata_equal_condition("software", include_software) + ) + where_vals.append(include_software) + if exclude_software: + where_clause.append( + self.get_metadata_equal_condition( + "software", exclude_software, negate=True + ) + ) + where_vals.append(exclude_software) + + ## software version + if include_software_version: + where_clause.append( + self.get_metadata_equal_condition( + "software_version", include_software_version + ) + ) + where_vals.append(include_software_version) + if exclude_software_version: + where_clause.append( + self.get_metadata_equal_condition( + "software_version", exclude_software_version, negate=True + ) + ) + where_vals.append(exclude_software_version) + + ## material + if include_material: + where_clause.append( + self.get_metadata_in_condition("material", *include_material) + ) + where_vals.extend(include_material) + if exclude_material: + where_clause.append( + self.get_metadata_in_condition( + 
"material", *exclude_material, negate=True + ) + ) + where_vals.extend(exclude_material) + + ## ct + if min_ct or max_ct: + where_clause.append( + self.get_metadata_numeric_condition("ct", min_ct, max_ct) + ) + if min_ct: + where_vals.append(min_ct) + if max_ct: + where_vals.append(max_ct) + + ## zip + if include_zip: + where_clause.append( + self.get_metadata_leading_string_condition("zip", *include_zip) + ) + if exclude_zip: + where_clause.append( + self.get_metadata_leading_string_condition( + "zip", *exclude_zip, negate=True + ) + ) + + ## date + if include_dates: + where_clause.append( + self.get_metadata_date_condition("date", *include_dates) + ) + if exclude_dates: + where_clause.append( + self.get_metadata_date_condition("date", *exclude_dates, negate=True) + ) + ## submission_date + if include_submission_dates: + where_clause.append( + self.get_metadata_date_condition( + "submission_date", *include_submission_dates + ) + ) + if exclude_submission_dates: + where_clause.append( + self.get_metadata_date_condition( + "submission_date", *exclude_submission_dates, negate=True + ) + ) + + ## seqhash + if include_seqhash: + where_clause.append( + self.get_metadata_in_condition("seqhash", *include_seqhash) + ) + where_vals.extend(include_seqhash) + if exclude_seqhash: + where_clause.append( + self.get_metadata_in_condition("seqhash", *exclude_seqhash, negate=True) + ) + where_vals.extend(exclude_seqhash) + + ## profiles + if include_profiles: + profile_clause = [] + for profile in include_profiles: + if not profile["dna"] and not profile["aa"]: + continue + profile_clause.append([]) + + if len(profile["dna"]) > 0: + profile_clause[-1].append( + self.get_profile_condition("dna_profile", *profile["dna"]) + ) + if len(profile["aa"]) > 0: + profile_clause[-1].append( + self.get_profile_condition("aa_profile", *profile["aa"]) + ) + if len(profile_clause[-1]) > 1: + profile_clause[-1] = "(" + " AND ".join(profile_clause[-1]) + ")" + else: + profile_clause[-1] = profile_clause[-1][0] + if len(profile_clause) > 1: + where_clause.append("(" + " OR ".join(profile_clause) + ")") + else: + where_clause.append(profile_clause[0]) + + if exclude_profiles: + profile_clause = [] + for profile in exclude_profiles: + if not profile["dna"] and not profile["aa"]: + continue + profile_clause.append([]) + if profile["dna"]: + profile_clause[-1].append( + self.get_profile_condition( + "dna_profile", *profile["dna"], negate=True + ) + ) + if profile["aa"]: + profile_clause[-1].append( + self.get_profile_condition( + "aa_profile", *profile["aa"], negate=True + ) + ) + if len(profile_clause[-1]) > 1: + profile_clause[-1] = "(" + " AND ".join(profile_clause) + ")" + else: + profile_clause[-1] = profile_clause[-1][0] + if len(profile_clause) > 1: + where_clause.append("(" + " OR ".join(profile_clause) + ")") + else: + where_clause.append(profile_clause[0]) + + ## frameshifts + if frameshifts == -1: + where_clause.append("fs_profile = ''") + elif frameshifts == 1: + where_clause.append("fs_profile != ''") + # count or not + fields = "*" if not count else "COUNT(*) as count" + + # create sql query + if where_clause: + sql = ( + "SELECT " + + fields + + " FROM essence WHERE " + + " AND ".join(where_clause) + + ";" + ) + else: + sql = "SELECT " + fields + " FROM essence;" + + if self.debug: + print("query: " + sql) + print("vals: ", where_vals) + + return self.cursor.execute(sql, where_vals).fetchall() + + # UPDATE DATA + + def update_genome( + self, + acc, + description=None, + lineage=None, + zip=None, + date=None, + 
submission_date=None, + gisaid=None, + ena=None, + collection=None, + source=None, + lab=None, + technology=None, + platform=None, + chemistry=None, + software=None, + version=None, + material=None, + ct=None, + ): + expr = [] + vals = [] + if description is not None: + expr.append("description") + vals.append(description) + if lineage is not None: + expr.append("lineage") + vals.append(lineage) + if zip is not None: + expr.append("zip") + vals.append(zip) + if gisaid is not None: + expr.append("gisaid") + vals.append(gisaid) + if date is not None: + expr.append("date") + vals.append(date) + if submission_date is not None: + expr.append("submission_date") + vals.append(submission_date) + if ena is not None: + expr.append("ena") + vals.append(ena) + if collection is not None: + expr.append("collection") + vals.append(collection) + if source is not None: + expr.append("source") + vals.append(source) + if lab is not None: + expr.append("lab") + vals.append(lab) + if technology is not None: + expr.append("technology") + vals.append(technology) + if platform is not None: + expr.append("platform") + vals.append(platform) + if chemistry is not None: + expr.append("chemistry") + vals.append(chemistry) + if software is not None: + expr.append("software") + vals.append(software) + if version is not None: + expr.append("software_version") + vals.append(version) + if material is not None: + expr.append("material") + vals.append(material) + if ct is not None: + expr.append("ct") + vals.append(ct) + vals.append(acc) + setexpr = ", ".join([x + " = ?" for x in expr]) + sql = "UPDATE genome SET " + setexpr + " WHERE accession = ?;" + self.cursor.execute(sql, vals) + + # MISC + @staticmethod + def optimize(dbfile): + with sqlite3.connect(dbfile) as con: + con.executescript("VACUUM") + + @staticmethod + def dict_factory(cursor, row): + d = OrderedDict() + for idx, col in enumerate(cursor.description): + d[col[0]] = row[idx] + return d class sonarDB(object): - """ - this object provides sonarDB functionalities and intelligence - - Notes - ----- - Please note, that genomic and protein coordinates are expected to be and - returned 0-based by this object, except for formatted profiles. - While start or single coordinates are inclusive, end coordinates of - ranges are exclusive, expressed in a mathematical notation: [start, end). - Only in formatted profiles start and end coordinates are 1-based and both - inclusive. - - Examples - -------- - - In this example the path to the database is stored in DOCTESTDB. - - >>> db = sonarDB(DOCTESTDB) - - Parameters - ---------- - dbfile : str - define a path to a non-existent or valid SONAR database file. If the - file does not exist, a SONAR database is created. 
- translation_table : int - define the genetic code table used for in silico translation (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] - - Attributes - ---------- - db : str - stores the absolute path to the used SONAR database file - reffna : str - stores the absolute path to the built-in FASTA file containing the reference - genome sequence - refgff : str - stores the absolute path to the built-in GFF3 file containing the reference - genome annotation - translation_table : int - stores the genetic code table used for in silico translation (see - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] - refseq : str - stores the upper-case sequence of the built-in reference genome - refdescr : str - stores the FASTA header of the built-in reference genome - refgffObj : object - stores the sonarGFF object based on the built-in reference genome - annotation - iupac_nt_code : dict - stores a dict with IUPAC one-letter nucleotide codes as keys and the - respective set of matching explicit IUPAC one-letter nucleotide codes - as values (e.g {"W": set('A', 'T')}) - iupac_explicit_nt_code : dict - stores a set containing all non-ambiguous IUPAC one-letter nucleotide codes - iupac_ambig_nt_code : set - stores a set containing all ambiguous IUPAC one-letter nucleotide codes - iupac_aa_code : dict - stores a dict with IUPAC one-letter amino acid codes as keys and - the respective set of matching IUPAC one-letter amino acids codes as values - iupac_explicit_aa_code : dict - stores a set containing all non-ambiguous IUPAC one-letter amino acid codes - iupac_ambig_aa_code : dict - stores a set containing all ambiguous IUPAC one-letter amino acid codes - dna_var_regex : compiled re expression - stores a compiled re expression that matches to nucleotide profiles but - not to amino acid profiles - aa_var_regex : compiled re expression - stores a compiled re expression that matches to amino acid profiles but - not to nucleotide profiles - del_regex : compiled re expression - stores a compiled re expression that matches to deletion profiles on - nucleotide as well as on amino acid level. - dnavar_grep_regex : compiled re expression - stores a compiled re expression that matches to snp or dna insertion - profiles with eference allele, genomic position and variant allele - as groups. 
- codedict : dict - stores a dictionary with "dna" and "aa" containing the field name in the - database that stores the profile data, the one letter code with and - without ambiguities - """ - def __init__(self, dbfile, translation_table = 1): - self.db = os.path.abspath(dbfile) - self.__moduledir = os.path.dirname(os.path.realpath(__file__)) - self.reffna = os.path.join(self.__moduledir, "ref.fna") - self.refgff = os.path.join(self.__moduledir, "ref.gff3") - self.lineagewithsublineages = os.path.join(self.__moduledir, "lineage.all.tsv") - - self.translation_table = translation_table - self.__lineage_sublineage_dict = None - self.__refseq = None - self.__refdescr = None - self.__refgffObj = None - self.__iupac_nt_code = None - self.__iupac_aa_code = None - self.__iupac_explicit_nt_code = None - self.__iupac_explicit_aa_code = None - self.__iupac_ambig_nt_code = None - self.__iupac_ambig_aa_code = None - self.__terminal_letters_regex = re.compile("[A-Z]$") - self.__dna_var_regex = None - self.__aa_var_regex = None - self.__del_regex = None - self.__dnavar_grep_regex = None - self.__codedict = None - - # PROPERTIES ON DEMAND - - @property - def lineage_sublineage_dict(self): - if not self.__lineage_sublineage_dict: - df = pd.read_csv(self.lineagewithsublineages, sep='\t') - self.__lineage_sublineage_dict = dict(zip(df.lineage, df.sublineage)) - return self.__lineage_sublineage_dict - - @property - def refseq(self): - if not self.__refseq: - record = SeqIO.read(self.reffna, "fasta") - self.__refseq = self.harmonize(record.seq) - return self.__refseq - - @property - def refdescr(self): - if not self.__refdescr: - with open(self.reffna, "r") as handle: - self.__refdescr = handle.readline().strip()[1:] - return self.__refdescr - - @property - def refgffObj(self): - if not self.__refgffObj: - self.__refgffObj = sonarGFF(self.refgff, self.reffna, self.translation_table) - return self.__refgffObj - - @property - def dna_var_regex(self): - if self.__dna_var_regex is None: - allowed_letters = "[" + "".join(self.iupac_nt_code.keys()) + "]" - self.__dna_var_regex = re.compile("^(?:(?:del:[0-9]+:[0-9]+)|(?:" + allowed_letters + "[0-9]+" + allowed_letters + "+))$") - return self.__dna_var_regex - - @property - def dnavar_grep_regex(self): - if self.__dnavar_grep_regex is None: - self.__dnavar_grep_regex = re.compile("^([^0-9:]*)([0-9]+)([^0-9]*)$") - return self.__dnavar_grep_regex - - @property - def aa_var_regex(self): - if self.__aa_var_regex is None: - allowed_symbols = "(?:(?:" + ")|(?:".join(self.refgffObj.symbols) + "))" - allowed_letters = "[" + "".join(self.iupac_aa_code.keys()).replace("-", "") + "*~-" + "]" - self.__aa_var_regex = re.compile("^" + allowed_symbols + ":(?:(?:del:[0-9]+:[0-9]+)|(?:" + allowed_letters + "[0-9]+" + allowed_letters + "+))$") - return self.__aa_var_regex - - @property - def del_regex(self): - if self.__del_regex is None: - allowed_symbols = "(?:(?:" + ")|(?:".join(self.refgffObj.symbols) + "))" - self.__del_regex = re.compile("^(?:" + allowed_symbols + ":)?del:[0-9]+:[0-9]+$") - return self.__del_regex - - @property - def iupac_nt_code(self): - if self.__iupac_nt_code is None: - self.__iupac_nt_code = { "A": set("A"), "C": set("C"), "G": set("G"), "T": set("T"), "R": set("AGR"), "Y": set("CTY"), "S": set("GCS"), "W": set("ATW"), "K": set("GTK"), "M": set("ACM"), "B": set("CGTB"), "D": set("AGTD"), "H": set("ACTH"), "V": set("ACGV") } - self.__iupac_nt_code['N'] = set(self.__iupac_nt_code.keys()) | set("N") - return self.__iupac_nt_code - - @property - def 
iupac_explicit_nt_code(self): - if self.__iupac_explicit_nt_code is None: - self.__iupac_explicit_nt_code = set([ x for x in self.iupac_nt_code if len(self.iupac_nt_code[x]) == 1 ]) - return self.__iupac_explicit_nt_code - - @property - def iupac_ambig_nt_code(self): - if self.__iupac_ambig_nt_code is None: - self.__iupac_ambig_nt_code = set([ x for x in self.iupac_nt_code if len(self.iupac_nt_code[x]) > 1 ]) - return self.__iupac_ambig_nt_code - - @property - def iupac_aa_code(self): - if self.__iupac_aa_code is None: - self.__iupac_aa_code = { "A": set("A"), "R": set("R"), "N": set("N"), "D": set("D"), "C": set("C"), "Q": set("Q"), "E": set("E"), "G": set("G"), "H": set("H"), "I": set("I"), "L": set("L"), "K": set("K"), "M": set("M"), "F": set("F"), "P": set("P"), "S": set("S"), "T": set("T"), "W": set("W"), "Y": set("Y"), "V": set("V"), "U": set("U"), "O": set("O") } - self.__iupac_aa_code.update({"B": set("DNB"), "Z": set("EQZ"), "J": set("ILJ"), "Φ": set("VILFWYMΦ"), "Ω": set("FWYHΩ"), "Ψ": set("VILMΨ"), "π": set("PGASπ"), "ζ": set("STHNQEDKRζ"), "+": set("KRH+"), "-": set("DE-") }) - self.__iupac_aa_code['X'] = set(self.__iupac_aa_code.keys()) | set("X") - return self.__iupac_aa_code - - @property - def iupac_explicit_aa_code(self): - if self.__iupac_explicit_aa_code is None: - self.__iupac_explicit_aa_code = set([ x for x in self.iupac_aa_code if len(self.iupac_aa_code[x]) == 1 ]) - return self.__iupac_explicit_aa_code - - @property - def iupac_ambig_aa_code(self): - if self.__iupac_ambig_aa_code is None: - self.__iupac_ambig_aa_code = set([ x for x in self.iupac_aa_code if len(self.iupac_aa_code[x]) > 1 ]) - return self.__iupac_ambig_aa_code - - @property - def codedict(self): - if self.__codedict is None: - self.__codedict = { - "dna": { - "field": "dna_profile", - "code": self.iupac_nt_code, - "explicit_code": self.iupac_explicit_nt_code - }, - "aa": { - "field": "aa_profile", - "code": self.iupac_aa_code, - "explicit_code": self.iupac_explicit_aa_code - } - } - - return self.__codedict - - # DATA IMPORT - - @staticmethod - def hash(seq): - """ - static function to hash any sequence using SEGUID (SHA-1 hash of the - upper-case sequence) - - Parameters - ---------- - seq : str - define a sequence to hash - - Returns - ------- - str - seguid - - """ - return seguid(seq) - - @staticmethod - def harmonize(seq): - """ - static function to return a sequence in upper case format and with T instead of U - - Parameters - ---------- - seq : str - define a sequence to harmonize - - Returns - ------- - str - sequence - - """ - return str(seq).strip().upper().replace("U", "T") - - def check_iupac_nt_code(self, seq): - """ - returns set of non-IUPAC characters present in a given sequence - - Parameters - ---------- - seq : str - define a sequence to check - - Returns - ------- - str - sequence - - """ - return set(seq).difference(self.iupac_nt_code.keys()) - - def multi_process_fasta_wrapper(self, args): - """ - wrapper function for sonarDB.process_fasta that accepts the needed - parameters as list (which allows to be called by multiprocessing for - parallelization) to add a genome sequences from a FASTA file. The FASTA - file has to contain exactly one record. 
- - Parameters - ---------- - args: list - ordered list of the following arguments - args[0] : str - corresponds to fname in sonarDB.process_fasta - define a valid FASTA file containing exactly one genome record to be - added to the SONAR database - args[1] : str - corresponds to algnfile in sonarDB.process_fasta - define a filename to permanently store the sequence alignment. Please - consider, that an existing file will be overwritten. If None, a - temporary file will be created and deleted after processing. - args[2] : str - corresponds to cache in sonarDB.process_fasta - define a cache file (pickle format) that is used to permanently store - processed data. Please consider, that an existing file will be - overwritten. IfNone, a temporary file will be created and deleted after - processing. - args[3] : int - timeout in seconds - define a timeout in seconds for processing genomes - integers below 1 deactivate the timeout. - - Returns - ------- - tuple - returns a tuple consisting of status and the hash of the processed - genome sequence. Status False means TimeoutError (genome was not added - to the database) while True means genome was successfully added. - - """ - fname, algnfile, picklefile, seqhash, timeout = args - try: - with sonarTimeout(seconds=timeout): - self.process_fasta(fname, algnfile, picklefile) - except TimeoutError: - return False, seqhash - else: - return True, seqhash - - def process_fasta(self, fname, algnfile=None, pickle_file=None): - """ - function to process a genome sequence from a single FASTA file, if - the respective sequence is not in the database. The FASTA - file has to contain exactly one record. - - Example - ------- - - In this example the path to the database is stored in DOCTESTDB. - QRY_FASTA_FILE stores the path of a FASTA file conatining a - B.1.1.7 prototype genome sequence. - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> data = db.process_fasta(QRY_FASTA_FILE) - >>> data['acc'] - 'b117' - >>> data['descr'] - 'b117 Ideal severe acute respiratory syndrome coronavirus 2 lineage B.1.1.7, complete genome' - >>> data['dna_profile'] - 'C3267T C5388A T6954C del:11288:9 del:21765:6 del:21991:3 A23063T C23271A C23604A C23709T T24506G G24914C C27972T G28048T A28111G G28280C A28281T T28282A C28977T' - >>> data['prot_profile'] - 'ORF1a:T1001I ORF1a:A1708D ORF1a:I2230T ORF1a:del:3675:3 ORF1b:T1001I ORF1b:A1708D ORF1b:I2230T ORF1b:del:3675:3 S:del:68:3 S:del:143:2 S:N501Y S:A570D S:P681H S:T716I S:S982A S:D1118H ORF8:Q27* ORF8:R52I ORF8:Y73C N:D3L N:S235F' - - Parameters - ---------- - fname : str - define a valid FASTA file containing exactly one genome record to be - added to the SONAR database - algnfile : str [ None ] - define a filename to permanently store the sequence alignment. Please - consider, that an existing file will be overwritten. If None, a - temporary file will be created and deleted after processing. - pickle_file : str [ None ] - define a filname to store the dictionary in pickle format instead of - returning it. Please consider, that an existing file will be - overwritten. If None, a temporary file will be created and deleted - after processing. - - Returns - ------- - dict - if pickle_file is None a dictionary is returned, else there is no return - value. 
The dictionary has following keys and values and can be directly - used as input for the import_genome function of this class (**kwargs): - - acc: accession of processed genome - - descr: FASTA header of processed genome - - dnadiff: a list of nucleotide level variations (see sonarALIGN.dnadiff) - - aadiff: a list of amino acid level variations (see sonarALIGN.aadiff) - - dna_profile: the formatted nucleotide level profile (see sonarDB.build_profile) - - prot_profile: the formatted amino acid level profile (see sonarDB.build_profile) - - fs_profile: the dna_profile with frameshift mutations only - - seq: genome sequence - """ - record = SeqIO.read(fname, "fasta") - seq = self.harmonize(record.seq) - seqhash = self.hash(seq) - data = { - 'acc': record.id, - 'descr': record.description, - 'seqhash': seqhash - } - - alignment = sonarALIGN(fname, self.reffna, algnfile, self.refgffObj) - data['dnadiff'] = alignment.dnadiff - data['aadiff'] = alignment.aadiff - data['dna_profile'] = self.build_profile(*data['dnadiff']) - data['prot_profile'] = self.build_profile(*data['aadiff']) - data['fs_profile'] = self.filter_frameshifts(data['dna_profile']) - - if pickle_file: - with open(pickle_file, "wb") as handle: - pickle.dump(data, handle) - else: - data['seq'] = seq - return data - - - def import_genome_from_fasta_files(self, *fnames, dbm=None, msg=None, disable_progressbar=False): - """ - function to import genome sequence(s) from given FASTA file(s) to the - SONAR database. Each FASTA file has to contain exactly one record. - - Example - ------- - - In this example the path to the database is stored in DOCTESTDB. - QRY_FASTA_FILE stores the path of a FASTA file conatining a - B.1.1.7 protoype genome sequence. - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> db.import_genome_from_fasta_files(QRY_FASTA_FILE, disable_progressbar=True) - - Parameters - ---------- - *fnames : str - define one or more valid FASTA files. Each file must contain - exactly one genome record - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - msg : str - define a message used for the progress bar. If None, no progress - bar is shown. [ None ] - disable_progressbar : bool [ False ] - define if the progress bar is shown (False) or not (True) - """ - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db)) - for i in tqdm(range(len(fnames)), desc = msg, disable=disable_progressbar): - self.import_genome(**self.process_fasta(fnames[i]), dbm=dbm) - - - def import_genome_from_cache(self, cachedir, acc_dict, dbm=None, msg=None, disable_progressbar=False): - """ - function to import data from a sonarCACHE directory to the SONAR database. - - Parameters - ---------- - cachedir : str - define a valid sonarCACHE directory - acc_dict : dict - define a dictionary (key: sequence hash, value: set of assigned accessions) - to import to the database - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - msg : str [ None ] - define a message used for the progress bar. 
If None, no progress - bar is shown - disable_progressbar : bool [ False ] - define if the progress bar is shown (False) or not (True) - """ - seqhashes = list(acc_dict.keys()) - with ExitStack() as stack, sonarCache(cachedir) as cache: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db)) - for i in tqdm(range(len(seqhashes)), desc = msg, disable = disable_progressbar): - seqhash = seqhashes[i] - seq = cache.get_cached_seq(seqhash) - preprocessed_data = cache.load_info(seqhash) - for entry in acc_dict[seqhash]: - preprocessed_data['acc'] = entry[0] - preprocessed_data['descr'] = entry[1] - self.import_genome(**preprocessed_data, seq=seq, dbm=dbm) - - - def import_genome(self, acc, descr, seqhash, dnadiff=None, aadiff=None, dna_profile=None, prot_profile=None, fs_profile=None, seq=None, dbm=None): - """ - function to import processed data to the SONAR database. - - Parameters - ---------- - - acc : str - define the accession of the processed genome - descr : str - define the FASTA header of the processed genome - seqhash : str - define the hash (seguid) of the processed genome - dnadiff : list - define a sub list of nucleotide level variations (see sonarALIGN.dnadiff) - aadiff : list - define a sub list of amino acid level variations (see sonarALIGN.aadiff) - dna_profile : str - define the formatted nucleotide level profile (see sonarDB.build_profile) - prot_profile : str - define the formatted amino acid level profile (see sonarDB.build_profile) - seq : str - define the sequence of the processed genome (can be None, but then no paranoid test is done) - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - """ - with ExitStack() as stack: - try: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db)) - - dbm.insert_genome(acc, descr, seqhash) - - if not dnadiff is None: - dbm.insert_sequence(seqhash) - dbm.insert_profile(seqhash, dna_profile, prot_profile, fs_profile) - for ref, alt, s, e, _, __ in dnadiff: - dbm.insert_dna_var(seqhash, ref, alt, s, e) - - for ref, alt, s, e, protein, locus in aadiff: - dbm.insert_prot_var(seqhash, protein, locus, ref, alt, s, e) - - if seq: - self.be_paranoid(acc, seq, auto_delete=True, dbm=dbm) - except sqlite3.IntegrityError as er: - print("\nError while processing ID: '{}' \n".format(acc)) - raise er - except sqlite3.Error as er: - print("\nError: occurred while trying to store ID: %s \n", acc) - raise er - - - - - # NOMENCLATURE - - def isdnavar(self, var): - """ - function to validate nucleotide level profiles - - Examples - -------- - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> db.isdnavar("S:N501Y") - False - >>> db.isdnavar("A101T") - True - - Parameters - ---------- - - var : str - define the profile to validate - - Returns - ------- - - bool - True if var is a valid nucleotide level profile otherwise False - """ - return bool(self.dna_var_regex.match(var)) - - def isaavar(self, var): - """ - function to validate amino acid level profiles - - Examples - -------- - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> db.isaavar("S:N501Y") - True - >>> db.isaavar("A101T") - False - - Parameters - ---------- - - var : str - define the profile to validate - - Returns - ------- - - bool - True if var is a valid amino acid level profile otherwise False - """ - return bool(self.aa_var_regex.match(var)) - - def isdel(self, var): - """ - function to validate 
deletion profiles on both nucleotide and amino acid level - - Examples - -------- - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> db.isdel("del:100-118") - False - >>> db.isdel("del:100:18") - True - >>> db.isdel("ORF1b:del:5:2") - True - - Parameters - ---------- - - var : str - define the profile to validate - - Returns - ------- - - bool - True if var is a deletion profile otherwise False - """ - return bool(self.del_regex.match(var)) - - # PROFILE BUILDING - - def build_profile(self, *vars): - """ - function to build a valid variant profiles based on given variations - - Parameters - ---------- - - vars : list - define for each variation to be considered by the profile a list - with the following elements: - - reference nucleotide(s) or amino acid(s) - - alternative nucleotide(s) or amino acid(s) - - start position (0-based) related to the genome (nucleotide level profile) or - protein (amino acid level profile) - - end position (0-based) related to the genome (nucleotide level profile) or - protein (amino acid level profile) or None if single nucleotide/amino acid - polymorphism - - protein symbol (None in case of nucleotide level profiles) - - gene locus (None in case of nucleotide level profiles) - - Returns - ------- - - str - valid variant profile - """ - if len(vars) == 0: - return "" - profile = [] - if len(vars) == 1: - this_ref, this_alt, this_start, this_end, this_protein, this_locus = vars[0] - if this_alt == "" and this_end is None: - this_end = this_start + len(this_ref) - else: - vars = sorted(vars, key=lambda x: (x[5], x[4], x[2])) - for l in range(len(vars)-1): - this_ref, this_alt, this_start, this_end, this_protein, this_locus = vars[l] - next_ref, next_alt, next_start, next_end, next_protein, next_locus = vars[l+1] - if this_alt != "": - var = self.format_var(this_ref, this_alt, this_start, this_end, this_protein) - profile.append(var) - elif this_alt == "" and next_alt == "" and this_start + len(this_ref) == next_start and this_protein == next_protein and this_locus == next_locus: - vars[l+1] = (this_ref + next_ref, "", this_start, next_start+1, this_protein, this_locus) - else: - if this_alt == "" and this_end is None: - this_end = this_start + len(this_ref) - var = self.format_var(this_ref, this_alt, this_start, this_end, this_protein, this_locus) - profile.append(var) - this_ref, this_alt, this_start, this_end, this_protein, this_locus = vars[l+1] - if this_alt == "" and this_end is None: - this_end = this_start + len(this_ref) - var = self.format_var(this_ref, this_alt, this_start, this_end, this_protein, this_locus) - if var not in profile: - profile.append(var) - - return " ".join(profile) - - @staticmethod - def format_var(ref, alt, start, end, protein=None, locus=None): - """ - function to build a valid variant profile based on a single variation - - Parameters - ---------- - - ref : str - define the reference nucleotide(s) or amino acid(s) - alt : str - define the alternative nucleotide(s) or amino acid(s) - start : int - define the start position (0-based) related to the genome (nucleotide - level profile) or protein (amino acid level profile) - end : int - define the end position (0-based) related to the genome (nucleotide - level profile) or protein (amino acid level profile) or None if - single nucleotide/amino acid polymorphism - protein : str - define the protein symbol (None in case of nucleotide level profiles) - [ None ] - locus : str - define the gene locus (None in case of nucleotide 
level profiles) - [ None ] - - Returns - ------- - - str - valid variant profile - """ - if alt != "": - coord = str(start+1) - else: - ref = "del:" - coord = str(start+1) + ":" + str(end-start) - protein = protein + ":" if protein else "" - return protein + ref + coord + alt - - # FRAMESHIFT DETECTION - - def is_frameshift(self, dna_var): - """ - function to check if a dna variant causes a frameshift in any annotated - CDS. - - Returns - ------- - bool - True if dna variant causes a frameshift, otherwise False. - """ - - if dna_var.startswith("del:"): - _, x, l = dna_var.split(":") - x = int(x) - 1 - y = x + int(l) - for cds in self.refgffObj.cds: - if cds.is_frameshift_del(x, y): - return True - else: - match = self.dnavar_grep_regex.search(dna_var) - x = int(match.group(2)) - 1 - l = len(match.group(3)) - 1 - if l%3 != 0: - for cds in self.refgffObj.cds: - if cds.is_frameshift_in(x, l): - return True - return False - - def filter_frameshifts(self, dna_profile): - """ - function to filter all frameshift mutations from a given dna_profile. - - Returns - ------- - str - dna_profile containing only frameshift mutations - """ - if self.refgffObj and dna_profile.strip(): - return " ".join([x for x in filter(None, dna_profile.split(" ")) if self.is_frameshift(x)]) - return "" - - # MATCHING - - def filter_ambig(self, profile, explicit_code, keep=None): - """ - function to filter variations with ambiguities in the alternative allele - from a valid nucleotide or amino acid level profile - - Parameters - ---------- - - profile : str - valid nucleotide or amino acid level profile - explicit_code : dict - explicit IUPAC code dictionary to use (as provided by - sonarDB.iupac_explicit_nt_code or sonarDB.iupac_explicit_aa_code) - keep : list [ None ] - list of single variation profiles to exclude from filtering - - Returns - ------- - - str - valid variant profile - """ - if profile is None: - return "" - out = [] - keep = set(keep) if keep else set() - for mutation in list(filter(None, profile.split(" "))): - if mutation in keep or self.del_regex.search(mutation): - out.append(mutation) - continue - match = self.__terminal_letters_regex.search(mutation) - if match and len(match.group(0)) == 1 and match.group(0) not in explicit_code: - continue - out.append(mutation) - return " ".join(out) - - - def pinpoint_mutation(self, mutation, code): - """ - function to generate a set of all profiles consisting of - non-ambiguous one-letter codes only that match to a given profile. - If the given profile does not contain any ambiguities a list only - containing the given profile is returned. 
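Editor's note (not part of the patch): the pinpoint_mutation docstring above describes expanding an ambiguous variant into all explicit variants it could stand for. A compact standalone sketch of that idea, using a trimmed-down IUPAC table and a hypothetical expand() helper rather than the class's own implementation:

```python
import itertools
import re

# deliberately incomplete IUPAC nucleotide table, for illustration only
IUPAC_NT = {"A": "A", "C": "C", "G": "G", "T": "T", "W": "ATW", "R": "AGR"}


def expand(mutation):
    # split the trailing alternate-allele letters from the rest of the profile,
    # e.g. 'A5001W' -> stem 'A5001', alt 'W'
    match = re.search("[A-Z]+$", mutation)
    stem, alt = mutation[: match.start()], match.group(0)
    options = [IUPAC_NT[letter] for letter in alt]
    return {stem + "".join(combo) for combo in itertools.product(*options)} | {mutation}


print(sorted(expand("A5001W")))  # ['A5001A', 'A5001T', 'A5001W']
```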
- - Examples - -------- - - >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None - >>> db = sonarDB(DOCTESTDB) - >>> sorted(db.pinpoint_mutation('A5001N', db.iupac_nt_code)) - ['A5001A', 'A5001B', 'A5001C', 'A5001D', 'A5001G', 'A5001H', 'A5001K', 'A5001M', 'A5001N', 'A5001R', 'A5001S', 'A5001T', 'A5001V', 'A5001W', 'A5001Y'] - >>> db.pinpoint_mutation('N501Y', db.iupac_aa_code) - {'N501Y'} - - Parameters - ---------- - - mutation : str - define a valid nucleotide or amino acid level profile that may contain - ambiguities - code : dict - define the IUPAC code dictionary to use (as provided by - sonarDB.iupac_nt_code or sonarDB.iupac_aa_code) - - Returns - ------- - - set - set of profiles without ambiguities but matching to given profile - """ - # extract ALT call from mutation profile - match = self.__terminal_letters_regex.search(mutation) - if not match: - return {mutation, } - match = match.group(0) - - # resolve ambiguities - options = [] - for m in match: - options.append(code[m]) - - # generate the set of explicit mutations - orig_stat = mutation[:-len(match)] - return set([mutation] + [ orig_stat + "".join(x) for x in itertools.product(*options) ]) - - def make_profile_explicit(self, profile): - """ - function to replace ambiguous variants from a profile by the respective - explicit variant descriptions and to sort profiles based on their level - - Parameters - ---------- - - profile : str - define a valid nucleotide, amino acid or mixed level profile that - may contain ambiguities - - Returns - ------- - - dict - dictionary with 'dna' or 'aa' as key and the respective list of - explicit dna/protein level mutations - """ - - profile = set(profile) - extended_profile = {'aa': [], 'dna': []} - for var in profile: - key = "dna" if self.isdnavar(var) else "aa" - extended_profile[key].extend([v for v in self.pinpoint_mutation(var, self.codedict[key]['code'])]) - return extended_profile - - def _fix_X_N_search(self, _profiles): - temp_include_profiles = [] - for _list_var in _profiles: - for var in _list_var: - if (var[-1].lower() == 'x') and not self.isdnavar(var): - for v in self.pinpoint_mutation(var, self.codedict["aa"]['code']): - temp_include_profiles.append([v]) - - elif(var[-1].lower() == 'n') and self.isdnavar(var): - for v in self.pinpoint_mutation(var, self.codedict["dna"]['code']): - temp_include_profiles.append([v]) - - _profiles.extend(temp_include_profiles) - _profiles = [list(x) for x in set(tuple(x) for x in _profiles)] - #print("After update") - #print(_profiles) - return _profiles - - - def match(self, - include_profiles=[], - exclude_profiles=[], - accessions=[], - lineages=[], - with_sublineage=False, - zips=[], - dates=[], - submission_dates=[], - labs=[], - sources=[], - collections=[], - technologies=[], - platforms=[], - chemistries=[], - materials=[], - software=None, - software_version=None, - min_ct=None, - max_ct=None, - seqhashes=[], - ambig=False, - count=False, - frameshifts=0, - debug=False, - dbm=None): - """ - function to search genomes in the SONAR database dependent on - defined sequence and metadata profiles - - Parameters - ---------- - - include_profiles : list [ [] ] - define a list of valid nucleotide, amino acid or mixed level profiles - that may contain ambiguities to find genomes sharing respective - profiles. Variations in each profile (sublist) are linked by AND operator - while profiles from different sublists are linked by OR. 
- exclude_profiles : list [ [] ] - define a list of valid nucleotide, amino acid or mixed level profiles - that may contain ambiguities to find genomes NOT sharing respective - profiles. Variations in each profile (sublist) are linked by AND operator - while profiles from different sublists are linked by OR. - accessions : list [ [] ] - list of accessions. Only genomes linked to accessions in this list - will be matched. Accessions are negated when starting with ^. [ None ] - lineages : list [ [] ] - list of pangolin lineages. Only genomes assigend to a - pangolin lineage in this list will be matched. Lineages are - negated when starting with ^. - with_sublineage :False, - - zips : list [ [] ] - list of zip codes. Only genomes linked to one of the given zip - codes or whose linked zip code starts like one of the given - zip codes are matched. zip codes are negated when starting with ^. - dates : list [ [] ] - define list of dates (YYYY-MM-DD) or date ranges (YYYY-MM-DD:YYYY-MM-DD). - Only genomes linked to one of the given dates or date ranges are - matched. - submission_dates : list [ [] ] - define list of submission dates (YYYY-MM-DD) or submission ranges (YYYY-MM-DD:YYYY-MM-DD). - Only genomes linked to one of the given dates or date ranges are - matched. - sources : list [ [] ] - list of data sources. Only genomes linked to a - data source in this list will be matched. Data sources are - negated when starting with ^. - collections : list [ [] ] - list of data collections. Only genomes linked to a - data collection in this list will be matched. Data collections are - negated when starting with ^. - technologies : list [ [] ] - list of sequencing technologies. Only genomes linked to a - technology in this list will be matched. Technologies are - negated when starting with ^. - platforms : list [ [] ] - list of sequencing platforms. Only genomes linked to a - platform in this list will be matched. Platforms are - negated when starting with ^. - chemistries : list [ [] ] - list of sequencing chemistries. Only genomes linked to a - chemistry in this list will be matched. Chemistries are - negated when starting with ^. - software : str [ None ] - software used for sequence reconstruction. Only genomes linked to the - given software will be matched. Software is negated when starting with ^. - software_version : str [ None ] - software version used for sequence reconstruction. Only genomes linked - to the given software version will be matched. Software version is - negated when starting with ^. Needs software defined. - materials : list [ [] ] - list of sampling materials. Only genomes linked to a - material in this list will be matched. Materials are - negated when starting with ^. - labs : list [ [] ] - list of lab identifiers. Only genomes linked to a - lab in this list will be matched. Labs are - negated when starting with ^. - min_ct : float [ None ] - minimal ct value of genomes to match. - max_ct : float [ None ] - maximal ct value of genomes to match. - ambig : bool [ False ] - define if variant alleles including ambiguities should be shown (True) - or not (False) - count : bool [ False ] - define if matched genomes should be counted (True) instead of collected - (False). - frameshifts : int [ 0 ] - define if matched genomes have to conatin frameshift mutations (1) - or have not to conatin frameshift mutations (-1) or frameshift mutations - do not matter (0). 
- debug : bool [ False ] - activate debug mode for sonarDBManager - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - - Returns - ------- - - list or int - list of rows if count is False else number of rows as int. Each row - represents a matching genome and is provided as dictionary with field - names as keys. - """ - - clause = [] - vals =[] - - #sanity check: - check = [] - if include_profiles: - check += [item for sublist in include_profiles for item in sublist] - if exclude_profiles: - check += [item for sublist in exclude_profiles for item in sublist] - nonvalid = [ x for x in check if not self.isdnavar(x) and not self.isaavar(x) ] - if nonvalid: - sys.exit("input error: Non-valid variant expression(s) entered: " + ", ".join(nonvalid)) - - if software_version and software is None: - sys.exit("input error: matching a given software version needs a software defined.") - - # adding conditions of profiles to include to where clause - # print(include_profiles) - if include_profiles: - include_profiles = self._fix_X_N_search(include_profiles) - include_profiles = [ self.make_profile_explicit(x) for x in include_profiles ] # Fix here - # adding conditions of profiles to exclude to where clause - if exclude_profiles: - exclude_profiles = self._fix_X_N_search(exclude_profiles) - exclude_profiles = [ self.make_profile_explicit(x) for x in exclude_profiles ] - # adding accession, lineage, zips, and dates based conditions - include_acc = [x for x in accessions if not x.startswith("^")] - exclude_acc = [x[1:] for x in accessions if x.startswith("^")] - - include_lin = [x for x in lineages if not x.startswith("^")] - exclude_lin = [x[1:] for x in lineages if x.startswith("^")] - - include_zip = [x for x in zips if not str(x).startswith("^")] - exclude_zip = [x[1:] for x in zips if str(x).startswith("^")] - - include_dates = [x for x in dates if not str(x).startswith("^")] - exclude_dates = [x[1:] for x in dates if str(x).startswith("^")] - - include_submission_dates = [x for x in submission_dates if not str(x).startswith("^")] - exclude_submission_dates = [x[1:] for x in submission_dates if str(x).startswith("^")] - - include_labs = [x for x in labs if not str(x).startswith("^")] - exclude_labs = [x[1:] for x in labs if str(x).startswith("^")] - - include_source = [x for x in sources if not str(x).startswith("^")] - exclude_source = [x[1:] for x in sources if str(x).startswith("^")] - - include_collection = [x for x in collections if not str(x).startswith("^")] - exclude_collection = [x[1:] for x in collections if str(x).startswith("^")] - - include_technology = [x for x in technologies if not str(x).startswith("^")] - exclude_technology = [x[1:] for x in technologies if str(x).startswith("^")] - - include_platform = [x for x in platforms if not str(x).startswith("^")] - exclude_platform= [x[1:] for x in platforms if str(x).startswith("^")] - - include_chemistry = [x for x in chemistries if not str(x).startswith("^")] - exclude_chemistry = [x[1:] for x in chemistries if str(x).startswith("^")] - - include_material = [x for x in materials if not str(x).startswith("^")] - exclude_material = [x[1:] for x in materials if str(x).startswith("^")] - - include_seqhash = [x for x in seqhashes if not x.startswith("^")] - exclude_seqhash = [x[1:] for x in seqhashes if x.startswith("^")] - - if software: - if not software.startswith("^"): - include_software = software - exclude_software = None - else: - include_software = None - exclude_software = 
software[1:] - else: - include_software = None - exclude_software = None - - if software_version: - if not software_version.startswith("^"): - include_software_version = software_version - exclude_software_version = None - else: - include_software_version = None - exclude_software_version = software_version[1:] - else: - include_software_version = None - exclude_software_version = None - - # query - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) - dbm.debug = debug - ### support wildcard #### - _tmp_include_lin = [] - for in_lin in include_lin: - if "%" in in_lin: - _list = dbm.get_list_of_lineages(in_lin) - - if len(_list) > 0: - _tmp_include_lin.extend(_list) - ## if we don't find this wildcard so we discard it - else: - _tmp_include_lin.append(in_lin) - include_lin = _tmp_include_lin - - _tmp_exclude_lin = [] - for ex_lin in exclude_lin: - if "%" in ex_lin: - _list = dbm.get_list_of_lineages(ex_lin) - - if len(_list) > 0: - _tmp_exclude_lin.extend(_list) - ## if we don't find this wildcard so we discard it - else: - _tmp_exclude_lin.append(ex_lin) - exclude_lin = _tmp_exclude_lin - ### support paren-child relationship #### - if(with_sublineage): - # print('sublineage query is enbable for all included lineages') - _tmp_include_lin = [] - - while include_lin: - in_lin = include_lin.pop(0) - value = self.lineage_sublineage_dict.get(in_lin, 'none') # provide a default value if the key is missing: - - if value != 'none': - _tmp_include_lin.append(in_lin) - _list = value.split(',') - # recursive way - for i in _list: - include_lin.append(i) - # _tmp_include_lin.append(i) - ## if we don't find this wildcard so we discard it - else: # None - _tmp_include_lin.append(in_lin) - _tmp_include_lin = list(dict.fromkeys(_tmp_include_lin)) - """ - # since we have a proper lineage file + """ + this object provides sonarDB functionalities and intelligence + + Notes + ----- + Please note, that genomic and protein coordinates are expected to be and + returned 0-based by this object, except for formatted profiles. + While start or single coordinates are inclusive, end coordinates of + ranges are exclusive, expressed in a mathematical notation: [start, end). + Only in formatted profiles start and end coordinates are 1-based and both + inclusive. + + Examples + -------- + + In this example the path to the database is stored in DOCTESTDB. + + >>> db = sonarDB(DOCTESTDB) + + Parameters + ---------- + dbfile : str + define a path to a non-existent or valid SONAR database file. If the + file does not exist, a SONAR database is created. 
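A quick worked note on the coordinate convention stated in the docstring above: internally positions are 0-based with half-open ranges [start, end), while formatted profiles print 1-based, fully inclusive positions. A minimal standalone sketch of that conversion (the helper name is illustrative only, not part of sonardb.py):

```python
# Illustrative only: convert an internal 0-based, half-open range [start, end)
# into the 1-based, inclusive coordinates shown in formatted profiles.
def to_profile_coords(start, end):
    return start + 1, end   # e.g. [265, 268) -> positions 266-268 (3 bases)

assert to_profile_coords(265, 268) == (266, 268)
```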
+ translation_table : int + define the genetic code table used for in silico translation (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] + + Attributes + ---------- + db : str + stores the absolute path to the used SONAR database file + reffna : str + stores the absolute path to the built-in FASTA file containing the reference + genome sequence + refgff : str + stores the absolute path to the built-in GFF3 file containing the reference + genome annotation + translation_table : int + stores the genetic code table used for in silico translation (see + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) [ 1 ] + refseq : str + stores the upper-case sequence of the built-in reference genome + refdescr : str + stores the FASTA header of the built-in reference genome + refgffObj : object + stores the sonarGFF object based on the built-in reference genome + annotation + iupac_nt_code : dict + stores a dict with IUPAC one-letter nucleotide codes as keys and the + respective set of matching explicit IUPAC one-letter nucleotide codes + as values (e.g {"W": set('A', 'T')}) + iupac_explicit_nt_code : dict + stores a set containing all non-ambiguous IUPAC one-letter nucleotide codes + iupac_ambig_nt_code : set + stores a set containing all ambiguous IUPAC one-letter nucleotide codes + iupac_aa_code : dict + stores a dict with IUPAC one-letter amino acid codes as keys and + the respective set of matching IUPAC one-letter amino acids codes as values + iupac_explicit_aa_code : dict + stores a set containing all non-ambiguous IUPAC one-letter amino acid codes + iupac_ambig_aa_code : dict + stores a set containing all ambiguous IUPAC one-letter amino acid codes + dna_var_regex : compiled re expression + stores a compiled re expression that matches to nucleotide profiles but + not to amino acid profiles + aa_var_regex : compiled re expression + stores a compiled re expression that matches to amino acid profiles but + not to nucleotide profiles + del_regex : compiled re expression + stores a compiled re expression that matches to deletion profiles on + nucleotide as well as on amino acid level. + dnavar_grep_regex : compiled re expression + stores a compiled re expression that matches to snp or dna insertion + profiles with eference allele, genomic position and variant allele + as groups. 
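For orientation, the profile notation these regex attributes recognize looks like the examples below. This is a simplified standalone sketch; the patterns are reduced stand-ins for the IUPAC-aware expressions assembled in the properties further down, and the variant strings are toy values:

```python
import re

# Reduced stand-ins for dna_var_regex, aa_var_regex and dnavar_grep_regex as
# described above (the real patterns are built from the IUPAC code tables).
dna_var = re.compile(r"^(?:del:\d+:\d+|[A-Z]\d+[A-Z]+)$")                       # e.g. A3451T, del:11288:9
aa_var = re.compile(r"^[A-Za-z0-9]+:(?:del:\d+:\d+|[A-Z*~-]\d+[A-Z*~-]+)$")     # e.g. S:N501Y, ORF1a:del:3675:3
snp_parts = re.compile(r"^([^0-9:]*)([0-9]+)([^0-9]*)$")                        # ref / position / alt groups

assert dna_var.match("A3451T") and dna_var.match("del:11288:9")
assert aa_var.match("S:N501Y") and aa_var.match("ORF1a:del:3675:3")
assert snp_parts.match("A3451T").groups() == ("A", "3451", "T")
```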
+ codedict : dict + stores a dictionary with "dna" and "aa" containing the field name in the + database that stores the profile data, the one letter code with and + without ambiguities + """ + + def __init__(self, dbfile, translation_table=1): + self.db = os.path.abspath(dbfile) + self.__moduledir = os.path.dirname(os.path.realpath(__file__)) + self.reffna = os.path.join(self.__moduledir, "ref.fna") + self.refgff = os.path.join(self.__moduledir, "ref.gff3") + self.lineagewithsublineages = os.path.join(self.__moduledir, "lineage.all.tsv") + + self.translation_table = translation_table + self.__lineage_sublineage_dict = None + self.__refseq = None + self.__refdescr = None + self.__refgffObj = None + self.__iupac_nt_code = None + self.__iupac_aa_code = None + self.__iupac_explicit_nt_code = None + self.__iupac_explicit_aa_code = None + self.__iupac_ambig_nt_code = None + self.__iupac_ambig_aa_code = None + self.__terminal_letters_regex = re.compile("[A-Z]$") + self.__dna_var_regex = None + self.__aa_var_regex = None + self.__del_regex = None + self.__dnavar_grep_regex = None + self.__codedict = None + + # PROPERTIES ON DEMAND + + @property + def lineage_sublineage_dict(self): + if not self.__lineage_sublineage_dict: + df = pd.read_csv(self.lineagewithsublineages, sep="\t") + self.__lineage_sublineage_dict = dict(zip(df.lineage, df.sublineage)) + return self.__lineage_sublineage_dict + + @property + def refseq(self): + if not self.__refseq: + record = SeqIO.read(self.reffna, "fasta") + self.__refseq = self.harmonize(record.seq) + return self.__refseq + + @property + def refdescr(self): + if not self.__refdescr: + with open(self.reffna, "r") as handle: + self.__refdescr = handle.readline().strip()[1:] + return self.__refdescr + + @property + def refgffObj(self): + if not self.__refgffObj: + self.__refgffObj = sonarGFF( + self.refgff, self.reffna, self.translation_table + ) + return self.__refgffObj + + @property + def dna_var_regex(self): + if self.__dna_var_regex is None: + allowed_letters = "[" + "".join(self.iupac_nt_code.keys()) + "]" + self.__dna_var_regex = re.compile( + "^(?:(?:del:[0-9]+:[0-9]+)|(?:" + + allowed_letters + + "[0-9]+" + + allowed_letters + + "+))$" + ) + return self.__dna_var_regex + + @property + def dnavar_grep_regex(self): + if self.__dnavar_grep_regex is None: + self.__dnavar_grep_regex = re.compile("^([^0-9:]*)([0-9]+)([^0-9]*)$") + return self.__dnavar_grep_regex + + @property + def aa_var_regex(self): + if self.__aa_var_regex is None: + allowed_symbols = "(?:(?:" + ")|(?:".join(self.refgffObj.symbols) + "))" + allowed_letters = ( + "[" + "".join(self.iupac_aa_code.keys()).replace("-", "") + "*~-" + "]" + ) + self.__aa_var_regex = re.compile( + "^" + + allowed_symbols + + ":(?:(?:del:[0-9]+:[0-9]+)|(?:" + + allowed_letters + + "[0-9]+" + + allowed_letters + + "+))$" + ) + return self.__aa_var_regex + + @property + def del_regex(self): + if self.__del_regex is None: + allowed_symbols = "(?:(?:" + ")|(?:".join(self.refgffObj.symbols) + "))" + self.__del_regex = re.compile( + "^(?:" + allowed_symbols + ":)?del:[0-9]+:[0-9]+$" + ) + return self.__del_regex + + @property + def iupac_nt_code(self): + if self.__iupac_nt_code is None: + self.__iupac_nt_code = { + "A": set("A"), + "C": set("C"), + "G": set("G"), + "T": set("T"), + "R": set("AGR"), + "Y": set("CTY"), + "S": set("GCS"), + "W": set("ATW"), + "K": set("GTK"), + "M": set("ACM"), + "B": set("CGTB"), + "D": set("AGTD"), + "H": set("ACTH"), + "V": set("ACGV"), + } + self.__iupac_nt_code["N"] = 
set(self.__iupac_nt_code.keys()) | set("N") + return self.__iupac_nt_code + + @property + def iupac_explicit_nt_code(self): + if self.__iupac_explicit_nt_code is None: + self.__iupac_explicit_nt_code = set( + [x for x in self.iupac_nt_code if len(self.iupac_nt_code[x]) == 1] + ) + return self.__iupac_explicit_nt_code + + @property + def iupac_ambig_nt_code(self): + if self.__iupac_ambig_nt_code is None: + self.__iupac_ambig_nt_code = set( + [x for x in self.iupac_nt_code if len(self.iupac_nt_code[x]) > 1] + ) + return self.__iupac_ambig_nt_code + + @property + def iupac_aa_code(self): + if self.__iupac_aa_code is None: + self.__iupac_aa_code = { + "A": set("A"), + "R": set("R"), + "N": set("N"), + "D": set("D"), + "C": set("C"), + "Q": set("Q"), + "E": set("E"), + "G": set("G"), + "H": set("H"), + "I": set("I"), + "L": set("L"), + "K": set("K"), + "M": set("M"), + "F": set("F"), + "P": set("P"), + "S": set("S"), + "T": set("T"), + "W": set("W"), + "Y": set("Y"), + "V": set("V"), + "U": set("U"), + "O": set("O"), + } + self.__iupac_aa_code.update( + { + "B": set("DNB"), + "Z": set("EQZ"), + "J": set("ILJ"), + "Φ": set("VILFWYMΦ"), + "Ω": set("FWYHΩ"), + "Ψ": set("VILMΨ"), + "π": set("PGASπ"), + "ζ": set("STHNQEDKRζ"), + "+": set("KRH+"), + "-": set("DE-"), + } + ) + self.__iupac_aa_code["X"] = set(self.__iupac_aa_code.keys()) | set("X") + return self.__iupac_aa_code + + @property + def iupac_explicit_aa_code(self): + if self.__iupac_explicit_aa_code is None: + self.__iupac_explicit_aa_code = set( + [x for x in self.iupac_aa_code if len(self.iupac_aa_code[x]) == 1] + ) + return self.__iupac_explicit_aa_code + + @property + def iupac_ambig_aa_code(self): + if self.__iupac_ambig_aa_code is None: + self.__iupac_ambig_aa_code = set( + [x for x in self.iupac_aa_code if len(self.iupac_aa_code[x]) > 1] + ) + return self.__iupac_ambig_aa_code + + @property + def codedict(self): + if self.__codedict is None: + self.__codedict = { + "dna": { + "field": "dna_profile", + "code": self.iupac_nt_code, + "explicit_code": self.iupac_explicit_nt_code, + }, + "aa": { + "field": "aa_profile", + "code": self.iupac_aa_code, + "explicit_code": self.iupac_explicit_aa_code, + }, + } + + return self.__codedict + + # DATA IMPORT + + @staticmethod + def hash(seq): + """ + static function to hash any sequence using SEGUID (SHA-1 hash of the + upper-case sequence) + + Parameters + ---------- + seq : str + define a sequence to hash + + Returns + ------- + str + seguid + + """ + return seguid(seq) + + @staticmethod + def harmonize(seq): + """ + static function to return a sequence in upper case format and with T instead of U + + Parameters + ---------- + seq : str + define a sequence to harmonize + + Returns + ------- + str + sequence + + """ + return str(seq).strip().upper().replace("U", "T") + + def check_iupac_nt_code(self, seq): + """ + returns set of non-IUPAC characters present in a given sequence + + Parameters + ---------- + seq : str + define a sequence to check + + Returns + ------- + str + sequence + + """ + return set(seq).difference(self.iupac_nt_code.keys()) + + def multi_process_fasta_wrapper(self, args): + """ + wrapper function for sonarDB.process_fasta that accepts the needed + parameters as list (which allows to be called by multiprocessing for + parallelization) to add a genome sequences from a FASTA file. The FASTA + file has to contain exactly one record. 
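The hash/harmonize contract shown above can be exercised in isolation. The seguid import below is an assumption (covsonar presumably takes it from Biopython's Bio.SeqUtils.CheckSum); the sequence is toy data:

```python
# Assumption: seguid comes from Biopython; the input sequence is a toy example.
from Bio.SeqUtils.CheckSum import seguid

raw = "auggcu\n"                                   # lower-case RNA with trailing whitespace
harmonized = str(raw).strip().upper().replace("U", "T")
print(harmonized)                                  # ATGGCT
print(seguid(harmonized))                          # stable SEGUID used as the sequence key

# non-IUPAC characters would be reported as a set difference, as in check_iupac_nt_code:
iupac = set("ACGTRYSWKMBDHVN")
print(set("ATGGCTX").difference(iupac))            # {'X'}
```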
+ + Parameters + ---------- + args: list + ordered list of the following arguments + args[0] : str + corresponds to fname in sonarDB.process_fasta + define a valid FASTA file containing exactly one genome record to be + added to the SONAR database + args[1] : str + corresponds to algnfile in sonarDB.process_fasta + define a filename to permanently store the sequence alignment. Please + consider, that an existing file will be overwritten. If None, a + temporary file will be created and deleted after processing. + args[2] : str + corresponds to cache in sonarDB.process_fasta + define a cache file (pickle format) that is used to permanently store + processed data. Please consider, that an existing file will be + overwritten. IfNone, a temporary file will be created and deleted after + processing. + args[3] : int + timeout in seconds + define a timeout in seconds for processing genomes + integers below 1 deactivate the timeout. + + Returns + ------- + tuple + returns a tuple consisting of status and the hash of the processed + genome sequence. Status False means TimeoutError (genome was not added + to the database) while True means genome was successfully added. + + """ + fname, algnfile, picklefile, seqhash, timeout = args + try: + with sonarTimeout(seconds=timeout): + self.process_fasta(fname, algnfile, picklefile) + except TimeoutError: + return False, seqhash + else: + return True, seqhash + + def process_fasta(self, fname, algnfile=None, pickle_file=None): + """ + function to process a genome sequence from a single FASTA file, if + the respective sequence is not in the database. The FASTA + file has to contain exactly one record. + + Example + ------- + + In this example the path to the database is stored in DOCTESTDB. + QRY_FASTA_FILE stores the path of a FASTA file conatining a + B.1.1.7 prototype genome sequence. + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> data = db.process_fasta(QRY_FASTA_FILE) + >>> data['acc'] + 'b117' + >>> data['descr'] + 'b117 Ideal severe acute respiratory syndrome coronavirus 2 lineage B.1.1.7, complete genome' + >>> data['dna_profile'] + 'C3267T C5388A T6954C del:11288:9 del:21765:6 del:21991:3 A23063T C23271A C23604A C23709T T24506G G24914C C27972T G28048T A28111G G28280C A28281T T28282A C28977T' + >>> data['prot_profile'] + 'ORF1a:T1001I ORF1a:A1708D ORF1a:I2230T ORF1a:del:3675:3 ORF1b:T1001I ORF1b:A1708D ORF1b:I2230T ORF1b:del:3675:3 S:del:68:3 S:del:143:2 S:N501Y S:A570D S:P681H S:T716I S:S982A S:D1118H ORF8:Q27* ORF8:R52I ORF8:Y73C N:D3L N:S235F' + + Parameters + ---------- + fname : str + define a valid FASTA file containing exactly one genome record to be + added to the SONAR database + algnfile : str [ None ] + define a filename to permanently store the sequence alignment. Please + consider, that an existing file will be overwritten. If None, a + temporary file will be created and deleted after processing. + pickle_file : str [ None ] + define a filname to store the dictionary in pickle format instead of + returning it. Please consider, that an existing file will be + overwritten. If None, a temporary file will be created and deleted + after processing. + + Returns + ------- + dict + if pickle_file is None a dictionary is returned, else there is no return + value. 
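Note that the wrapper unpacks five values, the sequence hash riding along so it can be echoed back with the status. A hedged usage sketch (file names, hash and the `db` instance are placeholders; in practice a list of such tuples is what gets handed to a multiprocessing pool):

```python
# Placeholder paths and hash; db is assumed to be an existing sonarDB instance.
args = (
    "sample1.fasta",   # FASTA file containing exactly one record
    None,              # algnfile: None -> temporary alignment file
    "sample1.info",    # pickle file for the pre-processed data
    "<seqhash>",       # hash of the cached sequence, returned together with the status
    300,               # timeout in seconds; values below 1 disable the timeout
)
ok, seqhash = db.multi_process_fasta_wrapper(args)
if not ok:
    print("timed out:", seqhash)
```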
The dictionary has following keys and values and can be directly + used as input for the import_genome function of this class (**kwargs): + - acc: accession of processed genome + - descr: FASTA header of processed genome + - dnadiff: a list of nucleotide level variations (see sonarALIGN.dnadiff) + - aadiff: a list of amino acid level variations (see sonarALIGN.aadiff) + - dna_profile: the formatted nucleotide level profile (see sonarDB.build_profile) + - prot_profile: the formatted amino acid level profile (see sonarDB.build_profile) + - fs_profile: the dna_profile with frameshift mutations only + - seq: genome sequence + """ + record = SeqIO.read(fname, "fasta") + seq = self.harmonize(record.seq) + seqhash = self.hash(seq) + data = {"acc": record.id, "descr": record.description, "seqhash": seqhash} + + alignment = sonarALIGN(fname, self.reffna, algnfile, self.refgffObj) + data["dnadiff"] = alignment.dnadiff + data["aadiff"] = alignment.aadiff + data["dna_profile"] = self.build_profile(*data["dnadiff"]) + data["prot_profile"] = self.build_profile(*data["aadiff"]) + data["fs_profile"] = self.filter_frameshifts(data["dna_profile"]) + + if pickle_file: + with open(pickle_file, "wb") as handle: + pickle.dump(data, handle) + else: + data["seq"] = seq + return data + + def import_genome_from_fasta_files( + self, *fnames, dbm=None, msg=None, disable_progressbar=False + ): + """ + function to import genome sequence(s) from given FASTA file(s) to the + SONAR database. Each FASTA file has to contain exactly one record. + + Example + ------- + + In this example the path to the database is stored in DOCTESTDB. + QRY_FASTA_FILE stores the path of a FASTA file conatining a + B.1.1.7 protoype genome sequence. + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> db.import_genome_from_fasta_files(QRY_FASTA_FILE, disable_progressbar=True) + + Parameters + ---------- + *fnames : str + define one or more valid FASTA files. Each file must contain + exactly one genome record + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + msg : str + define a message used for the progress bar. If None, no progress + bar is shown. [ None ] + disable_progressbar : bool [ False ] + define if the progress bar is shown (False) or not (True) + """ + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db)) + for i in tqdm(range(len(fnames)), desc=msg, disable=disable_progressbar): + self.import_genome(**self.process_fasta(fnames[i]), dbm=dbm) + + def import_genome_from_cache( + self, cachedir, acc_dict, dbm=None, msg=None, disable_progressbar=False + ): + """ + function to import data from a sonarCACHE directory to the SONAR database. + + Parameters + ---------- + cachedir : str + define a valid sonarCACHE directory + acc_dict : dict + define a dictionary (key: sequence hash, value: set of assigned accessions) + to import to the database + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + msg : str [ None ] + define a message used for the progress bar. 
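The acc_dict expected here maps each sequence hash to the accession/header pairs that should be attached to it; the per-entry tuple layout is inferred from how the import loop below unpacks entry[0] and entry[1]. A sketch with placeholder values:

```python
# Placeholder values; each set member is an (accession, fasta_header) pair,
# matching the entry[0] / entry[1] unpacking in the import loop below.
acc_dict = {
    "<seqhash-1>": {("ACC-001", "ACC-001 example genome")},
    "<seqhash-2>": {("ACC-002", "ACC-002 example genome"),
                    ("ACC-003", "ACC-003 identical sequence, second accession")},
}
db.import_genome_from_cache("mycache", acc_dict, msg="importing cached genomes")
```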
If None, no progress + bar is shown + disable_progressbar : bool [ False ] + define if the progress bar is shown (False) or not (True) + """ + seqhashes = list(acc_dict.keys()) + with ExitStack() as stack, sonarCache(cachedir) as cache: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db)) + for i in tqdm(range(len(seqhashes)), desc=msg, disable=disable_progressbar): + seqhash = seqhashes[i] + seq = cache.get_cached_seq(seqhash) + preprocessed_data = cache.load_info(seqhash) + for entry in acc_dict[seqhash]: + preprocessed_data["acc"] = entry[0] + preprocessed_data["descr"] = entry[1] + self.import_genome(**preprocessed_data, seq=seq, dbm=dbm) + + def import_genome( + self, + acc, + descr, + seqhash, + dnadiff=None, + aadiff=None, + dna_profile=None, + prot_profile=None, + fs_profile=None, + seq=None, + dbm=None, + ): + """ + function to import processed data to the SONAR database. + + Parameters + ---------- + + acc : str + define the accession of the processed genome + descr : str + define the FASTA header of the processed genome + seqhash : str + define the hash (seguid) of the processed genome + dnadiff : list + define a sub list of nucleotide level variations (see sonarALIGN.dnadiff) + aadiff : list + define a sub list of amino acid level variations (see sonarALIGN.aadiff) + dna_profile : str + define the formatted nucleotide level profile (see sonarDB.build_profile) + prot_profile : str + define the formatted amino acid level profile (see sonarDB.build_profile) + seq : str + define the sequence of the processed genome (can be None, but then no paranoid test is done) + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + """ + with ExitStack() as stack: + try: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db)) + + dbm.insert_genome(acc, descr, seqhash) + + if not dnadiff is None: + dbm.insert_sequence(seqhash) + dbm.insert_profile(seqhash, dna_profile, prot_profile, fs_profile) + for ref, alt, s, e, _, __ in dnadiff: + dbm.insert_dna_var(seqhash, ref, alt, s, e) + + for ref, alt, s, e, protein, locus in aadiff: + dbm.insert_prot_var(seqhash, protein, locus, ref, alt, s, e) + + if seq: + self.be_paranoid(acc, seq, auto_delete=True, dbm=dbm) + except sqlite3.IntegrityError as er: + print("\nError while processing ID: '{}' \n".format(acc)) + raise er + except sqlite3.Error as er: + print("\nError: occurred while trying to store ID: %s \n", acc) + raise er + + # NOMENCLATURE + + def isdnavar(self, var): + """ + function to validate nucleotide level profiles + + Examples + -------- + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> db.isdnavar("S:N501Y") + False + >>> db.isdnavar("A101T") + True + + Parameters + ---------- + + var : str + define the profile to validate + + Returns + ------- + + bool + True if var is a valid nucleotide level profile otherwise False + """ + return bool(self.dna_var_regex.match(var)) + + def isaavar(self, var): + """ + function to validate amino acid level profiles + + Examples + -------- + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> db.isaavar("S:N501Y") + True + >>> db.isaavar("A101T") + False + + Parameters + ---------- + + var : str + define the profile to validate + + Returns + ------- + + bool + True if var is a valid amino acid level profile otherwise False + """ + return bool(self.aa_var_regex.match(var)) + + def isdel(self, var): + """ + function 
to validate deletion profiles on both nucleotide and amino acid level + + Examples + -------- + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> db.isdel("del:100-118") + False + >>> db.isdel("del:100:18") + True + >>> db.isdel("ORF1b:del:5:2") + True + + Parameters + ---------- + + var : str + define the profile to validate + + Returns + ------- + + bool + True if var is a deletion profile otherwise False + """ + return bool(self.del_regex.match(var)) + + # PROFILE BUILDING + + def build_profile(self, *vars): + """ + function to build a valid variant profiles based on given variations + + Parameters + ---------- + + vars : list + define for each variation to be considered by the profile a list + with the following elements: + - reference nucleotide(s) or amino acid(s) + - alternative nucleotide(s) or amino acid(s) + - start position (0-based) related to the genome (nucleotide level profile) or + protein (amino acid level profile) + - end position (0-based) related to the genome (nucleotide level profile) or + protein (amino acid level profile) or None if single nucleotide/amino acid + polymorphism + - protein symbol (None in case of nucleotide level profiles) + - gene locus (None in case of nucleotide level profiles) + + Returns + ------- + + str + valid variant profile + """ + if len(vars) == 0: + return "" + profile = [] + if len(vars) == 1: + this_ref, this_alt, this_start, this_end, this_protein, this_locus = vars[0] + if this_alt == "" and this_end is None: + this_end = this_start + len(this_ref) + else: + vars = sorted(vars, key=lambda x: (x[5], x[4], x[2])) + for l in range(len(vars) - 1): + ( + this_ref, + this_alt, + this_start, + this_end, + this_protein, + this_locus, + ) = vars[l] + ( + next_ref, + next_alt, + next_start, + next_end, + next_protein, + next_locus, + ) = vars[l + 1] + if this_alt != "": + var = self.format_var( + this_ref, this_alt, this_start, this_end, this_protein + ) + profile.append(var) + elif ( + this_alt == "" + and next_alt == "" + and this_start + len(this_ref) == next_start + and this_protein == next_protein + and this_locus == next_locus + ): + vars[l + 1] = ( + this_ref + next_ref, + "", + this_start, + next_start + 1, + this_protein, + this_locus, + ) + else: + if this_alt == "" and this_end is None: + this_end = this_start + len(this_ref) + var = self.format_var( + this_ref, + this_alt, + this_start, + this_end, + this_protein, + this_locus, + ) + profile.append(var) + this_ref, this_alt, this_start, this_end, this_protein, this_locus = vars[ + l + 1 + ] + if this_alt == "" and this_end is None: + this_end = this_start + len(this_ref) + var = self.format_var( + this_ref, this_alt, this_start, this_end, this_protein, this_locus + ) + if var not in profile: + profile.append(var) + + return " ".join(profile) + + @staticmethod + def format_var(ref, alt, start, end, protein=None, locus=None): + """ + function to build a valid variant profile based on a single variation + + Parameters + ---------- + + ref : str + define the reference nucleotide(s) or amino acid(s) + alt : str + define the alternative nucleotide(s) or amino acid(s) + start : int + define the start position (0-based) related to the genome (nucleotide + level profile) or protein (amino acid level profile) + end : int + define the end position (0-based) related to the genome (nucleotide + level profile) or protein (amino acid level profile) or None if + single nucleotide/amino acid polymorphism + protein : str + define the protein 
symbol (None in case of nucleotide level profiles) + [ None ] + locus : str + define the gene locus (None in case of nucleotide level profiles) + [ None ] + + Returns + ------- + + str + valid variant profile + """ + if alt != "": + coord = str(start + 1) + else: + ref = "del:" + coord = str(start + 1) + ":" + str(end - start) + protein = protein + ":" if protein else "" + return protein + ref + coord + alt + + # FRAMESHIFT DETECTION + + def is_frameshift(self, dna_var): + """ + function to check if a dna variant causes a frameshift in any annotated + CDS. + + Returns + ------- + bool + True if dna variant causes a frameshift, otherwise False. + """ + + if dna_var.startswith("del:"): + _, x, l = dna_var.split(":") + x = int(x) - 1 + y = x + int(l) + for cds in self.refgffObj.cds: + if cds.is_frameshift_del(x, y): + return True + else: + match = self.dnavar_grep_regex.search(dna_var) + x = int(match.group(2)) - 1 + l = len(match.group(3)) - 1 + if l % 3 != 0: + for cds in self.refgffObj.cds: + if cds.is_frameshift_in(x, l): + return True + return False + + def filter_frameshifts(self, dna_profile): + """ + function to filter all frameshift mutations from a given dna_profile. + + Returns + ------- + str + dna_profile containing only frameshift mutations + """ + if self.refgffObj and dna_profile.strip(): + return " ".join( + [ + x + for x in filter(None, dna_profile.split(" ")) + if self.is_frameshift(x) + ] + ) + return "" + + # MATCHING + + def filter_ambig(self, profile, explicit_code, keep=None): + """ + function to filter variations with ambiguities in the alternative allele + from a valid nucleotide or amino acid level profile + + Parameters + ---------- + + profile : str + valid nucleotide or amino acid level profile + explicit_code : dict + explicit IUPAC code dictionary to use (as provided by + sonarDB.iupac_explicit_nt_code or sonarDB.iupac_explicit_aa_code) + keep : list [ None ] + list of single variation profiles to exclude from filtering + + Returns + ------- + + str + valid variant profile + """ + if profile is None: + return "" + out = [] + keep = set(keep) if keep else set() + for mutation in list(filter(None, profile.split(" "))): + if mutation in keep or self.del_regex.search(mutation): + out.append(mutation) + continue + match = self.__terminal_letters_regex.search(mutation) + if ( + match + and len(match.group(0)) == 1 + and match.group(0) not in explicit_code + ): + continue + out.append(mutation) + return " ".join(out) + + def pinpoint_mutation(self, mutation, code): + """ + function to generate a set of all profiles consisting of + non-ambiguous one-letter codes only that match to a given profile. + If the given profile does not contain any ambiguities a list only + containing the given profile is returned. 
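Referring back to the frameshift helpers above, the length arithmetic of the insertion branch of is_frameshift can be illustrated in isolation (deletions are delegated to the CDS objects and are not reproduced here; the variant strings are toy examples):

```python
import re

snp_or_ins = re.compile(r"^([^0-9:]*)([0-9]+)([^0-9]*)$")

def inserted_bases(dna_var):
    """Number of bases an insertion-style variant adds (0 for a plain SNP)."""
    match = snp_or_ins.search(dna_var)
    return len(match.group(3)) - 1    # ALT length minus the replaced reference base

for var in ("A100T", "A100ATT", "A100ATTG"):
    extra = inserted_bases(var)
    print(var, extra, "frameshift candidate" if extra % 3 else "in frame")
# A100T    0 in frame
# A100ATT  2 frameshift candidate (if it hits a CDS)
# A100ATTG 3 in frame
```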
+ + Examples + -------- + + >>> a = os.remove(DOCTESTDB) if os.path.exists(DOCTESTDB) else None + >>> db = sonarDB(DOCTESTDB) + >>> sorted(db.pinpoint_mutation('A5001N', db.iupac_nt_code)) + ['A5001A', 'A5001B', 'A5001C', 'A5001D', 'A5001G', 'A5001H', 'A5001K', 'A5001M', 'A5001N', 'A5001R', 'A5001S', 'A5001T', 'A5001V', 'A5001W', 'A5001Y'] + >>> db.pinpoint_mutation('N501Y', db.iupac_aa_code) + {'N501Y'} + + Parameters + ---------- + + mutation : str + define a valid nucleotide or amino acid level profile that may contain + ambiguities + code : dict + define the IUPAC code dictionary to use (as provided by + sonarDB.iupac_nt_code or sonarDB.iupac_aa_code) + + Returns + ------- + + set + set of profiles without ambiguities but matching to given profile + """ + # extract ALT call from mutation profile + match = self.__terminal_letters_regex.search(mutation) + if not match: + return { + mutation, + } + match = match.group(0) + + # resolve ambiguities + options = [] + for m in match: + options.append(code[m]) + + # generate the set of explicit mutations + orig_stat = mutation[: -len(match)] + return set( + [mutation] + [orig_stat + "".join(x) for x in itertools.product(*options)] + ) + + def make_profile_explicit(self, profile): + """ + function to replace ambiguous variants from a profile by the respective + explicit variant descriptions and to sort profiles based on their level + + Parameters + ---------- + + profile : str + define a valid nucleotide, amino acid or mixed level profile that + may contain ambiguities + + Returns + ------- + + dict + dictionary with 'dna' or 'aa' as key and the respective list of + explicit dna/protein level mutations + """ + + profile = set(profile) + extended_profile = {"aa": [], "dna": []} + for var in profile: + key = "dna" if self.isdnavar(var) else "aa" + extended_profile[key].extend( + [v for v in self.pinpoint_mutation(var, self.codedict[key]["code"])] + ) + return extended_profile + + def _fix_X_N_search(self, _profiles): + temp_include_profiles = [] + for _list_var in _profiles: + for var in _list_var: + if (var[-1].lower() == "x") and not self.isdnavar(var): + for v in self.pinpoint_mutation(var, self.codedict["aa"]["code"]): + temp_include_profiles.append([v]) + + elif (var[-1].lower() == "n") and self.isdnavar(var): + for v in self.pinpoint_mutation(var, self.codedict["dna"]["code"]): + temp_include_profiles.append([v]) + + _profiles.extend(temp_include_profiles) + _profiles = [list(x) for x in set(tuple(x) for x in _profiles)] + # print("After update") + # print(_profiles) + return _profiles + + def match( + self, + include_profiles=[], + exclude_profiles=[], + accessions=[], + lineages=[], + with_sublineage=False, + zips=[], + dates=[], + submission_dates=[], + labs=[], + sources=[], + collections=[], + technologies=[], + platforms=[], + chemistries=[], + materials=[], + software=None, + software_version=None, + min_ct=None, + max_ct=None, + seqhashes=[], + ambig=False, + count=False, + frameshifts=0, + debug=False, + dbm=None, + ): + """ + function to search genomes in the SONAR database dependent on + defined sequence and metadata profiles + + Parameters + ---------- + + include_profiles : list [ [] ] + define a list of valid nucleotide, amino acid or mixed level profiles + that may contain ambiguities to find genomes sharing respective + profiles. Variations in each profile (sublist) are linked by AND operator + while profiles from different sublists are linked by OR. 
+ exclude_profiles : list [ [] ] + define a list of valid nucleotide, amino acid or mixed level profiles + that may contain ambiguities to find genomes NOT sharing respective + profiles. Variations in each profile (sublist) are linked by AND operator + while profiles from different sublists are linked by OR. + accessions : list [ [] ] + list of accessions. Only genomes linked to accessions in this list + will be matched. Accessions are negated when starting with ^. [ None ] + lineages : list [ [] ] + list of pangolin lineages. Only genomes assigend to a + pangolin lineage in this list will be matched. Lineages are + negated when starting with ^. + with_sublineage :False, + + zips : list [ [] ] + list of zip codes. Only genomes linked to one of the given zip + codes or whose linked zip code starts like one of the given + zip codes are matched. zip codes are negated when starting with ^. + dates : list [ [] ] + define list of dates (YYYY-MM-DD) or date ranges (YYYY-MM-DD:YYYY-MM-DD). + Only genomes linked to one of the given dates or date ranges are + matched. + submission_dates : list [ [] ] + define list of submission dates (YYYY-MM-DD) or submission ranges (YYYY-MM-DD:YYYY-MM-DD). + Only genomes linked to one of the given dates or date ranges are + matched. + sources : list [ [] ] + list of data sources. Only genomes linked to a + data source in this list will be matched. Data sources are + negated when starting with ^. + collections : list [ [] ] + list of data collections. Only genomes linked to a + data collection in this list will be matched. Data collections are + negated when starting with ^. + technologies : list [ [] ] + list of sequencing technologies. Only genomes linked to a + technology in this list will be matched. Technologies are + negated when starting with ^. + platforms : list [ [] ] + list of sequencing platforms. Only genomes linked to a + platform in this list will be matched. Platforms are + negated when starting with ^. + chemistries : list [ [] ] + list of sequencing chemistries. Only genomes linked to a + chemistry in this list will be matched. Chemistries are + negated when starting with ^. + software : str [ None ] + software used for sequence reconstruction. Only genomes linked to the + given software will be matched. Software is negated when starting with ^. + software_version : str [ None ] + software version used for sequence reconstruction. Only genomes linked + to the given software version will be matched. Software version is + negated when starting with ^. Needs software defined. + materials : list [ [] ] + list of sampling materials. Only genomes linked to a + material in this list will be matched. Materials are + negated when starting with ^. + labs : list [ [] ] + list of lab identifiers. Only genomes linked to a + lab in this list will be matched. Labs are + negated when starting with ^. + min_ct : float [ None ] + minimal ct value of genomes to match. + max_ct : float [ None ] + maximal ct value of genomes to match. + ambig : bool [ False ] + define if variant alleles including ambiguities should be shown (True) + or not (False) + count : bool [ False ] + define if matched genomes should be counted (True) instead of collected + (False). + frameshifts : int [ 0 ] + define if matched genomes have to conatin frameshift mutations (1) + or have not to conatin frameshift mutations (-1) or frameshift mutations + do not matter (0). 
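Putting the parameters above together, a hedged call sketch (profile values, lineage names, dates and lab identifiers are placeholders; variants inside one sublist are ANDed while sublists are ORed, and a leading ^ negates a value):

```python
# Placeholder values throughout; db is assumed to be a sonarDB instance.
rows = db.match(
    include_profiles=[["S:N501Y", "A23403G"], ["S:E484K"]],  # (N501Y AND A23403G) OR E484K
    exclude_profiles=[["del:11288:9"]],                      # drop genomes carrying this deletion
    lineages=["B.1.1%"],                                     # % wildcard, resolved further down in the query code
    with_sublineage=True,                                    # also include sublineages of the matched lineages
    dates=["2021-01-01:2021-03-31"],                         # date range, YYYY-MM-DD:YYYY-MM-DD
    labs=["^L1"],                                            # ^ negates: exclude lab L1
    count=True,                                              # return the number of matches instead of rows
)
```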
+ debug : bool [ False ] + activate debug mode for sonarDBManager + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + + Returns + ------- + + list or int + list of rows if count is False else number of rows as int. Each row + represents a matching genome and is provided as dictionary with field + names as keys. + """ + + clause = [] + vals = [] + + # sanity check: + check = [] + if include_profiles: + check += [item for sublist in include_profiles for item in sublist] + if exclude_profiles: + check += [item for sublist in exclude_profiles for item in sublist] + nonvalid = [x for x in check if not self.isdnavar(x) and not self.isaavar(x)] + if nonvalid: + sys.exit( + "input error: Non-valid variant expression(s) entered: " + + ", ".join(nonvalid) + ) + + if software_version and software is None: + sys.exit( + "input error: matching a given software version needs a software defined." + ) + + # adding conditions of profiles to include to where clause + # print(include_profiles) + if include_profiles: + include_profiles = self._fix_X_N_search(include_profiles) + include_profiles = [ + self.make_profile_explicit(x) for x in include_profiles + ] # Fix here + # adding conditions of profiles to exclude to where clause + if exclude_profiles: + exclude_profiles = self._fix_X_N_search(exclude_profiles) + exclude_profiles = [self.make_profile_explicit(x) for x in exclude_profiles] + # adding accession, lineage, zips, and dates based conditions + include_acc = [x for x in accessions if not x.startswith("^")] + exclude_acc = [x[1:] for x in accessions if x.startswith("^")] + + include_lin = [x for x in lineages if not x.startswith("^")] + exclude_lin = [x[1:] for x in lineages if x.startswith("^")] + + include_zip = [x for x in zips if not str(x).startswith("^")] + exclude_zip = [x[1:] for x in zips if str(x).startswith("^")] + + include_dates = [x for x in dates if not str(x).startswith("^")] + exclude_dates = [x[1:] for x in dates if str(x).startswith("^")] + + include_submission_dates = [ + x for x in submission_dates if not str(x).startswith("^") + ] + exclude_submission_dates = [ + x[1:] for x in submission_dates if str(x).startswith("^") + ] + + include_labs = [x for x in labs if not str(x).startswith("^")] + exclude_labs = [x[1:] for x in labs if str(x).startswith("^")] + + include_source = [x for x in sources if not str(x).startswith("^")] + exclude_source = [x[1:] for x in sources if str(x).startswith("^")] + + include_collection = [x for x in collections if not str(x).startswith("^")] + exclude_collection = [x[1:] for x in collections if str(x).startswith("^")] + + include_technology = [x for x in technologies if not str(x).startswith("^")] + exclude_technology = [x[1:] for x in technologies if str(x).startswith("^")] + + include_platform = [x for x in platforms if not str(x).startswith("^")] + exclude_platform = [x[1:] for x in platforms if str(x).startswith("^")] + + include_chemistry = [x for x in chemistries if not str(x).startswith("^")] + exclude_chemistry = [x[1:] for x in chemistries if str(x).startswith("^")] + + include_material = [x for x in materials if not str(x).startswith("^")] + exclude_material = [x[1:] for x in materials if str(x).startswith("^")] + + include_seqhash = [x for x in seqhashes if not x.startswith("^")] + exclude_seqhash = [x[1:] for x in seqhashes if x.startswith("^")] + + if software: + if not software.startswith("^"): + include_software = software + exclude_software = None + else: + include_software = None 
+ exclude_software = software[1:] + else: + include_software = None + exclude_software = None + + if software_version: + if not software_version.startswith("^"): + include_software_version = software_version + exclude_software_version = None + else: + include_software_version = None + exclude_software_version = software_version[1:] + else: + include_software_version = None + exclude_software_version = None + + # query + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) + dbm.debug = debug + ### support wildcard #### + _tmp_include_lin = [] + for in_lin in include_lin: + if "%" in in_lin: + _list = dbm.get_list_of_lineages(in_lin) + + if len(_list) > 0: + _tmp_include_lin.extend(_list) + ## if we don't find this wildcard so we discard it + else: + _tmp_include_lin.append(in_lin) + include_lin = _tmp_include_lin + + _tmp_exclude_lin = [] + for ex_lin in exclude_lin: + if "%" in ex_lin: + _list = dbm.get_list_of_lineages(ex_lin) + + if len(_list) > 0: + _tmp_exclude_lin.extend(_list) + ## if we don't find this wildcard so we discard it + else: + _tmp_exclude_lin.append(ex_lin) + exclude_lin = _tmp_exclude_lin + ### support paren-child relationship #### + if with_sublineage: + # print('sublineage query is enbable for all included lineages') + _tmp_include_lin = [] + + while include_lin: + in_lin = include_lin.pop(0) + value = self.lineage_sublineage_dict.get( + in_lin, "none" + ) # provide a default value if the key is missing: + + if value != "none": + _tmp_include_lin.append(in_lin) + _list = value.split(",") + # recursive way + for i in _list: + include_lin.append(i) + # _tmp_include_lin.append(i) + ## if we don't find this wildcard so we discard it + else: # None + _tmp_include_lin.append(in_lin) + _tmp_include_lin = list(dict.fromkeys(_tmp_include_lin)) + """ + # since we have a proper lineage file # the above code can be rewritten by below code # however, we are not sure the upcoming lineage assignment # so we stick the old method while include_lin: - in_lin = include_lin.pop(0) + in_lin = include_lin.pop(0) value = self.lineage_sublineage_dict.get(in_lin, 'none') if value != 'none': _list = value.split(',') @@ -3046,505 +3420,602 @@ def match(self, _tmp_include_lin.append(in_lin) """ - include_lin = _tmp_include_lin - #print(include_profiles) - #print(include_lin) - - ######################## - rows = dbm.match( - include_profiles, - exclude_profiles, - include_acc, - exclude_acc, - include_lin, - exclude_lin, - include_zip, - exclude_zip, - include_dates, - exclude_dates, - include_submission_dates, - exclude_submission_dates, - include_labs, - exclude_labs, - include_source, - exclude_source, - include_collection, - exclude_collection, - include_technology, - exclude_technology, - include_platform, - exclude_platform, - include_chemistry, - exclude_chemistry, - include_material, - exclude_material, - include_software, - exclude_software, - include_software_version, - exclude_software_version, - min_ct, - max_ct, - include_seqhash, - exclude_seqhash, - count, - frameshifts) - - # remove ambiguities from database profiles if wished - if not ambig and not count: - keep = [item for sublist in include_profiles for item in sublist] if include_profiles else None - for i in range(len(rows)): - rows[i]['dna_profile'] = self.filter_ambig(rows[i]['dna_profile'], self.iupac_explicit_nt_code, keep) - rows[i]['aa_profile'] = self.filter_ambig(rows[i]['aa_profile'], self.iupac_explicit_aa_code, keep) - elif count: - return 
rows[0]['count'] - - return rows - - - # VALIDATION - - def restore_genome_using_dnavars(self, acc, dbm = None): - """ - function to restore a genome sequence from the SONAR database using dna variation table - - Parameters - ---------- - - acc : str - define the accesion of the genome that should be restored - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - - Raises - ------ - - Each variant site stored in the database is checked, if the linked reference - nucleotide is correct. If not, program is terminated and an error shown. - - Returns - ------- - - tuple - tuple of the FASTA header and sequence of the respective genome. - None is returned if the given accession does not exist in the - database. - """ - - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) - rows = dbm.get_dna_vars(acc) - if rows: - prefix = "" - qryseq = list(self.refseq) - for row in rows: - if row['start'] is None: - continue - s = row['start'] - if s >= 0: - if row['ref'] != self.refseq[s]: - sys.exit("data error: data inconsistency found for '" + acc + "' (" + row['ref']+ " expected at position " + str(s+1) + " of the reference sequence, got " + self.refseq[s] + ").") - qryseq[s] = row['alt'] - else: - prefix = row['alt'] - return ">" + rows[0]['description'], prefix + "".join(qryseq) - else: - rows = dbm.get_genomes(acc) - if rows is None: - sys.exit("error: " + acc + " not found.") - return ">" + rows['description'], self.refseq - - def restore_genome_using_dnaprofile(self, acc, dbm=None): - """ - function to restore a genome sequence from the SONAR database using dna level profiles - - Parameters - ---------- - - acc : str - define the accesion of the genome that should be restored - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - - Raises - ------ - - Each variant site stored in the database is checked, if the linked reference - nucleotide is correct. If not, program is terminated and an error shown. - - Returns - ------- - - tuple - tuple of the FASTA header and sequence of the respective genome. - None is returned if the given accession does not exist in the - database. 
- """ - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) - profile = dbm.get_dna_profile(acc) - if profile: - qryseq = list(self.refseq) - prefix = "" - for var in profile.strip().split(" "): - if var.startswith("del:"): - var = var.split(":") - s = int(var[1])-1 - e = s + int(var[2]) - for i in range(s, e): - qryseq[i] = "" - elif var: - match = self.dnavar_grep_regex.search(var) - pos = int(match.group(2))-1 - ref = match.group(1) - alt = match.group(3) - if pos >= 0 and ref != self.refseq[pos]: - sys.exit("data error: data inconsistency found for '" + acc + "' (" + ref+ " expected at position " + str(pos+1) + " of the reference sequence, got " + self.refseq[pos] + ").") - if pos == -1: - prefix = alt - else: - qryseq[pos] = alt - return prefix + "".join(qryseq) - else: - row = dbm.get_genomes(acc) - if row is None: - sys.exit("error: " + acc + " not found.") - return ">" + row['description'], self.refseq - - def restore_alignment(self, acc, dbm=None): - """ - function to restore a genome alignment from the SONAR database - - Parameters - ---------- - - acc : str - define the accesion of the genome whose alignment versus the reference - should be restored - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - - Raises - ------ - - Each variant site stored in the database is checked, if the linked reference - nucleotide is correct. If not, program is terminated and an error shown. - - Returns - ------- - - tuple - tuple of the FASTA header and aligned sequence of the respective genome - followed by the FASTA header and aligned sequence of the reference genome. - None is returned if the given accession does not exist in the - database. 
- """ - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) - rows = dbm.get_dna_vars(acc) - if rows: - refseq = list(self.refseq) - qryseq = refseq[:] - for row in rows: - if row['start'] is not None: - s = row['start'] - if s >= 0: - if row['ref'] != self.refseq[s]: - sys.exit("data error: data inconsistency found for '" + acc + "' (" + row['ref']+ " expected at position " + str(s+1) + " of the reference sequence, got " + refseq[s] + ").") - qryseq[s] = "-" if not row['alt'] else row['alt'] - if len(row['alt']) > 1: - refseq[s] += "-" * (len(row['alt'])-1) - else: - qryseq = [row['alt']] + qryseq - refseq = ["-" * (len(row['alt']))] + refseq - return ">" + rows[0]['description'], "".join(qryseq), ">" + self.dbobj.refdescr, "".join(refseq) - return None - - - def be_paranoid(self, acc, orig_seq, auto_delete=False, dbm=None): - """ - function to compare a given sequence with the respective sequence restored - from the SONAR database - - Parameters - ---------- - - acc : str - define the accesion of the genome that should be validated - orig_seq : str - define the sequence expected - dbm : sonarDBManager object [ None ] - define a sonarDBManager object to use for database transaction - auto_delete : bool [ False ] - define if the respective genome should be automatically deleted - from the SONAR database if the test fails - - Returns - ------- - - bool - True is returned if expected and restored sequences are not different - otherwise False - """ - orig_seq = self.harmonize(orig_seq) - - with ExitStack() as stack: - if dbm is None: - dbm = stack.enter_context(sonarDBManager(self.db)) - - # dna table check - s = self.restore_genome_using_dnavars(acc, dbm)[1] - if orig_seq != s: - if auto_delete: - dbm.delete_genome(acc) - fd, path = mkstemp(suffix=".fna", prefix="paranoid_", dir=".") - with open(path, "w") as handle: - handle.write(">original " + acc + "\n" + orig_seq + "\n" + ">restored " + acc + "\n" + orig_seq) - sys.exit("Good that you are paranoid: " + acc + " original and those restored from dna table do not match (sequences stored in " + path + ").") - - #dna profile check - s = self.restore_genome_using_dnaprofile(acc, dbm) - if orig_seq != s: - if auto_delete: - dbm.delete_genome(acc) - fd, path = mkstemp(suffix=".fna", prefix="paranoid_", dir="./") - with open(path, "w") as handle: - handle.write(">original " + acc + "\n" + orig_seq + "\n" + ">restored " + acc + "\n" + orig_seq) - sys.exit("Good that you are paranoid: " + acc + " original and those restored from its dna profile do not match (sequences stored in " + path + ").") - - #frameshift checks - row = self.match(accessions=[acc], ambig=True, dbm = dbm)[0] - fs = set() - for dna_var in row['dna_profile'].split(" "): - if dna_var.strip() == "": - continue - if self.is_frameshift(dna_var): - fs.add(dna_var) - - db_fs = set(filter(None, row['fs_profile'].split(" "))) - missing_fs = [x for x in fs if x not in db_fs] - wrong_fs = [x for x in db_fs if x not in fs] - if wrong_fs: - if auto_delete: - dbm.delete_genome(acc) - fd, path = mkstemp(suffix=".csv", prefix="paranoid_", dir="./") - with open(path, "w") as handle: - writer = csv.DictWriter(handle, row.keys(), lineterminator=os.linesep) - writer.writeheader() - writer.writerows([row]) - sys.exit("Good that you are paranoid: " + ", ".join(wrong_fs) + " not expected in frameshift profile of " + acc + " (profiles stored in " + path + ").") - - if missing_fs: - if auto_delete: - dbm.delete_genome(acc) - fd, path = 
mkstemp(suffix=".csv", prefix="paranoid_", dir="./") - with open(path, "w") as handle: - writer = csv.DictWriter(handle, row.keys(), lineterminator=os.linesep) - writer.writeheader() - writer.writerows([row]) - sys.exit("Good that you are paranoid: " + ", ".join(missing_fs) + " missing in frameshift profile of " + acc + " (profiles stored in " + path + ").") - - - return True - - @staticmethod - def get_version(): - return SUPPORTED_DB_VERSION - -class sonarCache(): - """ - this object manages permanent and temporary file caches - - Notes - ----- - - This class should be included via context manager to ensure that accession - index is written and cleaning temporary objects is performed after abnormal - program termination. - - In the SONAR cache for each unique sequence that has been cached a FASTA file - containing the sequence. That files are named by the slugified hash of the - sequence they contain while the used FASTA header represent the hash. Pre-processed - data provided by the sonarDB.process_fasta is stored in info files als named by - the slugified hash of the respective sequence they are related to (PICKLE format). - The link between sequence hash and accession(s) is stored in the cache attribute and, - when closing the cache, written to the index file (PICKLE format). - - Parameters - ---------- - dir : str - define a path to an non-existent, empty or valid SONAR cache directory. - If None, a temporary cache directoryis created and deleted after use. - [ None ] - - Attributes - ---------- - dirname : str - stores the absolute path to the cache directory - temp : bool - stores True if the cache is temporary and will be deleted after use - otherwise False - cache : dict - stores a dictionary whose keys are hashes of cached genome sequences and - and values tuples of linked accessions and FASTA headers - - """ - def __init__(self, dir=None): - self.temp = not bool(dir) - self.cache = defaultdict(set) - self._fasta_ext = ".fasta" - self._info_ext = ".info" - self._algn_ext = ".algn" - - if self.temp: - self.dirname = mkdtemp(prefix=".sonarCache_") - else: - self.dirname = os.path.abspath(dir) - self.checkdir() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - if [exc_type, exc_value, exc_traceback].count(None) != 3: - print("warning:", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) - if os.path.isdir(self.dirname) and self.temp: - shutil.rmtree(self.dirname) - - def checkdir(self): - if not os.path.isdir(self.dirname): - os.makedirs(self.dirname) - - @staticmethod - def slugify(string): - """ - function to provide a file-system- and collision-safe representation of - a given string - - Parameters - ---------- - - string : str - define the string to slugify - - Returns - ------- - - str - a file-system- and collision-safe representation of - the original string - """ - return base64.urlsafe_b64encode(string.encode('UTF-8') ).decode('UTF-8') - - @staticmethod - def deslugify(string): - return base64.urlsafe_b64decode(string).decode("utf-8") - - @staticmethod - def get_seqhash_from_fasta_name(fname): - return sonarCache.deslugify(os.path.basename(fname))[:-len(self._fasta_ext)] - - def iter_fasta(self, fname): - """ - function to iterate records of a given FASTA file - - Parameters - ---------- - - fname : str - define the path to a valid FASTA file - - Returns - ------- - - tuple - for each record a tuple is returned consisting of - - accession - - FASTA header - - upper-case sequence - """ - for record in 
SeqIO.read(fname, "fasta"): - yield record.id, record.description, str(record.seq).upper() - - def read_cached_fasta(self, seqhash): - record = SeqIO.read(self.get_fasta_fname(seqhash), "fasta") - return record.id, record.description[1:], str(record.seq).upper() - - def get_cached_filename(self, seqhash, ext=""): - basename = self.slugify(seqhash) - return os.path.join(self.dirname, basename[:2], basename + ext) - - def get_fasta_fname(self, seqhash): - return self.get_cached_filename(seqhash, self._fasta_ext) - - def get_algn_fname(self, seqhash): - return self.get_cached_filename(seqhash, self._algn_ext) - - def get_info_fname(self, seqhash): - return self.get_cached_filename(seqhash, self._info_ext) - - def prep_cached_files(self, seqhash): - fasta = self.get_fasta_fname(seqhash) - algn = self.get_algn_fname(seqhash) - info = self.get_info_fname(seqhash) - os.makedirs(os.path.dirname(fasta), exist_ok=True) - return fasta, algn, info - - def load_info(self, seqhash): - with open(self.get_info_fname(seqhash), 'rb') as handle: - return pickle.load(handle, encoding="bytes") - - def write_info(self, seqhash, data={}): - data['seqhash'] = seqhash - with open(self.get_info_fname(seqhash), 'wb') as handle: - pickle.dump(data, handle) - - def add_seq(self, seqhash, seq): - """ - function to add a sequence to the cache - - Parameters - ---------- - - seqhash : str - define the seqhash of the sequence - seq : str - define the sequence + include_lin = _tmp_include_lin + # print(include_profiles) + # print(include_lin) + + ######################## + rows = dbm.match( + include_profiles, + exclude_profiles, + include_acc, + exclude_acc, + include_lin, + exclude_lin, + include_zip, + exclude_zip, + include_dates, + exclude_dates, + include_submission_dates, + exclude_submission_dates, + include_labs, + exclude_labs, + include_source, + exclude_source, + include_collection, + exclude_collection, + include_technology, + exclude_technology, + include_platform, + exclude_platform, + include_chemistry, + exclude_chemistry, + include_material, + exclude_material, + include_software, + exclude_software, + include_software_version, + exclude_software_version, + min_ct, + max_ct, + include_seqhash, + exclude_seqhash, + count, + frameshifts, + ) + + # remove ambiguities from database profiles if wished + if not ambig and not count: + keep = ( + [item for sublist in include_profiles for item in sublist] + if include_profiles + else None + ) + for i in range(len(rows)): + rows[i]["dna_profile"] = self.filter_ambig( + rows[i]["dna_profile"], self.iupac_explicit_nt_code, keep + ) + rows[i]["aa_profile"] = self.filter_ambig( + rows[i]["aa_profile"], self.iupac_explicit_aa_code, keep + ) + elif count: + return rows[0]["count"] + + return rows + + # VALIDATION + + def restore_genome_using_dnavars(self, acc, dbm=None): + """ + function to restore a genome sequence from the SONAR database using dna variation table + + Parameters + ---------- + + acc : str + define the accesion of the genome that should be restored + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + + Raises + ------ + + Each variant site stored in the database is checked, if the linked reference + nucleotide is correct. If not, program is terminated and an error shown. + + Returns + ------- + + tuple + tuple of the FASTA header and sequence of the respective genome. + None is returned if the given accession does not exist in the + database. 
+ """ + + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) + rows = dbm.get_dna_vars(acc) + if rows: + prefix = "" + qryseq = list(self.refseq) + for row in rows: + if row["start"] is None: + continue + s = row["start"] + if s >= 0: + if row["ref"] != self.refseq[s]: + sys.exit( + "data error: data inconsistency found for '" + + acc + + "' (" + + row["ref"] + + " expected at position " + + str(s + 1) + + " of the reference sequence, got " + + self.refseq[s] + + ")." + ) + qryseq[s] = row["alt"] + else: + prefix = row["alt"] + return ">" + rows[0]["description"], prefix + "".join(qryseq) + else: + rows = dbm.get_genomes(acc) + if rows is None: + sys.exit("error: " + acc + " not found.") + return ">" + rows["description"], self.refseq + + def restore_genome_using_dnaprofile(self, acc, dbm=None): + """ + function to restore a genome sequence from the SONAR database using dna level profiles + + Parameters + ---------- + + acc : str + define the accesion of the genome that should be restored + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + + Raises + ------ + + Each variant site stored in the database is checked, if the linked reference + nucleotide is correct. If not, program is terminated and an error shown. + + Returns + ------- + + tuple + tuple of the FASTA header and sequence of the respective genome. + None is returned if the given accession does not exist in the + database. + """ + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) + profile = dbm.get_dna_profile(acc) + if profile: + qryseq = list(self.refseq) + prefix = "" + for var in profile.strip().split(" "): + if var.startswith("del:"): + var = var.split(":") + s = int(var[1]) - 1 + e = s + int(var[2]) + for i in range(s, e): + qryseq[i] = "" + elif var: + match = self.dnavar_grep_regex.search(var) + pos = int(match.group(2)) - 1 + ref = match.group(1) + alt = match.group(3) + if pos >= 0 and ref != self.refseq[pos]: + sys.exit( + "data error: data inconsistency found for '" + + acc + + "' (" + + ref + + " expected at position " + + str(pos + 1) + + " of the reference sequence, got " + + self.refseq[pos] + + ")." + ) + if pos == -1: + prefix = alt + else: + qryseq[pos] = alt + return prefix + "".join(qryseq) + else: + row = dbm.get_genomes(acc) + if row is None: + sys.exit("error: " + acc + " not found.") + return ">" + row["description"], self.refseq + + def restore_alignment(self, acc, dbm=None): + """ + function to restore a genome alignment from the SONAR database + + Parameters + ---------- + + acc : str + define the accesion of the genome whose alignment versus the reference + should be restored + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + + Raises + ------ + + Each variant site stored in the database is checked, if the linked reference + nucleotide is correct. If not, program is terminated and an error shown. + + Returns + ------- + + tuple + tuple of the FASTA header and aligned sequence of the respective genome + followed by the FASTA header and aligned sequence of the reference genome. + None is returned if the given accession does not exist in the + database. 
+ """ + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db, readonly=True)) + rows = dbm.get_dna_vars(acc) + if rows: + refseq = list(self.refseq) + qryseq = refseq[:] + for row in rows: + if row["start"] is not None: + s = row["start"] + if s >= 0: + if row["ref"] != self.refseq[s]: + sys.exit( + "data error: data inconsistency found for '" + + acc + + "' (" + + row["ref"] + + " expected at position " + + str(s + 1) + + " of the reference sequence, got " + + refseq[s] + + ")." + ) + qryseq[s] = "-" if not row["alt"] else row["alt"] + if len(row["alt"]) > 1: + refseq[s] += "-" * (len(row["alt"]) - 1) + else: + qryseq = [row["alt"]] + qryseq + refseq = ["-" * (len(row["alt"]))] + refseq + return ( + ">" + rows[0]["description"], + "".join(qryseq), + ">" + self.dbobj.refdescr, + "".join(refseq), + ) + return None + + def be_paranoid(self, acc, orig_seq, auto_delete=False, dbm=None): + """ + function to compare a given sequence with the respective sequence restored + from the SONAR database + + Parameters + ---------- + + acc : str + define the accesion of the genome that should be validated + orig_seq : str + define the sequence expected + dbm : sonarDBManager object [ None ] + define a sonarDBManager object to use for database transaction + auto_delete : bool [ False ] + define if the respective genome should be automatically deleted + from the SONAR database if the test fails + + Returns + ------- + + bool + True is returned if expected and restored sequences are not different + otherwise False + """ + orig_seq = self.harmonize(orig_seq) + + with ExitStack() as stack: + if dbm is None: + dbm = stack.enter_context(sonarDBManager(self.db)) + + # dna table check + s = self.restore_genome_using_dnavars(acc, dbm)[1] + if orig_seq != s: + if auto_delete: + dbm.delete_genome(acc) + fd, path = mkstemp(suffix=".fna", prefix="paranoid_", dir=".") + with open(path, "w") as handle: + handle.write( + ">original " + + acc + + "\n" + + orig_seq + + "\n" + + ">restored " + + acc + + "\n" + + orig_seq + ) + sys.exit( + "Good that you are paranoid: " + + acc + + " original and those restored from dna table do not match (sequences stored in " + + path + + ")." + ) + + # dna profile check + s = self.restore_genome_using_dnaprofile(acc, dbm) + if orig_seq != s: + if auto_delete: + dbm.delete_genome(acc) + fd, path = mkstemp(suffix=".fna", prefix="paranoid_", dir="./") + with open(path, "w") as handle: + handle.write( + ">original " + + acc + + "\n" + + orig_seq + + "\n" + + ">restored " + + acc + + "\n" + + orig_seq + ) + sys.exit( + "Good that you are paranoid: " + + acc + + " original and those restored from its dna profile do not match (sequences stored in " + + path + + ")." 
+ ) + + # frameshift checks + row = self.match(accessions=[acc], ambig=True, dbm=dbm)[0] + fs = set() + for dna_var in row["dna_profile"].split(" "): + if dna_var.strip() == "": + continue + if self.is_frameshift(dna_var): + fs.add(dna_var) + + db_fs = set(filter(None, row["fs_profile"].split(" "))) + missing_fs = [x for x in fs if x not in db_fs] + wrong_fs = [x for x in db_fs if x not in fs] + if wrong_fs: + if auto_delete: + dbm.delete_genome(acc) + fd, path = mkstemp(suffix=".csv", prefix="paranoid_", dir="./") + with open(path, "w") as handle: + writer = csv.DictWriter( + handle, row.keys(), lineterminator=os.linesep + ) + writer.writeheader() + writer.writerows([row]) + sys.exit( + "Good that you are paranoid: " + + ", ".join(wrong_fs) + + " not expected in frameshift profile of " + + acc + + " (profiles stored in " + + path + + ")." + ) + + if missing_fs: + if auto_delete: + dbm.delete_genome(acc) + fd, path = mkstemp(suffix=".csv", prefix="paranoid_", dir="./") + with open(path, "w") as handle: + writer = csv.DictWriter( + handle, row.keys(), lineterminator=os.linesep + ) + writer.writeheader() + writer.writerows([row]) + sys.exit( + "Good that you are paranoid: " + + ", ".join(missing_fs) + + " missing in frameshift profile of " + + acc + + " (profiles stored in " + + path + + ")." + ) + + return True + + @staticmethod + def get_version(): + return SUPPORTED_DB_VERSION + + +class sonarCache: + """ + this object manages permanent and temporary file caches + + Notes + ----- + + This class should be included via context manager to ensure that accession + index is written and cleaning temporary objects is performed after abnormal + program termination. + + In the SONAR cache for each unique sequence that has been cached a FASTA file + containing the sequence. That files are named by the slugified hash of the + sequence they contain while the used FASTA header represent the hash. Pre-processed + data provided by the sonarDB.process_fasta is stored in info files als named by + the slugified hash of the respective sequence they are related to (PICKLE format). + The link between sequence hash and accession(s) is stored in the cache attribute and, + when closing the cache, written to the index file (PICKLE format). + + Parameters + ---------- + dir : str + define a path to an non-existent, empty or valid SONAR cache directory. + If None, a temporary cache directoryis created and deleted after use. 
+ [ None ] + + Attributes + ---------- + dirname : str + stores the absolute path to the cache directory + temp : bool + stores True if the cache is temporary and will be deleted after use + otherwise False + cache : dict + stores a dictionary whose keys are hashes of cached genome sequences and + and values tuples of linked accessions and FASTA headers + + """ + + def __init__(self, dir=None): + self.temp = not bool(dir) + self.cache = defaultdict(set) + self._fasta_ext = ".fasta" + self._info_ext = ".info" + self._algn_ext = ".algn" + + if self.temp: + self.dirname = mkdtemp(prefix=".sonarCache_") + else: + self.dirname = os.path.abspath(dir) + self.checkdir() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + if [exc_type, exc_value, exc_traceback].count(None) != 3: + print("warning:", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + if os.path.isdir(self.dirname) and self.temp: + shutil.rmtree(self.dirname) + + def checkdir(self): + if not os.path.isdir(self.dirname): + os.makedirs(self.dirname) + + @staticmethod + def slugify(string): + """ + function to provide a file-system- and collision-safe representation of + a given string + + Parameters + ---------- + + string : str + define the string to slugify + + Returns + ------- + + str + a file-system- and collision-safe representation of + the original string + """ + return base64.urlsafe_b64encode(string.encode("UTF-8")).decode("UTF-8") + + @staticmethod + def deslugify(string): + return base64.urlsafe_b64decode(string).decode("utf-8") + + @staticmethod + def get_seqhash_from_fasta_name(fname): + return sonarCache.deslugify(os.path.basename(fname))[: -len(self._fasta_ext)] + + def iter_fasta(self, fname): + """ + function to iterate records of a given FASTA file + + Parameters + ---------- + + fname : str + define the path to a valid FASTA file + + Returns + ------- + + tuple + for each record a tuple is returned consisting of + - accession + - FASTA header + - upper-case sequence + """ + for record in SeqIO.read(fname, "fasta"): + yield record.id, record.description, str(record.seq).upper() + + def read_cached_fasta(self, seqhash): + record = SeqIO.read(self.get_fasta_fname(seqhash), "fasta") + return record.id, record.description[1:], str(record.seq).upper() + + def get_cached_filename(self, seqhash, ext=""): + basename = self.slugify(seqhash) + return os.path.join(self.dirname, basename[:2], basename + ext) + + def get_fasta_fname(self, seqhash): + return self.get_cached_filename(seqhash, self._fasta_ext) + + def get_algn_fname(self, seqhash): + return self.get_cached_filename(seqhash, self._algn_ext) + + def get_info_fname(self, seqhash): + return self.get_cached_filename(seqhash, self._info_ext) + + def prep_cached_files(self, seqhash): + fasta = self.get_fasta_fname(seqhash) + algn = self.get_algn_fname(seqhash) + info = self.get_info_fname(seqhash) + os.makedirs(os.path.dirname(fasta), exist_ok=True) + return fasta, algn, info + + def load_info(self, seqhash): + with open(self.get_info_fname(seqhash), "rb") as handle: + return pickle.load(handle, encoding="bytes") + + def write_info(self, seqhash, data={}): + data["seqhash"] = seqhash + with open(self.get_info_fname(seqhash), "wb") as handle: + pickle.dump(data, handle) + + def add_seq(self, seqhash, seq): + """ + function to add a sequence to the cache + + Parameters + ---------- + + seqhash : str + define the seqhash of the sequence + seq : str + define the sequence + + """ + fasta, align, info = 
self.prep_cached_files(seqhash) + + # check for sequence hash collision + if not os.path.isfile(fasta): + with open(fasta, "w") as handle: + handle.write(">" + seqhash + os.linesep + seq) + elif seq != self.read_cached_fasta(seqhash)[2]: + sys.exit("cache error: sequence hash collision for hash '" + seqhash + "'.") - """ - fasta, align, info = self.prep_cached_files(seqhash) + def get_cached_seqhashes(self): + return set(self.cache.keys()) - # check for sequence hash collision - if not os.path.isfile(fasta): - with open(fasta, "w") as handle: - handle.write(">" + seqhash + os.linesep + seq) - elif seq != self.read_cached_fasta(seqhash)[2]: - sys.exit("cache error: sequence hash collision for hash '" + seqhash + "'.") + def iter_cached_fasta_files(self): + for x in self.cache: + yield self.get_fasta_fname(x) - def get_cached_seqhashes(self): - return set(self.cache.keys()) + def get_cached_seq(self, seqhash): + return self.read_cached_fasta(seqhash)[-1] - def iter_cached_fasta_files(self): - for x in self.cache: - yield self.get_fasta_fname(x) - - def get_cached_seq(self, seqhash): - return self.read_cached_fasta(seqhash)[-1] if __name__ == "__main__": - import doctest - global DOCTESTDIR, DOCTESTDB, QRY_FASTA_FILE, REF_FASTA_FILE - print("sonarDB", sonarDB.get_version()) - print("performing unit tests ...") - with TemporaryDirectory() as tmpdirname: - this_path = os.path.dirname(os.path.realpath(__file__)) - DOCTESTDIR = tmpdirname - DOCTESTDB = os.path.join(DOCTESTDIR, "testdb") - QRY_FASTA_FILE = os.path.join(this_path, "doctest_b117.fna") - QRY_PICKLE_FILE = os.path.join(this_path, "doctest_b117.pickle") - REF_FASTA_FILE = os.path.join(this_path, "ref.fna") - REF_GFF_FILE = os.path.join(this_path, "ref.gff3") - print(doctest.testmod(verbose=False)) + import doctest + + global DOCTESTDIR, DOCTESTDB, QRY_FASTA_FILE, REF_FASTA_FILE + print("sonarDB", sonarDB.get_version()) + print("performing unit tests ...") + with TemporaryDirectory() as tmpdirname: + this_path = os.path.dirname(os.path.realpath(__file__)) + DOCTESTDIR = tmpdirname + DOCTESTDB = os.path.join(DOCTESTDIR, "testdb") + QRY_FASTA_FILE = os.path.join(this_path, "doctest_b117.fna") + QRY_PICKLE_FILE = os.path.join(this_path, "doctest_b117.pickle") + REF_FASTA_FILE = os.path.join(this_path, "ref.fna") + REF_GFF_FILE = os.path.join(this_path, "ref.gff3") + print(doctest.testmod(verbose=False)) diff --git a/lib/sonartoVCF.py b/lib/sonartoVCF.py old mode 100755 new mode 100644 index e3459ed..0dfa4b0 --- a/lib/sonartoVCF.py +++ b/lib/sonartoVCF.py @@ -1,133 +1,159 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import os -from lib.sonardb import sonarDBManager from contextlib import ExitStack -from more_itertools import consecutive_groups, split_when -from tempfile import mkstemp, mkdtemp +import gzip +import math +from multiprocessing import Pool +import os +from os import getpid import shutil -import pandas as pd -import numpy as np import subprocess -from os import getpid -from multiprocessing import Pool +from tempfile import mkdtemp import warnings -import math + +from lib.sonardb import sonarDBManager +import numpy as np +import pandas as pd from tqdm import tqdm -import gzip -warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) +warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) -def create_fix_vcf_header(ref,sample_id): - header = "##fileformat=VCFv4.2\n##poweredby=CovSonarV1.1.4\n##reference="+ref + +def create_fix_vcf_header(ref, sample_id): + header = 
"##fileformat=VCFv4.2\n##poweredby=CovSonarV1.1.4\n##reference=" + ref format = '\n##FORMAT=' info = '\n##INFO=' - info = info+'\n##INFO=\n' - note = '' #"##Note_1='Currently we ignore DEL of the SARS-CoV-2 seqeunce'\n" - column = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+sample_id+"\n" - return header+format+info+note+column - -from multiprocessing import Pool + info = ( + info + + '\n##INFO=\n' + ) + note = "" # "##Note_1='Currently we ignore DEL of the SARS-CoV-2 seqeunce'\n" + column = ( + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_id + "\n" + ) + return header + format + info + note + column def bgzip(filename): """Call bgzip to compress a file.""" - cmd = ['bgzip', '-f', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["bgzip", "-f", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise return + def tabix_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['tabix', '-p', 'vcf', filename+'.gz'] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["tabix", "-p", "vcf", filename + ".gz"] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return + def bcftool_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['bcftools', 'index', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["bcftools", "index", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return -def create_vcf(rows_grouped, tmp_dirname, refdescr,_pos): - process_id =str(getpid()) + +def create_vcf(rows_grouped, tmp_dirname, refdescr, _pos): + process_id = str(getpid()) # print(process_id+" Start") # iterate over each group # position=_pos,bar_format='{l_bar}{bar:10}{r_bar}{bar:-2b}' for group_name, df_group in tqdm(rows_grouped, mininterval=0.5): - #print("Create VCF file:",group_name) - vcf_filename =group_name+'.vcf' - full_path = os.path.join(tmp_dirname,vcf_filename) - with open(full_path, 'w') as f: - f.write(create_fix_vcf_header(refdescr,group_name)) - df_group = df_group.sort_values(by='start', ascending=True) + # print("Create VCF file:",group_name) + vcf_filename = group_name + ".vcf" + full_path = os.path.join(tmp_dirname, vcf_filename) + with open(full_path, "w") as 
f: + f.write(create_fix_vcf_header(refdescr, group_name)) + df_group = df_group.sort_values(by="start", ascending=True) # replace null to . - #df_group['ref'] = df_group['ref'].replace('', '.') # for insertion - #df_group['alt'] = df_group['alt'].replace('', '.') # for deltion - #df_group['alt'] = df_group['alt'].replace('', np.nan) - #df_group = df_group.dropna(axis=0, subset=['alt']) # remove Deletion + # df_group['ref'] = df_group['ref'].replace('', '.') # for insertion + # df_group['alt'] = df_group['alt'].replace('', '.') # for deltion + # df_group['alt'] = df_group['alt'].replace('', np.nan) + # df_group = df_group.dropna(axis=0, subset=['alt']) # remove Deletion for index, row in df_group.iterrows(): - id = row['ref']+str(row['start'])+row['alt'] - f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(row['CHROM'], row['start'], id, - row['ref'], row['alt'],row['QUAL'],row['FILTER'], - row['INFO'],row['FORMAT'],"1")) + id = row["ref"] + str(row["start"]) + row["alt"] + f.write( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + row["CHROM"], + row["start"], + id, + row["ref"], + row["alt"], + row["QUAL"], + row["FILTER"], + row["INFO"], + row["FORMAT"], + "1", + ) + ) bgzip(full_path) - tabix_index(full_path) + tabix_index(full_path) # print(process_id+" Finish") - -def parallelize_dataframe(df, tmp_dir, num_cores,refdescr, func): +def parallelize_dataframe(df, tmp_dir, num_cores, refdescr, func): _tmp_lis = np.array_split(df, num_cores) - counter=0 - zip_items = [(_tmp_lis[i],tmp_dir,refdescr,i+2) for i in range(len(_tmp_lis))] # same order - #with Pool(processes=num_cores) as pool: + counter = 0 + zip_items = [ + (_tmp_lis[i], tmp_dir, refdescr, i + 2) for i in range(len(_tmp_lis)) + ] # same order + # with Pool(processes=num_cores) as pool: # res = pool.starmap(func, zip_items) pool = Pool(num_cores) pool.starmap(func, zip_items) - # finish all tasks + # finish all tasks pool.close() pool.join() + def export2VCF( db_path, include_acc, include_dates, output, num_cores, - refdescr="No ref is provided" - ): - print('----- You are using sonartoVCF_V1 --------') - print('Prepare export2VCF workspace for',num_cores,'cpu') + refdescr="No ref is provided", +): + print("----- You are using sonartoVCF_V1 --------") + print("Prepare export2VCF workspace for", num_cores, "cpu") with ExitStack() as stack: dbm = stack.enter_context(sonarDBManager(db_path)) @@ -135,153 +161,164 @@ def export2VCF( where_vals = [] if include_acc: - where_clause.append(dbm.get_metadata_in_condition('accession' - , *include_acc)) + where_clause.append( + dbm.get_metadata_in_condition("accession", *include_acc) + ) where_vals.extend(include_acc) if include_dates: - where_clause.append(dbm.get_metadata_date_condition('date', - *include_dates)) + where_clause.append(dbm.get_metadata_date_condition("date", *include_dates)) - #print(where_clause) - fields = 'accession, start, end, alt, ref ' + # print(where_clause) + fields = "accession, start, end, alt, ref " if where_clause: - sql = "SELECT " + fields + " FROM dna_view WHERE " + " AND ".join(where_clause) + ";" + sql = ( + "SELECT " + + fields + + " FROM dna_view WHERE " + + " AND ".join(where_clause) + + ";" + ) else: sql = "SELECT " + fields + " FROM dna_view;" - ############################## - print('Start Bigquery...') - rows = pd.read_sql(sql, - dbm.connection,params=where_vals) - print('Return:', len(rows), ' records') + print("Start Bigquery...") + rows = pd.read_sql(sql, dbm.connection, params=where_vals) + print("Return:", len(rows), " records") 
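
The query step above builds an optional WHERE clause from the accession and date filters and passes the collected values to `pandas.read_sql` via `params`, so they are bound by the database driver rather than pasted into the SQL string. A stripped-down sketch of the same pattern, using a plain `sqlite3` connection instead of `sonarDBManager` (the `dna_view` fields shown are the ones selected above; everything else is illustrative):

```python
# Sketch: parameterized SELECT on dna_view, loaded into a DataFrame.
# Uses a raw sqlite3 connection for brevity; covSonar goes through
# sonarDBManager instead.
import sqlite3

import pandas as pd


def fetch_dna_rows(db_path, include_acc=None):
    where_clause, where_vals = [], []
    if include_acc:
        placeholders = ", ".join(["?"] * len(include_acc))
        where_clause.append("accession IN (" + placeholders + ")")
        where_vals.extend(include_acc)

    sql = "SELECT accession, start, end, alt, ref FROM dna_view"
    if where_clause:
        sql += " WHERE " + " AND ".join(where_clause)

    with sqlite3.connect(db_path) as con:
        # values in where_vals are bound as parameters, never interpolated
        return pd.read_sql(sql, con, params=where_vals)
```
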
track_vcf = [] count = 0 if not rows.empty: - tmp_dirname = mkdtemp( prefix=".sonarCache_") + tmp_dirname = mkdtemp(prefix=".sonarCache_") # vcf_path=os.path.join(tmp_dirname,) # create fasta_id chrom_id = refdescr.split()[0].replace(">", "") - rows['CHROM'] = chrom_id - rows['QUAL'] = '.' - rows['FILTER'] = '.' - rows['INFO'] = 'AC=1;AN=1' - rows['FORMAT'] = 'GT' + rows["CHROM"] = chrom_id + rows["QUAL"] = "." + rows["FILTER"] = "." + rows["INFO"] = "AC=1;AN=1" + rows["FORMAT"] = "GT" # POS or start position: The reference position, with the 1st base is position 1 not 0 , but in covsonar use 0 as the 1st position # so we should + 1 # http://samtools.github.io/hts-specs/VCFv4.2.pdf - rows['start'] = rows['start'] + 1 - rows_grouped = rows.groupby('accession') - for group_name, df_group in rows_grouped: - group_name=os.path.join(tmp_dirname, group_name+'.vcf.gz') - full_path = os.path.join(tmp_dirname,group_name) + rows["start"] = rows["start"] + 1 + rows_grouped = rows.groupby("accession") + for group_name, df_group in rows_grouped: + group_name = os.path.join(tmp_dirname, group_name + ".vcf.gz") + full_path = os.path.join(tmp_dirname, group_name) track_vcf.append(full_path) - print('With :', len(track_vcf), ' accessions') + print("With :", len(track_vcf), " accessions") # split data and write each ACC into individual VCF file. - print('Start Divide and Conquer ...') - parallelize_dataframe(rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf) - - # bundle all vcf together - print('Integrate all VCFs ...') - divide_merge_vcf(track_vcf, output, num_cores) + print("Start Divide and Conquer ...") + parallelize_dataframe( + rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf + ) + # bundle all vcf together + print("Integrate all VCFs ...") + divide_merge_vcf(track_vcf, output, num_cores) if os.path.isdir(tmp_dirname): shutil.rmtree(tmp_dirname) print("Finish! 
compress final result (gz):") - + + def divide_merge_vcf(list_track_vcf, global_output, num_cores): - chunk=500 - list_length = math.ceil(len(list_track_vcf)/chunk) # try to merge every - print('size:', list_length) - first_create_ = True + chunk = 500 + list_length = math.ceil(len(list_track_vcf) / chunk) # try to merge every + print("size:", list_length) + first_create_ = True second_create_ = True - tmp_dirname = mkdtemp( prefix=".final.sonarCache_") + tmp_dirname = mkdtemp(prefix=".final.sonarCache_") # we can tweak performance by using U at Bcftools for piping between bcftools subcommands (future work) bar = tqdm(range(list_length), desc="Create Global VCF:") - merge_type='b' + merge_type = "b" for i in bar: - _vcfs = " ".join(list_track_vcf[chunk*i:chunk*i+chunk]) + _vcfs = " ".join(list_track_vcf[chunk * i : chunk * i + chunk]) - - if(len(list_track_vcf)==1): - tmp_output = list_track_vcf[i].replace('.gz', '') + if len(list_track_vcf) == 1: + tmp_output = list_track_vcf[i].replace(".gz", "") continue - if(i == list_length-1): - merge_type='v' - #print('final merge') - + if i == list_length - 1: + merge_type = "v" + # print('final merge') if first_create_: - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs,tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + tmp_output = os.path.join(tmp_dirname, "vcf.2") + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) first_create_ = False second_create_ = True third_create_ = True elif second_create_: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.2' ) - tmp_output = os.path.join(tmp_dirname,'vcf.3' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.2") + tmp_output = os.path.join(tmp_dirname, "vcf.3") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = False third_create_ = True else: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.3' ) - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.3") + tmp_output = os.path.join(tmp_dirname, "vcf.2") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip( tmp_output) - #tabix_index(tmp_output) + # bgzip( tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = True third_create_ = False - if(merge_type =='v'): + if merge_type == "v": bgzip(tmp_output) - tabix_index(tmp_output) - tmp_output = 
clean_stranger_things(tmp_output+ '.gz', tmp_dirname) - shutil.copy(tmp_output, global_output+ '.gz') - - print('Clean workspace ...') + tabix_index(tmp_output) + tmp_output = clean_stranger_things(tmp_output + ".gz", tmp_dirname) + shutil.copy(tmp_output, global_output + ".gz") + + print("Clean workspace ...") if os.path.isdir(tmp_dirname): shutil.rmtree(tmp_dirname) - #if not first_create_ and third_create_ and second_create_: + # if not first_create_ and third_create_ and second_create_: # os.rename( global_output + '.2.gz', global_output+ '.gz') - #elif second_create_ and not third_create_: + # elif second_create_ and not third_create_: # os.rename( global_output + '.2.gz', global_output+ '.gz') - #elif not second_create_ and third_create_: + # elif not second_create_ and third_create_: # os.rename(global_output + '.3.gz', global_output+ '.gz') - + + def clean_stranger_things(path_to_vcfgz, tmp_dirname): - print('Clean strange things in vcf ...') - output_path_file = os.path.join(tmp_dirname,'vcf.final.gz' ) - with gzip.open(path_to_vcfgz, 'rt') as f: - with gzip.open(output_path_file, 'wt') as output_file: + print("Clean strange things in vcf ...") + output_path_file = os.path.join(tmp_dirname, "vcf.final.gz") + with gzip.open(path_to_vcfgz, "rt") as f: + with gzip.open(output_path_file, "wt") as output_file: for line in f: - if line.startswith('#'): - if 'bcftools_mergeCommand' in line: + if line.startswith("#"): + if "bcftools_mergeCommand" in line: continue else: output_file.write(line) else: - rows = line.split('\t') - ### fix duplicate + rows = line.split("\t") + ### fix duplicate ID = rows[2] - rows[2]= ";".join(set(ID.split(';'))) - + rows[2] = ";".join(set(ID.split(";"))) + output_file.write("\t".join(rows)) return output_path_file diff --git a/lib/sonartoVCF_v2.bak.py b/lib/sonartoVCF_v2.bak.py old mode 100755 new mode 100644 index ff42924..da75e1e --- a/lib/sonartoVCF_v2.bak.py +++ b/lib/sonartoVCF_v2.bak.py @@ -1,81 +1,91 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import os -from lib.sonardb import sonarDBManager from contextlib import ExitStack -from more_itertools import consecutive_groups, split_when -from tempfile import mkstemp, mkdtemp +import gzip +import math +from multiprocessing import Pool +import os +from os import getpid import shutil -import pandas as pd -import numpy as np import subprocess -from os import getpid -from multiprocessing import Pool +from tempfile import mkdtemp +import traceback import warnings -import math + +from lib.sonardb import sonarDBManager +import numpy as np +import pandas as pd from tqdm import tqdm -warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) -import traceback -import gzip + +warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) def create_fix_vcf_header(ref): - header = "##fileformat=VCFv4.2\n##CreatedBy=covSonarV1.1.3\n##reference="+ref + header = "##fileformat=VCFv4.2\n##CreatedBy=covSonarV1.1.3\n##reference=" + ref format = '\n##FORMAT=' info = '\n##INFO=' - info = info+'\n##INFO=\n' - note = "##Note='Currently ignore INDEL'\n" + info = ( + info + + '\n##INFO=\n' + ) + note = "##Note='Currently ignore INDEL'\n" # column = "\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+sample_id+"\n" - return header+format+info+note - -from multiprocessing import Pool + return header + format + info + note def bgzip(filename): """Call bgzip to compress a file.""" - cmd = ['bgzip', '-f', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT) as process: + cmd = ["bgzip", "-f", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise return + def tabix_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['tabix', '-p', 'vcf', filename+'.gz'] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["tabix", "-p", "vcf", filename + ".gz"] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return + def bcftool_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['bcftools', 'index', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["bcftools", "index", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return def calculate_AC_AN(final_df): @@ -84,119 +94,143 @@ def calculate_AC_AN(final_df): # e.g. a.POS;b.POS a.AC,b.AC for row in final_df.itertuples(): # print('POS '+str(row.POS)) - unique, counts = np.unique(np.asarray(row[10:]), return_counts=True) # row[10:] means we start from sample ID column + unique, counts = np.unique( + np.asarray(row[10:]), return_counts=True + ) # row[10:] means we start from sample ID column # for unique, counts in zip(unique, counts): - AN=0 - AC='' + AN = 0 + AC = "" for idx, val in enumerate(unique): - if(val == '.'): # ignore it + if val == ".": # ignore it continue else: _AC = counts[idx] - AN = AN +_AC - AC = str(_AC) if not AC else AC+','+str(_AC) + AN = AN + _AC + AC = str(_AC) if not AC else AC + "," + str(_AC) # print('AN='+str(AN)+';AC='+AC) - final_df.at[row.POS, 'INFO'] = 'AN='+str(AN)+';AC='+AC + final_df.at[row.POS, "INFO"] = "AN=" + str(AN) + ";AC=" + AC return final_df def create_vcf(rows_grouped, tmp_dirname, refdescr): - process_id =str(getpid()) + process_id = str(getpid()) # print(process_id+" Start") # iterate over each group final_df = pd.DataFrame() final_df.index = np.arange(1, 29904) - final_df['#CHROM'] = refdescr.split()[0].replace(">", "") - final_df['POS'] = np.arange(1, 29904) - final_df['ID'] = '.' - final_df['REF'] = '.' - final_df['ALT'] = '.' - final_df['FILTER'] = '.' - final_df['QUAL'] = '.' - final_df['INFO'] = '.' 
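
The `calculate_AC_AN` helper above derives the VCF `INFO` field per position: `AN` is the number of non-missing genotype calls across all sample columns, and `AC` lists the count for each distinct called genotype. Reduced to a plain list of per-sample calls, the counting step looks like this (sketch only; the real function walks DataFrame rows via `itertuples`):

```python
# Sketch: derive the INFO AN/AC values from per-sample genotype calls.
# "." marks a missing call; every other value counts towards AN, and the
# per-genotype counts are joined into the comma-separated AC list.
import numpy as np


def an_ac_from_genotypes(genotypes):
    unique, counts = np.unique(np.asarray(genotypes), return_counts=True)
    an = 0
    ac_parts = []
    for value, count in zip(unique, counts):
        if value == ".":  # missing call, ignored
            continue
        an += count
        ac_parts.append(str(count))
    return "AN=" + str(an) + ";AC=" + ",".join(ac_parts)


print(an_ac_from_genotypes([".", "1", "1", "2", "."]))  # AN=3;AC=2,1
```
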
- final_df['FORMAT'] = 'GT' - final_df['POS'] = final_df['POS'].astype('int16') # int16 bit max value 32767 - vcf_filename =process_id+'.vcf' - full_path = os.path.join(tmp_dirname,vcf_filename) - with open(full_path, 'w') as f: + final_df["#CHROM"] = refdescr.split()[0].replace(">", "") + final_df["POS"] = np.arange(1, 29904) + final_df["ID"] = "." + final_df["REF"] = "." + final_df["ALT"] = "." + final_df["FILTER"] = "." + final_df["QUAL"] = "." + final_df["INFO"] = "." + final_df["FORMAT"] = "GT" + final_df["POS"] = final_df["POS"].astype("int16") # int16 bit max value 32767 + vcf_filename = process_id + ".vcf" + full_path = os.path.join(tmp_dirname, vcf_filename) + with open(full_path, "w") as f: # print("Create VCF file:",full_path) f.write(create_fix_vcf_header(refdescr)) for group_name, df_ in tqdm(rows_grouped, mininterval=0.5): final_df[group_name] = "." for row in df_.itertuples(): try: - if( getattr(row, 'start') < 1 or getattr(row, 'start') > 29903): + if getattr(row, "start") < 1 or getattr(row, "start") > 29903: continue - selected_final_row = final_df.loc[getattr(row, 'start')] - index_start_postion = getattr(row, 'start') - id = getattr(row, 'ref')+str(getattr(row, 'start'))+getattr(row, 'alt') - - if(selected_final_row.ID=='.'): - final_df.at[index_start_postion, 'ID'] = id + selected_final_row = final_df.loc[getattr(row, "start")] + index_start_postion = getattr(row, "start") + id = ( + getattr(row, "ref") + + str(getattr(row, "start")) + + getattr(row, "alt") + ) + + if selected_final_row.ID == ".": + final_df.at[index_start_postion, "ID"] = id final_df.at[index_start_postion, group_name] = 1 - final_df.at[index_start_postion, 'REF'] = getattr(row, 'ref') - final_df.at[index_start_postion, 'ALT'] = getattr(row, 'alt') - else: # update - # check ref and alt - if(selected_final_row.ID==id): # only one ID exists + final_df.at[index_start_postion, "REF"] = getattr(row, "ref") + final_df.at[index_start_postion, "ALT"] = getattr(row, "alt") + else: # update + # check ref and alt + if selected_final_row.ID == id: # only one ID exists final_df.at[index_start_postion, group_name] = 1 else: splited_final_id_list = selected_final_row.ID.split(";") totel_len = len(splited_final_id_list) - for new_GT, splited_final_id in enumerate(splited_final_id_list, start=1): - if(splited_final_id==id): # find the exist one - final_df.at[index_start_postion, group_name] = new_GT + for new_GT, splited_final_id in enumerate( + splited_final_id_list, start=1 + ): + if splited_final_id == id: # find the exist one + final_df.at[ + index_start_postion, group_name + ] = new_GT break - elif(totel_len == new_GT): # cannot find the same id ,so we append the new one to the string - final_df.at[index_start_postion, 'ID'] = final_df.at[index_start_postion,'ID']+ ';'+id + elif ( + totel_len == new_GT + ): # cannot find the same id ,so we append the new one to the string + final_df.at[index_start_postion, "ID"] = ( + final_df.at[index_start_postion, "ID"] + + ";" + + id + ) # with new GT number - final_df.at[index_start_postion, group_name] = new_GT + 1 - # appends new alt - final_df.at[index_start_postion, 'ALT'] = final_df.at[index_start_postion,'ALT']+ ','+getattr(row, 'alt') + final_df.at[index_start_postion, group_name] = ( + new_GT + 1 + ) + # appends new alt + final_df.at[index_start_postion, "ALT"] = ( + final_df.at[index_start_postion, "ALT"] + + "," + + getattr(row, "alt") + ) except Exception as e: - print("An exception occurred at...") + print("An exception occurred at...") print(group_name) 
print(row) print(traceback.format_exc()) continue final_df = calculate_AC_AN(final_df) - final_df = final_df.drop(final_df[final_df.ID=='.'].index) - final_df.to_csv(f, sep='\t', encoding='utf-8', index=False) + final_df = final_df.drop(final_df[final_df.ID == "."].index) + final_df.to_csv(f, sep="\t", encoding="utf-8", index=False) bgzip(full_path) - tabix_index(full_path) - return full_path+'.gz' + tabix_index(full_path) + return full_path + ".gz" # print(process_id+" Finish") - -def parallelize_dataframe(df, tmp_dirname, num_cores,refdescr, func): +def parallelize_dataframe(df, tmp_dirname, num_cores, refdescr, func): _tmp_lis = np.array_split(df, num_cores) - counter=0 - zip_items = [(_tmp_lis[i],tmp_dirname,refdescr) for i in range(len(_tmp_lis))] # same order - #with Pool(processes=num_cores) as pool: + counter = 0 + zip_items = [ + (_tmp_lis[i], tmp_dirname, refdescr) for i in range(len(_tmp_lis)) + ] # same order + # with Pool(processes=num_cores) as pool: # res = pool.starmap(func, zip_items) pool = Pool(num_cores) full_paht_list = pool.starmap(func, zip_items) - # finish all tasks + # finish all tasks pool.close() pool.join() - #print("Tmp result: ", full_paht_list) + # print("Tmp result: ", full_paht_list) return full_paht_list + def export2VCF( db_path, include_acc, include_dates, output, num_cores, - refdescr="No ref is provided" - ): - print('----- You are using sonartoVCF_V2 --------') - print('WARNING: the function is still experimental/not fully implemented.') - print('Prepare export2VCF workspace for',num_cores,'cpu') + refdescr="No ref is provided", +): + print("----- You are using sonartoVCF_V2 --------") + print("WARNING: the function is still experimental/not fully implemented.") + print("Prepare export2VCF workspace for", num_cores, "cpu") with ExitStack() as stack: dbm = stack.enter_context(sonarDBManager(db_path)) @@ -204,149 +238,161 @@ def export2VCF( where_vals = [] if include_acc: - where_clause.append(dbm.get_metadata_in_condition('accession' - , *include_acc)) + where_clause.append( + dbm.get_metadata_in_condition("accession", *include_acc) + ) where_vals.extend(include_acc) if include_dates: - where_clause.append(dbm.get_metadata_date_condition('date', - *include_dates)) + where_clause.append(dbm.get_metadata_date_condition("date", *include_dates)) - #print(where_clause) - fields = 'accession, start, end, alt, ref ' + # print(where_clause) + fields = "accession, start, end, alt, ref " if where_clause: - sql = "SELECT " + fields + " FROM dna_view WHERE " + " AND ".join(where_clause) + ";" + sql = ( + "SELECT " + + fields + + " FROM dna_view WHERE " + + " AND ".join(where_clause) + + ";" + ) else: sql = "SELECT " + fields + " FROM dna_view;" # print("query: " + sql) # print("vals: ", where_vals) ############################## - print('Start Bigquery...') - rows = pd.read_sql(sql, - dbm.connection,params=where_vals) - print('Return:', len(rows), ' records') + print("Start Bigquery...") + rows = pd.read_sql(sql, dbm.connection, params=where_vals) + print("Return:", len(rows), " records") track_vcf = [] count = 0 if not rows.empty: - tmp_dirname = mkdtemp( prefix=".sonarCache_") + tmp_dirname = mkdtemp(prefix=".sonarCache_") # vcf_path=os.path.join(tmp_dirname,) # create fasta_id - - #rows['CHROM'] = chrom_id - #rows['QUAL'] = '.' - #rows['FILTER'] = '.' - #rows['INFO'] = '.' - #rows['FORMAT'] = 'GT' + + # rows['CHROM'] = chrom_id + # rows['QUAL'] = '.' + # rows['FILTER'] = '.' + # rows['INFO'] = '.' 
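
`parallelize_dataframe` above is the fan-out step: the grouped accessions are split into one batch per core and each batch is handed to a worker process via `Pool.starmap`, with every worker writing its own temporary VCF. A minimal, self-contained sketch of that pattern with a stand-in worker (the real worker is `create_vcf`; names and arguments here are illustrative):

```python
# Sketch: split grouped work into batches and process each batch in its
# own worker process, as parallelize_dataframe() does with Pool.starmap.
import math
from multiprocessing import Pool


def chunk(items, n):
    # order-preserving split into at most n roughly equal batches
    if not items:
        return []
    size = math.ceil(len(items) / n)
    return [items[i:i + size] for i in range(0, len(items), size)]


def process_batch(batch, tmp_dir, refdescr):
    # stand-in for create_vcf(): each worker would write one VCF here
    return [name for name, _ in batch]


if __name__ == "__main__":
    groups = [("acc1", None), ("acc2", None), ("acc3", None)]
    jobs = [(batch, "/tmp", "ref_header") for batch in chunk(groups, 2)]
    with Pool(2) as pool:
        print(pool.starmap(process_batch, jobs))
```
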
+ # rows['FORMAT'] = 'GT' # POS or start position: The reference position, with the 1st base is position 1 not 0 , but in covsonar use 0 as the 1st position # so we should + 1 # http://samtools.github.io/hts-specs/VCFv4.2.pdf - rows['start'] = rows['start'] + 1 - rows['end'] = rows['end']+1 - rows['alt'] = rows['alt'].replace('', np.nan) # remove Deletion + rows["start"] = rows["start"] + 1 + rows["end"] = rows["end"] + 1 + rows["alt"] = rows["alt"].replace("", np.nan) # remove Deletion # rows['start'] = rows['start'].replace('', np.nan) # remove Insertion - rows = rows.dropna(axis=0, subset=['alt']) - rows_grouped = rows.groupby('accession') - print('With :', len(rows_grouped), ' accessions') + rows = rows.dropna(axis=0, subset=["alt"]) + rows_grouped = rows.groupby("accession") + print("With :", len(rows_grouped), " accessions") # split data and write each ACC into individual VCF file. - print('Start Divide and Conquer ...') - track_vcf = parallelize_dataframe(rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf) - - # bundle all vcf together - print('Integrate all VCFs ...') - divide_merge_vcf(track_vcf, output, num_cores) + print("Start Divide and Conquer ...") + track_vcf = parallelize_dataframe( + rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf + ) + # bundle all vcf together + print("Integrate all VCFs ...") + divide_merge_vcf(track_vcf, output, num_cores) if os.path.isdir(tmp_dirname): shutil.rmtree(tmp_dirname) print("Finish! compress final result (gz):") - + + def divide_merge_vcf(list_track_vcf, global_output, num_cores): - chunk=500 - list_length = math.ceil(len(list_track_vcf)/chunk) # try to merge every - print('size:', list_length) - first_create_ = True + chunk = 500 + list_length = math.ceil(len(list_track_vcf) / chunk) # try to merge every + print("size:", list_length) + first_create_ = True second_create_ = True - tmp_dirname = mkdtemp( prefix=".final.sonarCache_") + tmp_dirname = mkdtemp(prefix=".final.sonarCache_") # we can tweak performance by using U at Bcftools for piping between bcftools subcommands (future work) bar = tqdm(range(list_length), desc="Create Global VCF:") - merge_type='b' + merge_type = "b" for i in bar: - _vcfs = " ".join(list_track_vcf[chunk*i:chunk*i+chunk]) - + _vcfs = " ".join(list_track_vcf[chunk * i : chunk * i + chunk]) - if(len(list_track_vcf)==1): - tmp_output = list_track_vcf[i].replace('.gz', '') + if len(list_track_vcf) == 1: + tmp_output = list_track_vcf[i].replace(".gz", "") continue - if(i == list_length-1): - merge_type='v' - #print('final merge') - + if i == list_length - 1: + merge_type = "v" + # print('final merge') if first_create_: - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs,tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + tmp_output = os.path.join(tmp_dirname, "vcf.2") + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) first_create_ = False second_create_ = True third_create_ = True elif second_create_: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.2' ) - tmp_output = os.path.join(tmp_dirname,'vcf.3' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.2") + tmp_output = 
os.path.join(tmp_dirname, "vcf.3") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = False third_create_ = True else: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.3' ) - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.3") + tmp_output = os.path.join(tmp_dirname, "vcf.2") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip( tmp_output) - #tabix_index(tmp_output) + # bgzip( tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = True third_create_ = False - if(merge_type =='v'): + if merge_type == "v": bgzip(tmp_output) - tabix_index(tmp_output) - tmp_output = clean_stranger_things(tmp_output+ '.gz', tmp_dirname) - shutil.copy(tmp_output, global_output+ '.gz') - - print('Clean workspace ...') + tabix_index(tmp_output) + tmp_output = clean_stranger_things(tmp_output + ".gz", tmp_dirname) + shutil.copy(tmp_output, global_output + ".gz") + + print("Clean workspace ...") if os.path.isdir(tmp_dirname): - shutil.rmtree(tmp_dirname) + shutil.rmtree(tmp_dirname) + def clean_stranger_things(path_to_vcfgz, tmp_dirname): - print('Clean strange things in vcf ...') - output_path_file = os.path.join(tmp_dirname,'vcf.final.gz' ) - with gzip.open(path_to_vcfgz, 'rt') as f: - with gzip.open(output_path_file, 'wt') as output_file: + print("Clean strange things in vcf ...") + output_path_file = os.path.join(tmp_dirname, "vcf.final.gz") + with gzip.open(path_to_vcfgz, "rt") as f: + with gzip.open(output_path_file, "wt") as output_file: for line in f: - if line.startswith('#'): - if 'bcftools_mergeCommand' in line: + if line.startswith("#"): + if "bcftools_mergeCommand" in line: continue else: output_file.write(line) else: - rows = line.split('\t') - ### fix duplicate + rows = line.split("\t") + ### fix duplicate ID = rows[2] - rows[2]= ";".join(set(ID.split(';'))) - + rows[2] = ";".join(set(ID.split(";"))) + output_file.write("\t".join(rows)) return output_path_file diff --git a/lib/sonartoVCF_v2.py b/lib/sonartoVCF_v2.py old mode 100755 new mode 100644 index 0a25a66..91ff32e --- a/lib/sonartoVCF_v2.py +++ b/lib/sonartoVCF_v2.py @@ -1,83 +1,99 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import os -from lib.sonardb import sonarDBManager from contextlib import ExitStack -from more_itertools import consecutive_groups, split_when -from tempfile import mkstemp, mkdtemp +import gzip +import math +from multiprocessing import Pool +import os +from os import getpid import shutil -import pandas as pd -import numpy as np import subprocess -from os import getpid, system -from multiprocessing import Pool +from tempfile import mkdtemp +import traceback import warnings -import math + +from lib.sonardb import sonarDBManager 
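
`clean_stranger_things` above post-processes the merged VCF: header lines carrying `bcftools_mergeCommand` are dropped, and the semicolon-separated ID column that repeated merges can produce is de-duplicated. For a single tab-separated data line, the ID clean-up amounts to the following (sketch with an illustrative record; note that, as above, `set()` does not preserve the original ID order):

```python
# Sketch: de-duplicate the semicolon-separated ID field (column 3) of a
# tab-separated VCF data line, as clean_stranger_things() does per line.
def dedup_id_field(line):
    fields = line.rstrip("\n").split("\t")
    fields[2] = ";".join(set(fields[2].split(";")))
    return "\t".join(fields) + "\n"


line = "chrom\t241\tC241T;C241T\tC\tT\t.\t.\tAN=2;AC=2\tGT\t1\t1\n"
print(dedup_id_field(line), end="")
```
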
+import numpy as np +import pandas as pd from tqdm import tqdm -warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) -import traceback -import gzip -import sys + +warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) + def create_fix_vcf_header(ref): - header = "##fileformat=VCFv4.2\n##CreatedBy=covSonarV1.1.4\n##reference="+ref + header = "##fileformat=VCFv4.2\n##CreatedBy=covSonarV1.1.4\n##reference=" + ref format = '\n##FORMAT=' info = '\n##INFO=' - info = info+'\n##INFO=' - info = info+'\n##INFO=\n' - note = "##Note_1='Currently we ignore DEL type'\n" - note = note+ "##Note_2='This VCF file is genereted by using var2vcf with betaV2, if you find any bugs, then please write a bug report to us'\n" + info = ( + info + + '\n##INFO=' + ) + info = ( + info + + '\n##INFO=\n' + ) + note = "##Note_1='Currently we ignore DEL type'\n" + note = ( + note + + "##Note_2='This VCF file is genereted by using var2vcf with betaV2, if you find any bugs, then please write a bug report to us'\n" + ) # column = "\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+sample_id+"\n" - return header+format+info+note - -from multiprocessing import Pool + return header + format + info + note def bgzip(filename): """Call bgzip to compress a file.""" - cmd = ['bgzip', '-f', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["bgzip", "-f", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise return + def tabix_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['tabix', '-p', 'vcf', filename+'.gz'] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["tabix", "-p", "vcf", filename + ".gz"] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return + def bcftool_index(filename): """Call tabix to create an index for a bgzip-compressed file.""" - cmd = ['bcftools', 'index', filename] - with subprocess.Popen(cmd, encoding='utf8', stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: + cmd = ["bcftools", "index", filename] + with subprocess.Popen( + cmd, encoding="utf8", stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) as process: try: stdout, stderr = process.communicate(cmd) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - raise subprocess.TimeoutExpired( output=stdout, stderr=stderr) + raise subprocess.TimeoutExpired(output=stdout, stderr=stderr) except Exception: process.kill() raise - return + return def calculate_AC_AN(final_df): @@ -85,204 +101,286 @@ def calculate_AC_AN(final_df): # order-preserving index between POS and INFO AC # e.g. 
a.POS;b.POS a.AC,b.AC for row in final_df.itertuples(): - unique, counts = np.unique(np.asarray(row[11:]), return_counts=True) # row[11:] means we start from sample ID column + unique, counts = np.unique( + np.asarray(row[11:]), return_counts=True + ) # row[11:] means we start from sample ID column # for unique, counts in zip(unique, counts): - AN=0 - AC='' + AN = 0 + AC = "" for idx, val in enumerate(unique): - if(val == '.'): # ignore it + if val == ".": # ignore it continue else: _AC = counts[idx] - AN = AN +_AC - AC = str(_AC) if not AC else AC+','+str(_AC) - final_df.at[row.Index, 'INFO'] = 'AN='+str(AN)+';AC='+AC + AN = AN + _AC + AC = str(_AC) if not AC else AC + "," + str(_AC) + final_df.at[row.Index, "INFO"] = "AN=" + str(AN) + ";AC=" + AC return final_df + def _check_variant_type(ref, alt): - - if len(ref) == len(alt) and len(alt)==1: # SNP - return 'SNP' - elif len(ref) < len(alt) and len(alt) > 0: # INS + + if len(ref) == len(alt) and len(alt) == 1: # SNP + return "SNP" + elif len(ref) < len(alt) and len(alt) > 0: # INS if ref == alt[0]: - return 'INS' + return "INS" else: - return 'INDEL' - elif len(ref) > len(alt) and len(ref) > 0: # DEL - return 'DEL' + return "INDEL" + elif len(ref) > len(alt) and len(ref) > 0: # DEL + return "DEL" else: - print('Unknown:',ref, alt) - return 'Unknown' + print("Unknown:", ref, alt) + return "Unknown" + def create_vcf(rows_grouped, tmp_dirname, refdescr): - refdescr= refdescr.split()[0].replace(">", "") - process_id =str(getpid()) + refdescr = refdescr.split()[0].replace(">", "") + process_id = str(getpid()) # print(process_id+" Start") # iterate over each group - final_snp_df = pd.DataFrame({'#CHROM': pd.Series(dtype='str'), - 'POS': pd.Series(dtype='int'), - 'ID': pd.Series(dtype='str'), - 'REF': pd.Series(dtype='str'), - 'ALT': pd.Series(dtype='str'), - 'FILTER': pd.Series(dtype='str'), - 'QUAL': pd.Series(dtype='str'), - 'INFO': pd.Series(dtype='str'), - 'FORMAT': pd.Series(dtype='str'), - 'TYPE': pd.Series(dtype='str'),}, index = np.arange(1, 29904)) - final_snp_df['POS'] = np.arange(1, 29904) - final_snp_df['#CHROM'] = refdescr # duplicated line here did a trick for assigning the value - final_snp_df['ID'] = '.' - final_snp_df['REF'] = '.' - final_snp_df['ALT'] = '.' - final_snp_df['FILTER'] = '.' - final_snp_df['QUAL'] = '.' - final_snp_df['INFO'] = '.' - final_snp_df['FORMAT'] = 'GT' - final_snp_df['TYPE'] = '.' - - final_indel_df = pd.DataFrame({'#CHROM': pd.Series(dtype='str'), - 'POS': pd.Series(dtype='int'), - 'ID': pd.Series(dtype='str'), - 'REF': pd.Series(dtype='str'), - 'ALT': pd.Series(dtype='str'), - 'FILTER': pd.Series(dtype='str'), - 'QUAL': pd.Series(dtype='str'), - 'INFO': pd.Series(dtype='str'), - 'FORMAT': pd.Series(dtype='str'), - 'TYPE': pd.Series(dtype='str'),}) - vcf_filename =process_id+'.vcf' - full_path = os.path.join(tmp_dirname,vcf_filename) - with open(full_path, 'w') as f: + final_snp_df = pd.DataFrame( + { + "#CHROM": pd.Series(dtype="str"), + "POS": pd.Series(dtype="int"), + "ID": pd.Series(dtype="str"), + "REF": pd.Series(dtype="str"), + "ALT": pd.Series(dtype="str"), + "FILTER": pd.Series(dtype="str"), + "QUAL": pd.Series(dtype="str"), + "INFO": pd.Series(dtype="str"), + "FORMAT": pd.Series(dtype="str"), + "TYPE": pd.Series(dtype="str"), + }, + index=np.arange(1, 29904), + ) + final_snp_df["POS"] = np.arange(1, 29904) + final_snp_df[ + "#CHROM" + ] = refdescr # duplicated line here did a trick for assigning the value + final_snp_df["ID"] = "." + final_snp_df["REF"] = "." + final_snp_df["ALT"] = "." 
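
`_check_variant_type` above classifies each record purely from the `ref`/`alt` strings: equal single-base strings are SNPs, a longer `alt` whose first base equals the (single-base) `ref` is an insertion, a longer `ref` is a deletion, and everything else falls back to INDEL or Unknown. A compact sketch of the same branching:

```python
# Sketch: classify a ref/alt pair along the lines of _check_variant_type().
def variant_type(ref, alt):
    if len(ref) == len(alt) == 1:
        return "SNP"
    if len(alt) > len(ref):
        # insertion only if the first alt base is the (single-base) ref
        return "INS" if ref == alt[0] else "INDEL"
    if len(ref) > len(alt):
        return "DEL"
    return "Unknown"


assert variant_type("A", "G") == "SNP"
assert variant_type("A", "AGG") == "INS"
assert variant_type("ACT", "A") == "DEL"
```
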
+ final_snp_df["FILTER"] = "." + final_snp_df["QUAL"] = "." + final_snp_df["INFO"] = "." + final_snp_df["FORMAT"] = "GT" + final_snp_df["TYPE"] = "." + + final_indel_df = pd.DataFrame( + { + "#CHROM": pd.Series(dtype="str"), + "POS": pd.Series(dtype="int"), + "ID": pd.Series(dtype="str"), + "REF": pd.Series(dtype="str"), + "ALT": pd.Series(dtype="str"), + "FILTER": pd.Series(dtype="str"), + "QUAL": pd.Series(dtype="str"), + "INFO": pd.Series(dtype="str"), + "FORMAT": pd.Series(dtype="str"), + "TYPE": pd.Series(dtype="str"), + } + ) + vcf_filename = process_id + ".vcf" + full_path = os.path.join(tmp_dirname, vcf_filename) + with open(full_path, "w") as f: # print("Create VCF file:",full_path) f.write(create_fix_vcf_header(refdescr)) for group_name, df_ in tqdm(rows_grouped, mininterval=0.5): try: - #final_snp_df[group_name] = "." # init value - #final_indel_df[group_name] = "." # init value + # final_snp_df[group_name] = "." # init value + # final_indel_df[group_name] = "." # init value for row in df_.itertuples(): - - _id = getattr(row, 'ref')+str(getattr(row, 'start'))+getattr(row, 'alt') - _type =_check_variant_type( getattr(row, 'ref'),getattr(row, 'alt')) - - if(_type == 'SNP' ): + + _id = ( + getattr(row, "ref") + + str(getattr(row, "start")) + + getattr(row, "alt") + ) + _type = _check_variant_type( + getattr(row, "ref"), getattr(row, "alt") + ) + + if _type == "SNP": # find ID - index_start_postion = getattr(row, 'start') - selected_row = final_snp_df.loc[index_start_postion] # return scalar instead DF - if(selected_row.ID == '.'): # A - - final_snp_df.at[index_start_postion, 'ID'] = _id - final_snp_df.at[index_start_postion, group_name] = '1' - final_snp_df.at[index_start_postion, 'REF'] = getattr(row, 'ref') - final_snp_df.at[index_start_postion, 'ALT'] = getattr(row, 'alt') - final_snp_df.at[index_start_postion, 'TYPE'] = 'SNP' - - elif(selected_row.ID != '.'): # B - if(selected_row.ID == _id): # only one ID exists and just update GT of a sample - final_snp_df.at[index_start_postion, group_name] = '1' + index_start_postion = getattr(row, "start") + selected_row = final_snp_df.loc[ + index_start_postion + ] # return scalar instead DF + if selected_row.ID == ".": # A + + final_snp_df.at[index_start_postion, "ID"] = _id + final_snp_df.at[index_start_postion, group_name] = "1" + final_snp_df.at[index_start_postion, "REF"] = getattr( + row, "ref" + ) + final_snp_df.at[index_start_postion, "ALT"] = getattr( + row, "alt" + ) + final_snp_df.at[index_start_postion, "TYPE"] = "SNP" + + elif selected_row.ID != ".": # B + if ( + selected_row.ID == _id + ): # only one ID exists and just update GT of a sample + final_snp_df.at[index_start_postion, group_name] = "1" else: splited_final_id_list = selected_row.ID.split(";") totel_len = len(splited_final_id_list) - for new_GT, splited_final_id in enumerate(splited_final_id_list, start=1): - if(splited_final_id == _id): # Found the exist one - final_snp_df.at[index_start_postion, group_name] = str(new_GT) - break - elif(totel_len == new_GT): # cannot find the same ID ,so we append the new one to the string - final_snp_df.at[index_start_postion, 'ID'] = final_snp_df.at[index_start_postion,'ID']+ ';'+ _id + for new_GT, splited_final_id in enumerate( + splited_final_id_list, start=1 + ): + if splited_final_id == _id: # Found the exist one + final_snp_df.at[ + index_start_postion, group_name + ] = str(new_GT) + break + elif ( + totel_len == new_GT + ): # cannot find the same ID ,so we append the new one to the string + 
final_snp_df.at[index_start_postion, "ID"] = ( + final_snp_df.at[index_start_postion, "ID"] + + ";" + + _id + ) # with new GT number - final_snp_df.at[index_start_postion, group_name] = str(new_GT + 1) - # appends new alt - final_snp_df.at[index_start_postion, 'ALT'] = final_snp_df.at[index_start_postion,'ALT']+ ','+getattr(row, 'alt') + final_snp_df.at[ + index_start_postion, group_name + ] = str(new_GT + 1) + # appends new alt + final_snp_df.at[index_start_postion, "ALT"] = ( + final_snp_df.at[index_start_postion, "ALT"] + + "," + + getattr(row, "alt") + ) else: - # get index + # get index print(selected_row) - raise ValueError('Something went wrong') - - #elif(_type == 'DEL'): + raise ValueError("Something went wrong") + + # elif(_type == 'DEL'): # continue - elif(_type == 'INS' or _type == 'INDEL'or _type == 'DEL'): # C or _type == 'DEL' - - selected_row = final_indel_df[(final_indel_df['POS'] == getattr(row, 'start')) & (final_indel_df['TYPE'] == _type)] - if(len(selected_row) == 0): # D, always insert new records - new_row = {'#CHROM': refdescr,'ID': _id, 'POS': int(getattr(row, 'start')), 'REF':getattr(row, 'ref'), - 'ALT':getattr(row, 'alt'), 'TYPE':_type, - 'FILTER':'.', 'QUAL':'.', 'INFO': '.','FORMAT':'GT', group_name: '1'} - final_indel_df = final_indel_df.append(new_row, ignore_index=True) - elif(len(selected_row) > 0): # E , found more than 0 which means we can update or insert it - for _E_rows in selected_row.itertuples(): - index_postion= _E_rows.Index + elif ( + _type == "INS" or _type == "INDEL" or _type == "DEL" + ): # C or _type == 'DEL' + + selected_row = final_indel_df[ + (final_indel_df["POS"] == getattr(row, "start")) + & (final_indel_df["TYPE"] == _type) + ] + if len(selected_row) == 0: # D, always insert new records + new_row = { + "#CHROM": refdescr, + "ID": _id, + "POS": int(getattr(row, "start")), + "REF": getattr(row, "ref"), + "ALT": getattr(row, "alt"), + "TYPE": _type, + "FILTER": ".", + "QUAL": ".", + "INFO": ".", + "FORMAT": "GT", + group_name: "1", + } + final_indel_df = final_indel_df.append( + new_row, ignore_index=True + ) + elif ( + len(selected_row) > 0 + ): # E , found more than 0 which means we can update or insert it + for _E_rows in selected_row.itertuples(): + index_postion = _E_rows.Index splited_IDs_list = _E_rows.ID.split(";") totel_len = len(splited_IDs_list) - for _E_new_GT, splited_ID in enumerate(splited_IDs_list, start=1): - #print('index',index_postion,'test', splited_ID, _E_new_GT) - if(splited_ID == _id): # Found the exist one - #print('Found the exist one', splited_final_id) - final_indel_df.at[index_postion, group_name] = str(_E_new_GT) - break - elif(totel_len == _E_new_GT): # cannot find the same ID ,so we append the new one to the string - final_indel_df.at[index_postion, 'ID'] = final_indel_df.at[index_postion,'ID']+ ';'+ _id + for _E_new_GT, splited_ID in enumerate( + splited_IDs_list, start=1 + ): + # print('index',index_postion,'test', splited_ID, _E_new_GT) + if splited_ID == _id: # Found the exist one + # print('Found the exist one', splited_final_id) + final_indel_df.at[ + index_postion, group_name + ] = str(_E_new_GT) + break + elif ( + totel_len == _E_new_GT + ): # cannot find the same ID ,so we append the new one to the string + final_indel_df.at[index_postion, "ID"] = ( + final_indel_df.at[index_postion, "ID"] + + ";" + + _id + ) # with new GT number - final_indel_df.at[index_postion, group_name] = str(_E_new_GT + 1) - # appends new alt - final_indel_df.at[index_postion, 'ALT'] = 
final_indel_df.at[index_postion,'ALT']+ ','+getattr(row, 'alt') - + final_indel_df.at[ + index_postion, group_name + ] = str(_E_new_GT + 1) + # appends new alt + final_indel_df.at[index_postion, "ALT"] = ( + final_indel_df.at[index_postion, "ALT"] + + "," + + getattr(row, "alt") + ) + else: - print('Skip this ', _id) # right now skip for Unknown + print("Skip this ", _id) # right now skip for Unknown continue except ValueError as err: - print(err.args) - raise + print(err.args) + raise except Exception as e: - print("An exception occurred at...") - print(group_name) - print(row) - print(selected_row) - print(traceback.format_exc()) - raise e - - final_df = pd.concat( [final_snp_df, final_indel_df ],axis=0,ignore_index=True) - final_df = final_df.drop(final_df[final_df.ID=='.'].index) - final_df.replace(np.nan, '.', inplace=True) - final_df['ALT'].replace('', '.', inplace=True) # for deletion - final_df = final_df.sort_values(["POS"], ascending=True) - final_df = calculate_AC_AN(final_df) - final_df["INFO"] = final_df["INFO"] +";TYPE="+ final_df["TYPE"] - final_df= final_df.drop(columns=['TYPE']) - final_df.to_csv(f, sep='\t', encoding='utf-8', index=False) + print("An exception occurred at...") + print(group_name) + print(row) + print(selected_row) + print(traceback.format_exc()) + raise e + + final_df = pd.concat([final_snp_df, final_indel_df], axis=0, ignore_index=True) + final_df = final_df.drop(final_df[final_df.ID == "."].index) + final_df.replace(np.nan, ".", inplace=True) + final_df["ALT"].replace("", ".", inplace=True) # for deletion + final_df = final_df.sort_values(["POS"], ascending=True) + final_df = calculate_AC_AN(final_df) + final_df["INFO"] = final_df["INFO"] + ";TYPE=" + final_df["TYPE"] + final_df = final_df.drop(columns=["TYPE"]) + final_df.to_csv(f, sep="\t", encoding="utf-8", index=False) bgzip(full_path) - tabix_index(full_path) - return full_path+'.gz' + tabix_index(full_path) + return full_path + ".gz" # print(process_id+" Finish") - -def parallelize_dataframe(df, tmp_dirname, num_cores,refdescr, func): +def parallelize_dataframe(df, tmp_dirname, num_cores, refdescr, func): _tmp_lis = np.array_split(df, num_cores) - counter=0 - zip_items = [(_tmp_lis[i],tmp_dirname,refdescr) for i in range(len(_tmp_lis))] # same order - #with Pool(processes=num_cores) as pool: + counter = 0 + zip_items = [ + (_tmp_lis[i], tmp_dirname, refdescr) for i in range(len(_tmp_lis)) + ] # same order + # with Pool(processes=num_cores) as pool: # res = pool.starmap(func, zip_items) pool = Pool(num_cores) full_paht_list = pool.starmap(func, zip_items) - # finish all tasks + # finish all tasks pool.close() pool.join() - #print("Tmp result: ", full_paht_list) + # print("Tmp result: ", full_paht_list) return full_paht_list + def export2VCF( db_path, include_acc, include_dates, output, num_cores, - refdescr="No ref is provided" - ): - print('----- You are using sonartoVCF_V2 --------') - print('WARNING: the function is still experimental/not fully implemented.') - print('Prepare export2VCF workspace for',num_cores,'cpu') + refdescr="No ref is provided", +): + print("----- You are using sonartoVCF_V2 --------") + print("WARNING: the function is still experimental/not fully implemented.") + print("Prepare export2VCF workspace for", num_cores, "cpu") with ExitStack() as stack: dbm = stack.enter_context(sonarDBManager(db_path)) @@ -290,150 +388,167 @@ def export2VCF( where_vals = [] if include_acc: - where_clause.append(dbm.get_metadata_in_condition('accession' - , *include_acc)) + where_clause.append( 
+ dbm.get_metadata_in_condition("accession", *include_acc) + ) where_vals.extend(include_acc) if include_dates: - where_clause.append(dbm.get_metadata_date_condition('date', - *include_dates)) + where_clause.append(dbm.get_metadata_date_condition("date", *include_dates)) - #print(where_clause) - fields = 'accession, start, end, alt, ref ' + # print(where_clause) + fields = "accession, start, end, alt, ref " if where_clause: - sql = "SELECT " + fields + " FROM dna_view WHERE " + " AND ".join(where_clause) + ";" + sql = ( + "SELECT " + + fields + + " FROM dna_view WHERE " + + " AND ".join(where_clause) + + ";" + ) else: sql = "SELECT " + fields + " FROM dna_view;" # print("query: " + sql) # print("vals: ", where_vals) ############################## - print('Start Bigquery...') - rows = pd.read_sql(sql, - dbm.connection,params=where_vals) - print('Return:', len(rows), ' records') + print("Start Bigquery...") + rows = pd.read_sql(sql, dbm.connection, params=where_vals) + print("Return:", len(rows), " records") track_vcf = [] count = 0 if not rows.empty: - # rows.to_pickle("dummy.pkl") - tmp_dirname = mkdtemp( prefix=".sonarCache_") + # rows.to_pickle("dummy.pkl") + tmp_dirname = mkdtemp(prefix=".sonarCache_") # vcf_path=os.path.join(tmp_dirname,) # create fasta_id - - #rows['CHROM'] = chrom_id - #rows['QUAL'] = '.' - #rows['FILTER'] = '.' - #rows['INFO'] = '.' - #rows['FORMAT'] = 'GT' + + # rows['CHROM'] = chrom_id + # rows['QUAL'] = '.' + # rows['FILTER'] = '.' + # rows['INFO'] = '.' + # rows['FORMAT'] = 'GT' # POS or start position: The reference position, with the 1st base is position 1 not 0 , but in covsonar use 0 as the 1st position # so we should + 1 # http://samtools.github.io/hts-specs/VCFv4.2.pdf - rows['start'] = rows['start'] + 1 + rows["start"] = rows["start"] + 1 # rows['end'] = rows['end']+1 - rows= rows.loc[ (1 <= rows['start']) & (rows['start'] <= 29903)] # filter out + rows = rows.loc[ + (1 <= rows["start"]) & (rows["start"] <= 29903) + ] # filter out - rows['alt'] = rows['alt'].replace('', np.nan) # remove Deletion + rows["alt"] = rows["alt"].replace("", np.nan) # remove Deletion # rows['start'] = rows['start'].replace('', np.nan) # remove Insertion - rows = rows.dropna(axis=0, subset=['alt']) - rows_grouped = rows.groupby('accession') - print('With :', len(rows_grouped), ' accessions') + rows = rows.dropna(axis=0, subset=["alt"]) + rows_grouped = rows.groupby("accession") + print("With :", len(rows_grouped), " accessions") # split data and write each ACC into individual VCF file. - print('Start Divide and Conquer ...') - track_vcf = parallelize_dataframe(rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf) - - # bundle all vcf together - print('Integrate all VCFs ...') - divide_merge_vcf(track_vcf, output, num_cores) + print("Start Divide and Conquer ...") + track_vcf = parallelize_dataframe( + rows_grouped, tmp_dirname, num_cores, refdescr, create_vcf + ) + # bundle all vcf together + print("Integrate all VCFs ...") + divide_merge_vcf(track_vcf, output, num_cores) if os.path.isdir(tmp_dirname): shutil.rmtree(tmp_dirname) print("Finish! 
compress final result (gz):") - + + def divide_merge_vcf(list_track_vcf, global_output, num_cores): - chunk=500 - list_length = math.ceil(len(list_track_vcf)/chunk) # try to merge every - print('size:', list_length) - first_create_ = True + chunk = 500 + list_length = math.ceil(len(list_track_vcf) / chunk) # try to merge every + print("size:", list_length) + first_create_ = True second_create_ = True - tmp_dirname = mkdtemp(dir='/home/kongkitimanonk/SCRATCH_NOBAK/CovSonar1/workdir_covsonar/test-vcf/', prefix=".final.sonarCache_") + tmp_dirname = mkdtemp( + dir="/home/kongkitimanonk/SCRATCH_NOBAK/CovSonar1/workdir_covsonar/test-vcf/", + prefix=".final.sonarCache_", + ) # we can tweak performance by using U at Bcftools for piping between bcftools subcommands (future work) bar = tqdm(range(list_length), desc="Create Global VCF:") - merge_type='b' + merge_type = "b" for i in bar: - _vcfs = " ".join(list_track_vcf[chunk*i:chunk*i+chunk]) - + _vcfs = " ".join(list_track_vcf[chunk * i : chunk * i + chunk]) - if(len(list_track_vcf)==1): - tmp_output = list_track_vcf[i].replace('.gz', '') + if len(list_track_vcf) == 1: + tmp_output = list_track_vcf[i].replace(".gz", "") continue - if(i == list_length-1): - merge_type='v' - #print('final merge') - + if i == list_length - 1: + merge_type = "v" + # print('final merge') if first_create_: - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs,tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + tmp_output = os.path.join(tmp_dirname, "vcf.2") + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) first_create_ = False second_create_ = True elif second_create_: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.2' ) - tmp_output = os.path.join(tmp_dirname,'vcf.3' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.2") + tmp_output = os.path.join(tmp_dirname, "vcf.3") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip(tmp_output) - #tabix_index(tmp_output) + # bgzip(tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = False else: - _vcfs = _vcfs +' '+ os.path.join(tmp_dirname,'vcf.3' ) - tmp_output = os.path.join(tmp_dirname,'vcf.2' ) + _vcfs = _vcfs + " " + os.path.join(tmp_dirname, "vcf.3") + tmp_output = os.path.join(tmp_dirname, "vcf.2") - cmd = "bcftools merge {} -o {} -O{} --threads {}".format(_vcfs, tmp_output, merge_type, num_cores) - with subprocess.Popen(cmd, encoding='utf8', shell=True) as process: + cmd = "bcftools merge {} -o {} -O{} --threads {}".format( + _vcfs, tmp_output, merge_type, num_cores + ) + with subprocess.Popen(cmd, encoding="utf8", shell=True) as process: stdout, stderr = process.communicate(cmd) - #bgzip( tmp_output) - #tabix_index(tmp_output) + # bgzip( tmp_output) + # tabix_index(tmp_output) bcftool_index(tmp_output) second_create_ = True - if(merge_type 
=='v'): + if merge_type == "v": bgzip(tmp_output) - tabix_index(tmp_output) - tmp_output = clean_stranger_things(tmp_output+ '.gz', tmp_dirname) - shutil.copy(tmp_output, global_output+ '.gz') + tabix_index(tmp_output) + tmp_output = clean_stranger_things(tmp_output + ".gz", tmp_dirname) + shutil.copy(tmp_output, global_output + ".gz") # sys.exit(" exist.") - print('Clean workspace ...') + print("Clean workspace ...") if os.path.isdir(tmp_dirname): - shutil.rmtree(tmp_dirname) + shutil.rmtree(tmp_dirname) + def clean_stranger_things(path_to_vcfgz, tmp_dirname): - print('Clean strange things in vcf ...') - output_path_file = os.path.join(tmp_dirname,'vcf.final.gz' ) - with gzip.open(path_to_vcfgz, 'rt') as f: - with gzip.open(output_path_file, 'wt') as output_file: + print("Clean strange things in vcf ...") + output_path_file = os.path.join(tmp_dirname, "vcf.final.gz") + with gzip.open(path_to_vcfgz, "rt") as f: + with gzip.open(output_path_file, "wt") as output_file: for line in f: - if line.startswith('#'): - if 'bcftools_mergeCommand' in line: + if line.startswith("#"): + if "bcftools_mergeCommand" in line: continue else: output_file.write(line) else: - rows = line.split('\t') - ### fix duplicate + rows = line.split("\t") + ### fix duplicate ID = rows[2] - rows[2]= ";".join(set(ID.split(';'))) - + rows[2] = ";".join(set(ID.split(";"))) + output_file.write("\t".join(rows)) - return output_path_file \ No newline at end of file + return output_path_file diff --git a/logo.png b/logo.png old mode 100755 new mode 100644 diff --git a/sonar.env.yml b/sonar.env.yml old mode 100755 new mode 100644 diff --git a/sonar.py b/sonar.py index 7fb31ce..ab52422 100755 --- a/sonar.py +++ b/sonar.py @@ -1,599 +1,1195 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -#author: Stephan Fuchs (Robert Koch Institute, MF1, fuchss@rki.de) +# author: Stephan Fuchs (Robert Koch Institute, MF1, fuchss@rki.de) -import os -from posixpath import split -import sys -import csv import argparse +from collections import defaultdict +import csv import gzip import lzma -from lib import sonardb, Lineages_UPDATER +from multiprocessing import Pool +import os +import re +import shutil +import sys +from tempfile import mkdtemp +from tempfile import mkstemp + +from Bio import SeqIO +from lib import Lineages_UPDATER +from lib import sonardb from lib import sonartoVCF as sonartoVCF from lib import sonartoVCF_v2 as sonartoVCFV2 -from Bio import SeqIO -from tempfile import mkstemp, mkdtemp -from collections import defaultdict -import re from tqdm import tqdm -from multiprocessing import Pool -import shutil -with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".version"), "r") as handle: - VERSION = handle.read().strip() +with open( + os.path.join(os.path.dirname(os.path.realpath(__file__)), ".version"), "r" +) as handle: + VERSION = handle.read().strip() + def parse_args(): - parser = argparse.ArgumentParser(prog="sonar.py", description="") - subparsers = parser.add_subparsers(help='detect, store, and screen for mutations in SARS-CoV-2 genomic sequences') - subparsers.dest = 'tool' - subparsers.required = True - - #parent parser: db input - general_parser = argparse.ArgumentParser(add_help=False) - general_parser.add_argument('--db', metavar="DB_DIR", help="sonar database directory", type=str, required=True) - general_parser.add_argument('--cpus', metavar="int", help="number of cpus to use (default: 1)", type=int, default=1) - - # create the parser for the "add" command - parser_add = subparsers.add_parser('add', 
parents=[general_parser], help='add genome sequences to the database.') - parser_add_input = parser_add.add_mutually_exclusive_group() - parser_add_input.add_argument('-f', '--file', metavar="FILE", help="fasta file(s) containing DNA sequences to add", type=str, nargs="+", default=[]) - parser_add_input.add_argument('-d', '--dir', metavar="DIR", help="add all fasta files (ending with \".fasta\" or \".fna\") from a given directory or directories", type=str, nargs="+", default=None) - parser_add.add_argument('-c', '--cache', metavar="DIR", help="use (and restore data from) a given cache (if not set, a temporary cache is used and deleted after import)", type=str, default=None) - parser_add.add_argument('-t', '--timeout', metavar="INT", help="timout for aligning sequences in seconds (default: 600)", type=int, default=600) - parser_add.add_argument('--compressed', help="compression of input file format ('none', 'gz', 'xz', default: 'auto')", choices=['none', 'gz', 'xz', 'auto'], default='auto') - parser_add.add_argument('--force', help="force updating of accessions if description or sequence has changed", action="store_true") - parser_add.add_argument('--noprogress', '-p', help="do not show any progress bar", action="store_true") - parser_add.add_argument('--source', help="define a common data source for all genomes", type=str, default=None) - parser_add.add_argument('--collection', help="define a common data collection for all genomes", type=str, default=None) - parser_add.add_argument('--lab', help="define a common lab for all genomes", type=str, default=None) - parser_add.add_argument('--quiet', '-q', help="do not show any output", action="store_true") - - # create the parser for the "remove" command - parser_remove = subparsers.add_parser('remove', parents=[general_parser], help='remove genome sequences to the database.') - parser_remove = parser_remove.add_mutually_exclusive_group() - parser_remove.add_argument('--acc', metavar="STR", help="define accession(s) of sequences to delete", type=str, nargs="+", default=[]) - parser_remove.add_argument('--file', metavar="FILE", help="define file containing accession(s) of sequences to delete (one per line)", type=str, default=None) - - # create the parser for the "match" command - parser_match = subparsers.add_parser('match', parents=[general_parser], help='get mutations profiles for given accessions.') - parser_match.add_argument('--include', '-i', metavar="STR", help="match genomes sharing the given mutation profile", type=str, action='append', nargs="+", default=[]) - parser_match.add_argument('--exclude', '-e', metavar="STR", help="match genomes not containing the mutation profile", type=str, action='append', nargs="+", default=[]) - parser_match.add_argument('--with-sublineage', help="recursively get all sublineages from a given lineage (--lineage) (only child) ",action="store_true") - # parser_match.add_argument('--recursion', help="recursively get all sublineages of a given lineage (--lineage). 
this will work only if '--with-sublineage' is used",action="store_true") - parser_match.add_argument('--lineage', metavar="STR", help="match genomes of the given pangolin lineage(s) only", type=str, nargs="+", default=[]) - parser_match.add_argument('--acc', metavar="STR", help="match specific genomes defined by acession(s) only", type=str, nargs="+", default=[]) - parser_match.add_argument('--zip', metavar="INT", help="only match genomes of a given region(s) defined by zip code(s)", type=str, nargs="+", default=[]) - parser_match.add_argument('--date', help="only match genomes sampled at a certain sampling date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", nargs="+", type=str, default=[]) - parser_match.add_argument('--submission_date', help="only match genomes at a certain submission date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", nargs="+", type=str, default=[]) - parser_match.add_argument('--lab', metavar="STR", help="match genomes of the given lab only", type=str, nargs="+", default=[]) - parser_match.add_argument('--source', metavar="STR", help="match genomes of the given data source only", type=str, nargs="+", default=[]) - parser_match.add_argument('--collection', metavar="STR", help="match genomes of the given data collection only", type=str, nargs="+", default=[]) - parser_match.add_argument('--technology', metavar="STR", help="match genomes of the given sequencing technology only", type=str, nargs="+", default=[]) - parser_match.add_argument('--platform', metavar="STR", help="match genomes of the given sequencing platform only", type=str, nargs="+", default=[]) - parser_match.add_argument('--chemistry', metavar="STR", help="match genomes of the given sequencing chemistry only", type=str, nargs="+", default=[]) - parser_match.add_argument('--software', metavar="STR", help="software used for genome reconstruction", type=str, default=None) - parser_match.add_argument('--version', metavar="STR", help="software version used for genome reconstruction", type=str, default=None) - parser_match.add_argument('--material', metavar="STR", help="match genomes of the given sequencing chemistry only", type=str, nargs="+", default=[]) - parser_match.add_argument('--min_ct', metavar="STR", help="minimal ct value of samples resulting genomes are matched to", type=float, default=None) - parser_match.add_argument('--max_ct', metavar="STR", help="maximal ct value of samples resulting genomes are matched to", type=float, default=None) - parser_match.add_argument('--seqhash', metavar="STR", help="match specific genomes with the given seqhash(es)", type=str, nargs="+", default=[]) - parser_match_g1 = parser_match.add_mutually_exclusive_group() - parser_match_g1.add_argument('--count', help="count instead of listing matching genomes", action="store_true") - parser_match_g1.add_argument('--ambig', help="include ambiguos sites when reporting profiles (no effect when --count is used)", action="store_true") - parser_match_g2 = parser_match.add_mutually_exclusive_group() - parser_match_g2.add_argument('--only_frameshifts', help="show only genomes containing one or more frameshift mutations", action="store_true") - parser_match_g2.add_argument('--no_frameshifts', help="show only genomes containing no frameshift mutation", action="store_true") - parser_match.add_argument('--tsv', help="use tsv instead of csv output", action="store_true") - parser_match.add_argument('--debug', help="show database query for debugging", 
action="store_true") - - #create the parser for the "restore" command - parser_restore = subparsers.add_parser('restore', parents=[general_parser], help='restore sequence(s) from the database.') - parser_restore.add_argument('--acc', metavar="STR", help="acession(s) whose sequences are to be restored", type=str, default=[], nargs = "+") - parser_restore.add_argument('--file', '-f', metavar="STR", help="file containing acession(s) whose sequences are to be restored (one accession per line)", type=str, default=None) - - #create the parser for the "Var2Vcf" command - parser_var2vcf = subparsers.add_parser('var2vcf', parents=[general_parser], help='export variants from the database to vcf format.') - parser_var2vcf.add_argument('--acc', metavar="STR", help="acession(s) whose sequences are to be exported", type=str, default=[], nargs = "+") - parser_var2vcf.add_argument('--file', '-f', metavar="STR", help="file containing acession(s) whose sequences are to be exported (one accession per line)", type=str, default=None) - parser_var2vcf.add_argument('--date', help="only match genomes sampled at a certain sampling date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", nargs="+", type=str, default=[]) - parser_var2vcf.add_argument('--output', '-o', metavar="STR", help="output file (merged vcf)", type=str, default=None, required=True) - parser_var2vcf.add_argument('--betaV2', help="Use in-memory computing for processing (speed up X5 times). WARNING: the function is still experimental/not fully implemented", action="store_true") - # create the parser for the "update" command - parser_update = subparsers.add_parser('update', parents=[general_parser], help='add or update meta information.') - parser_update_input = parser_update.add_mutually_exclusive_group() - parser_update_input.add_argument('--pangolin', metavar="FILE", help="import linegae information from csv file created by pangolin", type=str, default=None) - parser_update_input.add_argument('--csv', metavar="FILE", help="import metadata from a csv file", type=str, default=None) - parser_update_input.add_argument('--tsv', metavar="FILE", help="import metadata from a tsv file", type=str, default=None) - parser_update.add_argument('--fields', metavar="STR", help="if --csv or --tsv is used, define relevant columns like \"pango={colname_in_cs} zip={colname_in_cs} date={colname_in_csv}\"", type=str, nargs="+", default=None) - parser_update.add_argument('--compressed', help="compression of input file format ('none', 'gz', 'xz', default: 'auto')", choices=['none', 'gz', 'xz', 'auto'], default='auto') - - # create the parser for the "info" command - parser_info= subparsers.add_parser('info', help='show info') - parser_info.add_argument('--db', metavar="DB_DIR", help="sonar database directory (optional)", type=str, default=None) - - #create the parser for the "optimize" command - parser_opt = subparsers.add_parser('optimize', parents=[general_parser], help='optimizes the database.') - - #create the parser for the "optimize" command - parser_opt = subparsers.add_parser('db-upgrade', parents=[general_parser], help='Upgrade the database to the latest version.') - - #Update lineage information command - parser_update_anno = subparsers.add_parser('update-lineage-info', help='Update lineage information (e.g., lib/linage.all.tsv).') - - # version - parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) - - return parser.parse_args() - -class sonar(): - def __init__(self, db, gff=None, 
debug=False): - self.dbfile = db if db else mkstemp()[1] - self.db = sonardb.sonarDB(self.dbfile) - self.gff = gff - self.debug = debug - - def open_file(self, fname, mode="r", compressed=False, encoding=None): - if not os.path.isfile(fname): - sys.exit("input error: " + fname + " does not exist.") - if compressed == "auto": - compressed = os.path.splitext(fname)[1][1:] - try: - if compressed == "gz": - return gzip.open(fname, mode + "t", encoding=encoding) - if compressed == "xz": - return lzma.open(fname, mode + "t", encoding=encoding) - else: - return open(fname, mode, encoding=encoding) - except: - sys.exit("input error: " + fname + " cannot be opened.") - - - def add(self, fnames, cachedir=None, cpus=1, timeout=600, force=False, paranoid=True, quiet=False, noprogress=False, compressed=False, source=None, collection=None, lab=None): - ''' - Adds genome sequence(s) from given FASTA file(s) to the database. - If cachedir is not defined, a temporary directory will be used as cache. - ''' - - # set display options - disable_progressbar = False if not quiet and not noprogress else True - print_steps = True if not quiet and noprogress else False - - # set global update vals - updates = {} - if source: - updates['source'] = source - if lab: - updates['lab'] = lab - if collection: - updates['collection'] = collection - - # create db if necessary - step = 0 - if cachedir and os.path.isdir(cachedir): - step += 1 - if print_steps: - print("[step", str(step) + "] restoring ... ") - - with sonardb.sonarCache(cachedir) as cache, sonardb.sonarDBManager(self.dbfile) as dbm: - # db status - if not quiet: - dbstatus ={ - 'genomes': dbm.count_genomes(), - 'seqs': dbm.count_sequences(), - 'labs': dbm.count_labs(), - } - - # add fasta files to cache - step += 1 - msg = "[step " + str(step) + "] caching ... 
" - if print_steps: - print(msg) - - to_process = [] - to_import = defaultdict(set) - to_update = set() - - for i in tqdm(range(len(fnames)), desc = msg, disable = disable_progressbar): - with self.open_file(fnames[i], compressed=compressed) as handle: - for record in SeqIO.parse(handle, "fasta"): - acc = record.id - descr = record.description - seq = self.db.harmonize(record.seq) - - if len(seq) == 0: - continue - seqhash = self.db.hash(seq) - genome_data = dbm.get_genomes(acc) - if updates: - to_update.add(acc) - - if genome_data: - if genome_data['seqhash'] != seqhash: - if not force: - sys.exit("database error: " + acc + " exists in the database with a different sequence (use --force to allow updating)") - dbm.delete_genome(acc) - elif genome_data['description'] != descr: - if not force: - sys.exit("database error: " + acc + " exists in the database with a different description (use --force to allow updating)") - dbm.update_genome(acc, description = descr) - continue - else: - continue - - if dbm.seq_exists(seqhash): - cache.prep_cached_files(seqhash) - cache.write_info(seqhash) - cache.add_seq(seqhash, seq) - elif seqhash not in to_import: - algn = cache.get_algn_fname(seqhash) - fasta = cache.get_fasta_fname(seqhash) - info = cache.get_info_fname(seqhash) - - if not os.path.isfile(fasta): - unvalid_letters = sorted(self.db.check_iupac_nt_code(seq)) - if unvalid_letters: - print("[Skip] input error: " + acc + " contains non-IUPAC characters (found: " + ", ".join(unvalid_letters) + ")") - continue - # sys.exit("input error: " + acc + " contains non-IUPAC characters (found: " + ", ".join(unvalid_letters) + ")") - cache.add_seq(seqhash, seq) - to_process.append([fasta, algn, info, seqhash, timeout]) - elif SeqIO.read(fasta, "fasta").seq != seq: - sys.exit("cache error: sequence hash " + seqhash + " exists in cache but refers to a different sequence") - elif not os.path.isfile(info): - to_process.append([fasta, algn, info, seqhash, timeout]) - - to_import[seqhash].add((acc, descr)) - - step += 1 - msg = "[step " + str(step) + "] processing ..." - if print_steps: - print(msg) - pool = Pool(processes=cpus) - failed = set() - for status, seqhash in tqdm(pool.imap_unordered(self.db.multi_process_fasta_wrapper, to_process), total=len(to_process), desc = msg, disable = disable_progressbar): - if not status: - failed.update([x[1] for x in cache.cache[seqhash]]) - if failed: - print("timeout warning: following genomes were not added to the database since the respective sequence produced an timeout while aligning:", file=sys.stderr) - for f in failed: - print(f, file=sys.stderr) - - step += 1 - msg = "[step " + str(step) + "] importing ... " - if print_steps: - print(msg) - self.db.import_genome_from_cache(cache.dirname, to_import, msg=msg, dbm=dbm, disable_progressbar=disable_progressbar) - - if updates: - step += 1 - msg = "[step " + str(step) + "] updating ... 
" - if print_steps: - print(msg) - for i in tqdm(range(len(to_update)), desc = msg, disable = disable_progressbar): - dbm.update_genome(to_update.pop(), **updates) - - # db status - if not quiet: - new_dbstatus ={ - 'genomes': dbm.count_genomes(), - 'seqs': dbm.count_sequences(), - 'labs': dbm.count_labs(), - } - - print("number of genomes:") - print("\twas: " + str(dbstatus['genomes'])) - print("\tnow: " + str(new_dbstatus['genomes'])) - print("\tadded: " + str(new_dbstatus['genomes']-dbstatus['genomes'])) - print("number of unique sequences:") - print("\twas: " + str(dbstatus['seqs'])) - print("\tnow: " + str(new_dbstatus['seqs'])) - print("\tadded: " + str(new_dbstatus['seqs']-dbstatus['seqs'])) - - def remove(self, *accs): - with sonardb.sonarDBManager(self.dbfile) as dbm: - g_before = dbm.count_genomes() - for acc in set(accs): - dbm.delete_genome(acc) - g_after = dbm.count_genomes() - print(str(g_before-g_after) + " genomic entrie(s) deleted.") - - def match_genomes(self, include_profiles, exclude_profiles, accessions, lineages, with_sublineage, zips, dates, submission_dates, labs, sources, collections, technologies, platforms, chemistries, software, software_version, materials, min_ct, max_ct, seqhash, ambig, count=False, frameshifts=0, tsv=False): - rows = self.db.match(include_profiles=include_profiles, exclude_profiles=exclude_profiles, accessions=accessions, lineages=lineages, with_sublineage=with_sublineage, zips=zips, dates=dates, submission_dates=submission_dates, labs=labs, sources=sources, collections=collections, technologies=technologies, platforms=platforms, chemistries=chemistries, software=software, software_version=software_version, materials=materials, min_ct=min_ct, max_ct=max_ct, seqhashes=seqhash, ambig=ambig, count=count, frameshifts=frameshifts, debug=debug) - if count: - print(rows) - else: - self.rows_to_csv(rows, na="*** no match ***", tsv=tsv) - - def update_metadata(self, fname, accCol=None, lineageCol=None, zipCol=None, dateCol=None, submission_dateCol=None, gisaidCol=None, enaCol=None, labCol=None, sourceCol=None, collectionCol=None, technologyCol=None, platformCol=None, chemistryCol=None, softwareCol = None, versionCol = None, materialCol=None, ctCol=None, sep=",", pangolin=False, compressed=False): - updates = defaultdict(dict) - if pangolin: - with self.open_file(fname, compressed=compressed, encoding='utf-8-sig') as handle: - lines = csv.DictReader(handle, delimiter = ',', quoting=csv.QUOTE_MINIMAL) - for line in lines: - acc = line['Sequence name'].split(" ")[0] - updates[acc]['lineage'] = line['Lineage'] - elif accCol: - with self.open_file(fname, compressed=compressed) as handle: - lines = csv.DictReader(handle, delimiter = sep) - for line in lines: - acc = line[accCol] - if lineageCol and (acc not in updates or 'lineage' not in updates[acc]): - updates[acc]['lineage'] = line[lineageCol].upper() - if zipCol and line[zipCol]: - updates[acc]['zip'] = line[zipCol] - if dateCol and line[dateCol]: - updates[acc]['date'] = line[dateCol] - if submission_dateCol and line[submission_dateCol]: - updates[acc]['submission_date'] = line[submission_dateCol] - if gisaidCol and line[gisaidCol]: - updates[acc]['gisaid'] = line[gisaidCol] - if enaCol and line[enaCol]: - updates[acc]['ena'] = line[enaCol] - if collectionCol and line[collectionCol]: - updates[acc]['collection'] = line[collectionCol].upper() - if sourceCol and line[sourceCol]: - updates[acc]['source'] = line[sourceCol].upper() - if labCol and line[labCol]: - updates[acc]['lab'] = line[labCol].upper() - 
if technologyCol and line[technologyCol]: - updates[acc]['technology'] = line[technologyCol].upper() - if chemistryCol and line[chemistryCol]: - updates[acc]['chemistry'] = line[chemistryCol].upper() - if platformCol and line[platformCol]: - updates[acc]['platform'] = line[platformCol].upper() - if softwareCol and line[softwareCol]: - updates[acc]['software'] = line[softwareCol].upper() - if versionCol and line[versionCol]: - updates[acc]['version'] = line[versionCol].upper() - if materialCol: - updates[acc]['material'] = line[materialCol].upper() - if ctCol and line[ctCol]: - try: - updates[acc]['ct'] = float(line[ctCol]) - except: - sys.exit("metadata error: " + line[ctCol] + " is not a valid ct value (accession: " + acc + ")") - with sonardb.sonarDBManager(self.dbfile) as dbm: - for acc, update in updates.items(): - dbm.update_genome(acc, **update) - - def restore(self, acc): - return self.db.restore_genome_using_dnavars(acc) - - def var2vcf(self, acc, date, output, cpu, betaV2): - if betaV2: - return sonartoVCFV2.export2VCF(self.dbfile,acc, date, output, cpu,self.db.refdescr) - else: - return sonartoVCF.export2VCF(self.dbfile,acc, date, output, cpu,self.db.refdescr) - - def view(self, acc): - with sonardb.sonarDBManager(self.dbfile, readonly=True) as dbm: - self.rows_to_csv(self.db.get_dna_vars(acc, dbm=dbm)) - - def show_system_info(self): - print("sonarDB version: ", self.db.get_version()) - print("reference genome: ", self.db.refdescr) - print("reference length: ", str(len(self.db.refseq)) + "bp") - print("annotated proteins: ", ", ".join(self.db.refgffObj.symbols)) - print("used translation table:", self.db.translation_table) - - def show_db_info(self): - with sonardb.sonarDBManager(self.dbfile, readonly=True) as dbm: - print("database path: ", dbm.dbfile) - print("database version: ", dbm.get_db_version()) - print("database size: ", self.get_db_size()) - g = dbm.count_genomes() - print("genomes: ", g) - print("unique sequences: ", dbm.count_sequences()) - print("labs: ", dbm.count_labs()) - print("earliest genome import: ", dbm.get_earliest_import()) - print("latest genome import: ", dbm.get_latest_import()) - print("earliest sampling date: ", dbm.get_earliest_date()) - print("latest sampling date: ", dbm.get_latest_date()) - print("metadata: ") - fields = sorted(['lab', 'source', 'collection', 'technology', 'platform', 'chemistry', 'software', 'software_version', 'material', 'ct', 'gisaid', 'ena', 'lineage', 'zip', 'date', 'submission_date']) - maxlen = max([len(x) for x in fields]) - for field in fields: - if g == 0: - c = 0 - p = 0 - else: - c = dbm.count_metadata(field) - p = c/g*100 - spacer = " " * (maxlen-len(field)) - print(" " + field + " information:" + spacer, f"{c} ({p:.{2}f}%)") - - def rows_to_csv(self, rows, file=None, na="*** no data ***", tsv=False): - if len(rows) == 0: - print(na, file=sys.stderr) - else: - file = sys.stdout if file is None else open(file, "w") - sep = "\t" if tsv else "," - writer = csv.DictWriter(file, rows[0].keys(), delimiter=sep, lineterminator=os.linesep) - writer.writeheader() - writer.writerows(rows) - - def get_db_size(self, decimal_places=3): - size = os.path.getsize(self.dbfile) - for unit in ['B','KiB','MiB','GiB','TiB']: - if size < 1024.0: - break - size /= 1024.0 - return f"{size:.{decimal_places}f}{unit}" + parser = argparse.ArgumentParser(prog="sonar.py", description="") + subparsers = parser.add_subparsers( + help="detect, store, and screen for mutations in SARS-CoV-2 genomic sequences" + ) + subparsers.dest = "tool" + 
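The `get_db_size` method shown in this diff reports the SQLite file size in human-readable units by repeatedly dividing the byte count by 1024 until it drops below 1024 and then formatting it with the matching unit. A self-contained sketch of that scaling follows; the byte count in the assertion is a made-up example, not taken from the patch.

```python
# Sketch of the unit scaling used by sonar.get_db_size (example value is invented).
def human_readable_size(size, decimal_places=3):
    for unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
        if size < 1024.0:
            break
        size /= 1024.0
    return f"{size:.{decimal_places}f}{unit}"

assert human_readable_size(2_621_440) == "2.500MiB"  # 2.5 * 1024 * 1024 bytes
```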
subparsers.required = True + + # parent parser: db input + general_parser = argparse.ArgumentParser(add_help=False) + general_parser.add_argument( + "--db", + metavar="DB_DIR", + help="sonar database directory", + type=str, + required=True, + ) + general_parser.add_argument( + "--cpus", + metavar="int", + help="number of cpus to use (default: 1)", + type=int, + default=1, + ) + + # create the parser for the "add" command + parser_add = subparsers.add_parser( + "add", parents=[general_parser], help="add genome sequences to the database." + ) + parser_add_input = parser_add.add_mutually_exclusive_group() + parser_add_input.add_argument( + "-f", + "--file", + metavar="FILE", + help="fasta file(s) containing DNA sequences to add", + type=str, + nargs="+", + default=[], + ) + parser_add_input.add_argument( + "-d", + "--dir", + metavar="DIR", + help='add all fasta files (ending with ".fasta" or ".fna") from a given directory or directories', + type=str, + nargs="+", + default=None, + ) + parser_add.add_argument( + "-c", + "--cache", + metavar="DIR", + help="use (and restore data from) a given cache (if not set, a temporary cache is used and deleted after import)", + type=str, + default=None, + ) + parser_add.add_argument( + "-t", + "--timeout", + metavar="INT", + help="timout for aligning sequences in seconds (default: 600)", + type=int, + default=600, + ) + parser_add.add_argument( + "--compressed", + help="compression of input file format ('none', 'gz', 'xz', default: 'auto')", + choices=["none", "gz", "xz", "auto"], + default="auto", + ) + parser_add.add_argument( + "--force", + help="force updating of accessions if description or sequence has changed", + action="store_true", + ) + parser_add.add_argument( + "--noprogress", "-p", help="do not show any progress bar", action="store_true" + ) + parser_add.add_argument( + "--source", + help="define a common data source for all genomes", + type=str, + default=None, + ) + parser_add.add_argument( + "--collection", + help="define a common data collection for all genomes", + type=str, + default=None, + ) + parser_add.add_argument( + "--lab", help="define a common lab for all genomes", type=str, default=None + ) + parser_add.add_argument( + "--quiet", "-q", help="do not show any output", action="store_true" + ) + + # create the parser for the "remove" command + parser_remove = subparsers.add_parser( + "remove", + parents=[general_parser], + help="remove genome sequences to the database.", + ) + parser_remove = parser_remove.add_mutually_exclusive_group() + parser_remove.add_argument( + "--acc", + metavar="STR", + help="define accession(s) of sequences to delete", + type=str, + nargs="+", + default=[], + ) + parser_remove.add_argument( + "--file", + metavar="FILE", + help="define file containing accession(s) of sequences to delete (one per line)", + type=str, + default=None, + ) + + # create the parser for the "match" command + parser_match = subparsers.add_parser( + "match", + parents=[general_parser], + help="get mutations profiles for given accessions.", + ) + parser_match.add_argument( + "--include", + "-i", + metavar="STR", + help="match genomes sharing the given mutation profile", + type=str, + action="append", + nargs="+", + default=[], + ) + parser_match.add_argument( + "--exclude", + "-e", + metavar="STR", + help="match genomes not containing the mutation profile", + type=str, + action="append", + nargs="+", + default=[], + ) + parser_match.add_argument( + "--with-sublineage", + help="recursively get all sublineages from a given lineage 
(--lineage) (only child) ", + action="store_true", + ) + # parser_match.add_argument('--recursion', help="recursively get all sublineages of a given lineage (--lineage). this will work only if '--with-sublineage' is used",action="store_true") + parser_match.add_argument( + "--lineage", + metavar="STR", + help="match genomes of the given pangolin lineage(s) only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--acc", + metavar="STR", + help="match specific genomes defined by acession(s) only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--zip", + metavar="INT", + help="only match genomes of a given region(s) defined by zip code(s)", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--date", + help="only match genomes sampled at a certain sampling date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", + nargs="+", + type=str, + default=[], + ) + parser_match.add_argument( + "--submission_date", + help="only match genomes at a certain submission date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", + nargs="+", + type=str, + default=[], + ) + parser_match.add_argument( + "--lab", + metavar="STR", + help="match genomes of the given lab only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--source", + metavar="STR", + help="match genomes of the given data source only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--collection", + metavar="STR", + help="match genomes of the given data collection only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--technology", + metavar="STR", + help="match genomes of the given sequencing technology only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--platform", + metavar="STR", + help="match genomes of the given sequencing platform only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--chemistry", + metavar="STR", + help="match genomes of the given sequencing chemistry only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--software", + metavar="STR", + help="software used for genome reconstruction", + type=str, + default=None, + ) + parser_match.add_argument( + "--version", + metavar="STR", + help="software version used for genome reconstruction", + type=str, + default=None, + ) + parser_match.add_argument( + "--material", + metavar="STR", + help="match genomes of the given sequencing chemistry only", + type=str, + nargs="+", + default=[], + ) + parser_match.add_argument( + "--min_ct", + metavar="STR", + help="minimal ct value of samples resulting genomes are matched to", + type=float, + default=None, + ) + parser_match.add_argument( + "--max_ct", + metavar="STR", + help="maximal ct value of samples resulting genomes are matched to", + type=float, + default=None, + ) + parser_match.add_argument( + "--seqhash", + metavar="STR", + help="match specific genomes with the given seqhash(es)", + type=str, + nargs="+", + default=[], + ) + parser_match_g1 = parser_match.add_mutually_exclusive_group() + parser_match_g1.add_argument( + "--count", help="count instead of listing matching genomes", action="store_true" + ) + parser_match_g1.add_argument( + "--ambig", + help="include ambiguos sites when reporting profiles (no effect when --count is used)", + action="store_true", + ) + parser_match_g2 = 
parser_match.add_mutually_exclusive_group() + parser_match_g2.add_argument( + "--only_frameshifts", + help="show only genomes containing one or more frameshift mutations", + action="store_true", + ) + parser_match_g2.add_argument( + "--no_frameshifts", + help="show only genomes containing no frameshift mutation", + action="store_true", + ) + parser_match.add_argument( + "--tsv", help="use tsv instead of csv output", action="store_true" + ) + parser_match.add_argument( + "--debug", help="show database query for debugging", action="store_true" + ) + + # create the parser for the "restore" command + parser_restore = subparsers.add_parser( + "restore", + parents=[general_parser], + help="restore sequence(s) from the database.", + ) + parser_restore.add_argument( + "--acc", + metavar="STR", + help="acession(s) whose sequences are to be restored", + type=str, + default=[], + nargs="+", + ) + parser_restore.add_argument( + "--file", + "-f", + metavar="STR", + help="file containing acession(s) whose sequences are to be restored (one accession per line)", + type=str, + default=None, + ) + + # create the parser for the "Var2Vcf" command + parser_var2vcf = subparsers.add_parser( + "var2vcf", + parents=[general_parser], + help="export variants from the database to vcf format.", + ) + parser_var2vcf.add_argument( + "--acc", + metavar="STR", + help="acession(s) whose sequences are to be exported", + type=str, + default=[], + nargs="+", + ) + parser_var2vcf.add_argument( + "--file", + "-f", + metavar="STR", + help="file containing acession(s) whose sequences are to be exported (one accession per line)", + type=str, + default=None, + ) + parser_var2vcf.add_argument( + "--date", + help="only match genomes sampled at a certain sampling date or time frame. Accepts single dates (YYYY-MM-DD) or time spans (YYYY-MM-DD:YYYY-MM-DD).", + nargs="+", + type=str, + default=[], + ) + parser_var2vcf.add_argument( + "--output", + "-o", + metavar="STR", + help="output file (merged vcf)", + type=str, + default=None, + required=True, + ) + parser_var2vcf.add_argument( + "--betaV2", + help="Use in-memory computing for processing (speed up X5 times). WARNING: the function is still experimental/not fully implemented", + action="store_true", + ) + # create the parser for the "update" command + parser_update = subparsers.add_parser( + "update", parents=[general_parser], help="add or update meta information." 
+ ) + parser_update_input = parser_update.add_mutually_exclusive_group() + parser_update_input.add_argument( + "--pangolin", + metavar="FILE", + help="import linegae information from csv file created by pangolin", + type=str, + default=None, + ) + parser_update_input.add_argument( + "--csv", + metavar="FILE", + help="import metadata from a csv file", + type=str, + default=None, + ) + parser_update_input.add_argument( + "--tsv", + metavar="FILE", + help="import metadata from a tsv file", + type=str, + default=None, + ) + parser_update.add_argument( + "--fields", + metavar="STR", + help='if --csv or --tsv is used, define relevant columns like "pango={colname_in_cs} zip={colname_in_cs} date={colname_in_csv}"', + type=str, + nargs="+", + default=None, + ) + parser_update.add_argument( + "--compressed", + help="compression of input file format ('none', 'gz', 'xz', default: 'auto')", + choices=["none", "gz", "xz", "auto"], + default="auto", + ) + + # create the parser for the "info" command + parser_info = subparsers.add_parser("info", help="show info") + parser_info.add_argument( + "--db", + metavar="DB_DIR", + help="sonar database directory (optional)", + type=str, + default=None, + ) + + # create the parser for the "optimize" command + parser_opt = subparsers.add_parser( + "optimize", parents=[general_parser], help="optimizes the database." + ) + + # create the parser for the "optimize" command + parser_opt = subparsers.add_parser( + "db-upgrade", + parents=[general_parser], + help="Upgrade the database to the latest version.", + ) + + # Update lineage information command + parser_update_anno = subparsers.add_parser( + "update-lineage-info", + help="Update lineage information (e.g., lib/linage.all.tsv).", + ) + + # version + parser.add_argument("--version", action="version", version="%(prog)s " + VERSION) + + return parser.parse_args() + + +class sonar: + def __init__(self, db, gff=None, debug=False): + self.dbfile = db if db else mkstemp()[1] + self.db = sonardb.sonarDB(self.dbfile) + self.gff = gff + self.debug = debug + + def open_file(self, fname, mode="r", compressed=False, encoding=None): + if not os.path.isfile(fname): + sys.exit("input error: " + fname + " does not exist.") + if compressed == "auto": + compressed = os.path.splitext(fname)[1][1:] + try: + if compressed == "gz": + return gzip.open(fname, mode + "t", encoding=encoding) + if compressed == "xz": + return lzma.open(fname, mode + "t", encoding=encoding) + else: + return open(fname, mode, encoding=encoding) + except: + sys.exit("input error: " + fname + " cannot be opened.") + + def add( + self, + fnames, + cachedir=None, + cpus=1, + timeout=600, + force=False, + paranoid=True, + quiet=False, + noprogress=False, + compressed=False, + source=None, + collection=None, + lab=None, + ): + """ + Adds genome sequence(s) from given FASTA file(s) to the database. + If cachedir is not defined, a temporary directory will be used as cache. + """ + + # set display options + disable_progressbar = False if not quiet and not noprogress else True + print_steps = True if not quiet and noprogress else False + + # set global update vals + updates = {} + if source: + updates["source"] = source + if lab: + updates["lab"] = lab + if collection: + updates["collection"] = collection + + # create db if necessary + step = 0 + if cachedir and os.path.isdir(cachedir): + step += 1 + if print_steps: + print("[step", str(step) + "] restoring ... 
") + + with sonardb.sonarCache(cachedir) as cache, sonardb.sonarDBManager( + self.dbfile + ) as dbm: + # db status + if not quiet: + dbstatus = { + "genomes": dbm.count_genomes(), + "seqs": dbm.count_sequences(), + "labs": dbm.count_labs(), + } + + # add fasta files to cache + step += 1 + msg = "[step " + str(step) + "] caching ... " + if print_steps: + print(msg) + + to_process = [] + to_import = defaultdict(set) + to_update = set() + + for i in tqdm(range(len(fnames)), desc=msg, disable=disable_progressbar): + with self.open_file(fnames[i], compressed=compressed) as handle: + for record in SeqIO.parse(handle, "fasta"): + acc = record.id + descr = record.description + seq = self.db.harmonize(record.seq) + + if len(seq) == 0: + continue + seqhash = self.db.hash(seq) + genome_data = dbm.get_genomes(acc) + if updates: + to_update.add(acc) + + if genome_data: + if genome_data["seqhash"] != seqhash: + if not force: + sys.exit( + "database error: " + + acc + + " exists in the database with a different sequence (use --force to allow updating)" + ) + dbm.delete_genome(acc) + elif genome_data["description"] != descr: + if not force: + sys.exit( + "database error: " + + acc + + " exists in the database with a different description (use --force to allow updating)" + ) + dbm.update_genome(acc, description=descr) + continue + else: + continue + + if dbm.seq_exists(seqhash): + cache.prep_cached_files(seqhash) + cache.write_info(seqhash) + cache.add_seq(seqhash, seq) + elif seqhash not in to_import: + algn = cache.get_algn_fname(seqhash) + fasta = cache.get_fasta_fname(seqhash) + info = cache.get_info_fname(seqhash) + + if not os.path.isfile(fasta): + unvalid_letters = sorted( + self.db.check_iupac_nt_code(seq) + ) + if unvalid_letters: + print( + "[Skip] input error: " + + acc + + " contains non-IUPAC characters (found: " + + ", ".join(unvalid_letters) + + ")" + ) + continue + # sys.exit("input error: " + acc + " contains non-IUPAC characters (found: " + ", ".join(unvalid_letters) + ")") + cache.add_seq(seqhash, seq) + to_process.append([fasta, algn, info, seqhash, timeout]) + elif SeqIO.read(fasta, "fasta").seq != seq: + sys.exit( + "cache error: sequence hash " + + seqhash + + " exists in cache but refers to a different sequence" + ) + elif not os.path.isfile(info): + to_process.append([fasta, algn, info, seqhash, timeout]) + + to_import[seqhash].add((acc, descr)) + + step += 1 + msg = "[step " + str(step) + "] processing ..." + if print_steps: + print(msg) + pool = Pool(processes=cpus) + failed = set() + for status, seqhash in tqdm( + pool.imap_unordered(self.db.multi_process_fasta_wrapper, to_process), + total=len(to_process), + desc=msg, + disable=disable_progressbar, + ): + if not status: + failed.update([x[1] for x in cache.cache[seqhash]]) + if failed: + print( + "timeout warning: following genomes were not added to the database since the respective sequence produced an timeout while aligning:", + file=sys.stderr, + ) + for f in failed: + print(f, file=sys.stderr) + + step += 1 + msg = "[step " + str(step) + "] importing ... " + if print_steps: + print(msg) + self.db.import_genome_from_cache( + cache.dirname, + to_import, + msg=msg, + dbm=dbm, + disable_progressbar=disable_progressbar, + ) + + if updates: + step += 1 + msg = "[step " + str(step) + "] updating ... 
" + if print_steps: + print(msg) + for i in tqdm( + range(len(to_update)), desc=msg, disable=disable_progressbar + ): + dbm.update_genome(to_update.pop(), **updates) + + # db status + if not quiet: + new_dbstatus = { + "genomes": dbm.count_genomes(), + "seqs": dbm.count_sequences(), + "labs": dbm.count_labs(), + } + + print("number of genomes:") + print("\twas: " + str(dbstatus["genomes"])) + print("\tnow: " + str(new_dbstatus["genomes"])) + print("\tadded: " + str(new_dbstatus["genomes"] - dbstatus["genomes"])) + print("number of unique sequences:") + print("\twas: " + str(dbstatus["seqs"])) + print("\tnow: " + str(new_dbstatus["seqs"])) + print("\tadded: " + str(new_dbstatus["seqs"] - dbstatus["seqs"])) + + def remove(self, *accs): + with sonardb.sonarDBManager(self.dbfile) as dbm: + g_before = dbm.count_genomes() + for acc in set(accs): + dbm.delete_genome(acc) + g_after = dbm.count_genomes() + print(str(g_before - g_after) + " genomic entrie(s) deleted.") + + def match_genomes( + self, + include_profiles, + exclude_profiles, + accessions, + lineages, + with_sublineage, + zips, + dates, + submission_dates, + labs, + sources, + collections, + technologies, + platforms, + chemistries, + software, + software_version, + materials, + min_ct, + max_ct, + seqhash, + ambig, + count=False, + frameshifts=0, + tsv=False, + ): + rows = self.db.match( + include_profiles=include_profiles, + exclude_profiles=exclude_profiles, + accessions=accessions, + lineages=lineages, + with_sublineage=with_sublineage, + zips=zips, + dates=dates, + submission_dates=submission_dates, + labs=labs, + sources=sources, + collections=collections, + technologies=technologies, + platforms=platforms, + chemistries=chemistries, + software=software, + software_version=software_version, + materials=materials, + min_ct=min_ct, + max_ct=max_ct, + seqhashes=seqhash, + ambig=ambig, + count=count, + frameshifts=frameshifts, + debug=debug, + ) + if count: + print(rows) + else: + self.rows_to_csv(rows, na="*** no match ***", tsv=tsv) + + def update_metadata( + self, + fname, + accCol=None, + lineageCol=None, + zipCol=None, + dateCol=None, + submission_dateCol=None, + gisaidCol=None, + enaCol=None, + labCol=None, + sourceCol=None, + collectionCol=None, + technologyCol=None, + platformCol=None, + chemistryCol=None, + softwareCol=None, + versionCol=None, + materialCol=None, + ctCol=None, + sep=",", + pangolin=False, + compressed=False, + ): + updates = defaultdict(dict) + if pangolin: + with self.open_file( + fname, compressed=compressed, encoding="utf-8-sig" + ) as handle: + lines = csv.DictReader(handle, delimiter=",", quoting=csv.QUOTE_MINIMAL) + for line in lines: + acc = line["Sequence name"].split(" ")[0] + updates[acc]["lineage"] = line["Lineage"] + elif accCol: + with self.open_file(fname, compressed=compressed) as handle: + lines = csv.DictReader(handle, delimiter=sep) + for line in lines: + acc = line[accCol] + if lineageCol and ( + acc not in updates or "lineage" not in updates[acc] + ): + updates[acc]["lineage"] = line[lineageCol].upper() + if zipCol and line[zipCol]: + updates[acc]["zip"] = line[zipCol] + if dateCol and line[dateCol]: + updates[acc]["date"] = line[dateCol] + if submission_dateCol and line[submission_dateCol]: + updates[acc]["submission_date"] = line[submission_dateCol] + if gisaidCol and line[gisaidCol]: + updates[acc]["gisaid"] = line[gisaidCol] + if enaCol and line[enaCol]: + updates[acc]["ena"] = line[enaCol] + if collectionCol and line[collectionCol]: + updates[acc]["collection"] = 
line[collectionCol].upper() + if sourceCol and line[sourceCol]: + updates[acc]["source"] = line[sourceCol].upper() + if labCol and line[labCol]: + updates[acc]["lab"] = line[labCol].upper() + if technologyCol and line[technologyCol]: + updates[acc]["technology"] = line[technologyCol].upper() + if chemistryCol and line[chemistryCol]: + updates[acc]["chemistry"] = line[chemistryCol].upper() + if platformCol and line[platformCol]: + updates[acc]["platform"] = line[platformCol].upper() + if softwareCol and line[softwareCol]: + updates[acc]["software"] = line[softwareCol].upper() + if versionCol and line[versionCol]: + updates[acc]["version"] = line[versionCol].upper() + if materialCol: + updates[acc]["material"] = line[materialCol].upper() + if ctCol and line[ctCol]: + try: + updates[acc]["ct"] = float(line[ctCol]) + except: + sys.exit( + "metadata error: " + + line[ctCol] + + " is not a valid ct value (accession: " + + acc + + ")" + ) + with sonardb.sonarDBManager(self.dbfile) as dbm: + for acc, update in updates.items(): + dbm.update_genome(acc, **update) + + def restore(self, acc): + return self.db.restore_genome_using_dnavars(acc) + + def var2vcf(self, acc, date, output, cpu, betaV2): + if betaV2: + return sonartoVCFV2.export2VCF( + self.dbfile, acc, date, output, cpu, self.db.refdescr + ) + else: + return sonartoVCF.export2VCF( + self.dbfile, acc, date, output, cpu, self.db.refdescr + ) + + def view(self, acc): + with sonardb.sonarDBManager(self.dbfile, readonly=True) as dbm: + self.rows_to_csv(self.db.get_dna_vars(acc, dbm=dbm)) + + def show_system_info(self): + print("sonarDB version: ", self.db.get_version()) + print("reference genome: ", self.db.refdescr) + print("reference length: ", str(len(self.db.refseq)) + "bp") + print("annotated proteins: ", ", ".join(self.db.refgffObj.symbols)) + print("used translation table:", self.db.translation_table) + + def show_db_info(self): + with sonardb.sonarDBManager(self.dbfile, readonly=True) as dbm: + print("database path: ", dbm.dbfile) + print("database version: ", dbm.get_db_version()) + print("database size: ", self.get_db_size()) + g = dbm.count_genomes() + print("genomes: ", g) + print("unique sequences: ", dbm.count_sequences()) + print("labs: ", dbm.count_labs()) + print("earliest genome import: ", dbm.get_earliest_import()) + print("latest genome import: ", dbm.get_latest_import()) + print("earliest sampling date: ", dbm.get_earliest_date()) + print("latest sampling date: ", dbm.get_latest_date()) + print("metadata: ") + fields = sorted( + [ + "lab", + "source", + "collection", + "technology", + "platform", + "chemistry", + "software", + "software_version", + "material", + "ct", + "gisaid", + "ena", + "lineage", + "zip", + "date", + "submission_date", + ] + ) + maxlen = max([len(x) for x in fields]) + for field in fields: + if g == 0: + c = 0 + p = 0 + else: + c = dbm.count_metadata(field) + p = c / g * 100 + spacer = " " * (maxlen - len(field)) + print(" " + field + " information:" + spacer, f"{c} ({p:.{2}f}%)") + + def rows_to_csv(self, rows, file=None, na="*** no data ***", tsv=False): + if len(rows) == 0: + print(na, file=sys.stderr) + else: + file = sys.stdout if file is None else open(file, "w") + sep = "\t" if tsv else "," + writer = csv.DictWriter( + file, rows[0].keys(), delimiter=sep, lineterminator=os.linesep + ) + writer.writeheader() + writer.writerows(rows) + + def get_db_size(self, decimal_places=3): + size = os.path.getsize(self.dbfile) + for unit in ["B", "KiB", "MiB", "GiB", "TiB"]: + if size < 1024.0: + break + size 
/= 1024.0 + return f"{size:.{decimal_places}f}{unit}" + def process_update_expressions(expr): - allowed = {"accession": "accCol", "lineage": "lineageCol", "date": "dateCol", "submission_date": "submission_dateCol", "zip": "zipCol", "gisaid": "gisaidCol", "ena": "enaCol", "collection": "collectionCol", "technology": "technologyCol", "platform": "platformCol", "chemistry": "chemistryCol", "software": "softwareCol", "version": "versionCol", "material": "materialCol", "ct": "ctCol", "source": "sourceCol", "lab": "labCol"} - fields = {} - for val in expr: - val = val.split("=") - if val[0] not in allowed or len(val) == 1: - sys.exit("input error: " + val[0] + " is not a valid expression") - key = allowed[val[0]] - if key in fields: - sys.exit("input error: multiple assignments for " + val[0]) - fields[key] = "=".join(val[1:]) - if 'accCol' not in fields: - sys.exit("input error: an accession column has to be defined.") - return fields + allowed = { + "accession": "accCol", + "lineage": "lineageCol", + "date": "dateCol", + "submission_date": "submission_dateCol", + "zip": "zipCol", + "gisaid": "gisaidCol", + "ena": "enaCol", + "collection": "collectionCol", + "technology": "technologyCol", + "platform": "platformCol", + "chemistry": "chemistryCol", + "software": "softwareCol", + "version": "versionCol", + "material": "materialCol", + "ct": "ctCol", + "source": "sourceCol", + "lab": "labCol", + } + fields = {} + for val in expr: + val = val.split("=") + if val[0] not in allowed or len(val) == 1: + sys.exit("input error: " + val[0] + " is not a valid expression") + key = allowed[val[0]] + if key in fields: + sys.exit("input error: multiple assignments for " + val[0]) + fields[key] = "=".join(val[1:]) + if "accCol" not in fields: + sys.exit("input error: an accession column has to be defined.") + return fields + if __name__ == "__main__": - args = parse_args() - if hasattr(args, 'debug') and args.debug: - debug = True - else: - debug = False - # update-lineage-info - if args.tool == "update-lineage-info": - tmp_dirname = mkdtemp( prefix=".tmp_") - alias_key, lineage = Lineages_UPDATER.download_source(tmp_dirname) - Lineages_UPDATER.process_lineage(alias_key,lineage,'lib/lineage.all.tsv') - if os.path.isdir(tmp_dirname): - shutil.rmtree(tmp_dirname) - sys.exit('Complete!') - - - if not args.db is None and args.tool != "add" and not os.path.isfile(args.db): - sys.exit("input error: database does not exist.") - - snr = sonar(args.db, debug=debug) - - if not args.db is None: - # if Upgrade - if args.tool == "db-upgrade": - input("Warning: Backup db file before upgrading, Press Enter to continue...") - sonardb.sonarDBManager.upgrade_db(args.db) - else: - with sonardb.sonarDBManager(args.db, readonly=True) as dbm: - dbm.check_db_compatibility() - - # add - if args.tool == "add": - - # sanity check - if not args.file and not args.cache: - sys.exit("nothing to add.") - - snr.add(args.file, cachedir=args.cache, cpus=args.cpus, force=args.force, timeout=args.timeout, quiet=args.quiet, noprogress=args.noprogress, compressed=args.compressed, source=args.source, collection=args.collection, lab=args.lab) - - # remove - if args.tool == "remove": - acc = [] - if args.file: - if not os.path.isfile(args.file): - sys.exit("input error: file not found.") - with open(args.file, "r") as handle: - acc = [x.strip() for x in handle if x.strip() != ""] - elif args.acc: - acc = args.acc - if not acc: - print("nothing to delete.") - else: - snr.remove(*acc) - - # match - if args.tool == "match": - # sanity check - if 
args.date: - regex = re.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}(?::[0-9]{4}-[0-9]{2}-[0-9]{2})?$") - for d in args.date: - if "^" in d[0]: - d = d.split("^")[1] - if not regex.match(d): - sys.exit("input error: " + d + " is not a valid date (YYYY-MM-DD) or time span (YYYY-MM-DD:YYYY-MM-DD).") - if args.no_frameshifts: - frameshifts = -1 - elif args.only_frameshifts: - frameshifts = 1 - else: - frameshifts = 0 - - if args.lineage: - args.lineage = [x.upper() for x in args.lineage] - if args.lab: - args.lab = [x.upper() for x in args.lab] - if args.source: - args.source = [x.upper() for x in args.source] - if args.collection: - args.collection = [x.upper() for x in args.collection] - if args.technology: - args.technology = [x.upper() for x in args.technology] - if args.platform: - args.platform = [x.upper() for x in args.platform] - if args.chemistry: - args.chemistry = [x.upper() for x in args.chemistry] - if args.software: - args.software = args.software.upper() - if args.version: - args.version = args.version.upper() - if args.material: - args.material = [x.upper() for x in args.material] - - snr.match_genomes(include_profiles=args.include, exclude_profiles=args.exclude, accessions=args.acc, lineages=args.lineage, with_sublineage=args.with_sublineage, zips=args.zip, dates=args.date, submission_dates=args.submission_date, labs=args.lab, sources=args.source, collections=args.collection, technologies=args.technology, platforms=args.platform, chemistries=args.chemistry, software=args.software, software_version=args.version, materials=args.material, min_ct=args.min_ct, max_ct=args.max_ct, seqhash=args.seqhash, ambig=args.ambig, count=args.count, frameshifts=frameshifts, tsv=args.tsv) - - # update - if args.tool == "update": - fields={} - if args.csv: - cols = process_update_expressions(args.fields) - snr.update_metadata(args.csv, **cols, sep=",", pangolin=False, compressed=args.compressed) - elif args.tsv: - cols = process_update_expressions(args.fields) - snr.update_metadata(args.tsv, **cols, sep="\t", pangolin=False, compressed=args.compressed) - elif args.pangolin: - snr.update_metadata(args.pangolin, pangolin=True, compressed=args.compressed) - else: - print("nothing to update.") - - # restore - if args.tool == "restore": - args.acc = set([x.strip() for x in args.acc]) - if args.file: - if not os.path.isfile(args.file): - sys.exit("input error: file " + args.file + " does not exist.") - with snr.open_file(args.file, compressed=args.file,) as handle: - for line in handle: - args.acc.add(line.strip()) - if len(args.acc) == 0: - sys.exit("input error: nothing to restore.") - for acc in filter(None, args.acc): - print("\n".join(snr.restore(acc))) - - # Var2Vcf (export variants to VCF) - if args.tool == "var2vcf": - import time - start = time.time() - - if args.date: - regex = re.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}(?::[0-9]{4}-[0-9]{2}-[0-9]{2})?$") - for d in args.date: - if not regex.match(d): - sys.exit("input error: " + d + " is not a valid date (YYYY-MM-DD) or time span (YYYY-MM-DD:YYYY-MM-DD).") - - args.acc = set([x.strip() for x in args.acc]) - if args.file: - if not os.path.isfile(args.file): - sys.exit("input error: file " + args.file + " does not exist.") - with snr.open_file(args.file, compressed='auto') as handle: - for line in handle: - args.acc.add(line.strip()) - - snr.var2vcf(args.acc, args.date, args.output, args.cpus, args.betaV2) - end = time.time() - hours, rem = divmod(end-start, 3600) - minutes, seconds = divmod(rem, 60) - print("Runtime (H:M:S): 
{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)) - - # view - if args.tool == "view": - snr.view(args.acc) - - # info - if args.tool == "info": - snr.show_system_info() - if args.db: - print() - snr.show_db_info() - - # optimize - if args.tool == "optimize": - sonardb.sonarDBManager.optimize(args.db) + args = parse_args() + if hasattr(args, "debug") and args.debug: + debug = True + else: + debug = False + # update-lineage-info + if args.tool == "update-lineage-info": + tmp_dirname = mkdtemp(prefix=".tmp_") + alias_key, lineage = Lineages_UPDATER.download_source(tmp_dirname) + Lineages_UPDATER.process_lineage(alias_key, lineage, "lib/lineage.all.tsv") + if os.path.isdir(tmp_dirname): + shutil.rmtree(tmp_dirname) + sys.exit("Complete!") + + if not args.db is None and args.tool != "add" and not os.path.isfile(args.db): + sys.exit("input error: database does not exist.") + + snr = sonar(args.db, debug=debug) + + if not args.db is None: + # if Upgrade + if args.tool == "db-upgrade": + input( + "Warning: Backup db file before upgrading, Press Enter to continue..." + ) + sonardb.sonarDBManager.upgrade_db(args.db) + else: + with sonardb.sonarDBManager(args.db, readonly=True) as dbm: + dbm.check_db_compatibility() + + # add + if args.tool == "add": + + # sanity check + if not args.file and not args.cache: + sys.exit("nothing to add.") + + snr.add( + args.file, + cachedir=args.cache, + cpus=args.cpus, + force=args.force, + timeout=args.timeout, + quiet=args.quiet, + noprogress=args.noprogress, + compressed=args.compressed, + source=args.source, + collection=args.collection, + lab=args.lab, + ) + + # remove + if args.tool == "remove": + acc = [] + if args.file: + if not os.path.isfile(args.file): + sys.exit("input error: file not found.") + with open(args.file, "r") as handle: + acc = [x.strip() for x in handle if x.strip() != ""] + elif args.acc: + acc = args.acc + if not acc: + print("nothing to delete.") + else: + snr.remove(*acc) + + # match + if args.tool == "match": + # sanity check + if args.date: + regex = re.compile( + "^[0-9]{4}-[0-9]{2}-[0-9]{2}(?::[0-9]{4}-[0-9]{2}-[0-9]{2})?$" + ) + for d in args.date: + if "^" in d[0]: + d = d.split("^")[1] + if not regex.match(d): + sys.exit( + "input error: " + + d + + " is not a valid date (YYYY-MM-DD) or time span (YYYY-MM-DD:YYYY-MM-DD)." 
+ ) + if args.no_frameshifts: + frameshifts = -1 + elif args.only_frameshifts: + frameshifts = 1 + else: + frameshifts = 0 + + if args.lineage: + args.lineage = [x.upper() for x in args.lineage] + if args.lab: + args.lab = [x.upper() for x in args.lab] + if args.source: + args.source = [x.upper() for x in args.source] + if args.collection: + args.collection = [x.upper() for x in args.collection] + if args.technology: + args.technology = [x.upper() for x in args.technology] + if args.platform: + args.platform = [x.upper() for x in args.platform] + if args.chemistry: + args.chemistry = [x.upper() for x in args.chemistry] + if args.software: + args.software = args.software.upper() + if args.version: + args.version = args.version.upper() + if args.material: + args.material = [x.upper() for x in args.material] + + snr.match_genomes( + include_profiles=args.include, + exclude_profiles=args.exclude, + accessions=args.acc, + lineages=args.lineage, + with_sublineage=args.with_sublineage, + zips=args.zip, + dates=args.date, + submission_dates=args.submission_date, + labs=args.lab, + sources=args.source, + collections=args.collection, + technologies=args.technology, + platforms=args.platform, + chemistries=args.chemistry, + software=args.software, + software_version=args.version, + materials=args.material, + min_ct=args.min_ct, + max_ct=args.max_ct, + seqhash=args.seqhash, + ambig=args.ambig, + count=args.count, + frameshifts=frameshifts, + tsv=args.tsv, + ) + + # update + if args.tool == "update": + fields = {} + if args.csv: + cols = process_update_expressions(args.fields) + snr.update_metadata( + args.csv, **cols, sep=",", pangolin=False, compressed=args.compressed + ) + elif args.tsv: + cols = process_update_expressions(args.fields) + snr.update_metadata( + args.tsv, **cols, sep="\t", pangolin=False, compressed=args.compressed + ) + elif args.pangolin: + snr.update_metadata( + args.pangolin, pangolin=True, compressed=args.compressed + ) + else: + print("nothing to update.") + + # restore + if args.tool == "restore": + args.acc = set([x.strip() for x in args.acc]) + if args.file: + if not os.path.isfile(args.file): + sys.exit("input error: file " + args.file + " does not exist.") + with snr.open_file( + args.file, + compressed=args.file, + ) as handle: + for line in handle: + args.acc.add(line.strip()) + if len(args.acc) == 0: + sys.exit("input error: nothing to restore.") + for acc in filter(None, args.acc): + print("\n".join(snr.restore(acc))) + + # Var2Vcf (export variants to VCF) + if args.tool == "var2vcf": + import time + + start = time.time() + + if args.date: + regex = re.compile( + "^[0-9]{4}-[0-9]{2}-[0-9]{2}(?::[0-9]{4}-[0-9]{2}-[0-9]{2})?$" + ) + for d in args.date: + if not regex.match(d): + sys.exit( + "input error: " + + d + + " is not a valid date (YYYY-MM-DD) or time span (YYYY-MM-DD:YYYY-MM-DD)." 
+ ) + + args.acc = set([x.strip() for x in args.acc]) + if args.file: + if not os.path.isfile(args.file): + sys.exit("input error: file " + args.file + " does not exist.") + with snr.open_file(args.file, compressed="auto") as handle: + for line in handle: + args.acc.add(line.strip()) + + snr.var2vcf(args.acc, args.date, args.output, args.cpus, args.betaV2) + end = time.time() + hours, rem = divmod(end - start, 3600) + minutes, seconds = divmod(rem, 60) + print( + "Runtime (H:M:S): {:0>2}:{:0>2}:{:05.2f}".format( + int(hours), int(minutes), seconds + ) + ) + + # view + if args.tool == "view": + snr.view(args.acc) + + # info + if args.tool == "info": + snr.show_system_info() + if args.db: + print() + snr.show_db_info() + + # optimize + if args.tool == "optimize": + sonardb.sonarDBManager.optimize(args.db) From aaf14d4465bbf173302b1d359b5c071f93a25f9a Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 24 Oct 2023 13:49:51 +0200 Subject: [PATCH 2/3] Set execute bit on sonardb.py, as it is run directly in the testing script --- lib/sonardb.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 lib/sonardb.py diff --git a/lib/sonardb.py b/lib/sonardb.py old mode 100644 new mode 100755 From cc504148bb4e667453169bc950e15f5ccbaecd18 Mon Sep 17 00:00:00 2001 From: "Huska, Matthew" Date: Tue, 24 Oct 2023 14:08:15 +0200 Subject: [PATCH 3/3] Add blame ignore file to let us ignore black formatting changes --- .git-blame-ignore-revs | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..f26f558 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,12 @@ +# A set of commits to ignore when running "git blame" +# To use this file by default for this repo, run the command (in the project +# dir): +# +# $ git config blame.ignoreRevsFile .git-blame-ignore-revs +# +# To disable this feature, run the command: +# +# $ git config blame.ignoreRevsFile "" + +# Reformat code using black +921c47b2cd31a52d74e7dd42a4cdbf2e756e2209
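
For reference, a minimal sketch of how the ignore file added by the last patch can be exercised once the series is applied, assuming git 2.23 or newer (the version that introduced the `--ignore-rev` and `--ignore-revs-file` blame options); the commit hash is the black reformatting commit listed in `.git-blame-ignore-revs`:

```sh
# use the ignore file for a single blame run instead of setting the
# repo-wide blame.ignoreRevsFile config shown in the file's comments
git blame --ignore-revs-file .git-blame-ignore-revs lib/sonardb.py

# or skip the black reformatting commit directly by its hash
git blame --ignore-rev 921c47b2cd31a52d74e7dd42a4cdbf2e756e2209 sonar.py
```

With either invocation, lines whose last change was only the reformatting are attributed to their earlier, pre-black commits rather than to the reformat itself.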