Skip to content

Commit

Permalink
Added gget info return for flybase and wormbase gene IDs
Browse files Browse the repository at this point in the history
  • Loading branch information
lauraluebbert committed May 28, 2022
1 parent 76bc48c commit 4ccbb03
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 10 deletions.
40 changes: 34 additions & 6 deletions gget/gget_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,17 @@ def info(ens_ids, expand=False, wrap_text=False, json=False, verbose=True, save=
try:
df_temp = rest_query(server, query, content_type)

# Add Ensembl ID with latest version number to df_temp
ensembl_id_dict = {
"ensembl_id": str(df_temp["id"]) + "." + str(df_temp["version"])
}
df_temp.update(ensembl_id_dict)
try:
# Add Ensembl ID with latest version number to df_temp
ensembl_id_dict = {
"ensembl_id": str(df_temp["id"]) + "." + str(df_temp["version"])
}
df_temp.update(ensembl_id_dict)

except KeyError:
# Just add Ensembl ID if no version found
ensembl_id_dict = {"ensembl_id": str(df_temp["id"])}
df_temp.update(ensembl_id_dict)

# If query returns in an error:
except RuntimeError:
Expand Down Expand Up @@ -137,7 +143,29 @@ def info(ens_ids, expand=False, wrap_text=False, json=False, verbose=True, save=
):
if id_type == "Gene" or id_type == "Transcript":

if master_dict[ens_id]["species"] == "homo_sapiens":
# Check if this is a WormBase ID:
if ens_id.startswith("WB"):
if id_type == "Gene":
df_uniprot = get_uniprot_info(
UNIPROT_REST_API, ens_id, id_type="WB_Gene", verbose=verbose
)

else:
df_uniprot = get_uniprot_info(
UNIPROT_REST_API,
ens_id,
id_type="WB_Transcript",
verbose=verbose,
)

# Check if this is a flybase ID:
elif ens_id.startswith("FB"):
df_uniprot = get_uniprot_info(
UNIPROT_REST_API, ens_id, id_type="Flybase", verbose=verbose
)

# Check if this ID requires a version number
elif master_dict[ens_id]["species"] == "homo_sapiens":
df_uniprot = get_uniprot_info(
UNIPROT_REST_API, uniprot_ens_id, id_type=id_type, verbose=verbose
)
Expand Down
22 changes: 20 additions & 2 deletions gget/gget_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,17 @@ def seq(
f"{ensembl_ID} not recognized as either a gene or transcript ID. It will not be included in the UniProt query."
)

# Check if this is a WormBase ID:
if ensembl_ID.startswith("WB"):
id_type = "wormbase"
# Check if this is a flybase ID:
elif ensembl_ID.startswith("FB"):
id_type = "flybase"
else:
id_type = "ensembl"

# Fetch the amino acid sequences of the transcript Ensembl IDs
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids)
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids, id_type=id_type)

if isoforms == True:
# List to collect transcript IDs
Expand Down Expand Up @@ -312,8 +321,17 @@ def seq(
f"{ensembl_ID} not recognized as either a gene or transcript ID. It will not be included in the UniProt query."
)

# Check if this is a WormBase ID:
if ensembl_ID.startswith("WB"):
id_type = "wormbase"
# Check if this is a flybase ID:
elif ensembl_ID.startswith("FB"):
id_type = "flybase"
else:
id_type = "ensembl"

# Fetch amino acid sequences of all isoforms from the UniProt REST API
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids)
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids, id_type=id_type)

# Check if fewer results were found than IDs put in
if len(df_uniprot) != len(trans_ids) and len(df_uniprot) > 0:
Expand Down
23 changes: 21 additions & 2 deletions gget/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def aa_colors(amino_acid):
return f"\033[38;5;{textcolor}m\033[48;5;{bkg_color}m{amino_acid}\033[0;0m"


def get_uniprot_seqs(server, ensembl_ids):
def get_uniprot_seqs(server, ensembl_ids, id_type="ensembl"):
"""
Retrieve UniProt sequences based on Ensembl identifiers.
Expand All @@ -149,11 +149,23 @@ def get_uniprot_seqs(server, ensembl_ids):
if type(ensembl_ids) == str:
ensembl_ids = [ensembl_ids]

# Define Ensembl ID type
if id_type == "ensembl":
ens_id_type = "ENSEMBL_TRS_ID"
elif id_type == "flybase":
ens_id_type = "FLYBASE_ID"
elif id_type == "wormbase":
ens_id_type = "WORMBASE_TRS_ID"
else:
raise ValueError(
f"ID type defined as {id_type}. Expected one of: ensembl, flybase, wormbase"
)

# Define query arguments
# Columns documentation: https://www.uniprot.org/help/uniprotkb%5Fcolumn%5Fnames
# from/to IDs documentation: https://www.uniprot.org/help/api_idmapping
query_args = {
"from": "ENSEMBL_TRS_ID",
"from": ens_id_type,
"to": "ACC",
"format": "tab",
"query": " ".join(ensembl_ids),
Expand Down Expand Up @@ -234,6 +246,13 @@ def get_uniprot_info(server, ensembl_id, id_type, verbose=True):
ens_id_type = "ENSEMBL_ID"
elif id_type == "Transcript":
ens_id_type = "ENSEMBL_TRS_ID"
elif id_type == "Flybase":
ens_id_type = "FLYBASE_ID"
elif id_type == "WB_Gene":
ens_id_type = "WORMBASE_ID"
elif id_type == "WB_Transcript":
ens_id_type = "WORMBASE_TRS_ID"

else:
logging.warning(
f"Ensembl_ID '{ensembl_id}' was not recognized as either gene nor transcript. Gene name synonyms and description will not be fetched from UniProt."
Expand Down
57 changes: 57 additions & 0 deletions tests/test_gget_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,63 @@
class TestInfo(unittest.TestCase):
maxDiff = None

def test_info_WB_gene(self):
    """Compare the `info` output for the ID "WBGene00043981" to a fixed row."""
    # Columns containing NaN are removed first, because np.nan != np.nan
    # would make the list comparison fail.
    observed = info("WBGene00043981").dropna(axis=1).values.tolist()

    # The single row expected to remain after dropping the NaN columns.
    expected_row = [
        "WBGene00043981",
        "Q5WRS0",
        "3565421",
        "caenorhabditis_elegans",
        "WBcel235",
        "aaim-1",
        "T14E8.4",
        ["T14E8.4", "aaim-1"],
        "Protein aaim-1 (Antibacterial and aids invasion by microsporidia 1 protein)",
        "Uncharacterized protein [Source:NCBI gene (formerly Entrezgene);Acc:3565421]",
        "FUNCTION: Plays a role in promoting resistance to bacterial pathogens such as P.aeruginosa by inhibiting bacterial intestinal colonization. {ECO:0000269|PubMed:34994689}.; FUNCTION: (Microbial infection) Promotes infection by microsporidian pathogens such as N.parisii in the early larval stages of development (PubMed:34994689). Involved in ensuring the proper orientation and location of the spore proteins of N.parisii during intestinal cell invasion (PubMed:34994689). {ECO:0000269|PubMed:34994689}.",
        "Gene",
        "protein_coding",
        "T14E8.4.1.",
        "X",
        -1,
        6559466,
        6562428,
    ]

    self.assertListEqual(observed, [expected_row])

def test_info_FB_gene(self):
    """Compare the `info` output for the ID "FBgn0003656" to a fixed row."""
    # Columns containing NaN are removed first, because np.nan != np.nan
    # would make the list comparison fail.
    observed = info("FBgn0003656").dropna(axis=1).values.tolist()

    # The single row expected to remain after dropping the NaN columns.
    expected_row = [
        "FBgn0003656",
        "Q9U969",
        "31716",
        "drosophila_melanogaster",
        "BDGP6.32",
        "sws",
        "sws",
        ["CG2212", "Dmel\\CG2212", "PNPLA6", "SWS", "Sws", "olfE", "sws"],
        "Neuropathy target esterase sws (Swiss cheese) (DSWS) (EC 3.1.1.5)",
        "swiss cheese",
        "FUNCTION: Phospholipase B that deacylates intracellular phosphatidylcholine (PtdCho), generating glycerophosphocholine (GroPtdCho). This deacylation occurs at both sn-2 and sn-1 positions of PtdCho. Its specific chemical modification by certain organophosphorus (OP) compounds leads to distal axonopathy. Plays a role in the signaling mechanism between neurons and glia that regulates glia wrapping during development of the adult brain. Essential for membrane lipid homeostasis and cell survival in both neurons and glia of the adult brain. {ECO:0000269|PubMed:15772346, ECO:0000269|PubMed:18945896, ECO:0000269|PubMed:9295388}.",
        "Enables lysophospholipase activity and protein kinase A catalytic subunit binding activity. Involved in several processes, including negative regulation of cAMP-dependent protein kinase activity; photoreceptor cell maintenance; and sensory perception of smell. Located in endoplasmic reticulum membrane and plasma membrane. Is expressed in adult head and interface glial cell. Used to study blindness; cerebellar ataxia; hereditary spastic paraplegia; and neurodegenerative disease. Human ortholog(s) of this gene implicated in Boucher-Neuhauser syndrome; Laurence-Moon syndrome; Oliver-McFarlane syndrome; and hereditary spastic paraplegia 39. Orthologous to human PNPLA6 (patatin like phospholipase domain containing 6) and PNPLA7 (patatin like phospholipase domain containing 7). [provided by Alliance of Genome Resources, Apr 2022]",
        "Gene",
        "protein_coding",
        "FBtr0071125.",
        "X",
        -1,
        7956820,
        7968236,
    ]

    self.assertListEqual(observed, [expected_row])

def test_info_gene(self):
df = info("ENSMUSG00000000001")
# Drop NaN columns, since np.nan != np.nan
Expand Down

0 comments on commit 4ccbb03

Please sign in to comment.