Added gget info return for flybase and wormbase gene IDs

pachterlab · May 28, 2022 · 4ccbb03 · 4ccbb03
1 parent 76bc48c
commit 4ccbb03
Show file tree

Hide file tree

Showing 4 changed files with 132 additions and 10 deletions.
diff --git a/gget/gget_info.py b/gget/gget_info.py
@@ -77,11 +77,17 @@ def info(ens_ids, expand=False, wrap_text=False, json=False, verbose=True, save=
         try:
             df_temp = rest_query(server, query, content_type)
 
-            # Add Ensembl ID with latest version number to df_temp
-            ensembl_id_dict = {
-                "ensembl_id": str(df_temp["id"]) + "." + str(df_temp["version"])
-            }
-            df_temp.update(ensembl_id_dict)
+            try:
+                # Add Ensembl ID with latest version number to df_temp
+                ensembl_id_dict = {
+                    "ensembl_id": str(df_temp["id"]) + "." + str(df_temp["version"])
+                }
+                df_temp.update(ensembl_id_dict)
+
+            except KeyError:
+                # Just add Ensembl ID if no version found
+                ensembl_id_dict = {"ensembl_id": str(df_temp["id"])}
+                df_temp.update(ensembl_id_dict)
 
         # If query returns in an error:
         except RuntimeError:
@@ -137,7 +143,29 @@ def info(ens_ids, expand=False, wrap_text=False, json=False, verbose=True, save=
     ):
         if id_type == "Gene" or id_type == "Transcript":
 
-            if master_dict[ens_id]["species"] == "homo_sapiens":
+            # Check if this is a wormbase ID:
+            if ens_id.startswith("WB"):
+                if id_type == "Gene":
+                    df_uniprot = get_uniprot_info(
+                        UNIPROT_REST_API, ens_id, id_type="WB_Gene", verbose=verbose
+                    )
+
+                else:
+                    df_uniprot = get_uniprot_info(
+                        UNIPROT_REST_API,
+                        ens_id,
+                        id_type="WB_Transcript",
+                        verbose=verbose,
+                    )
+
+            # Check if this is a flybase ID:
+            elif ens_id.startswith("FB"):
+                df_uniprot = get_uniprot_info(
+                    UNIPROT_REST_API, ens_id, id_type="Flybase", verbose=verbose
+                )
+
+            # Check if this ID requires a version number
+            elif master_dict[ens_id]["species"] == "homo_sapiens":
                 df_uniprot = get_uniprot_info(
                     UNIPROT_REST_API, uniprot_ens_id, id_type=id_type, verbose=verbose
                 )

diff --git a/gget/gget_seq.py b/gget/gget_seq.py
@@ -254,8 +254,17 @@ def seq(
                         f"{ensembl_ID} not recognized as either a gene or transcript ID. It will not be included in the UniProt query."
                     )
 
+            # Check if this is a Wormbase ID:
+            if ensembl_ID.startswith("WB"):
+                id_type = "wormbase"
+            # Check if this is a flybase ID:
+            elif ensembl_ID.startswith("FB"):
+                id_type = "flybase"
+            else:
+                id_type = "ensembl"
+
             # Fetch the amino acid sequences of the transcript Ensembl IDs
-            df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids)
+            df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids, id_type=id_type)
 
         if isoforms == True:
             # List to collect transcript IDs
@@ -312,8 +321,17 @@ def seq(
                         f"{ensembl_ID} not recognized as either a gene or transcript ID. It will not be included in the UniProt query."
                     )
 
+            # Check if this is a Wormbase ID:
+            if ensembl_ID.startswith("WB"):
+                id_type = "wormbase"
+            # Check if this is a flybase ID:
+            elif ensembl_ID.startswith("FB"):
+                id_type = "flybase"
+            else:
+                id_type = "ensembl"
+
             # Fetch amino acid sequences of all isoforms from the UniProt REST API
-            df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids)
+            df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, trans_ids, id_type=id_type)
 
         # Check if less results were found than IDs put in
         if len(df_uniprot) != len(trans_ids) and len(df_uniprot) > 0:

diff --git a/gget/utils.py b/gget/utils.py
@@ -132,7 +132,7 @@ def aa_colors(amino_acid):
     return f"\033[38;5;{textcolor}m\033[48;5;{bkg_color}m{amino_acid}\033[0;0m"
 
 
-def get_uniprot_seqs(server, ensembl_ids):
+def get_uniprot_seqs(server, ensembl_ids, id_type="ensembl"):
     """
     Retrieve UniProt sequences based on Ensembl identifiers.
 
@@ -149,11 +149,23 @@ def get_uniprot_seqs(server, ensembl_ids):
     if type(ensembl_ids) == str:
         ensembl_ids = [ensembl_ids]
 
+    # Define Ensembl ID type
+    if id_type == "ensembl":
+        ens_id_type = "ENSEMBL_TRS_ID"
+    elif id_type == "flybase":
+        ens_id_type = "FLYBASE_ID"
+    elif id_type == "wormbase":
+        ens_id_type = "WORMBASE_TRS_ID"
+    else:
+        raise ValueError(
+            f"ID type defined as {id_type}. Expected one of: ensembl, flybase, wormbase"
+        )
+
     # Define query arguments
     # Columns documentation: https://www.uniprot.org/help/uniprotkb%5Fcolumn%5Fnames
     # from/to IDs documentation: https://www.uniprot.org/help/api_idmapping
     query_args = {
-        "from": "ENSEMBL_TRS_ID",
+        "from": ens_id_type,
         "to": "ACC",
         "format": "tab",
         "query": " ".join(ensembl_ids),
@@ -234,6 +246,13 @@ def get_uniprot_info(server, ensembl_id, id_type, verbose=True):
         ens_id_type = "ENSEMBL_ID"
     elif id_type == "Transcript":
         ens_id_type = "ENSEMBL_TRS_ID"
+    elif id_type == "Flybase":
+        ens_id_type = "FLYBASE_ID"
+    elif id_type == "WB_Gene":
+        ens_id_type = "WORMBASE_ID"
+    elif id_type == "WB_Transcript":
+        ens_id_type = "WORMBASE_TRS_ID"
+
     else:
         logging.warning(
             f"Ensembl_ID '{ensembl_id}' was not recognized as either gene nor transcript. Gene name synonyms and description will not be fetched from UniProt."

diff --git a/tests/test_gget_info.py b/tests/test_gget_info.py
@@ -5,6 +5,63 @@
 class TestInfo(unittest.TestCase):
     maxDiff = None
 
+    def test_info_WB_gene(self):
+        df = info("WBGene00043981")
+        # Drop NaN columns, since np.nan != np.nan
+        result_to_test = df.dropna(axis=1).values.tolist()
+        expected_result = [
+            [
+                "WBGene00043981",
+                "Q5WRS0",
+                "3565421",
+                "caenorhabditis_elegans",
+                "WBcel235",
+                "aaim-1",
+                "T14E8.4",
+                ["T14E8.4", "aaim-1"],
+                "Protein aaim-1 (Antibacterial and aids invasion by microsporidia 1 protein)",
+                "Uncharacterized protein [Source:NCBI gene (formerly Entrezgene);Acc:3565421]",
+                "FUNCTION: Plays a role in promoting resistance to bacterial pathogens such as P.aeruginosa by inhibiting bacterial intestinal colonization. {ECO:0000269|PubMed:34994689}.; FUNCTION: (Microbial infection) Promotes infection by microsporidian pathogens such as N.parisii in the early larval stages of development (PubMed:34994689). Involved in ensuring the proper orientation and location of the spore proteins of N.parisii during intestinal cell invasion (PubMed:34994689). {ECO:0000269|PubMed:34994689}.",
+                "Gene",
+                "protein_coding",
+                "T14E8.4.1.",
+                "X",
+                -1,
+                6559466,
+                6562428,
+            ]
+        ]
+        self.assertListEqual(result_to_test, expected_result)
+
+    def test_info_FB_gene(self):
+        df = info("FBgn0003656")
+        # Drop NaN columns, since np.nan != np.nan
+        result_to_test = df.dropna(axis=1).values.tolist()
+        expected_result = [
+            [
+                "FBgn0003656",
+                "Q9U969",
+                "31716",
+                "drosophila_melanogaster",
+                "BDGP6.32",
+                "sws",
+                "sws",
+                ["CG2212", "Dmel\\CG2212", "PNPLA6", "SWS", "Sws", "olfE", "sws"],
+                "Neuropathy target esterase sws (Swiss cheese) (DSWS) (EC 3.1.1.5)",
+                "swiss cheese",
+                "FUNCTION: Phospholipase B that deacylates intracellular phosphatidylcholine (PtdCho), generating glycerophosphocholine (GroPtdCho). This deacylation occurs at both sn-2 and sn-1 positions of PtdCho. Its specific chemical modification by certain organophosphorus (OP) compounds leads to distal axonopathy. Plays a role in the signaling mechanism between neurons and glia that regulates glia wrapping during development of the adult brain. Essential for membrane lipid homeostasis and cell survival in both neurons and glia of the adult brain. {ECO:0000269|PubMed:15772346, ECO:0000269|PubMed:18945896, ECO:0000269|PubMed:9295388}.",
+                "Enables lysophospholipase activity and protein kinase A catalytic subunit binding activity. Involved in several processes, including negative regulation of cAMP-dependent protein kinase activity; photoreceptor cell maintenance; and sensory perception of smell. Located in endoplasmic reticulum membrane and plasma membrane. Is expressed in adult head and interface glial cell. Used to study blindness; cerebellar ataxia; hereditary spastic paraplegia; and neurodegenerative disease. Human ortholog(s) of this gene implicated in Boucher-Neuhauser syndrome; Laurence-Moon syndrome; Oliver-McFarlane syndrome; and hereditary spastic paraplegia 39. Orthologous to human PNPLA6 (patatin like phospholipase domain containing 6) and PNPLA7 (patatin like phospholipase domain containing 7). [provided by Alliance of Genome Resources, Apr 2022]",
+                "Gene",
+                "protein_coding",
+                "FBtr0071125.",
+                "X",
+                -1,
+                7956820,
+                7968236,
+            ]
+        ]
+        self.assertListEqual(result_to_test, expected_result)
+
     def test_info_gene(self):
         df = info("ENSMUSG00000000001")
         # Drop NaN columns, since np.nan != np.nan