-
Notifications
You must be signed in to change notification settings - Fork 0
/
suspectlist_curation.py
279 lines (219 loc) · 9.85 KB
/
suspectlist_curation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# -*- coding: utf-8 -*-
"""Python-Exercise-SuspectList-Curation.ipynb
Automatically generated by Colaboratory.
# Suspect list Curation
## Installation of packages
This is a Python script in the form of a Google Colab notebook. A notebook shows you results as soon as you run the code, just like a Python console. First, check which Python version is used in this notebook.
"""
from platform import python_version

# Report which Python interpreter version this notebook/script runs under.
interpreter_version = python_version()
print(interpreter_version)
"""Once the packages are installed, you need to import them every time you open the notebook or script."""
# pandas package is used to read the csv files easily and modify them
import pandas as pd
# pubchempy is a package for the PubChem database: it lets us access all entries from within Python in an automated manner, without going to the website. This kind of interface is called an API (application programming interface).
import pubchempy as pcp
# numpy is very similar to pandas, it also helps organize the data in a user friendly manner
import numpy as np
# NOTE: despite its name, this function does NOT test whether the input is a
# string. It detects NaN ("not a number") values: NaN is the only common value
# that compares unequal to itself, so `x != x` is True exactly when x is NaN.
def isNaN(string):
    """Return True if the given value is NaN (an empty cell in pandas), else False."""
    return string != string
#os and glob packages are used to fetch directories
import os
import glob
import re
from pybatchclassyfire import *
# csv package for certain csv files
import csv
# time package
import time
# Reading json files
import json
# normalizing json files
from pandas import json_normalize
# miscellaneous packages we might need for general-purpose functions
import openpyxl
import statistics
import sys
#RDKit is a python cheminformatics toolkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools
# Load the suspect list from disk into a DataFrame.
sl = pd.read_csv('SuspectListCSV.csv')
# Inspect all column names in the suspect list (notebook-style bare expression).
sl.columns
# Add an (initially empty) "synonyms" column that will hold PubChem synonyms.
sl["synonyms"] = np.nan
# Add a "PubChemPY" column that records, per row, what kind of alteration or
# lookup was performed on that entry.
sl["PubChemPY"] = np.nan
# Collector for rows whose SMILES lookup fails for any reason
# (malformed SMILES syntax or a PubChem/pubchempy error).
no_entries_pb = []
# Look up every row that has a SMILES string in PubChem and copy the compound's
# identifiers and properties back into the suspect list.
#
# Make sure the synonyms column can hold Python lists (PubChem returns a list
# of synonyms); the column was created as float NaN, which cannot store lists.
sl["synonyms"] = sl["synonyms"].astype(object)
for index, row in sl.iterrows():
    # Skip rows whose SMILES cell is empty (isNaN is True only for NaN cells).
    if not isNaN(sl["SMILES"][index]):
        # Save the SMILES of this entry for the query and for error reporting.
        sm = sl["SMILES"][index]
        try:
            # Query PubChem by SMILES via the pubchempy API.
            comp = pcp.get_compounds(sm, 'smiles')
            # Non-empty result: PubChem has an entry for this SMILES.
            if comp:
                for c in comp:
                    # Use .at for cell assignment: the original chained
                    # indexing (sl["col"][index] = ...) can silently write to
                    # a copy and triggers pandas' SettingWithCopyWarning.
                    sl.at[index, "PubChemId"] = c.cid
                    sl.at[index, "InChI"] = c.inchi
                    sl.at[index, "iupac"] = c.iupac_name
                    sl.at[index, "Molecular mass"] = c.molecular_weight
                    sl.at[index, "Monoisotopic_mass"] = c.monoisotopic_mass
                    sl.at[index, "Formula"] = c.molecular_formula
                    sl.at[index, "synonyms"] = c.synonyms
                    # Record that this row was filled from PubChem properties.
                    sl.at[index, "PubChemPY"] = "checked comp.properties"
            else:
                # PubChem returned nothing for this SMILES.
                sl.at[index, "PubChemPY"] = "no entry in PubChem"
        # If pubchempy raises (bad SMILES syntax, network/service error),
        # remember the row, the error and the offending SMILES for later curation.
        except Exception as e:
            no_entries_pb.append({
                "i": index,
                "error": e,
                "smiles": sm
            })
no_entries = pd.DataFrame(no_entries_pb)
# Display the updated suspect list (notebook-style bare expression).
sl
"""We deal with ``` no_entries``` dataframe later; this contains wrong SMILES or PubChempy error we can check later. Lets first focus on entries with no SMILES in the suspect list, we can use the names to check the pubchem and add relevant information
"""
# Collect the rows that have no SMILES at all; we will try to resolve those
# by compound name in the next step.
no_smiles_sl = []
for index, row in sl.iterrows():
    # isNaN is True when the SMILES cell is empty (NaN).
    if isNaN(sl["SMILES"][index]):
        # Mark the row so we know its data could not come from a SMILES lookup.
        # .at avoids pandas chained assignment, which may write to a copy.
        sl.at[index, "PubChemPY"] = "no smiles in sl"
        # Keep the original row index and the compound name for the
        # name-based lookup below.
        no_smiles_sl.append({
            "i": index,
            "names": sl["Name"][index]
        })
no_smiles = pd.DataFrame(no_smiles_sl)
# Display the collected rows (notebook-style bare expression).
no_smiles
"""Above you can see a nitrite reductase. For that you can manually check uniprot and see what are the reactants and products for this enzyme and put nitrute reductase in the enzyme section"""
# Errors raised during the name-based PubChem lookups are collected here.
no_entries_nm = []
# Ensure the synonyms column can hold Python lists (idempotent; see the
# SMILES-lookup step above for the same conversion).
sl["synonyms"] = sl["synonyms"].astype(object)
# For each compound that had no SMILES, try to resolve it by name instead.
# The "i" column of no_smiles holds the row's original index in `sl`, so we
# can address the suspect-list row directly instead of re-scanning all of sl
# for every entry (the original nested loop was accidentally O(n^2)).
for index, row in no_smiles.iterrows():
    idx = no_smiles["i"][index]
    # Skip enzymes: names containing "ase" denote enzymes, which have no
    # single small-molecule structure to fetch from PubChem.
    if "ase" in no_smiles["names"][index]:
        continue
    # Save the compound name for the query and for error reporting.
    nameSL = sl["Name"][idx]
    try:
        # Query PubChem by compound name via the pubchempy API.
        n_comp = pcp.get_compounds(nameSL, 'name')
        # A PubChem entry exists under this name: copy its properties in.
        if n_comp:
            for c in n_comp:
                # .at avoids pandas chained assignment (may write to a copy).
                sl.at[idx, "PubChemId"] = c.cid
                sl.at[idx, "InChI"] = c.inchi
                sl.at[idx, "iupac"] = c.iupac_name
                sl.at[idx, "SMILES"] = c.isomeric_smiles
                sl.at[idx, "Molecular mass"] = c.molecular_weight
                sl.at[idx, "Monoisotopic_mass"] = c.monoisotopic_mass
                sl.at[idx, "Formula"] = c.molecular_formula
                sl.at[idx, "synonyms"] = c.synonyms
                sl.at[idx, "PubChemPY"] = "checked comp.properties"
        else:
            # PubChem has no entry under this name.
            sl.at[idx, "PubChemPY"] = "no NAME in PubChem"
    except Exception as e:
        # BUG FIX: errors were previously appended to no_entries_pb, which
        # left no_entries_nm permanently empty and made the "no errors with
        # names" check below vacuous. Append to the correct list.
        no_entries_nm.append({
            "i": idx,
            "error": e,
            "names": nameSL
        })
no_entriesNMS = pd.DataFrame(no_entries_nm)
# If this is empty, no errors occurred during the name-based lookups.
no_entriesNMS
# Going back to the rows whose SMILES lookup raised an error
# (notebook-style bare expression to display them):
no_entries
# We keep the first compound as it is, and we do not change compounds 35 and 36:
# their SMILES contain '*', the SMILES wildcard that symbolizes an unknown atom
# of any element.
"""## classification
Let's add a classification to the compounds. ChemONT is a chemical classification ontology used by a tool called ClassyFire. We can use ClassyFire via its web service, or directly in Python as we will do in this notebook.
"""
# Build a table of (index, smiles, inchi, inchikey) for every row that has a
# SMILES string, converting with RDKit. Rows whose SMILES cannot be converted
# are skipped (best-effort pass).
inchis = []
# Work on the suspect list under the name used by the classification code below.
frame = sl
for i, row in frame.iterrows():
    smiles = frame['SMILES'][i]
    # Only rows that actually have a SMILES entry can be converted.
    if not isNaN(smiles):
        try:
            # SMILES -> Mol -> InChI -> InChIKey using RDKit. MolFromSmiles
            # returns None for unparseable SMILES, which typically makes
            # MolToInchi raise, so bad SMILES rows are skipped below.
            InChI = Chem.MolToInchi(Chem.MolFromSmiles(smiles))
            InChIKey = Chem.inchi.InchiToInchiKey(InChI)
            # Store the identifiers; the list becomes a DataFrame afterwards.
            inchis.append({
                'index': i,
                'smiles': smiles,
                'inchi': InChI,
                'inchikey': InChIKey
            })
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; conversion failures are deliberately ignored.
            pass
inchis = pd.DataFrame(inchis)
# If the inchis dataframe has entries, keep only the rows where an InChIKey
# was actually produced.
if len(inchis):
    # Use `~` for boolean negation of the mask: the original unary minus
    # (`-mask`) on a boolean Series raises TypeError in modern pandas.
    inchis = inchis.loc[~isNaN(inchis['inchikey'])]
## Retrieve ClassyFire classifications ##
# This first step is done using inchikey and interrogation of the gnps classified structures
# Endpoint configuration for the ClassyFire lookup. NOTE(review): the GNPS
# proxy is presumably a mirror of already-classified structures — confirm in
# the pybatchclassyfire documentation.
gnps_proxy = True
url = "http://classyfire.wishartlab.com"
proxy_url = "https://gnps-classyfire.ucsd.edu"
# Batch size per request and pause between requests (web-service rate limiting).
chunk_size = 1000
sleep_interval = 12
# Unique InChIKeys to classify.
all_inchi_keys = list(inchis['inchikey'].drop_duplicates())
# Seeded with two zeros so the [-2]/[-3] look-backs below are valid from the
# first iteration onwards.
resolved_ik_number_list = [0, 0]
total_inchikey_number = len(all_inchi_keys)
# Keep querying until the number of resolved InChIKeys stops improving.
while True:
    # get_classifications_cf_mod / cleanse come from pybatchclassyfire;
    # results accumulate in 'all_json.json'.
    get_classifications_cf_mod(all_inchi_keys, par_level = 6)
    cleanse('all_json.json', 'all_json.json')
    with open("all_json.json") as tweetfile:
        jsondic = json.loads(tweetfile.read())
    df = json_normalize(jsondic)
    df = df.drop_duplicates( 'inchikey' )
    # Count how many distinct InChIKeys have been resolved so far.
    resolved_ik_number = len( df.drop_duplicates('inchikey').inchikey )
    resolved_ik_number_list.append( resolved_ik_number )
    # Stop when the count decreased, or equals the count from two rounds ago
    # (i.e. no further progress is being made).
    if resolved_ik_number_list[-1] < resolved_ik_number_list[-2] or resolved_ik_number_list[-1] == resolved_ik_number_list[-3]:
        break
# Final cleanup pass into a separate file, then flatten the JSON results.
cleanse('all_json.json', 'all_json_cleaned.json')
with open("all_json_cleaned.json") as tweetfile:
    jsondic = json.loads(tweetfile.read())
flattened_classified_json = json_normalize(jsondic)
flattened_df = flattened_classified_json.drop_duplicates('inchikey')
# Strip the "InChIKey=" prefix so the keys match the bare keys in `inchis`.
flattened_df['inchikey'] = flattened_df['inchikey'].str.replace(r'InChIKey=', '')
# Attach the classification columns to our inchis table; the left join keeps
# unclassified rows with NaN classification fields.
df_merged = pd.merge(inchis, flattened_df, left_on='inchikey', right_on='inchikey', how='left')
# Copy the ClassyFire class hierarchy back onto the suspect list, matching
# classified rows to suspect-list rows by their SMILES string.
# BUG FIX: the original compared the strings with `is` (object identity),
# which only matches when both happen to be the same interned object; string
# equality must use `==`.
for p, rowp in df_merged.iterrows():
    for q, rowq in frame.iterrows():
        if df_merged["smiles_x"][p] == frame["SMILES"][q]:
            frame.loc[q, 'sub_class'] = df_merged["subclass.name"][p]
            frame.loc[q, 'class'] = df_merged["class.name"][p]
            frame.loc[q, 'super_class'] = df_merged["superclass.name"][p]
            frame.loc[q, 'Classification_Source'] = "ClassyFire"