forked from OscarHoekstra/ClassifyNPDB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
106 lines (81 loc) · 3.89 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
"""
Author: Oscar Hoekstra
Student Number: 961007346130
Email: [email protected]
Description: Configuration file for my pipeline.
Edit anything after the colon or between the single quotes to
run the pipeline with your own settings and files.
Edited by Joris Louwen ([email protected]) to run this pipeline
with the newest (2.0) version of MIBiG.
"""
import time
def Settings(path_base='/mnt/scratch/louwe015/NPLinker/classifying/'):
    """Build and return the configuration dictionary for the pipeline.

    Edit the values in the dictionary below (or pass your own
    ``path_base``) to run the pipeline with your own settings and files.

    Args:
        path_base: Root directory from which the results folder
            ('results_ClassifyNPDB/') and the repository folder
            ('ClassifyNPDB/') paths are built. A trailing '/' is added
            when it is missing; an empty string yields relative paths.
            Defaults to the location that used to be hard-coded here,
            so calling ``Settings()`` without arguments is unchanged.

    Returns:
        dict: Mapping of setting names to their values. The start time
        ("ScriptStartingTime") and its formatted form ("StartTimestamp")
        are captured at call time.
    """
    # Paths below are built by plain string concatenation, so the base
    # must end with a separator.
    if path_base and not path_base.endswith('/'):
        path_base += '/'
    Workbase = path_base + 'results_ClassifyNPDB/'
    repo_base = path_base + 'ClassifyNPDB/'
    # NOTE(review): ScriptFolder is not used in this function; kept in
    # case it is meant to be edited/used by users of this config.
    ScriptFolder = repo_base + 'Scripts/'
    InFilesFolder = repo_base + 'InFiles/'

    # Read the clock once and derive the formatted timestamp from that
    # same reading, so both values always describe the same moment.
    Start = time.time()
    StartTimestamp = time.strftime('%Y%m%d-%H%M', time.localtime(Start))

    cfg = {
        # General Settings
        # Add a number to this tuple to skip that step of the pipeline.
        # Step 1 can not be skipped as it is necessary for further steps
        # and it is also really short.
        "SkipSteps": (1, 2, 3, 7),
        # The amount of BGCs that should be missing before the script will
        # assume it is at the end and stop:
        "MaxMibigFails": 10,
        # Do the classification of the NPDB in batches? (Classifications
        # are saved in between, crashes are less severe but more batches
        # take longer)
        "DoBatched": True,
        # Batch size (standard 10-100)?
        "BatchSize": 50,
        # Re-do all the NPDB classifications, useful if something
        # might have updated.
        # Set this to True to always re-do the classification.
        "RedoClassify": True,

        # General Constants
        # Starting time of the script is set automatically and used to
        # save certain output with a timestamp.
        "ScriptStartingTime": Start,
        "StartTimestamp": StartTimestamp,

        # File Paths
        # Path to the SQL database that the script works with:
        "SQLPath": Workbase + 'NPDatabase_new.sqlite',
        # Path to the file with the inchi-keys for the NPDB created
        # by molconvert, with credits to Rutger Ozinga
        "InchiKeyFile": InFilesFolder + 'all_input_structures_neutralized_full_dataFile.txt',
        # Path to the TSV file with Mibig compound_id and (new) smile,
        # with credits to Michelle Schorn.
        "MibigSmilesFile": InFilesFolder + 'All_MIBiG_compounds_with_SMILES_and_PMID_MAS.txt',
        # Path to the file that contains a pickled (saved in bytes)
        # copy of the QueryIDDict, which contains the IDs that still
        # need to be classified by ClassyFire.
        "PQueryID": Workbase + 'PickledQueryIDDict.txt',
        # Path to file with the still to be classified NPDB IDs:
        "ToClassifyFile": Workbase + 'ToClassify.txt',

        # Settings about NPDB table in the SQL database.
        # Name of the table in the database that contains the NPDB
        # structure data:
        "NPDBtable": 'structure',  # standard:'structure'
        # Name of the column with the structure IDs:
        "structure_id": 'structure_id',  # standard:'structure_id'
        # Name of the column with inchi-keys to get classifications for:
        "InchiKeyToClassify": 'inchi_key_molconvert',  # standard:'inchi_key'
        # Name of the secondary inchi-key to try if first one failed.
        # Set this to False to skip this step.
        "BackupInchiKey": 'inchi_key_rdkit',

        # Settings about the (to be created) MIBiG table in the SQL
        # database.
        # Name of the table in the database that contains the
        # MIBiG data:
        "MibigTable": 'mibig',  # standard:'mibig'
        # Name of the column with the compound IDs:
        "MibigCompoundID": 'compound_id',  # standard:'compound_id'

        # Settings about files that will be outputted
        # Name of the file that will contain unclassified structures:
        "UnclassifiedFile": 'UnclassifiedStructures-' + StartTimestamp + ".txt",
    }
    return cfg