Release v1.0

cbg-ethz · Oct 14, 2024 · 89470f8 · 89470f8
1 parent 9a259ff
commit 89470f8
Show file tree

Hide file tree

Showing 18 changed files with 167 additions and 111 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -6,7 +6,7 @@ User:
   output_dir: /path/to/output_dir
   sample_map: /path/to/sample_map.tsv
   cancer_cell_type: HGSOC
-
+  
 # Change if you use a custom reference
 Reference:
   genome: /../ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir/ref_genome.fa
@@ -78,7 +78,7 @@ SNVCalling:
     Min_cell_types: 2
     min_distance: 0
     max_gnomAD_VAF: 0.01
-    deltaVAF: 0.1
+    deltaVAF: 0.05
     deltaMCF: 0.3
     min_ac_reads: 3
     min_ac_cells: 2
@@ -116,7 +116,7 @@ CellClust:
     FN: -1
     estimator: 'posterior'
     pp: [1,1]
-    dpa: [1,1]
+    dpa: [0.001, 5.0]
 
  ### CNA Calling 
 inferCNV:

diff --git a/profile/config.yaml b/profile/config.yaml
@@ -5,7 +5,7 @@ jobs: 500
 cores: 500
 
 default-resources:
-  mem_mb_per_cpu: max(2*input.size_mb, 4096)
+  mem_mb_per_cpu: max(2*input.size_mb, 8192)
   runtime: 240
   slurm_account: "'es_beere'"
   #tmpdir: /cluster/scratch/
diff --git a/run_LongSom.sh b/run_LongSom.sh
@@ -3,7 +3,7 @@ OUTPUT_DIR=/path/to/output_dir
 REF_DIR=path/to/LongSom/ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir
 
 snakemake \
-  -s workflow/LongSom.smk \
+  -s workflow/Snakefile \
   --configfile config/config.yaml \
   --use-conda \
   --use-singularity \

diff --git a/run_LongSom_slurm.sh b/run_LongSom_slurm.sh
@@ -10,7 +10,7 @@ sbatch \
   -e logs/snakelog.err \
 snakemake \
   -s workflow/Snakefile \
-  --configfile workflow/test.yaml \
+  --configfile config/config.yaml \
   --profile profile/ \
   --use-conda \
   --use-singularity \

diff --git a/workflow/rules/CellClustering.smk b/workflow/rules/CellClustering.smk
@@ -57,7 +57,7 @@ else:
             tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv",
             bam=f"{INPUT}/bam/{{id}}.bam",
             barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
-            fusions="FusionCalling/Somatic/{id}.Fusions.tsv" if CTATFUSION else [],
+            fusions="FusionCalling/Somatic/{id}.Fusions.SingleCellGenotype.tsv" if CTATFUSION else [],
             ref=str(workflow.basedir)+config['Reference']['genome'],
         output:
             tsv="CellClustering/SingleCellGenotype/{id}.SingleCellGenotype.tsv",
@@ -106,11 +106,11 @@ rule FormatInputBnpC:
     input:
         bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv",
         vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv",
-        ctypes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
+        barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
     output:
         bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv",
         vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv",
-        ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
+        barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
     params:
         script=str(workflow.basedir)+"/scripts/CellClustering/FormatInputBnpC.py",
         min_cells=config['CellClust']['FormatInput']['min_cells_per_mut'],
@@ -126,7 +126,7 @@ rule FormatInputBnpC:
         python {params.script} \
         --bin {input.bin} \
         --vaf {input.vaf} \
-        --ctypes {input.ctypes} \
+        --barcodes {input.barcodes} \
         --min_pos_cov {params.min_cov} \
         --min_cells_per_mut {params.min_cells} \
         --outfile CellClustering/BnpC_input//{wildcards.id} 
@@ -136,7 +136,7 @@ rule BnpC_clustering:
     input:
         bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv",
         vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv",
-        ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
+        barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
     output:
         pdf="CellClustering/BnpC_output/{id}/genoCluster_posterior_mean_raw.pdf"
     params:
@@ -172,5 +172,5 @@ rule BnpC_clustering:
         -FN {params.FN} \
         -pp {params.pp} \
         -ap {params.dpa} \
-        --ctypes {input.ctypes} 
+        --barcodes {input.barcodes} 
         """
diff --git a/workflow/scripts/CellClustering/FormatInputBnpC.py b/workflow/scripts/CellClustering/FormatInputBnpC.py
@@ -1,14 +1,12 @@
 import timeit
 import argparse
 import pandas as pd
-import matplotlib
-import matplotlib.pyplot as plt
 import numpy as np
 
-def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix):
+def filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix):
 	bin = pd.read_csv(bin,sep='\t',index_col=0,na_values=[3,'.'])
 	vaf = pd.read_csv(vaf,sep='\t',index_col=0,na_values=[3,'.'])
-	ctypes = pd.read_csv(ctypes,sep='\t')
+	barcodes = pd.read_csv(barcodes,sep='\t')
 	#Save fusions:
 	fusions = [i for i in bin.index if '--' in i]
 	fusions_save = bin.loc[fusions,bin.columns]
@@ -25,19 +23,22 @@ def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix):
 
 	# Filter all input files:
 	vaf = vaf.loc[idx,cols]
-	ctypes = ctypes[ctypes['Index'].isin(cols)]
+	barcodes = barcodes[barcodes['Index'].isin(cols)]
 	bin = pd.concat([bin,fusions_save[cols]])
+
+	# Add reanno ctype colors
+	barcodes['Cell_Reanno_Colors'] = barcodes['Reannotated_cell_type'].apply(lambda x: '#94C773' if x=='Non-Cancer' else '#8F79A1')
 
 	# Write
 	bin.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t')
 	vaf.to_csv(out_prefix + '.VAFMatrix.tsv', sep='\t')
-	ctypes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False)
+	barcodes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False)
 
 def initialize_parser():
 	parser = argparse.ArgumentParser(description='Script to filter BnpC input matrix')
 	parser.add_argument('--bin', type=str, default=1, help='SComatic binary matrix (obtained by SingleCellGenotype.py)', required = True)
 	parser.add_argument('--vaf', type=str, default=1, help='SComatic VAF matrix (obtained by SingleCellGenotype.py)', required = True)
-	parser.add_argument('--ctypes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True)
+	parser.add_argument('--barcodes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True)
 	parser.add_argument('--min_cells_per_mut', type=int, default=5, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False)
 	parser.add_argument('--min_pos_cov', type=int, default=3, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False)
 	parser.add_argument('--outfile', default = 'Matrix.tsv', help='Out file', required = False)
@@ -51,7 +52,7 @@ def main():
 
 	bin = args.bin
 	vaf = args.vaf
-	ctypes = args.ctypes
+	barcodes = args.barcodes
 	min_cells_per_mut = args.min_cells_per_mut
 	min_pos_cov = args.min_pos_cov
 	out_prefix = args.outfile
@@ -60,7 +61,7 @@ def main():
 	print("Outfile prefix: " , out_prefix ,  "\n") 
 
 	# 1. Filter the BnpC input matrices and barcodes, then write them out
-	filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix)
+	filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix)
 
 if __name__ == '__main__':
 	start = timeit.default_timer()

diff --git a/workflow/scripts/CellClustering/libs/__pycache__/CRP.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/CRP.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/CRP_learning_errors.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/CRP_learning_errors.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/MCMC.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/MCMC.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/__init__.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/__init__.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/dpmmIO.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/dpmmIO.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/plotting.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/plotting.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/__pycache__/utils.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/utils.cpython-312.pyc
diff --git a/workflow/scripts/CellClustering/libs/dpmmIO.py b/workflow/scripts/CellClustering/libs/dpmmIO.py
@@ -275,7 +275,7 @@ def save_similarity(args, inferred, results, out_dir):
 
 
 def save_geno_plots(args, data, data_raw, out_dir, names):
-    ctypes = pd.read_csv(args.ctypes, sep='\t')
+    barcodes = pd.read_csv(args.barcodes, sep='\t')
     row_cl = False
     if args.mut_order:
         row_cl = list(pd.read_csv(args.mut_order, sep='\t')['INDEX'])
@@ -289,11 +289,11 @@ def save_geno_plots(args, data, data_raw, out_dir, names):
             df_obs = pd.DataFrame(data_raw, index=names[0], columns=names[1]).T
             pl.plot_raw_data(
                 data_est['genotypes'], df_obs, assignment=data_est['assignment'],
-                ctypes=ctypes, out_file=out_file, row_cl=row_cl
+                barcodes=barcodes, out_file=out_file, row_cl=row_cl
             )
             pl.plot_raw_data(
                 df_obs, df_obs, assignment=data_est['assignment'],
-                ctypes=ctypes, out_file=out_file_raw, row_cl=row_cl
+                barcodes=barcodes, out_file=out_file_raw, row_cl=row_cl
             )
 
 

diff --git a/workflow/scripts/CellClustering/libs/plotting.py b/workflow/scripts/CellClustering/libs/plotting.py
@@ -64,7 +64,7 @@ def _get_col_order(assignment):
 
 def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
             assignment=np.array([]), metric='correlation', row_cl=True,
-            ctypes=pd.Series()):
+            barcodes=pd.Series()):
 
     data = data_in.copy()
     data_raw = data_raw_in.copy()
@@ -93,7 +93,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
 
         cluster_cols = pd.Series(col_dict, name='clusters', index=col_order)
 
-        ctypes = ctypes.reindex([ctypes.index[i] for i in col_order])
+        barcodes = barcodes.reindex([barcodes.index[i] for i in col_order])
 
         data.columns = np.arange(data_in.shape[1])
         data = data[col_order]
@@ -137,7 +137,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
     cmap.set_over('green')
     cmap.set_bad('grey')
 
-    col_colors=[ctypes['Cancer_Color'],cluster_cols]
+    col_colors=[barcodes['Cell_Reanno_Colors'],cluster_cols]
 
     cm = sns.clustermap(
         data,

diff --git a/workflow/scripts/CellClustering/run_BnpC.py b/workflow/scripts/CellClustering/run_BnpC.py
@@ -55,7 +55,7 @@ def check_PSRF_cutoff(val):
         help='Run single chain in main python thread for debugging with pdb.'
     )
     parser.add_argument(
-        '--ctypes', type=str,
+        '--barcodes', type=str,
         help='Absolute or relative path(s) to input barcode-to-ctype file'
     )
     parser.add_argument(

diff --git a/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py b/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py
@@ -46,9 +46,9 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist):
 	input_df['DP_FILTER'] = input_df.apply(lambda x: 
 		DP_filtering(x['Cancer'],x['Non-Cancer'],min_dp), axis=1)
 	input_df = input_df[input_df['DP_FILTER']=='PASS']
+	input_df.to_csv(outfile+'2', sep='\t', index=False,  mode='a')
 
 	#Special case for chrM due to contaminants:
-
 	# Save chrM candidate SNVs to apply specific filters
 	chrm_df = input_df[input_df['#CHROM']=='chrM'].copy()
 	input_df = input_df[input_df['#CHROM']!='chrM']
@@ -77,6 +77,7 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist):
 	# Filter 7: Delta VAF and MCF filtering
 	input_df['HCCV_FILTER'] = input_df.apply(lambda x: 
 		MCF_filtering(x['Cell_types'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1)
+	input_df.to_csv(outfile+'3', sep='\t', index=False,  mode='a')
 	input_df = input_df[input_df['HCCV_FILTER']=='PASS']
 
 	# Filter 8: Distance filter
@@ -229,16 +230,19 @@ def MCF_filtering(CTYPES,VAF,MCF,deltaVAFmin,deltaMCFmin):
 			VAFNonCancer = float(VAFs[0])
 			MCFCancer = float(MCFs[1])
 			MCFNonCancer = float(MCFs[0])
-
-		if VAFNonCancer>0.12:
-			return 'Heterozygous'
 
 		if VAFCancer<0.05: 
 			return 'NonSig'
 
-		# deltaVAF = VAFCancer-VAFNonCancer
+		deltaVAF = VAFCancer-VAFNonCancer
 		deltaMCF = MCFCancer-MCFNonCancer
 
+		if VAFNonCancer>0.1 and deltaVAF<2*deltaVAFmin:
+			return 'Heterozygous'
+
+		if VAFNonCancer>0.2:
+			return 'Heterozygous'
+
 		# HCCV are variants with high VAF/MCF in cancer and low VAF/MCF in non-cancer cells
 		# if deltaVAF < deltaVAFmin:
 		#	return 'LowDeltaVAF'