Skip to content

Commit

Permalink
Release v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurDondi committed Oct 14, 2024
1 parent 9a259ff commit 89470f8
Show file tree
Hide file tree
Showing 18 changed files with 167 additions and 111 deletions.
6 changes: 3 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ User:
output_dir: /path/to/output_dir
sample_map: /path/to/sample_map.tsv
cancer_cell_type: HGSOC

# Change if you use a custom reference
Reference:
genome: /../ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir/ref_genome.fa
Expand Down Expand Up @@ -78,7 +78,7 @@ SNVCalling:
Min_cell_types: 2
min_distance: 0
max_gnomAD_VAF: 0.01
deltaVAF: 0.1
deltaVAF: 0.05
deltaMCF: 0.3
min_ac_reads: 3
min_ac_cells: 2
Expand Down Expand Up @@ -116,7 +116,7 @@ CellClust:
FN: -1
estimator: 'posterior'
pp: [1,1]
dpa: [1,1]
dpa: [0.001, 5.0]

### CNA Calling
inferCNV:
Expand Down
2 changes: 1 addition & 1 deletion profile/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ jobs: 500
cores: 500

default-resources:
mem_mb_per_cpu: max(2*input.size_mb, 4096)
mem_mb_per_cpu: max(2*input.size_mb, 8192)
runtime: 240
slurm_account: "'es_beere'"
#tmpdir: /cluster/scratch/
2 changes: 1 addition & 1 deletion run_LongSom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ OUTPUT_DIR=/path/to/output_dir
REF_DIR=path/to/LongSom/ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir

snakemake \
-s workflow/LongSom.smk \
-s workflow/Snakefile \
--configfile config/config.yaml \
--use-conda \
--use-singularity \
Expand Down
2 changes: 1 addition & 1 deletion run_LongSom_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ sbatch \
-e logs/snakelog.err \
snakemake \
-s workflow/Snakefile \
--configfile workflow/test.yaml \
--configfile config/config.yaml \
--profile profile/ \
--use-conda \
--use-singularity \
Expand Down
12 changes: 6 additions & 6 deletions workflow/rules/CellClustering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ else:
tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv",
bam=f"{INPUT}/bam/{{id}}.bam",
barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
fusions="FusionCalling/Somatic/{id}.Fusions.tsv" if CTATFUSION else [],
fusions="FusionCalling/Somatic/{id}.Fusions.SingleCellGenotype.tsv" if CTATFUSION else [],
ref=str(workflow.basedir)+config['Reference']['genome'],
output:
tsv="CellClustering/SingleCellGenotype/{id}.SingleCellGenotype.tsv",
Expand Down Expand Up @@ -106,11 +106,11 @@ rule FormatInputBnpC:
input:
bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv",
vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv",
ctypes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv",
output:
bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv",
vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv",
ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
params:
script=str(workflow.basedir)+"/scripts/CellClustering/FormatInputBnpC.py",
min_cells=config['CellClust']['FormatInput']['min_cells_per_mut'],
Expand All @@ -126,7 +126,7 @@ rule FormatInputBnpC:
python {params.script} \
--bin {input.bin} \
--vaf {input.vaf} \
--ctypes {input.ctypes} \
--barcodes {input.barcodes} \
--min_pos_cov {params.min_cov} \
--min_cells_per_mut {params.min_cells} \
--outfile CellClustering/BnpC_input//{wildcards.id}
Expand All @@ -136,7 +136,7 @@ rule BnpC_clustering:
input:
bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv",
vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv",
ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv",
output:
pdf="CellClustering/BnpC_output/{id}/genoCluster_posterior_mean_raw.pdf"
params:
Expand Down Expand Up @@ -172,5 +172,5 @@ rule BnpC_clustering:
-FN {params.FN} \
-pp {params.pp} \
-ap {params.dpa} \
--ctypes {input.ctypes}
--barcodes {input.barcodes}
"""
19 changes: 10 additions & 9 deletions workflow/scripts/CellClustering/FormatInputBnpC.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import timeit
import argparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix):
def filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix):
bin = pd.read_csv(bin,sep='\t',index_col=0,na_values=[3,'.'])
vaf = pd.read_csv(vaf,sep='\t',index_col=0,na_values=[3,'.'])
ctypes = pd.read_csv(ctypes,sep='\t')
barcodes = pd.read_csv(barcodes,sep='\t')
#Save fusions:
fusions = [i for i in bin.index if '--' in i]
fusions_save = bin.loc[fusions,bin.columns]
Expand All @@ -25,19 +23,22 @@ def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix):

# Filter all input files:
vaf = vaf.loc[idx,cols]
ctypes = ctypes[ctypes['Index'].isin(cols)]
barcodes = barcodes[barcodes['Index'].isin(cols)]
bin = pd.concat([bin,fusions_save[cols]])

# Add a plotting color per reannotated cell type (Non-Cancer vs. any other label)
barcodes['Cell_Reanno_Colors'] = barcodes['Reannotated_cell_type'].apply(lambda x: '#94C773' if x=='Non-Cancer' else '#8F79A1')

# Write
bin.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t')
vaf.to_csv(out_prefix + '.VAFMatrix.tsv', sep='\t')
ctypes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False)
barcodes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False)

def initialize_parser():
parser = argparse.ArgumentParser(description='Script to filter BnpC input matrix')
parser.add_argument('--bin', type=str, default=1, help='SComatic binary matrix (obtained by SingleCellGenotype.py)', required = True)
parser.add_argument('--vaf', type=str, default=1, help='SComatic VAF matrix (obtained by SingleCellGenotype.py)', required = True)
parser.add_argument('--ctypes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True)
parser.add_argument('--barcodes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True)
parser.add_argument('--min_cells_per_mut', type=int, default=5, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False)
parser.add_argument('--min_pos_cov', type=int, default=3, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False)
parser.add_argument('--outfile', default = 'Matrix.tsv', help='Out file', required = False)
Expand All @@ -51,7 +52,7 @@ def main():

bin = args.bin
vaf = args.vaf
ctypes = args.ctypes
barcodes = args.barcodes
min_cells_per_mut = args.min_cells_per_mut
min_pos_cov = args.min_pos_cov
out_prefix = args.outfile
Expand All @@ -60,7 +61,7 @@ def main():
print("Outfile prefix: " , out_prefix , "\n")

# 1. Filter the BnpC input matrices and the barcode table
filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix)
filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix)

if __name__ == '__main__':
start = timeit.default_timer()
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 3 additions & 3 deletions workflow/scripts/CellClustering/libs/dpmmIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def save_similarity(args, inferred, results, out_dir):


def save_geno_plots(args, data, data_raw, out_dir, names):
ctypes = pd.read_csv(args.ctypes, sep='\t')
barcodes = pd.read_csv(args.barcodes, sep='\t')
row_cl = False
if args.mut_order:
row_cl = list(pd.read_csv(args.mut_order, sep='\t')['INDEX'])
Expand All @@ -289,11 +289,11 @@ def save_geno_plots(args, data, data_raw, out_dir, names):
df_obs = pd.DataFrame(data_raw, index=names[0], columns=names[1]).T
pl.plot_raw_data(
data_est['genotypes'], df_obs, assignment=data_est['assignment'],
ctypes=ctypes, out_file=out_file, row_cl=row_cl
barcodes=barcodes, out_file=out_file, row_cl=row_cl
)
pl.plot_raw_data(
df_obs, df_obs, assignment=data_est['assignment'],
ctypes=ctypes, out_file=out_file_raw, row_cl=row_cl
barcodes=barcodes, out_file=out_file_raw, row_cl=row_cl
)


Expand Down
6 changes: 3 additions & 3 deletions workflow/scripts/CellClustering/libs/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _get_col_order(assignment):

def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
assignment=np.array([]), metric='correlation', row_cl=True,
ctypes=pd.Series()):
barcodes=pd.Series()):

data = data_in.copy()
data_raw = data_raw_in.copy()
Expand Down Expand Up @@ -93,7 +93,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,

cluster_cols = pd.Series(col_dict, name='clusters', index=col_order)

ctypes = ctypes.reindex([ctypes.index[i] for i in col_order])
barcodes = barcodes.reindex([barcodes.index[i] for i in col_order])

data.columns = np.arange(data_in.shape[1])
data = data[col_order]
Expand Down Expand Up @@ -137,7 +137,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
cmap.set_over('green')
cmap.set_bad('grey')

col_colors=[ctypes['Cancer_Color'],cluster_cols]
col_colors=[barcodes['Cell_Reanno_Colors'],cluster_cols]

cm = sns.clustermap(
data,
Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/CellClustering/run_BnpC.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def check_PSRF_cutoff(val):
help='Run single chain in main python thread for debugging with pdb.'
)
parser.add_argument(
'--ctypes', type=str,
'--barcodes', type=str,
help='Absolute or relative path(s) to input barcode-to-ctype file'
)
parser.add_argument(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist):
input_df['DP_FILTER'] = input_df.apply(lambda x:
DP_filtering(x['Cancer'],x['Non-Cancer'],min_dp), axis=1)
input_df = input_df[input_df['DP_FILTER']=='PASS']
input_df.to_csv(outfile+'2', sep='\t', index=False, mode='a')

#Special case for chrM due to contaminants:

# Save chrM candidate SNVs to apply specific filters
chrm_df = input_df[input_df['#CHROM']=='chrM'].copy()
input_df = input_df[input_df['#CHROM']!='chrM']
Expand Down Expand Up @@ -77,6 +77,7 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist):
# Filter 7: Delta VAF and MCF filtering
input_df['HCCV_FILTER'] = input_df.apply(lambda x:
MCF_filtering(x['Cell_types'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1)
input_df.to_csv(outfile+'3', sep='\t', index=False, mode='a')
input_df = input_df[input_df['HCCV_FILTER']=='PASS']

# Filter 8: Distance filter
Expand Down Expand Up @@ -229,16 +230,19 @@ def MCF_filtering(CTYPES,VAF,MCF,deltaVAFmin,deltaMCFmin):
VAFNonCancer = float(VAFs[0])
MCFCancer = float(MCFs[1])
MCFNonCancer = float(MCFs[0])

if VAFNonCancer>0.12:
return 'Heterozygous'

if VAFCancer<0.05:
return 'NonSig'

# deltaVAF = VAFCancer-VAFNonCancer
deltaVAF = VAFCancer-VAFNonCancer
deltaMCF = MCFCancer-MCFNonCancer

if VAFNonCancer>0.1 and deltaVAF<2*deltaVAFmin:
return 'Heterozygous'

if VAFNonCancer>0.2:
return 'Heterozygous'

# HCCV are variants with high VAF/MCF in cancer and low VAF/MCF in non-cancer cells
# if deltaVAF < deltaVAFmin:
# return 'LowDeltaVAF'
Expand Down
Loading

0 comments on commit 89470f8

Please sign in to comment.