diff --git a/config/config.yaml b/config/config.yaml index c4a9a01..17c4d44 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -6,7 +6,7 @@ User: output_dir: /path/to/output_dir sample_map: /path/to/sample_map.tsv cancer_cell_type: HGSOC - + # Change if you use a custom reference Reference: genome: /../ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir/ref_genome.fa @@ -78,7 +78,7 @@ SNVCalling: Min_cell_types: 2 min_distance: 0 max_gnomAD_VAF: 0.01 - deltaVAF: 0.1 + deltaVAF: 0.05 deltaMCF: 0.3 min_ac_reads: 3 min_ac_cells: 2 @@ -116,7 +116,7 @@ CellClust: FN: -1 estimator: 'posterior' pp: [1,1] - dpa: [1,1] + dpa: [0.001, 5.0] ### CNA Calling inferCNV: diff --git a/profile/config.yaml b/profile/config.yaml index 1924afa..b0be342 100644 --- a/profile/config.yaml +++ b/profile/config.yaml @@ -5,7 +5,7 @@ jobs: 500 cores: 500 default-resources: - mem_mb_per_cpu: max(2*input.size_mb, 4096) + mem_mb_per_cpu: max(2*input.size_mb, 8192) runtime: 240 slurm_account: "'es_beere'" #tmpdir: /cluster/scratch/ diff --git a/run_LongSom.sh b/run_LongSom.sh index 23da61a..a7ad97c 100755 --- a/run_LongSom.sh +++ b/run_LongSom.sh @@ -3,7 +3,7 @@ OUTPUT_DIR=/path/to/output_dir REF_DIR=path/to/LongSom/ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir snakemake \ - -s workflow/LongSom.smk \ + -s workflow/Snakefile \ --configfile config/config.yaml \ --use-conda \ --use-singularity \ diff --git a/run_LongSom_slurm.sh b/run_LongSom_slurm.sh index 0a49dfd..f5e0e61 100755 --- a/run_LongSom_slurm.sh +++ b/run_LongSom_slurm.sh @@ -10,7 +10,7 @@ sbatch \ -e logs/snakelog.err \ snakemake \ -s workflow/Snakefile \ - --configfile workflow/test.yaml \ + --configfile config/config.yaml \ --profile profile/ \ --use-conda \ --use-singularity \ diff --git a/workflow/rules/CellClustering.smk b/workflow/rules/CellClustering.smk index 8878404..97a33cd 100644 --- a/workflow/rules/CellClustering.smk +++ b/workflow/rules/CellClustering.smk @@ -57,7 +57,7 @@ else: tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv", bam=f"{INPUT}/bam/{{id}}.bam", barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", - fusions="FusionCalling/Somatic/{id}.Fusions.tsv" if CTATFUSION else [], + fusions="FusionCalling/Somatic/{id}.Fusions.SingleCellGenotype.tsv" if CTATFUSION else [], ref=str(workflow.basedir)+config['Reference']['genome'], output: tsv="CellClustering/SingleCellGenotype/{id}.SingleCellGenotype.tsv", @@ -106,11 +106,11 @@ rule FormatInputBnpC: input: bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv", vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv", - ctypes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", output: bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv", vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv", - ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv", + barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv", params: script=str(workflow.basedir)+"/scripts/CellClustering/FormatInputBnpC.py", min_cells=config['CellClust']['FormatInput']['min_cells_per_mut'], @@ -126,7 +126,7 @@ rule FormatInputBnpC: python {params.script} \ --bin {input.bin} \ --vaf {input.vaf} \ - --ctypes {input.ctypes} \ + --barcodes {input.barcodes} \ --min_pos_cov {params.min_cov} \ --min_cells_per_mut {params.min_cells} \ --outfile CellClustering/BnpC_input//{wildcards.id} @@ -136,7 +136,7 @@ rule BnpC_clustering: input: bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv", vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv", - ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv", + barcodes="CellClustering/BnpC_input/{id}.Barcodes.tsv", output: pdf="CellClustering/BnpC_output/{id}/genoCluster_posterior_mean_raw.pdf" params: @@ -172,5 +172,5 @@ rule BnpC_clustering: -FN {params.FN} \ -pp {params.pp} \ -ap {params.dpa} \ - --ctypes {input.ctypes} + --barcodes {input.barcodes} """ diff --git a/workflow/scripts/CellClustering/FormatInputBnpC.py b/workflow/scripts/CellClustering/FormatInputBnpC.py index 8571228..9901e65 100644 --- a/workflow/scripts/CellClustering/FormatInputBnpC.py +++ b/workflow/scripts/CellClustering/FormatInputBnpC.py @@ -1,14 +1,12 @@ import timeit import argparse import pandas as pd -import matplotlib -import matplotlib.pyplot as plt import numpy as np -def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix): +def filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix): bin = pd.read_csv(bin,sep='\t',index_col=0,na_values=[3,'.']) vaf = pd.read_csv(vaf,sep='\t',index_col=0,na_values=[3,'.']) - ctypes = pd.read_csv(ctypes,sep='\t') + barcodes = pd.read_csv(barcodes,sep='\t') #Save fusions: fusions = [i for i in bin.index if '--' in i] fusions_save = bin.loc[fusions,bin.columns] @@ -25,19 +23,22 @@ def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix): # Filter all input files: vaf = vaf.loc[idx,cols] - ctypes = ctypes[ctypes['Index'].isin(cols)] + barcodes = barcodes[barcodes['Index'].isin(cols)] bin = pd.concat([bin,fusions_save[cols]]) + + # Add reanno ctype colors + barcodes['Cell_Reanno_Colors'] = barcodes['Reannotated_cell_type'].apply(lambda x: '#94C773' if x=='Non-Cancer' else '#8F79A1') # Write bin.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t') vaf.to_csv(out_prefix + '.VAFMatrix.tsv', sep='\t') - ctypes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False) + barcodes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False) def initialize_parser(): parser = argparse.ArgumentParser(description='Script to filter BnpC input matrix') parser.add_argument('--bin', type=str, default=1, help='SComatic binary matrix (obtained by SingleCellGenotype.py)', required = True) parser.add_argument('--vaf', type=str, default=1, help='SComatic VAF matrix (obtained by SingleCellGenotype.py)', required = True) - parser.add_argument('--ctypes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True) + parser.add_argument('--barcodes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True) parser.add_argument('--min_cells_per_mut', type=int, default=5, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False) parser.add_argument('--min_pos_cov', type=int, default=3, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False) parser.add_argument('--outfile', default = 'Matrix.tsv', help='Out file', required = False) @@ -51,7 +52,7 @@ def main(): bin = args.bin vaf = args.vaf - ctypes = args.ctypes + barcodes = args.barcodes min_cells_per_mut = args.min_cells_per_mut min_pos_cov = args.min_pos_cov out_prefix = args.outfile @@ -60,7 +61,7 @@ def main(): print("Outfile prefix: " , out_prefix , "\n") # 1. Create clinical annotation file - filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix) + filter_input(bin,vaf,barcodes,min_cells_per_mut,min_pos_cov,out_prefix) if __name__ == '__main__': start = timeit.default_timer() diff --git a/workflow/scripts/CellClustering/libs/__pycache__/CRP.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/CRP.cpython-312.pyc new file mode 100644 index 0000000..b8fb63d Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/CRP.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/CRP_learning_errors.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/CRP_learning_errors.cpython-312.pyc new file mode 100644 index 0000000..77221b2 Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/CRP_learning_errors.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/MCMC.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/MCMC.cpython-312.pyc new file mode 100644 index 0000000..3f4e6ee Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/MCMC.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/__init__.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..8e0b037 Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/__init__.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/dpmmIO.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/dpmmIO.cpython-312.pyc new file mode 100644 index 0000000..3a1ba53 Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/dpmmIO.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/plotting.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/plotting.cpython-312.pyc new file mode 100644 index 0000000..7b7671c Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/plotting.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/__pycache__/utils.cpython-312.pyc b/workflow/scripts/CellClustering/libs/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000..2623f9c Binary files /dev/null and b/workflow/scripts/CellClustering/libs/__pycache__/utils.cpython-312.pyc differ diff --git a/workflow/scripts/CellClustering/libs/dpmmIO.py b/workflow/scripts/CellClustering/libs/dpmmIO.py index 5a39c03..a594934 100644 --- a/workflow/scripts/CellClustering/libs/dpmmIO.py +++ b/workflow/scripts/CellClustering/libs/dpmmIO.py @@ -275,7 +275,7 @@ def save_similarity(args, inferred, results, out_dir): def save_geno_plots(args, data, data_raw, out_dir, names): - ctypes = pd.read_csv(args.ctypes, sep='\t') + barcodes = pd.read_csv(args.barcodes, sep='\t') row_cl = False if args.mut_order: row_cl = list(pd.read_csv(args.mut_order, sep='\t')['INDEX']) @@ -289,11 +289,11 @@ def save_geno_plots(args, data, data_raw, out_dir, names): df_obs = pd.DataFrame(data_raw, index=names[0], columns=names[1]).T pl.plot_raw_data( data_est['genotypes'], df_obs, assignment=data_est['assignment'], - ctypes=ctypes, out_file=out_file, row_cl=row_cl + barcodes=barcodes, out_file=out_file, row_cl=row_cl ) pl.plot_raw_data( df_obs, df_obs, assignment=data_est['assignment'], - ctypes=ctypes, out_file=out_file_raw, row_cl=row_cl + barcodes=barcodes, out_file=out_file_raw, row_cl=row_cl ) diff --git a/workflow/scripts/CellClustering/libs/plotting.py b/workflow/scripts/CellClustering/libs/plotting.py index fb2a958..888e72b 100644 --- a/workflow/scripts/CellClustering/libs/plotting.py +++ b/workflow/scripts/CellClustering/libs/plotting.py @@ -64,7 +64,7 @@ def _get_col_order(assignment): def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, assignment=np.array([]), metric='correlation', row_cl=True, - ctypes=pd.Series()): + barcodes=pd.Series()): data = data_in.copy() data_raw = data_raw_in.copy() @@ -93,7 +93,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, cluster_cols = pd.Series(col_dict, name='clusters', index=col_order) - ctypes = ctypes.reindex([ctypes.index[i] for i in col_order]) + barcodes = barcodes.reindex([barcodes.index[i] for i in col_order]) data.columns = np.arange(data_in.shape[1]) data = data[col_order] @@ -137,7 +137,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, cmap.set_over('green') cmap.set_bad('grey') - col_colors=[ctypes['Cancer_Color'],cluster_cols] + col_colors=[barcodes['Cell_Reanno_Colors'],cluster_cols] cm = sns.clustermap( data, diff --git a/workflow/scripts/CellClustering/run_BnpC.py b/workflow/scripts/CellClustering/run_BnpC.py index be7df58..35bca1d 100644 --- a/workflow/scripts/CellClustering/run_BnpC.py +++ b/workflow/scripts/CellClustering/run_BnpC.py @@ -55,7 +55,7 @@ def check_PSRF_cutoff(val): help='Run single chain in main python thread for debugging with pdb.' ) parser.add_argument( - '--ctypes', type=str, + '--barcodes', type=str, help='Absolute or relative path(s) to input barcode-to-ctype file' ) parser.add_argument( diff --git a/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py b/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py index 4233580..95c283b 100644 --- a/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py +++ b/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py @@ -46,9 +46,9 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist): input_df['DP_FILTER'] = input_df.apply(lambda x: DP_filtering(x['Cancer'],x['Non-Cancer'],min_dp), axis=1) input_df = input_df[input_df['DP_FILTER']=='PASS'] + input_df.to_csv(outfile+'2', sep='\t', index=False, mode='a') #Special case for chrM due to contaminants: - # Save chrM candidate SNVs to apply specific filters chrm_df = input_df[input_df['#CHROM']=='chrM'].copy() input_df = input_df[input_df['#CHROM']!='chrM'] @@ -77,6 +77,7 @@ def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist): # Filter 7: PoN filterDelta VAF and MCF filtering input_df['HCCV_FILTER'] = input_df.apply(lambda x: MCF_filtering(x['Cell_types'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1) + input_df.to_csv(outfile+'3', sep='\t', index=False, mode='a') input_df = input_df[input_df['HCCV_FILTER']=='PASS'] # Filter 8: Distance filter @@ -229,16 +230,19 @@ def MCF_filtering(CTYPES,VAF,MCF,deltaVAFmin,deltaMCFmin): VAFNonCancer = float(VAFs[0]) MCFCancer = float(MCFs[1]) MCFNonCancer = float(MCFs[0]) - - if VAFNonCancer>0.12: - return 'Heterozygous' if VAFCancer<0.05: return 'NonSig' - # deltaVAF = VAFCancer-VAFNonCancer + deltaVAF = VAFCancer-VAFNonCancer deltaMCF = MCFCancer-MCFNonCancer + if VAFNonCancer>0.1 and deltaVAF<2*deltaVAFmin: + return 'Heterozygous' + + if VAFNonCancer>0.2: + return 'Heterozygous' + # HCCV are variants with high VAF/MCF in cancer and low VAF/MCF in non-cancer cells # if deltaVAF < deltaVAFmin: # return 'LowDeltaVAF' diff --git a/workflow/scripts/SNVCalling/BaseCellCalling.step3.py b/workflow/scripts/SNVCalling/BaseCellCalling.step3.py index 6363220..505014a 100644 --- a/workflow/scripts/SNVCalling/BaseCellCalling.step3.py +++ b/workflow/scripts/SNVCalling/BaseCellCalling.step3.py @@ -30,23 +30,21 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaMCF,chrM_conta,min_ac_re f3.write('##INFO=FINAL_FILTER,Description=Final ilter status, including chrM contaminants and clustered sites\n') f2.close() f3.close() - input_df = pd.read_csv(file, sep='\t',comment='#',names=output_column_names) - input_df['INDEX'] = input_df['#CHROM'].astype(str) + ':' + input_df['Start'].astype(str) + ':' + input_df['ALT'].str.split(',', n=1, expand=True)[0] - - # Only keeping sites with alternative reads in cancer + + # Only keeping sites called in cancer input_df = input_df[input_df['Cell_types']!='Non-Cancer'] # Filtering multiallelic sites, keeping only one allele if it's 50x larger than the other # Otherwise deleting the line (it messes with filters later on) # This is important as SComatic will often detect low expr secondary alt allele in very high expr. (e.g. in chrM) - input_df[['ALT', 'FILTER', 'Cell_types', 'Bc', 'Cc', 'VAF', 'MCF','MultiAllelic_filter']] = input_df.apply(lambda x: MultiAllelic_filtering(x['REF'], x['ALT'], x['FILTER'], x['Cell_types'],x['Dp'], x['Nc'], x['Bc'], x['Cc'], x['VAF'], x['MCF'], x['Cancer'], x['Non-Cancer']), axis=1, result_type="expand") - input_df = input_df[input_df['MultiAllelic_filter']=='KEEP'] - input_df = input_df[output_column_names + ['INDEX']] + input_df[['ALT', 'FILTER', 'Cell_types', 'Bc', 'Cc', 'VAF', 'MCF','STEP3FILTER']] = input_df.apply(lambda x: MultiAllelic_filtering(x['REF'], x['ALT'], x['FILTER'], x['Cell_types'],x['Dp'], x['Nc'], x['Bc'], x['Cc'], x['VAF'], x['MCF'], x['Cancer'], x['Non-Cancer']), axis=1, result_type="expand") + #input_df = input_df[input_df['MultiAllelic_filter']=='KEEP'] - #Special case for chrM due to contaminants: + input_df['INDEX'] = input_df['#CHROM'].astype(str) + ':' + input_df['Start'].astype(str) + ':' + input_df['ALT'].str.split(',', n=1, expand=True)[0] + #Special case for chrM due to contaminants: # Save chrM candidate SNVs to apply specific filters chrm_df = input_df[input_df['#CHROM']=='chrM'].copy() input_df = input_df[input_df['#CHROM']!='chrM'] @@ -55,27 +53,17 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaMCF,chrM_conta,min_ac_re chrm_df = chrm_df[~chrm_df['FILTER'].str.contains('Min|LR|gnomAD|LC|RNA', regex=True)] if len(chrm_df)>0: # Applying deltaVAF and deltaMCF filters - chrm_df['FILTER'] = chrm_df.apply(lambda x: - chrM_filtering(x['Cell_types'],x['Dp'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1) - + chrm_df['STEP3FILTER'] = chrm_df.apply(lambda x: + chrM_filtering(x['STEP3FILTER'],x['Cell_types'],x['Dp'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1) # Filter 1: Non-cancer coverage filter input_df = input_df[~input_df['FILTER'].str.contains('Min_cell_types')] - - # Filter 2: Alt reads and cells in cancer filter - input_df['Bc'] = [i.split(',')[1] if ',' in i else i for i in input_df['Bc']] #only keeping cancer info - input_df['Cc'] = [i.split(',')[1] if ',' in i else i for i in input_df['Cc']] #only keeping cancer info - input_df = input_df.astype({'Bc':'int','Cc':'int'}) - input_df = input_df[input_df['Bc']>=min_ac_reads] - input_df = input_df[input_df['Cc']>=min_ac_cells] - - # Filter 3: Betabinomial filter in cancer cells - input_df = input_df[~input_df['Cell_type_Filter'].str.contains(',Non-Significant|,Low-Significant', regex = True)] - input_df = input_df[~input_df['Cell_type_Filter'].isin(['Non-Significant','Low-Significant'])] - - #Adding chrM SNVs detected above: - unfiltered_df = pd.concat([input_df,chrm_df]) - unfiltered_df.to_csv(out_prefix+ '.calling.step3.unfiltered.tsv', sep='\t', index=False, mode='a') + + #Filter 2: Min. alt reads and cells in cancer + input_df['STEP3FILTER'] = input_df.apply(lambda x: BC_CC_filtering(x['STEP3FILTER'],x['ALT'],x['Cancer'],min_ac_reads,min_ac_cells), axis=1) + + # Filter 3 and 8: Betabinomial filtering in cancer cells (Filter 3) and non-cancer cells (Filter 8) + input_df['STEP3FILTER'] = input_df.apply(lambda x: BetaBino_filtering(x['STEP3FILTER'],x['Cell_types'],x['Cell_type_Filter'],x['Start']), axis=1) # Filter 4: Noise filter: input_df = input_df[~input_df['FILTER'].str.contains('Noisy_site')] #multi allelic sites are filtered above @@ -90,26 +78,27 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaMCF,chrM_conta,min_ac_re input_df = input_df[~input_df['FILTER'].str.contains('PoN', regex = True)] # Filter 8: Betabinomial filter in non-cancer cells - input_df = input_df[~input_df['Cell_type_Filter'].str.contains('Low-Significant,|PASS,', regex = True)] - input_df = input_df[~input_df['FILTER'].str.contains('Cell_type_noise|Multiple_cell_types|Noisy_site', regex = True)] + input_df = input_df[~input_df['FILTER'].str.contains('Cell_type_noise', regex = True)] # Filter 9: gnomAD filter input_df = input_df[~input_df['FILTER'].str.contains('gnomAD', regex = True)] - # Filter 10: Distance filter - input_df['FILTER'] = tag_clustered_SNVs(input_df, clust_dist) - input_df = input_df[~input_df['FILTER'].str.contains('dist', regex = True)] - #Adding chrM SNVs detected above: input_df = pd.concat([input_df,chrm_df]) + # Filter 10: Distance filter + input_df['STEP3FILTER'] = tag_clustered_SNVs(input_df, clust_dist) + input_df.to_csv(out_prefix+ '.calling.step3.unfiltered.tsv', sep='\t', index=False, mode='a') + input_df = input_df[~input_df['STEP3FILTER'].str.contains('dist', regex = True)] + #Filtering PASS SNVs (only left in chrM) - filtered_df = input_df[input_df['FILTER'] =='PASS'] + filtered_df = input_df[input_df['STEP3FILTER'] =='PASS'] # Write output filtered_df.to_csv(out_prefix+ '.calling.step3.tsv', sep='\t', index=False, mode='a') + -def chrM_filtering(CTYPES,DP,VAF,MCF,deltaVAFmin,deltaMCFmin): +def chrM_filtering(STEP3FILTER,CTYPES,DP,VAF,MCF,deltaVAFmin,deltaMCFmin): ctypes = CTYPES.split(',') if len(ctypes)>1: if ctypes[0]=='Cancer': @@ -121,7 +110,10 @@ def chrM_filtering(CTYPES,DP,VAF,MCF,deltaVAFmin,deltaMCFmin): DP1,DP2=DP.split(',') #Only look at higher depth loci if int(DP1)<100 or int(DP2)<100: - return 'LowDepth' + if STEP3FILTER == 'PASS': + return 'LowDepth' + else: + return STEP3FILTER+',LowDepth' else: VAFs = VAF.split(',') MCFs = MCF.split(',') @@ -135,24 +127,38 @@ def chrM_filtering(CTYPES,DP,VAF,MCF,deltaVAFmin,deltaMCFmin): # mtSNVs are variants with high VAF in cancer and low VAF in non-cancer cells if deltaVAF < deltaVAFmin: - return 'LowDeltaVAF' + if STEP3FILTER == 'PASS': + return 'LowDeltaVAF' + else: + return STEP3FILTER+',LowDeltaVAF' + elif deltaMCF < deltaMCFmin: - return 'LowDeltaMCF' + if STEP3FILTER == 'PASS': + return 'LowDeltaMCF' + else: + return STEP3FILTER+',LowDeltaMCF' else: - return 'PASS' + return STEP3FILTER elif len(ctypes)==1: - if ctypes[0]!="Cancer": - return 'Non-Cancer' - elif int(DP)<100: - return 'LowDepth' + if int(DP)<100: + if STEP3FILTER == 'PASS': + return 'LowDepth' + else: + return STEP3FILTER+',LowDepth' elif float(VAF)<0.05: - return 'LowVAF' + if STEP3FILTER == 'PASS': + return 'LowVAF' + else: + return STEP3FILTER+',LowVAF' elif float(MCF)<0.05: - return 'LowMCF' + if STEP3FILTER == 'PASS': + return 'LowMCF' + else: + return STEP3FILTER+',LowMCF' else: - return 'PASS' + return STEP3FILTER def MultiAllelic_filtering(REF, ALT, FILTER, CTYPES, DP, NC, BC, CC, VAF, MCF, CancerInfo, NonCancerInfo): ref_dict = {'A':0, 'C':1, 'T':2, 'G':3} @@ -174,8 +180,6 @@ def MultiAllelic_filtering(REF, ALT, FILTER, CTYPES, DP, NC, BC, CC, VAF, MCF, C index = np.argmax(BCS) BCS[index] = 0 # removing max to select next "max" MAX2 = max(BCS) - if not(MAX2/MAX<0.05): # one alt 20x larger than the other) - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' ALT_Cancer = 'ACTG'[index] BC_Cancer = int(CancerInfo.split('|')[3].split(':')[index]) @@ -195,42 +199,89 @@ def MultiAllelic_filtering(REF, ALT, FILTER, CTYPES, DP, NC, BC, CC, VAF, MCF, C VAF = ','.join([str(VAF_NonCancer),str(VAF_Cancer)]) MCF = ','.join([str(MCF_NonCancer),str(MCF_Cancer)]) + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'Multi-Allelic' + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'PASS' + + elif len(ctypes)==1: + BCS = CancerInfo.split('|')[3].split(':')[:4] + BCS = [int(i) for i in BCS] + BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT + MAX = max(BCS) + index = np.argmax(BCS) + BCS[index] = 0 # removing max to select next "max" + MAX2 = max(BCS) + + ALT = 'ACTG'[index] + BC = int(CancerInfo.split('|')[3].split(':')[index]) + CC = int(CancerInfo.split('|')[2].split(':')[index]) + VAF = round(BC/int(DP),4) + MCF= round(CC/int(NC),4) + FILTER = FILTER.replace('Multi-allelic,','') FILTER = FILTER.replace(',Multi-allelic','') FILTER = FILTER.replace('Multi-allelic','') - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' - - elif len(ctypes)==1: - if ctypes[0]=='Cancer': - BCS = CancerInfo.split('|')[3].split(':')[:4] - BCS = [int(i) for i in BCS] - BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT - MAX = max(BCS) - index = np.argmax(BCS) - BCS[index] = 0 # removing max to select next "max" - MAX2 = max(BCS) - if not(MAX2/MAX<0.05): # one alt 20x larger than the other) - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' - - ALT = 'ACTG'[index] - BC = int(CancerInfo.split('|')[3].split(':')[index]) - CC = int(CancerInfo.split('|')[2].split(':')[index]) - VAF = round(BC/int(DP),4) - MCF= round(CC/int(NC),4) - - FILTER = FILTER.replace('Multi-allelic,','') - FILTER = FILTER.replace(',Multi-allelic','') - FILTER = FILTER.replace('Multi-allelic','') - - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' - else: - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'Multi-Allelic' + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'PASS' else: - return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'PASS' +def BC_CC_filtering(STEP3FILTER,ALT,CancerInfo,min_ac_reads,min_ac_cells): + alt_dict = {'A':0, 'C':1, 'T':2, 'G':3} + i_alt = alt_dict[ALT[0]] + try: + CancerInfos = CancerInfo.split('|') + BC = CancerInfos[3].split(':')[i_alt] + CC = CancerInfos[2].split(':')[i_alt] + if int(BC) 1: + if ctypes[0] == 'Cancer': + i_cancer = 0 + i_noncancer = 1 + elif ctypes[1] == 'Cancer': + i_cancer = 1 + i_noncancer = 0 + if CTYPESFILTER.split(',')[i_cancer] in ['Non-Significant','Low-Significance']: + if STEP3FILTER == 'PASS': + return 'CancerNonSig' + else: + return STEP3FILTER + ',CancerNonSig' + if CTYPESFILTER.split(',')[i_noncancer] in ['PASS','Low-Significance']: + if STEP3FILTER == 'PASS': + return 'NonCancerSig' + else: + return STEP3FILTER + ',NonCancerSig' + return STEP3FILTER + + def tag_clustered_SNVs(df, clust_dist): - df2 = df[df['FILTER']=='PASS'] + df2 = df[df['STEP3FILTER']=='PASS'] idx = df2['INDEX'] a=[] for i in idx: @@ -249,13 +300,13 @@ def tag_clustered_SNVs(df, clust_dist): trash.append(':'.join([chr2,pos2,base2])) trash = set(trash) updated_filter= df.apply(lambda x : - modify_filter(x['INDEX'],x['FILTER'], clust_dist, trash), + modify_filter(x['INDEX'],x['STEP3FILTER'], clust_dist, trash), axis=1) print(updated_filter) return updated_filter def modify_filter(INDEX, FILTER, clust_dist, trash): - clustered = 'Clust_dist{}'.format(str(clust_dist)) + clustered = 'Clust_dist_{}'.format(str(clust_dist)) if INDEX in trash: if FILTER == 'PASS': return clustered