diff --git a/bin/extract_variant_reads.py b/bin/extract_variant_reads.py new file mode 100755 index 0000000..3a77836 --- /dev/null +++ b/bin/extract_variant_reads.py @@ -0,0 +1,649 @@ +#!/usr/local/bin/python3 + +from __future__ import division +import pysam +import biotite.sequence.align as align +import biotite.sequence as seq +import argparse +import re +import string +import csv, sys +import scipy.stats as stats +import pandas as pd, pyranges as pr + +# reverse complement function +def revcomp(seq): + tab = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd', 'TGCAtgcaYRKMyrkmBVDHbvdh') # maketrans <- maps the reverse complement + return seq.translate(tab)[::-1] # translate(x)[::-1] <- works backward through the string, effectively reversing the string + +# make cigar tuples from a cigar string +def make_cigar_tuples(cigar): + cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8} + # use regex to parse cigar string into tuples + cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar) + cigar_tuples = [(cigar_dict[operation],int(length)) for length, operation in cigar_tuples] + return cigar_tuples + +def cigar_summary(cigar): + cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8} + # use regex to parse cigar string into tuples + cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar) + cigar_sum = { 'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0 } + for length, operation in cigar_tuples: + cigar_sum[operation] += int(length) + + return cigar_sum + +def make_aligned_pairs(sup_align,refseq): + """ + Converts a supplementary alignment string into a list of aligned pairs. + + Args: + supplementary_alignment (str): Supplementary alignment string (e.g., 'chr12,107092390,-,110M41S,60,0'). + + Returns: + List of tuples: Aligned pairs (e.g., [(0, 168391768, 'C'), (1, 168391769, 'C'), ...]). + """ + # Split the supplementary alignment string into components + ref_name, ref_start, strand, cigar, _, _ = supplementary_alignment.split(',') + + # Convert positions and CIGAR to appropriate types + ref_start = int(ref_start) + cigar_ops = parse_cigar(cigar) + + # Initialize variables + aligned_pairs = [] + read_pos = 0 + ref_pos = ref_start + + for op, length in cigar_ops: + if op == 'M': # Match/mismatch + for i in range(length): + aligned_pairs.append((read_pos, ref_pos, refseq[ref_pos - ref_start])) + read_pos += 1 + ref_pos += 1 + elif op == 'I': # Insertion + for i in range(length): + aligned_pairs.append((read_pos, None, refseq[ref_pos - ref_start])) + read_pos += 1 + elif op == 'D': # Deletion + for i in range(length): + aligned_pairs.append((None, ref_pos, refseq[ref_pos - ref_start])) + ref_pos += 1 + elif op == 'S': # Soft clipping + read_pos += length + # Skipping 'H' (hard clipping) and 'N' (skipped region) as they do not appear in the read + + return aligned_pairs + + +def indels_from_aligned_pairs(pairs,readseq): + + # remove leading soft clips + while pairs[0][0] is None: + pairs = pairs[1:] + + # remove training soft clips + while pairs[-1][0] is None: + pairs = pairs[:-1] + + pairs = [ (x[0],x[1],x[2].upper(),None) for x in pairs ] + for i in range(len(pairs)): + if pairs[i][0] is not None: + pairs[i] = (pairs[i][0],readseq[pairs[i][0]],pairs[i][1],pairs[i][2]) + + variant_start_index = j-1 + variant_end_index = j-1 + j = 0 + while j < len(pairs): + if pairs[j][0] is not None and pairs[j][2] is not None: + j+=1 + continue + + if pairs[j][0] is None or pairs[j][2] is None: # indel + variant_start_index = j-1 + variant_end_index = len(pairs) + break + + j+=1 + + # construct indel tuple + indel = (pairs[variant_start_index][2],''.join([ x[3] if x[3] else '' for x in pairs[variant_start_index:variant_end_index]]),''.join([ x[1] if x[1] else '' for x in pairs[variant_start_index:variant_end_index]])) + + return indel + +# function to count the number of reads that support an indel or BND event +def add_normal_counts(df, reads, fasta, maxreads=0,handicap=5,window=25,flank=150): + + # make ref and alt sequences for each indel/BND + df['refseq'] = '' + df['altseq'] = '' + for it, row in df.iterrows(): + df.at[it,'refseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']+len(row['ref'])+flank) + if row['type'] in ['INDEL','DEL','INS']: + df.at[it,'altseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']-1) + row['alt'] + fasta.fetch(row['chrom'],row['pos']+len(row['ref'])-1,row['pos']+len(row['ref'])+flank) + elif row['type'] == 'BND': + df.at[it,'altseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']-1) + (fasta.fetch(row['chrom2'],row['pos2']-1,row['pos2']+flank) if row['strands']=='++' else revcomp(fasta.fetch(row['chrom2'],row['pos2']-1-flank,row['pos2']))) + + df['control_alt_counts'] = 0 + df['control_total_counts'] = 0 + + total_reads = set() + + # iterate through reads in bam file for the first position + for read in reads: + # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end + if read.is_mapped is False or \ + read.is_duplicate is True or \ + read.is_secondary is True or \ + read.is_supplementary is True or \ + read.mapping_quality == 0: + continue + + total_reads.add(read.query_name) + + # for speed: if the read has no indels, add it to the ref_count set + if len(read.cigartuples) == 1 and read.cigartuples[0][0] == 0 and read.cigartuples[0][1] == len(read.query_sequence): + continue + + for it, row in df.iterrows(): + + if read.cigartuples[0][0]==0 and read.cigartuples[-1][0]==0: + if (len(row['ref']) > len(row['alt']) and 'D' not in read.cigarstring) or (len(row['alt']) > len(row['ref']) and 'I' not in read.cigarstring): + continue + + if row['type'] == 'BND' and not read.has_tag('SA'): + continue + + refseq = row['refseq'] + altseq = row['altseq'] + + ref_align = align.align_optimal(seq.NucleotideSequence(row['refseq']),seq.NucleotideSequence(read.query_sequence),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0] + + alt_align = align.align_optimal(seq.NucleotideSequence(row['altseq']),seq.NucleotideSequence(read.query_sequence),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0] + + if alt_align.score > 0 and alt_align.score - handicap > ref_align.score: + ref_align_cigar_sum = cigar_summary(align.write_alignment_to_cigar(ref_align)) + alt_align_cigar_sum = cigar_summary(align.write_alignment_to_cigar(alt_align)) + + if ((len(row['ref'])-len(row['alt']) > 0 and ref_align_cigar_sum['D']==len(row['ref'])-len(row['alt'])) or + (len(row['alt'])-len(row['ref']) > 0 and ref_align_cigar_sum['I']==len(row['alt'])-len(row['ref']))): + df.at[it,'control_alt_counts'] += 1 + # print to stderr: found control alt count + print(f"\tFound control alt count for {row['chrom']}:{row['pos']}:{row['ref']}:{row['alt']}:{read.query_name}", file=sys.stderr) + + df['control_total_counts'] = len(total_reads) + + return df.copy() + + +# function to get indels for one amplicon from bam file +def get_indels(bam,controlbam,chr,start,end,fasta,window=100,distance=25,pam_positions=pd.NA,minSecMapQual=10,maxNM=5,svDistanceThreshold=100000,saAlignmentTolerance=5,minSoftClipLength=5,deletionDistanceThreshold=1000,minreads=1,maxcontrol=0,strict=False,verbose=False): + + # window == flanking sequence to add to read search + # distance == max distance from start/end to consider an indel, BND, or call a read wild-type. + + cigarVarDict = {'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'} + + readaln = pd.DataFrame(columns=['read','chrom','pos','chrom2','pos2','ref','alt','strands','type']) + + # if verbose, print to stderr: getting reads that align within window of start and end + if verbose: + print(f"\tGetting reads that align within window of {chr}:{start}-{end}", file=sys.stderr) + + regionStart = max(start - svDistanceThreshold,1) + regionEnd = min(end + svDistanceThreshold,fasta.get_reference_length(chr)) + regionSeq = fasta.fetch(chr,regionStart-1,regionEnd) + + # get reads that align within window of start and end + for read in bam.fetch(chr, start-window, end+window, multiple_iterators = True): + + # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end + if read.is_mapped is False or \ + read.is_duplicate is True or \ + read.is_secondary is True or \ + read.is_supplementary is True or \ + read.mapping_quality == 0: + continue + + cigar = read.cigartuples # get cigar info + mate_cigar = make_cigar_tuples(read.get_tag('MC')) if read.has_tag('MC') else None + + # Quality filter: Get mismatches from NM tag and subtract deleted/skipped bases from cigar + # and skip if too many mismatches + if read.has_tag('NM') and int(read.get_tag('NM')) > maxNM: + if (int(read.get_tag('NM')) - sum([x[1] for x in cigar if x[0] == 2 or x[0]==3])) > maxNM: + continue + + read_strand = "+" + if read.is_reverse is True: + read_strand = "-" + + leftSoftClip = 0 + rightSoftClip = 0 + read_reference_start = read.reference_start + 1 + read_reference_end = read.reference_end + read_next_reference_start = read.next_reference_start + 1 + read_next_reference_end = None if mate_cigar is None else read_next_reference_start + sum([x[1] for x in mate_cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + + read_query_start = 0 + read_query_end = len(read.query_sequence) + + # setup indelinfo dictionary + indelinfo = {'read':read.query_name,'chrom':'','pos':None,'chrom2':None,'pos2':None,'strands': '','ref':'','alt':'','type':''} + + # Get left soft clip length + if cigar[0][0] == 4: + leftSoftClip = cigar[0][1] + + # Get right soft clip length + if cigar[-1][0] == 4: + rightSoftClip = cigar[-1][1] + + # + # This adjusts alignment cigars for soft clips and supplementary alignments. + # It extends the cigar if the softclips map nearby and therefore the read has a deletion. + # + + # Process left soft clip, if present + if leftSoftClip > rightSoftClip: + + # first check for local supplementary alignments + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] + + if (sChr != None and int(sMq)>=minSecMapQual and + int(sNm)= 4 and sCigarTuples[0][0] == 0 and sCigarTuples[0][1] == cigar[0][1]: + cigar = sCigarTuples[:-1] + [(2,read_reference_start-sPos-1)] + cigar[1:] + + else: + # if there is a supplementary alignment, then realign the entire read to the reference sequence + cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(regionSeq[sPos-regionStart:read_reference_end-regionStart+1]), #seq.NucleotideSequence(fasta.fetch(chr,sPos-1,read_reference_end)), + seq.NucleotideSequence(read.query_sequence), + matrix=align.SubstitutionMatrix.std_nucleotide_matrix(), + gap_penalty=(-10,-1),local=False,max_number=1)[0])) + read_reference_start = sPos + + else: + # if the left soft clip is long enough, see if it supports a deletion within window + # this performs an exact match and so isnt ideal, but should be good enough for now + if leftSoftClip >= minSoftClipLength: + clippedSeq = read.query_sequence[:leftSoftClip] + refSeq = regionSeq[read_reference_start - regionStart - window + 1:read_reference_start - regionStart + 1] #fasta.fetch(chr,read_reference_start - window,read_reference_start) + # find the position of the left-most occurance of clippedSeq in refSeq + leftClipPos = refSeq.rfind(clippedSeq) + if leftClipPos != -1: + delSeq = refSeq[leftClipPos + len(clippedSeq)-1:] + cigar = [(0,len(clippedSeq))] + [(2,len(delSeq))] + cigar[1:] + read_reference_start = read_reference_end - sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + 1 + + # Process right soft clip, if present + if rightSoftClip > leftSoftClip: + + # first check for local supplementary alignments + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] + + if (sChr != None): + sPos = int(sPos) + sCigarTuples = make_cigar_tuples(sCigar) + + if (int(sMq)>=minSecMapQual and sCigarTuples[0][0] >= 4 and sCigarTuples[-1][0] == 0 and + int(sNm) read_reference_end and + int(sPos) - read_reference_end < svDistanceThreshold): + + if sCigarTuples[0][0] >= 4 and sCigarTuples[-1][0] == 0 and sCigarTuples[-1][1] == cigar[-1][1]: + cigar = cigar[:-1] + [(2,read_reference_end-sPos)] + sCigarTuples[1:] + + else: + cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(regionSeq[read_reference_start-regionStart:sPos-regionStart+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3])]), #fasta.fetch(chr,read_reference_start-1,sPos-1+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]))), + seq.NucleotideSequence(read.query_sequence), + matrix=align.SubstitutionMatrix.std_nucleotide_matrix(), + gap_penalty=(-10,-1),local=False,max_number=1)[0])) + + read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + + else: + # if the left soft clip is long enough, see if it supports a deletion within deletionDistanceThreshold + # this performs an exact match and so isnt ideal, but should be good enough for now + if rightSoftClip >= minSoftClipLength: + clippedSeq = read.query_sequence[-rightSoftClip:] + refSeq = regionSeq[read_reference_end-regionStart:read_reference_end-regionStart + window] #fasta.fetch(chr,read_reference_end,read_reference_end + window) + # find the position of the left-most occurance of clippedSeq in refSeq + rightClipPos = refSeq.find(clippedSeq) + if rightClipPos != -1: + delSeq = refSeq[0:rightClipPos] + cigar = cigar[:-1] + [(2,len(delSeq))] + [(0,len(clippedSeq))] + read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + + # Adjusted read has to be within distance of the start/end positions to consider + if read_reference_start - leftSoftClip >= end + distance or read_reference_end + rightSoftClip <= start - distance: + continue + + # remove leading or trailing soft clips + if cigar[0][0] >= 4: + read_query_start += cigar[0][1] + del cigar[0] + + if cigar[-1][0] >= 4: + read_query_end -= cigar[-1][1] + del cigar[-1] + + # there are multiple cigar operations bookened by matches, the process an indel + if len(cigar) > 1: + + # adjust alignment start for first aligned block and remove it + if cigar[0][0] == 0: + read_reference_start += cigar[0][1]# sets to position before event + read_query_start = read_query_start + cigar[0][1] + del cigar[0] + + # adjust alignment end for last aligned block and remove it + if cigar[-1][0] == 0: + read_reference_end = read_reference_end - cigar[-1][1] + read_query_end = read_query_end - cigar[-1][1] + del cigar[-1] + + # If there's only one cigar operation left, it's an insertion or deletion + if len(cigar) == 1: + + # theres one event, so record the coordinates + indelinfo['chrom'] = read.reference_name + indelinfo['pos'] = read_reference_start + indelinfo['strands'] = '++' + + # if insertion + if cigar[0][0] == 1: + indelinfo['ref'] = regionSeq[read_reference_start-regionStart-1:read_reference_start-regionStart] #fasta.fetch(chr,read_reference_start-1-1,read_reference_start-1) # 0 based and position before the isertion + indelinfo['alt'] = read.query_sequence[read_query_start-1:read_query_start+cigar[0][1]] + indelinfo['type'] = 'INDEL' + + # if deletion + elif cigar[0][0] == 2 or cigar[0][0] == 3: + indelinfo['ref'] = regionSeq[read_reference_start-regionStart:read_reference_start-regionStart + cigar[0][1] + 1] #fasta.fetch(chr,read_reference_start-1,read_reference_start+cigar[0][1]) + indelinfo['alt'] = indelinfo['ref'][0] + indelinfo['type'] = 'INDEL' + + # if its a complex indel + else: + indelinfo['chrom'] = read.reference_name + indelinfo['pos'] = read_reference_start + indelinfo['strands'] = '++' + indelinfo['type'] = 'INDEL' + + # iterate through cigar and get indels + varlen = sum([ x[1] for x in cigar ]) + indelinfo['ref'] = regionSeq[read_reference_start-regionStart:read_reference_start-regionStart+sum([ x[1] for x in cigar ])+1] #fasta.fetch(chr,read_reference_start-1,read_reference_start+sum([ x[1] for x in cigar ])) + indelinfo['alt'] = read.query_sequence[read_query_start+1:read_query_start+sum([x[1] if x[0] == 1 or x[0] == 0 else 0 for x in cigar])] + + # if the read is chimeric and has a supplementary alignment near the start/end, process the chimeric read + elif read.has_tag('SA') is True and (leftSoftClip > 0 or rightSoftClip > 0): + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] + + sCigarTuples = make_cigar_tuples(sCigar) + sPos = int(sPos) + sEnd = sPos + sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]) - 1 + sLeftSoftClip = sCigarTuples[0][1] if sCigarTuples[0][0] >= 4 else 0 + sRightSoftClip = sCigarTuples[-1][1] if sCigarTuples[-1][0] >= 4 else 0 + + pSeq = read.query_alignment_sequence + sSeq = read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])] if sStrand == read_strand else revcomp(read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])]) + + # correct for overlapping alignments + if len(pSeq)+len(sSeq) > len(read.query_sequence): + trimLen = len(pSeq)+len(sSeq) - len(read.query_sequence) + if sLeftSoftClip > sRightSoftClip: + sPos += trimLen + sSeq = sSeq[trimLen:] + + elif sRightSoftClip > sLeftSoftClip: + sEnd -= trimLen + sSeq = sSeq[:-trimLen] + + if (int(sMq) >= minSecMapQual and int(sNm) <= maxNM and int(sMq)>0 and + (sChr != read.reference_name or abs(int(sPos) - read_reference_start) >= svDistanceThreshold) or sStrand != read_strand or + (rightSoftClip > 0 and sEnd < read_reference_start) or (leftSoftClip > 0 and sPos > read_reference_end)): + + indelinfo['chrom'] = read.reference_name + indelinfo['chrom2'] = sChr + indelinfo['strands'] = '++'if sStrand == read_strand else '+-' + indelinfo['type'] = 'BND' + + if indelinfo['strands'] == '++': + # if --->...> ...>---> or <---<... <...<--- + if rightSoftClip > leftSoftClip and sLeftSoftClip > sRightSoftClip: + indelinfo['pos'] = read_reference_end + indelinfo['pos2'] = sPos + + # if ...>---> --->...> or <...<--- <---<... + elif leftSoftClip > rightSoftClip and sRightSoftClip > sLeftSoftClip: + indelinfo['pos'] = read_reference_start - 1 + indelinfo['pos2'] = sEnd + 1 + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr) + if strict: + exit(1) + else: + continue + + elif indelinfo['strands'] == '+-': + # if --->...> <---<... or <---<... --->...> + if rightSoftClip > leftSoftClip and sRightSoftClip > sLeftSoftClip: + indelinfo['pos'] = read_reference_end + indelinfo['pos2'] = sEnd + + # if ...>---> <...<--- or <...<--- ...>---> + elif leftSoftClip > rightSoftClip and sLeftSoftClip > sRightSoftClip: + indelinfo['pos'] = read_reference_start - 1 + indelinfo['pos2'] = sPos + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr) + if strict: + exit(1) + else: + continue + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr) + if strict: + exit(1) + else: + continue + + elif read_reference_start < end and read_reference_end > start: + indelinfo['chrom'] = chr + indelinfo['pos'] = start + indelinfo['ref'] = '.' + indelinfo['alt'] = '.' + indelinfo['type'] = 'REF' + + else: + continue + + elif read_reference_start < end and read_reference_end > start: + indelinfo['chrom'] = chr + indelinfo['pos'] = start + indelinfo['ref'] = '.' + indelinfo['alt'] = '.' + indelinfo['type'] = 'REF' + + else: + continue + + # add indel info to dataframe + readaln = pd.concat([readaln, pd.DataFrame([indelinfo])], ignore_index=True) + + # if verbose, sorting indels + if verbose: + print("\tSorting indels", file=sys.stderr) + + # group by read and sort by cigar, then sv and get the first value in each group + readaln = readaln.sort_values(by=['read','chrom','pos','chrom2','pos2','strands','ref','alt','type'],key=lambda col: col != '',ascending=False).groupby('read').first().reset_index() + indelcounts = readaln.groupby(['chrom','pos','chrom2','pos2','strands','ref','alt','type'],dropna=False).size().reset_index(name='counts') + + # need to recast as int type, but allow for NA values + indelcounts['pos'] = indelcounts['pos'].astype(pd.Int64Dtype()) + indelcounts['pos2'] = indelcounts['pos2'].astype(pd.Int64Dtype()) + + # if verbose, get control counts + if verbose: + print("\tGetting control counts", file=sys.stderr) + + # get counts of reads in the controlbam for each indel/BND + if len(indelcounts) > 0: + # add pam positions to the df + if pam_positions is not pd.NA: + indelcounts['Positions'] = [pam_positions] * len(indelcounts) + indelcounts['Distance'] = indelcounts.apply(lambda r: min([abs(r['pos'] - x) for x in r['Positions']] + [abs(r['pos'] + len(r['ref']) - 1 - x) for x in r['Positions']] ) if r['pos'] is not pd.NA else pd.NA,axis=1) + else: + indelcounts['Positions'] = pd.NA + indelcounts['Distance'] = pd.NA + + indelcounts = add_normal_counts(indelcounts, [ x for x in controlbam.fetch(contig=chr,start=start-window,end=end) ], fasta, window=distance) + + else: + indelcounts['control_alt_counts'] = 0 + indelcounts['control_total_counts'] = 0 + indelcounts['Positions'] = pd.NA + indelcounts['Distance'] = pd.NA + + + # apply filters + indelcounts = indelcounts[(indelcounts['counts'] >= minreads) | (indelcounts['ref']=='.')] + indelcounts = indelcounts[(indelcounts['control_alt_counts'] <= maxcontrol) | (indelcounts['ref']=='.')] + indelcounts = indelcounts[(indelcounts['Distance'] <= distance) | (indelcounts['ref']=='.')] + indelcounts = indelcounts[~indelcounts['alt'].str.contains('N')] + + return(indelcounts) + + +def main(): + + parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates') + parser.add_argument('-f','--fasta',type=str,default="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6/hg38_PLVM_CD19_CARv4_cd34.fa",help='Reference fasta file') + parser.add_argument('-w','--window',type=int,default=100,help='Distance between off-target sites for merging intervals.') + parser.add_argument('-d','--distance',type=int,default=25,help='Window size around off-target site to identify mutations') + parser.add_argument('-m','--minreads',type=int,default=1,help='Minimum supporting reads to report an indel/bnd event.') + parser.add_argument('-c','--chromosome',type=str,default=None,help='Chromosome to process.') + parser.add_argument('-x','--maxcontrol',type=int,default=0,help='Maximum supporting reads to to report an indel/bnd event.') + parser.add_argument('-s','--strict',action='store_true',help='Exit if a read cannot be properly parsed.') + # add verbosity argument + parser.add_argument('-v','--verbose',action='store_true',help='Print verbose output') + + # add outfile argument -o or --outfile + parser.add_argument('-o','--outfile',type=str,help='Output file (optional)') + parser.add_argument('bed',help='Off-target coordinate file') + parser.add_argument('expbamfile',help='BAM file') + parser.add_argument('conbamfile',help='BAM file') + + args = parser.parse_args() + + # TODO: + # Nihdi: We need to update this code to take the file and return 2 objects: + # 1. Take the positions and make intervals that are chr,(pos-distup),(pos+distdown) and then merge them into new list of non-redunant intervals (might use pyranges for this) + # 2. The individual entries need to be retained, though, along with the information about the off-target (the exact position, the mismatches, etc) + + # verbose to stderr: processing input file + if args.verbose: + print("Processing input file", file=sys.stderr) + + # open off-target file and create ranges to search + bedDf = pd.read_csv(args.bed) + bedDf['Pos'] = bedDf['Start'] + bedDf['End'] = bedDf['Start'] + bedDf['Start'] = bedDf['Start'] - 1 + bedDf['Info'] = bedDf.apply(lambda row: f"{row['Source']},{row['DNA Sequence']},{row['PAM']},{row['Chromosome']},{row['Pos']},{row['Strand Direction']},{row['Mismatch']},{row['Bulge Type']},{row['Bulge Size']}", axis=1) + bedDf['Ontarget'] = bedDf['On_target'] + + # make pyranges object + bedPr = pr.PyRanges(bedDf[['Chromosome','Start','End','Pos','Info','Ontarget']]) + + # cluster the intervals + bedPr = bedPr.cluster(slack=args.window) + mergedBedPr = bedPr.merge(by='Cluster',strand=False,slack=args.window) + mergedBedDf = mergedBedPr.df.join(bedPr.df.groupby('Cluster')['Pos'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left') + mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Info'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left') + mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Ontarget'].agg('first').reset_index().set_index('Cluster'),on='Cluster',how='left') + + if args.chromosome is not None: + print("Processing chromosome", args.chromosome, file=sys.stderr) + mergedBedDf = mergedBedDf[mergedBedDf['Chromosome'] == args.chromosome] + + # verbose: say done + if args.verbose: + print("Done processing input file", file=sys.stderr) + + # open bam file(s) + expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename=args.fasta) + consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename=args.fasta) + # open fasta file + refFasta = pysam.FastaFile(args.fasta) + + # print to outfile or stdout + if args.outfile: + sys.stdout = open(args.outfile, 'w') + + print("\t".join('chrom start end pam_positions total_reads indel_reads indel_fraction control_reads control_indel_reads control_indel_fraction indel_count indel_info bnd_count bnd_info target_info is_target'.split(' ')),flush=True) + + # iterate over mergedBedPr intervals: + for index, row in mergedBedDf.iterrows(): + + # verbose to stderr: processing interval + if args.verbose: + print(f"Processing interval {row['Chromosome']}:{row['Start']}-{row['End']}", file=sys.stderr) + + indels = get_indels(bam=expsamfile,controlbam=consamfile,chr=row['Chromosome'],start=row['Start'],end=row['End'], + fasta=refFasta,window=args.window,distance=args.distance,pam_positions=row['Pos'],minreads=args.minreads,maxcontrol=args.maxcontrol,verbose=args.verbose) + + total_reads, indel_reads, control_total_reads, control_indel_reads = 0, 0, 0, 0 + indel_fraction, control_indel_fraction = 0, 0 + indel_keys, bnd_keys = '.', '.' + bnds = [] + + if len(indels) > 0: + # get total reads in this region + total_reads = sum(indels['counts']) + indel_reads = sum(indels[indels['type']!='REF']['counts']) + control_indel_reads = int(indels['control_alt_counts'].mean()) + control_total_reads = int(indels['control_total_counts'].mean()) + + # separate BNDs and indels + bnds = indels[indels['type']=='BND'].copy() + indels = indels[indels['type']=='INDEL'].copy() + + if len(indels) > 0: + indels['Key'] = indels.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['ref']}:{r['alt']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1) + + if len(bnds) > 0: + bnds['Key'] = bnds.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['chrom2']}:{r['pos2']}:{r['strands']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1) + + # print results to tab-delimited file + indel_fraction = round(indel_reads/total_reads,4) if total_reads > 0 else 0 + control_indel_fraction = round(control_indel_reads/control_total_reads,4) if control_total_reads > 0 else 0 + indel_keys = ';'.join(indels['Key'].tolist()) if len(indels) > 0 else '.' + bnd_keys = ';'.join(bnds['Key'].tolist()) if len(bnds) > 0 else '.' + ontarget = row['Ontarget'] + offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.' + + print(f"{row['Chromosome']}\t{row['Start']}\t{row['End']}\t{';'.join([ str(x) for x in row['Pos'] ])}\t{total_reads}\t{indel_reads}\t{indel_fraction}\t{control_total_reads}\t{control_indel_reads}\t{control_indel_fraction}\t{len(indels)}\t{indel_keys}\t{len(bnds)}\t{bnd_keys}\t{offtargetsites}\t{ontarget}",flush=True) + + + # close stdout or outfile + if args.outfile: + sys.stdout.close() + + expsamfile.close() + consamfile.close() + refFasta.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/getIndelsFromBam.py b/bin/getIndelsFromBam.py index c159a7d..3d147e7 100755 --- a/bin/getIndelsFromBam.py +++ b/bin/getIndelsFromBam.py @@ -2,289 +2,556 @@ from __future__ import division import pysam +import biotite.sequence.align as align +import biotite.sequence as seq import argparse import re import string -import csv +import csv, sys import scipy.stats as stats +import pandas as pd, pyranges as pr # reverse complement function def revcomp(seq): tab = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd', 'TGCAtgcaYRKMyrkmBVDHbvdh') # maketrans <- maps the reverse complement return seq.translate(tab)[::-1] # translate(x)[::-1] <- works backward through the string, effectively reversing the string +# make cigar tuples from a cigar string +def make_cigar_tuples(cigar): + cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8} + # use regex to parse cigar string into tuples + cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar) + cigar_tuples = [(cigar_dict[operation],int(length)) for length, operation in cigar_tuples] + return cigar_tuples + +# function to count the number of reads that support an indel or BND event +def calculate_normal_counts(row, bam, fasta, window=25): + + # subfunctoin to count read hits to a sequence + def count_read_hits(bam,fasta,chrom,pos,chrom2=None,pos2=None,ref=None,alt=None,orientation=None,handicap=5,window=25,flank=150): + + # note, the handicap is used to adjust the score of the alignment to account for the difference in length between the ref and alt sequences + # if the normal reads really support the indel, it should have a score that reflects the difference in length between the ref and alt sequences + refseq = '' + altseq = '' + + # make ref and alt sequences, first if an indel, next if a BND. + # note that if its a BND then the refseq is a concatenation of the two sequences from either end + if ref and alt: + refseq = fasta.fetch(chrom,pos-1-flank,pos+len(ref)+flank) + altseq = fasta.fetch(chrom,pos-1-flank,pos-1) + alt + fasta.fetch(chrom,pos+len(ref)-1,pos+len(ref)+flank) + #handicap = abs(len(ref) - len(alt)) if handicap == 0 else handicap + + elif not chrom2 is pd.NA and not pos2 is pd.NA and not orientation is None: + handicap = 50 if handicap == 0 else handicap + refseq = fasta.fetch(chrom,pos-1-flank,pos+flank) + altseq = fasta.fetch(chrom,pos-1-flank,pos-1) + if orientation == '++': + altseq += fasta.fetch(chrom2,pos2-1,pos2+flank) + elif orientation == '+-': + altseq += revcomp(fasta.fetch(chrom2,pos2-1-flank,pos2)) + + # make ref_count and alt_count counters + ref_count = set() + alt_count = set() + + # iterate through reads in bam file for the first position + for read in bam.fetch(chrom,pos-1-window,pos+window,multiple_iterators=True): + if not read.is_mapped: + continue + + # if the read has no indels, add it to the ref_count set + if len(read.cigartuples) == 1 and read.cigartuples[0][0] == 0 and read.cigartuples[0][1] == len(read.query_sequence): + ref_count.add(read.query_name) + continue + + ref_align = align.align_optimal(seq.NucleotideSequence(read.query_sequence),seq.NucleotideSequence(refseq),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0].score + + alt_align = align.align_optimal(seq.NucleotideSequence(read.query_sequence),seq.NucleotideSequence(altseq),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0].score + + if alt_align > 0 and alt_align - handicap > ref_align: + alt_count.add(read.query_name) + else: + ref_count.add(read.query_name) + + return (len(alt_count),len(alt_count)+len(ref_count)) + + if row['type'] == 'BND': + return count_read_hits( + bam, fasta, window=window, + chrom=row['chrom'], pos=row['pos'], + chrom2=row['chrom2'], pos2=row['pos2'], + orientation=row['strands'] + ) + elif row['type'] == 'INDEL': + return count_read_hits( + bam, fasta, window=window, + chrom=row['chrom'], pos=row['pos'], + ref=row['ref'], alt=row['alt'] + ) + else: + #print(f"{row['chrom']}\t{row['pos']}\t{row['type']}") + control_reads = [ x.query_name for x in bam.fetch(row['chrom'],row['pos']-1-window,row['pos']+window,multiple_iterators=True) ] + control_reads = set(control_reads) + return (0,len(control_reads)) + + # function to get indels for one amplicon from bam file -def get_indels(bam,chr,start,end,pampos,strand,srchSeq,srchSeqMM): +def get_indels(bam,controlbam,chr,start,end,fasta,window=100,distance=25,pam_positions=pd.NA,minSecMapQual=10,maxNM=5,svDistanceThreshold=100000,saAlignmentTolerance=5,minSoftClipLength=5,deletionDistanceThreshold=1000,minreads=1,maxcontrol=0,strict=False): + + # window == flanking sequence to add to read search + # distance == max distance from start/end to consider an indel, BND, or call a read wild-type. - dat = {'all':0,'indels':{},'sv':{},'seq':0} + cigarVarDict = {'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'} + + readaln = pd.DataFrame(columns=['read','chrom','pos','chrom2','pos2','ref','alt','strands','type']) - for read in bam.fetch(chr, start, end, multiple_iterators = True): + # get reads that align within window of start and end + for read in bam.fetch(chr, start-window, end+window, multiple_iterators = True): # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end if read.is_mapped is False or \ read.is_duplicate is True or \ read.is_secondary is True or \ read.is_supplementary is True or \ - read.reference_start > end or \ - read.reference_end < start: - continue - - if read.mapping_quality == 0: + read.mapping_quality == 0: continue - if read.has_tag('NM') is True and int(read.get_tag('NM'))>5: - continue + cigar = read.cigartuples # get cigar info + mate_cigar = make_cigar_tuples(read.get_tag('MC')) if read.has_tag('MC') else None - # check for a chimeric read - if read.is_proper_pair is False and read.mate_is_mapped is True: - mate_contig = read.next_reference_name - mate_start = read.next_reference_start - if mate_contig != read.reference_name or abs(mate_start - read.reference_start) > 1000: - dat['sv'][':'.join([read.reference_name,str(read.reference_start),read.next_reference_name,str(read.next_reference_start)])] = 1 - continue + # Quality filter: Get mismatches from NM tag and subtract deleted/skipped bases from cigar + # and skip if too many mismatches + if read.has_tag('NM') and int(read.get_tag('NM')) > maxNM: + if (int(read.get_tag('NM')) - sum([x[1] for x in cigar if x[0] == 2 or x[0]==3])) > maxNM: + continue - if read.has_tag('SA') is True: - sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(',') - if sChr != read.reference_name and int(sMq)>0: - dat['sv'][':'.join([read.reference_name,str(read.reference_start),sChr,str(sPos)])] = 1 - continue - - # check that read start and end are within w bp of the amplicon target start and end (to avoid spurious alignments--skip if this is not true) - cigar = read.cigartuples # get cigar info + read_strand = "+" + if read.is_reverse is True: + read_strand = "-" leftSoftClip = 0 rightSoftClip = 0 + read_reference_start = read.reference_start + 1 + read_reference_end = read.reference_end + read_next_reference_start = read.next_reference_start + 1 + read_next_reference_end = None if mate_cigar is None else read_next_reference_start + sum([x[1] for x in mate_cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + + read_query_start = 0 + read_query_end = len(read.query_sequence) + + # setup indelinfo dictionary + indelinfo = {'read':read.query_name,'chrom':'','pos':None,'chrom2':None,'pos2':None,'strands': '','ref':'','alt':'','type':''} + + # Get left soft clip length if cigar[0][0] == 4: leftSoftClip = cigar[0][1] + # Get right soft clip length if cigar[-1][0] == 4: rightSoftClip = cigar[-1][1] - # skip if read is not aligned across the pampos (incl. soft clips) -# if read.is_proper_pair is True and (read.reference_start - leftSoftClip > pampos or read.reference_end + rightSoftClip < pampos): -# continue + # + # This adjusts alignment cigars for soft clips and supplementary alignments. + # It extends the cigar if the softclips map nearby and therefore the read has a deletion. + # - # skip if read doesnt fully span the pampos - if read.reference_start >= pampos or read.reference_end <= pampos: - continue + # Process left soft clip, if present + if leftSoftClip > 0: - # we cant resolve these so will skip for now. - if leftSoftClip > 0 or rightSoftClip > 0: - continue + # first check for local supplementary alignments + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] - # count all reads covering this bed record position - dat['all'] += 1 + if (sChr != None and int(sMq)>=minSecMapQual and + int(sNm) 2: - del cigar[0] - - if cigar[-1][0] > 2: - del cigar[-1] - - if len(cigar) == 1 and cigar[0][0] == 0: # no indels, so continue - continue - - # get reference position start for the alignment - # then skip beginning and trailing aligned blocks that are aligned - # to the reference - rpos = read.reference_start - - var = [] # operation: [I]nsertion, [D]eletion, or [C]omplex - varlen = [] # length of indel - vartype = [] # variant type - varst = [] # ref pos of variant start - varen = [] # ref pos of variant end - deltabases = 0 # sum of bases inserted/deleted - - for cg in cigar: # rebuild cigar and keep track - s = rpos - e = rpos - if cg[0] == 0 or cg[0] == 2 or cg[0] == 3: - e = rpos + cg[1] - elif cg[0] == 1: - e = rpos + 1 - - # do not include the cigar operation if it does not overlap start,end... - if e < start or s > end: - rpos = e - continue + sPos = int(sPos) - # ...or its a matching segment that goes beyond start,end since these dont matter - elif cg[0] == 0 and (s < start or e > end): - rpos = e - continue - - elif cg[0] == 0: # aligned block - var.append(str(cg[1]) + "M") - varlen.append(cg[1]) - vartype.append("M") - varst.append(s) - varen.append(e) - - elif cg[0] == 1: # an insertion - var.append(str(cg[1]) + "I") - vartype.append("I") - varlen.append(cg[1]) - varst.append(s) - varen.append(e) - deltabases += cg[1] - - elif cg[0] == 2 or cg[0] == 3: # a deletion or reference skip - var.append(str(cg[1]) + "D") - vartype.append("D") - varlen.append(cg[1]) - varst.append(s) - varen.append(e) - deltabases += cg[1] - - else: # account for other operations - var.append(str(cg[1]) + "X") - vartype.append("C") - varlen.append(cg[1]) - varst.append(s) - varen.append(e) - deltabases += cg[1] - - rpos = e # increment reference position + # if there is a supplementary alignment, then realign the entire read to the reference sequence + cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(fasta.fetch(chr,sPos-1,read_reference_end)), + seq.NucleotideSequence(read.query_sequence), + matrix=align.SubstitutionMatrix.std_nucleotide_matrix(), + gap_penalty=(-10,-1),local=False,max_number=1)[0])) + read_reference_start = sPos - # do not continue if there are no indels - if len(var) == 0: - continue - - # combine multiple indels into a complex one - if vartype[0] == 'M': - var = var[1:] - varlen = varlen[1:] - vartype = vartype[1:] - varst = varst[1:] - varen = varen[1:] - - if vartype[-1] == 'M': - var = var[:-1] - varlen = varlen[:-1] - vartype = vartype[:-1] - varst = varst[:-1] - varen = varen[:-1] - - if len(var) > 1: - var = ["".join(var)] - varlen = [sum(varlen)] - vartype = ["C"] - varst = [min(varst)] - varen = [max(varen)] - - i = 0 - # get position relative to PAM, strand - relpos = None - if (vartype[i] == "D" or vartype[i] == "C") and varst[i] <= pampos and varen[i] >= pampos: - relpos = 0 - - elif strand == "+" and varst[i] >= pampos: # fwd downstream - relpos = varst[i] - pampos + else: + # if the left soft clip is long enough, see if it supports a deletion within window + # this performs an exact match and so isnt ideal, but should be good enough for now + if leftSoftClip >= minSoftClipLength: + clippedSeq = read.query_sequence[:leftSoftClip] + refSeq = fasta.fetch(chr,read_reference_start - window,read_reference_start) + # find the position of the left-most occurance of clippedSeq in refSeq + leftClipPos = refSeq.rfind(clippedSeq) + if leftClipPos != -1: + delSeq = refSeq[leftClipPos + len(clippedSeq):] + cigar = [(0,len(clippedSeq))] + [(2,len(delSeq))] + cigar[1:] + read_reference_start = read_reference_end - sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + 1 + + # Process right soft clip, if present + if rightSoftClip > 0: + + # first check for local supplementary alignments + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] + + if (sChr != None and int(sMq)>=minSecMapQual and + int(sNm) read_reference_start and + int(sPos) - read_reference_start < svDistanceThreshold): - elif strand == "+" and varen[i] <= pampos: # fwd upstream - relpos = varen[i] - pampos - - elif strand == "-" and varen[i] <= pampos: # rev downstream - relpos = pampos - varen[i] - - elif strand == "-" and varst[i] >= pampos: # rev upstream - relpos = pampos - varst[i] - - # key is reference_position:cigar:vartype:len:relpos - key = str(varst[i]) + ":" + var[i] + ":" + vartype[i] + ":" + str(varlen[i]) + ":" + str(relpos) - - if key not in dat['indels'].keys(): - dat['indels'][key] = 0 - - dat['indels'][key] += 1 # count the reads with this variant - - return(dat) - - -srchSeq = 'TTAATTGAGTTGTCATATGTTAATAACGGT' # this is the dsODN sequence -srchSeqMM = 2 # mismatches tolerated when looking for the search sequence -maxInControl = 10 - -parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates') -parser.add_argument('bed',help='Amplicon bed file') -parser.add_argument('expbamfile',help='BAM file') -parser.add_argument('conbamfile',help='BAM file') - -args = parser.parse_args() - -bed = args.bed - -# open bam file(s) -expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh/hg38_PLVM_CD19_CARv4_cd34.fa") -consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh/hg38_PLVM_CD19_CARv4_cd34.fa") - -distup = 20 -distdown = 20 - -print("\t".join('id chrom pos strand edited_reads edited_indels edited_unique_indels control_reads control_indels control_unique_indels indel_info exp_svs con_svs'.split(' '))) -#rec = [contig,pampos,strand,datExp['all'],expindels,sum(datExp['filtindels'].values()),datExp['seq'],datCon['all'],controlindels,sum(datCon['filtindels'].values()),datCon['seq'],";".join(map(str,vsfilt)) or '.'] -with open(bed) as f: - for line in f: - rec = line.strip().split(',') + sPos = int(sPos) + sCigarTuples = make_cigar_tuples(sCigar) - p = re.compile("^(\S+)\:([+-])(\d+)") - m = p.search(rec[5]) - contig = m[1] - pampos = int(m[3]) - strand = m[2] - if strand == '+': - pampos = pampos + 20 + cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(fasta.fetch(chr,read_reference_start-1,sPos-1+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]))),seq.NucleotideSequence(read.query_sequence), + matrix=align.SubstitutionMatrix.std_nucleotide_matrix(), + gap_penalty=(-10,-1),local=False,max_number=1)[0])) - st = pampos - distup - en = pampos + distdown - if strand == "-": - st = pampos - distdown - en = pampos + distup + read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) - datExp = get_indels(expsamfile,contig,st,en,pampos,strand,srchSeq,srchSeqMM) - datCon = get_indels(consamfile,contig,st,en,pampos,strand,srchSeq,srchSeqMM) - - datExp['filtindels'] = {} - datExp['filtseq'] = {} - datCon['filtindels'] = {} - datCon['filtseq'] = {} - - expindels = 0 - controlindels = 0 + else: + # if the left soft clip is long enough, see if it supports a deletion within deletionDistanceThreshold + # this performs an exact match and so isnt ideal, but should be good enough for now + if rightSoftClip >= minSoftClipLength: + clippedSeq = read.query_sequence[-rightSoftClip:] + refSeq = fasta.fetch(chr,read_reference_end,read_reference_end + window) + # find the position of the left-most occurance of clippedSeq in refSeq + rightClipPos = refSeq.find(clippedSeq) + if rightClipPos != -1: + delSeq = refSeq[0:rightClipPos] + cigar = cigar[:-1] + [(2,len(delSeq))] + [(0,len(clippedSeq))] + read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) - for k in set(list(datExp['indels'].keys()) + list(datCon['indels'].keys())): # iterate through all exp and con indels + # Adjusted read has to be within distance of the start/end positions to consider + if read_reference_start - leftSoftClip >= end + distance or read_reference_end + rightSoftClip <= start - distance: + continue - if (datCon['indels'].get(k) or 0) > maxInControl: # do not report indels with >maxInControl reads--these are inherited or artifacts. delete from dicts - if datExp['indels'].get(k): - del datExp['indels'][k] - - if datCon['indels'].get(k): - del datCon['indels'][k] + # remove leading or trailing soft clips + if cigar[0][0] >= 4: + read_query_start += cigar[0][1] + del cigar[0] + + if cigar[-1][0] >= 4: + read_query_end -= cigar[-1][1] + del cigar[-1] + + # there are multiple cigar operations bookened by matches, the process an indel + if len(cigar) > 1: + + # adjust alignment start for first aligned block and remove it + if cigar[0][0] == 0: + read_reference_start += cigar[0][1]# sets to position before event + read_query_start = read_query_start + cigar[0][1] + del cigar[0] + + # adjust alignment end for last aligned block and remove it + if cigar[-1][0] == 0: + read_reference_end = read_reference_end - cigar[-1][1] + read_query_end = read_query_end - cigar[-1][1] + del cigar[-1] + + # If there's only one cigar operation left, it's an insertion or deletion + if len(cigar) == 1: + + # theres one event, so record the coordinates + indelinfo['chrom'] = read.reference_name + indelinfo['pos'] = read_reference_start + indelinfo['strands'] = '++' + + # if insertion + if cigar[0][0] == 1: + indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1-1,read_reference_start-1) # 0 based and position before the isertion + indelinfo['alt'] = read.query_sequence[read_query_start-1:read_query_start+cigar[0][1]] + indelinfo['type'] = 'INDEL' + + # if deletion + elif cigar[0][0] == 2 or cigar[0][0] == 3: + indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1,read_reference_start+cigar[0][1]) + indelinfo['alt'] = indelinfo['ref'][0] + indelinfo['type'] = 'INDEL' + + # if its a complex indel + else: + indelinfo['chrom'] = read.reference_name + indelinfo['pos'] = read_reference_start + indelinfo['strands'] = '++' + indelinfo['type'] = 'INDEL' + + # iterate through cigar and get indels + varlen = sum([ x[1] for x in cigar ]) + indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1,read_reference_start+sum([ x[1] for x in cigar ])) + indelinfo['alt'] = read.query_sequence[read_query_start+1:read_query_start+sum([x[1] if x[0] == 1 or x[0] == 0 else 0 for x in cigar])] + + # if the read is chimeric and has a supplementary alignment near the start/end, process the chimeric read + elif read.has_tag('SA') is True and (leftSoftClip > 0 or rightSoftClip > 0): + sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None] + + sCigarTuples = make_cigar_tuples(sCigar) + sPos = int(sPos) + sEnd = sPos + sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]) - 1 + sLeftSoftClip = sCigarTuples[0][1] if sCigarTuples[0][0] >= 4 else 0 + sRightSoftClip = sCigarTuples[-1][1] if sCigarTuples[-1][0] >= 4 else 0 + + pSeq = read.query_alignment_sequence + sSeq = read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])] if sStrand == read_strand else revcomp(read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])]) + + # correct for overlapping alignments + if len(pSeq)+len(sSeq) > len(read.query_sequence): + trimLen = len(pSeq)+len(sSeq) - len(read.query_sequence) + if sLeftSoftClip > sRightSoftClip: + sPos += trimLen + sSeq = sSeq[trimLen:] + + elif sRightSoftClip > sLeftSoftClip: + sEnd -= trimLen + sSeq = sSeq[:-trimLen] + + if (int(sMq) >= minSecMapQual and int(sNm) <= maxNM and int(sMq)>0 and + (sChr != read.reference_name or abs(int(sPos) - read_reference_start) >= svDistanceThreshold) or sStrand != read_strand or + (rightSoftClip > 0 and sEnd < read_reference_start) or (leftSoftClip > 0 and sPos > read_reference_end)): + + indelinfo['chrom'] = read.reference_name + indelinfo['chrom2'] = sChr + indelinfo['strands'] = '++'if sStrand == read_strand else '+-' + indelinfo['type'] = 'BND' + + if indelinfo['strands'] == '++': + # if --->...> ...>---> or <---<... <...<--- + if rightSoftClip > leftSoftClip and sLeftSoftClip > sRightSoftClip: + indelinfo['pos'] = read_reference_end + indelinfo['pos2'] = sPos + + # if ...>---> --->...> or <...<--- <---<... + elif leftSoftClip > rightSoftClip and sRightSoftClip > sLeftSoftClip: + indelinfo['pos'] = read_reference_start - 1 + indelinfo['pos2'] = sEnd + 1 + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}") + if strict: + exit(1) + else: + continue + + elif indelinfo['strands'] == '+-': + # if --->...> <---<... or <---<... --->...> + if rightSoftClip > leftSoftClip and sRightSoftClip > sLeftSoftClip: + indelinfo['pos'] = read_reference_end + indelinfo['pos2'] = sEnd + + # if ...>---> <...<--- or <...<--- ...>---> + elif leftSoftClip > rightSoftClip and sLeftSoftClip > sRightSoftClip: + indelinfo['pos'] = read_reference_start - 1 + indelinfo['pos2'] = sPos + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}") + if strict: + exit(1) + else: + continue + + else: + print(f"Error: no proper soft clip orientation found: {read.query_name}") + if strict: + exit(1) + else: + continue + + elif read_reference_start < end and read_reference_end > start: + indelinfo['chrom'] = chr + indelinfo['pos'] = start + indelinfo['ref'] = '.' + indelinfo['alt'] = '.' + indelinfo['type'] = 'REF' else: - datExp['filtindels'][k] = datExp['indels'].get(k) or 0 - if datExp['filtindels'][k] > 0: - expindels += 1 + continue - datCon['filtindels'][k] = datCon['indels'].get(k) or 0 - if datCon['filtindels'][k] > 0: - controlindels += 1 + elif read_reference_start < end and read_reference_end > start: + indelinfo['chrom'] = chr + indelinfo['pos'] = start + indelinfo['ref'] = '.' + indelinfo['alt'] = '.' + indelinfo['type'] = 'REF' + + else: + continue + + # add indel info to dataframe + readaln = pd.concat([readaln, pd.DataFrame([indelinfo])], ignore_index=True) + + # group by read and sort by cigar, then sv and get the first value in each group + readaln = readaln.sort_values(by=['read','chrom','pos','chrom2','pos2','strands','ref','alt','type'],key=lambda col: col != '',ascending=False).groupby('read').first().reset_index() + indelcounts = readaln.groupby(['chrom','pos','chrom2','pos2','strands','ref','alt','type'],dropna=False).size().reset_index(name='counts') + + # need to recast as int type, but allow for NA values + indelcounts['pos'] = indelcounts['pos'].astype(pd.Int64Dtype()) + indelcounts['pos2'] = indelcounts['pos2'].astype(pd.Int64Dtype()) + + # get counts of reads in the controlbam for each indel/BND + if len(indelcounts) > 0: + # add pam positions to the df + if pam_positions is not pd.NA: + indelcounts['Positions'] = [pam_positions] * len(indelcounts) + indelcounts['Distance'] = indelcounts.apply(lambda r: min([abs(r['pos'] - x) for x in r['Positions']] + [abs(r['pos'] + len(r['ref']) - 1 - x) for x in r['Positions']] ) if r['pos'] is not pd.NA else pd.NA,axis=1) + else: + indelcounts['Positions'] = pd.NA + indelcounts['Distance'] = pd.NA + + indelcounts[['control_alt_counts','control_total_counts']] = indelcounts.apply( + lambda row: calculate_normal_counts(row, controlbam, fasta, window=distance), axis=1 + ).apply(pd.Series) + else: + indelcounts['control_alt_counts'] = 0 + indelcounts['control_total_counts'] = 0 + indelcounts['Positions'] = pd.NA + indelcounts['Distance'] = pd.NA + + + # apply filters + indelcounts = indelcounts[(indelcounts['counts'] >= minreads) | (indelcounts['ref']=='.')] + indelcounts = indelcounts[(indelcounts['control_alt_counts'] <= maxcontrol) | (indelcounts['ref']=='.')] + indelcounts = indelcounts[(indelcounts['Distance'] <= distance) | (indelcounts['ref']=='.')] + return(indelcounts) + +# Conversion function for CSV to hotspot format +def convert_to_hotspot_format(input_csv, output_file): + with open(input_csv, 'r') as csv_file, open(output_file, 'w', newline='') as out_file: + reader = csv.DictReader(csv_file) + writer = csv.writer(out_file) + + for row in reader: + dna_sequence = row['DNA Sequence'] + pam = row['PAM'] + mismatch = row['Mismatch'] + bulge_type = row['Bulge Type'] + bulge_size = row['Bulge Size'] + chromosome = row['Chromosome'] + strand = row['Strand Direction'] + start = row['Start'] + + mismatch = mismatch if mismatch else "N/A" + bulge_type = bulge_type if bulge_type else "" + bulge_size = bulge_size if bulge_size else "" + chrom_pos = f"{chromosome}:{strand}{start}" + + writer.writerow([dna_sequence, pam, '', mismatch, '', chrom_pos]) + +# Convert CSV to hotspot format if it has the specified header +def convert_csv_if_needed(input_file): + with open(input_file, 'r') as file: + first_line = file.readline().strip() + expected_columns = "Source,DNA Sequence,PAM,Chromosome,Strand Direction,Start,Bulge Type,Mismatch,Bulge Size,On_target" + if first_line == expected_columns: + output_file = input_file + '.converted' + convert_to_hotspot_format(input_file, output_file) + return output_file + return input_file + +def main(): + + parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates') + parser.add_argument('-f','--fasta',type=str,default="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6/hg38_PLVM_CD19_CARv4_cd34.fa",help='Reference fasta file') + parser.add_argument('-w','--window',type=int,default=100,help='Distance between off-target sites for merging intervals.') + parser.add_argument('-d','--distance',type=int,default=25,help='Window size around off-target site to identify mutations') + parser.add_argument('-m','--minreads',type=int,default=1,help='Minimum supporting reads to report an indel/bnd event.') + parser.add_argument('-c','--maxcontrol',type=int,default=0,help='Maximum supporting reads to to report an indel/bnd event.') + parser.add_argument('-s','--strict',action='store_true',help='Exit if a read cannot be properly parsed.') + # add outfile argument -o or --outfile + parser.add_argument('-o','--outfile',type=str,help='Output file (optional)') + parser.add_argument('bed',help='Off-target coordinate file') + parser.add_argument('expbamfile',help='BAM file') + parser.add_argument('conbamfile',help='BAM file') + + args = parser.parse_args() + + # TODO: + # Nihdi: We need to update this code to take the file and return 2 objects: + # 1. Take the positions and make intervals that are chr,(pos-distup),(pos+distdown) and then merge them into new list of non-redunant intervals (might use pyranges for this) + # 2. The individual entries need to be retained, though, along with the information about the off-target (the exact position, the mismatches, etc) + + # open off-target file and create ranges to search + bedDf = pd.read_csv(args.bed) + bedDf['Pos'] = bedDf['Start'] + bedDf['End'] = bedDf['Start'] + bedDf['Start'] = bedDf['Start'] - 1 + bedDf['Info'] = bedDf.apply(lambda row: f"{row['Source']},{row['DNA Sequence']},{row['PAM']},{row['Chromosome']},{row['Pos']},{row['Strand Direction']},{row['Mismatch']},{row['Bulge Type']},{row['Bulge Size']}", axis=1) + bedDf['Ontarget'] = bedDf['On_target'] + + # make pyranges object + bedPr = pr.PyRanges(bedDf[['Chromosome','Start','End','Pos','Info','Ontarget']]) + + # cluster the intervals + bedPr = bedPr.cluster(slack=args.window) + mergedBedPr = bedPr.merge(by='Cluster',strand=False,slack=args.window) + mergedBedDf = mergedBedPr.df.join(bedPr.df.groupby('Cluster')['Pos'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left') + mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Info'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left') + mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Ontarget'].agg('first').reset_index().set_index('Cluster'),on='Cluster',how='left') + + # open bam file(s) + expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename=args.fasta) + consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename=args.fasta) + # open fasta file + refFasta = pysam.FastaFile(args.fasta) + + # print to outfile or stdout + if args.outfile: + sys.stdout = open(args.outfile, 'w') + + print("\t".join('chrom start end pam_positions total_reads indel_reads indel_fraction control_reads control_indel_reads control_indel_fraction indel_count indel_info bnd_count bnd_info target_info is_target'.split(' '))) + + # iterate over mergedBedPr intervals: + for index, row in mergedBedDf.iterrows(): + + indels = get_indels(bam=expsamfile,controlbam=consamfile,chr=row['Chromosome'],start=row['Start'],end=row['End'], + fasta=refFasta,window=args.window,distance=args.distance,pam_positions=row['Pos'],minreads=args.minreads,maxcontrol=args.maxcontrol) + + total_reads, indel_reads, control_total_reads, control_indel_reads = 0, 0, 0, 0 + indel_fraction, control_indel_fraction = 0, 0 + indel_keys, bnd_keys = '.', '.' + bnds = pd.DataFrame() + offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.' + ontarget = row['Ontarget'] + + + if len(indels) > 0: + # get total reads in this region + total_reads = sum(indels['counts']) + indel_reads = sum(indels[indels['type']!='REF']['counts']) + control_indel_reads = int(indels['control_alt_counts'].mean()) + control_total_reads = int(indels['control_total_counts'].mean()) + + # separate BNDs and indels + bnds = indels[indels['type']=='BND'].copy() + indels = indels[indels['type']=='INDEL'].copy() + + if len(indels) > 0: + indels['Key'] = indels.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['ref']}:{r['alt']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1) + + if len(bnds) > 0: + bnds['Key'] = bnds.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['chrom2']}:{r['pos2']}:{r['strands']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1) + + # print results to tab-delimited file + indel_fraction = round(indel_reads/total_reads,4) if total_reads > 0 else 0 + control_indel_fraction = round(control_indel_reads/control_total_reads,4) if control_total_reads > 0 else 0 + indel_keys = ';'.join(indels['Key'].tolist()) if len(indels) > 0 else '.' + bnd_keys = ';'.join(bnds['Key'].tolist()) if len(bnds) > 0 else '.' + ontarget = row['Ontarget'] + offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.' + print(f"{row['Chromosome']}\t{row['Start']}\t{row['End']}\t{';'.join([ str(x) for x in row['Pos'] ])}\t{total_reads}\t{indel_reads}\t{indel_fraction}\t{control_total_reads}\t{control_indel_reads}\t{control_indel_fraction}\t{len(indels)}\t{indel_keys}\t{len(bnds)}\t{bnd_keys}\t{offtargetsites}\t{ontarget}") - # assemble filtered data - vsfilt = [] - for k in datExp['filtindels'].keys(): - vsfilt.append(":".join(map(str,[k,datExp['filtindels'].get(k) or 0,datCon['indels'].get(k) or 0]))) - expSvs = ';'.join(datExp['sv'].keys()) or '.' - controlSvs = ';'.join(datCon['sv'].keys()) or '.' - rec = [rec[5],contig,pampos,strand,datExp['all'],expindels,sum(datExp['filtindels'].values()),datCon['all'],controlindels,sum(datCon['filtindels'].values()),";".join(map(str,vsfilt)) or '.', expSvs, controlSvs] - - print("\t".join(map(str,rec))) -f.close() -expsamfile.close() -consamfile.close() + # close stdout or outfile + if args.outfile: + sys.stdout.close() + + expsamfile.close() + consamfile.close() + refFasta.close() + + +if __name__ == "__main__": + main() diff --git a/bin/make_hotspot_file.py b/bin/make_hotspot_file.py index 33290b8..781e89a 100755 --- a/bin/make_hotspot_file.py +++ b/bin/make_hotspot_file.py @@ -42,10 +42,10 @@ def main(): parser.add_argument('--bed', type=checkfile, help="bed hotspot file") parser.add_argument('--csv', type=checkfile, help="csv hotspot file") parser.add_argument('--window', type=int, default=200, help="window") + parser.add_argument('--id', help="meta id") args = parser.parse_args() window = args.window - # assumes csv_file ranges don't overlap data_df = pd.DataFrame() if (args.bed): bed_file = os.path.realpath(args.bed) @@ -69,12 +69,13 @@ def main(): row_df['Chromosome'] = rows['Chromosome'] vcf_df = pd.concat([vcf_df, row_df], ignore_index=True) result_vcf = dataframe_to_vcf(vcf_df) - with open(f"hotspot.vcf", "w") as f: + id = args.id + with open(f"{id}.hotspot.vcf", "w") as f: f.write(result_vcf) if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python script.py --bed --csv --window [window]") + print("Usage: python script.py --bed --csv --window [window] --id ") sys.exit(1) else: main() diff --git a/modules/local/annotate_transgene.nf b/modules/local/annotate_transgene.nf new file mode 100644 index 0000000..a01fb9a --- /dev/null +++ b/modules/local/annotate_transgene.nf @@ -0,0 +1,25 @@ +process ANNOTATE_TRANSGENE_VARIANTS { + tag "$meta.id" + label 'process_low' + label 'final_output' + container "ghcr.io/dhslab/docker-vep:release_105" + + input: + tuple val(meta), path(files), path("${meta.id}.transgene_out.tsv") + + output: + tuple val(meta), path("${meta.id}.transgene.annotated.tsv"), emit: annotated_transgenes + path "versions.yml", emit: versions + + script: + """ + awk 'BEGIN {FS=OFS="\\t"} NR > 1 {sub(/^chr/, "", \$1); for (i = 1; i < NF; i++) {printf "%s%s", \$i, (i == NF-1 ? "\\n" : OFS)}}' ${meta.id}.transgene_out.tsv > ${meta.id}.edited.transgene.tsv && \\ + /opt/vep/src/ensembl-vep/vep --cache --dir /storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/VEP_cache --symbol --per_gene -i ${meta.id}.edited.transgene.tsv -o ${meta.id}.transgene.annotated.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vep: \$(/opt/vep/src/ensembl-vep/vep 2>&1 | grep ensembl-vep | cut -d ':' -f 2 | sed 's/\s*//g') + END_VERSIONS + """ + +} \ No newline at end of file diff --git a/modules/local/get_indels.nf b/modules/local/get_indels.nf index 61669c7..2e263fc 100644 --- a/modules/local/get_indels.nf +++ b/modules/local/get_indels.nf @@ -2,11 +2,11 @@ process GET_INDELS { tag "$meta.id" label 'process_low' label 'final_output' - container "ghcr.io/dhslab/docker-cleutils" + container "ghcr.io/dhslab/docker-baseimage:latest" + errorStrategy 'ignore' input: - tuple val(meta), path(files) - path(targetfile) + tuple val(meta), path(files), path(hotspot_file) output: tuple val(meta), path("${meta.id}.indels.txt") @@ -14,7 +14,7 @@ process GET_INDELS { script: """ - getIndelsFromBam.py ${targetfile} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt + extract_variant_reads.py ${hotspot_file} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/get_transgene_junctions.nf b/modules/local/get_transgene_junctions.nf index 57eb2a0..a8b7341 100644 --- a/modules/local/get_transgene_junctions.nf +++ b/modules/local/get_transgene_junctions.nf @@ -8,7 +8,7 @@ process GET_TRANSGENE_JUNCTIONS { tuple val(meta), path(files) output: - tuple val(meta), path("${meta.id}.transgene_out.tsv") + tuple val(meta), path("${meta.id}.transgene_out.tsv"), emit: transgene_file path "versions.yml" , emit: versions script: diff --git a/modules/local/make_hotspot_file.nf b/modules/local/make_hotspot_file.nf index 8ace3cb..93617ba 100644 --- a/modules/local/make_hotspot_file.nf +++ b/modules/local/make_hotspot_file.nf @@ -25,7 +25,7 @@ process MAKE_HOTSPOT_FILE { def bed = bed_file.name != 'NO_FILE.bed' ? "--bed $bed_file" : '' def csv = csv_file.name != 'NO_FILE.csv' ? "--csv $csv_file" : '' """ - make_hotspot_file.py $bed $csv --window $params.window + make_hotspot_file.py $bed $csv --window $params.window --id ${info[0].id} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 7fd057a..122c927 100644 --- a/nextflow.config +++ b/nextflow.config @@ -52,9 +52,9 @@ params { // Dragen inputs dragen_inputs { - dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/hg38_PLVM_CD19_CARv4_cd34" - snv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dragen_v1.0_systematic_noise.nextera_wgs.120920.bed.gz" - sv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/WGS_v2.0.0_hg38_sv_systematic_noise.bedpe.gz" + dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6" + snv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v.2.0.0_systematic_noise.snv.bed.gz" + sv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v3.0.0_systematic_noise.sv.bedpe.gz" dbsnp = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz" dbsnp_index = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz.tbi" pop_af_vcf = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" diff --git a/subworkflows/local/somatic_input_check.nf b/subworkflows/local/somatic_input_check.nf index 220d22a..1aa67fc 100644 --- a/subworkflows/local/somatic_input_check.nf +++ b/subworkflows/local/somatic_input_check.nf @@ -156,8 +156,26 @@ workflow SOMATIC_INPUT_CHECK { ch_input_data = ch_input_data.mix(ch_bam) + ch_mastersheet + .map { meta -> + if (meta.dragen_path != null){ + def new_meta = [:] + new_meta['id'] = meta.id + return [new_meta] + } + } + .set{ch_ids} + ch_input_data = ch_input_data.mix(ch_ids) + ch_hotspots = ch_mastersheet - .map{ row -> [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]} + .map{ row -> + if (row.uid != null) { + return [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"] + } + else { + return [row.id, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"] + } + } .unique() ch_input_data = ch_input_data @@ -175,7 +193,6 @@ workflow SOMATIC_INPUT_CHECK { } } .set { ch_dragen_outputs } - ch_dragen_outputs.dump(tag: 'dragen_output') emit: dragen_outputs = ch_dragen_outputs diff --git a/workflows/scge.nf b/workflows/scge.nf index a527dd0..b9fca6e 100644 --- a/workflows/scge.nf +++ b/workflows/scge.nf @@ -50,6 +50,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft include { MAKE_HOTSPOT_FILE } from '../modules/local/make_hotspot_file.nf' include { DRAGEN_SCGE } from '../modules/local/dragen_scge.nf' include { ANNOTATE_VARIANTS } from '../modules/local/annotate_variants.nf' +include { ANNOTATE_TRANSGENE_VARIANTS } from '../modules/local/annotate_transgene.nf' include { GET_INDELS } from '../modules/local/get_indels.nf' include { GET_TRANSGENE_JUNCTIONS } from '../modules/local/get_transgene_junctions.nf' include { REFORMAT_CNV_DATA } from '../modules/local/reformat_cnv_data.nf' @@ -103,36 +104,51 @@ workflow SCGE { SOMATIC_INPUT_CHECK(Channel.fromPath(mastersheet), data_path) ch_input_data = SOMATIC_INPUT_CHECK.out.input_data - hotspot_input = ch_input_data.combine(hotspot_bed) - MAKE_HOTSPOT_FILE(hotspot_input) - ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf - .map{ info, hotspot_vcf -> - def newinfo = [] - newinfo = info + [hotspot_vcf] - newinfo - } - - ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs) - ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs)) - ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs)) + ch_hotspots = ch_input_data.map{ meta, file -> return [meta[0]['id'], file] } if (params.run_dragen == true) { + + hotspot_input = ch_input_data.combine(hotspot_bed) + MAKE_HOTSPOT_FILE(hotspot_input) + ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf + .map{ info, hotspot_vcf -> + def newinfo = [] + newinfo = info + [hotspot_vcf] + newinfo + } + + ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs) + ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs)) + ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs)) + DRAGEN_SCGE (ch_input_data, ch_dragen_inputs) ch_versions = ch_versions.mix(DRAGEN_SCGE.out.versions) ch_dragen_outputs = ch_dragen_outputs.mix(DRAGEN_SCGE.out.dragen_output) + } else { + ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs) + ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs)) + ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs)) } if (params.run_analysis == true) { - - ANNOTATE_VARIANTS (ch_dragen_outputs) + + ANNOTATE_VARIANTS (ch_dragen_outputs, ch_assay_inputs, params.fasta) ch_versions = ch_versions.mix(ANNOTATE_VARIANTS.out.versions) - - GET_INDELS (ch_dragen_outputs) + + get_indels_input = ch_dragen_outputs + .map{ meta -> [meta[0]['id'], meta] } + .combine(ch_hotspots, by: 0) + .map{id, meta, hotspot_file -> [meta[0], meta[1], hotspot_file]} + GET_INDELS (get_indels_input) ch_versions = ch_versions.mix(GET_INDELS.out.versions) GET_TRANSGENE_JUNCTIONS (ch_dragen_outputs) ch_versions = ch_versions.mix(GET_TRANSGENE_JUNCTIONS.out.versions) + annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file) + ANNOTATE_TRANSGENE_VARIANTS (annotate_transgene_input) + ch_versions = ch_versions.mix(ANNOTATE_TRANSGENE_VARIANTS.out.versions) + REFORMAT_CNV_DATA (ch_dragen_outputs) ch_versions = ch_versions.mix(REFORMAT_CNV_DATA.out.versions) }