diff --git a/bin/extract_variant_reads.py b/bin/extract_variant_reads.py
new file mode 100755
index 0000000..3a77836
--- /dev/null
+++ b/bin/extract_variant_reads.py
@@ -0,0 +1,649 @@
+#!/usr/local/bin/python3
+
+from __future__ import division
+import pysam
+import biotite.sequence.align as align
+import biotite.sequence as seq
+import argparse
+import re
+import string
+import csv, sys
+import scipy.stats as stats
+import pandas as pd, pyranges as pr
+
+# reverse complement function
+def revcomp(seq):
+    tab = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd', 'TGCAtgcaYRKMyrkmBVDHbvdh') # maketrans <- maps the reverse complement
+    return seq.translate(tab)[::-1] # translate(x)[::-1] <- works backward through the string, effectively reversing the string
+
+# make cigar tuples from a cigar string
+def make_cigar_tuples(cigar):
+    cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8}
+    # use regex to parse cigar string into tuples
+    cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar)
+    cigar_tuples = [(cigar_dict[operation],int(length)) for length, operation in cigar_tuples]
+    return cigar_tuples
+
+def cigar_summary(cigar):
+    cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8}
+    # use regex to parse cigar string into tuples
+    cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar)
+    cigar_sum = { 'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0 }
+    for length, operation in cigar_tuples:
+        cigar_sum[operation] += int(length)
+
+    return cigar_sum
+
+def make_aligned_pairs(sup_align,refseq):
+    """
+    Converts a supplementary alignment string into a list of aligned pairs.
+
+    Args:
+    supplementary_alignment (str): Supplementary alignment string (e.g., 'chr12,107092390,-,110M41S,60,0').
+
+    Returns:
+    List of tuples: Aligned pairs (e.g., [(0, 168391768, 'C'), (1, 168391769, 'C'), ...]).
+    """
+    # Split the supplementary alignment string into components
+    ref_name, ref_start, strand, cigar, _, _ = supplementary_alignment.split(',')
+    
+    # Convert positions and CIGAR to appropriate types
+    ref_start = int(ref_start)
+    cigar_ops = parse_cigar(cigar)
+    
+    # Initialize variables
+    aligned_pairs = []
+    read_pos = 0
+    ref_pos = ref_start
+        
+    for op, length in cigar_ops:
+        if op == 'M':  # Match/mismatch
+            for i in range(length):
+                aligned_pairs.append((read_pos, ref_pos, refseq[ref_pos - ref_start]))
+                read_pos += 1
+                ref_pos += 1
+        elif op == 'I':  # Insertion
+            for i in range(length):
+                aligned_pairs.append((read_pos, None, refseq[ref_pos - ref_start]))
+                read_pos += 1
+        elif op == 'D':  # Deletion
+            for i in range(length):
+                aligned_pairs.append((None, ref_pos, refseq[ref_pos - ref_start]))
+                ref_pos += 1
+        elif op == 'S':  # Soft clipping
+            read_pos += length
+        # Skipping 'H' (hard clipping) and 'N' (skipped region) as they do not appear in the read
+
+    return aligned_pairs
+
+
+def indels_from_aligned_pairs(pairs,readseq):
+
+    # remove leading soft clips
+    while pairs[0][0] is None:
+        pairs = pairs[1:]
+
+    # remove training soft clips
+    while pairs[-1][0] is None:
+        pairs = pairs[:-1]
+
+    pairs = [ (x[0],x[1],x[2].upper(),None) for x in pairs ]
+    for i in range(len(pairs)):
+        if pairs[i][0] is not None:
+            pairs[i] = (pairs[i][0],readseq[pairs[i][0]],pairs[i][1],pairs[i][2])
+
+    variant_start_index = j-1
+    variant_end_index = j-1
+    j = 0
+    while j < len(pairs):
+        if pairs[j][0] is not None and pairs[j][2] is not None:
+            j+=1
+            continue
+
+        if pairs[j][0] is None or pairs[j][2] is None: # indel
+            variant_start_index = j-1
+            variant_end_index = len(pairs)
+            break
+
+        j+=1
+
+    # construct indel tuple
+    indel = (pairs[variant_start_index][2],''.join([ x[3] if x[3] else '' for x in pairs[variant_start_index:variant_end_index]]),''.join([ x[1] if x[1] else '' for x in pairs[variant_start_index:variant_end_index]]))
+    
+    return indel
+
+# function to count the number of reads that support an indel or BND event
+def add_normal_counts(df, reads, fasta, maxreads=0,handicap=5,window=25,flank=150):
+
+    # make ref and alt sequences for each indel/BND
+    df['refseq'] = ''
+    df['altseq'] = ''
+    for it, row in df.iterrows():
+        df.at[it,'refseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']+len(row['ref'])+flank)
+        if row['type'] in ['INDEL','DEL','INS']:
+            df.at[it,'altseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']-1) + row['alt'] + fasta.fetch(row['chrom'],row['pos']+len(row['ref'])-1,row['pos']+len(row['ref'])+flank)
+        elif row['type'] == 'BND':
+            df.at[it,'altseq'] = fasta.fetch(row['chrom'],row['pos']-1-flank,row['pos']-1) + (fasta.fetch(row['chrom2'],row['pos2']-1,row['pos2']+flank) if row['strands']=='++' else revcomp(fasta.fetch(row['chrom2'],row['pos2']-1-flank,row['pos2'])))
+
+    df['control_alt_counts'] = 0
+    df['control_total_counts'] = 0
+
+    total_reads = set()
+
+    # iterate through reads in bam file for the first position
+    for read in reads:
+        # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end
+        if read.is_mapped is False or \
+            read.is_duplicate is True or \
+            read.is_secondary is True or \
+            read.is_supplementary is True or \
+            read.mapping_quality == 0:
+            continue
+
+        total_reads.add(read.query_name)
+
+        # for speed: if the read has no indels, add it to the ref_count set
+        if len(read.cigartuples) == 1 and read.cigartuples[0][0] == 0 and read.cigartuples[0][1] == len(read.query_sequence):
+            continue
+        
+        for it, row in df.iterrows():
+
+            if read.cigartuples[0][0]==0 and read.cigartuples[-1][0]==0:
+                if (len(row['ref']) > len(row['alt']) and 'D' not in read.cigarstring) or (len(row['alt']) > len(row['ref']) and 'I' not in read.cigarstring):
+                    continue
+
+            if row['type'] == 'BND' and not read.has_tag('SA'):
+                continue
+
+            refseq = row['refseq']
+            altseq = row['altseq']
+
+            ref_align = align.align_optimal(seq.NucleotideSequence(row['refseq']),seq.NucleotideSequence(read.query_sequence),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0]
+        
+            alt_align = align.align_optimal(seq.NucleotideSequence(row['altseq']),seq.NucleotideSequence(read.query_sequence),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0]
+
+            if alt_align.score > 0 and alt_align.score - handicap > ref_align.score:
+                ref_align_cigar_sum = cigar_summary(align.write_alignment_to_cigar(ref_align))
+                alt_align_cigar_sum = cigar_summary(align.write_alignment_to_cigar(alt_align))
+
+                if ((len(row['ref'])-len(row['alt']) > 0 and ref_align_cigar_sum['D']==len(row['ref'])-len(row['alt'])) or 
+                (len(row['alt'])-len(row['ref']) > 0 and ref_align_cigar_sum['I']==len(row['alt'])-len(row['ref']))):
+                    df.at[it,'control_alt_counts'] += 1
+                    # print to stderr: found control alt count
+                    print(f"\tFound control alt count for {row['chrom']}:{row['pos']}:{row['ref']}:{row['alt']}:{read.query_name}", file=sys.stderr)
+
+    df['control_total_counts'] = len(total_reads)
+
+    return df.copy()
+
+
+# function to get indels for one amplicon from bam file
+def get_indels(bam,controlbam,chr,start,end,fasta,window=100,distance=25,pam_positions=pd.NA,minSecMapQual=10,maxNM=5,svDistanceThreshold=100000,saAlignmentTolerance=5,minSoftClipLength=5,deletionDistanceThreshold=1000,minreads=1,maxcontrol=0,strict=False,verbose=False):
+
+    # window == flanking sequence to add to read search
+    # distance == max distance from start/end to consider an indel, BND, or call a read wild-type.
+
+    cigarVarDict = {'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'}
+
+    readaln = pd.DataFrame(columns=['read','chrom','pos','chrom2','pos2','ref','alt','strands','type'])
+    
+    # if verbose, print to stderr: getting reads that align within window of start and end
+    if verbose:
+        print(f"\tGetting reads that align within window of {chr}:{start}-{end}", file=sys.stderr)
+
+    regionStart = max(start - svDistanceThreshold,1)
+    regionEnd = min(end + svDistanceThreshold,fasta.get_reference_length(chr))
+    regionSeq = fasta.fetch(chr,regionStart-1,regionEnd)
+
+    # get reads that align within window of start and end
+    for read in bam.fetch(chr, start-window, end+window, multiple_iterators = True):
+
+        # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end
+        if read.is_mapped is False or \
+            read.is_duplicate is True or \
+            read.is_secondary is True or \
+            read.is_supplementary is True or \
+            read.mapping_quality == 0:
+            continue
+
+        cigar = read.cigartuples # get cigar info
+        mate_cigar = make_cigar_tuples(read.get_tag('MC')) if read.has_tag('MC') else None
+
+        # Quality filter: Get mismatches from NM tag and subtract deleted/skipped bases from cigar
+        # and skip if too many mismatches
+        if read.has_tag('NM') and int(read.get_tag('NM')) > maxNM:
+            if (int(read.get_tag('NM')) - sum([x[1] for x in cigar if x[0] == 2 or x[0]==3])) > maxNM:
+                continue
+
+        read_strand = "+"
+        if read.is_reverse is True:
+            read_strand = "-"
+
+        leftSoftClip = 0
+        rightSoftClip = 0
+        read_reference_start = read.reference_start + 1
+        read_reference_end = read.reference_end
+        read_next_reference_start = read.next_reference_start + 1
+        read_next_reference_end = None if mate_cigar is None else read_next_reference_start + sum([x[1] for x in mate_cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
+        
+        read_query_start = 0
+        read_query_end = len(read.query_sequence)
+
+        # setup indelinfo dictionary
+        indelinfo = {'read':read.query_name,'chrom':'','pos':None,'chrom2':None,'pos2':None,'strands': '','ref':'','alt':'','type':''}
+
+        # Get left soft clip length
+        if cigar[0][0] == 4:
+            leftSoftClip = cigar[0][1]
+
+        # Get right soft clip length
+        if cigar[-1][0] == 4:
+            rightSoftClip = cigar[-1][1]
+
+        # 
+        # This adjusts alignment cigars for soft clips and supplementary alignments. 
+        # It extends the cigar if the softclips map nearby and therefore the read has a deletion.
+        #
+
+        # Process left soft clip, if present
+        if leftSoftClip > rightSoftClip:
+
+            # first check for local supplementary alignments 
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
+
+            if (sChr != None and int(sMq)>=minSecMapQual and
+                int(sNm)<maxNM and sChr == read.reference_name and
+                sStrand == read_strand and int(sPos) < read_reference_start and
+                abs(int(sPos) - read_reference_start) < svDistanceThreshold):
+                
+                sPos = int(sPos)
+                sCigarTuples = make_cigar_tuples(sCigar)
+
+                if sCigarTuples[-1][0] >= 4 and sCigarTuples[0][0] == 0 and sCigarTuples[0][1] == cigar[0][1]:
+                    cigar = sCigarTuples[:-1] + [(2,read_reference_start-sPos-1)] + cigar[1:]
+
+                else:
+                    # if there is a supplementary alignment, then realign the entire read to the reference sequence
+                    cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(regionSeq[sPos-regionStart:read_reference_end-regionStart+1]), #seq.NucleotideSequence(fasta.fetch(chr,sPos-1,read_reference_end)),
+                                                                                seq.NucleotideSequence(read.query_sequence),
+                                                                                matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),
+                                                                                gap_penalty=(-10,-1),local=False,max_number=1)[0]))
+                read_reference_start = sPos
+
+            else:
+                # if the left soft clip is long enough, see if it supports a deletion within window
+                # this performs an exact match and so isnt ideal, but should be good enough for now
+                if leftSoftClip >= minSoftClipLength:
+                    clippedSeq = read.query_sequence[:leftSoftClip]
+                    refSeq = regionSeq[read_reference_start - regionStart - window + 1:read_reference_start - regionStart +  1] #fasta.fetch(chr,read_reference_start - window,read_reference_start)
+                    # find the position of the left-most occurance of clippedSeq in refSeq
+                    leftClipPos = refSeq.rfind(clippedSeq)
+                    if leftClipPos != -1:
+                        delSeq = refSeq[leftClipPos + len(clippedSeq)-1:]
+                        cigar = [(0,len(clippedSeq))] + [(2,len(delSeq))] + cigar[1:]
+                        read_reference_start = read_reference_end - sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + 1
+
+        # Process right soft clip, if present
+        if rightSoftClip > leftSoftClip:
+
+            # first check for local supplementary alignments 
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
+
+            if (sChr != None):
+                sPos = int(sPos)
+                sCigarTuples = make_cigar_tuples(sCigar)
+
+                if (int(sMq)>=minSecMapQual and sCigarTuples[0][0] >= 4 and sCigarTuples[-1][0] == 0 and
+                    int(sNm)<maxNM and sChr == read.reference_name and 
+                    sStrand == read_strand and int(sPos) > read_reference_end and 
+                    int(sPos) - read_reference_end < svDistanceThreshold):
+                
+                    if sCigarTuples[0][0] >= 4 and sCigarTuples[-1][0] == 0 and sCigarTuples[-1][1] == cigar[-1][1]:
+                        cigar = cigar[:-1] + [(2,read_reference_end-sPos)] + sCigarTuples[1:]
+
+                    else:
+                        cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(regionSeq[read_reference_start-regionStart:sPos-regionStart+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3])]), #fasta.fetch(chr,read_reference_start-1,sPos-1+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]))),
+                                        seq.NucleotideSequence(read.query_sequence),
+                                        matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),
+                                        gap_penalty=(-10,-1),local=False,max_number=1)[0]))
+
+                read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
+
+            else:
+                # if the left soft clip is long enough, see if it supports a deletion within deletionDistanceThreshold
+                # this performs an exact match and so isnt ideal, but should be good enough for now
+                if rightSoftClip >= minSoftClipLength:
+                    clippedSeq = read.query_sequence[-rightSoftClip:]
+                    refSeq = regionSeq[read_reference_end-regionStart:read_reference_end-regionStart + window] #fasta.fetch(chr,read_reference_end,read_reference_end + window)
+                    # find the position of the left-most occurance of clippedSeq in refSeq
+                    rightClipPos = refSeq.find(clippedSeq)
+                    if rightClipPos != -1:
+                        delSeq = refSeq[0:rightClipPos]
+                        cigar = cigar[:-1] + [(2,len(delSeq))] + [(0,len(clippedSeq))]
+                        read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
+        
+        # Adjusted read has to be within distance of the start/end positions to consider
+        if read_reference_start - leftSoftClip >= end + distance or read_reference_end + rightSoftClip <= start - distance:
+            continue
+
+        # remove leading or trailing soft clips
+        if cigar[0][0] >= 4:
+            read_query_start += cigar[0][1]
+            del cigar[0]
+        
+        if cigar[-1][0] >= 4:
+            read_query_end -= cigar[-1][1]
+            del cigar[-1]
+
+        # there are multiple cigar operations bookened by matches, the process an indel                
+        if len(cigar) > 1:
+
+            # adjust alignment start for first aligned block and remove it
+            if cigar[0][0] == 0:
+                read_reference_start += cigar[0][1]# sets to position before event
+                read_query_start = read_query_start + cigar[0][1] 
+                del cigar[0]
+
+            # adjust alignment end for last aligned block and remove it
+            if cigar[-1][0] == 0:
+                read_reference_end = read_reference_end - cigar[-1][1]
+                read_query_end = read_query_end - cigar[-1][1]
+                del cigar[-1]
+
+            # If there's only one cigar operation left, it's an insertion or deletion
+            if len(cigar) == 1:
+
+                # theres one event, so record the coordinates
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['pos'] = read_reference_start
+                indelinfo['strands'] = '++'
+
+                # if insertion
+                if cigar[0][0] == 1:           
+                    indelinfo['ref'] = regionSeq[read_reference_start-regionStart-1:read_reference_start-regionStart] #fasta.fetch(chr,read_reference_start-1-1,read_reference_start-1) # 0 based and position before the isertion
+                    indelinfo['alt'] = read.query_sequence[read_query_start-1:read_query_start+cigar[0][1]]
+                    indelinfo['type'] = 'INDEL'
+
+                # if deletion
+                elif cigar[0][0] == 2 or cigar[0][0] == 3:
+                    indelinfo['ref'] = regionSeq[read_reference_start-regionStart:read_reference_start-regionStart + cigar[0][1] + 1] #fasta.fetch(chr,read_reference_start-1,read_reference_start+cigar[0][1])
+                    indelinfo['alt'] = indelinfo['ref'][0]
+                    indelinfo['type'] = 'INDEL'
+
+            # if its a complex indel
+            else:
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['pos'] = read_reference_start
+                indelinfo['strands'] = '++'
+                indelinfo['type'] = 'INDEL'
+
+                # iterate through cigar and get indels
+                varlen = sum([ x[1] for x in cigar ])
+                indelinfo['ref'] = regionSeq[read_reference_start-regionStart:read_reference_start-regionStart+sum([ x[1] for x in cigar ])+1] #fasta.fetch(chr,read_reference_start-1,read_reference_start+sum([ x[1] for x in cigar ]))
+                indelinfo['alt'] = read.query_sequence[read_query_start+1:read_query_start+sum([x[1] if x[0] == 1 or x[0] == 0 else 0 for x in cigar])]
+
+        # if the read is chimeric and has a supplementary alignment near the start/end, process the chimeric read
+        elif read.has_tag('SA') is True and (leftSoftClip > 0 or rightSoftClip > 0):
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
+
+            sCigarTuples = make_cigar_tuples(sCigar)
+            sPos = int(sPos)
+            sEnd = sPos + sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]) - 1
+            sLeftSoftClip = sCigarTuples[0][1] if sCigarTuples[0][0] >= 4 else 0
+            sRightSoftClip = sCigarTuples[-1][1] if sCigarTuples[-1][0] >= 4 else 0
+
+            pSeq = read.query_alignment_sequence
+            sSeq = read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])] if sStrand == read_strand else revcomp(read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])])
+
+            # correct for overlapping alignments
+            if len(pSeq)+len(sSeq) > len(read.query_sequence):
+                trimLen = len(pSeq)+len(sSeq) - len(read.query_sequence)
+                if sLeftSoftClip > sRightSoftClip:
+                    sPos += trimLen
+                    sSeq = sSeq[trimLen:]
+
+                elif sRightSoftClip > sLeftSoftClip:
+                    sEnd -= trimLen
+                    sSeq = sSeq[:-trimLen]
+
+            if (int(sMq) >= minSecMapQual and int(sNm) <= maxNM and int(sMq)>0 and
+                (sChr != read.reference_name or abs(int(sPos) - read_reference_start) >= svDistanceThreshold) or sStrand != read_strand or 
+                (rightSoftClip > 0 and sEnd < read_reference_start) or (leftSoftClip > 0 and sPos > read_reference_end)):
+
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['chrom2'] = sChr
+                indelinfo['strands'] = '++'if sStrand == read_strand else '+-'
+                indelinfo['type'] = 'BND'
+
+                if indelinfo['strands'] == '++':
+                    # if --->...> ...>---> or <---<... <...<---
+                    if rightSoftClip > leftSoftClip and sLeftSoftClip > sRightSoftClip:
+                        indelinfo['pos'] = read_reference_end
+                        indelinfo['pos2'] = sPos
+
+                    # if ...>---> --->...> or <...<--- <---<...
+                    elif leftSoftClip > rightSoftClip and sRightSoftClip > sLeftSoftClip:
+                        indelinfo['pos'] = read_reference_start - 1 
+                        indelinfo['pos2'] = sEnd + 1
+
+                    else:
+                        print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr)
+                        if strict:
+                            exit(1)
+                        else:
+                            continue
+
+                elif indelinfo['strands'] == '+-':
+                    # if --->...> <---<... or <---<... --->...>
+                    if rightSoftClip > leftSoftClip and sRightSoftClip > sLeftSoftClip:
+                        indelinfo['pos'] = read_reference_end
+                        indelinfo['pos2'] = sEnd
+
+                    # if ...>---> <...<--- or <...<--- ...>--->
+                    elif leftSoftClip > rightSoftClip and sLeftSoftClip > sRightSoftClip:
+                        indelinfo['pos'] = read_reference_start - 1
+                        indelinfo['pos2'] = sPos
+
+                    else:
+                        print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr)
+                        if strict:
+                            exit(1)
+                        else:
+                            continue
+                
+                else:
+                    print(f"Error: no proper soft clip orientation found: {read.query_name}: {str(read)}", file=sys.stderr)
+                    if strict:
+                        exit(1)
+                    else:
+                        continue
+            
+            elif read_reference_start < end and read_reference_end > start:
+                indelinfo['chrom'] = chr
+                indelinfo['pos'] = start
+                indelinfo['ref'] = '.'
+                indelinfo['alt'] = '.'
+                indelinfo['type'] = 'REF'
+
+            else:
+                continue
+                    
+        elif read_reference_start < end and read_reference_end > start:
+            indelinfo['chrom'] = chr
+            indelinfo['pos'] = start
+            indelinfo['ref'] = '.'
+            indelinfo['alt'] = '.'
+            indelinfo['type'] = 'REF'
+
+        else:
+           continue
+
+        # add indel info to dataframe
+        readaln = pd.concat([readaln, pd.DataFrame([indelinfo])], ignore_index=True)
+
+    # if verbose, sorting indels
+    if verbose:
+        print("\tSorting indels", file=sys.stderr)
+
+    # group by read and sort by cigar, then sv and get the first value in each group
+    readaln = readaln.sort_values(by=['read','chrom','pos','chrom2','pos2','strands','ref','alt','type'],key=lambda col: col != '',ascending=False).groupby('read').first().reset_index()
+    indelcounts = readaln.groupby(['chrom','pos','chrom2','pos2','strands','ref','alt','type'],dropna=False).size().reset_index(name='counts')
+
+    # need to recast as int type, but allow for NA values
+    indelcounts['pos'] = indelcounts['pos'].astype(pd.Int64Dtype())
+    indelcounts['pos2'] = indelcounts['pos2'].astype(pd.Int64Dtype())
+
+    # if verbose, get control counts
+    if verbose:
+        print("\tGetting control counts", file=sys.stderr)
+
+    # get counts of reads in the controlbam for each indel/BND
+    if len(indelcounts) > 0:
+        # add pam positions to the df
+        if pam_positions is not pd.NA:
+            indelcounts['Positions'] = [pam_positions] * len(indelcounts)
+            indelcounts['Distance'] = indelcounts.apply(lambda r: min([abs(r['pos'] - x) for x in r['Positions']] + [abs(r['pos'] + len(r['ref']) - 1 - x) for x in r['Positions']] ) if r['pos'] is not pd.NA else pd.NA,axis=1)
+        else:
+            indelcounts['Positions'] = pd.NA
+            indelcounts['Distance'] = pd.NA
+
+        indelcounts = add_normal_counts(indelcounts, [ x for x in controlbam.fetch(contig=chr,start=start-window,end=end) ], fasta, window=distance)
+
+    else:
+        indelcounts['control_alt_counts'] = 0
+        indelcounts['control_total_counts'] = 0
+        indelcounts['Positions'] = pd.NA
+        indelcounts['Distance'] = pd.NA
+
+
+    # apply filters
+    indelcounts = indelcounts[(indelcounts['counts'] >= minreads) | (indelcounts['ref']=='.')]
+    indelcounts = indelcounts[(indelcounts['control_alt_counts'] <= maxcontrol) | (indelcounts['ref']=='.')]
+    indelcounts = indelcounts[(indelcounts['Distance'] <= distance) | (indelcounts['ref']=='.')]
+    indelcounts = indelcounts[~indelcounts['alt'].str.contains('N')]
+
+    return(indelcounts)
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates')
+    parser.add_argument('-f','--fasta',type=str,default="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6/hg38_PLVM_CD19_CARv4_cd34.fa",help='Reference fasta file')
+    parser.add_argument('-w','--window',type=int,default=100,help='Distance between off-target sites for merging intervals.')
+    parser.add_argument('-d','--distance',type=int,default=25,help='Window size around off-target site to identify mutations')
+    parser.add_argument('-m','--minreads',type=int,default=1,help='Minimum supporting reads to report an indel/bnd event.')
+    parser.add_argument('-c','--chromosome',type=str,default=None,help='Chromosome to process.')
+    parser.add_argument('-x','--maxcontrol',type=int,default=0,help='Maximum supporting reads to to report an indel/bnd event.')
+    parser.add_argument('-s','--strict',action='store_true',help='Exit if a read cannot be properly parsed.')
+    # add verbosity argument
+    parser.add_argument('-v','--verbose',action='store_true',help='Print verbose output')
+
+    # add outfile argument -o or --outfile
+    parser.add_argument('-o','--outfile',type=str,help='Output file (optional)')
+    parser.add_argument('bed',help='Off-target coordinate file')
+    parser.add_argument('expbamfile',help='BAM file')
+    parser.add_argument('conbamfile',help='BAM file')
+
+    args = parser.parse_args()
+
+    # TODO: 
+    # Nihdi: We need to update this code to take the file and return 2 objects:
+    # 1. Take the positions and make intervals that are chr,(pos-distup),(pos+distdown) and then merge them into new list of non-redunant intervals (might use pyranges for this)
+    # 2. The individual entries need to be retained, though, along with the information about the off-target (the exact position, the mismatches, etc)
+
+    # verbose to stderr: processing input file
+    if args.verbose:
+        print("Processing input file", file=sys.stderr)
+
+    # open off-target file and create ranges to search
+    bedDf = pd.read_csv(args.bed)
+    bedDf['Pos'] = bedDf['Start']
+    bedDf['End'] = bedDf['Start']
+    bedDf['Start'] = bedDf['Start'] - 1
+    bedDf['Info'] = bedDf.apply(lambda row: f"{row['Source']},{row['DNA Sequence']},{row['PAM']},{row['Chromosome']},{row['Pos']},{row['Strand Direction']},{row['Mismatch']},{row['Bulge Type']},{row['Bulge Size']}", axis=1)
+    bedDf['Ontarget'] = bedDf['On_target']
+
+    # make pyranges object
+    bedPr = pr.PyRanges(bedDf[['Chromosome','Start','End','Pos','Info','Ontarget']])
+
+    # cluster the intervals
+    bedPr = bedPr.cluster(slack=args.window)
+    mergedBedPr = bedPr.merge(by='Cluster',strand=False,slack=args.window)
+    mergedBedDf = mergedBedPr.df.join(bedPr.df.groupby('Cluster')['Pos'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left')
+    mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Info'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left')
+    mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Ontarget'].agg('first').reset_index().set_index('Cluster'),on='Cluster',how='left')
+
+    if args.chromosome is not None:
+        print("Processing chromosome", args.chromosome, file=sys.stderr)
+        mergedBedDf = mergedBedDf[mergedBedDf['Chromosome'] == args.chromosome]
+
+    # verbose: say done
+    if args.verbose:
+        print("Done processing input file", file=sys.stderr)
+
+    # open bam file(s)
+    expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename=args.fasta)
+    consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename=args.fasta)
+    # open fasta file
+    refFasta = pysam.FastaFile(args.fasta)
+
+    # print to outfile or stdout
+    if args.outfile:
+        sys.stdout = open(args.outfile, 'w')
+
+    print("\t".join('chrom start end pam_positions total_reads indel_reads indel_fraction control_reads control_indel_reads control_indel_fraction indel_count indel_info bnd_count bnd_info target_info is_target'.split(' ')),flush=True)
+
+    # iterate over mergedBedPr intervals:
+    for index, row in mergedBedDf.iterrows():
+
+        # verbose to stderr: processing interval
+        if args.verbose:
+            print(f"Processing interval {row['Chromosome']}:{row['Start']}-{row['End']}", file=sys.stderr)
+
+        indels = get_indels(bam=expsamfile,controlbam=consamfile,chr=row['Chromosome'],start=row['Start'],end=row['End'],
+                            fasta=refFasta,window=args.window,distance=args.distance,pam_positions=row['Pos'],minreads=args.minreads,maxcontrol=args.maxcontrol,verbose=args.verbose)
+        
+        total_reads, indel_reads, control_total_reads, control_indel_reads = 0, 0, 0, 0
+        indel_fraction, control_indel_fraction = 0, 0
+        indel_keys, bnd_keys = '.', '.'
+        bnds = []
+
+        if len(indels) > 0:
+            # get total reads in this region
+            total_reads = sum(indels['counts'])
+            indel_reads = sum(indels[indels['type']!='REF']['counts'])
+            control_indel_reads = int(indels['control_alt_counts'].mean())
+            control_total_reads = int(indels['control_total_counts'].mean())
+
+            # separate BNDs and indels
+            bnds = indels[indels['type']=='BND'].copy()
+            indels = indels[indels['type']=='INDEL'].copy()
+
+            if len(indels) > 0:
+                indels['Key'] = indels.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['ref']}:{r['alt']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1)
+
+            if len(bnds) > 0:
+                bnds['Key'] = bnds.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['chrom2']}:{r['pos2']}:{r['strands']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1)
+    
+        # print results to tab-delimited file
+        indel_fraction = round(indel_reads/total_reads,4) if total_reads > 0 else 0
+        control_indel_fraction = round(control_indel_reads/control_total_reads,4) if control_total_reads > 0 else 0
+        indel_keys = ';'.join(indels['Key'].tolist()) if len(indels) > 0 else '.'
+        bnd_keys = ';'.join(bnds['Key'].tolist()) if len(bnds) > 0 else '.'
+        ontarget = row['Ontarget']
+        offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.'
+
+        print(f"{row['Chromosome']}\t{row['Start']}\t{row['End']}\t{';'.join([ str(x) for x in row['Pos'] ])}\t{total_reads}\t{indel_reads}\t{indel_fraction}\t{control_total_reads}\t{control_indel_reads}\t{control_indel_fraction}\t{len(indels)}\t{indel_keys}\t{len(bnds)}\t{bnd_keys}\t{offtargetsites}\t{ontarget}",flush=True)
+
+
+    # close stdout or outfile
+    if args.outfile:
+        sys.stdout.close()
+
+    expsamfile.close()
+    consamfile.close()
+    refFasta.close()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/bin/getIndelsFromBam.py b/bin/getIndelsFromBam.py
index c159a7d..3d147e7 100755
--- a/bin/getIndelsFromBam.py
+++ b/bin/getIndelsFromBam.py
@@ -2,289 +2,556 @@
 
 from __future__ import division
 import pysam
+import biotite.sequence.align as align
+import biotite.sequence as seq
 import argparse
 import re
 import string
-import csv
+import csv, sys
 import scipy.stats as stats
+import pandas as pd, pyranges as pr
 
 # reverse complement function
 def revcomp(seq):
     tab = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd', 'TGCAtgcaYRKMyrkmBVDHbvdh') # maketrans <- maps the reverse complement
     return seq.translate(tab)[::-1] # translate(x)[::-1] <- works backward through the string, effectively reversing the string
 
+# make cigar tuples from a cigar string
+def make_cigar_tuples(cigar):
+    cigar_dict = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8}
+    # use regex to parse cigar string into tuples
+    cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar)
+    cigar_tuples = [(cigar_dict[operation],int(length)) for length, operation in cigar_tuples]
+    return cigar_tuples
+
+# function to count the number of reads that support an indel or BND event
+def calculate_normal_counts(row, bam, fasta, window=25):
+
+    # subfunctoin to count read hits to a sequence
+    def count_read_hits(bam,fasta,chrom,pos,chrom2=None,pos2=None,ref=None,alt=None,orientation=None,handicap=5,window=25,flank=150):
+
+        # note, the handicap is used to adjust the score of the alignment to account for the difference in length between the ref and alt sequences
+        # if the normal reads really support the indel, it should have a score that reflects the difference in length between the ref and alt sequences
+        refseq = ''
+        altseq = ''
+
+        # make ref and alt sequences, first if an indel, next if a BND.
+        # note that if its a BND then the refseq is a concatenation of the two sequences from either end
+        if ref and alt:
+            refseq = fasta.fetch(chrom,pos-1-flank,pos+len(ref)+flank)
+            altseq = fasta.fetch(chrom,pos-1-flank,pos-1) + alt + fasta.fetch(chrom,pos+len(ref)-1,pos+len(ref)+flank)
+            #handicap = abs(len(ref) - len(alt)) if handicap == 0 else handicap
+
+        elif not chrom2 is pd.NA and not pos2 is pd.NA and not orientation is None:
+            handicap = 50 if handicap == 0 else handicap
+            refseq = fasta.fetch(chrom,pos-1-flank,pos+flank)
+            altseq = fasta.fetch(chrom,pos-1-flank,pos-1)
+            if orientation == '++':
+                altseq += fasta.fetch(chrom2,pos2-1,pos2+flank)
+            elif orientation == '+-':
+                altseq += revcomp(fasta.fetch(chrom2,pos2-1-flank,pos2))
+
+        # make ref_count and alt_count counters
+        ref_count = set()
+        alt_count = set()
+
+        # iterate through reads in bam file for the first position
+        for read in bam.fetch(chrom,pos-1-window,pos+window,multiple_iterators=True):
+            if not read.is_mapped:
+                continue
+
+            # if the read has no indels, add it to the ref_count set
+            if len(read.cigartuples) == 1 and read.cigartuples[0][0] == 0 and read.cigartuples[0][1] == len(read.query_sequence):
+                ref_count.add(read.query_name)
+                continue
+
+            ref_align = align.align_optimal(seq.NucleotideSequence(read.query_sequence),seq.NucleotideSequence(refseq),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0].score
+
+            alt_align = align.align_optimal(seq.NucleotideSequence(read.query_sequence),seq.NucleotideSequence(altseq),matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),gap_penalty=(-10,-1),local=True,terminal_penalty=False,max_number=1)[0].score
+
+            if alt_align > 0 and alt_align - handicap > ref_align:
+                alt_count.add(read.query_name)
+            else:
+                ref_count.add(read.query_name)
+
+        return (len(alt_count),len(alt_count)+len(ref_count))
+
+    if row['type'] == 'BND':
+        return count_read_hits(
+            bam, fasta, window=window,
+            chrom=row['chrom'], pos=row['pos'],
+            chrom2=row['chrom2'], pos2=row['pos2'],
+            orientation=row['strands']
+        )
+    elif row['type'] == 'INDEL':
+        return count_read_hits(
+            bam, fasta, window=window,
+            chrom=row['chrom'], pos=row['pos'],
+            ref=row['ref'], alt=row['alt']
+        )
+    else:
+        #print(f"{row['chrom']}\t{row['pos']}\t{row['type']}")
+        control_reads = [ x.query_name for x in bam.fetch(row['chrom'],row['pos']-1-window,row['pos']+window,multiple_iterators=True) ]
+        control_reads = set(control_reads)
+        return (0,len(control_reads))
+
+
 # function to get indels for one amplicon from bam file
-def get_indels(bam,chr,start,end,pampos,strand,srchSeq,srchSeqMM):
+def get_indels(bam,controlbam,chr,start,end,fasta,window=100,distance=25,pam_positions=pd.NA,minSecMapQual=10,maxNM=5,svDistanceThreshold=100000,saAlignmentTolerance=5,minSoftClipLength=5,deletionDistanceThreshold=1000,minreads=1,maxcontrol=0,strict=False):
+
+    # window == flanking sequence to add to read search
+    # distance == max distance from start/end to consider an indel, BND, or call a read wild-type.
 
-    dat = {'all':0,'indels':{},'sv':{},'seq':0}
+    cigarVarDict = {'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'}
+
+    readaln = pd.DataFrame(columns=['read','chrom','pos','chrom2','pos2','ref','alt','strands','type'])
     
-    for read in bam.fetch(chr, start, end, multiple_iterators = True):
+    # get reads that align within window of start and end
+    for read in bam.fetch(chr, start-window, end+window, multiple_iterators = True):
 
         # skip if not primary alignment or a duplicate or alignment doesnt overlap start,end
         if read.is_mapped is False or \
             read.is_duplicate is True or \
             read.is_secondary is True or \
             read.is_supplementary is True or \
-            read.reference_start > end or \
-            read.reference_end < start:
-            continue
-
-        if read.mapping_quality == 0:
+            read.mapping_quality == 0:
             continue
 
-        if read.has_tag('NM') is True and int(read.get_tag('NM'))>5:
-            continue
+        cigar = read.cigartuples # get cigar info
+        mate_cigar = make_cigar_tuples(read.get_tag('MC')) if read.has_tag('MC') else None
 
-        # check for a chimeric read
-        if read.is_proper_pair is False and read.mate_is_mapped is True:
-            mate_contig = read.next_reference_name
-            mate_start = read.next_reference_start
-            if mate_contig != read.reference_name or abs(mate_start - read.reference_start) > 1000:
-                dat['sv'][':'.join([read.reference_name,str(read.reference_start),read.next_reference_name,str(read.next_reference_start)])] = 1
-            continue
+        # Quality filter: Get mismatches from NM tag and subtract deleted/skipped bases from cigar
+        # and skip if too many mismatches
+        if read.has_tag('NM') and int(read.get_tag('NM')) > maxNM:
+            if (int(read.get_tag('NM')) - sum([x[1] for x in cigar if x[0] == 2 or x[0]==3])) > maxNM:
+                continue
 
-        if read.has_tag('SA') is True:
-            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(',')
-            if sChr != read.reference_name and int(sMq)>0:
-                dat['sv'][':'.join([read.reference_name,str(read.reference_start),sChr,str(sPos)])] = 1
-            continue
-
-        # check that read start and end are within w bp of the amplicon target start and end (to avoid spurious alignments--skip if this is not true) 
-        cigar = read.cigartuples # get cigar info
+        read_strand = "+"
+        if read.is_reverse is True:
+            read_strand = "-"
 
         leftSoftClip = 0
         rightSoftClip = 0
+        read_reference_start = read.reference_start + 1
+        read_reference_end = read.reference_end
+        read_next_reference_start = read.next_reference_start + 1
+        read_next_reference_end = None if mate_cigar is None else read_next_reference_start + sum([x[1] for x in mate_cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
+        
+        read_query_start = 0
+        read_query_end = len(read.query_sequence)
+
+        # setup indelinfo dictionary
+        indelinfo = {'read':read.query_name,'chrom':'','pos':None,'chrom2':None,'pos2':None,'strands': '','ref':'','alt':'','type':''}
+
+        # Get left soft clip length
         if cigar[0][0] == 4:
             leftSoftClip = cigar[0][1]
 
+        # Get right soft clip length
         if cigar[-1][0] == 4:
             rightSoftClip = cigar[-1][1]
 
-        # skip if read is not aligned across the pampos (incl. soft clips)
-#        if read.is_proper_pair is True and (read.reference_start - leftSoftClip > pampos or read.reference_end + rightSoftClip < pampos):
-#            continue
+        # 
+        # This adjusts alignment cigars for soft clips and supplementary alignments. 
+        # It extends the cigar if the softclips map nearby and therefore the read has a deletion.
+        #
 
-        # skip if read doesnt fully span the pampos
-        if read.reference_start >= pampos or read.reference_end <= pampos:
-            continue
+        # Process left soft clip, if present
+        if leftSoftClip > 0:
 
-        # we cant resolve these so will skip for now.
-        if leftSoftClip > 0 or rightSoftClip > 0:
-            continue
+            # first check for local supplementary alignments 
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
 
-        # count all reads covering this bed record position
-        dat['all'] += 1
+            if (sChr != None and int(sMq)>=minSecMapQual and 
+                int(sNm)<maxNM and sChr == read.reference_name and 
+                sStrand == read_strand and int(sPos) < read_reference_start and 
+                abs(int(sPos) - read_reference_start) < svDistanceThreshold):
 
-        # remove start and end cigar operations that are N/H/S/X (ie, not M, D, or I).
-        # These are soft or hard clips and not aligned portions of the read
-        if cigar[0][0] > 2:
-            del cigar[0]
-            
-        if cigar[-1][0] > 2:
-            del cigar[-1]                    
-                
-        if len(cigar) == 1 and cigar[0][0] == 0: # no indels, so continue
-            continue                       
-
-        # get reference position start for the alignment
-        # then skip beginning and trailing aligned blocks that are aligned
-        # to the reference
-        rpos = read.reference_start
-
-        var = [] # operation: [I]nsertion, [D]eletion, or [C]omplex
-        varlen = [] # length of indel
-        vartype = [] # variant type
-        varst = [] # ref pos of variant start 
-        varen = [] # ref pos of variant end
-        deltabases = 0 # sum of bases inserted/deleted
-
-        for cg in cigar: # rebuild cigar and keep track
-            s = rpos
-            e = rpos
-            if cg[0] == 0 or cg[0] == 2 or cg[0] == 3:
-                e = rpos + cg[1]                
-            elif cg[0] == 1:
-                e = rpos + 1
-                
-            # do not include the cigar operation if it does not overlap start,end...
-            if e < start or s > end:
-                rpos = e
-                continue
+                sPos = int(sPos)
 
-            # ...or its a matching segment that goes beyond start,end since these dont matter            
-            elif cg[0] == 0 and (s < start or e > end):
-                rpos = e
-                continue
-            
-            elif cg[0] == 0: # aligned block
-                var.append(str(cg[1]) + "M")
-                varlen.append(cg[1])
-                vartype.append("M")
-                varst.append(s)
-                varen.append(e)
-                
-            elif cg[0] == 1: # an insertion
-                var.append(str(cg[1]) + "I")
-                vartype.append("I")
-                varlen.append(cg[1])
-                varst.append(s)
-                varen.append(e)
-                deltabases += cg[1]
-               
-            elif cg[0] == 2 or cg[0] == 3: # a deletion or reference skip
-                var.append(str(cg[1]) + "D")
-                vartype.append("D")
-                varlen.append(cg[1])
-                varst.append(s)
-                varen.append(e)
-                deltabases += cg[1]
-                
-            else: # account for other operations
-                var.append(str(cg[1]) + "X")
-                vartype.append("C")
-                varlen.append(cg[1])
-                varst.append(s)
-                varen.append(e)
-                deltabases += cg[1]
-                
-            rpos = e # increment reference position
+                # if there is a supplementary alignment, then realign the entire read to the reference sequence
+                cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(fasta.fetch(chr,sPos-1,read_reference_end)),
+                                                                            seq.NucleotideSequence(read.query_sequence),
+                                                                            matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),
+                                                                            gap_penalty=(-10,-1),local=False,max_number=1)[0]))
+                read_reference_start = sPos
 
-        # do not continue if there are no indels
-        if len(var) == 0:
-            continue
-            
-        # combine multiple indels into a complex one
-        if vartype[0] == 'M':
-            var = var[1:]
-            varlen = varlen[1:]
-            vartype = vartype[1:]
-            varst = varst[1:]
-            varen = varen[1:]
-
-        if vartype[-1] == 'M':
-            var = var[:-1]
-            varlen = varlen[:-1]
-            vartype = vartype[:-1]
-            varst = varst[:-1]
-            varen = varen[:-1]
-        
-        if len(var) > 1:
-            var = ["".join(var)]
-            varlen = [sum(varlen)]
-            vartype = ["C"]
-            varst = [min(varst)]
-            varen = [max(varen)]
-            
-        i = 0
-        # get position relative to PAM, strand
-        relpos = None
-        if (vartype[i] == "D" or vartype[i] == "C") and varst[i] <= pampos and varen[i] >= pampos:
-            relpos = 0
-                
-        elif strand == "+" and varst[i] >= pampos: # fwd downstream
-            relpos = varst[i] - pampos
+            else:
+                # if the left soft clip is long enough, see if it supports a deletion within window
+                # this performs an exact match and so isnt ideal, but should be good enough for now
+                if leftSoftClip >= minSoftClipLength:
+                    clippedSeq = read.query_sequence[:leftSoftClip]
+                    refSeq = fasta.fetch(chr,read_reference_start - window,read_reference_start)
+                    # find the position of the left-most occurance of clippedSeq in refSeq
+                    leftClipPos = refSeq.rfind(clippedSeq)
+                    if leftClipPos != -1:
+                        delSeq = refSeq[leftClipPos + len(clippedSeq):]
+                        cigar = [(0,len(clippedSeq))] + [(2,len(delSeq))] + cigar[1:]
+                        read_reference_start = read_reference_end - sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3]) + 1
+
+        # Process right soft clip, if present
+        if rightSoftClip > 0:
+
+            # first check for local supplementary alignments 
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
+
+            if (sChr != None and int(sMq)>=minSecMapQual and 
+                int(sNm)<maxNM and sChr == read.reference_name and 
+                sStrand == read_strand and int(sPos) > read_reference_start and 
+                int(sPos) - read_reference_start < svDistanceThreshold):
                 
-        elif strand == "+" and varen[i] <= pampos: # fwd upstream
-            relpos = varen[i] - pampos
-    
-        elif strand == "-" and varen[i] <= pampos: # rev downstream
-            relpos = pampos - varen[i]
-    
-        elif strand == "-" and varst[i] >= pampos: # rev upstream
-            relpos = pampos - varst[i] 
-
-        # key is reference_position:cigar:vartype:len:relpos
-        key = str(varst[i]) + ":" + var[i] + ":" + vartype[i] + ":" + str(varlen[i]) + ":" + str(relpos) 
-            
-        if key not in dat['indels'].keys():
-            dat['indels'][key] = 0
-
-        dat['indels'][key] += 1 # count the reads with this variant        
-
-    return(dat)
-
-
-srchSeq = 'TTAATTGAGTTGTCATATGTTAATAACGGT' # this is the dsODN sequence
-srchSeqMM = 2 # mismatches tolerated when looking for the search sequence
-maxInControl = 10
-
-parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates')
-parser.add_argument('bed',help='Amplicon bed file')
-parser.add_argument('expbamfile',help='BAM file')
-parser.add_argument('conbamfile',help='BAM file')
-
-args = parser.parse_args()
-
-bed = args.bed
-
-# open bam file(s)
-expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh/hg38_PLVM_CD19_CARv4_cd34.fa")
-consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh/hg38_PLVM_CD19_CARv4_cd34.fa")
-
-distup = 20
-distdown = 20
-
-print("\t".join('id chrom pos strand edited_reads edited_indels edited_unique_indels control_reads control_indels control_unique_indels indel_info exp_svs con_svs'.split(' ')))
-#rec = [contig,pampos,strand,datExp['all'],expindels,sum(datExp['filtindels'].values()),datExp['seq'],datCon['all'],controlindels,sum(datCon['filtindels'].values()),datCon['seq'],";".join(map(str,vsfilt)) or '.']
-with open(bed) as f:
-    for line in f:
-        rec = line.strip().split(',')
+                sPos = int(sPos)
+                sCigarTuples = make_cigar_tuples(sCigar)
 
-        p = re.compile("^(\S+)\:([+-])(\d+)")
-        m = p.search(rec[5])
-        contig = m[1]
-        pampos = int(m[3])
-        strand = m[2]
-        if strand == '+':
-            pampos = pampos + 20
+                cigar = make_cigar_tuples(align.write_alignment_to_cigar(align.align_optimal(seq.NucleotideSequence(fasta.fetch(chr,read_reference_start-1,sPos-1+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]))),seq.NucleotideSequence(read.query_sequence),
+                                matrix=align.SubstitutionMatrix.std_nucleotide_matrix(),
+                                gap_penalty=(-10,-1),local=False,max_number=1)[0]))
 
-        st = pampos - distup
-        en = pampos + distdown
-        if strand == "-":
-            st = pampos - distdown
-            en = pampos + distup
+                read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
 
-        datExp = get_indels(expsamfile,contig,st,en,pampos,strand,srchSeq,srchSeqMM)
-        datCon = get_indels(consamfile,contig,st,en,pampos,strand,srchSeq,srchSeqMM)
-
-        datExp['filtindels'] = {}
-        datExp['filtseq'] = {}
-        datCon['filtindels'] = {}
-        datCon['filtseq'] = {}
-
-        expindels = 0
-        controlindels = 0
+            else:
+                # if the left soft clip is long enough, see if it supports a deletion within deletionDistanceThreshold
+                # this performs an exact match and so isnt ideal, but should be good enough for now
+                if rightSoftClip >= minSoftClipLength:
+                    clippedSeq = read.query_sequence[-rightSoftClip:]
+                    refSeq = fasta.fetch(chr,read_reference_end,read_reference_end + window)
+                    # find the position of the left-most occurance of clippedSeq in refSeq
+                    rightClipPos = refSeq.find(clippedSeq)
+                    if rightClipPos != -1:
+                        delSeq = refSeq[0:rightClipPos]
+                        cigar = cigar[:-1] + [(2,len(delSeq))] + [(0,len(clippedSeq))]
+                        read_reference_end = read_reference_start + sum([x[1] for x in cigar if x[0] == 0 or x[0] == 2 or x[0] == 3])
         
-        for k in set(list(datExp['indels'].keys()) + list(datCon['indels'].keys())): # iterate through all exp and con indels
+        # Adjusted read has to be within distance of the start/end positions to consider
+        if read_reference_start - leftSoftClip >= end + distance or read_reference_end + rightSoftClip <= start - distance:
+            continue
 
-            if (datCon['indels'].get(k) or 0) > maxInControl: # do not report indels with >maxInControl reads--these are inherited or artifacts. delete from dicts
-                if datExp['indels'].get(k):
-                    del datExp['indels'][k]
-                    
-                if datCon['indels'].get(k):
-                    del datCon['indels'][k]
+        # remove leading or trailing soft clips
+        if cigar[0][0] >= 4:
+            read_query_start += cigar[0][1]
+            del cigar[0]
+        
+        if cigar[-1][0] >= 4:
+            read_query_end -= cigar[-1][1]
+            del cigar[-1]
+
+        # there are multiple cigar operations bookened by matches, the process an indel                
+        if len(cigar) > 1:
+
+            # adjust alignment start for first aligned block and remove it
+            if cigar[0][0] == 0:
+                read_reference_start += cigar[0][1]# sets to position before event
+                read_query_start = read_query_start + cigar[0][1] 
+                del cigar[0]
+
+            # adjust alignment end for last aligned block and remove it
+            if cigar[-1][0] == 0:
+                read_reference_end = read_reference_end - cigar[-1][1]
+                read_query_end = read_query_end - cigar[-1][1]
+                del cigar[-1]
+
+            # If there's only one cigar operation left, it's an insertion or deletion
+            if len(cigar) == 1:
+
+                # theres one event, so record the coordinates
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['pos'] = read_reference_start
+                indelinfo['strands'] = '++'
+
+                # if insertion
+                if cigar[0][0] == 1:           
+                    indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1-1,read_reference_start-1) # 0 based and position before the isertion
+                    indelinfo['alt'] = read.query_sequence[read_query_start-1:read_query_start+cigar[0][1]]
+                    indelinfo['type'] = 'INDEL'
+
+                # if deletion
+                elif cigar[0][0] == 2 or cigar[0][0] == 3:
+                    indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1,read_reference_start+cigar[0][1])
+                    indelinfo['alt'] = indelinfo['ref'][0]
+                    indelinfo['type'] = 'INDEL'
+
+            # if its a complex indel
+            else:
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['pos'] = read_reference_start
+                indelinfo['strands'] = '++'
+                indelinfo['type'] = 'INDEL'
+
+                # iterate through cigar and get indels
+                varlen = sum([ x[1] for x in cigar ])
+                indelinfo['ref'] = fasta.fetch(chr,read_reference_start-1,read_reference_start+sum([ x[1] for x in cigar ]))
+                indelinfo['alt'] = read.query_sequence[read_query_start+1:read_query_start+sum([x[1] if x[0] == 1 or x[0] == 0 else 0 for x in cigar])]
+
+        # if the read is chimeric and has a supplementary alignment near the start/end, process the chimeric read
+        elif read.has_tag('SA') is True and (leftSoftClip > 0 or rightSoftClip > 0):
+            sChr, sPos, sStrand, sCigar, sMq, sNm = read.get_tag('SA').split(';')[0].split(',') if read.has_tag('SA') else [None,None,None,None,None,None]
+
+            sCigarTuples = make_cigar_tuples(sCigar)
+            sPos = int(sPos)
+            sEnd = sPos + sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 2 or x[0] == 3]) - 1
+            sLeftSoftClip = sCigarTuples[0][1] if sCigarTuples[0][0] >= 4 else 0
+            sRightSoftClip = sCigarTuples[-1][1] if sCigarTuples[-1][0] >= 4 else 0
+
+            pSeq = read.query_alignment_sequence
+            sSeq = read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])] if sStrand == read_strand else revcomp(read.query_sequence[sLeftSoftClip:sLeftSoftClip+sum([x[1] for x in sCigarTuples if x[0] == 0 or x[0] == 1])])
+
+            # correct for overlapping alignments
+            if len(pSeq)+len(sSeq) > len(read.query_sequence):
+                trimLen = len(pSeq)+len(sSeq) - len(read.query_sequence)
+                if sLeftSoftClip > sRightSoftClip:
+                    sPos += trimLen
+                    sSeq = sSeq[trimLen:]
+
+                elif sRightSoftClip > sLeftSoftClip:
+                    sEnd -= trimLen
+                    sSeq = sSeq[:-trimLen]
+
+            if (int(sMq) >= minSecMapQual and int(sNm) <= maxNM and int(sMq)>0 and
+                (sChr != read.reference_name or abs(int(sPos) - read_reference_start) >= svDistanceThreshold) or sStrand != read_strand or 
+                (rightSoftClip > 0 and sEnd < read_reference_start) or (leftSoftClip > 0 and sPos > read_reference_end)):
+
+                indelinfo['chrom'] = read.reference_name
+                indelinfo['chrom2'] = sChr
+                indelinfo['strands'] = '++'if sStrand == read_strand else '+-'
+                indelinfo['type'] = 'BND'
+
+                if indelinfo['strands'] == '++':
+                    # if --->...> ...>---> or <---<... <...<---
+                    if rightSoftClip > leftSoftClip and sLeftSoftClip > sRightSoftClip:
+                        indelinfo['pos'] = read_reference_end
+                        indelinfo['pos2'] = sPos
+
+                    # if ...>---> --->...> or <...<--- <---<...
+                    elif leftSoftClip > rightSoftClip and sRightSoftClip > sLeftSoftClip:
+                        indelinfo['pos'] = read_reference_start - 1 
+                        indelinfo['pos2'] = sEnd + 1
+
+                    else:
+                        print(f"Error: no proper soft clip orientation found: {read.query_name}")
+                        if strict:
+                            exit(1)
+                        else:
+                            continue
+
+                elif indelinfo['strands'] == '+-':
+                    # if --->...> <---<... or <---<... --->...>
+                    if rightSoftClip > leftSoftClip and sRightSoftClip > sLeftSoftClip:
+                        indelinfo['pos'] = read_reference_end
+                        indelinfo['pos2'] = sEnd
+
+                    # if ...>---> <...<--- or <...<--- ...>--->
+                    elif leftSoftClip > rightSoftClip and sLeftSoftClip > sRightSoftClip:
+                        indelinfo['pos'] = read_reference_start - 1
+                        indelinfo['pos2'] = sPos
+
+                    else:
+                        print(f"Error: no proper soft clip orientation found: {read.query_name}")
+                        if strict:
+                            exit(1)
+                        else:
+                            continue
+                
+                else:
+                    print(f"Error: no proper soft clip orientation found: {read.query_name}")
+                    if strict:
+                        exit(1)
+                    else:
+                        continue
+            
+            elif read_reference_start < end and read_reference_end > start:
+                indelinfo['chrom'] = chr
+                indelinfo['pos'] = start
+                indelinfo['ref'] = '.'
+                indelinfo['alt'] = '.'
+                indelinfo['type'] = 'REF'
 
             else:
-                datExp['filtindels'][k] = datExp['indels'].get(k) or 0
-                if datExp['filtindels'][k] > 0:
-                    expindels += 1
+                continue
                     
-                datCon['filtindels'][k] = datCon['indels'].get(k) or 0
-                if datCon['filtindels'][k] > 0:
-                    controlindels += 1
+        elif read_reference_start < end and read_reference_end > start:
+            indelinfo['chrom'] = chr
+            indelinfo['pos'] = start
+            indelinfo['ref'] = '.'
+            indelinfo['alt'] = '.'
+            indelinfo['type'] = 'REF'
+
+        else:
+           continue
+
+        # add indel info to dataframe
+        readaln = pd.concat([readaln, pd.DataFrame([indelinfo])], ignore_index=True)
+
+    # group by read and sort by cigar, then sv and get the first value in each group
+    readaln = readaln.sort_values(by=['read','chrom','pos','chrom2','pos2','strands','ref','alt','type'],key=lambda col: col != '',ascending=False).groupby('read').first().reset_index()
+    indelcounts = readaln.groupby(['chrom','pos','chrom2','pos2','strands','ref','alt','type'],dropna=False).size().reset_index(name='counts')
+
+    # need to recast as int type, but allow for NA values
+    indelcounts['pos'] = indelcounts['pos'].astype(pd.Int64Dtype())
+    indelcounts['pos2'] = indelcounts['pos2'].astype(pd.Int64Dtype())
+
+    # get counts of reads in the controlbam for each indel/BND
+    if len(indelcounts) > 0:
+        # add pam positions to the df
+        if pam_positions is not pd.NA:
+            indelcounts['Positions'] = [pam_positions] * len(indelcounts)
+            indelcounts['Distance'] = indelcounts.apply(lambda r: min([abs(r['pos'] - x) for x in r['Positions']] + [abs(r['pos'] + len(r['ref']) - 1 - x) for x in r['Positions']] ) if r['pos'] is not pd.NA else pd.NA,axis=1)
+        else:
+            indelcounts['Positions'] = pd.NA
+            indelcounts['Distance'] = pd.NA
+
+        indelcounts[['control_alt_counts','control_total_counts']] = indelcounts.apply(
+            lambda row: calculate_normal_counts(row, controlbam, fasta, window=distance), axis=1
+        ).apply(pd.Series)
+    else:
+        indelcounts['control_alt_counts'] = 0
+        indelcounts['control_total_counts'] = 0
+        indelcounts['Positions'] = pd.NA
+        indelcounts['Distance'] = pd.NA
+
+
+    # apply filters
+    indelcounts = indelcounts[(indelcounts['counts'] >= minreads) | (indelcounts['ref']=='.')]
+    indelcounts = indelcounts[(indelcounts['control_alt_counts'] <= maxcontrol) | (indelcounts['ref']=='.')]
+    indelcounts = indelcounts[(indelcounts['Distance'] <= distance) | (indelcounts['ref']=='.')]
+    return(indelcounts)
+
+# Conversion function for CSV to hotspot format
+def convert_to_hotspot_format(input_csv, output_file):
+    with open(input_csv, 'r') as csv_file, open(output_file, 'w', newline='') as out_file:
+        reader = csv.DictReader(csv_file)
+        writer = csv.writer(out_file)
+        
+        for row in reader:
+            dna_sequence = row['DNA Sequence']
+            pam = row['PAM']
+            mismatch = row['Mismatch']
+            bulge_type = row['Bulge Type']
+            bulge_size = row['Bulge Size']
+            chromosome = row['Chromosome']
+            strand = row['Strand Direction']
+            start = row['Start']
+
+            mismatch = mismatch if mismatch else "N/A"
+            bulge_type = bulge_type if bulge_type else ""
+            bulge_size = bulge_size if bulge_size else ""
+            chrom_pos = f"{chromosome}:{strand}{start}"
+
+            writer.writerow([dna_sequence, pam, '', mismatch, '', chrom_pos])
+
+# Convert CSV to hotspot format if it has the specified header
+def convert_csv_if_needed(input_file):
+    with open(input_file, 'r') as file:
+        first_line = file.readline().strip()
+        expected_columns = "Source,DNA Sequence,PAM,Chromosome,Strand Direction,Start,Bulge Type,Mismatch,Bulge Size,On_target"
+        if first_line == expected_columns:
+            output_file = input_file + '.converted'
+            convert_to_hotspot_format(input_file, output_file)
+            return output_file
+    return input_file
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Find indels in a bam file at BED coordinates')
+    parser.add_argument('-f','--fasta',type=str,default="/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6/hg38_PLVM_CD19_CARv4_cd34.fa",help='Reference fasta file')
+    parser.add_argument('-w','--window',type=int,default=100,help='Distance between off-target sites for merging intervals.')
+    parser.add_argument('-d','--distance',type=int,default=25,help='Window size around off-target site to identify mutations')
+    parser.add_argument('-m','--minreads',type=int,default=1,help='Minimum supporting reads to report an indel/bnd event.')
+    parser.add_argument('-c','--maxcontrol',type=int,default=0,help='Maximum supporting reads to to report an indel/bnd event.')
+    parser.add_argument('-s','--strict',action='store_true',help='Exit if a read cannot be properly parsed.')
+    # add outfile argument -o or --outfile
+    parser.add_argument('-o','--outfile',type=str,help='Output file (optional)')
+    parser.add_argument('bed',help='Off-target coordinate file')
+    parser.add_argument('expbamfile',help='BAM file')
+    parser.add_argument('conbamfile',help='BAM file')
+
+    args = parser.parse_args()
+
+    # TODO: 
+    # Nihdi: We need to update this code to take the file and return 2 objects:
+    # 1. Take the positions and make intervals that are chr,(pos-distup),(pos+distdown) and then merge them into new list of non-redunant intervals (might use pyranges for this)
+    # 2. The individual entries need to be retained, though, along with the information about the off-target (the exact position, the mismatches, etc)
+
+    # open off-target file and create ranges to search
+    bedDf = pd.read_csv(args.bed)
+    bedDf['Pos'] = bedDf['Start']
+    bedDf['End'] = bedDf['Start']
+    bedDf['Start'] = bedDf['Start'] - 1
+    bedDf['Info'] = bedDf.apply(lambda row: f"{row['Source']},{row['DNA Sequence']},{row['PAM']},{row['Chromosome']},{row['Pos']},{row['Strand Direction']},{row['Mismatch']},{row['Bulge Type']},{row['Bulge Size']}", axis=1)
+    bedDf['Ontarget'] = bedDf['On_target']
+
+    # make pyranges object
+    bedPr = pr.PyRanges(bedDf[['Chromosome','Start','End','Pos','Info','Ontarget']])
+
+    # cluster the intervals
+    bedPr = bedPr.cluster(slack=args.window)
+    mergedBedPr = bedPr.merge(by='Cluster',strand=False,slack=args.window)
+    mergedBedDf = mergedBedPr.df.join(bedPr.df.groupby('Cluster')['Pos'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left')
+    mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Info'].agg(list).reset_index().set_index('Cluster'),on='Cluster',how='left')
+    mergedBedDf = mergedBedDf.join(bedPr.df.groupby('Cluster')['Ontarget'].agg('first').reset_index().set_index('Cluster'),on='Cluster',how='left')
+
+    # open bam file(s)
+    expsamfile = pysam.AlignmentFile(args.expbamfile,"rc",reference_filename=args.fasta)
+    consamfile = pysam.AlignmentFile(args.conbamfile,"rc",reference_filename=args.fasta)
+    # open fasta file
+    refFasta = pysam.FastaFile(args.fasta)
+
+    # print to outfile or stdout
+    if args.outfile:
+        sys.stdout = open(args.outfile, 'w')
+
+    print("\t".join('chrom start end pam_positions total_reads indel_reads indel_fraction control_reads control_indel_reads control_indel_fraction indel_count indel_info bnd_count bnd_info target_info is_target'.split(' ')))
+
+    # iterate over mergedBedPr intervals:
+    for index, row in mergedBedDf.iterrows():
+
+        indels = get_indels(bam=expsamfile,controlbam=consamfile,chr=row['Chromosome'],start=row['Start'],end=row['End'],
+                            fasta=refFasta,window=args.window,distance=args.distance,pam_positions=row['Pos'],minreads=args.minreads,maxcontrol=args.maxcontrol)
+        
+        total_reads, indel_reads, control_total_reads, control_indel_reads = 0, 0, 0, 0
+        indel_fraction, control_indel_fraction = 0, 0
+        indel_keys, bnd_keys = '.', '.'
+        bnds = pd.DataFrame()
+        offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.'
+        ontarget = row['Ontarget']
+
+
+        if len(indels) > 0:
+            # get total reads in this region
+            total_reads = sum(indels['counts'])
+            indel_reads = sum(indels[indels['type']!='REF']['counts'])
+            control_indel_reads = int(indels['control_alt_counts'].mean())
+            control_total_reads = int(indels['control_total_counts'].mean())
+
+            # separate BNDs and indels
+            bnds = indels[indels['type']=='BND'].copy()
+            indels = indels[indels['type']=='INDEL'].copy()
+
+            if len(indels) > 0:
+                indels['Key'] = indels.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['ref']}:{r['alt']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1)
+
+            if len(bnds) > 0:
+                bnds['Key'] = bnds.apply(lambda r: f"{r['chrom']}:{r['pos']}:{r['chrom2']}:{r['pos2']}:{r['strands']}:{r['counts']}:{r['control_alt_counts']}:{r['Distance']}", axis=1)
+    
+            # print results to tab-delimited file
+            indel_fraction = round(indel_reads/total_reads,4) if total_reads > 0 else 0
+            control_indel_fraction = round(control_indel_reads/control_total_reads,4) if control_total_reads > 0 else 0
+            indel_keys = ';'.join(indels['Key'].tolist()) if len(indels) > 0 else '.'
+            bnd_keys = ';'.join(bnds['Key'].tolist()) if len(bnds) > 0 else '.'
+            ontarget = row['Ontarget']
+            offtargetsites = ';'.join(row['Info']) if len(row['Info']) > 0 else '.'
 
+        print(f"{row['Chromosome']}\t{row['Start']}\t{row['End']}\t{';'.join([ str(x) for x in row['Pos'] ])}\t{total_reads}\t{indel_reads}\t{indel_fraction}\t{control_total_reads}\t{control_indel_reads}\t{control_indel_fraction}\t{len(indels)}\t{indel_keys}\t{len(bnds)}\t{bnd_keys}\t{offtargetsites}\t{ontarget}")
 
-        # assemble filtered data
-        vsfilt = []
-        for k in datExp['filtindels'].keys():
-            vsfilt.append(":".join(map(str,[k,datExp['filtindels'].get(k) or 0,datCon['indels'].get(k) or 0])))
 
-        expSvs = ';'.join(datExp['sv'].keys()) or '.'
-        controlSvs = ';'.join(datCon['sv'].keys()) or '.'
-        rec = [rec[5],contig,pampos,strand,datExp['all'],expindels,sum(datExp['filtindels'].values()),datCon['all'],controlindels,sum(datCon['filtindels'].values()),";".join(map(str,vsfilt)) or '.', expSvs, controlSvs]
-        
-        print("\t".join(map(str,rec)))
 
-f.close()
-expsamfile.close()
-consamfile.close()
+    # close stdout or outfile
+    if args.outfile:
+        sys.stdout.close()
+
+    expsamfile.close()
+    consamfile.close()
+    refFasta.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/make_hotspot_file.py b/bin/make_hotspot_file.py
index 33290b8..781e89a 100755
--- a/bin/make_hotspot_file.py
+++ b/bin/make_hotspot_file.py
@@ -42,10 +42,10 @@ def main():
     parser.add_argument('--bed', type=checkfile, help="bed hotspot file")
     parser.add_argument('--csv', type=checkfile, help="csv hotspot file")
     parser.add_argument('--window', type=int, default=200, help="window")
+    parser.add_argument('--id', help="meta id")
     args = parser.parse_args()
 
     window = args.window 
-    # assumes csv_file ranges don't overlap
     data_df = pd.DataFrame()
     if (args.bed):
         bed_file = os.path.realpath(args.bed)
@@ -69,12 +69,13 @@ def main():
         row_df['Chromosome'] = rows['Chromosome']
         vcf_df = pd.concat([vcf_df, row_df], ignore_index=True)
     result_vcf = dataframe_to_vcf(vcf_df)
-    with open(f"hotspot.vcf", "w") as f:
+    id = args.id
+    with open(f"{id}.hotspot.vcf", "w") as f:
         f.write(result_vcf)
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
-        print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window]")
+        print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window] --id <id>")
         sys.exit(1)
     else:
         main()
diff --git a/modules/local/annotate_transgene.nf b/modules/local/annotate_transgene.nf
new file mode 100644
index 0000000..a01fb9a
--- /dev/null
+++ b/modules/local/annotate_transgene.nf
@@ -0,0 +1,25 @@
+process ANNOTATE_TRANSGENE_VARIANTS {
+    tag "$meta.id"
+    label 'process_low'
+    label 'final_output'
+    container "ghcr.io/dhslab/docker-vep:release_105"
+
+    input:
+    tuple val(meta), path(files), path("${meta.id}.transgene_out.tsv")
+
+    output:
+    tuple val(meta), path("${meta.id}.transgene.annotated.tsv"), emit: annotated_transgenes
+    path "versions.yml",    emit: versions
+
+    script:
+    """
+    awk 'BEGIN {FS=OFS="\\t"} NR > 1 {sub(/^chr/, "", \$1); for (i = 1; i < NF; i++) {printf "%s%s", \$i, (i == NF-1 ? "\\n" : OFS)}}' ${meta.id}.transgene_out.tsv > ${meta.id}.edited.transgene.tsv && \\
+    /opt/vep/src/ensembl-vep/vep --cache --dir /storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/VEP_cache --symbol --per_gene -i ${meta.id}.edited.transgene.tsv -o ${meta.id}.transgene.annotated.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        vep: \$(/opt/vep/src/ensembl-vep/vep 2>&1 | grep ensembl-vep | cut -d ':' -f 2 | sed 's/\s*//g')
+    END_VERSIONS
+    """
+
+}
\ No newline at end of file
diff --git a/modules/local/get_indels.nf b/modules/local/get_indels.nf
index 61669c7..2e263fc 100644
--- a/modules/local/get_indels.nf
+++ b/modules/local/get_indels.nf
@@ -2,11 +2,11 @@ process GET_INDELS {
     tag "$meta.id"
     label 'process_low'
     label 'final_output'
-    container "ghcr.io/dhslab/docker-cleutils"
+    container "ghcr.io/dhslab/docker-baseimage:latest"
+    errorStrategy 'ignore'
 
     input:
-    tuple val(meta), path(files)
-    path(targetfile)
+    tuple val(meta), path(files), path(hotspot_file)
 
     output:
     tuple val(meta), path("${meta.id}.indels.txt")
@@ -14,7 +14,7 @@ process GET_INDELS {
 
     script:
     """
-    getIndelsFromBam.py ${targetfile} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
+    extract_variant_reads.py ${hotspot_file} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/get_transgene_junctions.nf b/modules/local/get_transgene_junctions.nf
index 57eb2a0..a8b7341 100644
--- a/modules/local/get_transgene_junctions.nf
+++ b/modules/local/get_transgene_junctions.nf
@@ -8,7 +8,7 @@ process GET_TRANSGENE_JUNCTIONS {
     tuple val(meta), path(files)
 
     output:
-    tuple val(meta), path("${meta.id}.transgene_out.tsv")
+    tuple val(meta), path("${meta.id}.transgene_out.tsv"), emit: transgene_file
     path "versions.yml"                                     , emit: versions
 
     script:
diff --git a/modules/local/make_hotspot_file.nf b/modules/local/make_hotspot_file.nf
index 8ace3cb..93617ba 100644
--- a/modules/local/make_hotspot_file.nf
+++ b/modules/local/make_hotspot_file.nf
@@ -25,7 +25,7 @@ process MAKE_HOTSPOT_FILE {
         def bed = bed_file.name != 'NO_FILE.bed' ? "--bed $bed_file" : ''
         def csv = csv_file.name != 'NO_FILE.csv' ? "--csv $csv_file" : ''
         """
-        make_hotspot_file.py $bed $csv --window $params.window
+        make_hotspot_file.py $bed $csv --window $params.window --id ${info[0].id}
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index 7fd057a..122c927 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -52,9 +52,9 @@ params {
     
     // Dragen inputs
     dragen_inputs {
-        dragen_hash                 = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/hg38_PLVM_CD19_CARv4_cd34"
-        snv_noisefile               = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dragen_v1.0_systematic_noise.nextera_wgs.120920.bed.gz"
-        sv_noisefile                = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/WGS_v2.0.0_hg38_sv_systematic_noise.bedpe.gz"
+        dragen_hash                 = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6"
+        snv_noisefile               = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v.2.0.0_systematic_noise.snv.bed.gz"
+        sv_noisefile                = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v3.0.0_systematic_noise.sv.bedpe.gz"
         dbsnp                       = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz"
         dbsnp_index                 = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz.tbi"
         pop_af_vcf                  = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
diff --git a/subworkflows/local/somatic_input_check.nf b/subworkflows/local/somatic_input_check.nf
index 220d22a..1aa67fc 100644
--- a/subworkflows/local/somatic_input_check.nf
+++ b/subworkflows/local/somatic_input_check.nf
@@ -156,8 +156,26 @@ workflow SOMATIC_INPUT_CHECK {
 
     ch_input_data = ch_input_data.mix(ch_bam)
 
+    ch_mastersheet
+    .map { meta -> 
+        if (meta.dragen_path != null){
+            def new_meta = [:]
+            new_meta['id'] = meta.id
+            return [new_meta]
+        }
+    }
+    .set{ch_ids}
+    ch_input_data = ch_input_data.mix(ch_ids)
+
     ch_hotspots = ch_mastersheet
-        .map{ row -> [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]}
+        .map{ row -> 
+        if (row.uid != null) {
+            return [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
+            }
+        else {
+            return [row.id, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
+            }
+        }
         .unique()
 
     ch_input_data = ch_input_data
@@ -175,7 +193,6 @@ workflow SOMATIC_INPUT_CHECK {
         }
     }
     .set { ch_dragen_outputs }
-    ch_dragen_outputs.dump(tag: 'dragen_output')
 
     emit:
     dragen_outputs = ch_dragen_outputs
diff --git a/workflows/scge.nf b/workflows/scge.nf
index a527dd0..b9fca6e 100644
--- a/workflows/scge.nf
+++ b/workflows/scge.nf
@@ -50,6 +50,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft
 include { MAKE_HOTSPOT_FILE           } from '../modules/local/make_hotspot_file.nf'
 include { DRAGEN_SCGE                 } from '../modules/local/dragen_scge.nf'
 include { ANNOTATE_VARIANTS           } from '../modules/local/annotate_variants.nf'
+include { ANNOTATE_TRANSGENE_VARIANTS } from '../modules/local/annotate_transgene.nf'
 include { GET_INDELS                  } from '../modules/local/get_indels.nf'
 include { GET_TRANSGENE_JUNCTIONS     } from '../modules/local/get_transgene_junctions.nf'
 include { REFORMAT_CNV_DATA           } from '../modules/local/reformat_cnv_data.nf'
@@ -103,36 +104,51 @@ workflow SCGE {
     SOMATIC_INPUT_CHECK(Channel.fromPath(mastersheet), data_path)
 
     ch_input_data = SOMATIC_INPUT_CHECK.out.input_data
-    hotspot_input = ch_input_data.combine(hotspot_bed)
-    MAKE_HOTSPOT_FILE(hotspot_input)
-    ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
-        .map{ info, hotspot_vcf ->
-        def newinfo = []
-        newinfo = info + [hotspot_vcf]
-        newinfo
-        }
-
-    ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
-    ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
-    ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
+    ch_hotspots = ch_input_data.map{ meta, file -> return [meta[0]['id'], file] }
 
     if (params.run_dragen == true) {
+
+        hotspot_input = ch_input_data.combine(hotspot_bed)
+        MAKE_HOTSPOT_FILE(hotspot_input)
+        ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
+            .map{ info, hotspot_vcf ->
+            def newinfo = []
+            newinfo = info + [hotspot_vcf]
+            newinfo
+            }
+
+        ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
+        ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
+        ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
+
         DRAGEN_SCGE (ch_input_data, ch_dragen_inputs)
         ch_versions = ch_versions.mix(DRAGEN_SCGE.out.versions)
         ch_dragen_outputs = ch_dragen_outputs.mix(DRAGEN_SCGE.out.dragen_output)
+    } else {
+        ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
+        ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
+        ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
     }
     
     if (params.run_analysis == true) {
-
-        ANNOTATE_VARIANTS (ch_dragen_outputs)
+        
+        ANNOTATE_VARIANTS (ch_dragen_outputs, ch_assay_inputs, params.fasta)
         ch_versions = ch_versions.mix(ANNOTATE_VARIANTS.out.versions)
-
-        GET_INDELS (ch_dragen_outputs)
+        
+        get_indels_input = ch_dragen_outputs
+            .map{ meta -> [meta[0]['id'], meta] }
+            .combine(ch_hotspots, by: 0)
+            .map{id, meta, hotspot_file -> [meta[0], meta[1], hotspot_file]}
+        GET_INDELS (get_indels_input)
         ch_versions = ch_versions.mix(GET_INDELS.out.versions)
 
         GET_TRANSGENE_JUNCTIONS (ch_dragen_outputs)
         ch_versions = ch_versions.mix(GET_TRANSGENE_JUNCTIONS.out.versions)
 
+        annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file)
+        ANNOTATE_TRANSGENE_VARIANTS (annotate_transgene_input)
+        ch_versions = ch_versions.mix(ANNOTATE_TRANSGENE_VARIANTS.out.versions)
+
         REFORMAT_CNV_DATA (ch_dragen_outputs)
         ch_versions = ch_versions.mix(REFORMAT_CNV_DATA.out.versions)
     }