find ns mutations in bootstrap interval, limit to annotated genes in …

…sv overlap, misc updates
quinlan-lab · Oct 25, 2023 · dedf127 · dedf127
1 parent dead435
commit dedf127
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 27 deletions.
diff --git a/scripts/calculate_callable_kmer_content.py b/scripts/calculate_callable_kmer_content.py
@@ -7,7 +7,6 @@
 from pyfaidx import Fasta
 import numpy as np
 import pandas as pd
-import tqdm
 
 def make_interval_tree(
     path: str,
@@ -52,7 +51,7 @@ def main(args):
 
     with open(args.coverage, "r") as f:
         csvf = csv.reader(f, delimiter='\t')
-        for l in tqdm.tqdm(csvf):
+        for l in csvf:
             # skip header line
             if l[0] == "#chrom": continue
             # make sure we only look at autosomes

diff --git a/scripts/find_nonsynonymous.py b/scripts/find_nonsynonymous.py
@@ -13,13 +13,13 @@
 print (vcf.raw_header.rstrip(), file=outfh)
 
 for v in vcf:
-    if v.POS < 111_270_000 - 5e6: continue
-    if v.POS > 111_270_000 + 5e6: break
+    if v.POS < 95_000_000: continue
+    if v.POS > 114_100_000: break
     if len(v.ALT) > 1: continue
     annotations = v.INFO.get("ANN").split(';')
     rel_ann = [a for a in annotations if a.split('|')[7] == "protein_coding"]
-    #if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]):
-    #    continue
+    if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]):
+      continue
 
     gts = v.gt_types
     gqs = v.gt_quals
@@ -60,7 +60,7 @@
                     v.start,
                     v.end,
                     v.REF,
-                    v.ALT,
+                    v.ALT[0],
                     gts[smp2idx[C57]],
                     gts[smp2idx[DBA]],
                     ac,

diff --git a/scripts/find_sv_overlap.py b/scripts/find_sv_overlap.py
@@ -7,10 +7,10 @@ def main(args):
 
     refseq = pd.read_csv(args.refseq, sep="\t")
 
-    start = 111_270_000
-    buff = 10_000_000
+    start = 95_000_000
+    end = 116_100_000
 
-    chrom, start, end = "chr6", start - buff, start + buff
+    chrom = "chr6"
 
     refseq = refseq[(refseq["chrom"] == chrom) & (refseq["txStart"] > start)
                     & (refseq["txStart"] < end)]
@@ -21,7 +21,7 @@ def main(args):
 
     for i, row in refseq.iterrows():
         # limit to curated refseq 
-        #if not row["name"].startswith("NM"): continue
+        if not (row["name"].startswith("NM") or row["name"].startswith("NR")): continue
         exon_starts, exon_ends = row["exonStarts"].split(','), row["exonEnds"].split(',')
         tx_start, tx_end = int(row["txStart"]), int(row["txEnd"])
         tx_start -= 10_000
@@ -43,11 +43,11 @@ def main(args):
         v_start, v_end = v.start, v.end
         if v.INFO.get("SVTYPE") == "DEL":
             v_end -= v.INFO.get("SVLEN")
+        if abs(v_end - v_start) < 100: continue
         exon_overlaps = exons.find(v_start, v_end)
         tx_overlaps = full_tx.find(v_start, v_end)
         overlapping_genes = list(set([o["gene"] for o in tx_overlaps]))
 
-        if len(tx_overlaps) == 0 and len(exon_overlaps) == 0: continue
         is_in_exon = False
         if len(exon_overlaps) > 0: is_in_exon = True
         vals = {

diff --git a/scripts/map_bxd_qtl.R b/scripts/map_bxd_qtl.R
@@ -75,9 +75,9 @@ peaks <- find_peaks(out, bxd$pmap, threshold = lod_cutoff)
 # plot LOD scores genome-wide
 png(opt$output_file, width = 7, height = 3.5, units = "in", res = 300)
 par(mar = c(4.1, 4.1, 1.6, 1.1))
-color <- c("cornflowerblue")
-plot(out, pmap, lodcolumn = 1, col = color[1], ylim = c(0, 6), main = mutation_type)
-legend("topright", lwd = 1, col = "cornflowerblue", "LOD scores", bg = "gray90", lty = c(1, 1, 2))
-abline(h = lod_cutoff, col = "black", lwd = 2, lty = 2)
+color <- c("black")
+plot(out, pmap, lodcolumn = 1, col = color[1], bgcolor = "white", altbgcolor = "white", ylim = c(0, 6), main = mutation_type)
+legend("topright", lwd = 1, col = "black", "LOD scores", bg = "white", lty = c(1, 1, 2))
+abline(h = lod_cutoff, col = "darkgrey", lwd = 2, lty = 2)
 
 dev.off()
diff --git a/scripts/plot_bxd_spectra_vs_age.py b/scripts/plot_bxd_spectra_vs_age.py
@@ -20,18 +20,15 @@ def main(args):
 
     f, ax = plt.subplots(figsize=(8, 6))
 
-    # subset to desired mutation type
     spectra_df = spectra_df[spectra_df["Mutation type"] == args.mutation_type.replace("_", ">")]
-
     # reformat mutation type
     spectra_df["Mutation type"] = spectra_df["Mutation type"].apply(lambda m: m.replace(">", r"$\to$"))
 
     for hap, hap_df in spectra_df.groupby("Haplotypes"):
         hap_df = hap_df.sort_values("Generations", ascending=True)
         X = hap_df["Generations"].values
-        y = hap_df["Count"].values
-        # if args.phenotype == "Count":
-        #     y /= hap_df["callable_nucleotides"]
+        y = hap_df[args.phenotype].values
+
         sm.add_constant(X)
         link_func = statsmodels.genmod.families.links.identity()
         model = sm.GLM(y, X, family=sm.families.Poisson(link=link_func))
@@ -71,7 +68,6 @@ def main(args):
             s=50,
         )
 
-
     sns.despine(ax=ax, top=True, right=True)
     for axis in ['top','bottom','left','right']:
         ax.spines[axis].set_linewidth(1.5)
@@ -91,15 +87,16 @@ def main(args):
         "--spectra",
         help="""Path to tidy dataframe of mutation spectra in BXDs""",
     )
-    p.add_argument(
-        "--mutation_type",
-        help="""Mutation type to plot. Default is C_A.""",
-        type=str,
-    )
     p.add_argument(
         "--out",
         help="""name of output file with tidy mutation spectra""",
     )
+    p.add_argument(
+        "-mutation_type",
+        help="""Mutation type to plot. Default is C_A.""",
+        type=str,
+        default="C_A",
+    )
     p.add_argument(
         "-phenotype",
         help="phenotype to use for the plot (options are Fraction, Count or Rate). Default is Count.",