diff --git a/scripts/calculate_callable_kmer_content.py b/scripts/calculate_callable_kmer_content.py index 3996304..152a325 100644 --- a/scripts/calculate_callable_kmer_content.py +++ b/scripts/calculate_callable_kmer_content.py @@ -7,7 +7,6 @@ from pyfaidx import Fasta import numpy as np import pandas as pd -import tqdm def make_interval_tree( path: str, @@ -52,7 +51,7 @@ def main(args): with open(args.coverage, "r") as f: csvf = csv.reader(f, delimiter='\t') - for l in tqdm.tqdm(csvf): + for l in csvf: # skip header line if l[0] == "#chrom": continue # make sure we only look at autosomes diff --git a/scripts/find_nonsynonymous.py b/scripts/find_nonsynonymous.py index 17c1ac5..c4188db 100644 --- a/scripts/find_nonsynonymous.py +++ b/scripts/find_nonsynonymous.py @@ -13,13 +13,13 @@ print (vcf.raw_header.rstrip(), file=outfh) for v in vcf: - if v.POS < 111_270_000 - 5e6: continue - if v.POS > 111_270_000 + 5e6: break + if v.POS < 95_000_000: continue + if v.POS > 114_100_000: break if len(v.ALT) > 1: continue annotations = v.INFO.get("ANN").split(';') rel_ann = [a for a in annotations if a.split('|')[7] == "protein_coding"] - #if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]): - # continue + if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]): + continue gts = v.gt_types gqs = v.gt_quals @@ -60,7 +60,7 @@ v.start, v.end, v.REF, - v.ALT, + v.ALT[0], gts[smp2idx[C57]], gts[smp2idx[DBA]], ac, diff --git a/scripts/find_sv_overlap.py b/scripts/find_sv_overlap.py index f30d5ce..5a13889 100644 --- a/scripts/find_sv_overlap.py +++ b/scripts/find_sv_overlap.py @@ -7,10 +7,10 @@ def main(args): refseq = pd.read_csv(args.refseq, sep="\t") - start = 111_270_000 - buff = 10_000_000 + start = 95_000_000 + end = 116_100_000 - chrom, start, end = "chr6", start - buff, start + buff + chrom = "chr6" refseq = refseq[(refseq["chrom"] == chrom) & (refseq["txStart"] > start) & (refseq["txStart"] < end)] @@ -21,7 +21,7 @@ def main(args): for i, row in refseq.iterrows(): # limit to curated refseq - #if not row["name"].startswith("NM"): continue + if not (row["name"].startswith("NM") or row["name"].startswith("NR")): continue exon_starts, exon_ends = row["exonStarts"].split(','), row["exonEnds"].split(',') tx_start, tx_end = int(row["txStart"]), int(row["txEnd"]) tx_start -= 10_000 @@ -43,11 +43,11 @@ def main(args): v_start, v_end = v.start, v.end if v.INFO.get("SVTYPE") == "DEL": v_end -= v.INFO.get("SVLEN") + if abs(v_end - v_start) < 100: continue exon_overlaps = exons.find(v_start, v_end) tx_overlaps = full_tx.find(v_start, v_end) overlapping_genes = list(set([o["gene"] for o in tx_overlaps])) - if len(tx_overlaps) == 0 and len(exon_overlaps) == 0: continue is_in_exon = False if len(exon_overlaps) > 0: is_in_exon = True vals = { diff --git a/scripts/map_bxd_qtl.R b/scripts/map_bxd_qtl.R index 84747a5..69c9952 100644 --- a/scripts/map_bxd_qtl.R +++ b/scripts/map_bxd_qtl.R @@ -75,9 +75,9 @@ peaks <- find_peaks(out, bxd$pmap, threshold = lod_cutoff) # plot LOD scores genome-wide png(opt$output_file, width = 7, height = 3.5, units = "in", res = 300) par(mar = c(4.1, 4.1, 1.6, 1.1)) -color <- c("cornflowerblue") -plot(out, pmap, lodcolumn = 1, col = color[1], ylim = c(0, 6), main = mutation_type) -legend("topright", lwd = 1, col = "cornflowerblue", "LOD scores", bg = "gray90", lty = c(1, 1, 2)) -abline(h = lod_cutoff, col = "black", lwd = 2, lty = 2) +color <- c("black") +plot(out, pmap, lodcolumn = 1, col = color[1], bgcolor = "white", altbgcolor = "white", ylim = c(0, 6), main = mutation_type) +legend("topright", lwd = 1, col = "black", "LOD scores", bg = "white", lty = c(1, 1, 2)) +abline(h = lod_cutoff, col = "darkgrey", lwd = 2, lty = 2) dev.off() \ No newline at end of file diff --git a/scripts/plot_bxd_spectra_vs_age.py b/scripts/plot_bxd_spectra_vs_age.py index 24646c1..6e8ab5e 100644 --- a/scripts/plot_bxd_spectra_vs_age.py +++ b/scripts/plot_bxd_spectra_vs_age.py @@ -20,18 +20,15 @@ def main(args): f, ax = plt.subplots(figsize=(8, 6)) - # subset to desired mutation type spectra_df = spectra_df[spectra_df["Mutation type"] == args.mutation_type.replace("_", ">")] - # reformat mutation type spectra_df["Mutation type"] = spectra_df["Mutation type"].apply(lambda m: m.replace(">", r"$\to$")) for hap, hap_df in spectra_df.groupby("Haplotypes"): hap_df = hap_df.sort_values("Generations", ascending=True) X = hap_df["Generations"].values - y = hap_df["Count"].values - # if args.phenotype == "Count": - # y /= hap_df["callable_nucleotides"] + y = hap_df[args.phenotype].values + sm.add_constant(X) link_func = statsmodels.genmod.families.links.identity() model = sm.GLM(y, X, family=sm.families.Poisson(link=link_func)) @@ -71,7 +68,6 @@ def main(args): s=50, ) - sns.despine(ax=ax, top=True, right=True) for axis in ['top','bottom','left','right']: ax.spines[axis].set_linewidth(1.5) @@ -91,15 +87,16 @@ def main(args): "--spectra", help="""Path to tidy dataframe of mutation spectra in BXDs""", ) - p.add_argument( - "--mutation_type", - help="""Mutation type to plot. Default is C_A.""", - type=str, - ) p.add_argument( "--out", help="""name of output file with tidy mutation spectra""", ) + p.add_argument( + "-mutation_type", + help="""Mutation type to plot. Default is C_A.""", + type=str, + default="C_A", + ) p.add_argument( "-phenotype", help="phenotype to use for the plot (options are Fraction, Count or Rate). Default is Count.",