Skip to content

Commit

Permalink
find ns mutations in bootstrap interval, limit to annotated genes in …
Browse files Browse the repository at this point in the history
…sv overlap, misc updates
  • Loading branch information
tomsasani committed Oct 25, 2023
1 parent dead435 commit dedf127
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 27 deletions.
3 changes: 1 addition & 2 deletions scripts/calculate_callable_kmer_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pyfaidx import Fasta
import numpy as np
import pandas as pd
import tqdm

def make_interval_tree(
path: str,
Expand Down Expand Up @@ -52,7 +51,7 @@ def main(args):

with open(args.coverage, "r") as f:
csvf = csv.reader(f, delimiter='\t')
for l in tqdm.tqdm(csvf):
for l in csvf:
# skip header line
if l[0] == "#chrom": continue
# make sure we only look at autosomes
Expand Down
10 changes: 5 additions & 5 deletions scripts/find_nonsynonymous.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
print (vcf.raw_header.rstrip(), file=outfh)

for v in vcf:
if v.POS < 111_270_000 - 5e6: continue
if v.POS > 111_270_000 + 5e6: break
if v.POS < 95_000_000: continue
if v.POS > 114_100_000: break
if len(v.ALT) > 1: continue
annotations = v.INFO.get("ANN").split(';')
rel_ann = [a for a in annotations if a.split('|')[7] == "protein_coding"]
#if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]):
# continue
if not any([a.split('|')[2] in ("MODERATE", "HIGH") for a in rel_ann]):
continue

gts = v.gt_types
gqs = v.gt_quals
Expand Down Expand Up @@ -60,7 +60,7 @@
v.start,
v.end,
v.REF,
v.ALT,
v.ALT[0],
gts[smp2idx[C57]],
gts[smp2idx[DBA]],
ac,
Expand Down
10 changes: 5 additions & 5 deletions scripts/find_sv_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ def main(args):

refseq = pd.read_csv(args.refseq, sep="\t")

start = 111_270_000
buff = 10_000_000
start = 95_000_000
end = 116_100_000

chrom, start, end = "chr6", start - buff, start + buff
chrom = "chr6"

refseq = refseq[(refseq["chrom"] == chrom) & (refseq["txStart"] > start)
& (refseq["txStart"] < end)]
Expand All @@ -21,7 +21,7 @@ def main(args):

for i, row in refseq.iterrows():
# limit to curated refseq
#if not row["name"].startswith("NM"): continue
if not (row["name"].startswith("NM") or row["name"].startswith("NR")): continue
exon_starts, exon_ends = row["exonStarts"].split(','), row["exonEnds"].split(',')
tx_start, tx_end = int(row["txStart"]), int(row["txEnd"])
tx_start -= 10_000
Expand All @@ -43,11 +43,11 @@ def main(args):
v_start, v_end = v.start, v.end
if v.INFO.get("SVTYPE") == "DEL":
v_end -= v.INFO.get("SVLEN")
if abs(v_end - v_start) < 100: continue
exon_overlaps = exons.find(v_start, v_end)
tx_overlaps = full_tx.find(v_start, v_end)
overlapping_genes = list(set([o["gene"] for o in tx_overlaps]))

if len(tx_overlaps) == 0 and len(exon_overlaps) == 0: continue
is_in_exon = False
if len(exon_overlaps) > 0: is_in_exon = True
vals = {
Expand Down
8 changes: 4 additions & 4 deletions scripts/map_bxd_qtl.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ peaks <- find_peaks(out, bxd$pmap, threshold = lod_cutoff)
# plot LOD scores genome-wide
png(opt$output_file, width = 7, height = 3.5, units = "in", res = 300)
par(mar = c(4.1, 4.1, 1.6, 1.1))
color <- c("cornflowerblue")
plot(out, pmap, lodcolumn = 1, col = color[1], ylim = c(0, 6), main = mutation_type)
legend("topright", lwd = 1, col = "cornflowerblue", "LOD scores", bg = "gray90", lty = c(1, 1, 2))
abline(h = lod_cutoff, col = "black", lwd = 2, lty = 2)
color <- c("black")
plot(out, pmap, lodcolumn = 1, col = color[1], bgcolor = "white", altbgcolor = "white", ylim = c(0, 6), main = mutation_type)
legend("topright", lwd = 1, col = "black", "LOD scores", bg = "white", lty = c(1, 1, 2))
abline(h = lod_cutoff, col = "darkgrey", lwd = 2, lty = 2)

dev.off()
19 changes: 8 additions & 11 deletions scripts/plot_bxd_spectra_vs_age.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,15 @@ def main(args):

f, ax = plt.subplots(figsize=(8, 6))

# subset to desired mutation type
spectra_df = spectra_df[spectra_df["Mutation type"] == args.mutation_type.replace("_", ">")]

# reformat mutation type
spectra_df["Mutation type"] = spectra_df["Mutation type"].apply(lambda m: m.replace(">", r"$\to$"))

for hap, hap_df in spectra_df.groupby("Haplotypes"):
hap_df = hap_df.sort_values("Generations", ascending=True)
X = hap_df["Generations"].values
y = hap_df["Count"].values
# if args.phenotype == "Count":
# y /= hap_df["callable_nucleotides"]
y = hap_df[args.phenotype].values

sm.add_constant(X)
link_func = statsmodels.genmod.families.links.identity()
model = sm.GLM(y, X, family=sm.families.Poisson(link=link_func))
Expand Down Expand Up @@ -71,7 +68,6 @@ def main(args):
s=50,
)


sns.despine(ax=ax, top=True, right=True)
for axis in ['top','bottom','left','right']:
ax.spines[axis].set_linewidth(1.5)
Expand All @@ -91,15 +87,16 @@ def main(args):
"--spectra",
help="""Path to tidy dataframe of mutation spectra in BXDs""",
)
p.add_argument(
"--mutation_type",
help="""Mutation type to plot. Default is C_A.""",
type=str,
)
p.add_argument(
"--out",
help="""name of output file with tidy mutation spectra""",
)
p.add_argument(
"-mutation_type",
help="""Mutation type to plot. Default is C_A.""",
type=str,
default="C_A",
)
p.add_argument(
"-phenotype",
help="phenotype to use for the plot (options are Fraction, Count or Rate). Default is Count.",
Expand Down

0 comments on commit dedf127

Please sign in to comment.