Skip to content

Commit

Permalink
remove markers in strong LD with chr4, simplify simulation code
Browse files Browse the repository at this point in the history
  • Loading branch information
tomsasani committed Oct 25, 2023
1 parent dedf127 commit 2905a05
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 81 deletions.
72 changes: 30 additions & 42 deletions ihd/run_ihd_power_simulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def run_simulation_trials(
n_haplotypes: int = 100,
n_markers: int = 1000,
number_of_trials: int = 100,
f_with_mutator: float = 1.,
exp_af: float = 0.5,
distance_method: Callable = compute_manual_chisquare,
vary_mutation_counts: bool = False,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
Expand All @@ -104,9 +104,6 @@ def run_simulation_trials(
base_lambdas (np.ndarray): 1D numpy array of lambda values corresponding to the \
expected number of each k-mer mutation type we observe on a haplotype.
genotype_matrix (np.ndarray): 2D numpy array of size (g, n), where g is the number \
of genotyped sites and n is the number of simulated haplotypes.
mutation_type_idx (int, optional): Index of the mutation type to augment by the specified \
effect size. Defaults to 0.
Expand All @@ -124,17 +121,25 @@ def run_simulation_trials(
number_of_trials (int, optional): Number of trials to run for every combination of \
simulation parameters. Defaults to 100.
f_with_mutator (float, optional): The fraction of haplotypes with the ALT allele at the \
exp_af (float, optional): The fraction of haplotypes at the \
simulated mutator locus that actually possess the mutator allele and augmented mutation \
spectra. I.e., the degree to which the ALT allele at the mutator locus truly tags \
the mutator allele. Defaults to 1..
spectra. Defaults to 0.5.
distance_method (Callable, optional): Distance metric to use in AMSD scan. Defaults to cosine.
vary_mutation_counts (bool, optional): Whether to allow mutation counts on haplotypes to vary by \
a factor of 20. Defaults to False.
Returns:
pvals (np.ndarray): 1D numpy array of p-values from `number_of_trials` trials.
mutation_spectra_in_trials (np.ndarray): 3D numpy array of size (T, H, M), where T \
is the number of trials, H is the number of haplotypes, and M is the number of \
k-mer mutation types. Contains simulated mutation spectra for every haplotype.
genotype_matrices_in_trials (np.ndarray): 3D numpy array of size (T, H, G), where T \
is the number of trials, H is the number of haplotypes, and G is the number of \
genotyped sites. Contains simulated genotypes at each marker in each trial.
focal_markers (np.ndarray): 1D numpy array containing the marker at which we simulate \
the mutator allele in every trial.
Expand All @@ -154,7 +159,6 @@ def run_simulation_trials(

genotype_matrix = simulate_genotypes(n_markers, n_haplotypes, exp_af = 0.5)


# create a matrix of lambda values
lambdas = np.broadcast_to(
base_lambdas * mutation_counts,
Expand All @@ -172,12 +176,10 @@ def run_simulation_trials(
alt_haplotypes = np.where(genotype_matrix[focal_marker] == 2)[0]

# figure out how many of the haplotypes at the "mutator locus"
# actually carry the mutator allele. in other words, what if all of
# the markers have AF = 0.5, but the "focal" marker *imperfectly tags* the mutator
# locus? such that 1/2 of haplotypes carry the marker, but only a fraction of those
# actually carry the true mutator allele.
n_true_alt_haplotypes = int(alt_haplotypes.shape[0] * f_with_mutator)
alt_haplotypes = np.random.choice(alt_haplotypes, n_true_alt_haplotypes, replace=False)
# actually carry the mutator allele. we may want the allele frequency
# of the mutator allele to be less than 0.5
n_true_alt_haplotypes = int(n_haplotypes * exp_af)
alt_haplotypes = np.random.choice(np.arange(n_haplotypes), n_true_alt_haplotypes, replace=False)

# TEST: if the mutator is present at lower frequency, bump
# the genotypes to reflect it
Expand All @@ -188,7 +190,7 @@ def run_simulation_trials(
# ensure that at least one haplotype at this site has the
# alternate allele. otherwise, return a p-value of 1.
if alt_haplotypes.shape[0] < 1:
print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1))
#print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1))
pvals[trial_n] = 1.
continue
# augment the lambda on the alt haplotypes by an effect size
Expand All @@ -209,7 +211,6 @@ def run_simulation_trials(
covariate_ratios,
adjust_statistics=False,
distance_method=distance_method,

)

# and get null
Expand All @@ -223,7 +224,6 @@ def run_simulation_trials(
distance_method=distance_method,
adjust_statistics=False,
)

thresh = np.percentile(null_distances, q=95)
pvals[trial_n] = 1 - (focal_dists[focal_marker] > thresh)

Expand All @@ -232,8 +232,6 @@ def run_simulation_trials(

def main(args):

number_of_trials = 100

base_mutations = ["C>T", "CpG>TpG", "C>A", "C>G", "A>T", "A>C", "A>G"]
base_lambdas = np.array([0.29, 0.17, 0.12, 0.075, 0.1, 0.075, 0.17])

Expand Down Expand Up @@ -271,15 +269,7 @@ def main(args):
res_df = []

mutation_type_idx = mut2idx[args.mutation_type.replace("_", ">")]

# NOTE: simulate genotypes separately in each trial? or just
# simulate the mutation spectra independently?
# genotype_matrix = simulate_genotypes(
# args.n_markers,
# args.n_haplotypes,
# exp_af=args.exp_af / 100.,
# )


trial_pvals, trial_mutation_spectra, trial_genotype_matrices, focal_markers = run_simulation_trials(
lambdas,
mutation_type_idx=mutation_type_idx,
Expand All @@ -288,16 +278,16 @@ def main(args):
n_mutations=args.n_mutations,
n_haplotypes=args.n_haplotypes,
n_markers=args.n_markers,
number_of_trials=number_of_trials,
f_with_mutator=args.tag_strength / 100.,
number_of_trials=args.n_trials,
exp_af=args.exp_af / 100.,
distance_method=distance_method,
vary_mutation_counts=True,
vary_mutation_counts=False,
)

# if desired, output the genotype matrix to a CSV
if args.raw_geno is not None:
genotype_matrix_dfs = []
for trial_n in range(number_of_trials):
for trial_n in range(args.n_trials):
columns = [f"S{i}" for i in range(args.n_haplotypes)]
genotype_matrix_df = pd.DataFrame(trial_genotype_matrices[trial_n], columns = columns)
genotype_matrix_df.replace({0: "B", 2: "D"}, inplace=True)
Expand All @@ -312,7 +302,7 @@ def main(args):
# if desired, output the simulated mutation spectra
if args.raw_spectra is not None:
raw_spectra_df = []
for trial_n in range(number_of_trials):
for trial_n in range(args.n_trials):
trial_spectra_df = pd.DataFrame(
trial_mutation_spectra[trial_n, :, :],
columns=mutations,
Expand All @@ -333,7 +323,6 @@ def main(args):
"effect_size": args.effect_size,
"mutation_type": args.mutation_type,
"exp_af": args.exp_af,
"tag_strength": args.tag_strength,
"distance_method": args.distance_method,
"trial": i,
"pval": pval,
Expand Down Expand Up @@ -396,14 +385,7 @@ def main(args):
type=float,
default=50,
help=
"""Expected allele frequency at each simulated marker, expressed as a percentage. Default is 50.""",
)
p.add_argument(
"-tag_strength",
type=float,
default=100,
help=
"""Percentage of haplotypes with "A" alleles at the simulated mutator locus that actually carry the effects of the mutator on their mutation spectra. Default is 100.""",
"""Expected allele frequency at the mutator locus, expressed as a percentage. Default is 50.""",
)
p.add_argument(
"-distance_method",
Expand All @@ -421,5 +403,11 @@ def main(args):
help=
"""Path to file containing the raw mutation spectrum data from each simulation.""",
)
p.add_argument(
"-n_trials",
type=int,
default=1,
help="""Number of times to simulate data.""",
)
args = p.parse_args()
main(args)
79 changes: 43 additions & 36 deletions ihd/run_ihd_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,20 @@
compute_manual_cosine_distance,
calculate_confint,
get_covariate_matrix,
calculate_covariate_by_marker
calculate_covariate_by_marker,
)
from schema import IHDResultSchema, MutationSchema
import numba
from typing import List
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as ss
import statsmodels.stats.multitest as mt
import scipy
import allel
from scipy.spatial.distance import squareform

def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.DataFrame:
# get unique samples in mutation dataframe
Expand All @@ -38,26 +47,6 @@ def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.Data
return mutations_filtered


def filter_by_af(
geno: pd.DataFrame,
samples: List[str],
af_thresh: float = 0.1,
) -> pd.DataFrame:

# calculate allele frequencies at each site
ac = np.nansum(geno[samples], axis=1)
an = np.sum(~np.isnan(geno[samples]), axis=1) * 2
afs = ac / an

# only consider sites where allele frequency is between thresholds
idxs2keep = np.where((afs > af_thresh) & (afs < 1 - af_thresh))[0]
print("Using {} genotypes that meet filtering criteria.".format(idxs2keep.shape[0]))

# filter the genotype dataframe to include specified sites
geno_filtered = geno.iloc[idxs2keep]
return geno_filtered


def main(args):

# read in JSON file with file paths
Expand All @@ -82,7 +71,7 @@ def main(args):
samples, mutation_types, spectra = compute_spectra(
mutations_filtered,
k=args.k,
cpg=False,
cpg=True,
)
print(f"Using {len(samples)} samples and {int(np.sum(spectra))} total mutations.")

Expand All @@ -106,17 +95,30 @@ def main(args):
print (s, m)
callable_kmer_arr[si, mi] = callable_k["count"].values[0]


# convert string genotypes to integers based on config definition
replace_dict = config_dict['genotypes']
geno_asint = geno.replace(replace_dict).replace({1: np.nan})

# filter genotypes by allele frequency
geno_asint_filtered = filter_by_af(geno_asint, samples, af_thresh=0.1)

# measure LD between all pairs of markers
cols = [c for c in geno_asint.columns if c != "marker"]
gn = geno_asint[cols].values
gn[np.isnan(gn)] = -1
r = allel.rogers_huff_r(gn)
ld = squareform(r ** 2)
np.fill_diagonal(ld, 1.)

# if specified, remove all markers with r2 > 0.5 with selected markers
if args.adj_marker is not None:
marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values
_, ld_markers = np.where(ld[marker_idxs] >= 0.5)
geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)]

print("Using {} genotypes that meet filtering criteria.".format(geno_asint.shape[0]))
# convert genotype values to a matrix
geno_asint_filtered_matrix = geno_asint_filtered[samples].values
geno_asint_filtered_matrix = geno_asint[samples].values
# get an array of marker names at the filtered genotyped loci
markers_filtered = geno_asint_filtered['marker'].values
markers_filtered = geno_asint['marker'].values

# compute similarity between allele frequencies at each marker
genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix)
Expand All @@ -126,7 +128,7 @@ def main(args):

covariate_ratios = np.ones(genotype_similarity.shape[0]).reshape(-1, 1)

covariate_cols = ["n_generations"]
covariate_cols = []
covariate_matrix = get_covariate_matrix(
mutations_filtered,
samples,
Expand Down Expand Up @@ -179,33 +181,32 @@ def main(args):

# for each chromosome, compute the specified confidence interval around
# the peak observed distance
geno_asint_filtered_merged = geno_asint_filtered.merge(markers, on="marker")
geno_asint_filtered_merged = geno_asint.merge(markers, on="marker")

combined_conf_int_df = []

conf_int_chroms = ["4", "6"]
for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"):
for chrom, chrom_df in tqdm.tqdm(geno_asint_filtered_merged.groupby("chromosome")):
if chrom not in conf_int_chroms: continue
# get the indices of each marker on the chromosome
chrom_idxs = chrom_df.index.values
chrom_genotype_matrix = geno_asint_filtered_matrix[chrom_idxs, :]

chrom_genotype_matrix = chrom_df[samples].values
# compute confidence intervals on the chromosome
conf_int_lo, conf_int_hi = calculate_confint(
conf_int_lo, conf_int_hi, peak_markers = calculate_confint(
spectra,
chrom_genotype_matrix,
covariate_matrix,
distance_method=distance_method,
adjust_statistics=True,
conf_int=80.,
conf_int=90.,
n_permutations=10_000,
)

conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])]
combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]])

combined_conf_int_df = pd.concat(combined_conf_int_df)

combined_conf_int_df.to_csv(f"{args.out}.outie.csv", index=False)
combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False)

res_df.to_csv(args.out, index=False)

Expand Down Expand Up @@ -268,6 +269,12 @@ def main(args):
type=str,
help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure."""
)
p.add_argument(
"-adj_marker",
default=None,
type=str,
help="""Comma-separated list of markers that we should adjust for when performing scan."""
)
args = p.parse_args()

main(args)
2 changes: 1 addition & 1 deletion scripts/rules/process_bxd_data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@ rule plot_bxd_spectra_vs_age:
"""
python {input.py_script} --spectra {input.spectra} \
--out {output} \
--mutation_type {wildcards.mutation_type} \
-mutation_type {wildcards.mutation_type} \
-phenotype Count
"""
5 changes: 3 additions & 2 deletions scripts/test_for_epistasis.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,15 @@ gfit2 <- lmekin(Fraction ~ Genotype_A * Genotype_B + (1 | sample),

# use a poisson model to test for interaction effects between genotypes
# on C>A mutation counts, modeled as rates
m1 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A + Genotype_B,
m1 <- glm(Count ~ Genotype_A + Genotype_B + offset(log(ADJ_AGE)),
data = phen_df,
family = poisson()
)
m2 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A * Genotype_B,
m2 <- glm(Count ~ Genotype_A * Genotype_B + offset(log(ADJ_AGE)),
data = phen_df,
family = poisson()
)

print(anova(m2, m1, test = "Chisq"))

# use the MGP data from dumont to test for interaction effects between
Expand Down

0 comments on commit 2905a05

Please sign in to comment.