From 2905a05f62e53bcc168ddd2925344621a2107969 Mon Sep 17 00:00:00 2001 From: Tom Sasani Date: Wed, 25 Oct 2023 10:20:04 -0400 Subject: [PATCH] remove markers in strong LD with chr4, simplify simulation code --- ihd/run_ihd_power_simulations.py | 72 ++++++++++++--------------- ihd/run_ihd_scan.py | 79 ++++++++++++++++-------------- scripts/rules/process_bxd_data.smk | 2 +- scripts/test_for_epistasis.R | 5 +- 4 files changed, 77 insertions(+), 81 deletions(-) diff --git a/ihd/run_ihd_power_simulations.py b/ihd/run_ihd_power_simulations.py index c85acd9..f02cdb3 100644 --- a/ihd/run_ihd_power_simulations.py +++ b/ihd/run_ihd_power_simulations.py @@ -94,7 +94,7 @@ def run_simulation_trials( n_haplotypes: int = 100, n_markers: int = 1000, number_of_trials: int = 100, - f_with_mutator: float = 1., + exp_af: float = 0.5, distance_method: Callable = compute_manual_chisquare, vary_mutation_counts: bool = False, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: @@ -104,9 +104,6 @@ def run_simulation_trials( base_lambdas (np.ndarray): 1D numpy array of lambda values corresponding to the \ expected number of each k-mer mutation type we observe on a haplotype. - genotype_matrix (np.ndarray): 2D numpy array of size (g, n), where g is the number \ - of genotyped sites and n is the number of simulated haplotypes. - mutation_type_idx (int, optional): Index of the mutation type to augment by the specified \ effect size. Defaults to 0. @@ -124,10 +121,14 @@ def run_simulation_trials( number_of_trials (int, optional): Number of trials to run for every combination of \ simulation parameters. Defaults to 100. - f_with_mutator (float, optional): The fraction of haplotypes with the ALT allele at the \ + exp_af (float, optional): The fraction of haplotypes at the \ simulated mutator locus that actually possess the mutator allele and augmented mutation \ - spectra. I.e., the degree to which the ALT allele at the mutator locus truly tags \ - the mutator allele. Defaults to 1.. + spectra. Defaults to 0.5. + + distance_method (Callable, optional): Distance metric to use in AMSD scan. Defaults to cosine. + + vary_mutation_counts (bool, optional): Whether to allow mutation counts on haplotypes to vary by \ + a factor of 20. Defaults to False. Returns: pvals (np.ndarray): 1D numpy array of p-values from `number_of_trials` trials. @@ -135,6 +136,10 @@ def run_simulation_trials( mutation_spectra_in_trials (np.ndarray): 3D numpy array of size (T, H, M), where T \ is the number of trials, H is the number of haplotypes, and M is the number of \ k-mer mutation types. Contains simulated mutation spectra for every haplotype. + + genotype_matrices_in_trials (np.ndarray): 3D numpy array of size (T, H, G), where T \ + is the number of trials, H is the number of haplotypes, and G is the number of \ + genotyped sites. Contains simulated genotypes at each marker in each trial. focal_markers (np.ndarray): 1D numpy array containing the marker at which we simulate \ the mutator allele in every trial. @@ -154,7 +159,6 @@ def run_simulation_trials( genotype_matrix = simulate_genotypes(n_markers, n_haplotypes, exp_af = 0.5) - # create a matrix of lambda values lambdas = np.broadcast_to( base_lambdas * mutation_counts, @@ -172,12 +176,10 @@ def run_simulation_trials( alt_haplotypes = np.where(genotype_matrix[focal_marker] == 2)[0] # figure out how many of the haplotypes at the "mutator locus" - # actually carry the mutator allele. in other words, what if all of - # the markers have AF = 0.5, but the "focal" marker *imperfectly tags* the mutator - # locus? such that 1/2 of haplotypes carry the marker, but only a fraction of those - # actually carry the true mutator allele. - n_true_alt_haplotypes = int(alt_haplotypes.shape[0] * f_with_mutator) - alt_haplotypes = np.random.choice(alt_haplotypes, n_true_alt_haplotypes, replace=False) + # actually carry the mutator allele. we may want the allele frequency + # of the mutator allele to be less than 0.5 + n_true_alt_haplotypes = int(n_haplotypes * exp_af) + alt_haplotypes = np.random.choice(np.arange(n_haplotypes), n_true_alt_haplotypes, replace=False) # TEST: if the mutator is present at lower frequency, bump # the genotypes to reflect it @@ -188,7 +190,7 @@ def run_simulation_trials( # ensure that at least one haplotype at this site has the # alternate allele. otherwise, return a p-value of 1. if alt_haplotypes.shape[0] < 1: - print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1)) + #print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1)) pvals[trial_n] = 1. continue # augment the lambda on the alt haplotypes by an effect size @@ -209,7 +211,6 @@ def run_simulation_trials( covariate_ratios, adjust_statistics=False, distance_method=distance_method, - ) # and get null @@ -223,7 +224,6 @@ def run_simulation_trials( distance_method=distance_method, adjust_statistics=False, ) - thresh = np.percentile(null_distances, q=95) pvals[trial_n] = 1 - (focal_dists[focal_marker] > thresh) @@ -232,8 +232,6 @@ def run_simulation_trials( def main(args): - number_of_trials = 100 - base_mutations = ["C>T", "CpG>TpG", "C>A", "C>G", "A>T", "A>C", "A>G"] base_lambdas = np.array([0.29, 0.17, 0.12, 0.075, 0.1, 0.075, 0.17]) @@ -271,15 +269,7 @@ def main(args): res_df = [] mutation_type_idx = mut2idx[args.mutation_type.replace("_", ">")] - - # NOTE: simulate genotypes separately in each trial? or just - # simulate the mutation spectra independently? - # genotype_matrix = simulate_genotypes( - # args.n_markers, - # args.n_haplotypes, - # exp_af=args.exp_af / 100., - # ) - + trial_pvals, trial_mutation_spectra, trial_genotype_matrices, focal_markers = run_simulation_trials( lambdas, mutation_type_idx=mutation_type_idx, @@ -288,16 +278,16 @@ def main(args): n_mutations=args.n_mutations, n_haplotypes=args.n_haplotypes, n_markers=args.n_markers, - number_of_trials=number_of_trials, - f_with_mutator=args.tag_strength / 100., + number_of_trials=args.n_trials, + exp_af=args.exp_af / 100., distance_method=distance_method, - vary_mutation_counts=True, + vary_mutation_counts=False, ) # if desired, output the genotype matrix to a CSV if args.raw_geno is not None: genotype_matrix_dfs = [] - for trial_n in range(number_of_trials): + for trial_n in range(args.n_trials): columns = [f"S{i}" for i in range(args.n_haplotypes)] genotype_matrix_df = pd.DataFrame(trial_genotype_matrices[trial_n], columns = columns) genotype_matrix_df.replace({0: "B", 2: "D"}, inplace=True) @@ -312,7 +302,7 @@ def main(args): # if desired, output the simulated mutation spectra if args.raw_spectra is not None: raw_spectra_df = [] - for trial_n in range(number_of_trials): + for trial_n in range(args.n_trials): trial_spectra_df = pd.DataFrame( trial_mutation_spectra[trial_n, :, :], columns=mutations, @@ -333,7 +323,6 @@ def main(args): "effect_size": args.effect_size, "mutation_type": args.mutation_type, "exp_af": args.exp_af, - "tag_strength": args.tag_strength, "distance_method": args.distance_method, "trial": i, "pval": pval, @@ -396,14 +385,7 @@ def main(args): type=float, default=50, help= - """Expected allele frequency at each simulated marker, expressed as a percentage. Default is 50.""", - ) - p.add_argument( - "-tag_strength", - type=float, - default=100, - help= - """Percentage of haplotypes with "A" alleles at the simulated mutator locus that actually carry the effects of the mutator on their mutation spectra. Default is 100.""", + """Expected allele frequency at the mutator locus, expressed as a percentage. Default is 50.""", ) p.add_argument( "-distance_method", @@ -421,5 +403,11 @@ def main(args): help= """Path to file containing the raw mutation spectrum data from each simulation.""", ) + p.add_argument( + "-n_trials", + type=int, + default=1, + help="""Number of times to simulate data.""", + ) args = p.parse_args() main(args) \ No newline at end of file diff --git a/ihd/run_ihd_scan.py b/ihd/run_ihd_scan.py index a864ac8..cd96a4f 100644 --- a/ihd/run_ihd_scan.py +++ b/ihd/run_ihd_scan.py @@ -12,11 +12,20 @@ compute_manual_cosine_distance, calculate_confint, get_covariate_matrix, - calculate_covariate_by_marker + calculate_covariate_by_marker, ) from schema import IHDResultSchema, MutationSchema import numba from typing import List +import tqdm +import matplotlib.pyplot as plt +import seaborn as sns +import statsmodels.api as sm +import scipy.stats as ss +import statsmodels.stats.multitest as mt +import scipy +import allel +from scipy.spatial.distance import squareform def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.DataFrame: # get unique samples in mutation dataframe @@ -38,26 +47,6 @@ def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.Data return mutations_filtered -def filter_by_af( - geno: pd.DataFrame, - samples: List[str], - af_thresh: float = 0.1, -) -> pd.DataFrame: - - # calculate allele frequencies at each site - ac = np.nansum(geno[samples], axis=1) - an = np.sum(~np.isnan(geno[samples]), axis=1) * 2 - afs = ac / an - - # only consider sites where allele frequency is between thresholds - idxs2keep = np.where((afs > af_thresh) & (afs < 1 - af_thresh))[0] - print("Using {} genotypes that meet filtering criteria.".format(idxs2keep.shape[0])) - - # filter the genotype dataframe to include specified sites - geno_filtered = geno.iloc[idxs2keep] - return geno_filtered - - def main(args): # read in JSON file with file paths @@ -82,7 +71,7 @@ def main(args): samples, mutation_types, spectra = compute_spectra( mutations_filtered, k=args.k, - cpg=False, + cpg=True, ) print(f"Using {len(samples)} samples and {int(np.sum(spectra))} total mutations.") @@ -106,17 +95,30 @@ def main(args): print (s, m) callable_kmer_arr[si, mi] = callable_k["count"].values[0] + # convert string genotypes to integers based on config definition replace_dict = config_dict['genotypes'] geno_asint = geno.replace(replace_dict).replace({1: np.nan}) - # filter genotypes by allele frequency - geno_asint_filtered = filter_by_af(geno_asint, samples, af_thresh=0.1) - + # measure LD between all pairs of markers + cols = [c for c in geno_asint.columns if c != "marker"] + gn = geno_asint[cols].values + gn[np.isnan(gn)] = -1 + r = allel.rogers_huff_r(gn) + ld = squareform(r ** 2) + np.fill_diagonal(ld, 1.) + + # if specified, remove all markers with r2 > 0.5 with selected markers + if args.adj_marker is not None: + marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values + _, ld_markers = np.where(ld[marker_idxs] >= 0.5) + geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)] + + print("Using {} genotypes that meet filtering criteria.".format(geno_asint.shape[0])) # convert genotype values to a matrix - geno_asint_filtered_matrix = geno_asint_filtered[samples].values + geno_asint_filtered_matrix = geno_asint[samples].values # get an array of marker names at the filtered genotyped loci - markers_filtered = geno_asint_filtered['marker'].values + markers_filtered = geno_asint['marker'].values # compute similarity between allele frequencies at each marker genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix) @@ -126,7 +128,7 @@ def main(args): covariate_ratios = np.ones(genotype_similarity.shape[0]).reshape(-1, 1) - covariate_cols = ["n_generations"] + covariate_cols = [] covariate_matrix = get_covariate_matrix( mutations_filtered, samples, @@ -179,25 +181,24 @@ def main(args): # for each chromosome, compute the specified confidence interval around # the peak observed distance - geno_asint_filtered_merged = geno_asint_filtered.merge(markers, on="marker") + geno_asint_filtered_merged = geno_asint.merge(markers, on="marker") combined_conf_int_df = [] conf_int_chroms = ["4", "6"] - for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"): + for chrom, chrom_df in tqdm.tqdm(geno_asint_filtered_merged.groupby("chromosome")): if chrom not in conf_int_chroms: continue - # get the indices of each marker on the chromosome - chrom_idxs = chrom_df.index.values - chrom_genotype_matrix = geno_asint_filtered_matrix[chrom_idxs, :] + chrom_genotype_matrix = chrom_df[samples].values # compute confidence intervals on the chromosome - conf_int_lo, conf_int_hi = calculate_confint( + conf_int_lo, conf_int_hi, peak_markers = calculate_confint( spectra, chrom_genotype_matrix, covariate_matrix, distance_method=distance_method, adjust_statistics=True, - conf_int=80., + conf_int=90., + n_permutations=10_000, ) conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])] @@ -205,7 +206,7 @@ def main(args): combined_conf_int_df = pd.concat(combined_conf_int_df) - combined_conf_int_df.to_csv(f"{args.out}.outie.csv", index=False) + combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False) res_df.to_csv(args.out, index=False) @@ -268,6 +269,12 @@ def main(args): type=str, help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure.""" ) + p.add_argument( + "-adj_marker", + default=None, + type=str, + help="""Comma-separated list of markers that we should adjust for when performing scan.""" + ) args = p.parse_args() main(args) diff --git a/scripts/rules/process_bxd_data.smk b/scripts/rules/process_bxd_data.smk index 606fa45..310be1d 100644 --- a/scripts/rules/process_bxd_data.smk +++ b/scripts/rules/process_bxd_data.smk @@ -51,6 +51,6 @@ rule plot_bxd_spectra_vs_age: """ python {input.py_script} --spectra {input.spectra} \ --out {output} \ - --mutation_type {wildcards.mutation_type} \ + -mutation_type {wildcards.mutation_type} \ -phenotype Count """ \ No newline at end of file diff --git a/scripts/test_for_epistasis.R b/scripts/test_for_epistasis.R index e8664dd..fb42853 100644 --- a/scripts/test_for_epistasis.R +++ b/scripts/test_for_epistasis.R @@ -54,14 +54,15 @@ gfit2 <- lmekin(Fraction ~ Genotype_A * Genotype_B + (1 | sample), # use a poisson model to test for interaction effects between genotypes # on C>A mutation counts, modeled as rates -m1 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A + Genotype_B, +m1 <- glm(Count ~ Genotype_A + Genotype_B + offset(log(ADJ_AGE)), data = phen_df, family = poisson() ) -m2 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A * Genotype_B, +m2 <- glm(Count ~ Genotype_A * Genotype_B + offset(log(ADJ_AGE)), data = phen_df, family = poisson() ) + print(anova(m2, m1, test = "Chisq")) # use the MGP data from dumont to test for interaction effects between