remove markers in strong LD with chr4, simplify simulation code

quinlan-lab · Oct 25, 2023 · 2905a05 · 2905a05
1 parent dedf127
commit 2905a05
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 81 deletions.
diff --git a/ihd/run_ihd_power_simulations.py b/ihd/run_ihd_power_simulations.py
@@ -94,7 +94,7 @@ def run_simulation_trials(
     n_haplotypes: int = 100,
     n_markers: int = 1000,
     number_of_trials: int = 100,
-    f_with_mutator: float = 1.,
+    exp_af: float = 0.5,
     distance_method: Callable = compute_manual_chisquare,
     vary_mutation_counts: bool = False,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
@@ -104,9 +104,6 @@ def run_simulation_trials(
         base_lambdas (np.ndarray): 1D numpy array of lambda values corresponding to the \
             expected number of each k-mer mutation type we observe on a haplotype.
 
-        genotype_matrix (np.ndarray): 2D numpy array of size (g, n), where g is the number \
-            of genotyped sites and n is the number of simulated haplotypes.
-
         mutation_type_idx (int, optional): Index of the mutation type to augment by the specified \
             effect size. Defaults to 0.
 
@@ -124,17 +121,25 @@ def run_simulation_trials(
         number_of_trials (int, optional): Number of trials to run for every combination of \
             simulation parameters. Defaults to 100.
 
-        f_with_mutator (float, optional): The fraction of haplotypes with the ALT allele at the \
+        exp_af (float, optional): The fraction of haplotypes at the \
             simulated mutator locus that actually possess the mutator allele and augmented mutation \
-            spectra. I.e., the degree to which the ALT allele at the mutator locus truly tags \
-            the mutator allele. Defaults to 1..
+            spectra. Defaults to 0.5.
+
+        distance_method (Callable, optional): Distance metric to use in AMSD scan. Defaults to cosine.
+
+        vary_mutation_counts (bool, optional): Whether to allow mutation counts on haplotypes to vary by \
+            a factor of 20. Defaults to False.
 
     Returns:
         pvals (np.ndarray): 1D numpy array of p-values from `number_of_trials` trials.
 
         mutation_spectra_in_trials (np.ndarray): 3D numpy array of size (T, H, M), where T \
             is the number of trials, H is the number of haplotypes, and M is the number of \
             k-mer mutation types. Contains simulated mutation spectra for every haplotype.
+
+        genotype_matrices_in_trials (np.ndarray): 3D numpy array of size (T, H, G), where T \
+            is the number of trials, H is the number of haplotypes, and G is the number of \
+            genotyped sites. Contains simulated genotypes at each marker in each trial.
         
         focal_markers (np.ndarray): 1D numpy array containing the marker at which we simulate \
             the mutator allele in every trial.
@@ -154,7 +159,6 @@ def run_simulation_trials(
 
         genotype_matrix = simulate_genotypes(n_markers, n_haplotypes, exp_af = 0.5)
 
-
         # create a matrix of lambda values
         lambdas = np.broadcast_to(
             base_lambdas * mutation_counts,
@@ -172,12 +176,10 @@ def run_simulation_trials(
         alt_haplotypes = np.where(genotype_matrix[focal_marker] == 2)[0]
 
         # figure out how many of the haplotypes at the "mutator locus"
-        # actually carry the mutator allele. in other words, what if all of
-        # the markers have AF = 0.5, but the "focal" marker *imperfectly tags* the mutator
-        # locus? such that 1/2 of haplotypes carry the marker, but only a fraction of those
-        # actually carry the true mutator allele.
-        n_true_alt_haplotypes = int(alt_haplotypes.shape[0] * f_with_mutator)
-        alt_haplotypes = np.random.choice(alt_haplotypes, n_true_alt_haplotypes, replace=False)
+        # actually carry the mutator allele. we may want the allele frequency
+        # of the mutator allele to be less than 0.5
+        n_true_alt_haplotypes = int(n_haplotypes * exp_af)
+        alt_haplotypes = np.random.choice(np.arange(n_haplotypes), n_true_alt_haplotypes, replace=False)
 
         # TEST: if the mutator is present at lower frequency, bump
         # the genotypes to reflect it
@@ -188,7 +190,7 @@ def run_simulation_trials(
         # ensure that at least one haplotype at this site has the
         # alternate allele. otherwise, return a p-value of 1.
         if alt_haplotypes.shape[0] < 1:
-            print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1))
+            #print ("NO ALT HAPLOTYPES", n_true_alt_haplotypes, f_with_mutator, alt_haplotypes, genotype_matrix[focal_marker], np.sum(genotype_matrix, axis=1))
             pvals[trial_n] = 1.
             continue
         # augment the lambda on the alt haplotypes by an effect size
@@ -209,7 +211,6 @@ def run_simulation_trials(
             covariate_ratios,
             adjust_statistics=False,
             distance_method=distance_method,
-
         )
 
         # and get null
@@ -223,7 +224,6 @@ def run_simulation_trials(
             distance_method=distance_method,
             adjust_statistics=False,
         )
-
         thresh = np.percentile(null_distances, q=95)
         pvals[trial_n] = 1 - (focal_dists[focal_marker] > thresh)
 
@@ -232,8 +232,6 @@ def run_simulation_trials(
 
 def main(args):
 
-    number_of_trials = 100
-
     base_mutations = ["C>T", "CpG>TpG", "C>A", "C>G", "A>T", "A>C", "A>G"]
     base_lambdas = np.array([0.29, 0.17, 0.12, 0.075, 0.1, 0.075, 0.17])
 
@@ -271,15 +269,7 @@ def main(args):
     res_df = []
 
     mutation_type_idx = mut2idx[args.mutation_type.replace("_", ">")]
-
-    # NOTE: simulate genotypes separately in each trial? or just
-    # simulate the mutation spectra independently?
-    # genotype_matrix = simulate_genotypes(
-    #     args.n_markers,
-    #     args.n_haplotypes,
-    #     exp_af=args.exp_af / 100.,
-    # )
-
+
     trial_pvals, trial_mutation_spectra, trial_genotype_matrices, focal_markers = run_simulation_trials(
         lambdas,
         mutation_type_idx=mutation_type_idx,
@@ -288,16 +278,16 @@ def main(args):
         n_mutations=args.n_mutations,
         n_haplotypes=args.n_haplotypes,
         n_markers=args.n_markers,
-        number_of_trials=number_of_trials,
-        f_with_mutator=args.tag_strength / 100.,
+        number_of_trials=args.n_trials,
+        exp_af=args.exp_af / 100.,
         distance_method=distance_method,
-        vary_mutation_counts=True,
+        vary_mutation_counts=False,
     )
 
     # if desired, output the genotype matrix to a CSV
     if args.raw_geno is not None:
         genotype_matrix_dfs = []
-        for trial_n in range(number_of_trials):
+        for trial_n in range(args.n_trials):
             columns = [f"S{i}" for i in range(args.n_haplotypes)]
             genotype_matrix_df = pd.DataFrame(trial_genotype_matrices[trial_n], columns = columns)
             genotype_matrix_df.replace({0: "B", 2: "D"}, inplace=True)
@@ -312,7 +302,7 @@ def main(args):
     # if desired, output the simulated mutation spectra
     if args.raw_spectra is not None:
         raw_spectra_df = []
-        for trial_n in range(number_of_trials):
+        for trial_n in range(args.n_trials):
             trial_spectra_df = pd.DataFrame(
                 trial_mutation_spectra[trial_n, :, :],
                 columns=mutations,
@@ -333,7 +323,6 @@ def main(args):
             "effect_size": args.effect_size,
             "mutation_type": args.mutation_type,
             "exp_af": args.exp_af,
-            "tag_strength": args.tag_strength,
             "distance_method": args.distance_method,
             "trial": i,
             "pval": pval,
@@ -396,14 +385,7 @@ def main(args):
         type=float,
         default=50,
         help=
-        """Expected allele frequency at each simulated marker, expressed as a percentage. Default is 50.""",
-    )
-    p.add_argument(
-        "-tag_strength",
-        type=float,
-        default=100,
-        help=
-        """Percentage of haplotypes with "A" alleles at the simulated mutator locus that actually carry the effects of the mutator on their mutation spectra. Default is 100.""",
+        """Expected allele frequency at the mutator locus, expressed as a percentage. Default is 50.""",
     )
     p.add_argument(
         "-distance_method",
@@ -421,5 +403,11 @@ def main(args):
         help=
         """Path to file containing the raw mutation spectrum data from each simulation.""",
     )
+    p.add_argument(
+        "-n_trials",
+        type=int,
+        default=1,
+        help="""Number of times to simulate data.""",
+    )
     args = p.parse_args()
     main(args)
diff --git a/ihd/run_ihd_scan.py b/ihd/run_ihd_scan.py
@@ -12,11 +12,20 @@
     compute_manual_cosine_distance,
     calculate_confint,
     get_covariate_matrix,
-    calculate_covariate_by_marker
+    calculate_covariate_by_marker,
 )
 from schema import IHDResultSchema, MutationSchema
 import numba
 from typing import List
+import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+import statsmodels.api as sm
+import scipy.stats as ss
+import statsmodels.stats.multitest as mt
+import scipy
+import allel
+from scipy.spatial.distance import squareform
 
 def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.DataFrame:
     # get unique samples in mutation dataframe
@@ -38,26 +47,6 @@ def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.Data
     return mutations_filtered
 
 
-def filter_by_af(
-    geno: pd.DataFrame,
-    samples: List[str],
-    af_thresh: float = 0.1,
-) -> pd.DataFrame:
-
-    # calculate allele frequencies at each site
-    ac = np.nansum(geno[samples], axis=1)
-    an = np.sum(~np.isnan(geno[samples]), axis=1) * 2
-    afs = ac / an
-
-    # only consider sites where allele frequency is between thresholds
-    idxs2keep = np.where((afs > af_thresh) & (afs < 1 - af_thresh))[0]
-    print("Using {} genotypes that meet filtering criteria.".format(idxs2keep.shape[0]))
-
-    # filter the genotype dataframe to include specified sites
-    geno_filtered = geno.iloc[idxs2keep]
-    return geno_filtered
-
-
 def main(args):
 
     # read in JSON file with file paths
@@ -82,7 +71,7 @@ def main(args):
     samples, mutation_types, spectra = compute_spectra(
         mutations_filtered,
         k=args.k,
-        cpg=False,
+        cpg=True,
     )
     print(f"Using {len(samples)} samples and {int(np.sum(spectra))} total mutations.")
 
@@ -106,17 +95,30 @@ def main(args):
                     print (s, m)
                 callable_kmer_arr[si, mi] = callable_k["count"].values[0]
 
+
     # convert string genotypes to integers based on config definition
     replace_dict = config_dict['genotypes']
     geno_asint = geno.replace(replace_dict).replace({1: np.nan})
 
-    # filter genotypes by allele frequency
-    geno_asint_filtered = filter_by_af(geno_asint, samples, af_thresh=0.1)
-
+    # measure LD between all pairs of markers
+    cols = [c for c in geno_asint.columns if c != "marker"]
+    gn = geno_asint[cols].values
+    gn[np.isnan(gn)] = -1
+    r = allel.rogers_huff_r(gn)
+    ld = squareform(r ** 2)
+    np.fill_diagonal(ld, 1.)
+
+    # if specified, remove all markers with r2 > 0.5 with selected markers
+    if args.adj_marker is not None:
+        marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values
+        _, ld_markers = np.where(ld[marker_idxs] >= 0.5)
+        geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)]
+
+    print("Using {} genotypes that meet filtering criteria.".format(geno_asint.shape[0]))
     # convert genotype values to a matrix
-    geno_asint_filtered_matrix = geno_asint_filtered[samples].values
+    geno_asint_filtered_matrix = geno_asint[samples].values
     # get an array of marker names at the filtered genotyped loci
-    markers_filtered = geno_asint_filtered['marker'].values
+    markers_filtered = geno_asint['marker'].values
 
     # compute similarity between allele frequencies at each marker
     genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix)
@@ -126,7 +128,7 @@ def main(args):
 
     covariate_ratios = np.ones(genotype_similarity.shape[0]).reshape(-1, 1)
 
-    covariate_cols = ["n_generations"]
+    covariate_cols = []
     covariate_matrix = get_covariate_matrix(
         mutations_filtered,
         samples,
@@ -179,33 +181,32 @@ def main(args):
 
     # for each chromosome, compute the specified confidence interval around
     # the peak observed distance
-    geno_asint_filtered_merged = geno_asint_filtered.merge(markers, on="marker")
+    geno_asint_filtered_merged = geno_asint.merge(markers, on="marker")
 
     combined_conf_int_df = []
 
     conf_int_chroms = ["4", "6"]
-    for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"):
+    for chrom, chrom_df in tqdm.tqdm(geno_asint_filtered_merged.groupby("chromosome")):
         if chrom not in conf_int_chroms: continue
-        # get the indices of each marker on the chromosome
-        chrom_idxs = chrom_df.index.values
-        chrom_genotype_matrix = geno_asint_filtered_matrix[chrom_idxs, :]
 
+        chrom_genotype_matrix = chrom_df[samples].values
         # compute confidence intervals on the chromosome
-        conf_int_lo, conf_int_hi = calculate_confint(
+        conf_int_lo, conf_int_hi, peak_markers = calculate_confint(
             spectra,
             chrom_genotype_matrix,
             covariate_matrix,
             distance_method=distance_method,
             adjust_statistics=True,
-            conf_int=80.,
+            conf_int=90., 
+            n_permutations=10_000,
         )
 
         conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])]
         combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]])
 
     combined_conf_int_df = pd.concat(combined_conf_int_df)
 
-    combined_conf_int_df.to_csv(f"{args.out}.outie.csv", index=False)
+    combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False)
 
     res_df.to_csv(args.out, index=False)
 
@@ -268,6 +269,12 @@ def main(args):
         type=str,
         help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure."""
     )
+    p.add_argument(
+        "-adj_marker",
+        default=None,
+        type=str,
+        help="""Comma-separated list of markers that we should adjust for when performing scan."""
+    )
     args = p.parse_args()
 
     main(args)
diff --git a/scripts/rules/process_bxd_data.smk b/scripts/rules/process_bxd_data.smk
@@ -51,6 +51,6 @@ rule plot_bxd_spectra_vs_age:
         """
         python {input.py_script} --spectra {input.spectra} \
                                  --out {output} \
-                                 --mutation_type {wildcards.mutation_type} \
+                                 -mutation_type {wildcards.mutation_type} \
                                  -phenotype Count
         """
diff --git a/scripts/test_for_epistasis.R b/scripts/test_for_epistasis.R
@@ -54,14 +54,15 @@ gfit2 <- lmekin(Fraction ~ Genotype_A * Genotype_B + (1 | sample),
 
 # use a poisson model to test for interaction effects between genotypes
 # on C>A mutation counts, modeled as rates
-m1 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A + Genotype_B,
+m1 <- glm(Count ~ Genotype_A + Genotype_B + offset(log(ADJ_AGE)),
     data = phen_df,
     family = poisson()
 )
-m2 <- glm(Count ~ offset(log(ADJ_AGE)) + Genotype_A * Genotype_B,
+m2 <- glm(Count ~ Genotype_A * Genotype_B + offset(log(ADJ_AGE)),
     data = phen_df,
     family = poisson()
 )
+
 print(anova(m2, m1, test = "Chisq"))
 
 # use the MGP data from dumont to test for interaction effects between