From d376670b5e4bfb0df8c3622cef8073c583ed85e0 Mon Sep 17 00:00:00 2001 From: Tom Sasani Date: Fri, 19 Jan 2024 16:00:56 -0500 Subject: [PATCH] condition scans based on explicit Mb region rather than a single marker --- ihd/run_ihd_scan.py | 84 +++++++++++++++++++-------------------- ihd/utils.py | 2 +- scripts/rules/run_ihd.smk | 9 +++-- 3 files changed, 47 insertions(+), 48 deletions(-) diff --git a/ihd/run_ihd_scan.py b/ihd/run_ihd_scan.py index 191e9c4..49c32b3 100644 --- a/ihd/run_ihd_scan.py +++ b/ihd/run_ihd_scan.py @@ -7,17 +7,13 @@ compute_spectra, perform_permutation_test, perform_ihd_scan, - compute_genotype_similarity, compute_manual_chisquare, compute_manual_cosine_distance, - calculate_confint, get_covariate_matrix, calculate_covariate_by_marker, ) from schema import IHDResultSchema, MutationSchema import numba -import allel -from scipy.spatial.distance import squareform def filter_mutation_data( @@ -105,18 +101,19 @@ def main(args): replace_dict = config_dict["genotypes"] geno_asint = geno.replace(replace_dict).replace({1: np.nan}) - # if specified, remove all markers with r2 > X with selected markers - if args.adj_marker is not None: - # measure LD between all pairs of markers - cols = [c for c in geno_asint.columns if c != "marker"] - gn = geno_asint[cols].values - gn[np.isnan(gn)] = -1 - r = allel.rogers_huff_r(gn) - ld = squareform(r**2) - np.fill_diagonal(ld, 1.0) - marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values - _, ld_markers = np.where(ld[marker_idxs] >= 0.1) - geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)] + if args.adj_region != "None": + chrom = args.adj_region.split(":")[0] + start, end = list(map(float, args.adj_region.split(":")[1].split("-"))) + # find markers within this region + markers_to_filter = markers[ + (markers["chromosome"] == chrom) + & (markers["Mb"] >= start) + & (markers["Mb"] <= end) + ]["marker"].unique() + marker_idxs = geno_asint[ + geno_asint["marker"].isin(markers_to_filter) + ].index.values + geno_asint = geno_asint.iloc[geno_asint.index.difference(marker_idxs)] # convert genotype values to a matrix geno_asint_filtered_matrix = geno_asint[samples].values @@ -124,7 +121,8 @@ def main(args): markers_filtered = geno_asint["marker"].values # compute similarity between allele frequencies at each marker - genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix) + # genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix) + genotype_similarity = np.ones(geno_asint_filtered_matrix.shape[0]) distance_method = compute_manual_cosine_distance if args.distance_method == "chisquare": distance_method = compute_manual_chisquare @@ -149,7 +147,7 @@ def main(args): genotype_similarity, covariate_ratios, distance_method=distance_method, - adjust_statistics=True, + adjust_statistics=False, ) res_df = pd.DataFrame( @@ -173,7 +171,7 @@ def main(args): distance_method=distance_method, n_permutations=args.permutations, progress=args.progress, - adjust_statistics=True, + adjust_statistics=False, ) # compute the Nth percentile of the maximum distance @@ -186,31 +184,31 @@ def main(args): # the peak observed distance geno_asint_filtered_merged = geno_asint.merge(markers, on="marker") - combined_conf_int_df = [] - - conf_int_chroms = ["4", "6"] - for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"): - if chrom not in conf_int_chroms: - continue - - chrom_genotype_matrix = chrom_df[samples].values - # compute confidence intervals on the chromosome - conf_int_lo, conf_int_hi, peak_markers = calculate_confint( - spectra, - chrom_genotype_matrix, - covariate_matrix, - distance_method=distance_method, - adjust_statistics=True, - conf_int=90., - n_permutations=1_000, - ) + # combined_conf_int_df = [] + + # conf_int_chroms = ["4", "6"] + # for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"): + # if chrom not in conf_int_chroms: + # continue + + # chrom_genotype_matrix = chrom_df[samples].values + # # compute confidence intervals on the chromosome + # conf_int_lo, conf_int_hi, peak_markers = calculate_confint( + # spectra, + # chrom_genotype_matrix, + # covariate_matrix, + # distance_method=distance_method, + # adjust_statistics=True, + # conf_int=90., + # n_permutations=1_000, + # ) - conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])] - combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]]) + # conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])] + # combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]]) - combined_conf_int_df = pd.concat(combined_conf_int_df) + # combined_conf_int_df = pd.concat(combined_conf_int_df) - combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False) + # combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False) res_df.to_csv(args.out, index=False) @@ -273,10 +271,10 @@ def main(args): help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure.""", ) p.add_argument( - "-adj_marker", + "-adj_region", default=None, type=str, - help="""Comma-separated list of markers that we should adjust for when performing scan.""", + help="""If specified, a chromosomal region (chr:start-end) that we should adjust for in our AMSD scans. Start and end coordinates should be specified in Mb. Default is None""", ) args = p.parse_args() diff --git a/ihd/utils.py b/ihd/utils.py index 760fd4e..aaa5663 100644 --- a/ihd/utils.py +++ b/ihd/utils.py @@ -150,7 +150,7 @@ def compute_haplotype_distance( @numba.njit def shuffle_spectra( spectra: np.ndarray, - groups: np.ndarray = None, + groups: np.ndarray, ) -> np.ndarray: """Randomly shuffle the rows of a 2D numpy array of mutation spectrum data of size (N, M), where N is the number diff --git a/scripts/rules/run_ihd.smk b/scripts/rules/run_ihd.smk index 9c2c545..1ab44e9 100644 --- a/scripts/rules/run_ihd.smk +++ b/scripts/rules/run_ihd.smk @@ -1,4 +1,5 @@ -condition2markers = {"N": None, "D": "rs27509845", "B": "rs27509845"} +# region containing significant markers in unconditioned scan +condition2region = {"N": None, "D": "4:103.634906-125.068158", "B": "4:103.634906-125.068158"} rule run_ihd: @@ -8,7 +9,7 @@ rule run_ihd: config = "data/json/{cross}.json", py_script = "ihd/run_ihd_scan.py" output: "csv/{cross}.k{k}.genome.condition_on_{condition}.results.csv" - params: adj_marker = lambda wc: condition2markers[wc.condition] + params: adj_region = lambda wc: condition2region[wc.condition] shell: """ python {input.py_script} --mutations {input.singletons} \ @@ -16,10 +17,10 @@ rule run_ihd: --out {output} \ -k {wildcards.k} \ -distance_method cosine \ - -permutations 10000 \ + -permutations 1000 \ -stratify_column true_epoch \ + -adj_region {params.adj_region} \ -threads 4 \ - -adj_marker {params.adj_marker} \ -progress """