diff --git a/ihd/plot_ihd_results.py b/ihd/plot_ihd_results.py index 19005dd..b707179 100644 --- a/ihd/plot_ihd_results.py +++ b/ihd/plot_ihd_results.py @@ -8,8 +8,8 @@ plt.rc("font", size=18) -def main(args): +def main(args): # read in results of IHD scan and validate with pandera results = pd.read_csv(args.results) IHDResultSchema.validate(results) @@ -19,26 +19,33 @@ def main(args): MarkerMetadataSchema.validate(markers) results_merged = results.merge(markers, on="marker").astype({"chromosome": str}) - chr_pref = any(["chr" in c for c in results_merged["chromosome"].unique()]) if chr_pref: - results_merged["chromosome"] = results_merged['chromosome'].apply(lambda c: int(c.split("chr")[1])) - results_merged = results_merged[results_merged["chromosome"] != "X"] + results_merged["chromosome"] = results_merged["chromosome"].apply( + lambda c: int(c.split("chr")[1]) + ) - pctile_label = f"95th_percentile" + results_merged = results_merged[~results_merged["chromosome"].isin(["Y", "M"])] - # get significant markers - signif = results_merged[results_merged["Distance"] >= results_merged[pctile_label]] + pctile_label = f"95th_percentile" chrom_order = list(map(str, range(1, 23))) chrom_order.append("X") chrom_order_idx = dict(zip(chrom_order, range(len(chrom_order)))) - results_merged["sort_idx"] = results_merged["chromosome"].apply(lambda c: chrom_order_idx[c]) - results_merged.sort_values("sort_idx", inplace=True) + results_merged["sort_idx"] = results_merged["chromosome"].apply( + lambda c: chrom_order_idx[c] + ) + results_merged.sort_values(["sort_idx", args.colname], inplace=True) - results_merged["is_significant"] = results_merged["Distance"] >= results_merged[pctile_label] - results_merged["ec"] = results_merged["is_significant"].apply(lambda s: "k" if s else "w") - results_merged["lw"] = results_merged["is_significant"].apply(lambda s: 1 if s else 0.5) + results_merged["is_significant"] = ( + results_merged["Distance"] >= results_merged[pctile_label] + ) + results_merged["ec"] = results_merged["is_significant"].apply( + lambda s: "k" if s else "w" + ) + results_merged["lw"] = results_merged["is_significant"].apply( + lambda s: 1 if s else 0.5 + ) # plot manhattan f, ax = plt.subplots(figsize=(16, 6)) @@ -49,8 +56,7 @@ def main(args): y=max_threshold_dist * args.scale, ls=":", c="grey", - label="Genome-wide significance threshold " + - r"$\left(p = 0.05\right)$", + label="Genome-wide significance threshold " + r"$\left(p = 0.05\right)$", lw=2, ) @@ -60,10 +66,9 @@ def main(args): colors = ["#E6803C", "#398D84"] for i, ( - chrom, - chrom_df, + chrom, + chrom_df, ) in enumerate(results_merged.groupby("chromosome", sort=False)): - color_idx = i % 2 xvals = chrom_df[args.colname].values + previous_max yvals = chrom_df["Distance"].values * args.scale @@ -81,19 +86,22 @@ def main(args): xtick_positions.append(np.median(xvals)) xticks.append(chrom) - ax.set_xticks(xtick_positions) ax.set_xticklabels(xticks) n_yticks = 6 - max_ypos = np.max([ - np.max(results_merged["Distance"].values * args.scale), - max_threshold_dist, - ]) - min_ypos = np.min([ - np.min(results_merged["Distance"].values * args.scale), - max_threshold_dist, - ]) + max_ypos = np.max( + [ + np.max(results_merged["Distance"].values * args.scale), + max_threshold_dist, + ] + ) + min_ypos = np.min( + [ + np.min(results_merged["Distance"].values * args.scale), + max_threshold_dist, + ] + ) ytick_pos = np.linspace(min_ypos * 0.95, max_ypos * 1.05, n_yticks) ytick_labs = [f"{yval:.1e}" for yval in ytick_pos] @@ -102,11 +110,11 @@ def main(args): ax.set_yticklabels(ytick_labs) # change all spines - for axis in ['top','bottom','left','right']: + for axis in ["top", "bottom", "left", "right"]: ax.spines[axis].set_linewidth(1.5) # increase tick width - ax.tick_params(width=2.) + ax.tick_params(width=2.0) sns.despine(ax=ax, top=True, right=True) ax.set_xlabel("Chromosome") @@ -125,8 +133,7 @@ def main(args): y=max_threshold_dist * args.scale, ls=":", c="grey", - label="Genome-wide significance threshold " + - r"$\left(p = 0.05\right)$", + label="Genome-wide significance threshold " + r"$\left(p = 0.05\right)$", lw=2, ) @@ -148,30 +155,27 @@ def main(args): f.savefig(args.out, dpi=300) + if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument( "--results", - help= - "Output of IHD scan, with an entry for every genotyped marker, in CSV format.", + help="Output of IHD scan, with an entry for every genotyped marker, in CSV format.", ) p.add_argument( "--markers", - help= - "File containing metadata (cM position, Mb position, or both) about each genotyped marker.", + help="File containing metadata (cM position, Mb position, or both) about each genotyped marker.", ) p.add_argument( "--out", type=str, default="ihd.png", - help= - "Name of output plot. Default is 'ihd.png'", + help="Name of output plot. Default is 'ihd.png'", ) p.add_argument( "-colname", default="Mb", - help= - "Name of the column in `--markers` that denotes the position you which to plot on the x-axis of the Manhattan plot. Default is 'Mb'", + help="Name of the column in `--markers` that denotes the position you which to plot on the x-axis of the Manhattan plot. Default is 'Mb'", ) p.add_argument( "-chrom", @@ -182,7 +186,7 @@ def main(args): "-scale", type=int, default=1, - help="""Scale the cosine distance values by the specified amount to make visualization a bit easier on the y-axis. Default is 1000.""" + help="""Scale the cosine distance values by the specified amount to make visualization a bit easier on the y-axis. Default is 1000.""", ) args = p.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/ihd/run_ihd_scan.py b/ihd/run_ihd_scan.py index ce065dd..1701e47 100644 --- a/ihd/run_ihd_scan.py +++ b/ihd/run_ihd_scan.py @@ -16,26 +16,26 @@ ) from schema import IHDResultSchema, MutationSchema import numba -from typing import List +import allel import tqdm import matplotlib.pyplot as plt -import seaborn as sns -import statsmodels.api as sm -import scipy.stats as ss -import statsmodels.stats.multitest as mt -import scipy -import allel from scipy.spatial.distance import squareform -def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.DataFrame: + +def filter_mutation_data( + mutations: pd.DataFrame, + geno: pd.DataFrame, +) -> pd.DataFrame: # get unique samples in mutation dataframe - samples = mutations['sample'].unique() + samples = mutations["sample"].unique() # get the overlap between those and the sample names in the genotype data samples_overlap = list(set(samples).intersection(set(geno.columns))) if len(samples_overlap) == 0: - print("""Sorry, no samples in common between mutation data - and genotype matrix. Please ensure sample names are identical.""") + print( + """Sorry, no samples in common between mutation data + and genotype matrix. Please ensure sample names are identical.""" + ) sys.exit() # then subset the genotype and mutation data to include only those samples @@ -43,20 +43,19 @@ def filter_mutation_data(mutations: pd.DataFrame, geno: pd.DataFrame) -> pd.Data cols2use.extend(samples_overlap) geno = geno[cols2use] - mutations_filtered = mutations[mutations['sample'].isin(samples_overlap)] + mutations_filtered = mutations[mutations["sample"].isin(samples_overlap)] return mutations_filtered def main(args): - # read in JSON file with file paths config_dict = None with open(args.config, "rb") as config: config_dict = json.load(config) # read in genotype info - geno = pd.read_csv(config_dict['geno']) - markers = pd.read_csv(config_dict['markers']) + geno = pd.read_csv(config_dict["geno"]) + markers = pd.read_csv(config_dict["markers"]) markers2use = markers[markers["chromosome"] != "X"]["marker"].unique() geno = geno[geno["marker"].isin(markers2use)] @@ -73,52 +72,58 @@ def main(args): k=args.k, cpg=True, ) - print(f"Using {len(samples)} samples and {int(np.sum(spectra))} total mutations.") + print( + f"""Using {len(samples)} samples + and {int(np.sum(spectra))} total mutations.""" + ) strata = np.ones(len(samples)) if args.stratify_column is not None: - sample2strata = dict(zip(mutations_filtered["sample"], mutations_filtered[args.stratify_column])) + sample2strata = dict( + zip( + mutations_filtered["sample"], + mutations_filtered[args.stratify_column], + ) + ) strata = np.array([sample2strata[s] for s in samples]) - callable_kmer_arr = np.ones(spectra.shape) - + callable_kmer_arr = None if args.callable_kmers and args.k == 1: + callable_kmer_arr = np.zeros( + (len(samples), len(mutation_types)), dtype=np.int64 + ) callable_kmers = pd.read_csv(args.callable_kmers) # NOTE: need to check schema of df for si, s in enumerate(samples): for mi, m in enumerate(mutation_types): base_nuc = m.split(">")[0] if m != "CpG>TpG" else "C" callable_k = callable_kmers[ - (callable_kmers["GeneNetwork name"] == s) - & (callable_kmers["nucleotide"] == base_nuc)] - if callable_k["count"].values.shape[0] == 0: - print (s, m) + (callable_kmers["sample"] == s) + & (callable_kmers["nucleotide"] == base_nuc) + ] callable_kmer_arr[si, mi] = callable_k["count"].values[0] - # convert string genotypes to integers based on config definition - replace_dict = config_dict['genotypes'] + replace_dict = config_dict["genotypes"] geno_asint = geno.replace(replace_dict).replace({1: np.nan}) - # measure LD between all pairs of markers - cols = [c for c in geno_asint.columns if c != "marker"] - gn = geno_asint[cols].values - gn[np.isnan(gn)] = -1 - r = allel.rogers_huff_r(gn) - ld = squareform(r ** 2) - np.fill_diagonal(ld, 1.) - # if specified, remove all markers with r2 > X with selected markers if args.adj_marker is not None: + # measure LD between all pairs of markers + cols = [c for c in geno_asint.columns if c != "marker"] + gn = geno_asint[cols].values + gn[np.isnan(gn)] = -1 + r = allel.rogers_huff_r(gn) + ld = squareform(r**2) + np.fill_diagonal(ld, 1.0) marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values _, ld_markers = np.where(ld[marker_idxs] >= 0.1) geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)] - print("Using {} genotypes that meet filtering criteria.".format(geno_asint.shape[0])) # convert genotype values to a matrix geno_asint_filtered_matrix = geno_asint[samples].values # get an array of marker names at the filtered genotyped loci - markers_filtered = geno_asint['marker'].values + markers_filtered = geno_asint["marker"].values # compute similarity between allele frequencies at each marker genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix) @@ -126,8 +131,6 @@ def main(args): if args.distance_method == "chisquare": distance_method = compute_manual_chisquare - covariate_ratios = np.ones(genotype_similarity.shape[0]).reshape(-1, 1) - covariate_cols = [] covariate_matrix = get_covariate_matrix( mutations_filtered, @@ -151,11 +154,13 @@ def main(args): adjust_statistics=True, ) - res_df = pd.DataFrame({ - 'marker': markers_filtered, - 'Distance': focal_dists, - 'k': args.k, - }) + res_df = pd.DataFrame( + { + "marker": markers_filtered, + "Distance": focal_dists, + "k": args.k, + } + ) IHDResultSchema.validate(res_df) numba.set_num_threads(args.threads) @@ -177,7 +182,7 @@ def main(args): # distribution to figure out the distance thresholds for pctile in (20, 5, 1): score_pctile = np.percentile(null_distances, 100 - pctile, axis=0) - res_df[f'{100 - pctile}th_percentile'] = score_pctile + res_df[f"{100 - pctile}th_percentile"] = score_pctile # for each chromosome, compute the specified confidence interval around # the peak observed distance @@ -187,10 +192,10 @@ def main(args): conf_int_chroms = ["4", "6"] for chrom, chrom_df in tqdm.tqdm(geno_asint_filtered_merged.groupby("chromosome")): - if chrom not in conf_int_chroms: continue + if chrom not in conf_int_chroms: + continue chrom_genotype_matrix = chrom_df[samples].values - print (chrom_genotype_matrix.shape) # compute confidence intervals on the chromosome conf_int_lo, conf_int_hi, peak_markers = calculate_confint( spectra, @@ -198,8 +203,8 @@ def main(args): covariate_matrix, distance_method=distance_method, adjust_statistics=True, - conf_int=90., - n_permutations=10_000, + conf_int=90., + n_permutations=1_000, ) conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])] @@ -211,6 +216,7 @@ def main(args): res_df.to_csv(args.out, index=False) + if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument( @@ -237,44 +243,42 @@ def main(args): "-permutations", type=int, default=1_000, - help= - "Number of permutations to perform when calculating significance thresholds. Default is 1,000.", + help="Number of permutations to perform when calculating significance thresholds. Default is 1,000.", ) p.add_argument( "-distance_method", default="cosine", type=str, - help= - """Method to use for calculating distance between aggregate spectra. Options are 'cosine' and 'chisquare', default is 'chisquare'.""", + help="""Method to use for calculating distance between aggregate spectra. Options are 'cosine' and 'chisquare', default is 'chisquare'.""", ) p.add_argument( "-threads", default=1, type=int, - help= - """Number of threads to use during permutation testing step. Default is 1.""", + help="""Number of threads to use during permutation testing step. Default is 1.""", ) p.add_argument( "-progress", action="store_true", - help= - """Whether to output the progress of the permutation testing step (i.e., the number of completed permutations).""", + help="""Whether to output the progress of the permutation testing step (i.e., the number of completed permutations).""", ) p.add_argument( "-callable_kmers", - help="""Path to CSV file containing numbers of callable base pairs in each sample, stratified by nucleotide.""" + default=None, + type=str, + help="""Path to CSV file containing numbers of callable base pairs in each sample, stratified by nucleotide.""", ) p.add_argument( "-stratify_column", default=None, type=str, - help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure.""" + help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure.""", ) p.add_argument( "-adj_marker", default=None, type=str, - help="""Comma-separated list of markers that we should adjust for when performing scan.""" + help="""Comma-separated list of markers that we should adjust for when performing scan.""", ) args = p.parse_args() diff --git a/ihd/schema.py b/ihd/schema.py index df9b2de..38765ad 100644 --- a/ihd/schema.py +++ b/ihd/schema.py @@ -7,7 +7,7 @@ # since we adjust cosine distances and potentially even use chisquare # statistics, we won't apply a strict check to the range of possible # distance values. - "Distance": Column(float), + "Distance": Column(float), "k": Column(int, Check.isin([1, 3])), }) diff --git a/ihd/utils.py b/ihd/utils.py index a8c90a1..6b5a81f 100644 --- a/ihd/utils.py +++ b/ihd/utils.py @@ -57,6 +57,7 @@ def compute_nansum(a: np.ndarray, row: bool = True) -> np.ndarray: empty_a[i] = np.nansum(a[i]) if row else np.nansum(a[:, i]) return empty_a + @numba.njit def compute_allele_frequency(genotype_matrix: np.ndarray) -> np.ndarray: """Given a genotype matrix of size (G, N) where G is the number of @@ -147,7 +148,10 @@ def compute_haplotype_distance( @numba.njit -def shuffle_spectra(spectra: np.ndarray, groups: np.ndarray = None) -> np.ndarray: +def shuffle_spectra( + spectra: np.ndarray, + groups: np.ndarray = None, +) -> np.ndarray: """Randomly shuffle the rows of a 2D numpy array of mutation spectrum data of size (N, M), where N is the number of samples and M is the number of mutation types. Shuffled array @@ -217,12 +221,13 @@ def compute_genotype_similarity(genotype_matrix: np.ndarray) -> np.ndarray: # compute Pearson correlation between allele frequencies af_corr = np.corrcoef(a_afs, b_afs)[0][1] # NOTE: not sure how good of an idea this is, even if it's rare - if np.isnan(af_corr): af_corr = 0 + if np.isnan(af_corr): + af_corr = 0 genotype_sims[ni] = af_corr return genotype_sims -@numba.njit + def adjust_spectra_for_nuccomp( a_spectra: np.ndarray, b_spectra: np.ndarray, @@ -362,9 +367,15 @@ def perform_ihd_scan( focal_dist[ni] = cur_dist if adjust_statistics: - covariate_matrix_full = np.hstack((genotype_similarity.reshape(-1, 1), covariate_ratios)) + covariate_matrix_full = np.hstack( + ( + genotype_similarity.reshape(-1, 1), + covariate_ratios, + ) + ) return compute_residuals(covariate_matrix_full, focal_dist) - else: return focal_dist + else: + return focal_dist def get_covariate_matrix( @@ -393,7 +404,9 @@ def get_covariate_matrix( cols = ["sample"] + covariate_cols # subset pheno information to relevant samples - pheno_sub = pheno[pheno["sample"].isin(samples)].drop_duplicates(cols).set_index("sample") + pheno_sub = ( + pheno[pheno["sample"].isin(samples)].drop_duplicates(cols).set_index("sample") + ) covariate_matrix = pheno_sub.loc[samples][covariate_cols].values.T return covariate_matrix @@ -545,7 +558,8 @@ def perform_permutation_test( null_distances: np.ndarray = np.zeros(n_permutations) for pi in numba.prange(n_permutations): - if pi > 0 and pi % 100 == 0 and progress: print(pi) + if pi > 0 and pi % 100 == 0 and progress: + print(pi) # shuffle the mutation spectra by row shuffled_spectra = shuffle_spectra(spectra, strata) @@ -627,7 +641,8 @@ def calculate_confint( peak_markers: np.ndarray = np.zeros(n_permutations) for pi in numba.prange(n_permutations): - if pi > 0 and pi % 1000 == 0 and progress: print(pi) + if pi > 0 and pi % 1000 == 0 and progress: + print(pi) # resample the mutation spectra by bootstrapping resampled_idxs = np.random.randint( 0, @@ -639,7 +654,9 @@ def calculate_confint( # resample the corresponding genotype data using the indices resampled_genotype_matrix = genotype_matrix[:, resampled_idxs] # recalculate genotype similarities using the resampled genotype matrix. NOTE: this is slow! - resampled_genotype_similarity = compute_genotype_similarity(resampled_genotype_matrix) + resampled_genotype_similarity = compute_genotype_similarity( + resampled_genotype_matrix + ) # resample the covariate matrix to include the bootstrap resampled samples resampled_covariate_matrix = covariate_matrix[:, resampled_idxs] resampled_covariate_ratios = calculate_covariate_by_marker( @@ -660,25 +677,26 @@ def calculate_confint( peak_marker_i = np.argmax(focal_dists) peak_markers[pi] = peak_marker_i - pctile_lo = (100 - conf_int) / 2. + pctile_lo = (100 - conf_int) / 2.0 pctile_hi = conf_int + pctile_lo return ( int(np.percentile(peak_markers, q=pctile_lo)), int(np.percentile(peak_markers, q=pctile_hi)), - peak_markers + peak_markers, ) def find_central_mut(kmer: str, cpg: bool = True) -> str: - orig, new = kmer.split('>') + orig, new = kmer.split(">") fp, tp = orig[0], orig[2] central_orig = orig[1] central_new = new[1] if central_orig == "C" and tp == "G" and central_new == "T": if cpg: return f"{central_orig}p{tp}>{central_new}p{tp}" - else: return f"{central_orig}>{central_new}" + else: + return f"{central_orig}>{central_new}" else: return ">".join([central_orig, central_new]) @@ -722,24 +740,31 @@ def compute_spectra( """ # compute 3-mer spectra - hap_spectra_agg = mutations.groupby(['sample', 'kmer']).agg({ - 'count': sum - }).reset_index() + hap_spectra_agg = ( + mutations.groupby(["sample", "kmer"]).agg({"count": sum}).reset_index() + ) # if 1-mer spectra are desired, compute that if k == 1: # add base mutation type - hap_spectra_agg['base_mut'] = hap_spectra_agg['kmer'].apply( - lambda k: find_central_mut(k, cpg=cpg)) - hap_spectra_agg = hap_spectra_agg.groupby(['sample', 'base_mut']).agg({ - 'count': - sum - }).reset_index() + hap_spectra_agg["base_mut"] = hap_spectra_agg["kmer"].apply( + lambda k: find_central_mut(k, cpg=cpg) + ) + hap_spectra_agg = ( + hap_spectra_agg.groupby(["sample", "base_mut"]) + .agg({"count": sum}) + .reset_index() + ) # get spectra as per-haplotype vectors of mutation counts mut_col = "base_mut" if k == 1 else "kmer" - spectra = hap_spectra_agg.pivot( - index="sample", columns=mut_col).reset_index().fillna(value=0) - samples, mutations, spectra = spectra['sample'].to_list(), [ - el[1] for el in spectra.columns[1:] - ], spectra.values[:, 1:] + spectra = ( + hap_spectra_agg.pivot(index="sample", columns=mut_col) + .reset_index() + .fillna(value=0) + ) + samples, mutations, spectra = ( + spectra["sample"].to_list(), + [el[1] for el in spectra.columns[1:]], + spectra.values[:, 1:], + ) return samples, mutations, spectra.astype(np.float64) diff --git a/scripts/find_sv_overlap.py b/scripts/find_sv_overlap.py index 5a13889..af9830a 100644 --- a/scripts/find_sv_overlap.py +++ b/scripts/find_sv_overlap.py @@ -21,7 +21,7 @@ def main(args): for i, row in refseq.iterrows(): # limit to curated refseq - if not (row["name"].startswith("NM") or row["name"].startswith("NR")): continue + if not (row["name"].startswith("N")): continue exon_starts, exon_ends = row["exonStarts"].split(','), row["exonEnds"].split(',') tx_start, tx_end = int(row["txStart"]), int(row["txEnd"]) tx_start -= 10_000 diff --git a/scripts/plot_bxd_spectra.py b/scripts/plot_bxd_spectra.py index a53a6c0..ac37d81 100644 --- a/scripts/plot_bxd_spectra.py +++ b/scripts/plot_bxd_spectra.py @@ -104,7 +104,7 @@ def main(args): data=spectra_df, x=xval, y=args.phenotype, - hue=hue, + hue=None, ) annotator.configure( test="Mann-Whitney", diff --git a/scripts/rules/run_ihd.smk b/scripts/rules/run_ihd.smk index e39760f..9c2c545 100644 --- a/scripts/rules/run_ihd.smk +++ b/scripts/rules/run_ihd.smk @@ -19,7 +19,8 @@ rule run_ihd: -permutations 10000 \ -stratify_column true_epoch \ -threads 4 \ - -adj_marker {params.adj_marker} + -adj_marker {params.adj_marker} \ + -progress """ rule plot_ihd: