Skip to content

Commit

Permalink
Merge branch 'dev' into ds_flag_duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
addramir authored Oct 25, 2024
2 parents 3189d08 + 85be796 commit 8bacef3
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 24 deletions.
30 changes: 20 additions & 10 deletions src/gentropy/credible_set_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,45 @@ def __init__(
self,
session: Session,
credible_sets_path: str,
study_index_path: str,
ld_index_path: str,
output_path: str,
p_value_threshold: float = 1e-5,
purity_min_r2: float = 0.01,
clump: bool = False,
ld_index_path: str | None = None,
study_index_path: str | None = None,
ld_min_r2: float = 0.8,
) -> None:
"""Run credible set quality control step.
Args:
session (Session): Session object.
credible_sets_path (str): Path to credible sets file.
study_index_path (str): Path to study index file.
ld_index_path (str): Path to LD index file.
output_path (str): Path to write the output file.
p_value_threshold (float): P-value threshold for credible set quality control. Default is 1e-5.
purity_min_r2 (float): Minimum R2 for purity estimation. Default is 0.01.
clump (bool): Whether to clump the credible sets by LD. Default is False.
ld_index_path (str | None): Path to LD index file.
study_index_path (str | None): Path to study index file.
ld_min_r2 (float): Minimum R2 for LD estimation. Default is 0.8.
"""
cred_sets = StudyLocus.from_parquet(session, credible_sets_path)
study_index = StudyIndex.from_parquet(session, study_index_path)
ld_index = LDIndex.from_parquet(session, ld_index_path)

cred_sets = StudyLocus.from_parquet(
session, credible_sets_path, recursiveFileLookup=True
).coalesce(200)
ld_index = (
LDIndex.from_parquet(session, ld_index_path) if ld_index_path else None
)
study_index = (
StudyIndex.from_parquet(session, study_index_path)
if study_index_path
else None
)
cred_sets_clean = SUSIE_inf.credible_set_qc(
cred_sets,
study_index,
ld_index,
p_value_threshold,
purity_min_r2,
clump,
ld_index,
study_index,
ld_min_r2,
)

Expand Down
45 changes: 31 additions & 14 deletions src/gentropy/method/susie_inf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pyspark.sql.functions as f
import scipy.linalg
import scipy.special
from pyspark.sql.window import Window
from scipy.optimize import minimize, minimize_scalar
from scipy.special import logsumexp

Expand Down Expand Up @@ -469,43 +470,59 @@ def cred_inf(
@staticmethod
def credible_set_qc(
    cred_sets: StudyLocus,
    p_value_threshold: float = 1e-5,
    purity_min_r2: float = 0.01,
    clump: bool = False,
    ld_index: LDIndex | None = None,
    study_index: StudyIndex | None = None,
    ld_min_r2: float = 0.8,
) -> StudyLocus:
    """Filter credible sets by lead P-value and min-R2 purity, and optionally LD-clump them.

    In case of duplicated loci, the filtering retains the locus with the highest
    credibleSetLog10BF (one row per studyLocusId survives).

    Args:
        cred_sets (StudyLocus): StudyLocus object with credible sets to filter/clump
        p_value_threshold (float): p-value threshold for filtering credible sets, default is 1e-5
        purity_min_r2 (float): min-R2 purity threshold for filtering credible sets, default is 0.01
        clump (bool): Whether to clump the credible sets by LD, default is False
        ld_index (LDIndex | None): LDIndex object; required only when clump is True
        study_index (StudyIndex | None): StudyIndex object; required only when clump is True
        ld_min_r2 (float): LD R2 threshold for clumping, default is 0.8

    Returns:
        StudyLocus: Credible sets which pass filters and (if requested) LD clumping.
    """
    cred_sets.df = (
        cred_sets.df
        # Reconstruct the scalar p-value from its mantissa/exponent columns.
        .withColumn(
            "pValue", f.col("pValueMantissa") * f.pow(10, f.col("pValueExponent"))
        )
        .filter(f.col("pValue") <= p_value_threshold)
        .filter(f.col("purityMinR2") >= purity_min_r2)
        .drop("pValue")
        # Deduplicate loci: keep only the row with the highest
        # credibleSetLog10BF within each studyLocusId.
        .withColumn(
            "rn",
            f.row_number().over(
                Window.partitionBy("studyLocusId").orderBy(
                    f.desc("credibleSetLog10BF")
                )
            ),
        )
        .filter(f.col("rn") == 1)
        .drop("rn")
    )
    if clump:
        # NOTE: asserts are stripped under `python -O`; callers rely on
        # AssertionError here, so the check is kept as-is.
        assert study_index, "Running in clump mode, which requires study_index."
        assert ld_index, "Running in clump mode, which requires ld_index."
        cred_sets = (
            cred_sets.annotate_ld(study_index, ld_index, ld_min_r2)
            .clump()
            # Drop loci flagged by clumping as explained by a stronger
            # variant in high LD.
            .filter(
                ~f.array_contains(
                    f.col("qualityControls"),
                    "Explained by a more significant variant in high LD (clumped)",
                )
            )
        )

    return cred_sets

0 comments on commit 8bacef3

Please sign in to comment.