
Commit

add all remaining files
olivierlabayle committed May 23, 2024
1 parent c445986 commit 93eb68c
Showing 18 changed files with 480 additions and 277 deletions.
2 changes: 1 addition & 1 deletion src/Simulations.jl
@@ -22,6 +22,7 @@ using Serialization
using Makie
using CSV
using CairoMakie
using TargeneCore

include("utils.jl")

@@ -34,7 +35,6 @@ include(joinpath("samplers", "density_estimate_sampler.jl"))

include(joinpath("inputs_from_gene_atlas.jl"))
include("estimation.jl")
include("analysis.jl")
include("cli.jl")

export NullSampler, DensityEstimateSampler
58 changes: 34 additions & 24 deletions src/cli.jl
@@ -10,10 +10,6 @@ function cli_settings()
action = :command
help = "Aggregate multiple results file created by estimation procedures."

"density-estimation-inputs"
action = :command
help = "Generates density estimation inputs."

"density-estimation"
action = :command
help = "Estimate a conditional density."
@@ -120,25 +116,6 @@ function cli_settings()
default = mktempdir()
end

@add_arg_table! s["density-estimation-inputs"] begin
"dataset"
arg_type = String
help = "Path to the dataset (either .csv or .arrow)"

"estimands-prefix"
arg_type = String
help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)"

"--output-prefix"
arg_type = String
default = "de_inputs"
help = "Output JSON file."
"--batchsize"
arg_type = Int
default = 10
help = "Estimands are batched to optimize speed by //"
end

@add_arg_table! s["density-estimation"] begin
"dataset"
arg_type = String
@@ -175,6 +152,18 @@
arg_type = String
help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)"

"bgen-prefix"
arg_type = String
help = "A prefix to imputed genotypes (BGEN format)"

"traits"
arg_type = String
help = "The dataset containing phenotypes."

"pcs"
arg_type = String
help = "The dataset of principal components."

"--ga-download-dir"
arg_type = String
default = "gene_atlas_data"
@@ -209,6 +198,21 @@
arg_type = Int
default = 100
help = "Maximum variants retrieved per trait."

"--positivity-constraint"
arg_type = Float64
default = 0.0
help = "Minimum frequency of a treatment."

"--call-threshold"
arg_type = Float64
default = 0.9
help = "If no genotype as at least this probability it is considered missing."

"--verbosity"
arg_type = Int
default = 0
help = "Verbosity level."

"--output-prefix"
arg_type = String
@@ -248,13 +252,19 @@ function julia_main()::Cint
save_aggregated_df_results(cmd_settings["input-prefix"], cmd_settings["out"])
elseif cmd == "simulation-inputs-from-ga"
simulation_inputs_from_gene_atlas(
cmd_settings["estimands-prefix"];
cmd_settings["estimands-prefix"],
cmd_settings["bgen-prefix"],
cmd_settings["traits"],
cmd_settings["pcs"];
gene_atlas_dir=cmd_settings["ga-download-dir"],
remove_ga_data=cmd_settings["remove-ga-data"],
trait_table_path=cmd_settings["ga-trait-table"],
maf_threshold=cmd_settings["maf-threshold"],
pvalue_threshold=cmd_settings["pvalue-threshold"],
distance_threshold=cmd_settings["distance-threshold"],
positivity_constraint=cmd_settings["positivity-constraint"],
call_threshold=cmd_settings["call-threshold"],
verbosity=cmd_settings["verbosity"],
output_prefix=cmd_settings["output-prefix"],
batchsize=cmd_settings["batchsize"],
max_variants=cmd_settings["max-variants"]
216 changes: 177 additions & 39 deletions src/inputs_from_gene_atlas.jl
@@ -171,21 +171,17 @@ function group_by_outcome(estimands)
return groups
end

function simulation_inputs_from_gene_atlas(estimands_prefix;
function get_trait_to_variants(estimands;
verbosity=0,
gene_atlas_dir="gene_atlas_data",
remove_ga_data=true,
trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
maf_threshold=0.01,
pvalue_threshold=1e-5,
distance_threshold=1e6,
max_variants=100,
output_prefix="ga_sim_input",
batchsize=10
)
estimands = reduce(
vcat,
TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix)
)
verbosity > 0 && @info("Retrieve significant variants for each outcome.")
# Retrieve traits and variants from estimands
trait_to_variants = get_trait_to_variants_from_estimands(estimands)
# Retrieve Trait to geneAtlas key map
@@ -199,44 +195,186 @@
distance_threshold=distance_threshold,
max_variants=max_variants
)
return trait_to_variants
end
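
For orientation, the mapping returned by `get_trait_to_variants` associates each outcome with the variants selected for it from the geneATLAS. A minimal sketch of the expected shape, with hypothetical trait names and rsIDs (not real geneATLAS results):

# Hypothetical illustration only; trait names and rsIDs are placeholders.
trait_to_variants = Dict(
    "trait_1" => ["rs12345", "rs67890"],
    "trait_2" => ["rs11111"]
)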

# Write all variants for dataset extraction
unique_variants = unique(vcat(values(trait_to_variants)...))
open(string(output_prefix, "_variants.txt"), "w") do io
for v in unique_variants
write(io, string(v, "\n"))
end
end
# Group and Write Estimands
batch_index = 1
for (outcome, estimands_group) ∈ group_by_outcome(estimands)
# Optimize order within a group and write estimands
estimands_group = groups_ordering(estimands_group)
for batch in Iterators.partition(estimands_group, batchsize)
batch_filename = string(output_prefix, "_estimands_", batch_index, ".jls")
serialize(batch_filename, TMLE.Configuration(estimands=batch))
batch_index += 1
end
function get_dataset_and_validated_estimands(
estimands,
bgen_prefix,
traits_file,
pcs_file,
trait_to_variants;
call_threshold=0.9,
positivity_constraint=0.,
verbosity=0
)
verbosity > 0 && @info("Calling genotypes.")
variants_set = Set(string.(vcat(values(trait_to_variants)...)))

genotypes = TargeneCore.call_genotypes(
bgen_prefix,
variants_set,
call_threshold
)
## Read PCs and traits
traits = TargeneCore.read_csv_file(traits_file)
pcs = TargeneCore.read_csv_file(pcs_file)
## Merge all together
dataset = TargeneCore.merge(traits, pcs, genotypes)

# Validate estimands
verbosity > 0 && @info("Validating estimands.")
variables = TargeneCore.get_variables(estimands, traits, pcs)
estimands = TargeneCore.adjusted_estimands(
estimands, variables, dataset;
positivity_constraint=positivity_constraint
)
return dataset, estimands
end

all_treatments(Ψ) = keys(Ψ.treatment_values)
all_treatments(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_treatments(arg) for arg ∈ Ψ.args)...))

all_outcome_extra_covariates(Ψ) = Ψ.outcome_extra_covariates
all_outcome_extra_covariates(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_outcome_extra_covariates(arg) for arg ∈ Ψ.args)...))

all_confounders(Ψ) = Tuple(union(values(Ψ.treatment_confounders)...))
all_confounders(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_confounders(arg) for arg ∈ Ψ.args)...))

function write_densities(output_prefix, trait_to_variants, estimands)
outcome_parents = Dict(outcome => Set(variants) for (outcome, variants) in trait_to_variants)
for Ψ ∈ estimands
outcome = get_outcome(Ψ)
new_parents = string.(union(
all_outcome_extra_covariates(Ψ),
all_treatments(Ψ),
all_confounders(Ψ)
))
union!(
outcome_parents[string(outcome)],
new_parents
)
end
# Write densities
density_index = 1
confounders = get_confounders_assert_equal(estimands)
covariates = get_covariates_assert_equal(estimands)
## Outcome densities
for (outcome, variants) in trait_to_variants
parents = vcat(variants, confounders..., covariates...)
open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io
JSON.print(io, Dict("outcome" => outcome, "parents" => parents), 1)
end
density_index += 1
end
## Propensity scores
for variant in unique_variants
open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io
JSON.print(io, Dict("outcome" => variant, "parents" => confounders), 1)
for (outcome, parents) in outcome_parents
open(string(output_prefix, ".conditional_density_", density_index, ".json"), "w") do io
JSON.print(io, Dict("outcome" => outcome, "parents" => collect(parents)), 1)
end
density_index += 1
end
end
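
As a rough illustration of what `write_densities` emits, here is a hedged sketch of one conditional-density file, using the default `output_prefix` and placeholder outcome/parent names:

# Hypothetical example mirroring the loop above; outcome and parent names are placeholders.
using JSON

open("ga_sim_input.conditional_density_1.json", "w") do io
    JSON.print(io, Dict("outcome" => "trait_1", "parents" => ["rs12345", "PC1", "PC2"]), 1)
end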

function write_ga_simulation_inputs(
output_prefix,
dataset,
estimands,
trait_to_variants;
batchsize=10,
verbosity=0
)
verbosity > 0 && @info("Writing outputs.")
# Writing estimands and dataset
TargeneCore.write_tl_inputs(output_prefix, dataset, estimands; batch_size=batchsize)
# Writing densities
write_densities(output_prefix, trait_to_variants, estimands)
end

"""
simulation_inputs_from_gene_atlas(
estimands_prefix,
bgen_prefix,
traits_file,
pcs_file;
gene_atlas_dir="gene_atlas_data",
remove_ga_data=true,
trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
maf_threshold=0.01,
pvalue_threshold=1e-5,
distance_threshold=1e6,
max_variants=100,
output_prefix="ga_sim_input",
batchsize=10,
positivity_constraint=0,
call_threshold=0.9,
verbosity=0,
)

This function generates input files for density-estimation-based simulations using
variants identified from the geneATLAS.

## How variants are selected from the geneATLAS

Association data is downloaded to `gene_atlas_dir` and deleted afterwards
if `remove_ga_data` is true. For each outcome, variants are selected if:
- They pass the significance threshold `pvalue_threshold`.
- They are at least `distance_threshold` away from the other selected variants.
- Their minor allele frequency is at least `maf_threshold`.
At most `max_variants` variants are retained per outcome.

## What files are generated

The generated input files, prefixed by `output_prefix`, are:
- A dataset: combines `pcs_file`, `traits_file` and genotypes called from `bgen_prefix` at `call_threshold`.
- Validated estimands: estimands from `estimands_prefix` are checked against the dataset's variables and the
`positivity_constraint`, then written in batches of size `batchsize`.
- Conditional densities: the densities to be learned in order to simulate new data for the estimands of interest.
"""
function simulation_inputs_from_gene_atlas(
estimands_prefix,
bgen_prefix,
traits_file,
pcs_file;
gene_atlas_dir="gene_atlas_data",
remove_ga_data=true,
trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
maf_threshold=0.01,
pvalue_threshold=1e-5,
distance_threshold=1e6,
max_variants=100,
output_prefix="ga_sim_input",
batchsize=10,
positivity_constraint=0,
call_threshold=0.9,
verbosity=0,
)
estimands = reduce(
vcat,
TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix)
)
# Trait to variants from geneATLAS
trait_to_variants = get_trait_to_variants(estimands;
verbosity=verbosity,
gene_atlas_dir=gene_atlas_dir,
remove_ga_data=remove_ga_data,
trait_table_path=trait_table_path,
maf_threshold=maf_threshold,
pvalue_threshold=pvalue_threshold,
distance_threshold=distance_threshold,
max_variants=max_variants,
)
# Dataset and validated estimands
dataset, estimands = get_dataset_and_validated_estimands(
estimands,
bgen_prefix,
traits_file,
pcs_file,
trait_to_variants;
call_threshold=call_threshold,
positivity_constraint=positivity_constraint,
verbosity=verbosity
)
# Write outputs
write_ga_simulation_inputs(
output_prefix,
dataset,
estimands,
trait_to_variants;
batchsize=batchsize,
verbosity=verbosity
)
verbosity > 0 && @info("Done.")
return 0
end
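
A hedged usage sketch of the new entry point; all file paths below are hypothetical placeholders, and the keyword values are shown purely for illustration:

# Hypothetical invocation; the prefixes must point to existing estimand
# configurations, BGEN files and CSV datasets.
simulation_inputs_from_gene_atlas(
    "estimands/batch",   # prefix to serialized TMLE.Configuration files
    "bgen/ukb_chr",      # prefix to imputed genotypes in BGEN format
    "data/traits.csv",   # phenotypes dataset
    "data/pcs.csv";      # principal components dataset
    output_prefix="ga_sim_input",
    batchsize=10,
    call_threshold=0.9,
    positivity_constraint=0.01,
    verbosity=1
)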
