From 93eb68c62912386b20d8d1ee075bd1b72df8a850 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Thu, 23 May 2024 15:37:55 +0100 Subject: [PATCH] add all remaining files --- src/Simulations.jl | 2 +- src/cli.jl | 58 +++--- src/inputs_from_gene_atlas.jl | 216 +++++++++++++++++---- src/samplers/density_estimate_sampler.jl | 105 ++++------ src/samplers/null_sampler.jl | 45 +---- test/assets/conditional_density_Ybin.json | 8 + test/assets/conditional_density_Ycont.json | 6 + test/assets/conditional_density_x_y.json | 6 + test/assets/dataset.arrow | Bin 0 -> 6946 bytes test/assets/density_estimators.jl | 8 + test/assets/estimands/estimands_ates.jls | Bin 0 -> 569 bytes test/assets/estimands/estimands_iates.jls | Bin 0 -> 320 bytes test/inputs_from_gene_atlas.jl | 193 ++++++++++++------ test/null_simulation.jl | 2 +- test/runtests.jl | 1 - test/samplers/density_estimate_sampler.jl | 69 +++++-- test/samplers/null_sampler.jl | 16 -- test/testutils.jl | 22 ++- 18 files changed, 480 insertions(+), 277 deletions(-) create mode 100644 test/assets/conditional_density_Ybin.json create mode 100644 test/assets/conditional_density_Ycont.json create mode 100644 test/assets/conditional_density_x_y.json create mode 100644 test/assets/dataset.arrow create mode 100644 test/assets/density_estimators.jl create mode 100644 test/assets/estimands/estimands_ates.jls create mode 100644 test/assets/estimands/estimands_iates.jls diff --git a/src/Simulations.jl b/src/Simulations.jl index 4204bea..9af7c21 100644 --- a/src/Simulations.jl +++ b/src/Simulations.jl @@ -22,6 +22,7 @@ using Serialization using Makie using CSV using CairoMakie +using TargeneCore include("utils.jl") @@ -34,7 +35,6 @@ include(joinpath("samplers", "density_estimate_sampler.jl")) include(joinpath("inputs_from_gene_atlas.jl")) include("estimation.jl") -include("analysis.jl") include("cli.jl") export NullSampler, DensityEstimateSampler diff --git a/src/cli.jl b/src/cli.jl index 8ce3d73..87ddafe 100644 --- a/src/cli.jl +++ b/src/cli.jl @@ -10,10 +10,6 @@ function cli_settings() action = :command help = "Aggregate multiple results file created by estimation procedures." - "density-estimation-inputs" - action = :command - help = "Generates density estimation inputs." - "density-estimation" action = :command help = "Estimate a conditional density." @@ -120,25 +116,6 @@ function cli_settings() default = mktempdir() end - @add_arg_table! s["density-estimation-inputs"] begin - "dataset" - arg_type = String - help = "Path to the dataset (either .csv or .arrow)" - - "estimands-prefix" - arg_type = String - help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)" - - "--output-prefix" - arg_type = String - default = "de_inputs" - help = "Output JSON file." - "--batchsize" - arg_type = Int - default = 10 - help = "Estimands are batched to optimize speed by //" - end - @add_arg_table! s["density-estimation"] begin "dataset" arg_type = String @@ -175,6 +152,18 @@ function cli_settings() arg_type = String help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)" + "bgen-prefix" + arg_type = String + help = "A prefix to imputed genotypes (BGEN format)" + + "traits" + arg_type = String + help = "The dataset containing phenotypes." + + "pcs" + arg_type = String + help = "The dataset of principal components." + "--ga-download-dir" arg_type = String default = "gene_atlas_data" @@ -209,6 +198,21 @@ function cli_settings() arg_type = Int default = 100 help = "Maximum variants retrieved per trait." + + "--positivity-constraint" + arg_type = Float64 + default = 0.0 + help = "Minimum frequency of a treatment." + + "--call-threshold" + arg_type = Float64 + default = 0.9 + help = "If no genotype as at least this probability it is considered missing." + + "--verbosity" + arg_type = Int + default = 0 + help = "Verbosity level." "--output-prefix" arg_type = String @@ -248,13 +252,19 @@ function julia_main()::Cint save_aggregated_df_results(cmd_settings["input-prefix"], cmd_settings["out"]) elseif cmd == "simulation-inputs-from-ga" simulation_inputs_from_gene_atlas( - cmd_settings["estimands-prefix"]; + cmd_settings["estimands-prefix"], + cmd_settings["bgen-prefix"], + cmd_settings["traits"], + cmd_settings["pcs"]; gene_atlas_dir=cmd_settings["ga-download-dir"], remove_ga_data=cmd_settings["remove-ga-data"], trait_table_path=cmd_settings["ga-trait-table"], maf_threshold=cmd_settings["maf-threshold"], pvalue_threshold=cmd_settings["pvalue-threshold"], distance_threshold=cmd_settings["distance-threshold"], + positivity_constraint=cmd_settings["positivity-constraint"], + call_threshold=cmd_settings["call-threshold"], + verbosity=cmd_settings["verbosity"], output_prefix=cmd_settings["output-prefix"], batchsize=cmd_settings["batchsize"], max_variants=cmd_settings["max-variants"] diff --git a/src/inputs_from_gene_atlas.jl b/src/inputs_from_gene_atlas.jl index 703aee8..7a52f28 100644 --- a/src/inputs_from_gene_atlas.jl +++ b/src/inputs_from_gene_atlas.jl @@ -171,7 +171,8 @@ function group_by_outcome(estimands) return groups end -function simulation_inputs_from_gene_atlas(estimands_prefix; +function get_trait_to_variants(estimands; + verbosity=0, gene_atlas_dir="gene_atlas_data", remove_ga_data=true, trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"), @@ -179,13 +180,8 @@ function simulation_inputs_from_gene_atlas(estimands_prefix; pvalue_threshold=1e-5, distance_threshold=1e6, max_variants=100, - output_prefix="ga_sim_input", - batchsize=10 - ) - estimands = reduce( - vcat, - TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix) ) + verbosity > 0 && @info("Retrieve significant variants for each outcome.") # Retrieve traits and variants from estimands trait_to_variants = get_trait_to_variants_from_estimands(estimands) # Retrieve Trait to geneAtlas key map @@ -199,44 +195,186 @@ function simulation_inputs_from_gene_atlas(estimands_prefix; distance_threshold=distance_threshold, max_variants=max_variants ) + return trait_to_variants +end - # Write all variants for dataset extraction - unique_variants = unique(vcat(values(trait_to_variants)...)) - open(string(output_prefix, "_variants.txt"), "w") do io - for v in unique_variants - write(io, string(v, "\n")) - end - end - # Group and Write Estimands - batch_index = 1 - for (outcome, estimands_group) ∈ group_by_outcome(estimands) - # Optimize order within a group and write estimands - estimands_group = groups_ordering(estimands_group) - for batch in Iterators.partition(estimands_group, batchsize) - batch_filename = string(output_prefix, "_estimands_", batch_index, ".jls") - serialize(batch_filename, TMLE.Configuration(estimands=batch)) - batch_index += 1 - end +function get_dataset_and_validated_estimands( + estimands, + bgen_prefix, + traits_file, + pcs_file, + trait_to_variants; + call_threshold=0.9, + positivity_constraint=0., + verbosity=0 + ) + verbosity > 0 && @info("Calling genotypes.") + variants_set = Set(string.(vcat(values(trait_to_variants)...))) + + genotypes = TargeneCore.call_genotypes( + bgen_prefix, + variants_set, + call_threshold + ) + ## Read PCs and traits + traits = TargeneCore.read_csv_file(traits_file) + pcs = TargeneCore.read_csv_file(pcs_file) + ## Merge all together + dataset = TargeneCore.merge(traits, pcs, genotypes) + + # Validate Estimand + verbosity > 0 && @info("Validating estimands.") + variables = TargeneCore.get_variables(estimands, traits, pcs) + estimands = TargeneCore.adjusted_estimands( + estimands, variables, dataset; + positivity_constraint=positivity_constraint + ) + return dataset, estimands +end + +all_treatments(Ψ) = keys(Ψ.treatment_values) +all_treatments(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_treatments(arg) for arg ∈ Ψ.args)...)) + +all_outcome_extra_covariates(Ψ) = Ψ.outcome_extra_covariates +all_outcome_extra_covariates(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_outcome_extra_covariates(arg) for arg ∈ Ψ.args)...)) + +all_confounders(Ψ) = Tuple(union(values(Ψ.treatment_confounders)...)) +all_confounders(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_confounders(arg) for arg ∈ Ψ.args)...)) + +function write_densities(output_prefix, trait_to_variants, estimands) + outcome_parents = Dict(outcome => Set(variants) for (outcome, variants) in trait_to_variants) + for Ψ ∈ estimands + outcome = get_outcome(Ψ) + new_parents = string.(union( + all_outcome_extra_covariates(Ψ), + all_treatments(Ψ), + all_confounders(Ψ) + )) + union!( + outcome_parents[string(outcome)], + new_parents + ) end - # Write densities density_index = 1 - confounders = get_confounders_assert_equal(estimands) - covariates = get_covariates_assert_equal(estimands) - ## Outcome densities - for (outcome, variants) in trait_to_variants - parents = vcat(variants, confounders..., covariates...) - open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io - JSON.print(io, Dict("outcome" => outcome, "parents" => parents), 1) - end - density_index += 1 - end - ## Propensity scores - for variant in unique_variants - open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io - JSON.print(io, Dict("outcome" => variant, "parents" => confounders), 1) + for (outcome, parents) in outcome_parents + open(string(output_prefix, ".conditional_density_", density_index, ".json"), "w") do io + JSON.print(io, Dict("outcome" => outcome, "parents" => collect(parents)), 1) end density_index += 1 end +end + +function write_ga_simulation_inputs( + output_prefix, + dataset, + estimands, + trait_to_variants; + batchsize=10, + verbosity=0 + ) + verbosity > 0 && @info("Writing outputs.") + # Writing estimands and dataset + TargeneCore.write_tl_inputs(output_prefix, dataset, estimands; batch_size=batchsize) + # Writing densities + write_densities(output_prefix, trait_to_variants, estimands) +end +""" + simulation_inputs_from_gene_atlas( + estimands_prefix, + bgen_prefix, + traits_file, + pcs_file; + gene_atlas_dir="gene_atlas_data", + remove_ga_data=true, + trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"), + maf_threshold=0.01, + pvalue_threshold=1e-5, + distance_threshold=1e6, + max_variants=100, + output_prefix="ga_sim_input", + batchsize=10, + positivity_constraint=0, + call_threshold=0.9, + verbosity=0, + ) + +This function generates input files for density estimates based simulations using +variants identified from the geneATLAS. + +## How are variants selected from the geneATLAS + +Association data is downloaded to `gene_atlas_dir` and cleaned afterwards +if `remove_ga_data`. For each outcome, variants are selected if: + +- They pass a significance threshold: `pvalue_threshold`. +- They are at least `distance_threshold` away from other variants. +- They are frequent: `maf_threshold`. + +Finally, a maximum of `max_variants` is retained per outcome. + +## What files are Generated + +The Generated input files, prefixed by `output_prefix`, are: + +- A dataset: Combines `pcs_file`, `traits_file` and called genotypes from `bgen_prefix` at `call_threshold`. +- Validated estimands: Estimands from `estimands_prefix` are validated. We makes sure the estimands match +the data in the dataset and pass the `positivity_constraint`. They are then written in batches of size `batchsize`. +- Conditional Densities: To be learnt to simulate new data for the estimands of interest. +""" +function simulation_inputs_from_gene_atlas( + estimands_prefix, + bgen_prefix, + traits_file, + pcs_file; + gene_atlas_dir="gene_atlas_data", + remove_ga_data=true, + trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"), + maf_threshold=0.01, + pvalue_threshold=1e-5, + distance_threshold=1e6, + max_variants=100, + output_prefix="ga_sim_input", + batchsize=10, + positivity_constraint=0, + call_threshold=0.9, + verbosity=0, + ) + estimands = reduce( + vcat, + TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix) + ) + # Trait to variants from geneATLAS + trait_to_variants = get_trait_to_variants(estimands; + verbosity=verbosity, + gene_atlas_dir=gene_atlas_dir, + remove_ga_data=remove_ga_data, + trait_table_path=trait_table_path, + maf_threshold=maf_threshold, + pvalue_threshold=pvalue_threshold, + distance_threshold=distance_threshold, + max_variants=max_variants, + ) + # Dataset and validated estimands + dataset, estimands = get_dataset_and_validated_estimands( + estimands, + bgen_prefix, + traits_file, + pcs_file, + trait_to_variants; + call_threshold=call_threshold, + positivity_constraint=positivity_constraint, + verbosity=verbosity + ) + # Write outputs + write_ga_simulation_inputs( + output_prefix, + dataset, + estimands, + trait_to_variants; + batchsize=batchsize, + verbosity=verbosity + ) + verbosity > 0 && @info("Done.") return 0 end \ No newline at end of file diff --git a/src/samplers/density_estimate_sampler.jl b/src/samplers/density_estimate_sampler.jl index 77bb0f4..36deb18 100644 --- a/src/samplers/density_estimate_sampler.jl +++ b/src/samplers/density_estimate_sampler.jl @@ -2,90 +2,55 @@ sieve_neural_net_density_estimator(file::AbstractString) = jldopen(io -> restore!(io["sieve-neural-net"]), file) struct DensityEstimateSampler - sources::Vector - treatment_density_mapping::Dict - outcome_density_mapping::Dict + density_mapping::Dict + all_parents_set::Vector{Symbol} + variables_required_for_estimation::Vector{Symbol} end +get_outcomes_set(estimands) = Set(get_outcome(Ψ) for Ψ in estimands) + function DensityEstimateSampler(prefix, estimands) # Create density to file map (There could be more files than actually required) + outcomes_set = get_outcomes_set(estimands) + all_parents_set = Set{Symbol}() density_mapping = Dict() for f ∈ files_matching_prefix(prefix) jldopen(f) do io - density_mapping[(Symbol(io["outcome"]) => Tuple(Symbol.(io["parents"])))] = f + outcome = Symbol(io["outcome"]) + if outcome ∈ outcomes_set + parents = Symbol.(io["parents"]) + density_mapping[outcome] = (parents, f) + union!(all_parents_set, parents) + end end end - # Create required density to file map - required_densities = Dict() - all_parents = Set([]) - all_outcomes = Set([]) - for Ψ in estimands - for η in TMLE.nuisance_functions_iterator(Ψ) - required_densities[η.outcome => η.parents] = density_mapping[η.outcome => η.parents] - union!(all_parents, η.parents) - push!(all_outcomes, η.outcome) - end + variables_required_for_estimation = Set{Symbol}() + for Ψ ∈ estimands + pre_outcome_variables = union( + all_outcome_extra_covariates(Ψ), + all_treatments(Ψ), + all_confounders(Ψ), + ) + union!( + variables_required_for_estimation, + pre_outcome_variables + ) + push!(variables_required_for_estimation, get_outcome(Ψ)) end - # A source is never an outcome - sources = [x for x in all_parents if x ∉ all_outcomes] - # A treatment node is both an outcome and a parent - treatment_density_mapping = Dict(cd => f for (cd, f) in required_densities if (cd[1] ∈ all_parents && cd[1] ∈ all_outcomes)) - # An outcome node is never a parent - outcome_density_mapping = Dict(cd => f for (cd, f) in required_densities if cd[1] ∉ all_parents) - return DensityEstimateSampler(sources, treatment_density_mapping, outcome_density_mapping) + + return DensityEstimateSampler(density_mapping, collect(all_parents_set), collect(variables_required_for_estimation)) end function sample_from(sampler::DensityEstimateSampler, origin_dataset; n=100) - sampled_dataset = sample_from(origin_dataset, sampler.sources; n=n) - - for density_mapping in (sampler.treatment_density_mapping, sampler.outcome_density_mapping) - for ((outcome, parents), file) in density_mapping - conditional_density_estimate = Simulations.sieve_neural_net_density_estimator(file) - sampled_dataset[!, outcome] = sample_from( - conditional_density_estimate, - sampled_dataset[!, collect(parents)] - ) - end - end - - return sampled_dataset -end + sampled_dataset = sample_from(origin_dataset, sampler.all_parents_set; n=n) -function counterfactual_aggregate(Ψ, Q, X) - Ttemplate = TMLE.selectcols(X, TMLE.treatments(Ψ)) - n = nrow(Ttemplate) - ctf_agg = zeros(n) - # Loop over Treatment settings - for (vals, sign) in TMLE.indicator_fns(Ψ) - # Counterfactual dataset for a given treatment setting - T_ct = TMLE.counterfactualTreatment(vals, Ttemplate) - X_ct = merge(X, T_ct) - # Counterfactual mean - ctf_agg .+= sign .* TMLE.expected_value(Q, X_ct) + for (outcome, (parents, file)) in sampler.density_mapping + conditional_density_estimate = Simulations.sieve_neural_net_density_estimator(file) + sampled_dataset[!, outcome] = sample_from( + conditional_density_estimate, + sampled_dataset[!, parents] + ) end - return ctf_agg -end - -function monte_carlo_effect(Ψ, sampler, dataset) - outcome_mean = TMLE.outcome_mean(Ψ) - Q = Simulations.sieve_neural_net_density_estimator(sampler.outcome_density_mapping[outcome_mean.outcome => outcome_mean.parents]) - X = TMLE.selectcols(dataset, outcome_mean.parents) - labels = Simulations.getlabels(dataset[!, outcome_mean.outcome]) - ctf_agg = counterfactual_aggregate(Ψ, Q, X) - return mean(ctf_agg) + return sampled_dataset[!, sampler.variables_required_for_estimation] end - -function true_effect(Ψ, sampler::DensityEstimateSampler, origin_dataset; n=500_000) - sampled_dataset = sample_from(sampler, origin_dataset; n=n) - return monte_carlo_effect(Ψ, sampler, sampled_dataset) -end - -function true_effect(Ψ::ComposedEstimand, sampler::DensityEstimateSampler, origin_dataset;n=500_000) - sampled_dataset = sample_from(sampler, origin_dataset; n=n) - effect = zeros(length(Ψ.args)) - for (index, arg) in enumerate(Ψ.args) - effect[index] = monte_carlo_effect(arg, sampler, sampled_dataset) - end - return effect -end \ No newline at end of file diff --git a/src/samplers/null_sampler.jl b/src/samplers/null_sampler.jl index 64dc56c..ecd9359 100644 --- a/src/samplers/null_sampler.jl +++ b/src/samplers/null_sampler.jl @@ -37,47 +37,4 @@ function sample_from(sampler::NullSampler, origin_dataset; n=100) sampled_dataset[!, variable] = origin_dataset[sample_mask, variable] end return sampled_dataset -end - -theoretical_true_effect(Ψ, sampler::NullSampler) = 0. -theoretical_true_effect(Ψ::ComposedEstimand, sampler::NullSampler) = [0. for _ in Ψ.args] - -""" -For this generating process, Y is independent of both T and W. - -This effect computation: - - Discards the dependence on W. - - Uses the "dependence" on T. - -Not sure if this is reasonnable. - -Mathematically, this is saying: - -E[Y|do(T=t)] = E[Y|T=t] (= E[Y], in principle but not used) -""" -function confounders_independent_effect(dataset, Ψ) - indicator_fns = TMLE.indicator_fns(Ψ) - effect = 0. - for (key, group) ∈ pairs(groupby(dataset, TMLE.treatments(Ψ))) - if haskey(indicator_fns, values(key)) - effect += mean(group[!, Ψ.outcome]) * indicator_fns[values(key)] - end - end - return effect -end - -function empirical_true_effect(Ψ, sampler::NullSampler, origin_dataset; n=500_000) - sampled_dataset = sample_from(sampler, origin_dataset; n=n) - return confounders_independent_effect(sampled_dataset, Ψ) -end - -function empirical_true_effect(Ψ::ComposedEstimand, sampler::NullSampler, origin_dataset; n=500_000) - sampled_dataset = sample_from(sampler, origin_dataset; n=n) - effect = zeros(length(Ψ.args)) - for (index, arg) in enumerate(Ψ.args) - effect[index] = confounders_independent_effect(sampled_dataset, arg) - end - return effect -end - -true_effect(Ψ, sampler::NullSampler, origin_dataset; n=500_000) = theoretical_true_effect(Ψ, sampler) \ No newline at end of file +end \ No newline at end of file diff --git a/test/assets/conditional_density_Ybin.json b/test/assets/conditional_density_Ybin.json new file mode 100644 index 0000000..de8e2e2 --- /dev/null +++ b/test/assets/conditional_density_Ybin.json @@ -0,0 +1,8 @@ +{ + "parents": [ + "C", + "T₁", + "W" + ], + "outcome": "Ybin" + } \ No newline at end of file diff --git a/test/assets/conditional_density_Ycont.json b/test/assets/conditional_density_Ycont.json new file mode 100644 index 0000000..4cbedec --- /dev/null +++ b/test/assets/conditional_density_Ycont.json @@ -0,0 +1,6 @@ +{ + "parents": [ + "W", "T₁", "T₂", "C" + ], + "outcome": "Ycont" + } \ No newline at end of file diff --git a/test/assets/conditional_density_x_y.json b/test/assets/conditional_density_x_y.json new file mode 100644 index 0000000..6a40faf --- /dev/null +++ b/test/assets/conditional_density_x_y.json @@ -0,0 +1,6 @@ +{ + "parents": [ + "y" + ], + "outcome": "x" + } \ No newline at end of file diff --git a/test/assets/dataset.arrow b/test/assets/dataset.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f1e55b04002e1e66a53b5251f008e165155dc78d GIT binary patch literal 6946 zcmeHMdpK3;8eeGoLySqvlR(&ZDMK zuN@d16xJKA$s`o1bKx+idQre&QwMQsS~=`--|a>aid6iM{B9rlg>|j$op)0(ROdIp z*oQs@@%50ONsJnWD3?S^peqd)8^zK-F3V#F{~25N^SsVUqi;W~u* zmei!m4c#xir?P}HRZ)Rj9gCU-QKdv3s4=XIQBi5Crf}`U_f!}cP}5Op`Y=Z0lRrRp zvEd*_Y{48Odw8K(CNRhPW~pDLp%cup?jv0MM+0-LM~wMMq)su%I;rq`9doP;RpIaL z$JZ^4{ngybhc6SYRSr8-X@WEVO7-pRoXJ#9y~?PGtjgOuMc`aV?x+^#^5!H$bGT-1?pqX(+W^M>lLy z0mMSHz_6hjsvb)AM_CmkGsU8 zW+^(K?C-fCs|C#rN*sG~b}bsanqzin1HooCC^9%@1o=Q?e5+T_dt}tCFaRq7J@@{n zj;ur%A|oH^sHt+{uD_1Us@3YaHF%va|rH3*MexdY~Q*lBL4$K4^i7 zw*?W&_k-beW_o~qt^iCKm#S>_%22zPr^H>GIC!Y>^%JM0YS=)yxy0RWMsLoJZ{9o5 z1CM1S9v{9JgAA37stX(5!;Xc$G21F0!CePl8vkT1R9F`%y381aszr9|1U*flE~9a$ z=mj7BU>JC&+~gI69erA~BH$g&n9$_Ac|kYi%o-CrFNP2A+{2u;?!SYGhk8#|TW3IE z>!k&AwRo_pe^Ys-i2%I!BG*dYCdAL3|Apn4m1yP^b^pw7Qa~YVxkOVz6VksGc_4f* z32N3m$T+=bq@kI)M^ouNoY6k-IyJcnNoBrk(@-cw+fGy@m>%Pzi<`qdQ{&o^9aNUC zyitext8|Sg%zuyeDU&4|HEUpW%-KGv%qH-VUgBYGT@60liJQ6_Jd~2-d}Cu)9=cex zhC4R56Ks|KUDP68g*^MiwCXH5h#h!(+1UsIu#W`h!DB95WwrXLzvQ6r7GFr%jQY?E zzpYQs=f#7XW8zyMehFyC{(L&7IRjP1`pZOkrJ_Xj*me7ZnIvP6ZDq?sf|N^ea?W|n zfkXS&>ex#MA;++l=X9m2;edYKdd-45@FDGYNeHS@o^45(Rdze-n6F`B*3o5g0gZC9!!i>zIs|lj3kndb_6&x$P(7A$=6Iu5#!qH>4n#e z;b}nN?&P*YSSNO5&G>l&)EL}<_*vg&kWQ;flr!i+?!9}Bx15q8Ew9-f_l|Ew?~LuO zj1As^LGI!ii%qEe=ex!Ha%>w~?lCpdx}1*!eZ`rZV|dV;@nB9)YdL(~7e8oiTMQ07 z@oD2P7QwVRI~v3-TH%$^`L53|1Jg2~RqOh+ zG}X(nXx0+DQ^hH0_v;NsuBNXcv9{@Yph`We;J9qAUiKEw$>qoMIK2>V>z>s9Y}w*HeLCvYGC&7U!mjw9>$m_Ob;2=26yJ3KXuLi z6|9{;akI>%BxEJFrYpCL3*PD{*kd!Yk$SStZsZ$<#8HLagDC}Y_@YYbrgI$Vc{qQG ziE1LU7T?#TlhX;uUE?a;2in2sWU=+m!=0#ds)x0|!6oGX(xfI*x(|)=i3uzI;REHq#tQ`SrmYNUZ(cLLN1&t8kqk5`DPev zqPM$#G8djbm|_uR(*;W}D8EQ&W<#**lTeR>Qn;ypr?AVh743fzv?wQ~1DR+$%I$K< zL8{1g*`~)V@`O>1_xcoqY5&JOWI7WJWBrYmXR23JrC=m;#kK> z_hBvy>3M(Fhx125TvwEju49goo`>TY>H6p9qWa*tDEwUO!1d|%{%joQ<2*W!q>tps z`53YPr>l;r>I@fo*_x;OJqiNe6L!;7ehYf*_J!_cr4l&lRd}v)n>aggbpXqMRy!D@ zZ@b6m5@eg~wENQRcF5lwaHk0|NM|$2(_>rT09p2y?9zD&Nm6$sWmw%1>e%0xwUdW* z^{?`G{ltZ~jgi)Ile$5JzrjkA&jTOLr~#LIOww34`m2?*I?&=lw&wQ4TBubqYCW^P z2l=(tjSjxjh3-3yj!osYBc)M}A@Y+2sQc;Mfv9Z^($n6ewrkb^($C2$_{FCYrMeyD zUz2%~Ji{^-z36as2!PJXFyVu9IfYhb0Os zDi(YO*~ZD$wlgHy%Bwb(&35Pn)#Q5`^%n?o()w^U6-zNP*uS&?-sCp4U2ohW)o2!* zJzB$oY55LV?D>)|M)}A^VJl;S>N6A*t7i}$nFvvhr7!xP6xmy}M%klU z0AEU)#CIGa*xJ=+t{h?Wvsc0@jWVg|DJzrhmk>HhWa zEBtw(BmMJ4*FP5*)d$Dvi2LHWD30TQ7o@NEuU^-u_WM-d=UVUo+Ha(Mx=!bxdL1~A z`5}QtL#)%|xIWIqKFqOSR2=&-;(DU;a2>4UJUYj9u%FK9d2}6fjPyDl{c`6cG2ri7Fh zMuA|E2qRELK!<|?F3ZHolAKta%E%5fp(sBGq70;wfrEj= zM2Cyv!F>}H9hm09O=e&O+Qt|z!WeI&!^*(O26R>_&|Nz03{Zn~ctGxfs1RTjaL&&y z$S+P!As8A!ukkBqb$;2lEb+uOSxe2!l<${{Unzh!jOt>gNXl-cOy? literal 0 HcmV?d00001 diff --git a/test/assets/estimands/estimands_iates.jls b/test/assets/estimands/estimands_iates.jls new file mode 100644 index 0000000000000000000000000000000000000000..baa825f175e931bdcca3b6fd954e78e128e15467 GIT binary patch literal 320 zcmXr_@{wR+U|=v2VB~eq&r8cpFD*(e$;{7_=RSYpCQs_dr0)C9#d*Ik{q1683GwxD zbrBU{LnG0Ub657#pM}Ik7mEksV}CQGO0W83P9chYkmW2@pTH4<*sH sVl^IY5hKt642(KF3?ht-jNwos9_(h8$fV3X6CGAmqjk7YmHPPs02 ["RSID_2", "RSID_198"], + "CONTINUOUS_2" => ["RSID_2", "RSID_198"], + "BINARY_2" => ["RSID_2"], + ) + return estimands, traits_to_variants end @testset "Test get_trait_to_variants_from_estimands" begin @@ -81,62 +132,88 @@ end ) end -@testset "Test density_estimation_inputs_from_gene_atlas" begin +@testset "Test simulation_inputs_from_gene_atlas" begin + # The function `simulation_inputs_from_gene_atlas` is hard to test end to end due to + # data limitations. It is split into 3 subfunctions that we here test sequentially but + # with different data. + verbosity = 0 + # Here we use the real geneATLAS data tmpdir = mktempdir() - save_test_estimands(tmpdir) - outprefix = joinpath(tmpdir, "ga_sim_input") - copy!(ARGS, [ - "simulation-inputs-from-ga", - joinpath(tmpdir, "estimands"), - string("--ga-download-dir=", joinpath(tmpdir, "gene_atlas_data")), - "--remove-ga-data=true", - string("--ga-trait-table=", joinpath(PKGDIR, "assets", "Traits_Table_GeneATLAS.csv")), - "--maf-threshold=0.01", - "--pvalue-threshold=1e-5", - "--distance-threshold=1e6", - string("--output-prefix=", outprefix), - "--batchsize=2", - "--max-variants=10" - ]) - Simulations.julia_main() - # Estimand files - ## sarcoidosis has 3 estimands -> split in two batches (batchsize=2) - ## G20 Parkinson's disease has 1 estimand -> 1 file - estimands_files = filter(x -> occursin("ga_sim_input_estimands", x), readdir(tmpdir, join=true)) - all_estimands = [] - for f ∈ estimands_files - estimands = deserialize(f).estimands - append!(all_estimands, estimands) - if length(estimands) == 1 - continue - elseif length(estimands) == 2 - @test all(Simulations.get_outcome(Ψ) === :sarcoidosis for Ψ in estimands) - else - throw(ArgumentError("Batchsize should be max 2.")) - end - end - @test length(all_estimands) == 4 + estimands = gene_atlas_estimands() + trait_to_variants = Simulations.get_trait_to_variants( + estimands; + verbosity=0, + gene_atlas_dir=joinpath(tmpdir, "gene_atlas_data"), + remove_ga_data=true, + trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"), + maf_threshold=0.01, + pvalue_threshold=1e-5, + distance_threshold=1e6, + max_variants=10, + ) + @test length(trait_to_variants["sarcoidosis"]) == 10 + @test issubset(("rs502771", "rs184270108"), trait_to_variants["sarcoidosis"]) + @test length(trait_to_variants["G20 Parkinson's disease"]) == 10 + @test issubset(("rs11868112", "rs6456121", "rs356219"), trait_to_variants["G20 Parkinson's disease"]) + + # Dataset and validated estimands + # We change the data to match what is in the BGEN file + estimands, trait_to_variants = estimands_and_traits_to_variants_matching_bgen() + bgen_prefix = joinpath(TARGENCORE_TESTDIR, "data", "ukbb", "imputed" ,"ukbb") + traits_file = joinpath(TARGENCORE_TESTDIR, "data", "traits_1.csv") + pcs_file = joinpath(TARGENCORE_TESTDIR, "data", "pcs.csv") + positivity_constraint = 0 + call_threshold = 0.9 + dataset, validated_estimands = Simulations.get_dataset_and_validated_estimands( + estimands, + bgen_prefix, + traits_file, + pcs_file, + trait_to_variants; + call_threshold=call_threshold, + positivity_constraint=positivity_constraint, + verbosity=verbosity + ) + @test names(dataset) == ["SAMPLE_ID", "BINARY_1", "BINARY_2", "CONTINUOUS_1", "CONTINUOUS_2", "COV_1", + "21003", "22001", "TREAT_1", "PC1", "PC2", "RSID_2", "RSID_198"] + @test size(dataset, 1) == 490 + @test length(estimands) == length(validated_estimands) + # Check estimands have been matched to the dataset: GA -> AG + Ψ = validated_estimands[findfirst(x->x isa TMLE.ComposedEstimand, validated_estimands)] + @test Ψ.args[1].treatment_values.RSID_198 == "AG" + @test Ψ.args[2].treatment_values.RSID_198 == "AG" - # Conditional densities - ## There should be 2 + n_variants densities - ## 10 variants per trait = 20 variants - requested_variants = ["rs502771", "rs184270108", "rs11868112", "rs6456121"] - variants = open(readlines, string(outprefix, "_variants.txt")) - @test issubset(requested_variants, variants) - @test length(variants) == 20 - conditional_density_files = filter(x -> occursin("ga_sim_input_conditional_density", x), readdir(tmpdir, join=true)) - @test length(conditional_density_files) == length(variants) + 2 - density_targets = Set([]) - for f in conditional_density_files - conditional_density = JSON.parsefile(f) - push!(density_targets, conditional_density["outcome"]) - if conditional_density["outcome"] ∈ variants - @test Set(conditional_density["parents"]) == Set(["PC2", "PC1", "PC3"]) - else - @test issubset(["PC2", "PC1", "Age-Assessment", "PC3", "Genetic-Sex"], conditional_density["parents"]) - end + # Now the writing + output_prefix = joinpath(tmpdir, "ga.input") + batchsize = 2 + Simulations.write_ga_simulation_inputs( + output_prefix, + dataset, + validated_estimands, + trait_to_variants; + batchsize=batchsize, + verbosity=verbosity + ) + # One dataset + loaded_dataset = Arrow.Table(string(output_prefix, ".data.arrow")) |> DataFrame + @test size(loaded_dataset) == (490, 13) + # 5 estimands split in 3 files of size (2, 2, 1) + loaded_estimands = reduce( + vcat, + deserialize(string(output_prefix, ".estimands_$i.jls")).estimands for i in 1:3 + ) + @test loaded_estimands == validated_estimands + # 3 densities for the 3 outcomes + outcome_to_parents = Dict() + for i in 1:3 + density = JSON.parsefile(string(output_prefix, ".conditional_density_$i.json")) + outcome_to_parents[density["outcome"]] = sort(density["parents"]) end - @test density_targets == Set(vcat(variants, ["sarcoidosis", "G20 Parkinson's disease"])) + @test outcome_to_parents == Dict( + "BINARY_2" => ["21003", "22001", "COV_1", "PC1", "PC2", "RSID_2"], + "CONTINUOUS_2" => ["21003", "22001", "COV_1", "PC1", "PC2", "RSID_198", "RSID_2"], + "BINARY_1" => ["22001", "PC1", "PC2", "RSID_198", "RSID_2", "TREAT_1"] + ) end end diff --git a/test/null_simulation.jl b/test/null_simulation.jl index 4eb0cac..efdcb24 100644 --- a/test/null_simulation.jl +++ b/test/null_simulation.jl @@ -24,7 +24,7 @@ include(joinpath(TESTDIR, "testutils.jl")) @testset "Integration Test Null Simulation" begin inputdir = mktempdir() estimands_filename = joinpath(inputdir, "estimands.yaml") - save_integration_test_configuration(estimands_filename) + save_configuration_macthing_bgen(estimands_filename) # Creating the inputs parsed_args = Dict( "out-prefix" => joinpath(inputdir, "final"), diff --git a/test/runtests.jl b/test/runtests.jl index 2e2a88e..3ff6176 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,5 +18,4 @@ TESTDIR = joinpath(pkgdir(Simulations), "test") # Integration Tests @test include(joinpath(TESTDIR, "null_simulation.jl")) - @test include(joinpath(TESTDIR, "gene_atlas_simulation.jl")) end diff --git a/test/samplers/density_estimate_sampler.jl b/test/samplers/density_estimate_sampler.jl index 422110a..bd711e3 100644 --- a/test/samplers/density_estimate_sampler.jl +++ b/test/samplers/density_estimate_sampler.jl @@ -23,38 +23,77 @@ TESTDIR = joinpath(pkgdir(Simulations), "test") train_ratio=10, verbosity=0 ) - # Learn T1 - density_file = joinpath(TESTDIR, "assets", "conditional_density_T1.json") + # Learn Ycont + density_file = joinpath(TESTDIR, "assets", "conditional_density_Ycont.json") density_estimation( dataset_file, density_file; mode="test", - output=joinpath(density_dir, "density_T1.hdf5"), + output=joinpath(density_dir, "density_Ycont.hdf5"), train_ratio=10, verbosity=0 ) - # Create the sampler + # Scenario 1: + # only 1 estimand, not all parents of Ybin are in the estimand's variables. + # This is to make sure variants that are used to generated the + # data but not required afterwards for estimation are discarded + # in the sampled dataset estimands = [ATE( outcome=:Ybin, treatment_values=(T₁=(case=1, control=0),), treatment_confounders=(:W,), - outcome_extra_covariates=(:C,) )] prefix = joinpath(density_dir, "density") sampler = DensityEstimateSampler(prefix, estimands) - @test Set(sampler.sources) == Set([:W, :C]) - @test sampler.treatment_density_mapping == Dict((:T₁=>(:W,)) => joinpath(density_dir, "density_T1.hdf5")) - @test sampler.outcome_density_mapping == Dict((:Ybin=>(:C, :T₁, :W)) => joinpath(density_dir, "density_Ybin.hdf5")) + @test sort(sampler.all_parents_set) == [:C, :T₁, :W] + @test sort(sampler.variables_required_for_estimation) == [:T₁, :W, :Ybin] + @test sampler.density_mapping == Dict(:Ybin => ([:C, :T₁, :W], joinpath(density_dir, "density_Ybin.hdf5"))) # Sample origin_dataset = DataFrame(Arrow.Table(dataset_file)) - sampled_dataset = sample_from(sampler, origin_dataset, n=100) + sampled_dataset = sample_from(sampler, origin_dataset, n=50) + ## Check parents are given by the empirical + parents = [:W, :T₁] + all_origin_rows = collect(eachrow(origin_dataset[!, parents])) + for row in eachrow(sampled_dataset[!, parents]) + @test row ∈ all_origin_rows + end + @test names(sampled_dataset) == string.(sampler.variables_required_for_estimation) + @test size(sampled_dataset, 1) == 50 + @test sampled_dataset.Ybin isa CategoricalVector - @test names(sampled_dataset) == ["W", "C", "T₁", "Ybin"] - @test size(sampled_dataset, 1) == 100 - - # True effects - Ψ = estimands[1] - effect = true_effect(Ψ, sampler, origin_dataset; n=1000) + # Scenario 2: + # Two estimands + estimands = [ATE( + outcome=:Ybin, + treatment_values=(T₁=(case=1, control=0),), + treatment_confounders=(:W,), + ), + factorialEstimand( + IATE, + (T₁=[0, 1], T₂=[0, 1]), + :Ycont, + confounders=(:W,), + outcome_extra_covariates=(:C,) + ) + ] + sampler = DensityEstimateSampler(prefix, estimands) + @test sort(sampler.all_parents_set) == [:C, :T₁, :T₂, :W] + @test sort(sampler.variables_required_for_estimation) == [:C, :T₁, :T₂, :W, :Ybin, :Ycont] + @test sampler.density_mapping == Dict( + :Ybin => ([:C, :T₁, :W], joinpath(density_dir, "density_Ybin.hdf5")), + :Ycont => ([:W, :T₁, :T₂, :C], joinpath(density_dir, "density_Ycont.hdf5")) + ) + # Sample + origin_dataset = DataFrame(Arrow.Table(dataset_file)) + sampled_dataset = sample_from(sampler, origin_dataset, n=50) + ## All parents are retained + all_origin_rows = collect(eachrow(origin_dataset[!, sampler.all_parents_set])) + for row in eachrow(sampled_dataset[!, sampler.all_parents_set]) + @test row ∈ all_origin_rows + end + @test names(sampled_dataset) == string.(sampler.variables_required_for_estimation) + @test size(sampled_dataset, 1) == 50 + @test sampled_dataset.Ybin isa CategoricalVector end end diff --git a/test/samplers/null_sampler.jl b/test/samplers/null_sampler.jl index b11a8fc..f0a3371 100644 --- a/test/samplers/null_sampler.jl +++ b/test/samplers/null_sampler.jl @@ -47,22 +47,6 @@ include(joinpath(TESTDIR, "testutils.jl")) )) @test_throws AssertionError("All estimands should share the same confounders and covariates.") NullSampler(estimands) - # True effects - ## Continuous Outcome - Ψ = estimands[1] - @test theoretical_true_effect(Ψ, sampler) == 0 - @test empirical_true_effect(Ψ, sampler, origin_dataset; n=100_000) ≈ 0. atol=0.01 - @test true_effect(Ψ, sampler, origin_dataset) == 0 - ## Count Outcome - Ψ = estimands[2] - @test true_effect(Ψ, sampler, origin_dataset) == 0 - ## Composed Estimand / Binary Outcome - Ψ = estimands[3] - @test theoretical_true_effect(Ψ, sampler) == [0, 0] - composed_effect = empirical_true_effect(Ψ, sampler, origin_dataset; n=100_000) - @test composed_effect[1] == - composed_effect[2] - @test composed_effect[1] ≈ 0 atol=0.01 - @test true_effect(Ψ, sampler, origin_dataset) == [0, 0] end end diff --git a/test/testutils.jl b/test/testutils.jl index 02b1b7a..8fcb0a2 100644 --- a/test/testutils.jl +++ b/test/testutils.jl @@ -107,7 +107,7 @@ function write_linear_interaction_dataset_estimands() serialize("test/assets/estimands/estimands_ates.jls", linear_interaction_dataset_ATEs()) end -function make_integration_test_configuration() +function estimands_and_traits_to_variants_matching_bgen() estimands = [ IATE( outcome = "BINARY_1", @@ -128,7 +128,7 @@ function make_integration_test_configuration() ), ATE( outcome = "CONTINUOUS_2", - treatment_values = (RSID_2 = (case = "AA", control = "GG"), RSID_198 = (case = "AG", control = "AA")), + treatment_values = (RSID_2 = (case = "AA", control = "GG"), RSID_198 = (case = "GA", control = "AA")), treatment_confounders = (RSID_2 = [], RSID_198 = []), outcome_extra_covariates = [22001] ), @@ -136,22 +136,28 @@ function make_integration_test_configuration() TMLE.joint_estimand, ( CM( outcome = "BINARY_1", - treatment_values = (RSID_2 = "GG", RSID_198 = "AG"), + treatment_values = (RSID_2 = "GG", RSID_198 = "GA"), treatment_confounders = (RSID_2 = [], RSID_198 = []), outcome_extra_covariates = [22001] ), CM( outcome = "BINARY_1", - treatment_values = (RSID_2 = "AA", RSID_198 = "AG"), + treatment_values = (RSID_2 = "AA", RSID_198 = "GA"), treatment_confounders = (RSID_2 = [], RSID_198 = []), outcome_extra_covariates = [22001] )) ) ] - return Configuration(estimands=estimands) + traits_to_variants = Dict( + "BINARY_1" => ["RSID_2", "RSID_198"], + "CONTINUOUS_2" => ["RSID_2", "RSID_198"], + "BINARY_2" => ["RSID_2"], + ) + return estimands, traits_to_variants end -function save_integration_test_configuration(path) - config = make_integration_test_configuration() +function save_configuration_macthing_bgen(path) + estimands, _ = estimands_and_traits_to_variants_matching_bgen() + config = TMLE.Configuration(estimands=estimands) TMLE.write_yaml(path, config) -end \ No newline at end of file +end