From 93eb68c62912386b20d8d1ee075bd1b72df8a850 Mon Sep 17 00:00:00 2001
From: Olivier Labayle <olabayle@gmail.com>
Date: Thu, 23 May 2024 15:37:55 +0100
Subject: [PATCH] add all remaining files

---
 src/Simulations.jl                         |   2 +-
 src/cli.jl                                 |  58 +++---
 src/inputs_from_gene_atlas.jl              | 216 +++++++++++++++++----
 src/samplers/density_estimate_sampler.jl   | 105 ++++------
 src/samplers/null_sampler.jl               |  45 +----
 test/assets/conditional_density_Ybin.json  |   8 +
 test/assets/conditional_density_Ycont.json |   6 +
 test/assets/conditional_density_x_y.json   |   6 +
 test/assets/dataset.arrow                  | Bin 0 -> 6946 bytes
 test/assets/density_estimators.jl          |   8 +
 test/assets/estimands/estimands_ates.jls   | Bin 0 -> 569 bytes
 test/assets/estimands/estimands_iates.jls  | Bin 0 -> 320 bytes
 test/inputs_from_gene_atlas.jl             | 193 ++++++++++++------
 test/null_simulation.jl                    |   2 +-
 test/runtests.jl                           |   1 -
 test/samplers/density_estimate_sampler.jl  |  69 +++++--
 test/samplers/null_sampler.jl              |  16 --
 test/testutils.jl                          |  22 ++-
 18 files changed, 480 insertions(+), 277 deletions(-)
 create mode 100644 test/assets/conditional_density_Ybin.json
 create mode 100644 test/assets/conditional_density_Ycont.json
 create mode 100644 test/assets/conditional_density_x_y.json
 create mode 100644 test/assets/dataset.arrow
 create mode 100644 test/assets/density_estimators.jl
 create mode 100644 test/assets/estimands/estimands_ates.jls
 create mode 100644 test/assets/estimands/estimands_iates.jls

diff --git a/src/Simulations.jl b/src/Simulations.jl
index 4204bea..9af7c21 100644
--- a/src/Simulations.jl
+++ b/src/Simulations.jl
@@ -22,6 +22,7 @@ using Serialization
 using Makie
 using CSV
 using CairoMakie
+using TargeneCore
 
 include("utils.jl")
 
@@ -34,7 +35,6 @@ include(joinpath("samplers", "density_estimate_sampler.jl"))
 
 include(joinpath("inputs_from_gene_atlas.jl"))
 include("estimation.jl")
-include("analysis.jl")
 include("cli.jl")
 
 export NullSampler, DensityEstimateSampler
diff --git a/src/cli.jl b/src/cli.jl
index 8ce3d73..87ddafe 100644
--- a/src/cli.jl
+++ b/src/cli.jl
@@ -10,10 +10,6 @@ function cli_settings()
             action = :command
             help = "Aggregate multiple results file created by estimation procedures."
 
-        "density-estimation-inputs"
-            action = :command
-            help = "Generates density estimation inputs."
-
         "density-estimation"
             action = :command
             help = "Estimate a conditional density."
@@ -120,25 +116,6 @@ function cli_settings()
             default = mktempdir()
     end
 
-    @add_arg_table! s["density-estimation-inputs"] begin
-        "dataset"
-            arg_type = String
-            help = "Path to the dataset (either .csv or .arrow)"
-
-        "estimands-prefix"
-            arg_type = String
-            help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)"
-
-        "--output-prefix"
-            arg_type = String
-            default = "de_inputs"
-            help = "Output JSON file."
-        "--batchsize"
-            arg_type = Int
-            default = 10
-            help = "Estimands are batched to optimize speed by //"
-    end
-
     @add_arg_table! s["density-estimation"] begin
         "dataset"
             arg_type = String
@@ -175,6 +152,18 @@ function cli_settings()
             arg_type = String
             help = "A prefix to serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)"
 
+        "bgen-prefix"
+            arg_type = String
+            help = "A prefix to imputed genotypes (BGEN format)"
+        
+        "traits"
+            arg_type = String
+            help = "The dataset containing phenotypes."
+        
+        "pcs"
+            arg_type = String
+            help = "The dataset of principal components."
+
         "--ga-download-dir"
             arg_type = String
             default = "gene_atlas_data"
@@ -209,6 +198,21 @@ function cli_settings()
             arg_type = Int
             default = 100
             help = "Maximum variants retrieved per trait."
+        
+        "--positivity-constraint"
+            arg_type = Float64
+            default = 0.0
+            help = "Minimum frequency of a treatment."
+
+        "--call-threshold"
+            arg_type = Float64
+            default = 0.9
+            help = "If no genotype as at least this probability it is considered missing."
+
+        "--verbosity"
+            arg_type = Int
+            default = 0
+            help = "Verbosity level."
 
         "--output-prefix"
             arg_type = String
@@ -248,13 +252,19 @@ function julia_main()::Cint
         save_aggregated_df_results(cmd_settings["input-prefix"], cmd_settings["out"])
     elseif cmd == "simulation-inputs-from-ga"
         simulation_inputs_from_gene_atlas(
-            cmd_settings["estimands-prefix"];
+            cmd_settings["estimands-prefix"],
+            cmd_settings["bgen-prefix"],
+            cmd_settings["traits"],
+            cmd_settings["pcs"];
             gene_atlas_dir=cmd_settings["ga-download-dir"],
             remove_ga_data=cmd_settings["remove-ga-data"], 
             trait_table_path=cmd_settings["ga-trait-table"],
             maf_threshold=cmd_settings["maf-threshold"],
             pvalue_threshold=cmd_settings["pvalue-threshold"],
             distance_threshold=cmd_settings["distance-threshold"],
+            positivity_constraint=cmd_settings["positivity-constraint"],
+            call_threshold=cmd_settings["call-threshold"],
+            verbosity=cmd_settings["verbosity"],
             output_prefix=cmd_settings["output-prefix"],
             batchsize=cmd_settings["batchsize"],
             max_variants=cmd_settings["max-variants"]
diff --git a/src/inputs_from_gene_atlas.jl b/src/inputs_from_gene_atlas.jl
index 703aee8..7a52f28 100644
--- a/src/inputs_from_gene_atlas.jl
+++ b/src/inputs_from_gene_atlas.jl
@@ -171,7 +171,8 @@ function group_by_outcome(estimands)
     return groups
 end
 
-function simulation_inputs_from_gene_atlas(estimands_prefix;
+function get_trait_to_variants(estimands; 
+    verbosity=0, 
     gene_atlas_dir="gene_atlas_data",
     remove_ga_data=true,
     trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
@@ -179,13 +180,8 @@ function simulation_inputs_from_gene_atlas(estimands_prefix;
     pvalue_threshold=1e-5,
     distance_threshold=1e6,
     max_variants=100,
-    output_prefix="ga_sim_input",
-    batchsize=10
-    )
-    estimands = reduce(
-        vcat, 
-        TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix)
     )
+    verbosity > 0 && @info("Retrieve significant variants for each outcome.")
     # Retrieve traits and variants from estimands
     trait_to_variants = get_trait_to_variants_from_estimands(estimands)
     # Retrieve Trait to geneAtlas key map
@@ -199,44 +195,186 @@ function simulation_inputs_from_gene_atlas(estimands_prefix;
         distance_threshold=distance_threshold,
         max_variants=max_variants
     )
+    return trait_to_variants
+end
 
-    # Write all variants for dataset extraction
-    unique_variants = unique(vcat(values(trait_to_variants)...))
-    open(string(output_prefix, "_variants.txt"), "w") do io
-        for v in unique_variants
-            write(io, string(v, "\n"))
-        end
-    end
-    # Group and Write Estimands
-    batch_index = 1
-    for (outcome, estimands_group) ∈ group_by_outcome(estimands)
-        # Optimize order within a group and write estimands
-        estimands_group = groups_ordering(estimands_group)
-        for batch in Iterators.partition(estimands_group, batchsize)
-            batch_filename = string(output_prefix, "_estimands_", batch_index, ".jls")
-            serialize(batch_filename, TMLE.Configuration(estimands=batch))
-            batch_index += 1
-        end
+function get_dataset_and_validated_estimands(
+    estimands,
+    bgen_prefix,
+    traits_file,
+    pcs_file,
+    trait_to_variants;
+    call_threshold=0.9,
+    positivity_constraint=0.,
+    verbosity=0
+    )
+    verbosity > 0 && @info("Calling genotypes.")
+    variants_set = Set(string.(vcat(values(trait_to_variants)...)))
+
+    genotypes = TargeneCore.call_genotypes(
+        bgen_prefix, 
+        variants_set, 
+        call_threshold
+    )
+    ## Read PCs and traits
+    traits = TargeneCore.read_csv_file(traits_file)
+    pcs = TargeneCore.read_csv_file(pcs_file)
+    ## Merge all together
+    dataset = TargeneCore.merge(traits, pcs, genotypes)
+
+    # Validate Estimand
+    verbosity > 0 && @info("Validating estimands.")
+    variables = TargeneCore.get_variables(estimands, traits, pcs)
+    estimands = TargeneCore.adjusted_estimands(
+        estimands, variables, dataset; 
+        positivity_constraint=positivity_constraint
+    )
+    return dataset, estimands
+end
+
+all_treatments(Ψ) = keys(Ψ.treatment_values)
+all_treatments(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_treatments(arg) for arg ∈ Ψ.args)...))
+
+all_outcome_extra_covariates(Ψ) = Ψ.outcome_extra_covariates
+all_outcome_extra_covariates(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_outcome_extra_covariates(arg) for arg ∈ Ψ.args)...))
+
+all_confounders(Ψ) = Tuple(union(values(Ψ.treatment_confounders)...))
+all_confounders(Ψ::TMLE.ComposedEstimand) = Tuple(union((all_confounders(arg) for arg ∈ Ψ.args)...))
+
+function write_densities(output_prefix, trait_to_variants, estimands)
+    outcome_parents = Dict(outcome => Set(variants) for (outcome, variants) in trait_to_variants)
+    for Ψ ∈ estimands
+        outcome = get_outcome(Ψ)
+        new_parents = string.(union(
+            all_outcome_extra_covariates(Ψ),
+            all_treatments(Ψ),
+            all_confounders(Ψ)
+        ))
+        union!(
+            outcome_parents[string(outcome)],
+            new_parents
+        )
     end
-    # Write densities
     density_index = 1
-    confounders = get_confounders_assert_equal(estimands)
-    covariates = get_covariates_assert_equal(estimands)
-    ## Outcome densities
-    for (outcome, variants) in trait_to_variants
-        parents = vcat(variants, confounders..., covariates...)
-        open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io
-            JSON.print(io, Dict("outcome" => outcome, "parents" => parents), 1)
-        end
-        density_index += 1
-    end
-    ## Propensity scores
-    for variant in unique_variants
-        open(string(output_prefix, "_conditional_density_", density_index, ".json"), "w") do io
-            JSON.print(io, Dict("outcome" => variant, "parents" => confounders), 1)
+    for (outcome, parents) in outcome_parents
+        open(string(output_prefix, ".conditional_density_", density_index, ".json"), "w") do io
+            JSON.print(io, Dict("outcome" => outcome, "parents" => collect(parents)), 1)
         end
         density_index += 1
     end
+end
+
+function write_ga_simulation_inputs(
+    output_prefix,
+    dataset,
+    estimands,
+    trait_to_variants;
+    batchsize=10,
+    verbosity=0
+    )
+    verbosity > 0 && @info("Writing outputs.")
+    # Writing estimands and dataset
+    TargeneCore.write_tl_inputs(output_prefix, dataset, estimands; batch_size=batchsize)
+    # Writing densities
+    write_densities(output_prefix, trait_to_variants, estimands)
+end
 
+"""
+    simulation_inputs_from_gene_atlas(
+        estimands_prefix, 
+        bgen_prefix, 
+        traits_file, 
+        pcs_file;
+        gene_atlas_dir="gene_atlas_data",
+        remove_ga_data=true,
+        trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
+        maf_threshold=0.01,
+        pvalue_threshold=1e-5,
+        distance_threshold=1e6,
+        max_variants=100,
+        output_prefix="ga_sim_input",
+        batchsize=10,
+        positivity_constraint=0,
+        call_threshold=0.9,
+        verbosity=0,
+        )
+
+This function generates input files for density estimates based simulations using 
+variants identified from the geneATLAS.
+
+## How are variants selected from the geneATLAS
+
+Association data is downloaded to `gene_atlas_dir` and cleaned afterwards 
+if `remove_ga_data`. For each outcome, variants are selected if:
+
+- They pass a significance threshold: `pvalue_threshold`.
+- They are at least `distance_threshold` away from other variants.
+- They are frequent: `maf_threshold`.
+
+Finally, a maximum of `max_variants` is retained per outcome.
+
+## What files are Generated
+
+The Generated input files, prefixed by `output_prefix`, are:
+
+- A dataset: Combines `pcs_file`, `traits_file` and called genotypes from `bgen_prefix` at `call_threshold`.
+- Validated estimands: Estimands from `estimands_prefix` are validated. We makes sure the estimands match 
+the data in the dataset and pass the `positivity_constraint`. They are then written in batches of size `batchsize`.
+- Conditional Densities: To be learnt to simulate new data for the estimands of interest.
+"""
+function simulation_inputs_from_gene_atlas(
+    estimands_prefix, 
+    bgen_prefix, 
+    traits_file, 
+    pcs_file;
+    gene_atlas_dir="gene_atlas_data",
+    remove_ga_data=true,
+    trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
+    maf_threshold=0.01,
+    pvalue_threshold=1e-5,
+    distance_threshold=1e6,
+    max_variants=100,
+    output_prefix="ga_sim_input",
+    batchsize=10,
+    positivity_constraint=0,
+    call_threshold=0.9,
+    verbosity=0,
+    )
+    estimands = reduce(
+        vcat, 
+        TargetedEstimation.read_estimands_config(f).estimands for f ∈ files_matching_prefix(estimands_prefix)
+    )
+    # Trait to variants from geneATLAS
+    trait_to_variants = get_trait_to_variants(estimands; 
+        verbosity=verbosity, 
+        gene_atlas_dir=gene_atlas_dir,
+        remove_ga_data=remove_ga_data,
+        trait_table_path=trait_table_path,
+        maf_threshold=maf_threshold,
+        pvalue_threshold=pvalue_threshold,
+        distance_threshold=distance_threshold,
+        max_variants=max_variants,
+        )
+    # Dataset and validated estimands
+    dataset, estimands = get_dataset_and_validated_estimands(
+        estimands,
+        bgen_prefix,
+        traits_file,
+        pcs_file,
+        trait_to_variants;
+        call_threshold=call_threshold,
+        positivity_constraint=positivity_constraint,
+        verbosity=verbosity
+    )
+    # Write outputs
+    write_ga_simulation_inputs(
+        output_prefix,
+        dataset,
+        estimands,
+        trait_to_variants;
+        batchsize=batchsize,
+        verbosity=verbosity
+        )
+    verbosity > 0 && @info("Done.")
     return 0
 end
\ No newline at end of file
diff --git a/src/samplers/density_estimate_sampler.jl b/src/samplers/density_estimate_sampler.jl
index 77bb0f4..36deb18 100644
--- a/src/samplers/density_estimate_sampler.jl
+++ b/src/samplers/density_estimate_sampler.jl
@@ -2,90 +2,55 @@
 sieve_neural_net_density_estimator(file::AbstractString) = jldopen(io -> restore!(io["sieve-neural-net"]), file)
 
 struct DensityEstimateSampler
-    sources::Vector
-    treatment_density_mapping::Dict
-    outcome_density_mapping::Dict
+    density_mapping::Dict
+    all_parents_set::Vector{Symbol}
+    variables_required_for_estimation::Vector{Symbol}
 end
 
+get_outcomes_set(estimands) = Set(get_outcome(Ψ) for Ψ in estimands)
+
 function DensityEstimateSampler(prefix, estimands)
     # Create density to file map (There could be more files than actually required)
+    outcomes_set = get_outcomes_set(estimands)
+    all_parents_set = Set{Symbol}()
     density_mapping = Dict()
     for f ∈ files_matching_prefix(prefix)
         jldopen(f) do io
-            density_mapping[(Symbol(io["outcome"]) => Tuple(Symbol.(io["parents"])))] = f
+            outcome = Symbol(io["outcome"])
+            if outcome ∈ outcomes_set
+                parents = Symbol.(io["parents"])
+                density_mapping[outcome] = (parents, f)
+                union!(all_parents_set, parents)
+            end
         end
     end
-    # Create required density to file map
-    required_densities = Dict()
-    all_parents = Set([])
-    all_outcomes = Set([])
-    for Ψ in estimands
-        for η in TMLE.nuisance_functions_iterator(Ψ)
-            required_densities[η.outcome => η.parents] = density_mapping[η.outcome => η.parents]
-            union!(all_parents, η.parents)
-            push!(all_outcomes, η.outcome)
-        end
+    variables_required_for_estimation = Set{Symbol}()
+    for Ψ ∈ estimands
+        pre_outcome_variables = union(
+            all_outcome_extra_covariates(Ψ),
+            all_treatments(Ψ),
+            all_confounders(Ψ),
+        )
+        union!(
+            variables_required_for_estimation,
+            pre_outcome_variables
+        )
+        push!(variables_required_for_estimation, get_outcome(Ψ))
     end
-    # A source is never an outcome
-    sources = [x for x in all_parents if x ∉ all_outcomes]
-    # A treatment node is both an outcome and a parent
-    treatment_density_mapping = Dict(cd => f for (cd, f) in required_densities if (cd[1] ∈ all_parents && cd[1] ∈ all_outcomes))
-    # An outcome node is never a parent
-    outcome_density_mapping = Dict(cd => f for (cd, f) in required_densities if cd[1] ∉ all_parents)
-    return DensityEstimateSampler(sources, treatment_density_mapping, outcome_density_mapping)
+
+    return DensityEstimateSampler(density_mapping, collect(all_parents_set), collect(variables_required_for_estimation))
 end
 
 function sample_from(sampler::DensityEstimateSampler, origin_dataset; n=100)
 
-    sampled_dataset = sample_from(origin_dataset, sampler.sources; n=n)
-
-    for density_mapping in (sampler.treatment_density_mapping, sampler.outcome_density_mapping)
-        for ((outcome, parents), file) in density_mapping
-            conditional_density_estimate = Simulations.sieve_neural_net_density_estimator(file)
-            sampled_dataset[!, outcome] = sample_from(
-                conditional_density_estimate, 
-                sampled_dataset[!, collect(parents)]            
-            ) 
-        end
-    end
-
-    return sampled_dataset
-end
+    sampled_dataset = sample_from(origin_dataset, sampler.all_parents_set; n=n)
 
-function counterfactual_aggregate(Ψ, Q, X)
-    Ttemplate = TMLE.selectcols(X, TMLE.treatments(Ψ))
-    n = nrow(Ttemplate)
-    ctf_agg = zeros(n)
-    # Loop over Treatment settings
-    for (vals, sign) in TMLE.indicator_fns(Ψ)
-        # Counterfactual dataset for a given treatment setting
-        T_ct = TMLE.counterfactualTreatment(vals, Ttemplate)
-        X_ct = merge(X, T_ct)
-        # Counterfactual mean
-        ctf_agg .+= sign .* TMLE.expected_value(Q, X_ct)
+    for (outcome, (parents, file)) in sampler.density_mapping
+        conditional_density_estimate = Simulations.sieve_neural_net_density_estimator(file)
+        sampled_dataset[!, outcome] = sample_from(
+            conditional_density_estimate, 
+            sampled_dataset[!, parents]            
+        ) 
     end
-    return ctf_agg
-end
-
-function monte_carlo_effect(Ψ, sampler, dataset)
-    outcome_mean = TMLE.outcome_mean(Ψ)
-    Q = Simulations.sieve_neural_net_density_estimator(sampler.outcome_density_mapping[outcome_mean.outcome => outcome_mean.parents])
-    X = TMLE.selectcols(dataset, outcome_mean.parents)
-    labels = Simulations.getlabels(dataset[!, outcome_mean.outcome])
-    ctf_agg = counterfactual_aggregate(Ψ, Q, X)
-    return mean(ctf_agg)
+    return sampled_dataset[!, sampler.variables_required_for_estimation]
 end
-
-function true_effect(Ψ, sampler::DensityEstimateSampler, origin_dataset; n=500_000)
-    sampled_dataset = sample_from(sampler, origin_dataset; n=n)
-    return monte_carlo_effect(Ψ, sampler, sampled_dataset)
-end
-
-function true_effect(Ψ::ComposedEstimand, sampler::DensityEstimateSampler, origin_dataset;n=500_000)
-    sampled_dataset = sample_from(sampler, origin_dataset; n=n)
-    effect = zeros(length(Ψ.args))
-    for (index, arg) in enumerate(Ψ.args)
-        effect[index] = monte_carlo_effect(arg, sampler, sampled_dataset)
-    end
-    return effect
-end
\ No newline at end of file
diff --git a/src/samplers/null_sampler.jl b/src/samplers/null_sampler.jl
index 64dc56c..ecd9359 100644
--- a/src/samplers/null_sampler.jl
+++ b/src/samplers/null_sampler.jl
@@ -37,47 +37,4 @@ function sample_from(sampler::NullSampler, origin_dataset; n=100)
         sampled_dataset[!, variable] = origin_dataset[sample_mask, variable]
     end
     return sampled_dataset
-end
-
-theoretical_true_effect(Ψ, sampler::NullSampler) = 0.
-theoretical_true_effect(Ψ::ComposedEstimand, sampler::NullSampler) = [0. for _ in Ψ.args]
-
-"""
-For this generating process, Y is independent of both T and W.
-
-This effect computation:
-    - Discards the dependence on W.
-    - Uses the "dependence" on T. 
-    
-Not sure if this is reasonnable.
-
-Mathematically, this is saying:
-
-E[Y|do(T=t)] = E[Y|T=t] (= E[Y], in principle but not used)
-"""
-function confounders_independent_effect(dataset, Ψ)
-    indicator_fns = TMLE.indicator_fns(Ψ)
-    effect = 0.
-    for (key, group) ∈ pairs(groupby(dataset, TMLE.treatments(Ψ)))
-        if haskey(indicator_fns, values(key))
-            effect += mean(group[!, Ψ.outcome]) * indicator_fns[values(key)]
-        end
-    end
-    return effect
-end
-
-function empirical_true_effect(Ψ, sampler::NullSampler, origin_dataset; n=500_000)
-    sampled_dataset = sample_from(sampler, origin_dataset; n=n)
-    return confounders_independent_effect(sampled_dataset, Ψ)
-end
-
-function empirical_true_effect(Ψ::ComposedEstimand, sampler::NullSampler, origin_dataset; n=500_000)
-    sampled_dataset = sample_from(sampler, origin_dataset; n=n)
-    effect = zeros(length(Ψ.args))
-    for (index, arg) in enumerate(Ψ.args)
-        effect[index] = confounders_independent_effect(sampled_dataset, arg)
-    end
-    return effect
-end
-
-true_effect(Ψ, sampler::NullSampler, origin_dataset; n=500_000) = theoretical_true_effect(Ψ, sampler)
\ No newline at end of file
+end
\ No newline at end of file
diff --git a/test/assets/conditional_density_Ybin.json b/test/assets/conditional_density_Ybin.json
new file mode 100644
index 0000000..de8e2e2
--- /dev/null
+++ b/test/assets/conditional_density_Ybin.json
@@ -0,0 +1,8 @@
+{
+    "parents": [
+     "C",
+     "T₁",
+     "W"
+    ],
+    "outcome": "Ybin"
+   }
\ No newline at end of file
diff --git a/test/assets/conditional_density_Ycont.json b/test/assets/conditional_density_Ycont.json
new file mode 100644
index 0000000..4cbedec
--- /dev/null
+++ b/test/assets/conditional_density_Ycont.json
@@ -0,0 +1,6 @@
+{
+    "parents": [
+     "W", "T₁", "T₂", "C"
+    ],
+    "outcome": "Ycont"
+   }
\ No newline at end of file
diff --git a/test/assets/conditional_density_x_y.json b/test/assets/conditional_density_x_y.json
new file mode 100644
index 0000000..6a40faf
--- /dev/null
+++ b/test/assets/conditional_density_x_y.json
@@ -0,0 +1,6 @@
+{
+    "parents": [
+     "y"
+    ],
+    "outcome": "x"
+   }
\ No newline at end of file
diff --git a/test/assets/dataset.arrow b/test/assets/dataset.arrow
new file mode 100644
index 0000000000000000000000000000000000000000..f1e55b04002e1e66a53b5251f008e165155dc78d
GIT binary patch
literal 6946
zcmeHMdpK3;8ee<cxho09N#Qg#rBNo7QNCR%x=Lw;q%cZXPNXE!O;JiWlB5Zl)Fe%k
ziFZ*L%C+1IVPm_WogKFOS)0C@W<9eX%{k5U%pd3TeCxM<@9+1%@3+2XKby4{u3c+p
zsY4Kh)QDjagghY}(u6EEB?%UF9F8%lb4lul7((HZAS9@1O6AB>GoLySqvlR(&ZDMK
zuN@d16xJKA$s`o1bKx+idQre&QwMQsS~=`--|a>aid6iM{B9rlg>|j$op)0(ROdIp
z*oQs@@%50ONsJn<Ei6sAh^U=Ut8Gzp<cP3##0UQbAt9K=#33Z89gwA_#L!Mqp`mLK
zu8GR?6<SJv*XOr2WeoLU64X`^i>WD3?S^peqd)8^zK-F3V#F{~25N^SsVUqi;W~u*
zmei!m4c#xir?P}HRZ)Rj9gCU-QKdv3s4=XIQBi5Crf}`U_f!}cP}5Op`Y=Z0lRrRp
zvEd*_Y{48Odw8K(CNRhPW~pDLp%cup?jv0MM+0-LM~wMMq)su%I;rq`9doP;RpIaL
z$JZ^4{ngybhc6SYRSr8<vI+&+RVd9jDQ%o}1T<RR41HCc52uM0tNJww_DpN%QjeN$
z==J4F>-X@WEVO7-pRoXJ#9y~?PGtjgOuMc`aV?x+^#^5!H$bGT-1?pqX(+W^M>lLy
z0mMSHz_6hjsvb)AM_CmkGsU8<HSQfSVUgLDDE&^j_0YgK_FyMc$THDR|Dg``k=zt>
zW+^(K?C-fCs|C#rN*sG~b}bsanqzin1HooCC^9%@1o=Q?e5+T_dt}tCFaRq7J@@{n
zj;ur%A|oH^sHt+{uD_1Us@<s|r?}2w^^+>3YaHF%va|rH3*MexdY~Q*lBL4$K4^i7
zw*?W&_k-beW_o~qt^iCKm#S>_%22zPr^H>GIC!Y>^%JM0YS=)yxy0RWMsLoJZ{9o5
z1CM1S9v{9JgAA37stX(5!;Xc$G21F0!CePl8vkT1R9F`%y381aszr9|1U*flE~9a$
z=mj7BU>JC&+~gI69erA~BH$g&n9$_Ac|kYi%o-CrFNP2A+{2u;?!SYGhk8#|TW3IE
z>!k&AwRo_pe^Ys-i2%I!BG*dYCdAL3|Apn4m1yP^b^pw7Qa~YVxkOVz6VksGc_4f*
z32N3m$T+=bq@kI)M^ouNoY6k-IyJcnNoBrk(@-cw+fGy@m>%Pzi<`qdQ{&o^9aNUC
zyitext8|Sg%zuyeDU&4|HEUpW%-KGv%qH-VUgBYGT@60liJQ6_Jd~2-d}Cu)9=cex
zhC4R56Ks|KUDP68g*^MiwCXH5h#h!(+1UsIu#W`h!DB95WwrXLzvQ6r7GFr%jQY?E
zzpYQs=f#7XW8zyMehFyC{(L&7IRjP1`pZOkrJ_Xj*me7ZnIvP6ZDq?sf|N^ea?W|n
zfkXS&>ex#MA;++l=X9m2;edYKdd-45@FDGYNeHS@o^45(Rdze-<aU^y@2x=BqTJ$7
z9{4x<COUcY=tG6*U;A`5IwsUYd*c2qj%qE;u=85L3FN@B@gBiCyB|S*jaFRVoo4WQ
z_9{Fyh6_hc>n6F`B*3o5g0gZC9!!i>zIs|lj3kndb_6&x$P(7A$=6Iu5#!qH>4n#e
z;b}nN?&P*YSSNO5&G>l&)EL}<_*vg&kWQ;flr!i+?!9}Bx15q8Ew9-f_l|Ew?~LuO
zj1As^LGI!ii%qEe=ex!Ha%>w~?lCpdx}1*!eZ`rZV|dV;@nB9)YdL(~7e8oiTMQ07
z@oD2P7QwVRI~v3-TH%$^<mAS&xlm~rGS@qS12NCn@5?@42B{_AFY^m+2bJ`@o+a5C
z$f*17<vqF8D7bN*to588NHRYD#<jN)=5{*Qk5Xi@$pZNUriOK>`L53|1Jg2~RqOh+
zG}X(nXx0+DQ^hH0_v;NsuBNXcv9{@Yph`We;J9qAUiKEw$>qoMIK2>V>z><Q&VxmM
zRV&;Ln?RLm?h|BLgEl^RK5?>s9Y}w*HeLCvYGC&7U!mjw9>$m_Ob;2=26yJ3KXuLi
z6|9{;akI>%BxEJFrYpCL3*PD{*kd!Yk$SStZsZ$<#8HLagDC}Y_@YYbrgI$Vc{qQG
ziE1LU7T?#TlhX;uUE?a;2in2sWU=+m!=0#ds)x0|!6oGX(xfI*x(|)=i3u<J#|vbz
z&CL8raz3P-+j8rQS0jo#apP2ySSGSe_YF;6S%^3%CSC0R`4-%I!qrN0;X{C{QiH8i
z2<#YLidwXaVD=dOa7&va<Rta(pd*=wR!^CyaAEdKSme#CTDYbP*6dQa6z$i5e0MST
z1hQLDl;oMDkim<f8xoUmKdlStFHl>zI;REHq#tQ`SrmYNUZ(cLLN1&t8kqk5`DPev
zqPM$#G8djbm|_uR(*;W}D8EQ&W<#**lTeR>Qn;ypr?AVh743fzv?wQ~1DR+$%I$K<
zL8{1g*`~)V@`O>1_xcoqY<GllPBY)2aS;NqfDO-(&G%umfAkQ*a^r^wmP})@`&u4P
z=K0ltqW<+AsXykxeRJ{t()e~bc6er}gl#8$ySyz`O<jU)dv)l#YDfmU95!*@$`wC?
z@H<L4gx~)k56m%sto=9VxDG~o4IIaab<F8V*XbPlMa8j>5&JOWI7WJWBrYmXR2<jC
z_-B3CFAC{(MfvD)dY&jgQk<@j)CcQ04<nA#aU@-oALosPI3N2&aU7>3JrC=m;#kK>
z_hBvy>3M(Fhx125TvwEju49goo`>TY>H6p9qWa*tDEwUO!1d|%{%joQ<2*W!q>tps
z`53YPr>l;r>I@fo*_x;OJqiNe6L!;7ehYf*_J!_cr4l&lRd}v)n>aggbpXqMRy!D@
zZ@b6m5@eg~wENQRcF5lwaHk0|NM|$2(_>rT09p2y?9zD&Nm6$sWmw%1>e%0xwUdW*
z^{?`G{ltZ~jgi)Ile$5JzrjkA&jTOLr~#LIOww34`m2?*I?&=lw&wQ4TBubqYCW^P
z2l=(tjSjxjh3-3yj!osYBc)M}A@Y+2sQc;Mfv9Z^($n6ewrkb^($C2$_{FCYrMeyD
zUz2%<B=fpExb+M&J3%SGYFRhP?<qf~s@8}6bM5v!RV092Nr}GRn0ka(-&o}=&mhCL
zENe+$UkB=1UOUoa`CzT&BN6ZM7~;dX&Wh6<L<$$8<EyXr!<FhbM#9!t5E?m8dApQ2
zX+KBF;-({$ZM&YOGremNwuES(8D~?7JolT~YASTV*eyJ_@`IyD%K=UKOx;dMuy4NR
zB_}}A?#)sB4NV~UF6kSG36(JJbjYUb5(02!ddsGjwZoSzKjyg7EVO9O<NZ#G1X&un
zIoU*xLHg+Y98;zyMlNZ#PByw;2)mR|9(}g91@0Iq7kXZ*MwW6$O2?cyC`HArZ~aMv
zteNMXAkO8Y84>~Ji{<zbky3d(UhoFITD9Nalz0kLS6Qhz7<D6`_YH?#O_NZKy-}8f
z+)Lzi`q}dnmwHe~H7A~7B1Xm-MdYbWYK8#D@uTH>^-z36as2!PJXFyVu9IfYhb0Os
zDi(YO*~ZD$wlgHy%Bwb(&35Pn)#Q5`^%n?o()w^U6-zNP*uS&?-sCp4U2ohW)o2!*
zJzB$oY55LV?D>)|M)}A^VJl;S>N6A*t7i}$nFv<l`A@G3y1>vhr7!xP6xmy}M%klU
z0AEU)#CIGa*xJ=+t{h<Z!L|iT!3qUTQoON_Wn)`~wre*Yp3iPaOD$?<SKiJ7p9`T|
zR=f0|jXH*&xf^Rxb`awzGnheITw2phSk<G%8cxfu>?Wvsc0@jWVg|DJzrhmk>HhWa
zEBtw(BmMJ4*FP5*)d$Dvi2LHWD30TQ7o@NEuU^-u_WM-d=UVUo+Ha(Mx=!bxdL1~A
z`5}QtL#)%|xIWIqKFqOSR2=&-;(DU;a2>4UJUYj9u%FK9d2}6fjPyD<ALB?qtdHc!
zedyPNIUTVdBkqHJIF9?`I7Yf3bGjdMtc%LWKHL}k=t!@F^Kcv^u7f$QgE`K_am+t8
z5BqUFTn8i8e|ruu|LZe-;iGKfoIRAx8v5To@k8Ud4~U=B3oZYDzw-BazmiYgpM^v3
G#{L5pAcI=~

literal 0
HcmV?d00001

diff --git a/test/assets/density_estimators.jl b/test/assets/density_estimators.jl
new file mode 100644
index 0000000..d94efb7
--- /dev/null
+++ b/test/assets/density_estimators.jl
@@ -0,0 +1,8 @@
+function get_density_estimators(X, y;batchsize=16)
+    snne = SieveNeuralNetworkEstimator(X, y; 
+        hidden_sizes_candidates=[(20,), (20, 20)], 
+        batchsize=batchsize
+    )
+    glm = GLMEstimator(X, y)
+    return [snne, glm]
+end
\ No newline at end of file
diff --git a/test/assets/estimands/estimands_ates.jls b/test/assets/estimands/estimands_ates.jls
new file mode 100644
index 0000000000000000000000000000000000000000..894f0879d42e1b5aac041469ecf34c1528cbf0d4
GIT binary patch
literal 569
zcmXr_@{wR+U|=v2VB~eq&r8cpFD*(e$;{7_=RSYpCQs_dr0)C9#d*Ik{q1683GwxD
zbrBU{<Zvx6$;?g6OTnq=5zsuo;1ZxoK$XdfIgTN&IMp(;MkeRyl>l{c`6cG2ri7Fh
z<fO{`&30h|x=Vx+Xzin>MuA|E2qRELK!<|?F3ZHolAKta%E%5fp(sBGq70;wfrEj=
zM2Cyv!F>}H9hm09O=e&O+Qt|z!WeI&!^*(O26R>_&|Nz03{Zn~ctGxfs1RTjaL&&y
z$S+P!As8A!ukkBq<!9!V#HSJpIT0qX<t&j&nR%!#2l^Eh5@50k;ZdL;fe7Y8Ob_GI
f#>b$;2lEb+uOSxe2!l<${{Unzh!jOt>gNXl-cOy?

literal 0
HcmV?d00001

diff --git a/test/assets/estimands/estimands_iates.jls b/test/assets/estimands/estimands_iates.jls
new file mode 100644
index 0000000000000000000000000000000000000000..baa825f175e931bdcca3b6fd954e78e128e15467
GIT binary patch
literal 320
zcmXr_@{wR+U|=v2VB~eq&r8cpFD*(e$;{7_=RSYpCQs_dr0)C9#d*Ik{q1683GwxD
zbrBU{<PR<ZswpnXOis-4bPRFDsrI3X4l@HIYh-eMUJ1|!F2BUw)Rd6Yf}B)&zu7KK
zKzl`)fW|&*Y6O!_0>LnG0Ub657#pM}Ik7mEksV}CQGO0W83P9chYkmW2@pTH4<*sH
sVl^IY5hKt642(KF3?ht-jNwos9_(h8$fV3X6CGAmqjk7YmHPPs02<R%WdHyG

literal 0
HcmV?d00001

diff --git a/test/inputs_from_gene_atlas.jl b/test/inputs_from_gene_atlas.jl
index 4230d9b..a207c89 100644
--- a/test/inputs_from_gene_atlas.jl
+++ b/test/inputs_from_gene_atlas.jl
@@ -5,13 +5,21 @@ using Test
 using Simulations
 using Serialization
 using JSON
+using TargeneCore
+using Arrow
+using DataFrames
+
+TARGENCORE_PKGDIR = pkgdir(TargeneCore)
+
+TARGENCORE_TESTDIR = joinpath(TARGENCORE_PKGDIR, "test")
 
 PKGDIR = pkgdir(Simulations)
+
 TESTDIR = joinpath(PKGDIR, "test")
 
 include(joinpath(TESTDIR, "testutils.jl"))
 
-function test_estimands()
+function gene_atlas_estimands()
     return [
         factorialEstimand(ATE, (rs502771=["TT", "TC", "CC"],), "sarcoidosis";
             confounders=[:PC1, :PC2, :PC3], 
@@ -32,10 +40,53 @@ function test_estimands()
     ]
 end
 
-function save_test_estimands(outdir)
-    estimands = test_estimands()
-    serialize(joinpath(outdir, "estimands_1.jls"), TMLE.Configuration(estimands=estimands[1:2]))
-    serialize(joinpath(outdir, "estimands_2.jls"), TMLE.Configuration(estimands=estimands[3:end]))
+function estimands_and_traits_to_variants_matching_bgen()
+    estimands = [
+        IATE(
+            outcome = "BINARY_1",
+            treatment_values = (RSID_2 = (case = "AA", control = "GG"), TREAT_1 = (case = 1, control = 0)),
+            treatment_confounders = (RSID_2 = [], TREAT_1 = [])
+        ),
+        ATE(
+            outcome = "BINARY_2",
+            treatment_values = (RSID_2 = (case = "AA", control = "GG"),),
+            treatment_confounders = (RSID_2 = [22001], ),
+            outcome_extra_covariates = ["COV_1", 21003]
+        ),
+        CM(
+            outcome = "CONTINUOUS_2",
+            treatment_values = (RSID_2 = "AA", ),
+            treatment_confounders = (RSID_2 = [22001],),
+            outcome_extra_covariates = ["COV_1", 21003]
+        ),
+        ATE(
+            outcome = "CONTINUOUS_2",
+            treatment_values = (RSID_2 = (case = "AA", control = "GG"), RSID_198 = (case = "GA", control = "AA")),
+            treatment_confounders = (RSID_2 = [], RSID_198 = []),
+            outcome_extra_covariates = [22001]
+        ),
+        ComposedEstimand(
+            TMLE.joint_estimand, (
+                CM(
+                outcome = "BINARY_1",
+                treatment_values = (RSID_2 = "GG", RSID_198 = "GA"),
+                treatment_confounders = (RSID_2 = [], RSID_198 = []),
+                outcome_extra_covariates = [22001]
+            ),
+                CM(
+                outcome = "BINARY_1",
+                treatment_values = (RSID_2 = "AA", RSID_198 = "GA"),
+                treatment_confounders = (RSID_2 = [], RSID_198 = []),
+                outcome_extra_covariates = [22001]
+            ))
+        )
+    ]
+    traits_to_variants = Dict(
+        "BINARY_1" => ["RSID_2", "RSID_198"],
+        "CONTINUOUS_2" => ["RSID_2", "RSID_198"],
+        "BINARY_2" => ["RSID_2"],
+        )
+    return estimands, traits_to_variants
 end
 
 @testset "Test get_trait_to_variants_from_estimands" begin
@@ -81,62 +132,88 @@ end
     )
 end
 
-@testset "Test density_estimation_inputs_from_gene_atlas" begin
+@testset "Test simulation_inputs_from_gene_atlas" begin
+    # The function `simulation_inputs_from_gene_atlas` is hard to test end to end due to
+    # data limitations. It is split into 3 subfunctions that we here test sequentially but 
+    # with different data.
+    verbosity = 0
+    # Here we use the real geneATLAS data
     tmpdir = mktempdir()
-    save_test_estimands(tmpdir)
-    outprefix = joinpath(tmpdir, "ga_sim_input")
-    copy!(ARGS, [
-        "simulation-inputs-from-ga",
-        joinpath(tmpdir, "estimands"),
-        string("--ga-download-dir=", joinpath(tmpdir, "gene_atlas_data")),
-        "--remove-ga-data=true",
-        string("--ga-trait-table=", joinpath(PKGDIR, "assets", "Traits_Table_GeneATLAS.csv")),
-        "--maf-threshold=0.01",
-        "--pvalue-threshold=1e-5",
-        "--distance-threshold=1e6",
-        string("--output-prefix=", outprefix),
-        "--batchsize=2",
-        "--max-variants=10"
-    ])
-    Simulations.julia_main()
-    # Estimand files
-    ## sarcoidosis has 3 estimands -> split in two batches (batchsize=2)
-    ## G20 Parkinson's disease has 1 estimand -> 1 file
-    estimands_files = filter(x -> occursin("ga_sim_input_estimands", x), readdir(tmpdir, join=true))
-    all_estimands = []
-    for f ∈ estimands_files
-        estimands = deserialize(f).estimands
-        append!(all_estimands, estimands)
-        if length(estimands) == 1
-            continue
-        elseif length(estimands) == 2
-            @test all(Simulations.get_outcome(Ψ) === :sarcoidosis for Ψ in estimands)
-        else
-            throw(ArgumentError("Batchsize should be max 2."))
-        end
-    end
-    @test length(all_estimands) == 4
+    estimands = gene_atlas_estimands()
+    trait_to_variants = Simulations.get_trait_to_variants(
+        estimands; 
+        verbosity=0, 
+        gene_atlas_dir=joinpath(tmpdir, "gene_atlas_data"),
+        remove_ga_data=true,
+        trait_table_path=joinpath("assets", "Traits_Table_GeneATLAS.csv"),
+        maf_threshold=0.01,
+        pvalue_threshold=1e-5,
+        distance_threshold=1e6,
+        max_variants=10,
+    )
+    @test length(trait_to_variants["sarcoidosis"]) == 10
+    @test issubset(("rs502771", "rs184270108"), trait_to_variants["sarcoidosis"])
+    @test length(trait_to_variants["G20 Parkinson's disease"]) == 10
+    @test issubset(("rs11868112", "rs6456121", "rs356219"), trait_to_variants["G20 Parkinson's disease"])
+
+    # Dataset and validated estimands
+    # We change the data to match what is in the BGEN file
+    estimands, trait_to_variants = estimands_and_traits_to_variants_matching_bgen()
+    bgen_prefix = joinpath(TARGENCORE_TESTDIR, "data", "ukbb", "imputed" ,"ukbb")
+    traits_file = joinpath(TARGENCORE_TESTDIR, "data", "traits_1.csv")
+    pcs_file = joinpath(TARGENCORE_TESTDIR, "data", "pcs.csv")
+    positivity_constraint = 0
+    call_threshold = 0.9
+    dataset, validated_estimands = Simulations.get_dataset_and_validated_estimands(
+        estimands,
+        bgen_prefix,
+        traits_file,
+        pcs_file,
+        trait_to_variants;
+        call_threshold=call_threshold,
+        positivity_constraint=positivity_constraint,
+        verbosity=verbosity
+    )
+    @test names(dataset) == ["SAMPLE_ID", "BINARY_1", "BINARY_2", "CONTINUOUS_1", "CONTINUOUS_2", "COV_1",
+        "21003", "22001", "TREAT_1", "PC1", "PC2", "RSID_2", "RSID_198"]
+    @test size(dataset, 1) == 490
+    @test length(estimands) == length(validated_estimands)
+    # Check estimands have been matched to the dataset: GA -> AG
+    Ψ = validated_estimands[findfirst(x->x isa TMLE.ComposedEstimand, validated_estimands)]
+    @test Ψ.args[1].treatment_values.RSID_198 == "AG"
+    @test Ψ.args[2].treatment_values.RSID_198 == "AG"
 
-    # Conditional densities
-    ## There should be 2 + n_variants densities
-    ## 10 variants per trait = 20 variants
-    requested_variants = ["rs502771", "rs184270108", "rs11868112", "rs6456121"]
-    variants = open(readlines, string(outprefix, "_variants.txt"))
-    @test issubset(requested_variants, variants)
-    @test length(variants) == 20 
-    conditional_density_files = filter(x -> occursin("ga_sim_input_conditional_density", x), readdir(tmpdir, join=true))
-    @test length(conditional_density_files) == length(variants) + 2
-    density_targets = Set([])
-    for f in conditional_density_files
-        conditional_density = JSON.parsefile(f)
-        push!(density_targets, conditional_density["outcome"])
-        if conditional_density["outcome"] ∈ variants
-            @test Set(conditional_density["parents"]) == Set(["PC2", "PC1", "PC3"])
-        else
-            @test issubset(["PC2", "PC1", "Age-Assessment", "PC3", "Genetic-Sex"], conditional_density["parents"])
-        end
+    # Now the writing
+    output_prefix = joinpath(tmpdir, "ga.input")
+    batchsize = 2
+    Simulations.write_ga_simulation_inputs(
+        output_prefix,
+        dataset,
+        validated_estimands,
+        trait_to_variants;
+        batchsize=batchsize,
+        verbosity=verbosity
+    )
+    # One dataset
+    loaded_dataset = Arrow.Table(string(output_prefix, ".data.arrow")) |> DataFrame
+    @test size(loaded_dataset) == (490, 13)
+    # 5 estimands split in 3 files of size (2, 2, 1)
+    loaded_estimands = reduce(
+        vcat, 
+        deserialize(string(output_prefix, ".estimands_$i.jls")).estimands for i in 1:3
+    )
+    @test loaded_estimands == validated_estimands
+    # 3 densities for the 3 outcomes
+    outcome_to_parents = Dict()
+    for i in 1:3
+        density = JSON.parsefile(string(output_prefix, ".conditional_density_$i.json"))
+        outcome_to_parents[density["outcome"]] = sort(density["parents"])
     end
-    @test density_targets == Set(vcat(variants, ["sarcoidosis", "G20 Parkinson's disease"]))
+    @test outcome_to_parents == Dict(
+        "BINARY_2"     => ["21003", "22001", "COV_1", "PC1", "PC2", "RSID_2"],
+        "CONTINUOUS_2" => ["21003", "22001", "COV_1", "PC1", "PC2", "RSID_198", "RSID_2"],
+        "BINARY_1"     => ["22001", "PC1", "PC2", "RSID_198", "RSID_2", "TREAT_1"]
+    )
 end
 
 end
diff --git a/test/null_simulation.jl b/test/null_simulation.jl
index 4eb0cac..efdcb24 100644
--- a/test/null_simulation.jl
+++ b/test/null_simulation.jl
@@ -24,7 +24,7 @@ include(joinpath(TESTDIR, "testutils.jl"))
 @testset "Integration Test Null Simulation" begin
     inputdir = mktempdir()
     estimands_filename = joinpath(inputdir, "estimands.yaml")
-    save_integration_test_configuration(estimands_filename)
+    save_configuration_macthing_bgen(estimands_filename)
     # Creating the inputs
     parsed_args = Dict(
         "out-prefix" => joinpath(inputdir, "final"), 
diff --git a/test/runtests.jl b/test/runtests.jl
index 2e2a88e..3ff6176 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -18,5 +18,4 @@ TESTDIR = joinpath(pkgdir(Simulations), "test")
 
     # Integration Tests
     @test include(joinpath(TESTDIR, "null_simulation.jl"))
-    @test include(joinpath(TESTDIR, "gene_atlas_simulation.jl"))
 end
diff --git a/test/samplers/density_estimate_sampler.jl b/test/samplers/density_estimate_sampler.jl
index 422110a..bd711e3 100644
--- a/test/samplers/density_estimate_sampler.jl
+++ b/test/samplers/density_estimate_sampler.jl
@@ -23,38 +23,77 @@ TESTDIR = joinpath(pkgdir(Simulations), "test")
         train_ratio=10,
         verbosity=0
     )
-    # Learn T1
-    density_file = joinpath(TESTDIR, "assets", "conditional_density_T1.json")
+    # Learn Ycont
+    density_file = joinpath(TESTDIR, "assets", "conditional_density_Ycont.json")
     density_estimation(
         dataset_file,
         density_file;
         mode="test",
-        output=joinpath(density_dir, "density_T1.hdf5"),
+        output=joinpath(density_dir, "density_Ycont.hdf5"),
         train_ratio=10,
         verbosity=0
     )
-    # Create the sampler
+    # Scenario 1:
+    # only 1 estimand, not all parents of Ybin are in the estimand's variables.
+    # This is to make sure variants that are used to generated the 
+    # data but not required afterwards for estimation are discarded 
+    # in the sampled dataset
     estimands = [ATE(
         outcome=:Ybin,
         treatment_values=(T₁=(case=1, control=0),),
         treatment_confounders=(:W,),
-        outcome_extra_covariates=(:C,)
         )]
     prefix = joinpath(density_dir, "density")
     sampler = DensityEstimateSampler(prefix, estimands)
-    @test Set(sampler.sources) == Set([:W, :C])
-    @test sampler.treatment_density_mapping == Dict((:T₁=>(:W,)) => joinpath(density_dir, "density_T1.hdf5"))
-    @test sampler.outcome_density_mapping == Dict((:Ybin=>(:C, :T₁, :W)) => joinpath(density_dir, "density_Ybin.hdf5"))
+    @test sort(sampler.all_parents_set) == [:C, :T₁, :W]
+    @test sort(sampler.variables_required_for_estimation) == [:T₁, :W, :Ybin]
+    @test sampler.density_mapping == Dict(:Ybin => ([:C, :T₁, :W], joinpath(density_dir, "density_Ybin.hdf5")))
     # Sample
     origin_dataset = DataFrame(Arrow.Table(dataset_file))
-    sampled_dataset = sample_from(sampler, origin_dataset, n=100)
+    sampled_dataset = sample_from(sampler, origin_dataset, n=50)
+    ## Check parents are given by the empirical
+    parents = [:W, :T₁]
+    all_origin_rows = collect(eachrow(origin_dataset[!, parents]))
+    for row in eachrow(sampled_dataset[!, parents])
+        @test row ∈ all_origin_rows
+    end
+    @test names(sampled_dataset) == string.(sampler.variables_required_for_estimation)
+    @test size(sampled_dataset, 1) == 50
+    @test sampled_dataset.Ybin isa CategoricalVector
 
-    @test names(sampled_dataset) == ["W", "C", "T₁", "Ybin"]
-    @test size(sampled_dataset, 1) == 100
-
-    # True effects
-    Ψ = estimands[1]
-    effect = true_effect(Ψ, sampler, origin_dataset; n=1000)
+    # Scenario 2:
+    # Two estimands
+    estimands = [ATE(
+        outcome=:Ybin,
+        treatment_values=(T₁=(case=1, control=0),),
+        treatment_confounders=(:W,),
+        ),
+        factorialEstimand(
+            IATE,
+            (T₁=[0, 1], T₂=[0, 1]),
+            :Ycont,
+            confounders=(:W,),
+            outcome_extra_covariates=(:C,)
+        )
+    ]
+    sampler = DensityEstimateSampler(prefix, estimands)
+    @test sort(sampler.all_parents_set) == [:C, :T₁, :T₂, :W]
+    @test sort(sampler.variables_required_for_estimation) == [:C, :T₁, :T₂, :W, :Ybin, :Ycont]
+    @test sampler.density_mapping == Dict(
+        :Ybin => ([:C, :T₁, :W], joinpath(density_dir, "density_Ybin.hdf5")),
+        :Ycont => ([:W, :T₁, :T₂, :C], joinpath(density_dir, "density_Ycont.hdf5"))
+        )
+    # Sample
+    origin_dataset = DataFrame(Arrow.Table(dataset_file))
+    sampled_dataset = sample_from(sampler, origin_dataset, n=50)
+    ## All parents are retained
+    all_origin_rows = collect(eachrow(origin_dataset[!, sampler.all_parents_set]))
+    for row in eachrow(sampled_dataset[!, sampler.all_parents_set])
+        @test row ∈ all_origin_rows
+    end
+    @test names(sampled_dataset) == string.(sampler.variables_required_for_estimation)
+    @test size(sampled_dataset, 1) == 50
+    @test sampled_dataset.Ybin isa CategoricalVector
 end
 
 end
diff --git a/test/samplers/null_sampler.jl b/test/samplers/null_sampler.jl
index b11a8fc..f0a3371 100644
--- a/test/samplers/null_sampler.jl
+++ b/test/samplers/null_sampler.jl
@@ -47,22 +47,6 @@ include(joinpath(TESTDIR, "testutils.jl"))
     ))
     @test_throws AssertionError("All estimands should share the same confounders and covariates.") NullSampler(estimands)
 
-    # True effects
-    ## Continuous Outcome
-    Ψ = estimands[1]
-    @test theoretical_true_effect(Ψ, sampler) == 0
-    @test empirical_true_effect(Ψ, sampler, origin_dataset; n=100_000) ≈ 0. atol=0.01
-    @test true_effect(Ψ, sampler, origin_dataset) == 0
-    ## Count Outcome
-    Ψ = estimands[2]
-    @test true_effect(Ψ, sampler, origin_dataset) == 0
-    ## Composed Estimand / Binary Outcome
-    Ψ = estimands[3]
-    @test theoretical_true_effect(Ψ, sampler) == [0, 0]
-    composed_effect = empirical_true_effect(Ψ, sampler, origin_dataset; n=100_000)
-    @test composed_effect[1] == - composed_effect[2]
-    @test composed_effect[1] ≈ 0 atol=0.01
-    @test true_effect(Ψ, sampler, origin_dataset) == [0, 0]
 end
 
 end
diff --git a/test/testutils.jl b/test/testutils.jl
index 02b1b7a..8fcb0a2 100644
--- a/test/testutils.jl
+++ b/test/testutils.jl
@@ -107,7 +107,7 @@ function write_linear_interaction_dataset_estimands()
     serialize("test/assets/estimands/estimands_ates.jls", linear_interaction_dataset_ATEs())
 end
 
-function make_integration_test_configuration()
+function estimands_and_traits_to_variants_matching_bgen()
     estimands = [
         IATE(
             outcome = "BINARY_1",
@@ -128,7 +128,7 @@ function make_integration_test_configuration()
         ),
         ATE(
             outcome = "CONTINUOUS_2",
-            treatment_values = (RSID_2 = (case = "AA", control = "GG"), RSID_198 = (case = "AG", control = "AA")),
+            treatment_values = (RSID_2 = (case = "AA", control = "GG"), RSID_198 = (case = "GA", control = "AA")),
             treatment_confounders = (RSID_2 = [], RSID_198 = []),
             outcome_extra_covariates = [22001]
         ),
@@ -136,22 +136,28 @@ function make_integration_test_configuration()
             TMLE.joint_estimand, (
                 CM(
                 outcome = "BINARY_1",
-                treatment_values = (RSID_2 = "GG", RSID_198 = "AG"),
+                treatment_values = (RSID_2 = "GG", RSID_198 = "GA"),
                 treatment_confounders = (RSID_2 = [], RSID_198 = []),
                 outcome_extra_covariates = [22001]
             ),
                 CM(
                 outcome = "BINARY_1",
-                treatment_values = (RSID_2 = "AA", RSID_198 = "AG"),
+                treatment_values = (RSID_2 = "AA", RSID_198 = "GA"),
                 treatment_confounders = (RSID_2 = [], RSID_198 = []),
                 outcome_extra_covariates = [22001]
             ))
         )
     ]
-    return Configuration(estimands=estimands)
+    traits_to_variants = Dict(
+        "BINARY_1" => ["RSID_2", "RSID_198"],
+        "CONTINUOUS_2" => ["RSID_2", "RSID_198"],
+        "BINARY_2" => ["RSID_2"],
+        )
+    return estimands, traits_to_variants
 end
 
-function save_integration_test_configuration(path)
-    config = make_integration_test_configuration()
+function save_configuration_macthing_bgen(path)
+    estimands, _ = estimands_and_traits_to_variants_matching_bgen()
+    config = TMLE.Configuration(estimands=estimands)
     TMLE.write_yaml(path, config)
-end
\ No newline at end of file
+end