Summary stat fix (#120)

* edits for DFE-alpha * updates based on code review * more updates based on code review * doc string edit * skip empty intervals patch * fixing empty intervals bug * fixing empty intervals bug * dropping dfe-alpha (with minimal changes) * fixes for summary stats and runnign it
popsim-consortium · Oct 2, 2024 · 73cac2a · 73cac2a
1 parent a20dfd3
commit 73cac2a
Show file tree

Hide file tree

Showing 7 changed files with 1,260 additions and 54 deletions.
diff --git a/notebooks/DFE_figures.ipynb b/notebooks/DFE_figures.ipynb
diff --git a/workflows/config/snakemake/production_config_PhoSin.yml b/workflows/config/snakemake/production_config_PhoSin.yml
@@ -2,7 +2,7 @@
 "seed": 12345
 "replicates": 3
 "output_dir": "PhoSin_results"
-"workflow_path": "workflows/"
+"workflow_path": "./workflows/"
 
 # species-specific settings
 "species": "PhoSin"

diff --git a/workflows/dfe.snake b/workflows/dfe.snake
@@ -27,7 +27,7 @@ np.random.seed(config["seed"])
 replicates = config["replicates"]
 
 #workflow path in case it's not the current directory
-workflow_path = config.get("workflow_path", "./")
+workflow_path = config.get("workflow_path", "./workflows")
 
 # Where you would like all output files from analysis to live
 output_dir = os.path.abspath(config["output_dir"])
@@ -85,7 +85,7 @@ rule all:
         expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf",
             demog=demo_model_id_list,
             dfes=dfe_list,
-            annots=annotation_list )
+            annots=annotation_list)
 
 
 # ###############################################################################
@@ -165,7 +165,7 @@ rule dadi_infer_dfe:
         opts = 100,
         prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma"
     resources:
-        time=4000
+        time=10080
     threads: 8
     shell:
         """
@@ -290,7 +290,7 @@ rule generate_dfe_alpha_fs:
             ts_dict[chrm] = fp
         index = int(wildcards.ids)
         ts2fs.generate_fs(ts_dict, index, mask_file, wildcards.annots,
-                  species, output, format='DFE-alpha', is_folded=True,
+                  species, output, format='DFE-alpha', is_folded=False,
                   data_path_1=dfe_alpha_data_path_1, data_path_2=dfe_alpha_data_path_2,
                   sfs_input_file=output[2], est_dfe_results_dir=params.output,
                   est_dfe_demography_results_file=params.output+"/neu/est_dfe.out")
@@ -404,13 +404,13 @@ rule plot_results:
             annots=annotation_list,
             ids=pids,
         )),
-        dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
-            demog=demo_model_id_list,
-            seeds=seed_list,
-            dfes=dfe_list,
-            annots=annotation_list,
-            ids=pids,
-        )),
+        #dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
+        #    demog=demo_model_id_list,
+        #    seeds=seed_list,
+        #    dfes=dfe_list,
+        #    annots=annotation_list,
+        #    ids=pids,
+        #)),
         grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
             demog=demo_model_id_list,
             seeds=seed_list,
@@ -438,7 +438,7 @@ rule plot_results:
 
         dadi_bestfits = [ b for b in input.dadi_res if wildcards.demog in b]
         polydfe_bestfits = [ b for b in input.polydfe_res if wildcards.demog in b]
-        dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
+        #dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
         grapes_bestfits = [ b for b in input.grapes_res if wildcards.demog in b ]
 
         if wildcards.demog == 'Constant': 
@@ -447,7 +447,7 @@ rule plot_results:
 
             dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ]
             polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ]
-            dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
+            #dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
             grapes_bestfits = [ b for b in grapes_bestfits if 'pop0' in b ]
         else: 
             model = species.get_demographic_model(wildcards.demog)
@@ -456,4 +456,6 @@ rule plot_results:
                 mutation_rate = species.genome.mean_mutation_rate
 
         pop_names = [ model.populations[i].name for i in range(len(model.populations)) ]
-        plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
+        #plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
+        plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
+
diff --git a/workflows/plots.py b/workflows/plots.py
@@ -246,8 +246,8 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
     """
     dadi_bestfits = _read_bestfits(input[0], len(pop_names))
     polydfe_bestfits = _read_bestfits(input[1], len(pop_names))
-    dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
-    grapes_bestfits = _read_bestfits(input[3], len(pop_names))
+    #dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
+    grapes_bestfits = _read_bestfits(input[2], len(pop_names))
 
     # SLiM assumes genotype fitnesses: 1, 1+sh, 1+s
     # dadi and polyDFE assumes genotype fitnesses: 1, 1+2sh, 1+2s
@@ -273,10 +273,10 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
 
     # Per https://doi.org/10.1534/genetics.107.080663 under Model
     # DFE alpha assumes genotype fitnesses are 1, 1-s/2, 1-s, so like SLiM
-    dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
-                        for i in range(len(dfe_alpha_bestfits))]
-    dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
-                    for i in range(len(dfe_alpha_bestfits))]
+    #dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
+    #                    for i in range(len(dfe_alpha_bestfits))]
+    #dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
+    #                for i in range(len(dfe_alpha_bestfits))]
 
     # Per https://github.com/BioPP/grapes/blob/master/README.md,
     #   GRAPES implements the model of DFE alpha, so we expect genotype fitnesses 1, 1-s/2, and 1-s.
@@ -311,12 +311,12 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
     plt.subplot(4, 2, 4)
     _plot_boxplots(plt, 'polyDFE', polydfe_Es, 0.014, xtick_label, '|E(s)|')
 
-    plt.subplot(4, 2, 5)
-    _plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
-                   0.19, xtick_label, 'Shape')
-    plt.subplot(4, 2, 6)
-    _plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
-                   0.014, xtick_label, '|E(s)|')
+    #plt.subplot(4, 2, 5)
+    #_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
+    #              0.19, xtick_label, 'Shape')
+    #plt.subplot(4, 2, 6)
+    #_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
+    #               0.014, xtick_label, '|E(s)|')
 
     plt.subplot(4, 2, 7)
     _plot_boxplots(plt, 'grapes', grapes_shapes, 0.19,

diff --git a/workflows/summary_stats.sh b/workflows/summary_stats.sh
@@ -0,0 +1,12 @@
+#config=workflows/config/snakemake/production_config_HomSap.yml
+config=workflows/config/snakemake/production_config_PhoSin.yml
+
+for i in {1..20};
+do
+    snakemake \
+    --snakefile workflows/summary_stats.snake \
+    --profile workflows/config/snakemake/oregon_profile/ \
+    --configfile $config \
+    --batch all=$i/20 \
+    --groups run_diploshic_fvs=group0
+done
diff --git a/workflows/summary_stats.snake b/workflows/summary_stats.snake
@@ -26,7 +26,6 @@ output_dir = os.path.abspath(config["output_dir"])
 # The analysis species
 species = stdpopsim.get_species(config["species"])
 
-
 # The names of all chromosomes to simulate, separated by commas
 # Use "all" to simulate all chromsomes for the genome
 chrm_list = [chrom.id for chrom in species.genome.chromosomes]
@@ -47,22 +46,19 @@ for x in demo_model_array:
     else:
         model = species.get_demographic_model(x["id"])
     demo_sample_size_dict[x["id"]] = {f"{model.populations[i].name}": m for i, m in enumerate(x["num_samples_per_population"])}
-    demo_pop_ids[x["id"]] = [x.name for x in model.populations]
+    demo_pop_ids[x["id"]] = [x.name for x in model.populations[:len(x["num_samples_per_population"])]]
 
 # Select DFE model from catalog  
 dfe_list = config["dfe_list"]
 annotation_list = config["annotation_list"]
 mask_file = config["mask_file"]
 
-
-
 nchunks=100
 chunks = np.arange(nchunks)
 # ###############################################################################
 # GENERAL RULES & GLOBALS
 # ###############################################################################
 
-
 rule all:
     input:
         expand(
@@ -92,14 +88,9 @@ rule all:
             chrms=chrm_list,
         ),
 
-
-
-
-
 rule make_vcf:
     input:
         output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"
-
     output:
         output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.vcf"
     run:
@@ -128,10 +119,9 @@ def write_diploshic_ancestralAllelesFile(ts, filename):
 rule make_diploshic_inputs:
     input:
         output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"
-
     output:
-        temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile"),
-        temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile")
+        output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile",
+        output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile"
     run:
         ts = tskit.load(input[0])
         write_diploshic_sampleToPopFile(ts, output[0], wildcards.demog)
@@ -156,22 +146,17 @@ rule run_diploshic_fvs:
     input:
         rules.make_vcf.output,
         rules.make_diploshic_inputs.output
-
     output:
-        temp(
             expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{{chunk}}.diploshic.{ext}",
             chunk=chunks,
             ext=["fvec", "stats"],        
             )
-        )
-
     params:
         seq_len = lambda wildcards, input: int(species.get_contig(wildcards.chrms).length), 
         start = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[0],
         end = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[1],
         test = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[2],
         popid = lambda wildcards, input: wildcards.popid,
-
     shell:
         """
         if [[ {params.test} -eq 1 && `diploSHIC fvecVcf haploid {input[0]} 1 {params.seq_len} {output[0]} --sampleToPopFileName {input[1]} --ancFileName {input[2]} --targetPop {params.popid} --statFileName {output[1]} --segmentStart {params.start} --segmentEnd {params.end}` == 0 ]]; 
@@ -181,8 +166,7 @@ rule run_diploshic_fvs:
                 touch {output[0]} && touch {output[1]}
         fi
         """
-
-
+
 rule gather_diploshic_fvs:
     input:
          expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{chunk}.diploshic.{ext}",
@@ -222,7 +206,7 @@ rule plot_pi_individual:
         sns.lineplot(x="mid", y="pi", data=df, ax=ax, linewidth=2.5, palette="tab10")
 
         # plot annotations as rugplot
-        if wildcards.annots != "none":
+        if wildcards.annots not in ["none", "all_sites"]:
             exons = species.get_annotations(wildcards.annots)
             exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
             sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}), ax=ax, color="g", lw=1, alpha=0.05)
@@ -266,7 +250,7 @@ rule plot_pi_allseeds:
         for i, stat in enumerate(stat_names):
             sns.lineplot(data=stacked, x="mid", y=stat, hue="seed", alpha=0.5, ax=axs[i])
             # plot annotations as rugplot
-            if wildcards.annots != "none":
+            if wildcards.annots not in ["none", "all_sites"]:
                 exons = species.get_annotations(wildcards.annots)
                 exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
                 sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}),