Skip to content

Commit

Permalink
Summary stat fix (#120)
Browse files Browse the repository at this point in the history
* edits for DFE-alpha

* updates based on code review

* more updates based on code review

* doc string edit

* skip empty intervals patch

* fixing empty intervals bug

* fixing empty intervals bug

* dropping dfe-alpha (with minimal changes)

* fixes for summary stats and runnign it
  • Loading branch information
silastittes authored Oct 2, 2024
1 parent a20dfd3 commit 73cac2a
Show file tree
Hide file tree
Showing 7 changed files with 1,260 additions and 54 deletions.
1,122 changes: 1,122 additions & 0 deletions notebooks/DFE_figures.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion workflows/config/snakemake/production_config_PhoSin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"seed": 12345
"replicates": 3
"output_dir": "PhoSin_results"
"workflow_path": "workflows/"
"workflow_path": "./workflows/"

# species-specific settings
"species": "PhoSin"
Expand Down
30 changes: 16 additions & 14 deletions workflows/dfe.snake
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ np.random.seed(config["seed"])
replicates = config["replicates"]

#workflow path in case it's not the current directory
workflow_path = config.get("workflow_path", "./")
workflow_path = config.get("workflow_path", "./workflows")

# Where you would like all output files from analysis to live
output_dir = os.path.abspath(config["output_dir"])
Expand Down Expand Up @@ -85,7 +85,7 @@ rule all:
expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf",
demog=demo_model_id_list,
dfes=dfe_list,
annots=annotation_list )
annots=annotation_list)


# ###############################################################################
Expand Down Expand Up @@ -165,7 +165,7 @@ rule dadi_infer_dfe:
opts = 100,
prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma"
resources:
time=4000
time=10080
threads: 8
shell:
"""
Expand Down Expand Up @@ -290,7 +290,7 @@ rule generate_dfe_alpha_fs:
ts_dict[chrm] = fp
index = int(wildcards.ids)
ts2fs.generate_fs(ts_dict, index, mask_file, wildcards.annots,
species, output, format='DFE-alpha', is_folded=True,
species, output, format='DFE-alpha', is_folded=False,
data_path_1=dfe_alpha_data_path_1, data_path_2=dfe_alpha_data_path_2,
sfs_input_file=output[2], est_dfe_results_dir=params.output,
est_dfe_demography_results_file=params.output+"/neu/est_dfe.out")
Expand Down Expand Up @@ -404,13 +404,13 @@ rule plot_results:
annots=annotation_list,
ids=pids,
)),
dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
)),
#dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
# demog=demo_model_id_list,
# seeds=seed_list,
# dfes=dfe_list,
# annots=annotation_list,
# ids=pids,
#)),
grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
Expand Down Expand Up @@ -438,7 +438,7 @@ rule plot_results:

dadi_bestfits = [ b for b in input.dadi_res if wildcards.demog in b]
polydfe_bestfits = [ b for b in input.polydfe_res if wildcards.demog in b]
dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
#dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
grapes_bestfits = [ b for b in input.grapes_res if wildcards.demog in b ]

if wildcards.demog == 'Constant':
Expand All @@ -447,7 +447,7 @@ rule plot_results:

dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ]
polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ]
dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
#dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
grapes_bestfits = [ b for b in grapes_bestfits if 'pop0' in b ]
else:
model = species.get_demographic_model(wildcards.demog)
Expand All @@ -456,4 +456,6 @@ rule plot_results:
mutation_rate = species.genome.mean_mutation_rate

pop_names = [ model.populations[i].name for i in range(len(model.populations)) ]
plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
#plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)

24 changes: 12 additions & 12 deletions workflows/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
"""
dadi_bestfits = _read_bestfits(input[0], len(pop_names))
polydfe_bestfits = _read_bestfits(input[1], len(pop_names))
dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
grapes_bestfits = _read_bestfits(input[3], len(pop_names))
#dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
grapes_bestfits = _read_bestfits(input[2], len(pop_names))

# SLiM assumes genotype fitnesses: 1, 1+sh, 1+s
# dadi and polyDFE assumes genotype fitnesses: 1, 1+2sh, 1+2s
Expand All @@ -273,10 +273,10 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None

# Per https://doi.org/10.1534/genetics.107.080663 under Model
# DFE alpha assumes genotype fitnesses are 1, 1-s/2, 1-s, so like SLiM
dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
for i in range(len(dfe_alpha_bestfits))]
dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
for i in range(len(dfe_alpha_bestfits))]
#dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
# for i in range(len(dfe_alpha_bestfits))]
#dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
# for i in range(len(dfe_alpha_bestfits))]

# Per https://github.com/BioPP/grapes/blob/master/README.md,
# GRAPES implements the model of DFE alpha, so we expect genotype fitnesses 1, 1-s/2, and 1-s.
Expand Down Expand Up @@ -311,12 +311,12 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
plt.subplot(4, 2, 4)
_plot_boxplots(plt, 'polyDFE', polydfe_Es, 0.014, xtick_label, '|E(s)|')

plt.subplot(4, 2, 5)
_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
0.19, xtick_label, 'Shape')
plt.subplot(4, 2, 6)
_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
0.014, xtick_label, '|E(s)|')
#plt.subplot(4, 2, 5)
#_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
# 0.19, xtick_label, 'Shape')
#plt.subplot(4, 2, 6)
#_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
# 0.014, xtick_label, '|E(s)|')

plt.subplot(4, 2, 7)
_plot_boxplots(plt, 'grapes', grapes_shapes, 0.19,
Expand Down
12 changes: 12 additions & 0 deletions workflows/summary_stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#config=workflows/config/snakemake/production_config_HomSap.yml
config=workflows/config/snakemake/production_config_PhoSin.yml

for i in {1..20};
do
snakemake \
--snakefile workflows/summary_stats.snake \
--profile workflows/config/snakemake/oregon_profile/ \
--configfile $config \
--batch all=$i/20 \
--groups run_diploshic_fvs=group0
done
28 changes: 6 additions & 22 deletions workflows/summary_stats.snake
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ output_dir = os.path.abspath(config["output_dir"])
# The analysis species
species = stdpopsim.get_species(config["species"])


# The names of all chromosomes to simulate, separated by commas
# Use "all" to simulate all chromsomes for the genome
chrm_list = [chrom.id for chrom in species.genome.chromosomes]
Expand All @@ -47,22 +46,19 @@ for x in demo_model_array:
else:
model = species.get_demographic_model(x["id"])
demo_sample_size_dict[x["id"]] = {f"{model.populations[i].name}": m for i, m in enumerate(x["num_samples_per_population"])}
demo_pop_ids[x["id"]] = [x.name for x in model.populations]
demo_pop_ids[x["id"]] = [x.name for x in model.populations[:len(x["num_samples_per_population"])]]

# Select DFE model from catalog
dfe_list = config["dfe_list"]
annotation_list = config["annotation_list"]
mask_file = config["mask_file"]



nchunks=100
chunks = np.arange(nchunks)
# ###############################################################################
# GENERAL RULES & GLOBALS
# ###############################################################################


rule all:
input:
expand(
Expand Down Expand Up @@ -92,14 +88,9 @@ rule all:
chrms=chrm_list,
),





rule make_vcf:
input:
output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"

output:
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.vcf"
run:
Expand Down Expand Up @@ -128,10 +119,9 @@ def write_diploshic_ancestralAllelesFile(ts, filename):
rule make_diploshic_inputs:
input:
output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"

output:
temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile"),
temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile")
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile",
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile"
run:
ts = tskit.load(input[0])
write_diploshic_sampleToPopFile(ts, output[0], wildcards.demog)
Expand All @@ -156,22 +146,17 @@ rule run_diploshic_fvs:
input:
rules.make_vcf.output,
rules.make_diploshic_inputs.output

output:
temp(
expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{{chunk}}.diploshic.{ext}",
chunk=chunks,
ext=["fvec", "stats"],
)
)

params:
seq_len = lambda wildcards, input: int(species.get_contig(wildcards.chrms).length),
start = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[0],
end = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[1],
test = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[2],
popid = lambda wildcards, input: wildcards.popid,

shell:
"""
if [[ {params.test} -eq 1 && `diploSHIC fvecVcf haploid {input[0]} 1 {params.seq_len} {output[0]} --sampleToPopFileName {input[1]} --ancFileName {input[2]} --targetPop {params.popid} --statFileName {output[1]} --segmentStart {params.start} --segmentEnd {params.end}` == 0 ]];
Expand All @@ -181,8 +166,7 @@ rule run_diploshic_fvs:
touch {output[0]} && touch {output[1]}
fi
"""



rule gather_diploshic_fvs:
input:
expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{chunk}.diploshic.{ext}",
Expand Down Expand Up @@ -222,7 +206,7 @@ rule plot_pi_individual:
sns.lineplot(x="mid", y="pi", data=df, ax=ax, linewidth=2.5, palette="tab10")

# plot annotations as rugplot
if wildcards.annots != "none":
if wildcards.annots not in ["none", "all_sites"]:
exons = species.get_annotations(wildcards.annots)
exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}), ax=ax, color="g", lw=1, alpha=0.05)
Expand Down Expand Up @@ -266,7 +250,7 @@ rule plot_pi_allseeds:
for i, stat in enumerate(stat_names):
sns.lineplot(data=stacked, x="mid", y=stat, hue="seed", alpha=0.5, ax=axs[i])
# plot annotations as rugplot
if wildcards.annots != "none":
if wildcards.annots not in ["none", "all_sites"]:
exons = species.get_annotations(wildcards.annots)
exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}),
Expand Down
Loading

0 comments on commit 73cac2a

Please sign in to comment.