Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

various updates to sweep pipeline #116

Merged
merged 5 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions boundary.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ dependencies:
- r-patchwork
- r-scales
- r-ggnewscale
- r-quantreg
- libprotobuf=3.21.12
- pip:
- git+https://github.com/popsim-consortium/stdpopsim.git
- git+https://github.com/popgenmethods/smcpp
- scikit-allel
- git+https://github.com/xin-huang/dadi-cli
- git+https://github.com/kr-colab/diploSHIC.git@refs/pull/56/merge
- diploSHIC
328 changes: 328 additions & 0 deletions pca_sims_training.ipynb

Large diffs are not rendered by default.

11,892 changes: 11,454 additions & 438 deletions power_analysis.ipynb

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions workflows/config/snakemake/oregon_profile_simple/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,19 @@ cluster:
--time={resources.time}
--job-name=smk-{rule}%j
--output=logs/{rule}/{rule}%j.out
--parsable
default-resources:
- time=60
- mem_mb=12000
- threads=1
cluster-status: "status-sacct.sh"
restart-times: 3
max-jobs-per-second: 10
max-status-checks-per-second: 1
local-cores: 1
latency-wait: 60
jobs: 500
keep-going: True
max-jobs-per-second: 1000
max-status-checks-per-second: 1000
jobs: 3500
rerun-incomplete: True
printshellcmds: True
latency-wait: 30
scheduler: greedy
use-conda: True
jobscript: "jobscript-wo-properties.sh"
24 changes: 24 additions & 0 deletions workflows/config/snakemake/oregon_profile_simple/status-sacct.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# Snakemake cluster-status helper: given a Slurm job ID as $1, print one of
# "success", "running", or "failed" based on the job's sacct State.

jobid="$1"

# sbatch without --parsable prints "Submitted batch job <id>", so a literal
# "Submitted" here means the caller captured the wrong token as the job ID.
if [[ "$jobid" == Submitted ]]
then
    echo smk-simple-slurm: Invalid job ID: "$jobid" >&2
    echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2
    exit 1
fi

# First State field sacct reports for this job (e.g. COMPLETED, RUNNING,
# FAILED, CANCELLED by <user>).
state=$(sacct -j "$jobid" --format State --noheader | head -n 1 | awk '{print $1}')

case "$state" in
    COMPLETED*)
        echo success
        ;;
    RUNNING* | PENDING* | COMPLETING* | CONFIGURING* | SUSPENDED*)
        echo running
        ;;
    *)
        # Anything else (FAILED, TIMEOUT, CANCELLED, or an empty/unknown
        # state) is reported to Snakemake as a failure.
        echo failed
        ;;
esac
2 changes: 1 addition & 1 deletion workflows/config/snakemake/sweep_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# General configs
seed: 12345
replicates: 200
replicates: 1_000
output_dir: results

# Contig configs
Expand Down
29 changes: 18 additions & 11 deletions workflows/diploshic.snake
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rule all:
expand("{tDir}/neut_0.fvec", tDir = ["train", "test"]),
expand("trainingSets/{cl}.fvec", cl = ["hard", "linkedHard", "soft", "linkedSoft", "neut"]),
"trained_model.json",
"trained_model.weights.hdf5"
"trained_model.weights.h5"

rule clone_discoal:
output:
Expand All @@ -74,6 +74,7 @@ rule clone_discoal:
cd ext/
git clone https://github.com/kr-colab/discoal.git
cd discoal/
sed -i -e 's/SITES 220020/SITES 1100100/' discoal.h
make discoal
cd ..
"""
Expand All @@ -84,6 +85,7 @@ rule discoal_neutral_simulation:
discoal_exec
output:
"{tDir}/discoal.neut.{i}.out"
resources: time=6000, mem_mb=256000
shell:
neutString + " > {output}"

Expand All @@ -94,6 +96,7 @@ rule discoal_hard_simulation:
discoal_exec
output:
"{tDir}/discoal.hard.{x}.{i}.out"
resources: time=6000, mem_mb=256000
run:
cmd = selStr + " -x "+str(sweep_locs[int(wildcards.x)])+" > {output}"
shell(cmd)
Expand All @@ -104,6 +107,7 @@ rule discoal_soft_simulation:
discoal_exec
output:
"{tDir}/discoal.soft.{x}.{i}.out"
resources: time=6000, mem_mb=256000
run:
cmd = selStr + softStr + " -x "+str(sweep_locs[int(wildcards.x)])+" > {output}"
shell(cmd)
Expand All @@ -126,14 +130,17 @@ rule concat_fvecs:
run:
for t in ["train", "test"]:
cmd = "cat {t}/discoal.neut.0.0.out.fvec > {t}/neut_0.fvec"
shell(cmd)
for ii in range(1, totSimRuns):
cmd += f" && tail -n +2 {t}/discoal.neut.0.{ii}.out.fvec >> {t}/neut_0.fvec"
cmd = f"tail -n +2 {t}/discoal.neut.0.{ii}.out.fvec >> {t}/neut_0.fvec"
shell(cmd)
for model in ["hard", "soft"]:
for xx in range(11):
cmd += f" && cat {t}/discoal.{model}.{xx}.0.out.fvec > {t}/{model}_{xx}.fvec"
for i in range(1,totSimRuns):
cmd += f" && tail -n +2 {t}/discoal.{model}.{xx}.{ii}.out.fvec >> {t}/{model}_{xx}.fvec"
shell(cmd)
cmd = f"cat {t}/discoal.{model}.{xx}.0.out.fvec > {t}/{model}_{xx}.fvec"
shell(cmd)
for ii in range(1,totSimRuns):
cmd = f"tail -n +2 {t}/discoal.{model}.{xx}.{ii}.out.fvec >> {t}/{model}_{xx}.fvec"
shell(cmd)

rule make_training_sets:
input:
Expand All @@ -149,13 +156,13 @@ rule train_classifier:
rules.make_training_sets.output
output:
"trained_model.json",
"trained_model.weights.hdf5"
"trained_model.weights.h5"
run:
# cpu training below
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=100"
shell(cmd)

rule clean:
shell:
"rm -rf test/ train/ trained_model* \
.snakemake slurm*"
#rule clean:
# shell:
# "rm -rf test/ train/ trained_model* \
# .snakemake slurm*"
12 changes: 6 additions & 6 deletions workflows/diploshic_params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
trainSampleNumber: 20 #the number of simulation replicates we want to generate for each file in our training set
testSampleNumber: 20 #the number of simulations to create for each file in the test set
sampleSize: 20 #the number of individuals in our population sample
numSites: 110_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 1e-8 #per-site mutation rate (used to calculate mean theta)
numSites: 1_100_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 2e-8 #per-site mutation rate (used to calculate mean theta)
maxSoftSweepInitFreq: 0.1 #maximum initial selected frequency for soft sweeps
tauHigh: 0.02 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
tauHigh: 0.01 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
rhoOverTheta: 1.0 #crossover rate over mut rate (used to calculate mean rho)

ne0: 10_000
thetaMean: 4*N0*u*numSites
rhoMean: thetaMean * rhoOverTheta

seln_coeff_max: 0.05
seln_coeff_min: 0.001
totSimRuns: 20
seln_coeff_max: 0.10
seln_coeff_min: 0.01
totSimRuns: 100
49 changes: 28 additions & 21 deletions workflows/sweep_simulate.snake
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,12 @@ def write_rec_rates(fname, ratemap, windows):
last = 0
fh = open(fname, 'w')
for start, end in windows:
cum_cM = ratemap.get_cumulative_mass(end)
print(f'{start}\t{end}\t{cum_cM}\t{cum_cM-last}', file=fh)
last = cum_cM
wsize = end - start
start_center = start + (wsize*0.4)
end_center = start + (wsize*0.6)
cM_center = ratemap.get_cumulative_mass(end_center) - ratemap.get_cumulative_mass(start_center)
cM = ratemap.get_cumulative_mass(end) - ratemap.get_cumulative_mass(start)
print(f'{start}\t{end}\t{cM}\t{cM_center}', file=fh)
fh.close()

def write_annot_density(fname, chrom, annot, windows):
Expand Down Expand Up @@ -410,7 +413,11 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
if len(del_intervals) > 0:
tss = tss.delete_intervals(del_intervals)
tss = tss.trim()
tss.write_vcf(fh_vcf, position_transform = lambda x: np.fmax(1, np.round(x)))
# because we are shifting from 0-based to 1-based, I need to remove any sites that may have happened at the last position
sites_at_last = np.where(np.round(tss.sites_position)==config["focal_size"])[0]
assert sites_at_last.shape[0] < 4 # realistically we shouldn't get more than two or three hits there
tss = tss.delete_sites(sites_at_last)
tss.write_vcf(fh_vcf, position_transform = lambda x: 1 + np.round(x))
fh_vcf.close()
# write seqlen of shortened ts
with open(output[2], 'w') as f:
Expand Down Expand Up @@ -535,7 +542,7 @@ shic_outs3 = [file_prefix+".stats.tsv.shic" for file_prefix in sw_outs_prefix_po

rule all:
input:
rules.diploshic_all.input,
rules.diploshic_all.input,
boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs + shic_outs1 + shic_outs2 + shic_outs3
default_target: True

Expand All @@ -556,7 +563,7 @@ rule boundary_sims:
input:
output:
output_dir + "/simulated_data/sweeps/boundary_sims/sim_{seed}_{region_size}.trees"
resources: time=6000, mem_mb=6000
resources: time=60, mem_mb=6000
run:
model = species.get_demographic_model(demo_model["id"])
mut_rate = model.mutation_rate
Expand Down Expand Up @@ -585,7 +592,7 @@ rule neutral:
input:
output:
output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=8000
resources: time=30, mem_mb=7000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
Expand Down Expand Up @@ -617,7 +624,7 @@ rule bgs:
input:
output:
output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=3000
resources: time=30, mem_mb=8000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
Expand Down Expand Up @@ -656,7 +663,7 @@ rule sweep:
input:
output:
output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{{popu}}/{{annot}}/{{dfe}}/{{coeff}}/{{tmult}}/{{seed}}/sim_{{chrom}}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=16000
resources: time=30, mem_mb=12000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
Expand Down Expand Up @@ -751,7 +758,7 @@ rule get_stats:
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"

resources: time=3000, mem_mb=2000
resources: time=30, mem_mb=4000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
dump_results(input, output, params_dict, target_pops, config["num_subwins"])
Expand All @@ -766,11 +773,11 @@ rule diploshic_fvs:

output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.fv'
resources: time=30, mem_mb=1200
resources: time=40, mem_mb=5000
run:
with open(input[0],'r') as f:
seq_len = f.read().strip()
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 2200000 --ancFileName {input[2]}"
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 1100000 --ancFileName {input[2]}"
shell(cmd)


Expand All @@ -780,17 +787,17 @@ rule diploshic_pred:
rules.diploshic_train_classifier.output
output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds'
resources: time=30, mem_mb=1200
resources: time=30, mem_mb=3000
run:
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.hdf5 {input[0]} {output[0]}"
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.h5 {input[0]} {output[0]}"
shell(cmd)


rule add_diploshic_preds_to_stats:
input: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds"
output: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.stats.tsv.shic"
resources: time=30, mem_mb=600
resources: time=30, mem_mb=6000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
target_pops = [wildcards.popu]
Expand All @@ -806,7 +813,7 @@ rule add_diploshic_preds_to_stats:
assert np.all(df.classifiedWinEnd <= ts.metadata['windowing']['window_right'])
assert np.all(df.classifiedWinStart > ts.metadata['windowing']['window_left']+1)
windows = df.iloc[:,[1,2]].to_numpy().astype(np.int32)
stats = 1 - df.iloc[:,5].to_numpy().astype(np.float32) # prob non neutral
stats = (df.iloc[:,8].to_numpy() + df.iloc[:,9].to_numpy()).astype(np.float32) # prob soft+hard
stats = np.expand_dims(stats, axis = 1) # adding another dim going from (n_wins,) to (n_wins,1)
fh = open(output[0], 'w')
_print_stat_line("diploshic", stats, windows, target_pops, meta_str, fh)
Expand All @@ -817,7 +824,7 @@ rule merge_stats:
input: stats_outs
output:
output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
resources: time=1500, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
Expand All @@ -827,7 +834,7 @@ rule merge_stats_shic1:
input: shic_outs1
output:
output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
Expand All @@ -837,7 +844,7 @@ rule merge_stats_shic2:
input: shic_outs2
output:
output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
Expand All @@ -847,7 +854,7 @@ rule merge_stats_shic3:
input: shic_outs3
output:
output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
Expand All @@ -857,7 +864,7 @@ rule merge_stats_shic3:
rule merge_all_stats:
input: [output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv']
output: output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
shell:
"cat {input} > {output}"

Expand Down
Loading