Skip to content

Commit

Permalink
Update filenames for h5n1-cattle-flu segments
Browse files Browse the repository at this point in the history
Part of simplifications described in
<#70>

This approach avoids using an empty wildcard for time but introduces the
need to have a separate rule to map the target Auspice dataset JSON
filename (e.g. `auspice/avian-flu_h5n1-cattle-outbreak_pb2.json`) to a
results file with a time wildcard (e.g.
`results/avian-flu_h5n1-cattle-outbreak_pb2_default.json`). The rest of
the pipeline is unchanged.
  • Loading branch information
jameshadfield committed Jul 25, 2024
1 parent aef6d75 commit beb0275
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 14 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ nextstrain build \
. \
--config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1 \
-pf \
auspice/avian-flu_h5n1-cattle-outbreak_pb2_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_pb1_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_pa_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_ha_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_np_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_na_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_mp_all-time.json \
auspice/avian-flu_h5n1-cattle-outbreak_ns_all-time.json
auspice/avian-flu_h5n1-cattle-outbreak_pb2.json \
auspice/avian-flu_h5n1-cattle-outbreak_pb1.json \
auspice/avian-flu_h5n1-cattle-outbreak_pa.json \
auspice/avian-flu_h5n1-cattle-outbreak_ha.json \
auspice/avian-flu_h5n1-cattle-outbreak_np.json \
auspice/avian-flu_h5n1-cattle-outbreak_na.json \
auspice/avian-flu_h5n1-cattle-outbreak_mp.json \
auspice/avian-flu_h5n1-cattle-outbreak_ns.json
```

## Creating a custom build
Expand Down
51 changes: 47 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def clock_rate(w):
'h5n1': {'all-time':'', '2y': clock_rates_h5n1[w.segment]},
'h7n9': {'all-time':''},
'h9n2': {'all-time':''},
'h5n1-cattle-outbreak': {'all-time': clock_rates_h5n1[w.segment]}
'h5n1-cattle-outbreak': {'default': clock_rates_h5n1[w.segment]}
}

return clock_rate[w.subtype][w.time]
Expand All @@ -132,7 +132,7 @@ def clock_rate_std_dev(w):
'h5n1': {'all-time': '', '2y': '--clock-std-dev 0.00211'},
'h7n9': {'all-time': ''},
'h9n2': {'all-time': ''},
'h5n1-cattle-outbreak': {'all-time': '--clock-std-dev 0.00211'}
'h5n1-cattle-outbreak': {'default': '--clock-std-dev 0.00211'}
}

return clock_rate_std_dev[w.subtype][w.time]
Expand Down Expand Up @@ -401,7 +401,10 @@ def additional_export_config(wildcards):
return args

rule export:
message: "Exporting data files for for auspice"
"""
Export the files into results/ and then use a subsequent rule to move these to the
auspice/ directory
"""
input:
tree = refined_tree,
metadata = metadata_by_wildcards,
Expand All @@ -411,7 +414,7 @@ rule export:
auspice_config = files.auspice_config,
description = files.description
output:
auspice_json = "auspice/avian-flu_{subtype}_{segment}_{time}.json"
auspice_json = "results/avian-flu_{subtype}_{segment}_{time}.json"
params:
additional_config = additional_export_config
shell:
Expand All @@ -429,6 +432,46 @@ rule export:
--output {output.auspice_json}
"""

def auspice_name_to_wildcard_name(wildcards):
    """
    Map a requested Auspice JSON filename to the intermediate results/ filename
    which includes all wildcards (subtype, segment and time).

    `wildcards.parts` is the underscore-separated portion of the requested
    "auspice/avian-flu_{parts}.json" filename.

    Examples:
    1. subtype + segment + time in the filename / URL,
       e.g. "avian-flu_h5n1_ha_2y.json" (nextstrain.org/avian-flu/h5n1/ha/2y)
       maps to subtype=h5n1, segment=ha, time=2y
    2. subtype + segment in the filename / URL,
       e.g. "avian-flu_h5n1-cattle-outbreak_ha.json" (nextstrain.org/avian-flu/h5n1-cattle-outbreak/ha)
       maps to subtype=h5n1-cattle-outbreak, segment=ha, time=default

    Returns the path of the corresponding JSON in results/.

    Raises AssertionError if a three-part name requests the "genome" segment, or
    if a two-part name is requested for a subtype other than h5n1-cattle-outbreak.
    Raises ValueError if the name has any other number of parts.
    """
    parts = wildcards.parts.split("_")

    if len(parts) == 3:
        subtype, segment, time = parts
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if segment == 'genome':
            raise AssertionError("Genome builds are not available for this build")
        return f"results/avian-flu_{subtype}_{segment}_{time}.json"

    if len(parts) == 2:
        subtype, segment = parts
        if subtype != 'h5n1-cattle-outbreak':
            raise AssertionError(
                "Only h5n1-cattle-outbreak builds produce an Auspice dataset "
                "without a time component in the filename"
            )
        # Datasets without a time component use the placeholder time "default".
        return f"results/avian-flu_{subtype}_{segment}_default.json"

    raise ValueError(
        "Auspice JSON filename requested with an unexpected number of "
        f"(underscore-separated) parts: {wildcards.parts!r}"
    )


rule rename_auspice_datasets:
    """
    This allows us to create files in auspice/ which mirror the intended URL structure rather than
    the wildcard structure we use in the workflow.

    The `parts` wildcard captures everything between "avian-flu_" and ".json"; the input
    function `auspice_name_to_wildcard_name` splits it on underscores and resolves the
    matching file in results/.
    """
    input:
        json = auspice_name_to_wildcard_name
    output:
        json = "auspice/avian-flu_{parts}.json"
    # No rule-level wildcard_constraints: `parts` is the only wildcard in the output
    # pattern, and the requested name is validated by the input function.
    shell:
        """
        cp {input.json} {output.json}
        """


rule clean:
message: "Removing directories: {params}"
params:
Expand Down
4 changes: 2 additions & 2 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rule download_tree:
dataset="https://data.nextstrain.org/avian-flu_h5n1-cattle-outbreak_genome.json"
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="all-time",
time="default",
shell:
"""
curl --compressed {params.dataset} -o {output.tree}
Expand All @@ -27,7 +27,7 @@ rule prune_tree:
node_data = "results/tree_{subtype}_{segment}_{time}_outbreak-clade.json",
wildcard_constraints:
subtype="h5n1-cattle-outbreak",
time="all-time",
time="default",
shell:
"""
python3 scripts/restrict-via-common-ancestor.py \
Expand Down

0 comments on commit beb0275

Please sign in to comment.