diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
new file mode 100644
index 0000000..6f78a83
--- /dev/null
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,102 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  schedule:
+    # Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings.
+    #
+    # Note the actual runs might be late.
+    # Numerous people were confused about that, including me:
+    # - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    # - https://github.com/github/docs/issues/3059
+    #
+    # Note, '*' is a special character in YAML, so you have to quote this string.
+    #
+    # Docs:
+    # - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    # - https://crontab.guru/
+    #
+    # Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the East Coast.
+    # We were running into invalid zip archive errors at 9am PDT, so hoping an hour
+    # delay will lower the error frequency
+    - cron: '0 17 * * *'
+
+  workflow_dispatch:
+    inputs:
+      ingest_image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+      phylogenetic_image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+
+jobs:
+  ingest:
+    permissions:
+      id-token: write
+    uses: ./.github/workflows/ingest.yaml
+    secrets: inherit
+    with:
+      image: ${{ inputs.ingest_image }}
+
+  # Check if ingest results include new data by checking for the cache
+  # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3)
+  # GitHub will remove any cache entries that have not been accessed in over 7 days,
+  # so if the workflow has not been run over 7 days then it will trigger phylogenetic.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        env:
+          AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
+        run: |
+          s3_urls=(
+            "s3://nextstrain-data/files/workflows/lassa/metadata_all.tsv.zst"
+            "s3://nextstrain-data/files/workflows/lassa/sequences_all.fasta.zst"
+          )
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            echo "${s3_hash}" | tee -a ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          lookup-only: true  # only test for an existing entry; skip downloading it
+
+  phylogenetic:
+    needs: [check-new-data]
+    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
+    permissions:
+      id-token: write
+    uses: ./.github/workflows/phylogenetic.yaml
+    secrets: inherit
+    with:
+      image: ${{ inputs.phylogenetic_image }}
diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml
new file mode 100644
index 0000000..afde3e3
--- /dev/null
+++ b/.github/workflows/phylogenetic.yaml
@@ -0,0 +1,107 @@
+name: Phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+
+  workflow_dispatch:
+    inputs:
+      image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+        type: string
+      trial_name:
+        description: |
+          Trial name for deploying builds.
+          If not set, builds will overwrite existing builds at s3://nextstrain-data/lassa*
+          If set, builds will be deployed to s3://nextstrain-staging/lassa_trials_<trial_name>_*
+        required: false
+        type: string
+      sequences_url:
+        description: |
+          URL for the sequences.fasta.zst file
+          If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+      metadata_url:
+        description: |
+          URL for the metadata.tsv.zst file
+          If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
+        required: false
+        type: string
+
+jobs:
+  set_config_overrides:
+    runs-on: ubuntu-latest
+    steps:
+      - id: config
+        name: Set config overrides
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+          SEQUENCES_URL: ${{ inputs.sequences_url }}
+          METADATA_URL: ${{ inputs.metadata_url }}
+        run: |
+          config=""
+
+          if [[ "$TRIAL_NAME" ]]; then
+            config+=" deploy_url='s3://nextstrain-staging/lassa_trials_"$TRIAL_NAME"_'"
+          fi
+
+          if [[ "$SEQUENCES_URL" ]]; then
+            config+=" sequences_url='"$SEQUENCES_URL"'"
+          fi
+
+          if [[ "$METADATA_URL" ]]; then
+            config+=" metadata_url='"$METADATA_URL"'"
+          fi
+
+          if [[ $config ]]; then
+            config="--config $config"
+          fi
+
+          echo "config=$config" >> "$GITHUB_OUTPUT"
+    outputs:
+      config_overrides: ${{ steps.config.outputs.config }}
+
+  phylogenetic:
+    needs: [set_config_overrides]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml \
+            $CONFIG_OVERRIDES
+      # Specifying artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/
diff --git a/phylogenetic/build-configs/nextstrain-automation/config.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml
new file mode 100644
index 0000000..e549405
--- /dev/null
+++ b/phylogenetic/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,4 @@
+custom_rules:
+  - build-configs/nextstrain-automation/deploy.smk
+
+deploy_url: "s3://nextstrain-data"
diff --git a/phylogenetic/build-configs/nextstrain-automation/deploy.smk b/phylogenetic/build-configs/nextstrain-automation/deploy.smk
new file mode 100644
index 0000000..f37b788
--- /dev/null
+++ b/phylogenetic/build-configs/nextstrain-automation/deploy.smk
@@ -0,0 +1,15 @@
+"""
+This part of the workflow handles automatic deployments of the lassa build.
+Uploads the build defined as the default output of the workflow through
+the `all` rule from Snakefile
+"""
+
+rule deploy_all:
+    input: *rules.all.input
+    output: touch("results/deploy_all.done")
+    params:
+        deploy_url = config["deploy_url"]
+    shell:
+        """
+        nextstrain remote upload {params.deploy_url} {input}
+        """