diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 458b3ef..79f8da1 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,17 +1,17 @@ -# phac-nml/gasclustering: Contributing Guidelines +# phac-nml/fastmatchirida: Contributing Guidelines Hi there! -Many thanks for taking an interest in improving phac-nml/gasclustering. +Many thanks for taking an interest in improving phac-nml/fastmatchirida. -We try to manage the required tasks for phac-nml/gasclustering using GitHub issues, you probably came to this page when creating one. +We try to manage the required tasks for phac-nml/fastmatchirida using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. ## Contribution workflow -If you'd like to write some code for phac-nml/gasclustering, the standard workflow is as follows: +If you'd like to write some code for phac-nml/fastmatchirida, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [phac-nml/gasclustering issues](https://github.com/phac-nml/gasclustering/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this -2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [phac-nml/gasclustering repository](https://github.com/phac-nml/gasclustering) to your GitHub account +1. Check that there isn't already an issue about your idea in the [phac-nml/fastmatchirida issues](https://github.com/phac-nml/fastmatchirida/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [phac-nml/fastmatchirida repository](https://github.com/phac-nml/fastmatchirida) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged @@ -52,11 +52,11 @@ These tests are run both with the latest available version of `Nextflow` and als ## Getting help -For further information/help, please consult the [phac-nml/gasclustering documentation](https://github.com/phac-nml/gasclustering/). +For further information/help, please consult the [phac-nml/fastmatchirida documentation](https://github.com/phac-nml/fastmatchirida/). ## Pipeline contribution conventions -To make the phac-nml/gasclustering code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the phac-nml/fastmatchirida code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. 
### Adding a new step @@ -105,7 +105,7 @@ This repo includes a devcontainer configuration which will create a GitHub Codes To get started: -- Open the repo in [Codespaces](https://github.com/phac-nml/gasclustering/codespaces) +- Open the repo in [Codespaces](https://github.com/phac-nml/fastmatchirida/codespaces) - Tools installed - nf-core - Nextflow diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 16f7977..b8e52bc 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,6 +1,6 @@ name: Bug report description: Report something that is broken or incorrect -labels: bug +labels: [bug] body: - type: markdown attributes: @@ -44,4 +44,4 @@ body: * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - * Version of phac-nml/gasclustering _(eg. 1.1, 1.5, 1.8.2)_ + * Version of phac-nml/fastmatchirida _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index d1d63c5..6132264 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,4 +1,4 @@ contact_links: - name: "GitHub" - url: https://github.com/phac-nml/gasclustering + url: https://github.com/phac-nml/fastmatchirida about: The GitHub page for development. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index ef0f21d..4bfc626 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Suggest an idea for the phac-nml/gasclustering pipeline +description: Suggest an idea for the phac-nml/fastmatchirida pipeline labels: enhancement body: - type: textarea diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index f7a9079..98f3680 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,21 +1,21 @@ ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/phac-nml/gasclustering/tree/main/.github/CONTRIBUTING.md) +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/phac-nml/fastmatchirida/tree/main/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`).
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index f7a3252..3741bc2 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -11,9 +11,9 @@ jobs: steps: # PRs to the phac-nml repo main branch are only ok if coming from the phac-nml repo `dev` or any `patch` branches - name: Check PRs - if: github.repository == 'phac-nml/gasclustering' + if: github.repository == 'phac-nml/fastmatchirida' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == phac-nml/gasclustering ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == phac-nml/fastmatchirida ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5abe80d..55cee69 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: test: name: Run pipeline with test data # Only run on push if this is the phac-nml dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'phac-nml/gasclustering') }}" + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'phac-nml/fastmatchirida') }}" runs-on: ubuntu-latest strategy: matrix: @@ -31,7 +31,7 @@ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 with: version: "${{ matrix.NXF_VER }}" diff --git a/.nf-core.yml b/.nf-core.yml index 8c12aa6..de6f3c8 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,15 +2,15 @@ repository_type: pipeline nf_core_version: "3.0.1" lint: files_exist: - - assets/nf-core-gasclustering_logo_light.png - - docs/images/nf-core-gasclustering_logo_light.png - - docs/images/nf-core-gasclustering_logo_dark.png + - assets/nf-core-fastmatchirida_logo_light.png + - docs/images/nf-core-fastmatchirida_logo_light.png + - docs/images/nf-core-fastmatchirida_logo_dark.png - .github/workflows/awstest.yml - .github/workflows/awsfulltest.yml - lib/Utils.groovy - lib/WorkflowMain.groovy - lib/NfcoreTemplate.groovy - - lib/WorkflowGasclustering.groovy + - lib/Workflowfastmatchirida.groovy files_unchanged: - assets/sendmail_template.txt - assets/email_template.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 30cd866..720605b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,54 +1,11 @@ -# phac-nml/gasclustering: Changelog +# phac-nml/fastmatchirida: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.4.0] - 2024-11-07 +## [0.1.0] - 2024-12-13 -- Added the ability to include a `sample_name` column in the input samplesheet.csv. Allows for compatibility with IRIDA-Next input configuration. +fastmatchirida is built using gasclustering [0.4.0] as a template. This release sets up the basic functionality of taking a query set and a reference set of samples and reporting the distances between query and reference samples that fall at or below a user-set threshold.
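In practice the new matching step reduces to a row filter over a long-format distance table: every query-reference pair whose distance is at or below `params.threshold` is reported (the `<=` comparison mirrors `bin/process_output.py`, added later in this diff). A minimal Groovy sketch of those semantics, with illustrative values only:

```groovy
// Illustrative long-format distance rows: [query, reference, distance],
// the shape the pipeline's distance outputs take after this change.
def distances = [
    ['sample1', 'sample3', 0],
    ['sample1', 'sample4', 12],
    ['sample2', 'sample3', 3],
    ['sample2', 'sample4', 25],
]

def threshold = 10.0  // params.threshold: pairs at or below this value are kept
def matches = distances.findAll { it[2] <= threshold }
assert matches.collect { it[2] } == [0, 3]  // the 12 and 25 pairs are dropped
```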
- - `sample_name` special characters (non-alphanumeric with exception of "_" and ".") will be replaced with `"_"` - - If no `sample_name` is supplied in the column `sample` will be used - - To avoid repeat values for `sample_name` all `sample_name` values will be suffixed with the unique `sample` value from the input file - - - Fixed linting issues in CI caused by nf-core 3.0.1 - -## [0.3.0] - 2024-09-10 - -### Changed - -- Upgraded `profile_dist` container to version `1.0.2` -- Upgraded `locidex/merge` to version `0.2.3` and updated `input_assure` and test data for compatibility with the new `mlst.json` allele file format. - - [PR28](https://github.com/phac-nml/gasclustering/pull/28) -- Aligned container registry handling in configuration files and modules with `phac-nml/pipeline-standards` - - [PR28](https://github.com/phac-nml/gasclustering/pull/28) - -This pipeline is now compatible only with output generated by [Locidex v0.2.3+](https://github.com/phac-nml/locidex) and [Mikrokondo v0.4.0+](https://github.com/phac-nml/mikrokondo/releases/tag/v0.4.0). - -## [0.2.0] - 2024-06-26 - -### Added - -- Support for mismatched IDs between the samplesheet ID and the ID listed in the corresponding allele file. - -### Changed - -- Updated ArborView to v0.0.7-rc1. - -### Fixed - -- The scaled distance thresholds provided when using `--pd_distm scaled` and `--gm_thresholds` are now correctly understood as percentages in the range [0.0, 100.0]. - -## [0.1.0] - 2024-05-28 - -Initial release of the Genomic Address Service Clustering pipeline to be used for distance-based clustering of cg/wgMLST data. - -### `Added` - -- Input of cg/wgMLST allele calls produced from [locidex](https://github.com/phac-nml/locidex). -- Output of a dendrogram, cluster codes, and visualization using [profile_dists](https://github.com/phac-nml/profile_dists), [gas mcluster](https://github.com/phac-nml/genomic_address_service), and ArborView. - -[0.1.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.1.0 -[0.2.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.2.0 -[0.3.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.3.0 [0.4.0]: https://github.com/phac-nml/gasclustering/releases/tag/0.4.0 +[0.1.0]: https://github.com/phac-nml/fastmatchirida/releases/tag/0.1.0 diff --git a/CITATIONS.md b/CITATIONS.md index 2e80161..19de5b2 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,4 +1,4 @@ -# phac-nml/gasclustering: Citations +# phac-nml/fastmatchirida: Citations ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) @@ -22,7 +22,7 @@ > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Genomic Address Service: Convenient package for de novo clustering and sample assignment to existing clusters. 2023. 
https://github.com/phac-nml/genomic_address_service -- [ArborView (included in repository)](https://github.com/phac-nml/gasclustering/blob/dev/assets/ArborView.html) (in-development, citation subject to change) +- [ArborView (included in repository)](https://github.com/phac-nml/fastmatchirida/blob/dev/assets/ArborView.html) (in-development, citation subject to change) ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 6b09c9f..05af6a6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/) -# Genomic Address Service Clustering Workflow +# FastMatch IRIDA Workflow This workflow takes provided JSON-formatted MLST profiles and converts them into a phylogenetic tree with associated flat cluster codes for use in [Irida Next](https://github.com/phac-nml/irida-next). The workflow also generates an interactive tree for visualization. @@ -18,7 +18,7 @@ The structure of this file is defined in [assets/schema_input.json](assets/schem ## IRIDA-Next Optional Input Configuration -`gasclustering` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name` +`fastmatchirida` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which can contain an additional column: `sample_name` `sample_name`: An **optional** column, that overrides `sample` for outputs (filenames and sample names) and reference assembly identification. @@ -87,7 +87,7 @@ Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](n To run the pipeline, please do: ```bash -nextflow run phac-nml/gasclustering -profile singularity -r main -latest --input https://github.com/phac-nml/gasclustering/raw/dev/assets/samplesheet.csv --outdir results +nextflow run phac-nml/fastmatchirida -profile singularity -r main -latest --input https://github.com/phac-nml/fastmatchirida/raw/dev/assets/samplesheet.csv --outdir results ``` Where the `samplesheet.csv` is structured as specified in the [Input](#input) section. @@ -157,7 +157,7 @@ Details on the individual output files can be found in the [Output documentation To run with the test profile, please do: ```bash -nextflow run phac-nml/gasclustering -profile docker,test -r main -latest --outdir results +nextflow run phac-nml/fastmatchirida -profile docker,test -r main -latest --outdir results ``` # Legal diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json index fee2b53..f7b799f 100644 --- a/assets/adaptivecard.json +++ b/assets/adaptivecard.json @@ -17,7 +17,7 @@ "size": "Large", "weight": "Bolder", "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", - "text": "phac-nml/gasclustering v${version} - ${runName}", + "text": "phac-nml/fastmatchirida v${version} - ${runName}", "wrap": true }, { diff --git a/assets/email_template.html b/assets/email_template.html index fa4448f..0847779 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,21 +4,21 @@ - - phac-nml/gasclustering Pipeline Report + + phac-nml/fastmatchirida Pipeline Report
-            <h1>phac-nml/gasclustering v${version}</h1>
+            <h1>phac-nml/fastmatchirida v${version}</h1>
             <h2>Run Name: $runName</h2>

             <% if (!success){ out << """
-                <h4>phac-nml/gasclustering execution completed unsuccessfully!</h4>
+                <h4>phac-nml/fastmatchirida execution completed unsuccessfully!</h4>
                 <p>The exit status of the task that caused the workflow execution to fail was: <code>$exitStatus</code>.</p>
                 <p>The full error message was:</p>
                 <pre>${errorReport}</pre>
@@ -27,7 +27,7 @@ phac-nml/gasclustering execution compl
 } else { out << """
-                phac-nml/gasclustering execution completed successfully!
+                phac-nml/fastmatchirida execution completed successfully!
 """ }
@@ -44,8 +44,8 @@
             <h3>Pipeline Configuration:</h3>

-            <p>phac-nml/gasclustering</p>
-            <p><a href="https://github.com/phac-nml/gasclustering">https://github.com/phac-nml/gasclustering</a></p>
+            <p>phac-nml/fastmatchirida</p>
+            <p><a href="https://github.com/phac-nml/fastmatchirida">https://github.com/phac-nml/fastmatchirida</a></p>
diff --git a/assets/email_template.txt b/assets/email_template.txt index 3b723ce..48c0b7d 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -4,15 +4,15 @@ |\\ | |__ __ / ` / \\ |__) |__ } { | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, `._,._,' - phac-nml/gasclustering v${version} + phac-nml/fastmatchirida v${version} ---------------------------------------------------- Run Name: $runName <% if (success){ - out << "## phac-nml/gasclustering execution completed successfully! ##" + out << "## phac-nml/fastmatchirida execution completed successfully! ##" } else { out << """#################################################### -## phac-nml/gasclustering execution completed unsuccessfully! ## +## phac-nml/fastmatchirida execution completed unsuccessfully! ## #################################################### The exit status of the task that caused the workflow execution to fail was: $exitStatus. The full error message was: @@ -35,5 +35,5 @@ Pipeline Configuration: <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> -- -phac-nml/gasclustering -https://github.com/phac-nml/gasclustering +phac-nml/fastmatchirida +https://github.com/phac-nml/fastmatchirida diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 629ee0f..d92ca59 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -1,11 +1,11 @@ id: "nf-core-iridanext-methods-description" description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." -section_name: "phac-nml/gasclustering Methods Description" -section_href: "https://github.com/phac-nml/gasclustering" +section_name: "phac-nml/fastmatchirida Methods Description" +section_href: "https://github.com/phac-nml/fastmatchirida" plot_type: "html" data: |

      <h4>Methods</h4>
-      <p>Data was processed using phac-nml/gasclustering v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>
+      <p>Data was processed using phac-nml/fastmatchirida v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>
      <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
      <pre><code>${workflow.commandLine}</code></pre>
      <p>${tool_citations}</p>
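The email and methods-description templates above are rendered by Groovy's template engine: `${...}` placeholders are substituted from a binding, and `<% %>` scriptlets let the template branch on values such as `success`. A minimal, self-contained sketch using `groovy.text.SimpleTemplateEngine` (the class the pipeline's `lib/` code already imports); the mini-template and binding values here are illustrative, not the real report:

```groovy
import groovy.text.SimpleTemplateEngine

// Single-quoted strings keep ${...} literal so the engine (not Groovy) expands them.
def template = 'phac-nml/fastmatchirida v${version} - ${runName}\n' +
        '<% if (!success){ out << "Execution completed unsuccessfully (exit ${exitStatus})." ' +
        '} else { out << "Execution completed successfully!" } %>'

def binding = [version: '0.1.0', runName: 'example_run', success: false, exitStatus: 1]
def rendered = new SimpleTemplateEngine().createTemplate(template).make(binding).toString()
println rendered  // prints the failure branch with version, run name and exit status filled in
```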

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 2a36971..623bec7 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,13 +1,13 @@ report_comment: > - This report has been generated by the <a href="https://github.com/phac-nml/gasclustering" target="_blank">phac-nml/gasclustering</a> + This report has been generated by the <a href="https://github.com/phac-nml/fastmatchirida" target="_blank">phac-nml/fastmatchirida</a> analysis pipeline. For information about how to interpret these results, please see the - <a href="https://github.com/phac-nml/gasclustering" target="_blank">documentation</a>. + <a href="https://github.com/phac-nml/fastmatchirida" target="_blank">documentation</a>. report_section_order: - "phac-nml-gasclustering-methods-description": + "phac-nml-fastmatchirida-methods-description": order: -1000 software_versions: order: -1001 - "phac-nml-gasclustering-summary": + "phac-nml-fastmatchirida-summary": order: -1002 export_plots: true diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 82842ce..410a602 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,mlst_alleles -sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json -sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json -sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json +sample,fastmatch_category,mlst_alleles +sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json +sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json +sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json diff --git a/assets/schema_input.json b/assets/schema_input.json index c8d7c99..651d179 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/phac-nml/gasclustering/main/assets/schema_input.json", - "title": "phac-nml/gasclustering pipeline - params.input schema", + "$id": "https://raw.githubusercontent.com/phac-nml/fastmatchirida/main/assets/schema_input.json", + "title": "phac-nml/fastmatchirida pipeline - params.input schema", "description": "Schema for the file provided with params.input", "type": "array", "items": { @@ -25,6 +25,13 @@ "pattern": "^\\S+\\.mlst(\\.subtyping)?\\.json(\\.gz)?$", "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json', '.mlst.json.gz', '.mlst.subtyping.json', or 'mlst.subtyping.json.gz'" }, + "fastmatch_category": { + "type": "string", + "errorMessage": "Must be either 'query' or 'reference'", + "description": "Identifies whether a sample is a query or a reference", + "fa_icon": "far fa-sticky-note", + "enum": ["query", "reference"] + }, "metadata_1": { "type": "string", "meta": ["metadata_1"], diff --git a/assets/slackreport.json b/assets/slackreport.json index 09f8325..1f51c8b 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "phac-nml/gasclustering v${version} - ${runName}", + "author_name": "phac-nml/fastmatchirida v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/bin/process_output.py b/bin/process_output.py new file mode 100755 index
0000000..7ca16a9 --- /dev/null +++ b/bin/process_output.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +from pathlib import Path +from mimetypes import guess_type +from functools import partial +import gzip +import sys +import argparse +import pandas as pd + + +def get_open(f): + if "gzip" == guess_type(str(f))[1]: + return partial(gzip.open) + else: + return open + +def main(argv=None): + + parser = argparse.ArgumentParser( + description="Parses profile_dists distances to create query-reference-format output for the FastMatch pipeline.", + epilog="Example: python process_output.py --input distances.tsv --output results.tsv --threshold 10", + ) + + parser.add_argument( + "--input", + action="store", + dest="input", + type=str, + help="profile_dists-generated distance matrix", + default=None, + required=True, + ) + + parser.add_argument( + "--threshold", + action="store", + dest="threshold", + type=float, + help="distance threshold to be included in output", + default=None, + required=True, + ) + + parser.add_argument( + "--output", + action="store", + dest="output", + type=str, + help="output prefix (without extension)", + default=None, + required=True, + ) + + args = parser.parse_args(argv) + + input = Path(args.input) + tsv_path = Path(args.output + ".tsv") + excel_path = Path(args.output + ".xlsx") + threshold = args.threshold + + data = pd.read_csv(input, sep="\t") + data = data[data['Distance'] <= threshold] + data.to_csv(tsv_path, sep="\t", index=False) + data.to_excel(excel_path, index=False) + + print("Output written to:") + print(tsv_path) + print(excel_path) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index a8639a2..24b732f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - phac-nml/gasclustering Nextflow base config file + phac-nml/fastmatchirida Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on diff --git a/conf/iridanext.config b/conf/iridanext.config index d5d6156..7583a2d 100644 --- a/conf/iridanext.config +++ b/conf/iridanext.config @@ -7,17 +7,14 @@ iridanext { files { idkey = "irida_id" global = [ - "**/ArborView/arborview.clustered_data_arborview.html", - "**/clusters/gas.mcluster.clusters.text", - "**/clusters/gas.mcluster.run.json", - "**/clusters/gas.mcluster.thresholds.json", - "**/clusters/gas.mcluster.tree.nwk", "**/distances/profile_dists.allele_map.json", "**/distances/profile_dists.query_profile.text", "**/distances/profile_dists.ref_profile.text", "**/distances/profile_dists.results.text", "**/distances/profile_dists.run.json", "**/merged/locidex.merge.profile.tsv", + "**/process/results.tsv", + "**/process/results.xlsx" ] } } diff --git a/conf/modules.config b/conf/modules.config index 9e7bba6..7c4ceac 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -16,11 +16,8 @@ process { cluster_directory_name = "clusters" merged_profiles_directory_name = "merged" profile_dists_directory_name = "distances" - arbor_view_directory_name = "ArborView" - cluster_prefix = 'gas.mcluster.' merged_prefix = 'locidex.merge.' profile_dists_prefix = 'profile_dists.' - arbor_view_prefix = 'arborview.'
publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -32,15 +29,6 @@ process { fair = true } - withName: GAS_MCLUSTER { - publishDir = [ - path: { ["${params.outdir}", "${task.cluster_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : - filename.contains(File.separator) ? task.cluster_prefix + filename.split(File.separator)[-1] : task.cluster_prefix + filename } - ] - } - withName: LOCIDEX_MERGE { publishDir = [ path: { ["${params.outdir}", "${task.merged_profiles_directory_name}"].join(File.separator) }, @@ -59,15 +47,6 @@ ] } - withName: ARBOR_VIEW{ - publishDir = [ - path: { ["${params.outdir}", "${task.arbor_view_directory_name}"].join(File.separator) }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : - filename.contains(File.separator) ? task.arbor_view_prefix + filename.split(File.separator)[-1] : task.arbor_view_prefix + filename } - ] - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index b0679a2..41d2893 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run phac-nml/gasclustering -profile test,<docker/singularity> --outdir <OUTDIR> + nextflow run phac-nml/fastmatchirida -profile test,<docker/singularity> --outdir <OUTDIR> ---------------------------------------------------------------------------------------- */ diff --git a/conf/test_full.config b/conf/test_full.config index 592513b..e9362a6 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a full size pipeline test. Use as follows: - nextflow run phac-nml/gasclustering -profile test_full,<docker/singularity> --outdir <OUTDIR> + nextflow run phac-nml/fastmatchirida -profile test_full,<docker/singularity> --outdir <OUTDIR> ---------------------------------------------------------------------------------------- */ @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/phac-nml/gasclustering/main/assets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/phac-nml/fastmatchirida/main/assets/samplesheet.csv' } diff --git a/docs/README.md b/docs/README.md index 5697b54..0e2f97f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -# phac-nml/gasclustering: Documentation +# phac-nml/fastmatchirida: Documentation -The phac-nml/gasclustering documentation is split into the following pages: +The phac-nml/fastmatchirida documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags.
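A note on the `saveAs` pattern retained in `conf/modules.config` above: each closure suppresses `versions.yml` and flattens any subdirectory in the published filename while prepending a fixed prefix. The behaviour can be sketched as plain Groovy (prefix value taken from the config; the example filenames are hypothetical):

```groovy
// Sketch of the saveAs logic used by LOCIDEX_MERGE and PROFILE_DISTS.
def merged_prefix = 'locidex.merge.'

def saveAs = { filename ->
    filename.equals('versions.yml') ? null :                        // never published
        filename.contains(File.separator) ?
            merged_prefix + filename.split(File.separator)[-1] :    // flatten subdirectories
            merged_prefix + filename
}

assert saveAs('versions.yml') == null
assert saveAs('merged/profile_query.tsv') == 'locidex.merge.profile_query.tsv'
assert saveAs('profile_ref.tsv') == 'locidex.merge.profile_ref.tsv'
```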
diff --git a/docs/output.md b/docs/output.md index f4b8572..b700d03 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,4 +1,4 @@ -# phac-nml/gasclustering: Output +# phac-nml/fastmatchirida: Output ## Introduction diff --git a/docs/usage.md b/docs/usage.md index 30a287f..208ab59 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,4 +1,4 @@ -# phac-nml/gasclustering: Usage +# phac-nml/fastmatchirida: Usage ## Introduction @@ -35,7 +35,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p ### IRIDA-Next Optional Samplesheet Configuration -`gasclustering` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `mlst_alleles`, `metadata_1`, `metadata_2`, ..., `metadata_8`. The `sample` IDs within a samplesheet should be unique. All other columns outside of the listed above will be ignored. +`fastmatchirida` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `fastmatch_category`, `mlst_alleles`, `metadata_1`, `metadata_2`, ..., `metadata_8`. The `sample` IDs within a samplesheet should be unique. All other columns outside of those listed above will be ignored. A final samplesheet file consisting of both single- and paired-end data may look something like the one below. @@ -86,7 +86,7 @@ Do not use `-c <file>` to specify parameters as this will result in errors. Cust The above pipeline run specified with a params file in yaml format: ```bash -nextflow run phac-nml/gasclustering -profile docker -params-file params.yaml +nextflow run phac-nml/fastmatchirida -profile docker -params-file params.yaml ``` with `params.yaml` containing: @@ -103,7 +103,7 @@ You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-c It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [phac-nml/gasclustering page](https://github.com/phac-nml/gasclustering) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. +First, go to the [phac-nml/fastmatchirida page](https://github.com/phac-nml/fastmatchirida) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. @@ -162,7 +162,7 @@ Specify the path to a specific config file (this is a core Nextflow command). Se ### Resource requests -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time.
For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/phac-nml/gasclustering/blob/main/conf/base.config#L17) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/phac-nml/fastmatchirida/blob/main/conf/base.config#L17) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index e7d4c6f..c90d422 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -1,5 +1,5 @@ // -// This file holds several functions specific to the main.nf workflow in the phac-nml/gasclustering pipeline +// This file holds several functions specific to the main.nf workflow in the phac-nml/fastmatchirida pipeline // import nextflow.Nextflow diff --git a/lib/WorkflowGasclustering.groovy b/lib/Workflowfastmatchirida.groovy similarity index 93% rename from lib/WorkflowGasclustering.groovy rename to lib/Workflowfastmatchirida.groovy index e3ffa5d..2a1bc47 100755 --- a/lib/WorkflowGasclustering.groovy +++ b/lib/Workflowfastmatchirida.groovy @@ -1,11 +1,11 @@ // -// This file holds several functions specific to the workflow/gasclustering.nf in the phac-nml/gasclustering pipeline +// This file holds several functions specific to the workflow/fastmatchirida.nf in the phac-nml/fastmatchirida pipeline // import nextflow.Nextflow import groovy.text.SimpleTemplateEngine -class WorkflowGasclustering { +class Workflowfastmatchirida { // // Check and validate parameters diff --git a/main.nf b/main.nf index 0f63054..5fc28ee 100644 --- a/main.nf +++ b/main.nf @@ -1,9 +1,9 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - phac-nml/gasclustering + phac-nml/fastmatchirida ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/phac-nml/gasclustering + Github : https://github.com/phac-nml/fastmatchirida ---------------------------------------------------------------------------------------- */ @@ -39,13 +39,13 @@ WorkflowMain.initialise(workflow, params, log, args) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { GASCLUSTERING } from './workflows/gasclustering' +include { FASTMATCH } from './workflows/fastmatchirida' // -// WORKFLOW: Run main phac-nml/gasclustering analysis pipeline +// WORKFLOW: Run main phac-nml/fastmatchirida analysis pipeline // -workflow PHACNML_GASCLUSTERING { - GASCLUSTERING () +workflow PHACNML_FASTMATCH { + FASTMATCH () } /* @@ -59,7 +59,7 @@ workflow PHACNML_GASCLUSTERING { // See: 
https://github.com/nf-core/rnaseq/issues/619 // workflow { - PHACNML_GASCLUSTERING () + PHACNML_FASTMATCH () } /* diff --git a/modules.json b/modules.json index ef9329e..f23cb49 100644 --- a/modules.json +++ b/modules.json @@ -1,6 +1,6 @@ { - "name": "phac-nml/gasclustering", - "homePage": "https://github.com/phac-nml/gasclustering", + "name": "phac-nml/fastmatchirida", + "homePage": "https://github.com/phac-nml/fastmatchirida", "repos": { "https://github.com/nf-core/modules.git": { "modules": { diff --git a/modules/local/append_metadata/main.nf b/modules/local/append_metadata/main.nf new file mode 100644 index 0000000..a871e6f --- /dev/null +++ b/modules/local/append_metadata/main.nf @@ -0,0 +1,61 @@ +process APPEND_METADATA { + tag "Appends metadata to distances" + label 'process_single' + + input: + val distances_path // distance data as a TSV path + // this needs to be "val", because "path" + // won't stage the file correctly for exec + val metadata_rows // metadata rows (no headers) to be appended, list of lists + val metadata_headers // headers to name the metadata columns + + output: + path("distances_and_metadata.tsv"), emit: distances + + exec: + def distances_rows // has a header row + def metadata_rows_map = [:] + def sample_names_map = [:] // maps sample names to Irida IDs + def merged = [] + + distances_path.withReader { reader -> + distances_rows = reader.readLines()*.split('\t') + } + + // Create a map of the metadata rows: + // Start on i = 0 because there are no headers included. + for(int i = 0; i < metadata_rows.size(); i++) + { + // "sample" -> ["sample", meta1, meta2, meta3, ...] + sample_name = metadata_rows[i][0] + metadata_rows_map[sample_name] = metadata_rows[i] + + // "sample" -> "Irida ID" + sample_names_map[sample_name] = metadata_rows[i][1] + } + + // Create the header row: + merged.add(["Query ID", "Query Sample Name", "Reference ID", "Reference Sample Name", "Distance"] + + metadata_headers[1..-1]) + + // Merge the remaining rows in original order: + // Start on i = 1 because we don't want the headers.
+    for(int i = 1; i < distances_rows.size(); i++) + { + query_sample_name = distances_rows[i][0] + query_irida_id = sample_names_map[query_sample_name] + reference_sample_name = distances_rows[i][1] + reference_irida_id = sample_names_map[reference_sample_name] + distance = distances_rows[i][2] + + merged_row = [query_irida_id, query_sample_name, reference_irida_id, reference_sample_name, distance] + +            metadata_rows_map[reference_sample_name][2..-1] + + merged.add(merged_row) + } + + task.workDir.resolve("distances_and_metadata.tsv").withWriter { writer -> + merged.each { writer.writeLine it.join("\t") } + } + +} diff --git a/modules/local/appendmetadata/main.nf b/modules/local/appendmetadata/main.nf deleted file mode 100644 index aa88a6c..0000000 --- a/modules/local/appendmetadata/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process APPEND_METADATA { - tag "append_metadata" - label 'process_single' - - input: - val clusters_path // cluster data as a TSV path - // this needs to be "val", because "path" - // won't stage the file correctly for exec - val metadata_rows // metadata rows (no headers) to be appened, list of lists - val metadata_headers // headers to name the metadata columns - - output: - path("clusters_and_metadata.tsv"), emit: clusters - - exec: - def clusters_rows // has a header row - def clusters_rows_map = [:] - def metadata_rows_map = [:] - def merged = [] - - clusters_path.withReader { reader -> - clusters_rows = reader.readLines()*.split('\t') - } - - // Create a map of the cluster rows: - // Start on i = 1 because we don't want the headers. - for(int i = 1; i < clusters_rows.size(); i++) - { - // "sample" -> ["sample", 1, 2, 3, ...] - clusters_rows_map[clusters_rows[i][0]] = clusters_rows[i] - } - - // Create a map of the metadata rows: - // Start on i = 0 because there are no headers included. - for(int i = 0; i < metadata_rows.size(); i++) - { - // "sample" -> ["sample", meta1, meta2, meta3, ...] - metadata_rows_map[metadata_rows[i][0]] = metadata_rows[i] - } - - // Merge the headers - merged.add(clusters_rows[0] + metadata_headers) - - // Merge the remain rows in original order: - // Start on i = 1 because we don't want the headers. - for(int i = 1; i < clusters_rows.size(); i++) - { - def sample_key = clusters_rows[i][0] - merged.add(clusters_rows_map[sample_key] + metadata_rows_map[sample_key][1..-1]) - } - - task.workDir.resolve("clusters_and_metadata.tsv").withWriter { writer -> - merged.each { writer.writeLine it.join("\t") } - } - -} diff --git a/modules/local/arborview.nf b/modules/local/arborview.nf deleted file mode 100644 index 407468a..0000000 --- a/modules/local/arborview.nf +++ /dev/null @@ -1,32 +0,0 @@ -// Inline output tree into ArborView.html -// TODO include versions for python and arbor view when available - - - -process ARBOR_VIEW { - label "process_low" - tag "Inlining Tree Data" - stageInMode 'copy' // Need to copy in arbor view html - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/python%3A3.12' : - 'biocontainers/python:3.12' }" - - input: - tuple path(tree), path(contextual_data) - path(arbor_view) // need to make sure this is copied - - - output: - path(output_value), emit: html - - - script: - output_value = "clustered_data_arborview.html" - """ - inline_arborview.py -d ${contextual_data} -n ${tree} -o ${output_value} -t ${arbor_view} - """ - - - -} diff --git a/modules/local/gas/mcluster/main.nf b/modules/local/gas/mcluster/main.nf deleted file mode 100644 index 7d832bb..0000000 --- a/modules/local/gas/mcluster/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -// Denovo clustering module for GAS - -process GAS_MCLUSTER{ - label "process_high" - tag "Denovo Clustering" - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/genomic_address_service%3A0.1.1--pyh7cba7a3_1' : - 'biocontainers/genomic_address_service:0.1.1--pyh7cba7a3_1' }" - - input: - path(dist_matrix) - - output: - path("${prefix}/distances.{text,parquet}"), emit: distances, optional: true - path("${prefix}/thresholds.json"), emit: thresholds - path("${prefix}/clusters.{text,parquet}"), emit: clusters - path("${prefix}/tree.nwk"), emit: tree - path("${prefix}/run.json"), emit: run - path "versions.yml", emit: versions - - script: - prefix = "clusters" - """ - gas mcluster --matrix $dist_matrix \\ - --outdir $prefix \\ - --method '${params.gm_method}' \\ - --threshold ${params.gm_thresholds} \\ - --delimeter '${params.gm_delimiter}' - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - genomic_address_service: \$( gas mcluster -V | sed -e "s/gas//g" ) - END_VERSIONS - """ - -} diff --git a/modules/local/locidex/merge/main.nf b/modules/local/locidex/merge/main.nf index 103c24f..59b52a4 100644 --- a/modules/local/locidex/merge/main.nf +++ b/modules/local/locidex/merge/main.nf @@ -11,15 +11,17 @@ process LOCIDEX_MERGE { input: path input_values // [file(sample1), file(sample2), file(sample3), etc...] + val combined_dir + val query_ref output: path("${combined_dir}/*.tsv"), emit: combined_profiles path "versions.yml", emit: versions script: - combined_dir = "merged" """ locidex merge -i ${input_values.join(' ')} -o ${combined_dir} + mv ${combined_dir}/profile.tsv ${combined_dir}/profile_${query_ref}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": locidex merge: \$(echo \$(locidex search -V 2>&1) | sed 's/^.*locidex //' ) diff --git a/modules/local/process_output/main.nf b/modules/local/process_output/main.nf new file mode 100644 index 0000000..3b8ec20 --- /dev/null +++ b/modules/local/process_output/main.nf @@ -0,0 +1,34 @@ +process PROCESS_OUTPUT { + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/staramr:0.10.0--pyhdfd78af_0': + 'biocontainers/staramr:0.10.0--pyhdfd78af_0' }" + + input: + path distances + val threshold + + output: + path "results.tsv", emit: tsv + path "results.xlsx", emit: excel + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + process_output.py \\ + $args \\ + --input $distances \\ + --output results \\ + --threshold $threshold + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + process_output: 0.1.0 + END_VERSIONS + """ +} diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf index d316e78..7eae4b7 100644 --- a/modules/local/profile_dists/main.nf +++ b/modules/local/profile_dists/main.nf @@ -9,6 +9,7 @@ process PROFILE_DISTS{ input: path query + path reference val mapping_format path mapping_file path columns @@ -40,7 +41,7 @@ } prefix = "distances" """ - profile_dists --query $query --ref $query $args --outfmt $mapping_format \\ + profile_dists --query $query --ref $reference $args --outfmt $mapping_format \\ --force \\ --distm $params.pd_distm \\ --file_type $params.pd_file_type \\ diff --git a/nextflow.config b/nextflow.config index 64263e8..e47fd88 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - phac-nml/gasclustering Nextflow config file + phac-nml/fastmatchirida Nextflow config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments ---------------------------------------------------------------------------------------- */ @@ -43,8 +43,10 @@ params { validationShowHiddenParams = false validate_params = true + // FastMatch + threshold = 1.0 + // Profile dists args - pd_outfmt = "matrix" pd_distm = "hamming" pd_missing_threshold = 1.0 pd_sample_quality_threshold = 1.0 @@ -54,11 +56,6 @@ params { pd_columns = null pd_count_missing = false - // GAS Cluster - gm_thresholds = "10,5,0" - gm_method = "average" - gm_delimiter = "."
- // Metadata metadata_1_header = "metadata_1" metadata_2_header = "metadata_2" @@ -223,13 +220,13 @@ dag { } manifest { - name = 'phac-nml/gasclustering' - author = """Aaron Petkau and Eric Marinier and Matthew Wells""" - homePage = 'https://github.com/phac-nml/gasclustering' + name = 'phac-nml/fastmatchirida' + author = """Aaron Petkau, Eric Marinier, Matthew Wells and Steven Sutcliffe""" + homePage = 'https://github.com/phac-nml/fastmatchirida' description = """IRIDA Next Genomic Address Service Clustering Pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.4.0' + version = '0.1.0' doi = '' defaultBranch = 'main' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 4e4fab0..8db48af 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/phac-nml/gasclustering/main/nextflow_schema.json", - "title": "phac-nml/gasclustering pipeline parameters", + "$id": "https://raw.githubusercontent.com/phac-nml/fastmatchirida/main/nextflow_schema.json", + "title": "phac-nml/fastmatchirida pipeline parameters", "description": "IRIDA Next Example Pipeline", "type": "object", "definitions": { @@ -104,19 +104,26 @@ }, "fa_icon": "far fa-clipboard" }, + "fastmatch": { + "title": "FastMatch", + "type": "object", + "description": "Parameters for FastMatch", + "default": "", + "properties": { + "threshold": { + "type": "number", + "description": "Distance threshold; query-reference pairs with a distance at or below this value are reported", + "default": 1.0, + "minimum": 0 + } + } + }, "profile_dists": { "title": "Profile Dists", "type": "object", "description": "Parameters for profile_dists distance calculations", "default": "", "properties": { - "pd_outfmt": { - "type": "string", - "description": "The output format for distances", - "enum": ["matrix"], - "default": "matrix", - "hidden": true - }, "pd_distm": { "type": "string", "description": "The distance method/unit", @@ -168,32 +175,6 @@ } } }, - "gas_cluster": { - "title": "GAS Cluster", - "type": "object", - "description": "", - "default": "Parameters for GAS mcluster", - "properties": { - "gm_thresholds": { - "type": "string", - "default": "10,5,0", - "description": "Thresholds delimited by ','. Values should match units from '--pd_distm' (either 'hamming' or 'scaled').", - "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$" - }, - "gm_method": { - "type": "string", - "default": "average", - "description": "Clustering linkage method.", - "enum": ["single", "average", "complete"] - }, - "gm_delimiter": { - "type": "string", - "default": ".", - "description": "Delimiter desired for nomenclature code.
Must be alphanumeric or one of [._-].", - "pattern": "^[A-Fa-f0-9\\._-]+$" - } - } - }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -353,13 +334,13 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/metadata" + "$ref": "#/definitions/fastmatch" }, { - "$ref": "#/definitions/profile_dists" + "$ref": "#/definitions/metadata" }, { - "$ref": "#/definitions/gas_cluster" + "$ref": "#/definitions/profile_dists" }, { "$ref": "#/definitions/institutional_config_options" diff --git a/tests/data/distances/expected_dists-hamming.tsv b/tests/data/distances/expected_dists-hamming.tsv deleted file mode 100644 index 2b09f4c..0000000 --- a/tests/data/distances/expected_dists-hamming.tsv +++ /dev/null @@ -1,4 +0,0 @@ -dists sample1 sample2 sample3 -sample1 0 1 2 -sample2 1 0 3 -sample3 2 3 0 diff --git a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv index 254033a..535e1de 100644 --- a/tests/data/distances/expected_dists-hash-keep-one-loci.tsv +++ b/tests/data/distances/expected_dists-hash-keep-one-loci.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0 1 0 -sample2 1 0 1 -sample3 0 1 0 +query_id ref_id dist +sample1 sample1 0 +sample1 sample3 0 +sample1 sample2 1 +sample2 sample2 0 +sample2 sample1 1 +sample2 sample3 1 +sample3 sample1 0 +sample3 sample3 0 +sample3 sample2 1 diff --git a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv index 5ead144..b6d8e5f 100644 --- a/tests/data/distances/expected_dists-hash-missing-count-missing.tsv +++ b/tests/data/distances/expected_dists-hash-missing-count-missing.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0 2 2 -sample2 2 0 2 -sample3 2 2 0 +query_id ref_id dist +sample1 sample1 0 +sample1 sample2 2 +sample1 sample3 2 +sample2 sample2 0 +sample2 sample1 2 +sample2 sample3 2 +sample3 sample3 0 +sample3 sample1 2 +sample3 sample2 2 diff --git a/tests/data/distances/expected_dists-hash-missing.tsv b/tests/data/distances/expected_dists-hash-missing.tsv index 3403cd7..2a86a67 100644 --- a/tests/data/distances/expected_dists-hash-missing.tsv +++ b/tests/data/distances/expected_dists-hash-missing.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0 1 1 -sample2 1 0 2 -sample3 1 2 0 +query_id ref_id dist +sample1 sample1 0 +sample1 sample2 1 +sample1 sample3 1 +sample2 sample2 0 +sample2 sample1 1 +sample2 sample3 2 +sample3 sample3 0 +sample3 sample1 1 +sample3 sample2 2 diff --git a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv index 0cc834e..9226d7d 100644 --- a/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv +++ b/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv @@ -1,3 +1,5 @@ -dists sample1 sample2 -sample1 0 2 -sample2 2 0 +query_id ref_id dist +sample1 sample1 0 +sample1 sample2 2 +sample2 sample2 0 +sample2 sample1 2 diff --git a/tests/data/distances/expected_dists-hash-more-missing.tsv b/tests/data/distances/expected_dists-hash-more-missing.tsv index dae119c..cba7ad2 100644 --- a/tests/data/distances/expected_dists-hash-more-missing.tsv +++ b/tests/data/distances/expected_dists-hash-more-missing.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0 2 3 -sample2 2 0 2 -sample3 3 2 0 +query_id ref_id dist +sample1 sample1 0 +sample1 
sample2 2 +sample1 sample3 3 +sample2 sample2 0 +sample2 sample1 2 +sample2 sample3 2 +sample3 sample3 0 +sample3 sample2 2 +sample3 sample1 3 diff --git a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv index 3403cd7..2a86a67 100644 --- a/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv +++ b/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0 1 1 -sample2 1 0 2 -sample3 1 2 0 +query_id ref_id dist +sample1 sample1 0 +sample1 sample2 1 +sample1 sample3 1 +sample2 sample2 0 +sample2 sample1 1 +sample2 sample3 2 +sample3 sample3 0 +sample3 sample1 1 +sample3 sample2 2 diff --git a/tests/data/distances/expected_dists-mismatched-ids.tsv b/tests/data/distances/expected_dists-mismatched-ids.tsv index 1a64f2b..a1311dc 100644 --- a/tests/data/distances/expected_dists-mismatched-ids.tsv +++ b/tests/data/distances/expected_dists-mismatched-ids.tsv @@ -1,4 +1,10 @@ -dists sampleA sampleB sampleC -sampleA 0.0 0.0 33.333333333333336 -sampleB 0.0 0.0 33.333333333333336 -sampleC 33.333333333333336 33.333333333333336 0.0 +query_id ref_id dist +sampleA sampleA 0.0 +sampleA sampleB 0.0 +sampleA sampleC 33.333333333333336 +sampleB sampleA 0.0 +sampleB sampleB 0.0 +sampleB sampleC 33.333333333333336 +sampleC sampleC 0.0 +sampleC sampleA 33.333333333333336 +sampleC sampleB 33.333333333333336 diff --git a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv index e7b7940..a68155f 100644 --- a/tests/data/distances/expected_dists-partial-mismatched-ids.tsv +++ b/tests/data/distances/expected_dists-partial-mismatched-ids.tsv @@ -1,4 +1,10 @@ -dists sampleA sampleB sample3 -sampleA 0.0 0.0 33.333333333333336 -sampleB 0.0 0.0 33.333333333333336 -sample3 33.333333333333336 33.333333333333336 0.0 +query_id ref_id dist +sampleA sampleA 0.0 +sampleA sampleB 0.0 +sampleA sample3 33.333333333333336 +sampleB sampleA 0.0 +sampleB sampleB 0.0 +sampleB sample3 33.333333333333336 +sample3 sample3 0.0 +sample3 sampleA 33.333333333333336 +sample3 sampleB 33.333333333333336 diff --git a/tests/data/distances/expected_dists.tsv b/tests/data/distances/expected_dists.tsv index 00e9cec..4fc6d6f 100644 --- a/tests/data/distances/expected_dists.tsv +++ b/tests/data/distances/expected_dists.tsv @@ -1,4 +1,10 @@ -dists sample1 sample2 sample3 -sample1 0.0 0.0 33.333333333333336 -sample2 0.0 0.0 33.333333333333336 -sample3 33.333333333333336 33.333333333333336 0.0 +query_id ref_id dist +sample1 sample1 0.0 +sample1 sample2 0.0 +sample1 sample3 33.333333333333336 +sample2 sample1 0.0 +sample2 sample2 0.0 +sample2 sample3 33.333333333333336 +sample3 sample3 0.0 +sample3 sample1 33.333333333333336 +sample3 sample2 33.333333333333336 diff --git a/tests/data/profiles/expected-profile-hamming.tsv b/tests/data/profiles/expected-profile-hamming.tsv deleted file mode 100644 index 45cd086..0000000 --- a/tests/data/profiles/expected-profile-hamming.tsv +++ /dev/null @@ -1,4 +0,0 @@ -sample_id l1 l2 l3 -sample1 1 1 1 -sample2 1 2 1 -sample3 2 1 2 diff --git a/tests/data/samplesheets/samplesheet-addsamplename.csv b/tests/data/samplesheets/samplesheet-addsamplename.csv index a1b785a..48cc046 100644 --- a/tests/data/samplesheets/samplesheet-addsamplename.csv +++ b/tests/data/samplesheets/samplesheet-addsamplename.csv @@ -1,4 +1,4 @@ 
diff --git a/tests/data/profiles/expected-profile-hamming.tsv b/tests/data/profiles/expected-profile-hamming.tsv
deleted file mode 100644
index 45cd086..0000000
--- a/tests/data/profiles/expected-profile-hamming.tsv
+++ /dev/null
@@ -1,4 +0,0 @@
-sample_id	l1	l2	l3
-sample1	1	1	1
-sample2	1	2	1
-sample3	2	1	2
diff --git a/tests/data/samplesheets/samplesheet-addsamplename.csv b/tests/data/samplesheets/samplesheet-addsamplename.csv
index a1b785a..48cc046 100644
--- a/tests/data/samplesheets/samplesheet-addsamplename.csv
+++ b/tests/data/samplesheets/samplesheet-addsamplename.csv
@@ -1,4 +1,4 @@
-sample,sample_name,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,S 1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
-sample2,S2#,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
-sample3,S2_,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
+sample,fastmatch_category,sample_name,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,S 1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
+sample2,query,S2#,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
+sample3,reference,S2_,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
diff --git a/tests/data/samplesheets/samplesheet-hamming.csv b/tests/data/samplesheets/samplesheet-hamming.csv
deleted file mode 100644
index d18a69c..0000000
--- a/tests/data/samplesheets/samplesheet-hamming.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hamming/sample1.mlst.subtyping.json,,,,,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hamming/sample2.mlst.subtyping.json,,,,,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hamming/sample3.mlst.subtyping.json,,,,,,,,
diff --git a/tests/data/samplesheets/samplesheet-hash-missing.csv b/tests/data/samplesheets/samplesheet-hash-missing.csv
index 9355c3d..d06d53a 100644
--- a/tests/data/samplesheets/samplesheet-hash-missing.csv
+++ b/tests/data/samplesheets/samplesheet-hash-missing.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample1.mlst.subtyping.json,,,,,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample2.mlst.subtyping.json,,,,,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample3.mlst.subtyping.json,,,,,,,,
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample1.mlst.subtyping.json,,,,,,,,
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample2.mlst.subtyping.json,,,,,,,,
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample3.mlst.subtyping.json,,,,,,,,
diff --git a/tests/data/samplesheets/samplesheet-hash-more-missing.csv b/tests/data/samplesheets/samplesheet-hash-more-missing.csv
index 4ee53c9..5c4a4b4 100644
--- a/tests/data/samplesheets/samplesheet-hash-more-missing.csv
+++ b/tests/data/samplesheets/samplesheet-hash-more-missing.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample1.mlst.subtyping.json,,,,,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample2.mlst.subtyping.json,,,,,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample3-more-missing.mlst.subtyping.json,,,,,,,,
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample1.mlst.subtyping.json,,,,,,,,
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample2.mlst.subtyping.json,,,,,,,,
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/case-hash-missing/sample3-more-missing.mlst.subtyping.json,,,,,,,,
diff --git a/tests/data/samplesheets/samplesheet-little-metadata.csv b/tests/data/samplesheets/samplesheet-little-metadata.csv
index 3e721de..138469e 100644
--- a/tests/data/samplesheets/samplesheet-little-metadata.csv
+++ b/tests/data/samplesheets/samplesheet-little-metadata.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,1.4,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,,,,,,3.8
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,1.4,,,,
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,,
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,,,,,,3.8
diff --git a/tests/data/samplesheets/samplesheet-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-mismatched-ids.csv
index 632768d..ffef4ca 100644
--- a/tests/data/samplesheets/samplesheet-mismatched-ids.csv
+++ b/tests/data/samplesheets/samplesheet-mismatched-ids.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
-sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
-sampleC,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sampleA,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
+sampleB,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
+sampleC,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
diff --git a/tests/data/samplesheets/samplesheet-no-metadata.csv b/tests/data/samplesheets/samplesheet-no-metadata.csv
index 9d67864..f752374 100644
--- a/tests/data/samplesheets/samplesheet-no-metadata.csv
+++ b/tests/data/samplesheets/samplesheet-no-metadata.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,,,,,,,,
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,,,,,,,,
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,,,,,
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,,,,,,,,
diff --git a/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv
index d5d42f0..ab5abab 100644
--- a/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv
+++ b/tests/data/samplesheets/samplesheet-partial-mismatched-ids.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sampleA,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
-sampleB,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sampleA,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
+sampleB,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
diff --git a/tests/data/samplesheets/samplesheet-tabs.csv b/tests/data/samplesheets/samplesheet-tabs.csv
index 56b4243..b863db5 100644
--- a/tests/data/samplesheets/samplesheet-tabs.csv
+++ b/tests/data/samplesheets/samplesheet-tabs.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,a	b,,,,,,,
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,a	b,,,,
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,,,,,,,,a	b
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,a	b,,,,,,,
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,,,,a	b,,,,
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,,,,,,,,a	b
diff --git a/tests/data/samplesheets/samplesheet1.csv b/tests/data/samplesheets/samplesheet1.csv
index 3200344..55cbc2d 100644
--- a/tests/data/samplesheets/samplesheet1.csv
+++ b/tests/data/samplesheets/samplesheet1.csv
@@ -1,4 +1,4 @@
-sample,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
-sample1,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
-sample2,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
-sample3,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
+sample,fastmatch_category,mlst_alleles,metadata_1,metadata_2,metadata_3,metadata_4,metadata_5,metadata_6,metadata_7,metadata_8
+sample1,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample1.mlst.json,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8
+sample2,query,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample2.mlst.json,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8
+sample3,reference,https://raw.githubusercontent.com/phac-nml/gasclustering/dev/tests/data/reports/sample3.mlst.json,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8
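Every samplesheet fixture above gains a fastmatch_category column marking each sample as "query" or "reference". Per the workflows/fastmatchirida.nf diff further below, a blank category falls back to "reference"; a tiny Groovy sketch of that defaulting rule (the function name is illustrative only):

def fastmatchCategory(String refQuery) {
    // Blank or missing categories default to "reference", mirroring the
    // workflow's `if (!ref_query) { meta.ref_query = "reference" }` branch.
    refQuery ?: 'reference'
}

assert fastmatchCategory('query')     == 'query'
assert fastmatchCategory('reference') == 'reference'
assert fastmatchCategory(null)        == 'reference'
assert fastmatchCategory('')          == 'reference'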
diff --git a/tests/pipelines/main.nf.test b/tests/pipelines/main.nf.test
index 2d984e6..864e74c 100644
--- a/tests/pipelines/main.nf.test
+++ b/tests/pipelines/main.nf.test
@@ -12,7 +12,6 @@ nextflow_pipeline {
                 outdir = "results"

                 pd_distm = "scaled"
-                gm_thresholds = "50,20,0"

                 metadata_1_header = "myheader_1"
                 metadata_2_header = "myheader_2"
@@ -28,7 +27,7 @@ nextflow_pipeline {
         then {
             assert workflow.success
             assert path("$launchDir/results").exists()
-
+/*
             // Check ID correction:
             def sampleA_report = path("$launchDir/results/input/sampleA_error_report.csv")
             def sampleB_report = path("$launchDir/results/input/sampleB_error_report.csv")
@@ -47,50 +46,10 @@ nextflow_pipeline {
             assert actual_distances.exists()
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            assert actual_tree.exists()
-            assert actual_clusters.exists()
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmyheader_1\\tmyheader_2\\tmyheader_3\\tmyheader_4\\tmyheader_5\\tmyheader_6\\tmyheader_7\\tmyheader_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\tsample1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsample2\\t1.1.1\\t1\\t1\\t1\\tsample2\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\tsample3\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n")
-
-            // compare IRIDA Next JSON output
-            def iridanext_json = path("$launchDir/results/iridanext.output.json").json
-            def iridanext_global = iridanext_json.files.global
-            def iridanext_samples = iridanext_json.files.samples
-            def iridanext_metadata = iridanext_json.metadata.samples
-
-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
-
-            assert iridanext_samples.isEmpty()
-            assert iridanext_metadata.isEmpty()
+*/
         }
     }
-
+/*
     test("Small-scale test of full pipeline hamming distances") {
         tag "pipeline_hamming"
@@ -99,7 +58,6 @@ nextflow_pipeline {
        when {
            params {
                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
                outdir = "results"
-                gm_thresholds = "2,1,0"
            }
        }
@@ -117,43 +75,20 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hamming.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Make sure average-linkage (default) parameter was used
-            assert "average" == path("$launchDir/results/clusters/gas.mcluster.run.json").json.parameters.method
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata_hamming.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\tsample1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.2\\t1\\t1\\t2\\tsample2\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t2.2.3\\t2\\t2\\t3\\tsample3\\t\\t\\t\\t\\t\\t\\t\\t\\n")
-
             // compare IRIDA Next JSON output
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
             def iridanext_global = iridanext_json.files.global
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
@@ -187,16 +122,6 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hamming_single_linkage.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_single_linkage.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Make sure single-linkage parameter was used
-            assert "single" == path("$launchDir/results/clusters/gas.mcluster.run.json").json.parameters.method
         }
     }
@@ -227,16 +152,6 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hamming_complete_linkage.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_complete_linkage.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Make sure complete-linkage parameter was used
-            assert "complete" == path("$launchDir/results/clusters/gas.mcluster.run.json").json.parameters.method
         }
     }
@@ -273,24 +188,6 @@ nextflow_pipeline {
         }
     }
-
-    test("Test fail pipeline if invalid delimiter set") {
-        tag "pipeline_failure_invalid_delimiter"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_delimiter = ';'
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stderr.contains('* --gm_delimiter: string [;] does not match pattern ^[A-Fa-f0-9\\._-]+$ (;)')
-        }
-    }
-
     test("Full pipeline with no metadata") {
         tag "pipeline_no_metadata"
@@ -319,44 +216,20 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            assert actual_tree.exists()
-            assert actual_clusters.exists()
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata_no_metadata.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\tsample1\\t\\t\\t\\t\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\tsample2\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\tsample3\\t\\t\\t\\t\\t\\t\\t\\t\\n")
-
             // compare IRIDA Next JSON output
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
             def iridanext_global = iridanext_json.files.global
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
@@ -391,26 +264,6 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            assert actual_tree.exists()
-            assert actual_clusters.exists()
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata_little_metadata.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsample1\\t1.1.1\\t1\\t1\\t1\\tsample1\\t\\t\\t\\t1.4\\t\\t\\t\\t\\nsample2\\t1.1.1\\t1\\t1\\t1\\tsample2\\t\\t\\t\\t\\t\\t\\t\\t\\nsample3\\t1.2.2\\t1\\t2\\t2\\tsample3\\t3.1\\t3.2\\t\\t\\t\\t\\t\\t3.8\\n")

             // compare IRIDA Next JSON output
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
@@ -418,17 +271,14 @@ nextflow_pipeline {
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
@@ -477,26 +327,6 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists-mismatched-ids.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            assert actual_tree.exists()
-            assert actual_clusters.exists()
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-mismatched-ids.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-mismatched-ids.tsv")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-mismatched-ids.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\tsampleA\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\tsampleB\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsampleC\\t1.2.2\\t1\\t2\\t2\\tsampleC\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n")

             // compare IRIDA Next JSON output
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
@@ -504,17 +334,14 @@ nextflow_pipeline {
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
@@ -560,26 +387,6 @@ nextflow_pipeline {
             def expected_distances = path("$baseDir/tests/data/distances/expected_dists-partial-mismatched-ids.tsv")
             assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            assert actual_tree.exists()
-            assert actual_clusters.exists()
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree-partial-mismatched-ids.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_dists-partial-mismatched-ids.tsv")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata-partial-mismatched-ids.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nsampleA\\t1.1.1\\t1\\t1\\t1\\tsampleA\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nsampleB\\t1.1.1\\t1\\t1\\t1\\tsampleB\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nsample3\\t1.2.2\\t1\\t2\\t2\\tsample3\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n")

             // compare IRIDA Next JSON output
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
@@ -587,17 +394,14 @@ nextflow_pipeline {
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
@@ -638,36 +442,24 @@ nextflow_pipeline {
             assert sample2_report.exists() == true
             assert sample3_report.exists() == true
-
-            // Check appended metadata is correct:
-            def actual_metadata = path("$launchDir/results/append/clusters_and_metadata.tsv")
-            assert actual_metadata.exists()
-            def expected_metadata = path("$baseDir/tests/data/append/expected_clusters_and_metadata_addsamplename.tsv")
-            assert actual_metadata.text == expected_metadata.text
-
-            // Check that the ArborView output is created
-            def actual_arborview = path("$launchDir/results/ArborView/arborview.clustered_data_arborview.html")
-            assert actual_arborview.exists()
-            assert actual_arborview.text.contains("id\\taddress\\tlevel_1\\tlevel_2\\tlevel_3\\tsample\\tmetadata_1\\tmetadata_2\\tmetadata_3\\tmetadata_4\\tmetadata_5\\tmetadata_6\\tmetadata_7\\tmetadata_8\\nS_1\\t1.1.1\\t1\\t1\\t1\\tsample1\\t1.1\\t1.2\\t1.3\\t1.4\\t1.5\\t1.6\\t1.7\\t1.8\\nS2_\\t1.1.1\\t1\\t1\\t1\\tsample2\\t2.1\\t2.2\\t2.3\\t2.4\\t2.5\\t2.6\\t2.7\\t2.8\\nS2__sample3\\t1.2.2\\t1\\t2\\t2\\tsample3\\t3.1\\t3.2\\t3.3\\t3.4\\t3.5\\t3.6\\t3.7\\t3.8\\n")

             // compare IRIDA Next JSON output (should not be changed by adding sample_name column)
             def iridanext_json = path("$launchDir/results/iridanext.output.json").json
             def iridanext_global = iridanext_json.files.global
             def iridanext_samples = iridanext_json.files.samples
             def iridanext_metadata = iridanext_json.metadata.samples

-            assert iridanext_global.findAll { it.path == "ArborView/arborview.clustered_data_arborview.html" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.run.json" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.tree.nwk" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.clusters.text" }.size() == 1
-            assert iridanext_global.findAll { it.path == "clusters/gas.mcluster.thresholds.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.run.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.results.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.ref_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.query_profile.text" }.size() == 1
             assert iridanext_global.findAll { it.path == "distances/profile_dists.allele_map.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "merged/locidex.merge.profile.tsv" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1
+            assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1

             assert iridanext_samples.isEmpty()
             assert iridanext_metadata.isEmpty()
         }
     }
+*/
 }
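The surviving assertions above now target the flat fast-match outputs (process/results.tsv, process/results.xlsx) and the pairwise profile_dists.results.text rather than the old GAS clustering files. When the commented-out tests are re-enabled, a check along these lines should suffice — a sketch in the file's own nf-test Groovy, reusing the path() helper already used throughout, with the expected header taken from the updated expected_dists fixtures:

// Sketch: verify the pairwise distance header and the new processed outputs.
def distances_header = path("$launchDir/results/distances/profile_dists.results.text").readLines()[0]
assert distances_header == "query_id\tref_id\tdist"

def iridanext_global = path("$launchDir/results/iridanext.output.json").json.files.global
assert iridanext_global.findAll { it.path == "process/results.tsv" }.size() == 1
assert iridanext_global.findAll { it.path == "process/results.xlsx" }.size() == 1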
diff --git a/tests/pipelines/main_gm_thresholds.nf.test b/tests/pipelines/main_gm_thresholds.nf.test
deleted file mode 100644
index ee857cd..0000000
--- a/tests/pipelines/main_gm_thresholds.nf.test
+++ /dev/null
@@ -1,369 +0,0 @@
-nextflow_pipeline {
-
-    name "Integration Tests of adjusting gm_thresholds parameter for clustering"
-    script "main.nf"
-
-    test("Test fail pipeline if null threshold set") {
-        tag "pipeline_failure_null_threshold"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = null
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stdout.contains("ERROR ~ --gm_thresholds null: Cannot pass null or empty string")
-        }
-    }
-
-    test("Test fail pipeline if empty threshold set") {
-        tag "pipeline_failure_no_threshold"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = ""
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stdout.contains("ERROR ~ --gm_thresholds : Cannot pass null or empty string")
-        }
-    }
-
-    test("Test fail pipeline if negative threshold set") {
-        tag "pipeline_failure_negative_threshold"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "-1"
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stderr.contains('* --gm_thresholds: string [-1] does not match pattern ^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$ (-1)')
-        }
-    }
-
-    test("Test fail pipeline if mismatch between thresholds and scaled distm") {
-        tag "pipeline_failure_threshold_scaled"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "200,50"
-                pd_distm = "scaled"
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 200,50' contains thresholds outside of range [0, 100]."
-                + " Please either set '--pd_distm hamming' or adjust the threshold values.")
-        }
-    }
-
-    test("Test fail pipeline if mismatch between thresholds and hamming distm") {
-        tag "pipeline_failure_threshold_hamming"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "2,0.5"
-                pd_distm = "hamming"
-            }
-        }
-
-        then {
-            assert workflow.failed
-            assert workflow.stdout.contains("ERROR ~ '--pd_distm hamming' is set, but '--gm_thresholds 2,0.5' contains fractions."
-                + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.")
-        }
-    }
-
-    test("Test pipeline with single threshold set to 0") {
-        tag "pipeline_thresh_0"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "0"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hamming.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_0.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with single threshold set to 1") {
-        tag "pipeline_thresh_1"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "1"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk")
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hamming.nwk")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_1.txt")
-            assert actual_tree.text == expected_tree.text
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with single threshold set to 2") {
-        tag "pipeline_thresh_2"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "2"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_2.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with single threshold set to 3") {
-        tag "pipeline_thresh_3"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "3"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_3.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with threshold set to 0,0") {
-        tag "pipeline_thresh_0.0"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "0,0"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_0.0.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with threshold set to 1,0") {
-        tag "pipeline_thresh_1.0"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "1,0"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_1.0.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with threshold set to 3,2") {
-        tag "pipeline_thresh_3.2"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "3,2"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_3.2.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-
-    test("Test pipeline with threshold set to 3,3") {
-        tag "pipeline_thresh_3.3"
-
-        when {
-            params {
-                input = "$baseDir/tests/data/samplesheets/samplesheet-hamming.csv"
-                outdir = "results"
-
-                gm_thresholds = "3,3"
-            }
-        }
-
-        then {
-            assert workflow.success
-            assert path("$launchDir/results").exists()
-
-            // Check MLST files
-            def actual_profile_tsv = path("$launchDir/results/merged/locidex.merge.profile.tsv")
-            def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile-hamming.tsv")
-            assert actual_profile_tsv.text == expected_profile_tsv.text
-
-            // Check computed distance matrix is correct and that the file exists
-            def actual_distances = path("$launchDir/results/distances/profile_dists.results.text")
-            def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hamming.tsv")
-            assert actual_distances.text == expected_distances.text
-
-            // Check computed clusters are correct and exist
-            def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text")
-            def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hamming_thresh_3.3.txt")
-            assert actual_clusters.text == expected_clusters.text
-        }
-    }
-}
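The entire gm_thresholds test matrix above collapses into a single --threshold parameter that the workflow validates directly (see the workflows/fastmatchirida.nf diff below). A standalone Groovy sketch of that validation — the method name is illustrative, and the message text is adapted from the diff:

def checkThreshold(String pdDistm, Number threshold) {
    // Only 'hamming' or 'scaled' are accepted distance methods.
    assert pdDistm in ['hamming', 'scaled']
    // Scaled distances are percentages, so the threshold must sit in [0, 100];
    // hamming thresholds are absolute allele-difference counts and are not
    // range-checked here.
    if (pdDistm == 'scaled' && (threshold < 0.0 || threshold > 100.0)) {
        throw new IllegalArgumentException(
            "'--pd_distm scaled' is set, but '--threshold ${threshold}' is outside of range [0, 100].")
    }
}

checkThreshold('scaled', 33.3)   // passes
checkThreshold('hamming', 2)     // passes: absolute count, no range check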
path("$baseDir/tests/data/profiles/expected-profile-hash-missing.tsv") @@ -28,17 +28,10 @@ nextflow_pipeline { def actual_distances = path("$launchDir/results/distances/profile_dists.results.text") def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-missing.tsv") assert actual_distances.text == expected_distances.text - - // Check computed clusters are correct and exist - def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hash_missing.nwk") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_missing.txt") - assert actual_tree.text == expected_tree.text - assert actual_clusters.text == expected_clusters.text + */ } } - +/* test("Full pipeline hashes and missing data count missing as differences") { tag "pipeline_hashes_missing_count_missing" @@ -66,13 +59,6 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-missing-count-missing.tsv") assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_tree = path("$launchDir/results/clusters/gas.mcluster.tree.nwk") - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_tree = path("$baseDir/tests/data/clusters/expected_tree_hash_missing_count_missing.nwk") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_missing_count_missing.txt") - assert actual_tree.text == expected_tree.text - assert actual_clusters.text == expected_clusters.text } } @@ -104,10 +90,6 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-remove-missing-loci.tsv") assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_remove_missing_loci.txt") - assert actual_clusters.text == expected_clusters.text } } @@ -138,10 +120,6 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-more-missing.tsv") assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_more_missing.txt") - assert actual_clusters.text == expected_clusters.text } } @@ -173,10 +151,6 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-more-missing-remove-sample.tsv") assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_more_missing_remove_sample.txt") - assert actual_clusters.text == expected_clusters.text } } @@ -228,15 +202,7 @@ nextflow_pipeline { assert path("$launchDir/results/distances/profile_dists.query_profile.text") .readLines()[0] == "sample_id\tl2\tl3" - // Check computed distance matrix is correct and that the file exists - def actual_distances = 
path("$launchDir/results/distances/profile_dists.results.text") - def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-keep-two-loci.tsv") - assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_keep_two_loci.txt") - assert actual_clusters.text == expected_clusters.text } } @@ -275,10 +241,6 @@ nextflow_pipeline { def expected_distances = path("$baseDir/tests/data/distances/expected_dists-hash-keep-one-loci.tsv") assert actual_distances.text == expected_distances.text - // Check computed clusters are correct and exist - def actual_clusters = path("$launchDir/results/clusters/gas.mcluster.clusters.text") - def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters_hash_keep_one_loci.txt") - assert actual_clusters.text == expected_clusters.text } } @@ -299,4 +261,5 @@ nextflow_pipeline { assert workflow.failed } } +*/ } diff --git a/workflows/gasclustering.nf b/workflows/fastmatchirida.nf similarity index 66% rename from workflows/gasclustering.nf rename to workflows/fastmatchirida.nf index 6c2c9b3..20a1df5 100644 --- a/workflows/gasclustering.nf +++ b/workflows/fastmatchirida.nf @@ -13,7 +13,7 @@ def summary_params = paramsSummaryMap(workflow) // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation -WorkflowGasclustering.initialise(params, log) +Workflowfastmatchirida.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -26,12 +26,12 @@ WorkflowGasclustering.initialise(params, log) IMPORT LOCAL MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { LOCIDEX_MERGE } from '../modules/local/locidex/merge/main' -include { PROFILE_DISTS } from '../modules/local/profile_dists/main' -include { GAS_MCLUSTER } from '../modules/local/gas/mcluster/main' -include { APPEND_METADATA } from '../modules/local/appendmetadata/main' -include { ARBOR_VIEW } from '../modules/local/arborview.nf' -include { INPUT_ASSURE } from "../modules/local/input_assure/main" +include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from '../modules/local/locidex/merge/main' +include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUE } from '../modules/local/locidex/merge/main' +include { PROFILE_DISTS } from '../modules/local/profile_dists/main' +include { INPUT_ASSURE } from "../modules/local/input_assure/main" +include { PROCESS_OUTPUT } from "../modules/local/process_output/main" +include { APPEND_METADATA } from "../modules/local/append_metadata/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -66,10 +66,9 @@ def prepareFilePath(String filep){ return return_path // empty value if file argument is null } -workflow GASCLUSTERING { +workflow FASTMATCH { SAMPLE_HEADER = "sample" ch_versions = Channel.empty() - // Track processed IDs def processedIDs = [] as Set @@ -77,7 +76,7 @@ workflow GASCLUSTERING { // NB: `input` corresponds to `params.input` and associated sample sheet schema input = Channel.fromSamplesheet("input") // and remove non-alphanumeric characters in sample_names (meta.id), whilst also correcting for duplicate sample_names (meta.id) - .map { meta, mlst_file -> + .map { meta, mlst_file, ref_query -> if (!meta.id) { meta.id = meta.irida_id } else { 
diff --git a/workflows/gasclustering.nf b/workflows/fastmatchirida.nf
similarity index 66%
rename from workflows/gasclustering.nf
rename to workflows/fastmatchirida.nf
index 6c2c9b3..20a1df5 100644
--- a/workflows/gasclustering.nf
+++ b/workflows/fastmatchirida.nf
@@ -13,7 +13,7 @@ def summary_params = paramsSummaryMap(workflow)

 // Print parameter summary log to screen
 log.info logo + paramsSummaryLog(workflow) + citation

-WorkflowGasclustering.initialise(params, log)
+Workflowfastmatchirida.initialise(params, log)

@@ -26,12 +26,12 @@
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT LOCAL MODULES/SUBWORKFLOWS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-include { LOCIDEX_MERGE     } from '../modules/local/locidex/merge/main'
-include { PROFILE_DISTS     } from '../modules/local/profile_dists/main'
-include { GAS_MCLUSTER      } from '../modules/local/gas/mcluster/main'
-include { APPEND_METADATA   } from '../modules/local/appendmetadata/main'
-include { ARBOR_VIEW        } from '../modules/local/arborview.nf'
-include { INPUT_ASSURE      } from "../modules/local/input_assure/main"
+include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from '../modules/local/locidex/merge/main'
+include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUE } from '../modules/local/locidex/merge/main'
+include { PROFILE_DISTS                      } from '../modules/local/profile_dists/main'
+include { INPUT_ASSURE                       } from "../modules/local/input_assure/main"
+include { PROCESS_OUTPUT                     } from "../modules/local/process_output/main"
+include { APPEND_METADATA                    } from "../modules/local/append_metadata/main"

@@ -66,10 +66,9 @@ def prepareFilePath(String filep){
     return return_path // empty value if file argument is null
 }

-workflow GASCLUSTERING {
+workflow FASTMATCH {
     SAMPLE_HEADER = "sample"
     ch_versions = Channel.empty()
-
     // Track processed IDs
     def processedIDs = [] as Set

@@ -77,7 +76,7 @@ workflow GASCLUSTERING {
     // NB: `input` corresponds to `params.input` and associated sample sheet schema
     input = Channel.fromSamplesheet("input")
     // and remove non-alphanumeric characters in sample_names (meta.id), whilst also correcting for duplicate sample_names (meta.id)
-        .map { meta, mlst_file ->
+        .map { meta, mlst_file, ref_query ->
            if (!meta.id) {
                meta.id = meta.irida_id
            } else {
@@ -90,7 +89,12 @@ workflow GASCLUSTERING {
            }
            // Add the ID to the set of processed IDs
            processedIDs << meta.id
-
+            // If the fastmatch_category is blank make the default "reference"
+            if (!ref_query) {
+                meta.ref_query = "reference"
+            } else {
+                meta.ref_query = ref_query
+            }
            tuple(meta, mlst_file)}
     // Make sure the ID in samplesheet / meta.id is the same ID
     // as the corresponding MLST JSON file:
@@ -101,6 +105,21 @@ workflow GASCLUSTERING {
         meta, mlst_files -> mlst_files
     }.collect()

+    input_assure.result
+        .branch { meta, mlst_file ->
+            reference: meta.ref_query == "reference"
+            query: meta.ref_query == "query"
+        }.set {merged_alleles}
+
+    merged_alleles_query = merged_alleles.query.map{
+        meta, mlst_files -> mlst_files
+    }.collect()
+
+    merged_alleles_reference = merged_alleles.reference.
+        concat(merged_alleles.query).map{ // Reference will contain both query and reference
+        meta, mlst_files -> mlst_files
+    }.collect()
+
     metadata_headers = Channel.of(
         tuple(
             SAMPLE_HEADER,
@@ -116,8 +135,12 @@ workflow GASCLUSTERING {
             meta.metadata_5, meta.metadata_6, meta.metadata_7, meta.metadata_8)
     }.toList()

-    merged = LOCIDEX_MERGE(merged_alleles)
-    ch_versions = ch_versions.mix(merged.versions)
+    // To avoid collisions of MLST files for the reference set (which includes the query MLST files) we run LOCIDEX twice
+    merged_query = LOCIDEX_MERGE_QUE(merged_alleles_query, "merged_query", "query")
+    ch_versions = ch_versions.mix(merged_query.versions)
+
+    merged_reference = LOCIDEX_MERGE_REF(merged_alleles_reference, "merged_reference", "reference")
+    ch_versions = ch_versions.mix(merged_reference.versions)

     // optional files passed in
     mapping_file = prepareFilePath(params.pd_mapping_file)
@@ -130,39 +153,30 @@ workflow GASCLUSTERING {
         exit 1, "--pd_columns ${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now."
     }

-    if(params.gm_thresholds == null || params.gm_thresholds == ""){
-        exit 1, "--gm_thresholds ${params.gm_thresholds}: Cannot pass null or empty string"
+    // Check that only 'hamming' or 'scaled' are provided to pd_distm
+    if ((params.pd_distm != 'hamming') && (params.pd_distm != 'scaled')) {
+        exit 1, "'--pd_distm ${params.pd_distm}' is an invalid value. Please set to either 'hamming' or 'scaled'."
     }

-    gm_thresholds_list = params.gm_thresholds.toString().split(',')
-    if (params.pd_distm == 'hamming') {
-        if (gm_thresholds_list.any { it != null && it.contains('.') }) {
-            exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains fractions."
-                + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.")
+    // Check that when using scaled distances the threshold lies between 0 and 100
+    if (params.pd_distm == 'scaled') {
+        if ((params.threshold < 0.0) || (params.threshold > 100.0)) {
+            exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--threshold ${params.threshold}' is outside of range [0, 100]."
+                + " Please either set '--pd_distm hamming' or adjust the threshold value.")
         }
-    } else if (params.pd_distm == 'scaled') {
-        if (gm_thresholds_list.any { it != null && (it as Float < 0.0 || it as Float > 100.0) }) {
-            exit 1, ("'--pd_distm ${params.pd_distm}' is set, but '--gm_thresholds ${params.gm_thresholds}' contains thresholds outside of range [0, 100]."
-                + " Please either set '--pd_distm hamming' or adjust the threshold values.")
-        }
-    } else {
-        exit 1, "'--pd_distm ${params.pd_distm}' is an invalid value. Please set to either 'hamming' or 'scaled'."
     }

-    // Options related to profile dists
-    mapping_format = Channel.value(params.pd_outfmt)
+    mapping_format = Channel.value("pairwise")

-    distances = PROFILE_DISTS(merged.combined_profiles, mapping_format, mapping_file, columns_file)
+    distances = PROFILE_DISTS(merged_query.combined_profiles, merged_reference.combined_profiles, mapping_format, mapping_file, columns_file)
     ch_versions = ch_versions.mix(distances.versions)

-    clustered_data = GAS_MCLUSTER(distances.results)
-    ch_versions = ch_versions.mix(clustered_data.versions)
-
-    data_and_metadata = APPEND_METADATA(clustered_data.clusters, metadata_rows, metadata_headers)
-    tree_data = clustered_data.tree.merge(data_and_metadata) // mergeing as no key to join on
+    // Append metadata to references:
+    distances_metadata = APPEND_METADATA(distances.results, metadata_rows, metadata_headers)

-    tree_html = file("$projectDir/assets/ArborView.html")
-    ARBOR_VIEW(tree_data, tree_html)
+    // Process the output:
+    processed_output = PROCESS_OUTPUT(distances_metadata.distances, params.threshold)
+    ch_versions = ch_versions.mix(processed_output.versions)

     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')