Skip to content

Commit

Permalink
bugfix
Browse files Browse the repository at this point in the history
  • Loading branch information
ttubb committed May 13, 2024
1 parent 47bafdb commit a4b7301
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 49 deletions.
61 changes: 16 additions & 45 deletions examples/localtest.yaml
Original file line number Diff line number Diff line change
@@ -1,85 +1,56 @@
# ABOUT: This a config for submitting samples, paired-end reads, an assembly, bins and MAGs
# ABOUT: Coverage is derived from 2 bam files.
# ABOUT: This is a config for submitting 2 set of paired end reads, an assembly and bins
# ABOUT: Coverage is known.
# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE
# USAGE: navigate to the <examples> directory
# USAGE: submg submit --config 01_samples_reads_assembly_bins_mags.yaml --staging_dir <path/to/staging/dir> --logging_dir <path/to/logging/dir> --submit_samples --submit_reads --submit_assembly --submit_bins --submit_mags
# USAGE: submg submit --config 05_reads_assembly_bins.yaml --staging_dir <path/to/staging/dir> --logging_dir <path/to/logging/dir> --submit_reads --submit_assembly --submit_bins


STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644"
METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome" # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome"
METAGENOME_TAXID: "718289" # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289"
SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"]
PROJECT_NAME: "7yyIc_AgRFex 2 Survey" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey"
NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE.
- TITLE: "7yyIc_hydrolysis digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample"
collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment
geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
- TITLE: "7yyIc_main digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample"
collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment
geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
SAMPLE_ACCESSIONS: ['SAMEA113417017', 'SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"]
PAIRED_END_READS:
- NAME: "7yyIc_dh_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
- NAME: "AKQ4G_ex05_rp1" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"]
LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM"
LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS"
INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300"
FASTQ1_FILE: "data/reads/fwd1.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz"
FASTQ2_FILE: "data/reads/rev1.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz"
RELATED_SAMPLE_TITLE: "7yyIc_hydrolysis digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample"
RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933"
ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest
- NAME: "7yyIc_dm_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
- NAME: "AKQ4G_ex05_rp2" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"]
LIBRARY_SOURCE: "GENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM"
LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS"
INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300"
FASTQ1_FILE: "data/reads/fwd2.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz"
FASTQ2_FILE: "data/reads/rev2.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz"
RELATED_SAMPLE_TITLE: "7yyIc_main digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample"
RELATED_SAMPLE_ACCESSION: 'SAMEA113417018' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933"
ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest
ASSEMBLY:
ASSEMBLY_NAME: "7yyIc_e01_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg"
ASSEMBLY_NAME: "AKQ4G_e05_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg"
ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT"
ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester"
FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz"
collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
collection date: "2024-01-01" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
COVERAGE_VALUE: 128.27 # Read coverage of the assembly.
ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment
geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest
BINS:
BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins"
COMPLETENESS_SOFTWARE: "CheckM" # Software used to calculate completeness >>EXAMPLE: "CheckM"
QUALITY_FILE: "data/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv"
NCBI_TAXONOMY_FILES: # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"]
- "data/taxonomy/archaea_taxonomy.tsv"
- "data/taxonomy/bacteria_taxonomy.tsv"
- "data/taxonomy/bacteria_taxonomy.tsv"
MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv"
BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2"
ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment
binning parameters: "default" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "default"
taxonomic identity marker: "multi marker approach" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "multi marker approach"
ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest
MAGS:
MAG_METADATA_FILE: "data/mag_metadata/mag_metadata.tsv" # A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path' and 'Unlocalised_path' for all MAGs. See README for more details. >>EXAMPLE: "/mnt/data/mag_data.tsv"
ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment
ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest
BAM_FILES: # The reads from your experiment mapped back to the assembly
- "data/mapping/1.sorted.bam"
- "data/mapping/2.sorted.bam"
COVERAGE_FILE: "data/bin_coverage.tsv" # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'.

9 changes: 5 additions & 4 deletions submg/magSubmission.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,11 +399,12 @@ def submit_mags(config: dict,
bin_files = binSubmission.get_bins_in_dir(bins_directory)
if not depth_files is None:
bin_coverages = binSubmission.bin_coverage_from_depth(depth_files,
bin_files,
threads=threads)
bin_files,
threads=threads)
elif not bin_coverage_file is None:
bin_coverages = binSubmission.bin_coverage_from_tsv(bin_coverage_file,
bin_files)
bin_coverages = binSubmission.bin_coverage_from_tsv(mag_metadata.keys(),
bin_coverage_file,
bin_files)

# Make a samplesheet for all MAGs
loggingC.message(">Making MAG samplesheet", threshold=1)
Expand Down

0 comments on commit a4b7301

Please sign in to comment.