bugfix

metagenomics · May 13, 2024 · a4b7301 · a4b7301
1 parent 47bafdb
commit a4b7301
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 49 deletions.
diff --git a/examples/localtest.yaml b/examples/localtest.yaml
@@ -1,85 +1,56 @@
-# ABOUT: This a config for submitting samples, paired-end reads, an assembly, bins and MAGs
-# ABOUT: Coverage is derived from 2 bam files.
+# ABOUT: This is a config for submitting 2 sets of paired-end reads, an assembly and bins
+# ABOUT: Coverage is known.
 # ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE
 # USAGE: navigate to the <examples> directory
-# USAGE: submg submit --config 01_samples_reads_assembly_bins_mags.yaml --staging_dir <path/to/staging/dir> --logging_dir <path/to/logging/dir> --submit_samples --submit_reads --submit_assembly --submit_bins --submit_mags
+# USAGE: submg submit --config 05_reads_assembly_bins.yaml --staging_dir <path/to/staging/dir> --logging_dir <path/to/logging/dir> --submit_reads --submit_assembly --submit_bins
+
 
 STUDY: "PRJEB71644"                                                           # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644"
 METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome"                     # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome"
 METAGENOME_TAXID: "718289"                                                    # Taxonomic identifier of the assembly. Must match METAGENOME_SCIENTIFIC_NAME >>EXAMPLE: "718289"
 SEQUENCING_PLATFORMS: ["ILLUMINA"]                                            # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"]
-PROJECT_NAME: "7yyIc_AgRFex 2 Survey"                                         # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey"
-NEW_SAMPLES:                                                                  # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE.
-- TITLE: "7yyIc_hydrolysis digester sample"                                   # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample"
-  collection date: "2022-07-12"                                               # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
-  geographic location (country and/or sea): "Germany"                         # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
-  ADDITIONAL_SAMPLESHEET_FIELDS:                                              # Please add more fields from the ENA samplesheet that most closely matches your experiment
-    geographic location (latitude): 52.51                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
-    geographic location (longitude): 8.77                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
-    broad-scale environmental context: "tropical biome"                       # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
-    local environmental context: "tropical marine upwelling biome"            # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
-    environmental medium: "grass silage|animal waste material|anoxic water"   # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
-- TITLE: "7yyIc_main digester sample"                                         # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample"
-  collection date: "2022-07-12"                                               # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
-  geographic location (country and/or sea): "Germany"                         # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
-  ADDITIONAL_SAMPLESHEET_FIELDS:                                              # Please add more fields from the ENA samplesheet that most closely matches your experiment
-    geographic location (latitude): 52.51                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
-    geographic location (longitude): 8.77                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
-    broad-scale environmental context: "tropical biome"                       # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
-    local environmental context: "tropical marine upwelling biome"            # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
-    environmental medium: "grass silage|animal waste material|anoxic water"   # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
+SAMPLE_ACCESSIONS: ['SAMEA113417017', 'SAMEA113417018']                       # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"]
 PAIRED_END_READS:                                 
-- NAME: "7yyIc_dh_pe"                                                         # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
+- NAME: "AKQ4G_ex05_rp1"                                                            # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
   SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500"                                # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"]
   LIBRARY_SOURCE: "METAGENOMIC"                                               # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
   LIBRARY_SELECTION: "RANDOM"                                                 # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-selection >>EXAMPLE: "RANDOM"
   LIBRARY_STRATEGY: "WGS"                                                     # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS"
   INSERT_SIZE: "300"                                                          # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300"
   FASTQ1_FILE: "data/reads/fwd1.fastq"                                        # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz"
   FASTQ2_FILE: "data/reads/rev1.fastq"                                        # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz"
-  RELATED_SAMPLE_TITLE: "7yyIc_hydrolysis digester sample"                    # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample"
+  RELATED_SAMPLE_ACCESSION: 'SAMEA113417017'                                  # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933"
   ADDITIONAL_MANIFEST_FIELDS:                                                 # You can add additional fields that will be written to the manifest
-- NAME: "7yyIc_dm_pe"                                                         # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
+- NAME: "AKQ4G_ex05_rp2"                                                            # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1"
   SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500"                                # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"]
-  LIBRARY_SOURCE: "GENOMIC"                                                   # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
+  LIBRARY_SOURCE: "METAGENOMIC"                                               # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC"
   LIBRARY_SELECTION: "RANDOM"                                                 # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-selection >>EXAMPLE: "RANDOM"
   LIBRARY_STRATEGY: "WGS"                                                     # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS"
   INSERT_SIZE: "300"                                                          # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300"
   FASTQ1_FILE: "data/reads/fwd2.fastq"                                        # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz"
   FASTQ2_FILE: "data/reads/rev2.fastq"                                        # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz"
-  RELATED_SAMPLE_TITLE: "7yyIc_main digester sample"                          # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample"
+  RELATED_SAMPLE_ACCESSION: 'SAMEA113417018'                                  # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933"
   ADDITIONAL_MANIFEST_FIELDS:                                                 # You can add additional fields that will be written to the manifest
 ASSEMBLY:                                         
-  ASSEMBLY_NAME: "7yyIc_e01_coasm"                                       # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg"
+  ASSEMBLY_NAME: "AKQ4G_e05_coasm"                                       # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg"
   ASSEMBLY_SOFTWARE: "MEGAHIT"                                                # Software used to generate the assembly >>EXAMPLE: "MEGAHIT"
   ISOLATION_SOURCE: "biogas plant anaerobic digester"                         # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester"
   FASTA_FILE: "data/assembly.fasta"                                           # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz"
-  collection date: "2022-07-12"                                               # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
+  collection date: "2024-01-01"                                               # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03"
   geographic location (country and/or sea): "Germany"                         # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany"
+  COVERAGE_VALUE: 128.27                                                      # Read coverage of the assembly.
   ADDITIONAL_SAMPLESHEET_FIELDS:                                              # Please add more fields from the ENA samplesheet that most closely matches your experiment
-    geographic location (latitude): 52.51                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85"
-    geographic location (longitude): 8.77                                     # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65"
-    broad-scale environmental context: "tropical biome"                       # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome"
-    local environmental context: "tropical marine upwelling biome"            # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome"
-    environmental medium: "grass silage|animal waste material|anoxic water"   # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water"
   ADDITIONAL_MANIFEST_FIELDS:                                                 # You can add additional fields that will be written to the manifest
 BINS:                                             
   BINS_DIRECTORY: "data/3bins"                                                # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins"
   COMPLETENESS_SOFTWARE: "CheckM"                                             # Software used to calculate completeness >>EXAMPLE: "CheckM"
   QUALITY_FILE: "data/checkm_quality_3bins.tsv"                               # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv"
   NCBI_TAXONOMY_FILES:                                                        # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"]
   - "data/taxonomy/archaea_taxonomy.tsv"
-  - "data/taxonomy/bacteria_taxonomy.tsv"                              
+  - "data/taxonomy/bacteria_taxonomy.tsv"
   MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv"                   # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv"
   BINNING_SOFTWARE: "metabat2"                                                # The program that was used for binning. >>EXAMPLE: "metabat2"
-  ADDITIONAL_SAMPLESHEET_FIELDS:                                              # Please add more fields from the ENA samplesheet that most closely matches your experiment
-    binning parameters: "default"                                             # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "default"
-    taxonomic identity marker: "multi marker approach"                        # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "multi marker approach"
-  ADDITIONAL_MANIFEST_FIELDS:                                                 # You can add additional fields that will be written to the manifest
-MAGS:                                             
-  MAG_METADATA_FILE: "data/mag_metadata/mag_metadata.tsv"                     # A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path' and 'Unlocalised_path' for all MAGs. See README for more details. >>EXAMPLE: "/mnt/data/mag_data.tsv"
   ADDITIONAL_SAMPLESHEET_FIELDS:                                              # Please add more fields from the ENA samplesheet that most closely matches your experiment
   ADDITIONAL_MANIFEST_FIELDS:                                                 # You can add additional fields that will be written to the manifest
-BAM_FILES:                                                                    # The reads from your experiment mapped back to the assembly
-  - "data/mapping/1.sorted.bam"
-  - "data/mapping/2.sorted.bam"
+  COVERAGE_FILE: "data/bin_coverage.tsv"                                      # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'.
+
diff --git a/submg/magSubmission.py b/submg/magSubmission.py
@@ -399,11 +399,12 @@ def submit_mags(config: dict,
     bin_files = binSubmission.get_bins_in_dir(bins_directory)
     if not depth_files is None:
         bin_coverages = binSubmission.bin_coverage_from_depth(depth_files,
-                                                bin_files,
-                                                threads=threads)
+                                                              bin_files,
+                                                              threads=threads)
     elif not bin_coverage_file is None:
-        bin_coverages = binSubmission.bin_coverage_from_tsv(bin_coverage_file,
-                                              bin_files)
+        bin_coverages = binSubmission.bin_coverage_from_tsv(mag_metadata.keys(),
+                                                            bin_coverage_file,
+                                                            bin_files)
 
     # Make a samplesheet for all MAGs
     loggingC.message(">Making MAG samplesheet", threshold=1)