diff --git a/examples/localtest.yaml b/examples/localtest.yaml deleted file mode 100644 index b98bfcc..0000000 --- a/examples/localtest.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# ABOUT: This is a config for submitting 2 set of paired end reads, an assembly and bins -# ABOUT: Coverage is known. -# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE -# USAGE: navigate to the directory -# USAGE: submg submit --config 05_reads_assembly_bins.yaml --staging_dir --logging_dir --submit_reads --submit_assembly --submit_bins - - -STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" -METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome" # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" -METAGENOME_TAXID: "718289" # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" -SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] -SAMPLE_ACCESSIONS: ['SAMEA113417017', 'SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. 
>>EXAMPLE: ["ERS15898933","ERS15898932"] -PAIRED_END_READS: -- NAME: "AKQ4G_ex05_rp1" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" - FASTQ1_FILE: "data/reads/fwd1.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" - FASTQ2_FILE: "data/reads/rev1.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -- NAME: "AKQ4G_ex05_rp2" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: "RANDOM" # One of 
https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" - FASTQ1_FILE: "data/reads/fwd2.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" - FASTQ2_FILE: "data/reads/rev2.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_ACCESSION: 'SAMEA113417018' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -ASSEMBLY: - ASSEMBLY_NAME: "AKQ4G_e05_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" - ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" - ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" - FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" - collection date: "2024-01-01" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - COVERAGE_VALUE: 128.27 # Read coverage of the assembly. 
- ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -BINS: - BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" - COMPLETENESS_SOFTWARE: "CheckM" # Software used to calculate completeness >>EXAMPLE: "CheckM" - QUALITY_FILE: "data/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" - NCBI_TAXONOMY_FILES: # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] - - "data/taxonomy/archaea_taxonomy.tsv" - - "data/taxonomy/bacteria_taxonomy.tsv" - MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" - BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest - COVERAGE_FILE: "data/bin_coverage.tsv" # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'. 
- \ No newline at end of file diff --git a/submg/binSubmission.py b/submg/binSubmission.py index 31df6e6..05421ab 100644 --- a/submg/binSubmission.py +++ b/submg/binSubmission.py @@ -110,7 +110,7 @@ def get_bin_quality(config, silent=False) -> dict: def __prep_bins_samplesheet(filtered_bins: list, config: dict, - assembly_sample_accession: str, + sample_accession_data: list, samples_submission_dir: str, upload_taxonomy_data: dict) -> str: """ @@ -119,9 +119,8 @@ def __prep_bins_samplesheet(filtered_bins: list, Args: filtered_bins (list): A list of bin names to submit. config (dict): The config dictionary. - assembly_sample_accession (str): Either the accession of a co-assembly - virtual sample or the accession of the single biological sample - which the assembly is based on. + sample_accession_data (list): A list of dictionaries with the data + about each biological sample samples_submission_dir (str): The directory where the samplesheet will be written to. upload_taxonomy_data (dict): A dictionary with the taxid and scientific @@ -143,7 +142,7 @@ def __prep_bins_samplesheet(filtered_bins: list, collection_date = utility.from_config(config, 'ASSEMBLY', 'collection date') geographic_location_country = utility.from_config(config, 'ASSEMBLY', 'geographic location (country and/or sea)') investigation_type = staticConfig.bin_investigation_type - sample_derived_from = assembly_sample_accession + sample_derived_from = ",".join([x['accession'] for x in sample_accession_data]) metagenomic_source = utility.from_config(config, 'METAGENOME_SCIENTIFIC_NAME') sequencing_method = utility.from_config(config, 'SEQUENCING_PLATFORMS') if isinstance(sequencing_method, list): @@ -518,7 +517,7 @@ def get_bins_in_dir(bins_directory: str) -> list: def submit_bins(filtered_bins: list, config: dict, upload_taxonomy_data: dict, - assembly_sample_accession: str, + sample_accession_data: list, run_accessions, staging_dir: str, logging_dir: str, @@ -536,9 +535,8 @@ def 
submit_bins(filtered_bins: list, config (dict): The config dictionary. upload_taxonomy_data (dict): A dictionary with the taxid and scientific name for each bin. - assembly_sample_accession (str): Either the accession of a co-assembly - virtual sample or the accession of the single biological sample - which the assembly is based on. + sample_accession_data (list): A list of dictionaries with the data + about each biological sample run_accessions (list): A list of accession numbers of the runs. staging_dir (str): The directory where the bins will be staged. logging_dir (str): The directory where the logs will be written to. @@ -591,7 +589,7 @@ def submit_bins(filtered_bins: list, os.makedirs(samples_submission_dir, exist_ok=False) samplesheet = __prep_bins_samplesheet(filtered_bins, config, - assembly_sample_accession, + sample_accession_data, samples_submission_dir, upload_taxonomy_data) diff --git a/submg/magSubmission.py b/submg/magSubmission.py index 74fb9bf..16932b4 100644 --- a/submg/magSubmission.py +++ b/submg/magSubmission.py @@ -69,8 +69,8 @@ def __prep_mags_samplesheet(config: dict, Args: config (dict): The config dictionary. - sample_accession_data (list): A list of dictionaries with the sample - accession data. + sample_accession_data (list): A list of dictionaries with the data + about each biological sample mag_metadata (dict): A dictionary with the metadata for each MAG. bin_taxonomy_data (dict): A dictionary with the taxonomy data for each bin. @@ -351,7 +351,6 @@ def submit_mags(config: dict, sample_accession_data: list, run_accessions, bin_taxonomy_data: dict, - # bin_to_sample: dict, staging_dir: str, logging_dir: str, depth_files: str, @@ -365,10 +364,12 @@ def submit_mags(config: dict, Args: config (dict): The config dictionary. - upload_taxonomy_data (dict): A dictionary with the taxid and scientific - name for each bin. metagenome_scientific_name (str): The scientific name of the metagenome. 
+ sample_accession_data (list): A list of dictionaries with the data + about each biological sample. run_accessions (list): A list of accession numbers of the runs. + bin_taxonomy_data (dict): A dictionary with the taxid and scientific + name for each bin. staging_dir (str): The directory where the bins will be staged. logging_dir (str): The directory where the logs will be written to. depth_files (list): A list of paths to the depth files. Either this or diff --git a/submg/main.py b/submg/main.py index d1b8403..2cbd736 100644 --- a/submg/main.py +++ b/submg/main.py @@ -323,8 +323,6 @@ def submit(args): args.staging_dir, args.logging_dir, test=args.development_service) - - else: sample_accessions = utility.from_config(config, 'SAMPLE_ACCESSIONS') @@ -383,7 +381,7 @@ def submit(args): submit_bins(filtered_bins, config, bin_taxonomy, - assembly_sample_accession, + sample_accession_data, run_accessions, prepdir(args.staging_dir, 'bins'), prepdir(args.logging_dir, 'bins'),