Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: output GenBank feature table file #982

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 23 additions & 1 deletion packages_rs/nextclade-cli/src/cli/nextalign_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ pub enum NextalignOutputSelection {
All,
Fasta,
Translations,
FeatureTable,
Insertions,
Errors,
}
Expand Down Expand Up @@ -148,7 +149,7 @@ pub struct NextalignRunOutputArgs {
///
/// If both the `--output-all` and individual `--output-*` flags are provided, each individual flag overrides the corresponding default output path.
///
/// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-translations`, `--output-insertions`, `--output-errors`
/// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-translations`, `--output-feature-table`, `--output-insertions`, `--output-errors`
///
/// If the required directory tree does not exist, it will be created.
#[clap(long, short = 'O')]
Expand Down Expand Up @@ -212,6 +213,22 @@ pub struct NextalignRunOutputArgs {
#[clap(value_hint = ValueHint::AnyPath)]
pub output_translations: Option<String>,

/// Path to output GenBank Feature Table file (.tbl).
///
/// Writes Feature Table in GenBank file format, which can be used to facilitate submissions to the GenBank database.
///
/// See: https://www.ncbi.nlm.nih.gov/genbank/feature_table/
///
/// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`.
///
/// If filename ends with one of the supported file extensions: `gz`, `bz2`, `xz`, `zstd`, it will be transparently
/// compressed. If a filename is "-" then the output will be written uncompressed to standard output (stdout).
///
/// If the required directory tree does not exist, it will be created.
#[clap(long, short = 'F')]
#[clap(value_hint = ValueHint::AnyPath)]
pub output_feature_table: Option<PathBuf>,

/// Path to output CSV file that contains insertions stripped from the reference alignment.
///
/// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`.
Expand Down Expand Up @@ -320,6 +337,7 @@ pub fn nextalign_get_output_filenames(run_args: &mut NextalignRunArgs) -> Result
output_selection,
output_fasta,
output_translations,
output_feature_table,
output_insertions,
output_errors,
include_reference,
Expand Down Expand Up @@ -373,6 +391,10 @@ pub fn nextalign_get_output_filenames(run_args: &mut NextalignRunArgs) -> Result
output_translations.get_or_insert(output_translations_template)
};
}

if output_selection.contains(&NextalignOutputSelection::FeatureTable) {
let output_feature_table = output_feature_table.get_or_insert(add_extension(&default_output_file_path, "tbl"));
}
}

if let Some(output_translations) = output_translations {
Expand Down
2 changes: 2 additions & 0 deletions packages_rs/nextclade-cli/src/cli/nextalign_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
output_selection,
output_fasta,
output_translations,
output_feature_table,
output_insertions,
output_errors,
include_reference,
Expand Down Expand Up @@ -152,6 +153,7 @@ pub fn nextalign_run(run_args: NextalignRunArgs) -> Result<(), Report> {
&gene_map,
&output_fasta,
&output_translations,
&output_feature_table,
&output_insertions,
&output_errors,
in_order,
Expand Down
15 changes: 15 additions & 0 deletions packages_rs/nextclade-cli/src/cli/nextalign_ordered_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use eyre::{Report, WrapErr};
use log::warn;
use nextclade::io::errors_csv::ErrorsCsvWriter;
use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter};
use nextclade::io::genbank_feature_table::{GenbankFeatureTableEntry, GenbankFeatureTableFileWriter};
use nextclade::io::gene_map::GeneMap;
use nextclade::io::insertions_csv::InsertionsCsvWriter;
use nextclade::io::nuc::from_nuc_seq;
Expand All @@ -17,6 +18,7 @@ use std::path::PathBuf;
pub struct NextalignOrderedWriter<'a> {
fasta_writer: Option<FastaWriter>,
fasta_peptide_writer: Option<FastaPeptideWriter>,
feature_table_writer: Option<GenbankFeatureTableFileWriter<'a>>,
insertions_csv_writer: Option<InsertionsCsvWriter>,
errors_csv_writer: Option<ErrorsCsvWriter<'a>>,
expected_index: usize,
Expand All @@ -29,6 +31,7 @@ impl<'a> NextalignOrderedWriter<'a> {
gene_map: &'a GeneMap,
output_fasta: &Option<PathBuf>,
output_translations: &Option<String>,
output_feature_table: &Option<PathBuf>,
output_insertions: &Option<PathBuf>,
output_errors: &Option<PathBuf>,
in_order: bool,
Expand All @@ -38,6 +41,9 @@ impl<'a> NextalignOrderedWriter<'a> {
let fasta_peptide_writer = output_translations
.map_ref_fallible(|output_translations| FastaPeptideWriter::new(gene_map, &output_translations))?;

let feature_table_writer = output_feature_table
.map_ref_fallible(|output_feature_table| GenbankFeatureTableFileWriter::new(output_feature_table, gene_map))?;

let insertions_csv_writer = output_insertions.map_ref_fallible(InsertionsCsvWriter::new)?;

let errors_csv_writer =
Expand All @@ -46,6 +52,7 @@ impl<'a> NextalignOrderedWriter<'a> {
Ok(Self {
fasta_writer,
fasta_peptide_writer,
feature_table_writer,
insertions_csv_writer,
errors_csv_writer,
expected_index: 0,
Expand Down Expand Up @@ -85,6 +92,7 @@ impl<'a> NextalignOrderedWriter<'a> {
stripped,
alignment,
translations,
gene_ranges_qry,
warnings,
missing_genes,
is_reverse_complement,
Expand All @@ -100,6 +108,13 @@ impl<'a> NextalignOrderedWriter<'a> {
}
}

if let Some(feature_table_writer) = &mut self.feature_table_writer {
feature_table_writer.write(&GenbankFeatureTableEntry {
seq_name: seq_name.clone(),
gene_ranges_qry: gene_ranges_qry.clone(),
})?;
}

if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer {
insertions_csv_writer.write(&seq_name, &stripped.insertions, translations)?;
}
Expand Down
24 changes: 23 additions & 1 deletion packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ pub enum NextcladeOutputSelection {
Tsv,
Tree,
Translations,
FeatureTable,
Insertions,
Errors,
}
Expand Down Expand Up @@ -361,7 +362,7 @@ pub struct NextcladeRunOutputArgs {
///
/// If both the `--output-all` and individual `--output-*` flags are provided, each individual flag overrides the corresponding default output path.
///
/// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-ndjson`, `--output-json`, `--output-csv`, `--output-tsv`, `--output-tree`, `--output-translations`, `--output-insertions`, `--output-errors`
/// At least one of the output flags is required: `--output-all`, `--output-fasta`, `--output-ndjson`, `--output-json`, `--output-csv`, `--output-tsv`, `--output-tree`, `--output-translations`, `--output-feature-table`, `--output-insertions`, `--output-errors`
///
/// If the required directory tree does not exist, it will be created.
#[clap(long, short = 'O')]
Expand Down Expand Up @@ -425,6 +426,22 @@ pub struct NextcladeRunOutputArgs {
#[clap(value_hint = ValueHint::AnyPath)]
pub output_translations: Option<String>,

/// Path to output GenBank Feature Table file (.tbl).
///
/// Writes Feature Table in GenBank file format, which can be used to facilitate submissions to the GenBank database.
///
/// See: https://www.ncbi.nlm.nih.gov/genbank/feature_table/
///
/// Takes precedence over paths configured with `--output-all`, `--output-basename` and `--output-selection`.
///
/// If filename ends with one of the supported file extensions: `gz`, `bz2`, `xz`, `zstd`, it will be transparently
/// compressed. If a filename is "-" then the output will be written uncompressed to standard output (stdout).
///
/// If the required directory tree does not exist, it will be created.
#[clap(long, short = 'F')]
#[clap(value_hint = ValueHint::AnyPath)]
pub output_feature_table: Option<PathBuf>,

/// Path to output Newline-delimited JSON (NDJSON) results file.
///
/// This file format is most suitable for further machine processing of the results. In contrast to plain JSON, it can be streamed line by line, so much bigger outputs are feasible.
Expand Down Expand Up @@ -615,6 +632,7 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result
output_selection,
output_fasta,
output_translations,
output_feature_table,
output_ndjson,
output_json,
output_csv,
Expand Down Expand Up @@ -671,6 +689,10 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result
output_translations.get_or_insert(output_translations_template);
}

if output_selection.contains(&NextcladeOutputSelection::FeatureTable) {
let output_feature_table = output_feature_table.get_or_insert(add_extension(&default_output_file_path, "tbl"));
}

if output_selection.contains(&NextcladeOutputSelection::Ndjson) {
output_ndjson.get_or_insert(add_extension(&default_output_file_path, "ndjson"));
}
Expand Down
8 changes: 6 additions & 2 deletions packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::dataset::dataset_download::{
};
use crossbeam::thread;
use eyre::{Report, WrapErr};
use indexmap::IndexMap;
use itertools::Itertools;
use log::info;
use nextclade::align::gap_open::{get_gap_open_close_scores_codon_aware, get_gap_open_close_scores_flat};
Expand All @@ -22,12 +23,13 @@ use nextclade::translate::translate_genes_ref::translate_genes_ref;
use nextclade::tree::tree_attach_new_nodes::tree_attach_new_nodes_in_place;
use nextclade::tree::tree_preprocess::tree_preprocess_in_place;
use nextclade::types::outputs::NextcladeOutputs;
use nextclade::utils::range::Range;
use std::path::PathBuf;

/// Per-sequence result record that `nextclade_run` receives from `result_receiver` and that the ordered writer
/// destructures when writing outputs.
pub struct NextcladeRecord {
// NOTE(review): presumably the position of the sequence in the input, used to restore input order — confirm.
pub index: usize,
/// Name of the query sequence; the writers pass it on to the per-format outputs.
pub seq_name: String,
// This diff view shows both sides of the change to `outputs_or_err`.
// Pre-change type: (stripped query sequence, translations, outputs) on success, or the error `Report`.
pub outputs_or_err: Result<(Vec<Nuc>, Vec<Translation>, NextcladeOutputs), Report>,
// Post-change type: the third tuple element is new; the ordered writer binds it as `gene_ranges_qry` and
// forwards it to the feature table writer.
pub outputs_or_err: Result<(Vec<Nuc>, Vec<Translation>, IndexMap<String, Range>, NextcladeOutputs), Report>,
}

pub struct DatasetFilePaths {
Expand Down Expand Up @@ -83,6 +85,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
output_selection,
output_fasta,
output_translations,
output_feature_table,
output_ndjson,
output_json,
output_csv,
Expand Down Expand Up @@ -229,6 +232,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
&output_insertions,
&output_errors,
&output_translations,
&output_feature_table,
in_order,
)
.wrap_err("When creating output writer")
Expand All @@ -243,7 +247,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {

for record in result_receiver {
if should_keep_outputs {
if let Ok((_, _, nextclade_outputs)) = &record.outputs_or_err {
if let Ok((_, _, _, nextclade_outputs)) = &record.outputs_or_err {
outputs.push(nextclade_outputs.clone());
}
}
Expand Down
16 changes: 15 additions & 1 deletion packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use itertools::Itertools;
use log::warn;
use nextclade::io::errors_csv::ErrorsCsvWriter;
use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter};
use nextclade::io::genbank_feature_table::{GenbankFeatureTableEntry, GenbankFeatureTableFileWriter};
use nextclade::io::gene_map::GeneMap;
use nextclade::io::insertions_csv::InsertionsCsvWriter;
use nextclade::io::ndjson::NdjsonFileWriter;
Expand All @@ -23,6 +24,7 @@ use std::path::PathBuf;
pub struct NextcladeOrderedWriter<'a> {
fasta_writer: Option<FastaWriter>,
fasta_peptide_writer: Option<FastaPeptideWriter>,
feature_table_writer: Option<GenbankFeatureTableFileWriter<'a>>,
output_json_writer: Option<ResultsJsonWriter>,
output_ndjson_writer: Option<NdjsonFileWriter>,
output_csv_writer: Option<NextcladeResultsCsvFileWriter>,
Expand All @@ -46,13 +48,17 @@ impl<'a> NextcladeOrderedWriter<'a> {
output_insertions: &Option<PathBuf>,
output_errors: &Option<PathBuf>,
output_translations: &Option<String>,
output_feature_table: &Option<PathBuf>,
in_order: bool,
) -> Result<Self, Report> {
let fasta_writer = output_fasta.map_ref_fallible(FastaWriter::from_path)?;

let fasta_peptide_writer = output_translations
.map_ref_fallible(|output_translations| FastaPeptideWriter::new(gene_map, &output_translations))?;

let feature_table_writer = output_feature_table
.map_ref_fallible(|output_feature_table| GenbankFeatureTableFileWriter::new(output_feature_table, gene_map))?;

let insertions_csv_writer = output_insertions.map_ref_fallible(InsertionsCsvWriter::new)?;

let errors_csv_writer =
Expand All @@ -75,6 +81,7 @@ impl<'a> NextcladeOrderedWriter<'a> {
Ok(Self {
fasta_writer,
fasta_peptide_writer,
feature_table_writer,
output_json_writer,
output_ndjson_writer,
output_csv_writer,
Expand Down Expand Up @@ -113,7 +120,7 @@ impl<'a> NextcladeOrderedWriter<'a> {
} = record;

match outputs_or_err {
Ok((qry_seq_stripped, translations, nextclade_outputs)) => {
Ok((qry_seq_stripped, translations, gene_ranges_qry, nextclade_outputs)) => {
let NextcladeOutputs {
warnings,
insertions,
Expand All @@ -132,6 +139,13 @@ impl<'a> NextcladeOrderedWriter<'a> {
}
}

if let Some(feature_table_writer) = &mut self.feature_table_writer {
feature_table_writer.write(&GenbankFeatureTableEntry {
seq_name: seq_name.clone(),
gene_ranges_qry: gene_ranges_qry.clone(),
})?;
}

if let Some(insertions_csv_writer) = &mut self.insertions_csv_writer {
insertions_csv_writer.write(&seq_name, insertions, &translations)?;
}
Expand Down
4 changes: 2 additions & 2 deletions packages_rs/nextclade-web/src/wasm/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ impl NextcladeParams {
pub struct AnalysisInput {
#[wasm_bindgen(getter_with_clone)]
pub qry_index: usize,

#[wasm_bindgen(getter_with_clone)]
pub qry_seq_name: String,

Expand Down Expand Up @@ -243,7 +243,7 @@ impl Nextclade {
&self.gap_open_close_aa,
&self.aln_params,
) {
Ok((qry_seq_aligned_stripped, translations, nextclade_outputs)) => {
Ok((qry_seq_aligned_stripped, translations, gene_ranges_qry, nextclade_outputs)) => {
let nextclade_outputs_str =
json_stringify(&nextclade_outputs).wrap_err("When serializing output results of Nextclade")?;

Expand Down
3 changes: 2 additions & 1 deletion packages_rs/nextclade/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ env_logger = "0.9.0"
eyre = "0.6.8"
flate2 = "1.0.24"
getrandom = "0.2.6"
indexmap = { version = "1.8.1", features = ["serde"] }
indexmap = { version = "1.8.1", features = ["serde-1", "rayon"] }
itertools = "0.10.3"
lazy_static = "1.4.0"
log = "0.4.16"
multimap = "0.8.3"
num = "0.4.0"
num-traits = "0.2.14"
num_cpus = "1.13.1"
Expand Down
18 changes: 18 additions & 0 deletions packages_rs/nextclade/src/gene/gene.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
use crate::utils::range::Range;
use multimap::MultiMap;
use serde::{Deserialize, Serialize};
use std::str::FromStr;
use strum_macros::{EnumString, Display as EnumDisplay};

/// Kind tag carried by the `kind` field of `Gene`.
///
/// Derives both strum's `EnumString`/`Display` and serde's `Serialize`/`Deserialize`. The per-variant attributes
/// below keep the textual spelling the same in both representations.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq, EnumString, EnumDisplay)]
pub enum FeatureKind {
/// Spelled `gene` by both strum and serde.
#[strum(serialize = "gene")]
#[serde(rename = "gene")]
Gene,

/// Carries no rename attributes, so strum and serde both use the variant identifier `CDS` verbatim.
CDS,

/// Spelled `unknown` by both strum and serde.
// NOTE(review): presumably the fallback for unrecognized feature types — confirm against the gene map parser.
#[strum(serialize = "unknown")]
#[serde(rename = "unknown")]
Unknown,
}

#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub enum GeneStrand {
Expand All @@ -24,11 +40,13 @@ impl From<bio_types::strand::Strand> for GeneStrand {
/// One entry of the gene map.
///
/// (De)serialized with camelCase field names, e.g. `gene_name` becomes `geneName`.
// NOTE(review): `kind` and `attributes` carry no `#[serde(default)]`, so serialized data lacking these keys will
// presumably fail to deserialize — confirm whether older inputs must still be accepted.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Gene {
/// Kind tag of this entry (see `FeatureKind`).
pub kind: FeatureKind,
/// Name of the gene.
pub gene_name: String,
// NOTE(review): `start`/`end` are presumably nucleotide coordinates; the index base and whether `end` is
// inclusive are not visible here — TODO confirm.
pub start: usize,
pub end: usize,
/// Strand of the entry (see `GeneStrand`).
pub strand: GeneStrand,
// NOTE(review): presumably the reading frame from the annotation source — confirm the value range.
pub frame: i32,
/// Multi-valued string-to-string map: one key may hold several values.
// NOTE(review): presumably the free-form attributes from the annotation file — confirm.
pub attributes: MultiMap<String, String>,
}

impl Gene {
Expand Down
Loading