From 8ed40793558e9a72a140239a78c14b8f131ea548 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 10 Apr 2024 14:11:03 -0400 Subject: [PATCH] making moves for a conda package: remove Email::Stuffer and add in MultiQC. Remove CG-Pipeline in most places. --- SneakerNet.plugins/addReadMetrics.pl | 24 ++- SneakerNet.plugins/assembleAll.pl | 200 ++---------------- SneakerNet.plugins/emailWhoever.pl | 53 +++-- SneakerNet.plugins/sn_crypto_assembleAll.pl | 4 +- SneakerNet.plugins/sn_immediateStatus.pl | 35 ++- .../sn_iontorrent_assembleAll.pl | 6 +- SneakerNet.plugins/sn_report.pl | 5 +- lib/perl5/SneakerNet.pm | 88 +++++++- scripts/SneakerNetPlugins.pl | 31 +-- 9 files changed, 182 insertions(+), 264 deletions(-) diff --git a/SneakerNet.plugins/addReadMetrics.pl b/SneakerNet.plugins/addReadMetrics.pl index 355966df..f583f68b 100755 --- a/SneakerNet.plugins/addReadMetrics.pl +++ b/SneakerNet.plugins/addReadMetrics.pl @@ -15,7 +15,7 @@ use lib "$FindBin::RealBin/../lib/perl5"; use List::MoreUtils qw/uniq/; -use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readConfig logmsg samplesheetInfo_tsv command/; +use SneakerNet qw/readMetrics exitOnSomeSneakernetOptions recordProperties readConfig logmsg samplesheetInfo_tsv command/; $ENV{PATH}="$ENV{PATH}:/opt/cg_pipeline/scripts"; our $VERSION = "2.0"; @@ -30,7 +30,6 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - 'run_assembly_readMetrics.pl (CG-Pipeline)' => 'echo CG-Pipeline version unknown', }, $settings, ); @@ -150,21 +149,24 @@ sub addReadMetrics{ sub readMetricsWorker{ my($Q, $settings)=@_; + + my %metrics; my $tempdir=tempdir("worker.XXXXXX", DIR=>$$settings{tempdir}, CLEANUP=>1); while(defined(my $fastq=$Q->dequeue)){ logmsg "read metrics for $fastq"; - eval{ - command("run_assembly_readMetrics.pl --numcpus 1 --fast $fastq >> $tempdir/readMetrics.tsv"); - return 1; - }; - if($@){ - logmsg "There was an error running run_assembly_readMetrics.pl. This might be because of a divide-by-zero error. This can be solved by running the metrics without subsampling the reads which is slower.\n"; - logmsg "Rerunning without --fast."; - command("run_assembly_readMetrics.pl --numcpus 1 $fastq >> $tempdir/readMetrics.tsv"); - } + my $metricsHashRef = readMetrics([$fastq]); + %metrics = (%metrics, %$metricsHashRef); + } + # Write to the output file + open(my $fh, ">>", "$tempdir/readMetrics.tsv") or die "ERROR: could not append to $tempdir/readMetrics.tsv: $!"; + print $fh join("\t", qw(File avgReadLength totalBases minReadLength maxReadLength avgQuality numReads coverage))."\n"; + while(my($fastq,$m) = each(%metrics)){ + print $fh join("\t", basename($fastq), $$m{avgReadLength}, $$m{totalBases}, $$m{minReadLength}, + $$m{maxReadLength}, $$m{avgQuality}, $$m{numReads}, $$m{coverage}) . "\n"; } + close $fh; } diff --git a/SneakerNet.plugins/assembleAll.pl b/SneakerNet.plugins/assembleAll.pl index 013b6f4d..e3ea0e14 100755 --- a/SneakerNet.plugins/assembleAll.pl +++ b/SneakerNet.plugins/assembleAll.pl @@ -30,8 +30,6 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - 'run_prediction_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", - 'run_assembly_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", cat => 'cat --version | head -n 1', sort => 'sort --version | head -n 1', head => 'head --version | head -n 1', @@ -80,12 +78,12 @@ sub main{ mkdir "$dir/SneakerNet"; mkdir "$dir/SneakerNet/assemblies"; - my $metricsOut=assembleAll($dir,$settings); - logmsg "Metrics can be found in $metricsOut"; + assembleAll($dir,$settings); + #logmsg "Metrics can be found in $metricsOut"; - my $rawMultiQC = makeMultiQC($dir, $settings); + #my $rawMultiQC = makeMultiQC($dir, $settings); - recordProperties($dir,{version=>$VERSION,table=>$metricsOut, mqc=>$rawMultiQC, + recordProperties($dir,{version=>$VERSION, "Minimum contig length for assembly metrics" => "500bp", }); @@ -136,8 +134,7 @@ sub assembleAll{ my $outassembly="$outdir/$sample.shovill.skesa.fasta"; my $outgfa ="$outdir/$sample.shovill.skesa.gfa"; my $outgbk="$outdir/$sample.shovill.skesa.gbk"; - #my $outassembly="$outdir/$sample.megahit.fasta"; - #my $outgbk="$outdir/$sample.megahit.gbk"; + my $outgff="$outdir/$sample.shovill.skesa.gff"; # Run the assembly if(!-e $outassembly){ @@ -158,141 +155,23 @@ sub assembleAll{ cp($assembly, $outassembly) or die "ERROR copying $assembly => $outassembly\n $!"; cp($gfa, $outgfa) or die "ERROR copying $gfa => $outgfa\n $!"; - mkdir "$outdir/prodigal"; # just make this directory right away + #mkdir "$outdir/prodigal"; # just make this directory right away } - - # Genome annotation - logmsg "PREDICT GENES FOR SAMPLE $sample"; - if(!-e $outgbk){ - my $gbk=annotateFasta($sample,$outassembly,$settings); - cp($gbk,$outgbk) or die "ERROR: could not copy $gbk to $outgbk: $!"; - } - - } - - # run assembly metrics with min contig size=0.5kb - my $metricsOut="$dir/SneakerNet/forEmail/assemblyMetrics.tsv"; - logmsg "Running metrics on the genbank files at $metricsOut"; - - my @thr; - my $Q=Thread::Queue->new(glob("$dir/SneakerNet/assemblies/*/*.shovill.skesa.gbk")); - for(0..$$settings{numcpus}-1){ - $thr[$_]=threads->new(\&predictionMetricsWorker,$Q,$settings); - $Q->enqueue(undef); - } - for(@thr){ - $_->join; - } - - command("cat $$settings{tempdir}/worker.*/metrics.tsv | head -n 1 > $metricsOut"); # header - #command("sort -k1,1 $$settings{tempdir}/worker.*/metrics.tsv | uniq -u >> $metricsOut"); # content - command("tail -q -n +2 $$settings{tempdir}/worker.*/metrics.tsv | sort | grep -Ev '^File\\b' >> $metricsOut"); - - return $metricsOut; -} - -sub predictionMetricsWorker{ - my($Q,$settings)=@_; - my $tempdir=tempdir("worker.XXXXXX", DIR=>$$settings{tempdir}, CLEANUP=>1); - my $metricsOut ="$tempdir/metrics.tsv"; # Combined metrics - - #command("touch $predictionOut $assemblyOut"); - - my $numMetrics=0; - while(defined(my $gbk=$Q->dequeue)){ - my $dir = dirname($gbk); - my $predictionOut ="$dir/predictionMetrics.tsv"; - my $assemblyOut ="$dir/assemblyMetrics.tsv"; - my $coverageOut ="$dir/depth.tsv.gz"; - my $effectiveCoverage = "$dir/effectiveCoverage.tsv"; - - # Metrics for the fasta: the fasta file has the same - # base name but with a fasta extension - my $fasta=$gbk; - $fasta=~s/gbk$/fasta/; - my $bam =$gbk; - $bam =~s/gbk$/bam/; - # Bring the shovill bam file over - my $srcBam = dirname($fasta)."/shovill/shovill.bam"; - if(!-e $bam){ - logmsg "hard linking: $srcBam => $bam"; - link($srcBam, $bam) or logmsg "WARNING: could not hard link $srcBam => $bam: $!"; - } - - logmsg "gbk metrics for $gbk"; - command("run_prediction_metrics.pl $gbk > $predictionOut"); - logmsg "asm metrics for $fasta"; - command("run_assembly_metrics.pl --allMetrics --numcpus 1 $fasta > $assemblyOut"); - - if(-e $bam){ - logmsg "Coverage for $bam"; - command("samtools depth -aa $bam | gzip -c > $coverageOut"); - - logmsg "Effective coverage for $bam"; - my $total = 0; - my $numSites = 0; - open(my $fh, "zcat $coverageOut |") or die "ERROR: could not zcat $coverageOut: $!"; - while(my $line = <$fh>){ - chomp($line); - my(undef, undef, $depth) = split(/\t/, $line); - $total+=$depth; - $numSites++; - } - close $fh; - open(my $outFh, ">", $effectiveCoverage) or die "ERROR: could not write to $effectiveCoverage: $!"; - print $outFh join("\t", qw(File effectiveCoverage))."\n"; - print $outFh join("\t", $bam, sprintf("%0.2f", $total/$numSites))."\n"; - close $outFh; - } else { - logmsg "WARNING: cannot determine effective coverage: could not find bam at $bam"; - # dummy data - open(my $outFh, ">", $effectiveCoverage) or die "ERROR: could not write to $effectiveCoverage: $!"; - print $outFh join("\t", qw(File effectiveCoverage))."\n"; - print $outFh join("\t", $bam, -1)."\n"; - close $outFh; - } - - # Combine the metrics into one file - # We know that each file is one header and one values line - my %metric; - for my $file ($predictionOut, $effectiveCoverage, $assemblyOut){ - open(my $metricsFh, $file) or die "ERROR: could not read file $file: $!"; - my $header = <$metricsFh>; - my $values = <$metricsFh>; - chomp($header, $values); - close $metricsFh; - - # Header and value match up - my @header = split(/\t/, $header); - my @value = split(/\t/, $values); - for(my $i=0;$i<@header;$i++){ - $metric{$header[$i]} = $value[$i]; - } - # Remove the directory and suffix of the filename - $metric{File} = basename($metric{File}, qw(.shovill.skesa.fasta .shovill.skesa.gbk .shovill.skesa.bam)); + if(!-e "$outdir/quast/report.html"){ + logmsg "Running quast on $outassembly > $outdir/quast"; + command("quast $outassembly --glimmer --output-dir $outdir/quast --threads $$settings{numcpus} --rna-finding"); } - # Combine the metrics - # Alphabetize the header but take care of Filename separately - my $filename = $metric{File}; - my @header = sort keys(%metric); - @header = grep{!/File/} @header; - - # Write combined metrics to file - open(my $combinedFh, ">>", $metricsOut) or die "ERROR: could not write to $metricsOut: $!"; - print $combinedFh join("\t", "File", @header)."\n"; - print $combinedFh $metric{File}; - for my $h(@header){ - print $combinedFh "\t" . $metric{$h}; + else{ + logmsg "Found $outdir/quast/report.html. Not rerunning"; } - print $combinedFh "\n"; - close $combinedFh; - $numMetrics++; } + } + sub assembleSample{ my($sample,$sampleInfo,$settings)=@_; @@ -336,59 +215,6 @@ sub assembleSample{ return $outdir } -# I _would_ use prokka, except it depends on having an up to date tbl2asn -# which is not really necessary for what I'm doing here. -sub annotateFasta{ - my($sample,$assembly,$settings)=@_; - - # Ensure a clean slate - my $outdir="$$settings{tempdir}/$sample/prodigal"; - system("rm -rf $outdir"); - - mkdir "$$settings{tempdir}/$sample"; - mkdir $outdir; - - my $outgff="$outdir/prodigal.gff"; - my $outgbk="$outdir/prodigal.gbk"; - - logmsg "Predicting genes on $sample with Prodigal"; - eval{ - command("prodigal -q -i $assembly -o $outgff -f gff -g 11 1>&2"); - }; - # If there is an issue, push ahead with a zero byte file - if($@){ - logmsg "There was an issue with predicting genes on sample $sample. This might be caused by a poor assembly."; - open(my $fh, ">", $outgff) or die "ERROR: could not write to $outgff: $!"; - close $fh; - } - - # Read the assembly sequence - my %seqObj; - my $seqin=Bio::SeqIO->new(-file=>$assembly); - while(my $seq=$seqin->next_seq){ - $seqObj{$seq->id}=$seq; - } - $seqin->close; - - # Add seq features - my $gffin=Bio::FeatureIO->new(-file=>$outgff); - while(my $feat=$gffin->next_feature){ - # put the features onto the seqobj and write it to file - my $id=$feat->seq_id; - $seqObj{$id}->add_SeqFeature($feat); - } - $gffin->close; - - # Convert to gbk - my $gbkObj=Bio::SeqIO->new(-file=>">$outgbk",-format=>"genbank"); - for my $seq(values(%seqObj)){ - $gbkObj->write_seq($seq); - } - $gbkObj->close; - - return $outgbk; -} - sub usage{ print "Assemble all genomes Usage: $0 MiSeq_run_dir diff --git a/SneakerNet.plugins/emailWhoever.pl b/SneakerNet.plugins/emailWhoever.pl index 03112644..b60fc8fa 100755 --- a/SneakerNet.plugins/emailWhoever.pl +++ b/SneakerNet.plugins/emailWhoever.pl @@ -12,6 +12,7 @@ use File::Find qw/find/; use Cwd qw/realpath/; use File::Temp qw/tempdir/; +use MIME::Base64 qw/encode_base64/; use POSIX qw/strftime/; use IO::Compress::Zip qw(zip $ZipError); @@ -19,7 +20,6 @@ use Config::Simple; use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readConfig passfail command logmsg version/; -use Email::Stuffer; use List::MoreUtils qw/uniq/; our $VERSION = "3.0"; @@ -38,7 +38,8 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - sendmail => 'sendmail -d0.4 -bv root 2>&1 | grep -m 1 Version' + sendmail => 'apt-cache show sendmail 2>/dev/null | grep Version || rpm -qi sendmail 2>/dev/null | grep Version', + uuencode => 'uuencode --version | grep uuencode', }, $settings, ); @@ -178,31 +179,27 @@ sub emailWhoever{ $body.=$failureMessage; } - my $email=Email::Stuffer->from($from) - ->subject($subject) - ->to($to); + my $emailFile = "$$settings{tempdir}/email.txt"; + open(my $fh, ">", $emailFile) or die "ERROR: could not write to $emailFile: $!"; + print $fh "To: $to\n"; + print $fh "From: $from\n"; + print $fh "Subject: $subject\n"; + print $fh "\n"; + print $fh "$body\n"; - #for my $file(glob("$dir/SneakerNet/forEmail/*")){ - # next if(!-f $file); - # $email->attach_file($file); - #} - # Attach log files + # Append attachments to the email text file for my $file(glob("$dir/*.log")){ next if(!-f $file); - $email->attach_file($file); + append_attachment($fh, $file); } + # Make the zip file my $zip = "$$settings{tempdir}/$runName.zip"; zip_directory("$dir/SneakerNet/forEmail", $zip); - $email->attach_file($zip); - $email->attach_file("$dir/SneakerNet/forEmail/report.html"); - $email->attach_file("$dir/SneakerNet/forEmail/multiqc_report.html"); - $email->text_body($body); + append_attachment($fh, $zip); + append_attachment($fh, "$dir/SneakerNet/forEmail/report.html"); + append_attachment($fh, "$dir/SneakerNet/forEmail/multiqc_report.html"); - my $was_sent=$email->send; - - if(!$was_sent){ - logmsg "Warning: Email was not sent to $to!"; - } + command("sendmail -t < $emailFile"); return \@to; } @@ -229,6 +226,22 @@ sub zip_directory { die "Zip failed" if $?; } +# Add an attachment to an email file handle +sub append_attachment { + my ($fh, $file_path) = @_; + + # Encode the attachment content using base64 encoding + my $attachment_name = basename($file_path); + my $encoded_content = `uuencode $file_path $attachment_name`; + die "Failed to encode attachment content from $file_path: $!" if $?; + + print $fh $encoded_content . "\n"; + + # Print a newline to separate MIME parts + print $fh "\n"; +} + + sub usage{ print "Email a SneakerNet run's results Usage: $0 run-dir diff --git a/SneakerNet.plugins/sn_crypto_assembleAll.pl b/SneakerNet.plugins/sn_crypto_assembleAll.pl index ec1a3fdc..088af0d9 100755 --- a/SneakerNet.plugins/sn_crypto_assembleAll.pl +++ b/SneakerNet.plugins/sn_crypto_assembleAll.pl @@ -30,7 +30,7 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - 'run_assembly_filterContigs.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", + 'seqtk' => 'seqtk 2>&1 | grep Version', 'run_prediction_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", 'run_assembly_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", cat => 'cat --version | head -n 1', @@ -111,7 +111,7 @@ sub assembleAll{ # Save the assembly mkdir $outdir; mkdir "$outdir/prodigal"; # just make this directory right away - command("run_assembly_filterContigs.pl -l 500 $assembly > $outassembly"); + command("seqtk seq -L 500 $assembly > $outassembly"); } # Genome annotation diff --git a/SneakerNet.plugins/sn_immediateStatus.pl b/SneakerNet.plugins/sn_immediateStatus.pl index 2108a6e5..f491deaf 100755 --- a/SneakerNet.plugins/sn_immediateStatus.pl +++ b/SneakerNet.plugins/sn_immediateStatus.pl @@ -14,7 +14,6 @@ use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readConfig samplesheetInfo_tsv command logmsg fullPathToExec/; use Text::Fuzzy; -use Email::Stuffer; our $VERSION = "1.7"; our $CITATION= "Immediate status report by Lee Katz"; @@ -28,7 +27,8 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - sendmail => 'sendmail -d0.4 -bv root 2>&1 | grep -m 1 Version', + sendmail => 'apt-cache show sendmail 2>/dev/null | grep Version || rpm -qi sendmail 2>/dev/null | grep Version', + uuencode => 'uuencode --version | grep uuencode', }, $settings, ); @@ -114,14 +114,15 @@ sub main{ my $subject="Initial SneakerNet status for ".basename(File::Spec->rel2abs($dir)); my $body = "If you see errors below, please contact the bioinformatics team with your run number and when you deposited the run. Include this file in your message.\nRun can be found at ".File::Spec->abs2rel($dir)."\n"; $body.= "\nDocumentation can be found at https://github.com/lskatz/SneakerNet/blob/master/docs/plugins/sn_immediateStatus.pl.md\n"; - my $email=Email::Stuffer->from($from) - ->subject($subject) - ->to($to) - ->text_body($body) - ->attach_file($outfile); - if(!$email->send){ - die "ERROR: email was not sent to $to!"; - } + + my $emailFile = "$$settings{tempdir}/email.txt"; + open(my $fh, ">", $emailFile) or die "ERROR: could not write to $emailFile: $!"; + print $fh "To: $to\n"; + print $fh "From: $from\n"; + print $fh "Subject: $subject\n"; + print $fh "\n"; + print $fh "$body\n\n"; + append_attachment($fh, $outfile); recordProperties($dir,{ version=>$VERSION, reportTo=>$to, @@ -183,6 +184,20 @@ sub doubleCheckRun{ return \%errHash; } +# Add an attachment to an email file handle +sub append_attachment { + my ($fh, $file_path) = @_; + + # Encode the attachment content using base64 encoding + my $attachment_name = basename($file_path); + my $encoded_content = `uuencode $file_path $attachment_name`; + die "Failed to encode attachment content from $file_path: $!" if $?; + + print $fh $encoded_content . "\n"; + + # Print a newline to separate MIME parts + print $fh "\n"; +} sub usage{ print "Double check a run and its completeness. Email a report. diff --git a/SneakerNet.plugins/sn_iontorrent_assembleAll.pl b/SneakerNet.plugins/sn_iontorrent_assembleAll.pl index 8b9a98da..23601403 100755 --- a/SneakerNet.plugins/sn_iontorrent_assembleAll.pl +++ b/SneakerNet.plugins/sn_iontorrent_assembleAll.pl @@ -30,7 +30,7 @@ sub main{ exitOnSomeSneakernetOptions({ _CITATION => $CITATION, _VERSION => $VERSION, - 'run_assembly_filterContigs.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", + 'seqtk' => 'seqtk 2>&1 | grep Version', 'run_prediction_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown", 'spades.py (SPAdes)' => 'spades.py --version 2>&1', 'prodigal' => "prodigal -v 2>&1 | grep -i '^Prodigal V'", @@ -51,7 +51,7 @@ sub main{ mkdir "$dir/SneakerNet/forEmail"; # Check for required executables - for (qw(spades.py prodigal run_assembly_filterContigs.pl run_prediction_metrics.pl)){ + for (qw(spades.py prodigal run_prediction_metrics.pl)){ fullPathToExec($_); } @@ -89,7 +89,7 @@ sub assembleAll{ # Save the assembly mkdir $outdir; mkdir "$outdir/prodigal"; # just make this directory right away - command("run_assembly_filterContigs.pl -l 500 $assembly > $outassembly"); + command("seqtk seq -L 500 $assembly > $outassembly"); } # Genome annotation diff --git a/SneakerNet.plugins/sn_report.pl b/SneakerNet.plugins/sn_report.pl index 071d99ea..30be6655 100755 --- a/SneakerNet.plugins/sn_report.pl +++ b/SneakerNet.plugins/sn_report.pl @@ -7,6 +7,7 @@ use Data::Dumper; use File::Basename qw/fileparse basename dirname/; use File::Temp qw/tempdir/; +use File::Copy qw/cp/; use File::Spec; use Cwd qw/realpath/; use POSIX qw/strftime/; @@ -147,8 +148,8 @@ sub main{ logmsg "Report can be found in $outfile"; command("multiqc --force $dir/SneakerNet/MultiQC-build --outdir $dir/SneakerNet/MultiQC.out"); - link("$dir/SneakerNet/MultiQC.out/multiqc_report.html", "$dir/SneakerNet/forEmail/multiqc_report.html") - or die "ERROR: could not hard link multiqc_report.html to the forEmail folder: $!"; + cp("$dir/SneakerNet/MultiQC.out/multiqc_report.html", "$dir/SneakerNet/forEmail/multiqc_report.html") + or die "ERROR: could not cp multiqc_report.html to the forEmail folder: $!"; return 0; } diff --git a/lib/perl5/SneakerNet.pm b/lib/perl5/SneakerNet.pm index 421b5894..f20944cf 100644 --- a/lib/perl5/SneakerNet.pm +++ b/lib/perl5/SneakerNet.pm @@ -4,11 +4,14 @@ use warnings; use Exporter qw(import); use File::Basename qw/fileparse basename dirname/; use File::Copy qw/mv/; +use File::Temp qw/tempdir/; #use File::Spec (); use Config::Simple; use Data::Dumper; use Carp qw/croak confess carp/; use Cwd qw/realpath/; +use List::Util qw/sum min max/; +use version 0.77; use FindBin qw/$Bin $Script $RealBin $RealScript/; @@ -17,6 +20,7 @@ our @EXPORT_OK = qw( readTsv readKrakenDir command logmsg fullPathToExec version recordProperties readProperties exitOnSomeSneakernetOptions + readMetrics @rankOrder %rankOrder $VERSION ); @@ -32,7 +36,7 @@ TODO =cut -our $VERSION = '0.25.0'; +our $VERSION = version->declare('0.25.0'); our %rankName = (S=>'species', G=>'genus', F=>'family', O=>'order', C=>'class', P=>'phylum', K=>'kingdom', D=>'domain', U=>'unclassified'); our @rankOrder= qw(S G F O C P K D U); our %rankOrder= (S=>0, G=>1, F=>2, O=>3, C=>4, P=>5, K=>6, D=>7, U=>8); @@ -926,6 +930,88 @@ sub readProperties{ return \%prop; } +=head3 readMetrics(\@fastq, $expectedGenomeSize, $settings) + +Read metrics. +Replacing CG-Pipeline's run_assembly_readMetrics.pl + +Arguments: + + \@fastq The list of fastq files + $expectedGenomeSize in nucleotides + $settings hash + +Returns + + $properties: hash ref with keys. E.g., + File + avgReadLength + totalBases + minReadLength + maxReadLength + avgQuality + numReads + coverage + +=cut + +sub readMetrics{ + my($fastqs, $expectedGenomeSize, $settings) = @_; + my %metrics; + + my @fastq = sort{$a cmp $b} @$fastqs; + + my $tempdir = tempdir(basename($0).".readMetrics.XXXXXX",TMPDIR=>1,CLEANUP=>1); + + for(my $i=0; $i<@fastq; $i++){ + my $seqtkComp = "$tempdir/".basename($fastq[$i]).".seqtkcomp"; + my $seqtkFqchk= "$tempdir/".basename($fastq[$i]).".seqtkfqchk"; + command("seqtk comp $fastq[$i] > $seqtkComp"); + command("seqtk fqchk $fastq[$i]> $seqtkFqchk"); + + my(@lengths,@A,@C,@G,@T); + open(my $fh, $seqtkComp) or die "ERROR: could not read seqtk comp file $seqtkComp: $!"; + while(<$fh>){ + # chomp; # <- not including the last field and so no need to remove last newline char + my($readName, $length, $A, $C, $G, $T) = split /\t/; + push(@lengths, $length); + push(@A, $A); + push(@C, $C); + push(@G, $G); + push(@T, $T); + } + close $fh; + + # parse Fqchk to get average qual + my $avgQual = -1; + open(my $fh2, $seqtkFqchk) or die "ERROR: could not read seqtk fqchk file $seqtkFqchk: $!"; + while(<$fh2>){ + next if(!/^ALL/); + chomp; + my(undef, $bases, $A,$C,$G,$T,$N, $avgQ, $errQ, $low, $high) = split /\t/; + $avgQual = $avgQ; + last; + } + close $fh2; + + my $totalBases = sum(@lengths); + my $numReads = scalar(@lengths); + my $coverage = -1; + if($expectedGenomeSize){ + $coverage = $totalBases / $expectedGenomeSize; + } + $metrics{$fastq[$i]} = { + avgReadLength => ($totalBases/$numReads), + totalBases => $totalBases, + minReadLength => min(@lengths), + maxReadLength => max(@lengths), + avgQuality => $avgQual, + numReads => $numReads, + coverage => $coverage, + }; + } + return \%metrics; +} 1; diff --git a/scripts/SneakerNetPlugins.pl b/scripts/SneakerNetPlugins.pl index 841c9082..9959cffd 100755 --- a/scripts/SneakerNetPlugins.pl +++ b/scripts/SneakerNetPlugins.pl @@ -13,7 +13,6 @@ use lib "$RealBin/../lib/perl5"; use SneakerNet qw/readConfig command logmsg/; use Config::Simple; -use Email::Stuffer; $ENV{PATH}="$ENV{PATH}:/opt/cg_pipeline/scripts"; @@ -89,13 +88,12 @@ sub main{ # Run all plugins chdir($dir) or die "ERROR: could not change to directory $dir: $!"; + logmsg "Changing directory to $dir"; for my $e(@$exe){ my $command="$RealBin/../SneakerNet.plugins/$e . --numcpus $$settings{numcpus}"; $command.=" --force" if($$settings{force}); $command.=" --tempdir $$settings{tempdir}" if($$settings{tempdir}); - #print "$command\n\n"; next; - #command($command); next; if($$settings{'dry-run'}){ logmsg $command; @@ -110,33 +108,10 @@ sub main{ logmsg " ... however, --keep-going was specified and I will ignore that."; next; } - - my $from=$$settings{from} || die "ERROR: need to set 'from' in the settings.conf file!"; - my $subject="Run failed for ".basename(realpath($dir)); - my @to; - if(ref($$settings{'default.emails'}) eq "ARRAY"){ - push(@to, @{$$settings{'default.emails'}}); - } else { - push(@to, $$settings{'default.emails'}); - } - my $to = join(",",@to); - my $body = "Failed for ".realpath($dir)."\n\n"; - $body .= "Plugin that failed was $e: $@"; - my $email=Email::Stuffer->from($from) - ->subject($subject) - ->to($to) - ->text_body($body); - if($email->send){ - logmsg "Email sent to $to"; - } else { - logmsg "ERROR email failed to send to $to"; - } - return 1; } - - } - chdir($cwd); # go back to original directory + chdir($cwd) or die "ERROR: could not change back to directory $cwd: $!"; # go back to original directory + logmsg "Changing directories from $dir, back to $cwd"; } return 0;