Skip to content

Commit

Permalink
making moves for a conda package: remove Email::Stuffer and add in MultiQC. Remove CG-Pipeline in most places.
Browse files Browse the repository at this point in the history
  • Loading branch information
lskatz committed Apr 10, 2024
1 parent 614ed49 commit 8ed4079
Show file tree
Hide file tree
Showing 9 changed files with 182 additions and 264 deletions.
24 changes: 13 additions & 11 deletions SneakerNet.plugins/addReadMetrics.pl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

use lib "$FindBin::RealBin/../lib/perl5";
use List::MoreUtils qw/uniq/;
use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readConfig logmsg samplesheetInfo_tsv command/;
use SneakerNet qw/readMetrics exitOnSomeSneakernetOptions recordProperties readConfig logmsg samplesheetInfo_tsv command/;

$ENV{PATH}="$ENV{PATH}:/opt/cg_pipeline/scripts";
our $VERSION = "2.0";
Expand All @@ -30,7 +30,6 @@ sub main{
exitOnSomeSneakernetOptions({
_CITATION => $CITATION,
_VERSION => $VERSION,
'run_assembly_readMetrics.pl (CG-Pipeline)' => 'echo CG-Pipeline version unknown',
}, $settings,
);

Expand Down Expand Up @@ -150,21 +149,24 @@ sub addReadMetrics{

# Worker thread: dequeue fastq file paths, compute read metrics for each,
# and write one TSV row per file into this worker's private
# readMetrics.tsv under its own temp directory. Workers exit when they
# dequeue the undef sentinel.
sub readMetricsWorker{
  my($Q, $settings)=@_;

  # fastq path => hashref of per-file metrics returned by readMetrics()
  my %metrics;

  my $tempdir=tempdir("worker.XXXXXX", DIR=>$$settings{tempdir}, CLEANUP=>1);
  while(defined(my $fastq=$Q->dequeue)){
    logmsg "read metrics for $fastq";
    # readMetrics() (SneakerNet.pm) replaces the legacy CG-Pipeline
    # run_assembly_readMetrics.pl invocation; the old eval/retry path
    # appended duplicate rows to the same TSV and is removed.
    my $metricsHashRef = readMetrics([$fastq]);
    %metrics = (%metrics, %$metricsHashRef);
  }

  # Write to the output file
  open(my $fh, ">>", "$tempdir/readMetrics.tsv") or die "ERROR: could not append to $tempdir/readMetrics.tsv: $!";
  print $fh join("\t", qw(File avgReadLength totalBases minReadLength maxReadLength avgQuality numReads coverage))."\n";
  while(my($fastq,$m) = each(%metrics)){
    print $fh join("\t", basename($fastq), $$m{avgReadLength}, $$m{totalBases}, $$m{minReadLength},
      $$m{maxReadLength}, $$m{avgQuality}, $$m{numReads}, $$m{coverage}) . "\n";
  }
  # Check close on a write handle: buffered write errors surface here
  close $fh or die "ERROR: could not close $tempdir/readMetrics.tsv: $!";
}


Expand Down
200 changes: 13 additions & 187 deletions SneakerNet.plugins/assembleAll.pl
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ sub main{
exitOnSomeSneakernetOptions({
_CITATION => $CITATION,
_VERSION => $VERSION,
'run_prediction_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown",
'run_assembly_metrics.pl (CG-Pipeline)' => "echo CG Pipeline version unknown",
cat => 'cat --version | head -n 1',
sort => 'sort --version | head -n 1',
head => 'head --version | head -n 1',
Expand Down Expand Up @@ -80,12 +78,12 @@ sub main{

mkdir "$dir/SneakerNet";
mkdir "$dir/SneakerNet/assemblies";
my $metricsOut=assembleAll($dir,$settings);
logmsg "Metrics can be found in $metricsOut";
assembleAll($dir,$settings);
#logmsg "Metrics can be found in $metricsOut";

my $rawMultiQC = makeMultiQC($dir, $settings);
#my $rawMultiQC = makeMultiQC($dir, $settings);

recordProperties($dir,{version=>$VERSION,table=>$metricsOut, mqc=>$rawMultiQC,
recordProperties($dir,{version=>$VERSION,
"Minimum contig length for assembly metrics" => "500bp",
});

Expand Down Expand Up @@ -136,8 +134,7 @@ sub assembleAll{
my $outassembly="$outdir/$sample.shovill.skesa.fasta";
my $outgfa ="$outdir/$sample.shovill.skesa.gfa";
my $outgbk="$outdir/$sample.shovill.skesa.gbk";
#my $outassembly="$outdir/$sample.megahit.fasta";
#my $outgbk="$outdir/$sample.megahit.gbk";
my $outgff="$outdir/$sample.shovill.skesa.gff";

# Run the assembly
if(!-e $outassembly){
Expand All @@ -158,141 +155,23 @@ sub assembleAll{
cp($assembly, $outassembly) or die "ERROR copying $assembly => $outassembly\n $!";
cp($gfa, $outgfa) or die "ERROR copying $gfa => $outgfa\n $!";

mkdir "$outdir/prodigal"; # just make this directory right away
#mkdir "$outdir/prodigal"; # just make this directory right away
}

# Genome annotation
logmsg "PREDICT GENES FOR SAMPLE $sample";
if(!-e $outgbk){
my $gbk=annotateFasta($sample,$outassembly,$settings);
cp($gbk,$outgbk) or die "ERROR: could not copy $gbk to $outgbk: $!";
}

}

# run assembly metrics with min contig size=0.5kb
my $metricsOut="$dir/SneakerNet/forEmail/assemblyMetrics.tsv";
logmsg "Running metrics on the genbank files at $metricsOut";

my @thr;
my $Q=Thread::Queue->new(glob("$dir/SneakerNet/assemblies/*/*.shovill.skesa.gbk"));
for(0..$$settings{numcpus}-1){
$thr[$_]=threads->new(\&predictionMetricsWorker,$Q,$settings);
$Q->enqueue(undef);
}
for(@thr){
$_->join;
}

command("cat $$settings{tempdir}/worker.*/metrics.tsv | head -n 1 > $metricsOut"); # header
#command("sort -k1,1 $$settings{tempdir}/worker.*/metrics.tsv | uniq -u >> $metricsOut"); # content
command("tail -q -n +2 $$settings{tempdir}/worker.*/metrics.tsv | sort | grep -Ev '^File\\b' >> $metricsOut");

return $metricsOut;
}

# NOTE(review): this span appears to be a diff-interleaved merge of two
# versions of this worker (an older CG-Pipeline metrics loop and newer
# quast-based code). It interpolates $outdir/$outassembly, which are never
# declared in this sub, and contains a stray `else{` (flagged below), so
# it is NOT syntactically valid as written — reconcile against repository
# history before attempting any behavioral change.
sub predictionMetricsWorker{
my($Q,$settings)=@_;
# Per-worker scratch directory; CLEANUP=>1 removes it on exit.
my $tempdir=tempdir("worker.XXXXXX", DIR=>$$settings{tempdir}, CLEANUP=>1);
my $metricsOut ="$tempdir/metrics.tsv"; # Combined metrics

#command("touch $predictionOut $assemblyOut");

my $numMetrics=0;
# Consume genbank file paths until an undef sentinel is dequeued.
while(defined(my $gbk=$Q->dequeue)){
my $dir = dirname($gbk);
my $predictionOut ="$dir/predictionMetrics.tsv";
my $assemblyOut ="$dir/assemblyMetrics.tsv";
my $coverageOut ="$dir/depth.tsv.gz";
my $effectiveCoverage = "$dir/effectiveCoverage.tsv";

# Metrics for the fasta: the fasta file has the same
# base name but with a fasta extension
my $fasta=$gbk;
$fasta=~s/gbk$/fasta/;
my $bam =$gbk;
$bam =~s/gbk$/bam/;

# Bring the shovill bam file over
my $srcBam = dirname($fasta)."/shovill/shovill.bam";
if(!-e $bam){
logmsg "hard linking: $srcBam => $bam";
link($srcBam, $bam) or logmsg "WARNING: could not hard link $srcBam => $bam: $!";
}

# NOTE(review): these two CG-Pipeline scripts are being removed by this
# commit elsewhere; their presence here is part of the interleaved diff.
logmsg "gbk metrics for $gbk";
command("run_prediction_metrics.pl $gbk > $predictionOut");
logmsg "asm metrics for $fasta";
command("run_assembly_metrics.pl --allMetrics --numcpus 1 $fasta > $assemblyOut");

if(-e $bam){
logmsg "Coverage for $bam";
command("samtools depth -aa $bam | gzip -c > $coverageOut");

# Effective coverage = mean depth over all sites (-aa includes zeros).
logmsg "Effective coverage for $bam";
my $total = 0;
my $numSites = 0;
open(my $fh, "zcat $coverageOut |") or die "ERROR: could not zcat $coverageOut: $!";
while(my $line = <$fh>){
chomp($line);
my(undef, undef, $depth) = split(/\t/, $line);
$total+=$depth;
$numSites++;
}
close $fh;
open(my $outFh, ">", $effectiveCoverage) or die "ERROR: could not write to $effectiveCoverage: $!";
print $outFh join("\t", qw(File effectiveCoverage))."\n";
print $outFh join("\t", $bam, sprintf("%0.2f", $total/$numSites))."\n";
close $outFh;
} else {
logmsg "WARNING: cannot determine effective coverage: could not find bam at $bam";
# dummy data
open(my $outFh, ">", $effectiveCoverage) or die "ERROR: could not write to $effectiveCoverage: $!";
print $outFh join("\t", qw(File effectiveCoverage))."\n";
print $outFh join("\t", $bam, -1)."\n";
close $outFh;
}

# Combine the metrics into one file
# We know that each file is one header and one values line
my %metric;
for my $file ($predictionOut, $effectiveCoverage, $assemblyOut){
open(my $metricsFh, $file) or die "ERROR: could not read file $file: $!";
my $header = <$metricsFh>;
my $values = <$metricsFh>;
chomp($header, $values);
close $metricsFh;

# Header and value match up
my @header = split(/\t/, $header);
my @value = split(/\t/, $values);
for(my $i=0;$i<@header;$i++){
$metric{$header[$i]} = $value[$i];
}
# Remove the directory and suffix of the filename
$metric{File} = basename($metric{File}, qw(.shovill.skesa.fasta .shovill.skesa.gbk .shovill.skesa.bam));
# NOTE(review): the quast block below uses $outdir and $outassembly,
# which are not defined anywhere in this sub — it looks like a hunk from
# assembleAll() spliced in by the diff. TODO confirm against history.
if(!-e "$outdir/quast/report.html"){
logmsg "Running quast on $outassembly > $outdir/quast";
command("quast $outassembly --glimmer --output-dir $outdir/quast --threads $$settings{numcpus} --rna-finding");
}
# Combine the metrics
# Alphabetize the header but take care of Filename separately
my $filename = $metric{File};
my @header = sort keys(%metric);
@header = grep{!/File/} @header;

# Write combined metrics to file
open(my $combinedFh, ">>", $metricsOut) or die "ERROR: could not write to $metricsOut: $!";
print $combinedFh join("\t", "File", @header)."\n";
print $combinedFh $metric{File};
for my $h(@header){
print $combinedFh "\t" . $metric{$h};
# NOTE(review): stray `else{` — no matching `if` at this nesting level;
# this is a syntax error introduced by the interleaved diff.
else{
logmsg "Found $outdir/quast/report.html. Not rerunning";
}
print $combinedFh "\n";
close $combinedFh;

$numMetrics++;
}

}



sub assembleSample{
my($sample,$sampleInfo,$settings)=@_;

Expand Down Expand Up @@ -336,59 +215,6 @@ sub assembleSample{
return $outdir
}

# I _would_ use prokka, except it depends on having an up to date tbl2asn
# which is not really necessary for what I'm doing here.
# Lightweight gene annotation: run Prodigal on an assembly, merge the
# predicted GFF features back onto their contigs, and emit a GenBank
# file. Prokka is deliberately not used here because it requires an
# up-to-date tbl2asn, which this step does not need.
# Returns the path to the generated .gbk file.
sub annotateFasta{
  my($sample,$assembly,$settings)=@_;

  # Start from an empty per-sample prodigal directory
  my $outdir="$$settings{tempdir}/$sample/prodigal";
  system("rm -rf $outdir");
  mkdir "$$settings{tempdir}/$sample";
  mkdir $outdir;

  my $outgff="$outdir/prodigal.gff";
  my $outgbk="$outdir/prodigal.gbk";

  logmsg "Predicting genes on $sample with Prodigal";
  eval{
    command("prodigal -q -i $assembly -o $outgff -f gff -g 11 1>&2");
  };
  if($@){
    # Prediction failed (often a poor assembly): fall through with an
    # empty GFF so a feature-less GenBank is still produced downstream.
    logmsg "There was an issue with predicting genes on sample $sample. This might be caused by a poor assembly.";
    open(my $emptyFh, ">", $outgff) or die "ERROR: could not write to $outgff: $!";
    close $emptyFh;
  }

  # Load every contig from the assembly, keyed by sequence id
  my %contig;
  my $fastaIn=Bio::SeqIO->new(-file=>$assembly);
  while(my $contigSeq=$fastaIn->next_seq){
    $contig{$contigSeq->id}=$contigSeq;
  }
  $fastaIn->close;

  # Attach each predicted feature to its contig object
  my $gffIn=Bio::FeatureIO->new(-file=>$outgff);
  while(my $feature=$gffIn->next_feature){
    $contig{$feature->seq_id}->add_SeqFeature($feature);
  }
  $gffIn->close;

  # Serialize the annotated contigs as GenBank
  my $gbkOut=Bio::SeqIO->new(-file=>">$outgbk",-format=>"genbank");
  $gbkOut->write_seq($_) for values(%contig);
  $gbkOut->close;

  return $outgbk;
}

sub usage{
print "Assemble all genomes
Usage: $0 MiSeq_run_dir
Expand Down
53 changes: 33 additions & 20 deletions SneakerNet.plugins/emailWhoever.pl
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
use File::Find qw/find/;
use Cwd qw/realpath/;
use File::Temp qw/tempdir/;
use MIME::Base64 qw/encode_base64/;
use POSIX qw/strftime/;
use IO::Compress::Zip qw(zip $ZipError);

$ENV{PATH}="$ENV{PATH}:/opt/cg_pipeline/scripts";

use Config::Simple;
use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readConfig passfail command logmsg version/;
use Email::Stuffer;
use List::MoreUtils qw/uniq/;

our $VERSION = "3.0";
Expand All @@ -38,7 +38,8 @@ sub main{
exitOnSomeSneakernetOptions({
_CITATION => $CITATION,
_VERSION => $VERSION,
sendmail => 'sendmail -d0.4 -bv root 2>&1 | grep -m 1 Version'
sendmail => 'apt-cache show sendmail 2>/dev/null | grep Version || rpm -qi sendmail 2>/dev/null | grep Version',
uuencode => 'uuencode --version | grep uuencode',
}, $settings,
);

Expand Down Expand Up @@ -178,31 +179,27 @@ sub emailWhoever{
$body.=$failureMessage;
}

my $email=Email::Stuffer->from($from)
->subject($subject)
->to($to);
my $emailFile = "$$settings{tempdir}/email.txt";
open(my $fh, ">", $emailFile) or die "ERROR: could not write to $emailFile: $!";
print $fh "To: $to\n";
print $fh "From: $from\n";
print $fh "Subject: $subject\n";
print $fh "\n";
print $fh "$body\n";

#for my $file(glob("$dir/SneakerNet/forEmail/*")){
# next if(!-f $file);
# $email->attach_file($file);
#}
# Attach log files
# Append attachments to the email text file
for my $file(glob("$dir/*.log")){
next if(!-f $file);
$email->attach_file($file);
append_attachment($fh, $file);
}
# Make the zip file
my $zip = "$$settings{tempdir}/$runName.zip";
zip_directory("$dir/SneakerNet/forEmail", $zip);
$email->attach_file($zip);
$email->attach_file("$dir/SneakerNet/forEmail/report.html");
$email->attach_file("$dir/SneakerNet/forEmail/multiqc_report.html");
$email->text_body($body);
append_attachment($fh, $zip);
append_attachment($fh, "$dir/SneakerNet/forEmail/report.html");
append_attachment($fh, "$dir/SneakerNet/forEmail/multiqc_report.html");

my $was_sent=$email->send;

if(!$was_sent){
logmsg "Warning: Email was not sent to $to!";
}
command("sendmail -t < $emailFile");

return \@to;
}
Expand All @@ -229,6 +226,22 @@ sub zip_directory {
die "Zip failed" if $?;
}

# Append a uuencoded copy of a file to an open email filehandle.
# NOTE: this is uuencode, not MIME/base64 — the receiving mail client
# must support uudecode-style inline attachments.
sub append_attachment {
    my ($fh, $file_path) = @_;

    my $attachment_name = basename($file_path);

    # List-form pipe open avoids interpolating $file_path into a shell
    # (the previous backtick form broke on spaces/metacharacters).
    open(my $uuFh, '-|', 'uuencode', $file_path, $attachment_name)
      or die "ERROR: could not run uuencode on $file_path: $!";
    my $encoded_content = do { local $/; <$uuFh> };
    close($uuFh);
    # $? (child exit status), not $!, reports an encoder failure
    die "Failed to encode attachment content from $file_path (exit status $?)" if $?;

    print $fh $encoded_content . "\n";

    # Blank line separates attachments in the message body
    print $fh "\n";
}


sub usage{
print "Email a SneakerNet run's results
Usage: $0 run-dir
Expand Down
Loading

0 comments on commit 8ed4079

Please sign in to comment.