Skip to content

Commit

Permalink
introducing more yaml; learning how to parse quast and fastqc results
Browse files Browse the repository at this point in the history
  • Loading branch information
lskatz committed Apr 20, 2024
1 parent 1e7a568 commit bb4a8e0
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 12 deletions.
1 change: 1 addition & 0 deletions Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ WriteMakefile(
'Data::Dumper' => 0,
'List::Util' => 0,
'Exporter' => 0,
'TAP::Parser::YAMLish::Writer' => '3.44',
# Threads modules
'threads' => 0,
'threads::shared'=> 0,
Expand Down
3 changes: 3 additions & 0 deletions SneakerNet.plugins/addReadMetrics.pl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ sub makeMultiQC{
print $outFh "#section_name: \"Read metrics\"\n";
print $outFh "#description: \"$plugin v$VERSION $docLink $pluginLink\"\n";
print $outFh "#anchor: '$anchor'\n";
print $outFh "#plot_type: 'table'\n";
# Print the rest of the table
open(my $fh, $intable) or die "ERROR: could not read table $intable: $!";
while(<$fh>){
Expand All @@ -85,6 +86,8 @@ sub makeMultiQC{
}
close $fh;

command("fastqc --nogroup $dir/*.fastq.gz -o $dir/SneakerNet/MultiQC-build/fastqc --noextract --threads $$settings{numcpus}");

return $outtable;
}

Expand Down
24 changes: 24 additions & 0 deletions SneakerNet.plugins/assembleAll.pl
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,30 @@ sub main{

#my $rawMultiQC = makeMultiQC($dir, $settings);

# Get multiQC to pick up on the quast results
mkdir "$dir/SneakerNet/MultiQC-build/quast";
for my $asmdir(glob("$dir/SneakerNet/assemblies/*")){
my $sample = basename($asmdir);
my $from = "$asmdir/quast/report.tsv";
# MultiQC is looking for 'report.tsv'
# The sample name is scraped from the file contents
# but also we need unique report.tsv files and so
# all the subdirectories.
my $to = "$dir/SneakerNet/MultiQC-build/quast/$sample/report.tsv";
mkdir dirname($to);

# sed any shovill name to just sample name
logmsg "$from => $to";
open(my $inFh, $from) or die "ERROR: could not read $from: $!";
open(my $outFh,">",$to) or die "ERROR: could not write to $to: $!";
while(my $line = <$inFh>){
$line =~ s/\Q$sample\E\S+/$sample/g;
print $outFh $line;
}
close $outFh;
close $inFh;
}

recordProperties($dir,{version=>$VERSION,
"Minimum contig length for assembly metrics" => "500bp",
});
Expand Down
13 changes: 13 additions & 0 deletions SneakerNet.plugins/sn_kraken.pl
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ sub runKrakenOnDir{
die "ERROR: output directory does not exist $outdir";
}

my $mqcDir = "$dir/SneakerNet/MultiQC-build";
mkdir $mqcDir;
mkdir "$mqcDir/kraken";

my $sampleInfo=samplesheetInfo_tsv("$dir/samples.tsv",$settings);

my @report; # reporting contamination in an array, in case I want to sort it later
Expand Down Expand Up @@ -111,6 +115,15 @@ sub runKrakenOnDir{
next;
}

#symlink to make MultiQC work on the raw data
my $symlinkFrom = "../../../$sampledir/kraken.report";
my $symlinkTo = "$mqcDir/kraken/$sampleName";
if(-e $symlinkTo){
unlink($symlinkTo)
or die "ERROR: could not remove file $symlinkTo: $!";
}
symlink($symlinkFrom, $symlinkTo) or die "ERROR: could not symlink to the multiqc directory ($symlinkFrom => $symlinkTo): $!";

}

return $outdir;
Expand Down
13 changes: 12 additions & 1 deletion SneakerNet.plugins/sn_mlst.pl
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,22 @@ sub makeMultiQC{
print $outFh "#section_name: \"MLST (7-gene)\"\n";
print $outFh "#description: \"$plugin v$VERSION $docLink $pluginLink\"\n";
print $outFh "#anchor: '$anchor'\n";
print $outFh "#scale: false\n";
# Print the rest of the table
open(my $fh, $intable) or die "ERROR: could not read table $intable: $!";
while(<$fh>){
next if(/^#/);
print $outFh $_;
chomp;
my($File, $mlst_scheme, $ST, @loci) = split(/\t/, $_);

# Do these replacements if it isn't the header
if($File !~ /File/i){
# replace any filename extensions in the first field
$File =~ s/^(\S+?)\..+/$1/;
# Prepend "ST-" so that it gets treated like a string
$ST = "ST-$ST";
}
print $outFh join("\t", $File, $mlst_scheme, $ST, @loci)."\n";
}
close $fh;

Expand Down
39 changes: 39 additions & 0 deletions SneakerNet.plugins/sn_passfail.pl
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,45 @@ sub identifyBadRuns{
$assemblyMetrics{$newKey} = $metrics;
}
}
# If we don't have assembly metrics from the old version of assembleAll.pl, then
# maybe we're looking for quast metrics.
else {
for my $samplename(keys(%$sampleInfo)){
my $quastReport = "$dir/SneakerNet/assemblies/$samplename/quast/report.tsv";
next if(!-e $quastReport);

open(my $fh, $quastReport) or die "ERROR: could not read $quastReport: $!";
while(<$fh>){
chomp;
my ($key, $value) = split(/\t/, $_);

# I don't really need a pound sign for number sign.
$key =~ s/#\s*//;

# If the key describes a contig length >= than something,
# then let's go with 1kb.
if($key =~ /(.+?)\s*>=\s*(\d+)/){
my $size = $2;
next if($size != 1000);
#$key = $1;
$key =~ s/\s+\(.+//;
}
$key =~ s/\s+/_/g;

$assemblyMetrics{$samplename}{$key} = $value;
}
close $fh;

# change some headers to my own legacy names
$assemblyMetrics{$samplename}{largestContig} = $assemblyMetrics{$samplename}{Largest_contig};
$assemblyMetrics{$samplename}{genomeLength} = $assemblyMetrics{$samplename}{Total_length};
$assemblyMetrics{$samplename}{GC} = $assemblyMetrics{$samplename}{'GC_(%)'};
$assemblyMetrics{$samplename}{numContigs} = $assemblyMetrics{$samplename}{contigs};
$assemblyMetrics{$samplename}{percentNs} = $assemblyMetrics{$samplename}{'N\'s_per_100_kbp'} / 10e5;
# still don't have: avgContigLength assemblyScore effectiveCoverage expectedGenomeLength
# kmer21 minContigLength
}
}

# Understand for each sample whether it passed or failed
# on each category
Expand Down
158 changes: 147 additions & 11 deletions SneakerNet.plugins/sn_report.pl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
use POSIX qw/strftime/;
use List::Util qw/min max/;

# Attempt to use a core YAML writer here but the downside
# is that this is not supposed to be a general purpose
# YAML writer.
use TAP::Parser::YAMLish::Writer;

use FindBin;
use lib "$FindBin::RealBin/../lib/perl5";
use SneakerNet qw/exitOnSomeSneakernetOptions recordProperties readProperties readConfig samplesheetInfo_tsv command logmsg fullPathToExec passfail/;
Expand Down Expand Up @@ -44,10 +49,11 @@ sub main{
# Make the summary table before writing properties
# so that it can get encoded with the rest of the
# tables in the html.
my $file = "$dir/SneakerNet/forEmail/QC_summary.tsv";
makeSummaryTable($file, $dir, $settings);
my $summaryTable = makeSummaryTable($dir, $settings);

my $rawMultiQC = makeMultiQC($dir, $settings);

my $pathFromDir = File::Spec->abs2rel(realpath($file), realpath($dir));
my $pathFromDir = File::Spec->abs2rel(realpath($summaryTable), realpath($dir));

# Special to this plugin: record any properties before
# reading the properties.
Expand All @@ -57,6 +63,7 @@ sub main{
date=>strftime("%Y-%m-%d", localtime()),
time=>strftime("%H:%M:%S", localtime()),
fullpath=>realpath($dir).'/SneakerNet',
mqc => $rawMultiQC,
});

my $properties = readProperties($dir);
Expand Down Expand Up @@ -147,15 +154,118 @@ sub main{
close $fh;
logmsg "Report can be found in $outfile";

command("multiqc --force $dir/SneakerNet/MultiQC-build --outdir $dir/SneakerNet/MultiQC.out");
my $run_name = basename(realpath($dir));
my $mqcConfig = makeMultiQC_config($dir, $settings);
system("cat $mqcConfig | nl -b a");
command("multiqc --config $mqcConfig --force $dir/SneakerNet/MultiQC-build --outdir $dir/SneakerNet/MultiQC.out");
cp("$dir/SneakerNet/MultiQC.out/multiqc_report.html", "$dir/SneakerNet/forEmail/multiqc_report.html")
or die "ERROR: could not cp multiqc_report.html to the forEmail folder: $!";

return 0;
}

sub makeMultiQC_config{
my($dir, $settings) = @_;

my $yaml = "$$settings{tempdir}/mqc.conf";

# Build the yaml hash which will be written later
my %yamlHash = ();
$yamlHash{table_columns_placement}{general_stats_table} = {
emoji => 900,
failure_code => 910,
score => 915,
cov1 => 920,
cov2 => 922,
qual1 => 930,
qual2 => 932,
};
$yamlHash{custom_data}{sn_report_}{plot_type} = "generalstats";
$yamlHash{custom_data}{sn_report }{plot_type} = "generalstats";
=cut
$yamlHash{module_order}[0] = {fastqc => {
name => "fastqc R1",
anchor => "fastqc_R1",
info => "FastQC for R1",
target => "",
path_filters => ["'*_1_fastqc.zip'"],
}};
$yamlHash{module_order}[1] = {fastqc => {
name => "fastqc R2",
anchor => "fastqc_R2",
info => "FastQC for R2",
target => "",
path_filters => ["'*_2_fastqc.zip'"],
}};
=cut

my $mqcBuildDir = "$dir/SneakerNet/MultiQC-build";

my $configStr = "";
my $yw = TAP::Parser::YAMLish::Writer->new;
$yw->write(\%yamlHash, \$configStr);

# Get rid of situations where the yaml writer makes a dash,
# then a new line, then the key for this element of the array.
# e.g.,
# -
# fastqc:
# name: something
# and turn it into
# - fastqc:
# name: something
$configStr =~ s/(\-)\s*\n\s+(\w+)/$1 $2/gm;

# Remove the weird --- and ... opening and closing that
# the YAML writer gives.
#$configStr =~ s/\-{3}//;
#$configStr =~ s/\.{3}//;

open(my $fh, ">", $yaml) or die "ERROR writing to multiQC config file $yaml: $!";
print $fh $configStr;
close $fh;

return $yaml;

}

# Make a table suitable for MultiQC
# Example found at https://github.com/MultiQC/test-data/blob/main/data/custom_content/issue_1883/4056145068.variant_counts_mqc.tsv
sub makeMultiQC{
my($dir, $settings) = @_;
my $intable = "$dir/SneakerNet/forEmail/QC_summary.tsv";
my $mqcDir = "$dir/SneakerNet/MultiQC-build";
my $outtable= "$mqcDir/qcsummary_mqc.tsv";
mkdir($mqcDir);

my $plugin = basename($0);
my $anchor = basename($0, ".pl");

my $docLink = "<a title='documentation' href='https://github.com/lskatz/sneakernet/blob/master/docs/plugins/$plugin.md'>&#128196;</a>";
my $pluginLink = "<a title='$plugin on github' href='https://github.com/lskatz/sneakernet/blob/master/SneakerNet.plugins/$plugin'><span style='font-family:monospace;font-size:small'>1011</span></a>";

open(my $outFh, ">", $outtable) or die "ERROR: could not write to multiqc table $outtable: $!";
print $outFh "#id: $anchor'\n";
print $outFh "#section_name: \"SneakerNet Summary\"\n";
print $outFh "#description: \"Quick stats from SneakerNet<br />$plugin v$VERSION $docLink $pluginLink\"\n";
print $outFh "#anchor: '$anchor'\n";
print $outFh "#plot_type: 'generalstats'\n";
# Print the rest of the table
open(my $fh, $intable) or die "ERROR: could not read table $intable: $!";
while(<$fh>){
next if(/^#/);
s/ +/_/g;
print $outFh $_;
}
close $fh;

return $outtable;
}

sub makeSummaryTable{
my($outfile, $dir, $settings) = @_;
my($dir, $settings) = @_;

my $outfile = "$dir/SneakerNet/forEmail/QC_summary.tsv";

# Gather some information
my $sample = samplesheetInfo_tsv("$dir/samples.tsv", $settings);
Expand Down Expand Up @@ -185,7 +295,7 @@ sub makeSummaryTable{
# Any knock on the perfection of a genome gets this penalty
my $penalty = 1/scalar(@$happiness) * 100;

my @tableRow; # to be sorted
my %tableRow;
while(my($sampleName, $s) = each(%$sample)){
my $score = 100;
my $emojiIdx= 0;
Expand Down Expand Up @@ -217,22 +327,48 @@ sub makeSummaryTable{
$cov .= sprintf("%0.0f",$readMetrics{$f}{coverage}).' ';
$qual.= sprintf("%0.0f",$readMetrics{$f}{avgQuality}).' ';
}
$cov =~ s/\s+$//;
$qual=~ s/\s+$//;

my $cov1 = $readMetrics{basename($$s{fastq}[0])}{coverage};
my $cov2 = $readMetrics{basename($$s{fastq}[1])}{coverage};
my $qual1= $readMetrics{basename($$s{fastq}[0])}{avgQuality};
my $qual2= $readMetrics{basename($$s{fastq}[1])}{avgQuality};

@failure_code = ("None") if(!@failure_code);
push(@tableRow, [$sampleName, $emoji, $score, join(", ", @failure_code),$qual, $cov, $$s{taxon}]);
$tableRow{$sampleName} = {
sample => $sampleName,
emoji => $emoji,
score => $score,
failure_code => join(", ", @failure_code),
qual1 => $qual1,
qual2 => $qual2,
cov1 => $cov1,
cov2 => $cov2,
taxon => $$s{taxon},
};

}
my @sortedRow = sort{$$a[2] <=> $$b[2] || $$a[0] cmp $$b[0]} @tableRow;
my @sortedSample = sort{ $a cmp $b } keys(%tableRow);

# Write the summary table
open(my $fh, '>', $outfile) or die "ERROR: could not write to $outfile: $!";
print $fh join("\t", qw(sample emoji score failure_code qual cov taxon))."\n";
for my $row(@sortedRow){
print $fh join("\t", @$row)."\n";
my @header = qw(sample emoji score failure_code qual1 qual2 cov1 cov2 taxon);
print $fh join("\t", @header)."\n";
for my $sample(@sortedSample){
my $info = $tableRow{$sample};
print $fh $sample;
for(my $i=1;$i<@header;$i++){
print $fh "\t".$tableRow{$sample}{$header[$i]};
}
print $fh "\n";
#print $fh join("\t", @$row)."\n";
}
print $fh "# Scores start at 100 percent and receive an equal percent penalty for each: assembly, low coverage, low quality, high percentage of Ns in the assembly, or high contamination in the Kraken report. See documentation for sn_passfail.pl for more information.\n";
print $fh "# Current emoticons range from high score (100%) to low: ".join(" ",@$happiness);
close $fh;

return $outfile;
}

# This function has the meat of the report for a given plugin
Expand Down

0 comments on commit bb4a8e0

Please sign in to comment.