Skip to content

Commit

Permalink
Merge branch 'poly-a' into 'dev'
Browse files Browse the repository at this point in the history
[CW-4176] Add poly-a histogramming

See merge request epi2melabs/fastcat!60
  • Loading branch information
cjw85 committed Jun 4, 2024
2 parents 4888a01 + b2dbfef commit 526c83c
Show file tree
Hide file tree
Showing 11 changed files with 108 additions and 69 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]
## [v0.18.0]
### Added
- Basecaller summary information similar to runid summary.
- RNA poly-A tail length histogram output.
### Fixed
- Random output for runid when not found in header.

Expand Down
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ mem_check_fastcat_demultiplex: fastcat

.PHONY: mem_check_bamstats
mem_check_bamstats: bamstats
rm -rf bamstats-histograms
$(VALGRIND) --error-exitcode=1 --tool=memcheck --leak-check=full --show-leak-kinds=all -s \
./bamstats test/bamstats/400ecoli.bam

Expand Down Expand Up @@ -147,6 +148,15 @@ test_bamstats_NM: bamstats
../../bamstats ../bamstats_zeroNM/test.sam
rm -r test/test-tmp

.PHONY: test_bamstats_polya
test_bamstats_polya: bamstats
if [ -d test/test-tmp ]; then rm -r test/test-tmp; fi
mkdir test/test-tmp && \
cd test/test-tmp && \
../../bamstats ../bamstats/RCS-100A.bam --poly_a > /dev/null && \
diff bamstats-histograms/polya.hist ../bamstats/RCS-100A.bam.polya.hist
rm -r test/test-tmp

.PHONY: regression_test_fastcat
regression_test_fastcat: fastcat
if [ -d test/test-tmp ]; then rm -r test/test-tmp; fi
Expand Down
26 changes: 26 additions & 0 deletions src/bamstats/args.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ static struct argp_option options[] = {
"Only process reads with a given tag value.", 3},
{"haplotype", 0x300, "VAL", 0,
"Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3},
{0, 0, 0, 0,
"Poly-A Options:", 0},
{"poly_a", 0x500, 0, 0,
"Enable poly-A tail length histogram.", 5},
{"poly_a_cover", 0x600, "PCT_COVERAGE", 0,
"Reference alignment coverage for acceptance of read. (default: 95)", 5},
{"poly_a_qual", 0x700, "QUAL", 0,
"Read mean Q score for acceptance of read. (default: 10)", 5},
{"poly_a_rev", 0x800, 0, 0,
"Allow reverse alignments (useful for cDNA, default is appropriate for direct RNA seq).", 5},
{ 0 }
};

Expand Down Expand Up @@ -76,6 +86,18 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
case 0x400:
arguments->histograms = arg;
break;
case 0x500:
arguments->poly_a = true;
break;
case 0x600:
arguments->poly_a_cover = atof(arg);
break;
case 0x700:
arguments->poly_a_qual = atof(arg);
break;
case 0x800:
arguments->poly_a_rev = true;
break;
case 's':
arguments->sample = arg;
break;
Expand Down Expand Up @@ -133,6 +155,10 @@ arguments_t parse_arguments(int argc, char** argv) {
args.runids = NULL;
args.basecallers = NULL;
args.histograms = "bamstats-histograms";
args.poly_a = false;
args.poly_a_cover = 95;
args.poly_a_qual = 10;
args.poly_a_rev = false;
args.sample = NULL;
args.ref = NULL;
args.region = NULL;
Expand Down
4 changes: 4 additions & 0 deletions src/bamstats/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ typedef struct arguments {
char* runids;
char* basecallers;
char* histograms;
bool poly_a;
float poly_a_cover;
float poly_a_qual;
bool poly_a_rev;
char *sample;
char* ref;
char* region;
Expand Down
56 changes: 22 additions & 34 deletions src/bamstats/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,15 @@ static inline void write_counter(const char* fname, kh_counter_t *counter, const
}


void write_hist_stats(read_stats* stats, char* prefix, char* name) {
char* path = calloc(strlen(prefix) + strlen(name) + 2, sizeof(char));
sprintf(path, "%s/%s", prefix, name);
FILE* fp = fopen(path, "w");
print_stats(stats, false, true, fp);
fclose(fp); free(path);
}


int main(int argc, char *argv[]) {
clock_t begin = clock();
arguments_t args = parse_arguments(argc, argv);
Expand Down Expand Up @@ -137,6 +146,7 @@ int main(int argc, char *argv[]) {
read_stats* qual_stats = create_qual_stats(QUAL_HIST_WIDTH);
read_stats* acc_stats = create_qual_stats(ACC_HIST_WIDTH);
read_stats* cov_stats = create_qual_stats(COV_HIST_WIDTH);
read_stats* polya_stats = args.poly_a ? create_length_stats() : NULL;

// Prepare also for the unmapped reads
read_stats* length_stats_unmapped = create_length_stats();
Expand All @@ -151,6 +161,7 @@ int main(int argc, char *argv[]) {
flag_counts, args.unmapped,
length_stats, qual_stats, acc_stats, cov_stats,
length_stats_unmapped, qual_stats_unmapped,
polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev,
run_ids, basecallers);

// write flagstat counts if requested
Expand Down Expand Up @@ -195,6 +206,7 @@ int main(int argc, char *argv[]) {
flag_counts, args.unmapped,
length_stats, qual_stats, acc_stats, cov_stats,
length_stats_unmapped, qual_stats_unmapped,
polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev,
run_ids, basecallers);
if (flag_counts != NULL) {
write_stats(flag_counts->counts[0], chr, args.sample, flagstats);
Expand All @@ -203,44 +215,19 @@ int main(int argc, char *argv[]) {
hts_idx_destroy(idx);
}

char* path = calloc(strlen(args.histograms) + 13, sizeof(char));
sprintf(path, "%s/length.hist", args.histograms);
FILE* stats_fp = fopen(path, "w");
print_stats(length_stats, false, true, stats_fp);
fclose(stats_fp); free(path);

path = calloc(strlen(args.histograms) + 14, sizeof(char));
sprintf(path, "%s/quality.hist", args.histograms);
stats_fp = fopen(path, "w");
print_stats(qual_stats, false, true, stats_fp);
fclose(stats_fp); free(path);

path = calloc(strlen(args.histograms) + 15, sizeof(char));
sprintf(path, "%s/accuracy.hist", args.histograms);
stats_fp = fopen(path, "w");
print_stats(acc_stats, false, true, stats_fp);
fclose(stats_fp); free(path);

path = calloc(strlen(args.histograms) + 15, sizeof(char));
sprintf(path, "%s/coverage.hist", args.histograms);
stats_fp = fopen(path, "w");
print_stats(cov_stats, false, true, stats_fp);
fclose(stats_fp); free(path);
write_hist_stats(length_stats, args.histograms, "length.hist");
write_hist_stats(qual_stats, args.histograms, "quality.hist");
write_hist_stats(acc_stats, args.histograms, "accuracy.hist");
write_hist_stats(cov_stats, args.histograms, "coverage.hist");
if (polya_stats != NULL) {
write_hist_stats(polya_stats, args.histograms, "polya.hist");
}

// Save also histograms for the unmapped reads if requested
// and if the user is not asking for a region
if (args.unmapped && args.region == NULL){
path = calloc(strlen(args.histograms) + 19, sizeof(char));
sprintf(path, "%s/length.unmap.hist", args.histograms);
stats_fp = fopen(path, "w");
print_stats(length_stats_unmapped, false, true, stats_fp);
fclose(stats_fp); free(path);

path = calloc(strlen(args.histograms) + 20, sizeof(char));
sprintf(path, "%s/quality.unmap.hist", args.histograms);
stats_fp = fopen(path, "w");
print_stats(qual_stats_unmapped, false, true, stats_fp);
fclose(stats_fp); free(path);
write_hist_stats(length_stats_unmapped, args.histograms, "length.unmap.hist");
write_hist_stats(qual_stats_unmapped, args.histograms, "quality.unmap.hist");
}

// write runids summary
Expand All @@ -258,6 +245,7 @@ int main(int argc, char *argv[]) {
destroy_qual_stats(cov_stats);
destroy_length_stats(length_stats_unmapped);
destroy_qual_stats(qual_stats_unmapped);
destroy_length_stats(polya_stats);
kh_counter_destroy(basecallers);
kh_counter_destroy(run_ids);

Expand Down
53 changes: 27 additions & 26 deletions src/bamstats/readstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,39 +146,16 @@ int get_duplex_tag(bam1_t* b) {
return res;
}

/** Generates alignment stats from a region of a bam.
*
* @param fp htsFile pointer
* @param idx hts_idx_t pointer
* @param hdr sam_hdr_t pointer
* @param sample sample name.
* @param chr bam target name.
* @param start start position of chr to consider.
* @param end end position of chr to consider.
* @param overlap_start whether reads overhanging start should be included.
* @param read_group by which to filter alignments.
* @param tag_name by which to filter alignments.
* @param tag_value associated with tag_name.
* @param flag_counts flag_stats pointer.
* @param unmapped bool include unmapped reads in output.
* @param length_stats read_stats* for accumulating read length information.
* @param qual_stats read_stats* for accumulating read quality information.
* @param acc_stats read_stats* for accumulating read alignment accuracy information.
* @param cov_stats read_stats* for accumulating read alignment coverage information.
* @param length_stats_unmapped read_stats* for accumulating read length information for unmapped reads.
* @param qual_stats_unmapped read_stats* for accumulating read quality information for unmapped reads.
* @param runids kh_counter_t* for accumulating runids.
* @param basecallers kh_counter_t* for accumulating reads per basecaller.
* @returns void. Prints output to stdout.
*
*/

// Do all-the-things
void process_bams(
htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
const char *read_group, const char tag_name[2], const int tag_value,
flag_stats *flag_counts, bool unmapped,
read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats,
read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped,
read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev,
kh_counter_t* runids, kh_counter_t* basecallers) {
if (chr != NULL) {
if (strcmp(chr, "*") == 0) {
Expand Down Expand Up @@ -345,6 +322,30 @@ void process_bams(
add_qual_count(acc_stats, acc);
add_qual_count(cov_stats, coverage);

// get poly-A tail length. For now we require:
// i) "good" coverage on reference, i.e. "full length"
// ii) read is sense strand, i.e. fwd alignment
// iii) "good" mean quality
// iv) no split reads
if (polya_stats != NULL) {
int polya_len = -1;
if ((ref_cover >= polya_cover)
&& (!bam_is_rev(b) || polya_rev)
&& mean_quality >= polya_qual) {
uint8_t* tag = NULL;
tag = bam_get_tag_caseinsensitive(b, "pi");
if (tag == NULL) { // the tag is present for split reads
tag = bam_get_tag_caseinsensitive(b, "pt");
if (tag != NULL) {
polya_len = bam_aux_tag_int(tag);
}
}
}
if (polya_len >= 0) {
add_length_count(polya_stats, polya_len);
}
}

if (sample == NULL) {
fprintf(stdout,
"%s\t%s\t%s\t" \
Expand Down
5 changes: 5 additions & 0 deletions src/bamstats/readstats.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ void destroy_flag_stats(flag_stats* stats);
* @param cov_stats read_stats* for accumulating read alignment coverage information.
* @param length_stats_unmapped read_stats* for accumulating read length information for unmapped reads.
* @param qual_stats_unmapped read_stats* for accumulating read quality information for unmapped reads.
* @param polya_stats read_stats* for accumulating polyA tail length information.
* @param polya_cover minimum reference coverage for polyA tail length to be considered.
* @param polya_qual minimum mean quality for polyA tail length to be considered.
* @param polya_rev whether to allow reverse alignments for polyA tail length.
* @param runids kh_counter_t* for accumulating runid information.
* @param basecallers kh_counter_t* for accumulating basecaller information.
* @returns void. Prints output to stdout.
Expand All @@ -65,6 +69,7 @@ void process_bams(
flag_stats *flag_counts, bool unmapped,
read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats,
read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped,
read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev,
kh_counter_t* runids, kh_counter_t* basecallers);

#endif
18 changes: 11 additions & 7 deletions src/stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,13 @@ read_stats* create_length_stats(void) {
}

void destroy_length_stats(read_stats* stats) {
free(stats->buckets->groups);
free(stats->buckets);
free(stats->edges);
free(stats->counts);
free(stats);
if (stats != NULL) {
free(stats->buckets->groups);
free(stats->buckets);
free(stats->edges);
free(stats->counts);
free(stats);
}
}

void add_length_count(read_stats* stats, size_t x) {
Expand Down Expand Up @@ -97,8 +99,10 @@ read_stats* create_qual_stats(float width) {
}

void destroy_qual_stats(read_stats* stats) {
free(stats->counts);
free(stats);
if (stats != NULL) {
free(stats->counts);
free(stats);
}
}

void add_qual_count(read_stats* stats, float q) {
Expand Down
2 changes: 1 addition & 1 deletion src/version.h
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

const char *argp_program_version = "0.17.1";
const char *argp_program_version = "0.18.0";
Binary file added test/bamstats/RCS-100A.bam
Binary file not shown.
Binary file added test/bamstats/RCS-100A.bam.bai
Binary file not shown.

0 comments on commit 526c83c

Please sign in to comment.