diff --git a/CHANGELOG.md b/CHANGELOG.md index 8611899..3321011 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixed +- More informative error message upon read duplicate detection. ### Updated - Remove duplicate fastcat call. diff --git a/bin/workflow_glue/cluster_umis.py b/bin/workflow_glue/cluster_umis.py index c879209..619fdd0 100755 --- a/bin/workflow_glue/cluster_umis.py +++ b/bin/workflow_glue/cluster_umis.py @@ -212,7 +212,14 @@ def process_records(df, args): def main(args): """Run entry point.""" - df_tags = pd.read_csv(args.read_tags, sep='\t', index_col=0) + df_tags = pd.read_csv(args.read_tags, sep='\t', index_col='read_id') + + dups = df_tags[df_tags.index.duplicated(keep='first')] + if not dups.empty: + raise ValueError( + f"One or more input reads are duplicated, please rectify.\n" + f"Duplicated reads: {list(set(dups.index))[:20]}") + df_features = pd.read_csv( args.feature_assigns, sep='\t', index_col=0) # Merge genes and transcripts onto tags.