Update simON_reads.py

Added --enable_homopolymer_error and --enable_sequencing_error to allow further control of the resulting data
AstraBert · Dec 19, 2023 · 90582a5 · 90582a5
1 parent c9aab34
commit 90582a5
Showing 1 changed file with 7 additions and 1 deletion.
diff --git a/scripts/simON_reads.py b/scripts/simON_reads.py
@@ -1,4 +1,5 @@
 
+
 from argparse import ArgumentParser
 from datetime import datetime
 import sys
@@ -10,17 +11,22 @@
 argparse.add_argument("-i", "--infile", help="Path to the input fasta file that contains the reference sequence(s)", required=True)
 argparse.add_argument("-snp", "--single_nucleotide_polymorphism", help="Insert a single nucleotide variant; the syntax of this option should be SAMPLE:POS:REF>ALT,SAMPLE:POS:REF>ALT,...,SAMPLE:POS:REF>ALT (it should be separated by commas without blank spaces) where SAMPLE is the header of the sequence (withouth \">\") in the original fasta file, POS is an integer that indicates the position (0-based) of the polymorphic site, REF is the reference allele, ALT is the alternative allele you want to be put. This will generate a diploid-like distribution of variants", required=False, default="NO_SNP")
 argparse.add_argument("-n", "--nreads", help="Insert the number of reads you want to generate for each provided reference sequence: deafult is 2000", required=False, default=2000, type=int)
+argparse.add_argument("-ehp", "--enable_homopolymer_error", help="This will set a 30%% chance of getting an extra nucleotide around homopolymeric regions", required=False, default=False, action='store_true')
+argparse.add_argument("-ese", "--enable_sequencing_error", help="This will set a 5%% chance of getting a random single nucleotide variant or insertion, while it retains also a 5%% chance of skipping a base (single delition)", required=False, default=False, action='store_true')
 
 args = argparse.parse_args()
 
 inf = args.infile
 snpstring = args.single_nucleotide_polymorphism
 nr = args.nreads
+enhompol= args.enable_homopolymer_error
+enseqerr= args.enable_sequencing_error
+
 
 if __name__=="__main__":
     start = datetime.now()
     dic=load_data(inf) ## Load reference sequences from infile
-    avgs=seqs_to_file(dic,snpstring,nr) ## Generate reads and print them
+    avgs=seqs_to_file(dic,snpstring,nr,enhompol,enseqerr) ## Generate reads and print them
     end=datetime.now()
     plt.style.use('ggplot') ## Plot average read quality as histogram, show it and save it as avg_quality_distribution.png in current directory
     fig,ax=plt.subplots(figsize=(10,5))