Skip to content

Commit

Permalink
Update simON_reads.py
Browse files Browse the repository at this point in the history
Added --enable_homopolymer_error and --enable_sequencing_error to allow further control of the resulting data
  • Loading branch information
AstraBert authored Dec 19, 2023
1 parent c9aab34 commit 90582a5
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion scripts/simON_reads.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@


from argparse import ArgumentParser
from datetime import datetime
import sys
Expand All @@ -10,17 +11,22 @@
argparse.add_argument("-i", "--infile", help="Path to the input fasta file that contains the reference sequence(s)", required=True)
argparse.add_argument("-snp", "--single_nucleotide_polymorphism", help="Insert a single nucleotide variant; the syntax of this option should be SAMPLE:POS:REF>ALT,SAMPLE:POS:REF>ALT,...,SAMPLE:POS:REF>ALT (it should be separated by commas without blank spaces) where SAMPLE is the header of the sequence (withouth \">\") in the original fasta file, POS is an integer that indicates the position (0-based) of the polymorphic site, REF is the reference allele, ALT is the alternative allele you want to be put. This will generate a diploid-like distribution of variants", required=False, default="NO_SNP")
argparse.add_argument("-n", "--nreads", help="Insert the number of reads you want to generate for each provided reference sequence: deafult is 2000", required=False, default=2000, type=int)
argparse.add_argument("-ehp", "--enable_homopolymer_error", help="This will set a 30%% chance of getting an extra nucleotide around homopolymeric regions", required=False, default=False, action='store_true')
argparse.add_argument("-ese", "--enable_sequencing_error", help="This will set a 5%% chance of getting a random single nucleotide variant or insertion, while it retains also a 5%% chance of skipping a base (single delition)", required=False, default=False, action='store_true')

args = argparse.parse_args()

inf = args.infile
snpstring = args.single_nucleotide_polymorphism
nr = args.nreads
enhompol= args.enable_homopolymer_error
enseqerr= args.enable_sequencing_error


if __name__=="__main__":
start = datetime.now()
dic=load_data(inf) ## Load reference sequences from infile
avgs=seqs_to_file(dic,snpstring,nr) ## Generate reads and print them
avgs=seqs_to_file(dic,snpstring,nr,enhompol,enseqerr) ## Generate reads and print them
end=datetime.now()
plt.style.use('ggplot') ## Plot average read quality as histogram, show it and save it as avg_quality_distribution.png in current directory
fig,ax=plt.subplots(figsize=(10,5))
Expand Down

0 comments on commit 90582a5

Please sign in to comment.