-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathrename_fasta_id.pl
265 lines (181 loc) · 6.5 KB
/
rename_fasta_id.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/perl
#######
# POD #
#######
=pod
=head1 NAME
C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions
=head1 SYNOPSIS
C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta>
B<or>
C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta>
=head1 DESCRIPTION
This script uses the built-in Perl substitution operator C<s///> to
replace strings in FASTA IDs. To do this, a B<pattern> and a
B<replacement> have to be provided. Perl regular expression syntax
can be used in the B<pattern>; the B<replacement> is inserted as a
literal string, so backreferences such as C<$1> are not expanded.
The leading '>' character of the FASTA ID will be removed before the
substitution and added again afterwards. FASTA IDs will be searched
for matches with the B<pattern>, and if found the matched text will
be replaced by the B<replacement>.
B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in
quotation marks (' or ") if they contain characters that would be
interpreted by the shell (e.g. pipes '|', brackets etc.).
For substitutions without any appendices in a UNIX OS you can of
course just use the great
L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
C<man sed>), e.g.:
C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta>
=head1 OPTIONS
=head2 Mandatory options
=over 20
=item B<-i>=I<str>, B<-input>=I<str>
Input FASTA file, or '-' to read from C<STDIN> (e.g. piped from C<zcat> for a gzipped file)
=item B<-p>=I<str>, B<-pattern>=I<str>
Pattern to be replaced in FASTA ID
=item B<-r>=I<str>, B<-replacement>=I<str>
Replacement to replace the pattern with. To entirely remove the
pattern use '' or "" as input for B<-r>.
=back
=head2 Optional options
=over 20
=item B<-h>, B<-help>
Help (perldoc POD)
=item B<-c>, B<-case-insensitive>
Match the pattern case-insensitively
=item B<-g>, B<-global>
Replace pattern globally in the string
=item B<-n>, B<-numerate>
Append a running number to the end of each renamed FASTA ID (the
count of ID lines renamed so far, not of individual pattern hits).
This is e.g. useful to number contigs consecutively in a draft
genome.
=item B<-a>=I<str>, B<-append>=I<str>
Append a string after the numeration, e.g. 'c' for chromosome
=item B<-o>, B<-output>
Verbose output of the substitutions that were carried out, printed
to C<STDERR>
=item B<-v>, B<-version>
Print version number to C<STDERR>
=back
=head1 OUTPUT
=over 20
=item C<STDOUT>
The FASTA file with substituted ID lines is printed to C<STDOUT>.
Redirect or pipe into another tool as needed.
=back
=head1 EXAMPLES
=over
=item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o>
=back
=head1 VERSION
0.1 09-11-2014
=head1 AUTHOR
Andreas Leimbach aleimba[at]gmx[dot]de
=head1 LICENSE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the License,
or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=cut
########
# MAIN #
########
use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;
### Get the options with Getopt::Long
my $Input_File;    # input FASTA file, or '-' to read from STDIN
my $Pattern;       # regex to search for in the FASTA IDs
my $Replacement;   # literal string that replaces the matched pattern
my $Opt_Case;      # substitute case-insensitively
my $Opt_Global;    # substitute every match in an ID, not only the first
my $Opt_Numerate;  # append the running substitution count to each renamed ID
my $Append;        # additional string appended after the optional count
my $Opt_Output;    # report each substitution on STDERR
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);

# GetOptions returns false on unknown options or missing arguments; treat
# that as a usage error instead of continuing with a half-parsed command line.
GetOptions ('input=s'          => \$Input_File,
            'pattern=s'        => \$Pattern,
            'replacement=s'    => \$Replacement,
            'case-insensitive' => \$Opt_Case,
            'global'           => \$Opt_Global,
            'numerate'         => \$Opt_Numerate,
            'append:s'         => \$Append,
            'output'           => \$Opt_Output,
            'version'          => \$Opt_Version,
            'help|?'           => \$Opt_Help)
    or pod2usage(-verbose => 1, -exitval => 2);

### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);

# 'die' writes the version to STDERR (as documented) and ends the script
die "$0 $VERSION\n" if ($Opt_Version);

### Check the mandatory options
# Test with defined/length rather than truthiness, so that the legitimate
# value '0' is accepted. An empty replacement ('' or "") is valid and means
# "remove the pattern", hence only definedness is checked for '-r'.
my @Missing_Options;
push (@Missing_Options, "'-i'") unless (defined $Input_File && length $Input_File);
push (@Missing_Options, "'-p'") unless (defined $Pattern && length $Pattern);
push (@Missing_Options, "'-r'") unless (defined $Replacement);
if (@Missing_Options) {
    my $warning = "\n### Fatal error: Mandatory option(s) "
                . join(', ', @Missing_Options)
                . " or their arguments are missing!\n";
    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}
### Select the input stream: the named file, or STDIN when '-' was given
my $Input_Fh;
if ($Input_File ne '-') {
    # regular file on disk; 'use autodie' makes a failed open fatal
    open($Input_Fh, '<', $Input_File);
}
else {
    # store the STDIN typeglob so both sources are read via the same variable
    $Input_Fh = *STDIN;
}
### Parse FASTA file
my $Substitution_Count = 0; # number of renamed ID lines, incremented in substitute_string()

# Compile the pattern once instead of on every line; '-c' selects the
# case-insensitive variant.
my $Match_Regex = $Opt_Case ? qr/$Pattern/i : qr/$Pattern/;

while (my $line = <$Input_Fh>) {
    chomp $line;

    # only substitute in FASTA ID lines
    if ($line =~ /^>/) {
        # Test the ID WITHOUT its leading '>', because substitute_string()
        # strips the '>' before substituting. Testing the whole line would
        # make anchored patterns (e.g. '^NODE') unmatchable and would count
        # "substitutions" for patterns that only match together with the '>'.
        my $id = substr($line, 1);
        if ($id =~ $Match_Regex) {
            substitute_string($line); # prints the renamed ID line itself
            next;
        }
    }

    # "reprint" sequence lines and ID lines that don't fit the pattern
    print "$line\n";
}

# STDIN is left alone; only close what was opened for the input file
close($Input_Fh) if ($Input_File ne '-');

print STDERR "$Substitution_Count substitutions have been carried out\n";
exit;
#############
#Subroutines#
#############
### Subroutine to rename one header/ID line of the FASTA file and print it
# Argument: the chomped ID line, including its leading '>'.
# Reads the file-level option variables ($Pattern, $Replacement, $Opt_Case,
# $Opt_Global, $Opt_Numerate, $Append, $Opt_Output) and increments
# $Substitution_Count once per call.
# Prints the renamed ID line to STDOUT and, with '-o', a line of the form
# "old_id -> new_id" to STDERR. Always returns 1.
sub substitute_string {
    my ($header) = @_;

    # get rid of '>' for the substitution, it is added again on output
    (my $id = $header) =~ s/^>//;
    my $original_id = $id; # kept for the optional verbose report

    $Substitution_Count++; # count occurrences of carried out substitutions

    # An undefined replacement is treated as the empty string (pattern removal)
    # instead of raising "uninitialized value" warnings.
    my $replacement = defined $Replacement ? $Replacement : '';

    # The case flag lives in the compiled regex; only the 'g' modifier has to
    # be chosen at the substitution itself.
    my $regex = $Opt_Case ? qr/$Pattern/i : qr/$Pattern/;
    if ($Opt_Global) {
        $id =~ s/$regex/$replacement/g;
    }
    else {
        $id =~ s/$regex/$replacement/;
    }

    # The running count and the append string go to the END of the new ID
    $id .= $Substitution_Count if ($Opt_Numerate);

    # defined/length instead of truthiness, so that '-a 0' is not dropped
    $id .= $Append if (defined $Append && length $Append);

    # output to STDOUT, optionally STDERR
    print ">$id\n";
    print STDERR "$original_id -> $id\n" if ($Opt_Output);

    return 1;
}