-
Notifications
You must be signed in to change notification settings - Fork 4
/
GENCODE.pm
170 lines (142 loc) · 4.05 KB
/
GENCODE.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
package GENCODE;
## This class was created to return genomic coordinates for a gene
## given its name or stable ID on Ensembl.
## Usage:
# Initialize:
#
# use GENCODE;
# my $G = GENCODE->new($GENCODE_fileName);
# Extract coordinates:
# ($chr, $start, $end, $stable_ID, $name) = $G->GetCoordinates($ID);
use strict;
use warnings;
sub new {
my ( $class, $parameters) = @_;
my $self = {};
bless( $self, $class );
my $GENCODE_filename = $parameters->{"gencode_file"};
$self->_initialize($GENCODE_filename);
return $self;
}
# reading gzipped gencode file
sub _initialize {
my $self = shift;
my $gencodeFile = shift;
my $FILE;
$self->{"failed"}=0;
if (! -e $gencodeFile){
print "[Error] GENCODE file ($gencodeFile) could not be opened\n";
$self->{"failed"}=1;
return;
}
my %counts;
open($FILE, "zcat $gencodeFile | ");
while (my $line = <$FILE>) {
next if $line =~ /^#/;
chomp $line;
#1-based coordinates
my ($chr, $start, $end, $name, $ID) = split("\t", $line);
$chr =~ s/^chr//i;
if ( $chr eq "Y" && (($start > 10001 && $end < 2781479) || ($start > 56887903 && $end < 57217415))){
# we are in PAR Y. The genes are also on PAR X so we skip
next;
}
my $ref = {"chr" => $chr,
"start" => $start,
"end" => $end,
"name" => $name,
"ID" => $ID};
# to detect (name) duplicates
$counts{$name}++;
# by name
# array can contain more than one ref (in case of duplicate names)
push @{$self->{"gene_names"}->{$name}}, $ref;
# by ID
# shouldn't happen as IDs are supposed to be unique
if (exists($self->{"gene_names"}->{$ID})){
print "[Error] GENCODE::_initialize : duplicate ID $ID";
$self->{"failed"}=1;
return;
}
else{
push @{$self->{"gene_names"}->{$ID}}, $ref;
}
# also use ID prefix as key
# potentially there could be duplicates, but we excluded PAR_Y genes above, so shouldn't happen
if ($ID=~/(ENSG\d+)\./){ # should always be the case
my $x=$1;
if (exists($self->{"gene_names"}->{$x})){
print "[Error] GENCODE::_initialize : duplicate ID $x";
$self->{"failed"}=1;
return;
}
else{
push @{$self->{"gene_names"}->{$x}}, $ref;
}
}
}
# for my $k (keys %{$self->{"gene_names"}}){
# print "ERROR: $k" if scalar(@{$self->{"gene_names"}->{$k}})>1;
# }
foreach my $n (keys %counts){
if ($counts{$n}>1){
$self->{"duplicate_names"}->{$n}=1;
}
}
if (exists($self->{"duplicate_names"})){
print "[Warning]: duplicate gene names detected in $gencodeFile:";
foreach my $n (keys %{$self->{"duplicate_names"}}){
print $n;
}
}
close($FILE);
}
# checks if provided gene name occurs more than once in GENCODE data
sub isDuplicateName {
my $self = shift;
my $name = shift;
if (exists($self->{"duplicate_names"})){
if (exists($self->{"duplicate_names"}->{$name})){
return 1;
}
else{
return 0;
}
}
else{
return 0;
}
}
# get gene coordinates based on the gene name, stable ID or stable ID prefix.
sub GetCoordinates {
my $self = shift;
my $ID = shift; # name , full ID or ID prefix
my $ret=undef;
# seems redundant
if ( exists $self->{"gene_names"}->{$ID} ) {
foreach my $rec (@{$self->{"gene_names"}->{$ID}}){
my $stID=$rec->{ID};
$ret->{$stID}->{chr} = $rec->{chr};
$ret->{$stID}->{start} = $rec->{start};
$ret->{$stID}->{end} = $rec->{end};
$ret->{$stID}->{name} = $rec->{name};
}
}
elsif ($ID=~/(ENSG\d+)\./){
my $x=$1;
if ( exists $self->{"gene_names"}->{$x} ) {
foreach my $rec (@{$self->{"gene_names"}->{$x}}){
my $stID=$rec->{ID};
$ret->{$stID}->{chr} = $rec->{chr};
$ret->{$stID}->{start} = $rec->{start};
$ret->{$stID}->{end} = $rec->{end};
$ret->{$stID}->{name} = $rec->{name};
}
}
}
else {
print "[Warning] GENCODE::GetCoordinates: $ID was not found\n";
}
return $ret;
}
1;