-
Notifications
You must be signed in to change notification settings - Fork 51
/
nuc.c
169 lines (137 loc) · 3.75 KB
/
nuc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#include <stdio.h>
#include <string.h>
#include "memutil.h"
#include "nuc.h"
#include "util.h"
#include "err.h"
static const char NUC_SYMBOL[NUM_NUCS] = {'A', 'C', 'G', 'T',
'M', 'K', 'R', 'Y', 'W', 'S',
'-', 'N'};
/**
* Converts a nucleotide ID into a character.
*/
char nuc_id_to_char(const unsigned char id) {
if(id > NUM_NUCS) {
my_err("Invalid nucleotide identifier %d", id);
}
return NUC_SYMBOL[id];
}
/**
* returns TRUE if this is an id for an unambiguous nucleotide
* (one of A, C, T, G)
*/
int nuc_is_real(const unsigned char id) {
return (id < NUC_M);
}
/*
* returns TRUE if this is an id for a nucleotide ambiguity code
* (one of M, K, R, Y, W, S).
*/
int nuc_is_ambi(const unsigned char id) {
return (id > NUC_T) && (id < NUC_GAP);
}
/**
* Converts a nucleotide to a unique nucleotide integer identifier.
*/
unsigned char nuc_char_to_id(const char nuc) {
switch(nuc) {
case('A'): case('a'): return NUC_A;
case('C'): case('c'): return NUC_C;
case('T'): case('t'): return NUC_T;
case('G'): case('g'): return NUC_G;
case('M'): case('m'): return NUC_M;
case('K'): case('k'): return NUC_K;
case('R'): case('r'): return NUC_R;
case('Y'): case('y'): return NUC_Y;
case('W'): case('w'): return NUC_W;
case('S'): case('s'): return NUC_S;
case('.'): case('-'): case('*'): return NUC_GAP;
case('N'): case('n'): return NUC_N;
}
my_warn("%s:%d: unknown nucleotide character %c",
__FILE__, __LINE__, nuc);
return NUC_N;
}
/**
* returns the complement of the provided nucleotide ID
*/
unsigned char nuc_comp(const unsigned char nuc_id) {
switch(nuc_id) {
case(NUC_A): return NUC_T;
case(NUC_C): return NUC_G;
case(NUC_G): return NUC_C;
case(NUC_T): return NUC_A;
case(NUC_M): return NUC_K;
case(NUC_K): return NUC_M;
case(NUC_R): return NUC_Y;
case(NUC_Y): return NUC_R;
case(NUC_W): return NUC_W;
case(NUC_S): return NUC_S;
}
/* for N, GAP, complement is same */
return nuc_id;
}
/**
* Fills the provided buffer with a null-terminated string representation
* of the provided array of nucleotide arrays. The provided buffer
* must be at least len+1 bytes long, and the provided nucleotide
* id array must be at least len bytes long.
*
* If the provided buffer is NULL, a new buffer of length len+1 is
* allocated and returned.
*/
char *nuc_ids_to_str(char *buf, const unsigned char *ids, const long len) {
long i;
if(buf == NULL) {
buf = my_new(char, len+1);
}
for(i = 0; i < len; i++) {
buf[i] = nuc_id_to_char(ids[i]);
}
buf[len] = '\0';
return buf;
}
/**
* Fills the provided buffer with nucleotide ids corresponding to the
* string representation of DNA sequence provided. The returned array
* is NOT null terminated. The len argument should correspond to the
* length of the provided DNA character string. The provided buf must
* be at least len bytes long. If the provided buf is NULL a new buffer
* of length len is allocated and returned.
*
* The return value is just the ptr to the provided buf.
*/
unsigned char *nuc_str_to_ids(unsigned char *buf, const char *str,
const long len) {
long i;
if(buf == NULL) {
buf = my_new(unsigned char, len);
}
for(i = 0; i < len; i++) {
buf[i] = nuc_char_to_id(str[i]);
}
return buf;
}
/**
* Does an in-place reverse complement of an array of
* nucleotide ids
*/
void nuc_ids_revcomp(unsigned char *nuc_ids, long len) {
long i;
util_breverse(nuc_ids, len);
for(i = 0; i < len; i++) {
nuc_ids[i] = nuc_comp(nuc_ids[i]);
}
}
/**
* Returns true if any nucleotides in provided array are NUC_N
*/
int nuc_ids_have_n(unsigned char *nucs, const long len) {
long i;
for(i = 0; i < len; i++) {
if(nucs[i] == NUC_N) {
return TRUE;
}
}
return FALSE;
}