-
Notifications
You must be signed in to change notification settings - Fork 2
/
libgtftk.h
415 lines (362 loc) · 10.4 KB
/
libgtftk.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/*
* libgtftk.h
*
* Created on: Jan 10, 2017
* Author: fafa
*
* Header file for the gtftk library.
* Contains all the structure definitions and the prototype declarations.
*/
/*
* If this flag is set, the library outputs a very large amount of debug
* information used to find memory leaks.
*/
/* Uncomment to define GTFTOOLKIT_DEBUG (see the F_* allocation wrappers below). */
//#define GTFTOOLKIT_DEBUG
/*
* NOTE(review): `test` is defined before the include guard and is not used
* anywhere in this header; its purpose is not visible from here. Confirm it
* is still needed, since such a generic macro name can clash with client code.
*/
#define test 1
/* Include guard: the matching #endif is the last line of the header. */
#ifndef GTFTOOLKIT_GTFTK_SRC_LIB_LIBGTFTK_H_
#define GTFTOOLKIT_GTFTK_SRC_LIB_LIBGTFTK_H_
/*
* Request the GNU extensions of the C library for the system headers included
* below. The macro is only defined when it is not already set (for example by
* -D_GNU_SOURCE on the command line, or by a C++ compiler that predefines
* it), which avoids a macro redefinition diagnostic.
* NOTE(review): a feature test macro only takes effect when it is defined
* before the first system header is included; confirm that clients include
* this header first.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include <search.h>
#include <string.h>
#include <errno.h>
/*
 * Memory-allocation tracing, compiled in only when GTFTOOLKIT_DEBUG is
 * defined. The F_* functions are not defined by this header: the program must
 * be linked with the libmemory.so shared library.
 */
#ifdef GTFTOOLKIT_DEBUG
/* Tracing allocators; each one also receives the call site (file, function, line). */
extern void *F_malloc(int size, char *file, const char *func, int line);
extern void *F_calloc(int nb, int size, char *file, const char *func, int line);
extern void *F_realloc(void *ptr, int size, char *file, const char *func, int line);
extern void F_free(void *ptr, char *file, const char *func, int line);
/* Redirect the standard allocation calls to the tracing allocators. */
#define malloc(bytes) F_malloc(bytes, __FILE__, __func__, __LINE__)
#define calloc(count, bytes) F_calloc(count, bytes, __FILE__, __func__, __LINE__)
#define realloc(block, bytes) F_realloc(block, bytes, __FILE__, __func__, __LINE__)
#define free(block) F_free(block, __FILE__, __func__, __LINE__)
#endif /* GTFTOOLKIT_DEBUG */
/*
* Constants for transcript selection in the select_transcript function
* (presumably the values accepted by its `type` argument; confirm against
* the implementation).
*/
/* presumably: keep the shortest transcript (TODO confirm) */
#define SHORTEST_TRANSCRIPT 1
/* presumably: keep the longest transcript (TODO confirm) */
#define LONGEST_TRANSCRIPT 2
/* presumably: keep the most 5' transcript (TODO confirm) */
#define MOST5P_TRANSCRIPT 3
/*
 * Some useful macros used in get_sequences.c.
 *
 * Every argument and every expansion is fully parenthesized so that the
 * macros give the expected result when they are called with compound
 * expressions (e.g. MIN(a | b, c)). Arguments are still evaluated more than
 * once: do not pass expressions with side effects (e.g. MIN(i++, j)).
 */
/* The smaller of x and y (x when both are equal). */
#define MIN(x, y) ((x) <= (y) ? (x) : (y))
/* The larger of x and y (y when both are equal). */
#define MAX(x, y) ((x) > (y) ? (x) : (y))
/*
 * The complementary nucleotide of c (A<->T, G<->C), with the case preserved.
 * Any other character is returned unchanged.
 */
#define COMPLEMENT(c) ( (c) == 'A' ? 'T' : \
                        (c) == 'a' ? 't' : \
                        (c) == 'T' ? 'A' : \
                        (c) == 't' ? 'a' : \
                        (c) == 'G' ? 'C' : \
                        (c) == 'g' ? 'c' : \
                        (c) == 'C' ? 'G' : \
                        (c) == 'c' ? 'g' : (c))
/*
* This structure describes the input (i.e. a GTF/BLASTN file, plain or
* gzipped). It is created by the get_gtf_reader function in the get_reader.c
* source file. Either gzfile or plainfile is set, depending on the kind of
* input (gzip or plain). Even if these two members are mutually exclusive,
* they are not placed in a union, in order to stay compatible with most
* native interfaces.
*/
typedef struct TEXTFILE_READER {
/*
* The file name with its path (or "-" for standard input)
*/
char *filename;
/*
* A boolean telling whether the input is a gzipped file
*/
int gz;
/*
* The gzip file descriptor (a zlib gzFile), set for gzipped input
*/
gzFile gzfile;
/*
* The plain file descriptor (a stdio FILE *), set for plain input
*/
FILE *plainfile;
} TEXTFILE_READER;
/*
 * One attribute (a key/value pair) taken from the last column of a GTF file.
 */
typedef struct ATTRIBUTE {
    /* the key of the attribute */
    char *key;
    /* the value associated with the key */
    char *value;
    /* a pointer to another ATTRIBUTE (presumably the next one of a chain; confirm) */
    struct ATTRIBUTE *next;
} ATTRIBUTE;
/*
* A set of ATTRIBUTE
*/
typedef struct ATTRIBUTES {
/* a table of pointers to ATTRIBUTE */
ATTRIBUTE **attr;
/* presumably the number of entries in attr (TODO confirm) */
int nb;
} ATTRIBUTES;
/*
* A structure to store a row from a GTF file. Rows can be chained through
* the next member.
*/
typedef struct GTF_ROW {
/*
* the first 8 fields of a GTF file row, as character strings
*/
char **field;
/*
* the attributes (the set is embedded in the row, not pointed to)
*/
ATTRIBUTES attributes;
/*
* the rank number of the row in the GTF file
*/
int rank;
/*
* the link to the next row
*/
struct GTF_ROW *next;
} GTF_ROW;
/*
* This is the structure that holds data in GTF format. It is also the
* structure used as input/output for most of the functions of the library. To
* start using the library, one must call the load_GTF() function with a GTF
* file name as parameter and get a pointer to a GTF_DATA in return. Then, all
* the other functions must be called with this GTF_DATA pointer as input.
* Their result can be another GTF_DATA pointer that can be used as input for
* another function of the library.
*/
typedef struct GTF_DATA {
/*
* the number of rows
*/
int size;
/*
* a table of rows
*/
GTF_ROW **data;
/*
* NOTE(review): the original comment at this place described "the comments
* at the beginning of the file, started with ##", but no member is declared
* for them; the structure only holds size and data.
*/
} GTF_DATA;
/*
* This structure represents an index on a column, or on an attribute of the
* last column.
*/
typedef struct INDEX {
/*
* the name of a column (feature, seqid, ...) or of an attribute (gene_id,
* transcript_id, ...)
*/
char *key;
/*
* the pointer to a binary tree created with the tsearch C function in
* the index_row function in the column.c source file. This tree contains
* ROW_LIST elements, described later in this file, each of which contains a
* token and the associated list of row numbers (the rows containing the
* token as the value of the key, which is a column name or an attribute
* name).
*/
void *data;
/*
* a reference to the GTF_DATA on which the index has been made
*/
GTF_DATA *gtf_data;
/*
* a pointer to the next index
*/
struct INDEX *next;
} INDEX;
/*
* A structure that contains the information needed to locate an index in the
* column model: the column and the rank, as a column can contain several
* indexes. The index can be accessed as:
* column[index_id.column]->index[index_id.index_rank]
*/
typedef struct INDEX_ID {
/* the position of the column in the column table */
int column;
/* the position of the index in the index table of that column */
int index_rank;
} INDEX_ID;
/*
* This is a structure that models a column of a GTF file.
*/
typedef struct COLUMN {
/*
* the rank number of the column
*/
int num;
/*
* the column name: seqid, source, feature, start, end, score, strand,
* phase or attributes
*/
char *name;
/*
* the default value to print if no value is available (".")
*/
char *default_value;
/*
* the indexes built on this column, stored as a table of INDEX pointers. It
* contains only one pointer for each column, except for the attributes
* column, which holds as many indexes as needed (one can index data on
* several attributes).
* NOTE(review): the original comment called this "a linked list"; the
* declared type and the access pattern documented for INDEX_ID
* (column[...]->index[...]) are those of a table. INDEX also has a `next`
* member; confirm which one the implementation relies on.
*/
INDEX **index;
/*
* the number of indexes in the previous table
*/
int nb_index;
} COLUMN ;
/*
* A list of row numbers associated with a token (the values in the first 8
* columns or the values associated with an attribute in the last column).
* This structure is used as the element type of the indexes.
*/
typedef struct ROW_LIST {
/*
* the token that is contained in the rows. For example, this can be "gene"
* or "transcript" for an index on the column feature, or "protein_coding"
* and "lincRNA" for an index on the attribute "gene_biotype".
*/
char *token;
/*
* the number of rows, i.e. the number of entries in the row table below
*/
int nb_row;
/*
* the table of row numbers
*/
int *row;
} ROW_LIST;
/*
 * A container for any tabulated text, for example the result of the
 * extract_data function. Functions returning a RAW_DATA are "terminal"
 * functions: this kind of result cannot be used as the input of another
 * function.
 */
typedef struct RAW_DATA {
    /* The number of rows */
    int nb_rows;
    /* The number of columns */
    int nb_columns;
    /* The name of the columns */
    char **column_name;
    /* The data (nb_rows x nb_columns character strings) */
    char ***data;
} RAW_DATA;
/*
* This structure is used to store a list of strings. It is useful as a
* function return type, as for get_attribute_list in the get_list.c source
* file. It is also used as a hashtable element to discard redundant rows in
* extract_data.
*/
typedef struct STRING_LIST {
/*
* the strings
*/
char **list;
/*
* the size of the previous list (number of strings)
*/
int nb;
} STRING_LIST;
/*
 * Used by the get_sequences function to model exons, introns ...
 */
typedef struct SEQFRAG {
    /* first position of the fragment (coordinate convention not shown here) */
    int start;
    /* last position of the fragment */
    int end;
    /* strand of the fragment, as a single character */
    char strand;
} SEQFRAG;
/*
 * A named interval with two pairs of coordinates.
 * NOTE(review): undocumented in the original header; presumably start/end
 * are genomic coordinates and tr_start/tr_end are the coordinates relative
 * to the transcript. Confirm against get_sequences.
 */
typedef struct FEATURE {
    /* the name of the feature */
    char *name;
    int start;
    int end;
    int tr_start;
    int tr_end;
} FEATURE;
/*
* A set of FEATURE.
*/
typedef struct FEATURES {
/* a table of pointers to FEATURE */
FEATURE **feature;
/* presumably the number of entries in feature (TODO confirm) */
int nb;
} FEATURES;
/*
 * One sequence together with its identifiers, its location and its features.
 * NOTE(review): undocumented in the original header; the member descriptions
 * are inferred from the member names. Confirm against get_sequences.
 */
typedef struct SEQUENCE {
    /* presumably the header line printed for the sequence */
    char *header;
    /* the sequence itself, as a character string */
    char *sequence;
    /* the strand; a single character, not a string */
    char strand;
    char *seqid;
    char *gene_id;
    char *transcript_id;
    char *gene_name;
    char *gene_biotype;
    int start;
    int end;
    /* the features attached to the sequence */
    FEATURES *features;
} SEQUENCE;
/*
* A set of SEQUENCE; this is the return type of get_sequences.
*/
typedef struct SEQUENCES {
/* presumably the number of entries in sequence (TODO confirm) */
int nb;
/* a table of pointers to SEQUENCE */
SEQUENCE **sequence;
} SEQUENCES;
/*
* Used by the get_list function to store and return the results as a matrix
* of character strings.
*/
typedef struct TTEXT {
/* presumably the number of rows of the matrix (TODO confirm) */
int size;
/* the matrix of character strings */
char ***data;
} TTEXT;
/*
* Used by add_exon_number to sort exons by their start value.
*/
typedef struct SORT_ROW {
/* presumably the number of the row holding the exon (TODO confirm) */
int row;
/* presumably the sort key, i.e. the start value of the exon (TODO confirm) */
int value;
} SORT_ROW;
/*
* General information about a BLAST run; embedded in BLAST_HSP.
* NOTE(review): undocumented in the original header; the member descriptions
* are inferred from the member names. Confirm against load_blast.
*/
typedef struct BLAST_HEADER {
/* presumably the name of the BLAST program */
char *program_name;
/* presumably the name of the searched database */
char *database_name;
/* presumably the total length of the database; note the unsigned type */
unsigned int database_length;
/* presumably the number of sequences in the database */
int database_nb_sequences;
} BLAST_HEADER;
/*
* Information about a BLAST query; embedded in BLAST_HSP.
* NOTE(review): undocumented in the original header; the member descriptions
* are inferred from the member names. Confirm against load_blast.
*/
typedef struct BLAST_QUERY {
/* presumably the name of the query sequence */
char *query_name;
/* presumably the length of the query sequence */
int query_length;
/* presumably the number of subjects reported for the query */
int nb_subject;
} BLAST_QUERY;
/*
* Information about a BLAST subject; embedded in BLAST_HSP.
* NOTE(review): undocumented in the original header; the member descriptions
* are inferred from the member names. Confirm against load_blast.
*/
typedef struct BLAST_SUBJECT {
/* presumably the name of the subject sequence */
char *subject_name;
/* presumably the length of the subject sequence */
int subject_length;
/* presumably the number of HSPs reported for the subject */
int nb_HSP;
} BLAST_SUBJECT;
/*
 * One BLAST HSP together with a copy of the header, query and subject
 * information it belongs to (the three structures are embedded by value).
 * NOTE(review): undocumented in the original header; the member descriptions
 * are inferred from the member names. Confirm against load_blast.
 */
typedef struct BLAST_HSP {
    BLAST_HEADER bh;
    BLAST_QUERY bq;
    BLAST_SUBJECT bs;
    double score;
    double expect;
    /* identities, kept as a character string */
    char *identities;
    int identities_percent;
    /* gaps, kept as a character string */
    char *gaps;
    int gap_percent;
    /* strands, one character each */
    char strand_query;
    char strand_subject;
    /* boundaries of the HSP on the query and on the subject */
    int query_start;
    int query_end;
    int subject_start;
    int subject_end;
} BLAST_HSP;
/*
 * Prototypes for the visible functions (callable by external client).
 *
 * A GTF_DATA pointer is first obtained from load_GTF (or load_blast); most of
 * the other functions take it as their first argument and many return
 * another GTF_DATA pointer that can be chained into a further call.
 * Functions returning RAW_DATA are terminal: their result cannot be the
 * input of another function.
 *
 * NOTE(review): this header does not say who owns the returned pointers;
 * confirm in the implementation which results must be released with
 * free_gtf_data / free_raw_data / free_mem.
 */
/* input: the name of a GTF file */
GTF_DATA *load_GTF(char *input);
/*
 * The last parameter was originally named `not`. That identifier is a macro
 * of <iso646.h> in C and an operator token in C++, so the prototype could not
 * be compiled in either context. Parameter names of a prototype affect
 * neither the callers nor the function definition.
 */
GTF_DATA *select_by_key(GTF_DATA *gtf_data, char *key, char *value, int not_flag);
void print_gtf_data(GTF_DATA *gtf_data, char *output, int add_chr);
GTF_DATA *select_by_transcript_size(GTF_DATA *gtf_data, int min, int max);
GTF_DATA *select_by_number_of_exon(GTF_DATA *gtf_data, int min, int max);
GTF_DATA *select_by_genomic_location(GTF_DATA *gtf_data, int nb_loc, char **chr, int *begin_gl, int *end_gl);
RAW_DATA *extract_data(GTF_DATA *gtf_data, char *key, int base, int uniq);
void print_raw_data(RAW_DATA *raw_data, char delim, char *output);
/* type: presumably one of SHORTEST_TRANSCRIPT, LONGEST_TRANSCRIPT, MOST5P_TRANSCRIPT */
GTF_DATA *select_transcript(GTF_DATA *gtf_data, int type);
SEQUENCES *get_sequences(GTF_DATA *gtf_data, char *genome_file, int intron, int rc);
int free_gtf_data(GTF_DATA *gtf_data);
int free_raw_data(RAW_DATA *raw_data);
char *get_memory(long int size);
int free_mem(char *ptr);
/* The get_*_list functions return their results as a matrix of character strings. */
TTEXT *get_feature_list(GTF_DATA *gtf_data);
TTEXT *get_seqid_list(GTF_DATA *gtf_data);
TTEXT *get_attribute_list(GTF_DATA *gtf_data);
TTEXT *get_attribute_values_list(GTF_DATA *gtf_data, char *attribute);
GTF_DATA *convert_to_ensembl(GTF_DATA *gtf_data);
GTF_DATA *add_attributes(GTF_DATA *gtf_data, char *features, char *key, char *new_key, char *inputfile_name);
GTF_DATA *del_attributes(GTF_DATA *gtf_data, char *features, char *keys);
GTF_DATA *select_by_positions(GTF_DATA *gtf_data, int *pos, int size);
GTF_DATA *add_exon_number(GTF_DATA *gtf_data, char *exon_number_field);
GTF_DATA *add_prefix(GTF_DATA *gtf_data, char *features, char *key, char *txt, int suffix);
GTF_DATA *merge_attr(GTF_DATA *gtf_data, char *features, char *keys, char *dest_key, char *sep);
GTF_DATA *load_blast(char *input);
GTF_DATA *add_attr_to_pos(GTF_DATA *gtf_data, char *inputfile_name, char *new_key);
void clear_indexes(void);
GTF_DATA *add_attr_column(GTF_DATA *gtf_data, char *inputfile_name, char *new_key);
int int_array_test(int *pos, int size);
#endif /* GTFTOOLKIT_GTFTK_SRC_LIB_LIBGTFTK_H_ */