-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-function-frequency.py
executable file
·101 lines (86 loc) · 4.33 KB
/
parse-function-frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
# Import required modules
import os
import sys
import pandas as pd
def main():
help_string = """
Synopsis:
python parse-function-frequency.py <input-dir> <mode> <frequency-out> <counts-out>
Description:
This is an auxiliary utility for the script anvi-script-frequency-enrichment.sh
and is not generally intended for independent use. For example, it has no input
validation and relies on its parent script always passing well-formed input. For
that script to work, both scripts must be in the same directory.
The input directory should contain one file per genome listing annotation information.
The mode parameter allows you to choose how the annotations should be interpreted.
The available modes are listed below.
'annotation' Provide the number of times an annotation is present in each genome.
'gene' Provide the number of genes with a given annotation in each genome.
The last two arguments are the file paths to use for output..
"""
version_string = """
Last updated 4 November 2021
"""
if "-h" in sys.argv or "--help" in sys.argv:
print(help_string)
sys.exit(1)
#Handle arguments
input_dir = sys.argv[1]
mode = sys.argv[2]
freq_out = sys.argv[3]
count_out = sys.argv[4]
# Import the data files we need and extract the relevant genome names
file_list = os.listdir(input_dir)
file_list = list(filter(lambda x: "-functions.tmp" in x, file_list))
genomes = [name.replace("-functions.tmp","") for name in file_list]
# Each input file will have the following format, where each line represents a single gene:
# accession function
# X textX
# Y!!!Z textY!!!textZ
# The output file will have the following format:
# accession function genome1 genome2 genome3 ...
# X textX N1_X N2_X N3_X ...
# Y textY N1_Y N2_Y N3_Y ...
# Z textZ N1_Z N2_Z N3_Z ...
# Create a DataFrame to store the frequency output
cols = ['accession', 'function']
cols.extend(genomes)
freq_df = pd.DataFrame(columns=cols)
# For each genome's file...
for func_file in file_list:
filename = input_dir + '/' + func_file
genome = func_file.replace("-functions.tmp","")
func_df = pd.read_csv(filename, sep='\t')
# For each annotated gene...
for index, row in func_df.iterrows():
accessions = row['accession'].split("!!!")
functions = row['function'].split("!!!")
# Add a row to the output for any annotations not already present
for accession, function in zip(accessions, functions):
if accession not in freq_df['accession'].values:
newrow = [accession, function]
newrow.extend([0]*len(genomes))
freq_df = pd.concat([freq_df, pd.DataFrame(columns=cols, data=[newrow])])
# We have to handle annotations differently depending on the output mode
if mode == 'annotation':
# Increment the annotation count(s) for this genome by stepping through
# each annotation for the gene. If an accession is present multiple times,
# its counter will increment multiple times.
for accession in accessions:
freq_df.loc[freq_df['accession'] == accession, genome] += 1
elif mode == 'gene':
# Increment the annotation count(s) for this genome by testing if an
# accession is present on a gene. If an accession is present multiple times,
# its counter will increment one time.
freq_df.loc[freq_df['accession'].isin(accessions), genome] += 1
# Output final counts of annotations to be used by later scripts.
# The values of N are only meaningful in 'annotation' mode.
count_df = pd.DataFrame(columns=['genome', 'N'])
for genome in genomes:
count = freq_df[genome].sum()
count_df = pd.concat([count_df, pd.DataFrame([[genome, count]], columns=['genome', 'N'])])
freq_df.to_csv(freq_out, sep='\t', index=False)
count_df.to_csv(count_out, sep='\t', index=False)
if __name__ == "__main__":
main()