-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdevinfo_mdb2csv.py
236 lines (192 loc) · 8.7 KB
/
devinfo_mdb2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import sys
import pandas as pd
import numpy as np
import time as time
import csv
import db_read
# "Z:\devinfo\cam\CAMInfo 2016_r8_DECRY
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#
# "Z:\devinfo\cam\CAMInfo 2016_r8_DECRYPT.csv" km
# Subgroup_Type_Name distinct
def get_subgroup_name_distinct(df):
df_subgroup_name = df['Subgroup_Type_Name']
df_subgroup_name.drop_duplicates(inplace=True)
return df_subgroup_name
'''
This function gets the Dimensions table in the following form:
Subgroup_Val_NId Subgroup_Type_Name Subgroup_Name
1 Location Total
2 Sex Male
2 Age 65+ yr
3 Sex Male
3 Age 55-64 yr
And returns a table in the following form:
Subgroup_Val_NId Location Sex Age Other
1 Total NaN NaN NaN
2 NaN Male 65+ yr NaN
3 NaN Male 55-64 yr NaN
Creating a column for each Subgroup_Type_Name and adding the proper value when available
'''
def create_dimensions_table(df):
# gets the unique values of the Subgroup_Type_Name
df_subgroup_name = get_subgroup_name_distinct(df)
# creates the new set of columns in the Pandas dataframe: an id column + all the Unique subgroups found
new_cols = {'Subgroup_Val_NId': np.int64}
for c in df_subgroup_name:
new_cols.update({c: np.str})
df_dims = pd.DataFrame(columns=new_cols)
'''
For each row: add the value in the right column
Get the id (val_nid)
if exists get the location if not create a new one
get the row (line)
get the name contained in the Subgroup_Type_Name column (dim_name)
get the value contained in the Subgroup_Name column (dim_value)
add the value in the right column
'''
for i in range(0, len(df)):
val_nid = df.iloc[i, 0] # df.iloc gets the row i col 0
exists = df_dims.loc[df_dims['Subgroup_Val_NId'] == val_nid]
if exists.empty:
df_dims = df_dims.append({'Subgroup_Val_NId': val_nid}, ignore_index=True)
line = df.iloc[i]
dim_name = line[1]
dim_value = line[2]
df_dims.loc[df_dims['Subgroup_Val_NId'] == val_nid, dim_name] = dim_value
return df_dims
'''Extracts the facts table and merges all the additional information
Renames the columns to hanlde the multilanguage
the dataFrameName.groupby('Indicator_NId').apply(lambda x: ','.join(x.COLUMN_NAME)) is used to join all the values having the same id in one row, separated by comma
e.g.
id:25 SUBSECTOR: Primary Education
id:25 SUBSECTOR: Secondary Education
id:25 SUBSECTOR: Upper Secondary Education
Becomes
id:25 SUBSECTOR: Primary Education, Secondary Education, Upper Secondary Education
The operation doesn't seem to be available in Access so it is done in Pandas
'''
def extract_dataframe(access_database, lang):
db = db_read.DbRead(access_database)
print("Extracting data from the database")
# Extract the facts and dimensions data from the database
dfFacts = db.get_dataframe(db_read.Q_FACTS, lang)
dfDims = db.get_dataframe(db_read.Q_DIMS, lang)
# creates the dimensions table
dfAllDims = create_dimensions_table(dfDims)
# extracts the sectors table
dfSectors = db.get_dataframe(db_read.Q_SECTORS, lang)
if len(dfSectors) > 0:
dfSectors = dfSectors.groupby('Indicator_NId').apply(lambda x: ','.join(x.SECTOR))
tmpdf = pd.DataFrame()
tmpdf['SECTOR'] = dfSectors
dfSectors = tmpdf
# extracts the subsectors table
dfSubsectors = db.get_dataframe(db_read.Q_SUBSECTORS, lang)
if len(dfSubsectors) > 0:
dfSubsectors = dfSubsectors.groupby('Indicator_NId').apply(lambda x: ','.join(x.SUBSECTOR))
tmpdf = pd.DataFrame()
tmpdf['SUBSECTOR'] = dfSubsectors
dfSubsectors = tmpdf
# extracts the sources table
dfSources = db.get_dataframe(db_read.Q_SOURCES, lang)
# extracts the source agency table
dfSource_agency = db.get_dataframe(db_read.Q_SOURCE_AGENCY, lang)
if len(dfSource_agency) > 0:
dfSource_agency = dfSource_agency.groupby('Indicator_NId').apply(lambda x: ','.join(x.SOURCE_AGENCY))
tmpdf = pd.DataFrame()
tmpdf['SOURCE_AGENCY'] = dfSource_agency
dfSource_agency = tmpdf
# extracts the agency table
dfAgency = db.get_dataframe(db_read.Q_AGENCY, lang)
if len(dfAgency) > 0:
dfAgency = dfAgency.groupby('Indicator_NId').apply(lambda x: ','.join(x.AGENCY))
tmpdf = pd.DataFrame()
tmpdf['AGENCY'] = dfAgency
dfAgency = tmpdf
# extracts the goals table
dfGoals = db.get_dataframe(db_read.Q_GOALS, lang)
if len(dfGoals):
dfGoals = dfGoals.groupby('Indicator_NId').apply(lambda x: ','.join(x.GOALS))
tmpdf = pd.DataFrame()
tmpdf['GOALS'] = dfGoals
dfGoals = tmpdf
# extracts the themes table
dfThemes = db.get_dataframe(db_read.Q_THEMES, lang)
if len(dfThemes) > 0:
dfThemes = dfThemes.groupby('Indicator_NId').apply(lambda x: ','.join(x.THEMES))
tmpdf = pd.DataFrame()
tmpdf['THEMES'] = dfThemes
dfThemes = tmpdf
print("mapping...")
# A set of joins to generate the final table
dfFacts['Subgroup_Val_NId'] = dfFacts['Subgroup_Val_NId'].astype(np.int64)
dfFacts['Indicator_NId'] = dfFacts['Indicator_NId'].astype(np.int64)
dfFacts = pd.merge(dfFacts, dfSectors, on='Indicator_NId', how='left')
dfFacts = pd.merge(dfFacts, dfSubsectors, on='Indicator_NId', how='left')
dfFacts = pd.merge(dfFacts, dfSources, on='data_id', how='left')
dfFacts = pd.merge(dfFacts, dfSource_agency, on='Indicator_NId', how='left')
dfFacts = pd.merge(dfFacts, dfAgency, on='Indicator_NId', how='left')
dfFacts = pd.merge(dfFacts, dfGoals, on='Indicator_NId', how='left')
if dfThemes.empty:
dfFacts['THEMES'] = ""
else:
dfFacts = pd.merge(dfFacts, dfThemes, on='Indicator_NId', how='left')
dfFacts = pd.merge(dfFacts, dfAllDims, how='left', left_on='Subgroup_Val_NId', right_on='Subgroup_Val_NId')
dimCols = get_subgroup_name_distinct(dfDims)
# re arrange the columns: the first set of columns (always present)
cols_position = ['data_id', 'SECTOR', 'SUBSECTOR', 'GOALS', 'THEMES', 'AGENCY', 'SOURCE_AGENCY', 'SOURCE',
'INDICATOR']
# the ones generated by "splitting" the Subgroup_Type_Name
cols_position.extend(dimCols.values.tolist())
# the last ones (always present).
cols_position.extend(
['REF_AREA', 'REF_AREA_CODE', 'REF_AREA_TYPE', 'TIME_PERIOD', 'UNIT_MEASURE', 'OBS_VALUE', 'FOOTNOTE'])
# re arrange the columns
dfFacts = dfFacts[cols_position]
# rename the columns, data_id and OBS_VALUE are not renamed, append the language code to the other ones
newColNames = []
for c in dfFacts.columns:
if (c == 'data_id' or c == 'OBS_VALUE'):
newColNames.append(c)
else:
newColNames.append(c + "_" + lang)
# rename facts ID col from data_id to id
newColNames[0] = 'id'
dfFacts.columns = newColNames #assign the new col names
return dfFacts
def main(argv):
# Check and parse the parameters
if len(argv) < 3:
print("Use: python.exe devinfo_mdb2csv.py source destination <second language>")
return
access_database = argv[1]
output_csv = argv[2]
lang1 = 'en'
lang2 = None
if (len(argv) == 4):
if (len(argv[3]) == 2):
lang2 = argv[3]
else:
print("The additional language must be the two letter code: e.g. sp")
return
print("reading " + access_database + "...")
# pandas options used for debug reasons to show all the dataframe can be deleted
pd.set_option('display.expand_frame_repr', False)
time1 = time.time()
# calls the extract datafextract_dataframe function to retrieve the data to be written to CSV
dfFacts = extract_dataframe(access_database, lang1)
# repeats the extraction if a second language is available (and passed as param)
if lang2 is not None:
print('extracting ' + lang2 + ' language from database...')
dfFacts2 = extract_dataframe(access_database, lang2)
dfFacts2.drop(columns=['OBS_VALUE'], axis=1, inplace=True)
dfFacts = pd.merge(dfFacts, dfFacts2, how='inner', left_on='id', right_on='id')
print("saving to " + output_csv + "...")
dfFacts.to_csv(output_csv, index=False, sep=",", quoting=csv.QUOTE_NONNUMERIC, quotechar='"',
encoding='utf-8')
time2 = time.time()
print("done.")
print(str(round((time2 - time1), 1)) + " seconds")
if __name__ == "__main__":
main(sys.argv)