# collectrare50data.py
import os
import json, ujson, csv
from constant import DATA_DIR, MIMIC_3_DIR
# Directory the filtered "50l" JSON/CSV files are written to. It is the same
# MIMIC_3_DIR that datato_50l reads its input files from.
#
# The *_ids / *_labels lists that follow are parallel, position-aligned tables,
# one pair per split (train, dev, test). Each id is a string compared against
# "<subject_id>_<hadm_id>" of a record; the label at the same position replaces
# that record's label field. Some label entries contain several ';'-separated
# values.
output_path = MIMIC_3_DIR
train_ids = ['406_113900', '518_115629', '679_110573', '954_176383', '1203_147462', '1407_162470', '1520_102809', '1538_196547', '2012_122070', '2383_160878', '2553_126138', '2602_196111', '2707_100626', '3867_140636', '4052_179307', '4246_157628', '4635_161168', '4929_167945', '4961_156050', '5030_138683', '5268_120061', '5336_114433', '5727_138942', '5727_183910', '5933_115681', '6091_156981', '6231_149561', '6317_145875', '6317_194396', '6351_116962', '6917_153716', '6927_148457', '7093_115161', '7190_191465', '7199_150157', '7241_163199', '7254_160980', '7386_105665', '7558_152799', '8499_130407', '9058_115024', '9061_132744', '9474_123878', '10304_146726', '10616_122738', '10667_136092', '10696_134006', '11272_186072', '11413_150164', '11505_106933', '12110_105928', '12198_108387', '12257_141962', '12291_164933', '12332_123526', '12618_187332', '12938_179057', '13628_115231', '13895_146421', '14400_148718', '14672_147841', '14757_192174', '15127_130189', '15295_199475', '15441_121169', '15489_175196', '16156_142237', '16553_138869', '16650_176541', '16992_136218', '17364_104593', '17707_102227', '17759_182475', '18968_133722', '19029_108677', '19044_168558', '19059_154958', '19355_105433', '19470_115735', '19558_101168', '19582_152643', '20237_141063', '20293_147927', '20950_185250', '21090_185350', '21314_111263', '21914_112824', '21973_127752', '22700_154513', '22992_182579', '23207_171490', '23675_125101', '23872_160602', '24110_117662', '24164_164901', '24226_118785', '24477_127064', '24519_124747', '25231_113767', '25330_137981', '26127_161400', '26474_146771', '26588_149063', '27079_173730', '27083_110408', '27478_156536', '27536_127047', '27660_134321', '27768_130656', '27894_136871', '28033_145544', '28125_147262', '28381_198580', '28620_128017', '28628_108334', '28774_154487', '29137_188600', '29496_185056', '29504_181564', '29609_127784', '29894_170918', '30177_180920', '30507_122101', '30711_164779', '30713_169358', '31033_120713', '31121_199748', 
'31123_154148', '31614_125573', '31935_176163', '32203_111420', '32376_160949', '40227_163305', '40577_135411', '41282_156578', '42820_127889', '43083_152395', '43126_132026', '43151_118917', '43926_185036', '44468_177637', '44656_130791', '44908_173941', '45489_105911', '46305_152478', '46588_179696', '46676_101460', '48397_131741', '48636_133237', '48771_165326', '48950_165922', '50007_117512', '50362_109359', '50434_163782', '50507_145537', '50899_172060', '50899_193309', '51538_170567', '51601_197326', '51821_197028', '52622_157538', '52736_184633', '52986_186209', '53001_109747', '53152_182753', '53216_195381', '53258_132104', '53626_155583', '53695_111574', '53832_101528', '54183_150129', '54205_177558', '54289_162528', '54523_164156', '55083_186985', '55186_164063', '55308_131281', '55646_135160', '55713_183070', '56327_119001', '57102_173980', '57445_100085', '57764_103584', '57817_127483', '58005_196023', '58891_111619', '59005_180743', '59485_108441', '59834_151679', '60580_150902', '60805_143452', '61455_110475', '61472_128606', '61597_142151', '61940_188349', '62004_123404', '62527_154061', '62681_131593', '62930_176910', '63402_117508', '63755_111041', '64024_102769', '65689_143433', '65741_173092', '66037_157729', '66283_180114', '66710_184546', '66770_138200', '66835_131248', '68941_111436', '68998_176037', '69250_138381', '69371_155261', '69426_110202', '69596_174074', '70156_174073', '70491_165089', '70514_187690', '71059_111156', '72260_148924', '72273_133553', '72459_120861', '72930_123525', '72978_198761', '73043_166944', '73695_108174', '73970_130239', '77484_125750', '77511_116682', '77608_103903', '77842_166955', '78342_179212', '78565_147907', '78704_109026', '79075_196132', '79285_138379', '79664_161109', '81111_179864', '81247_102230', '81461_170377', '81579_109770', '81660_159603', '81660_198618', '81737_180701', '81783_146223', '82439_170348', '82929_171568', '82938_170239', '83034_167891']
train_labels = ['990', '378.52', '737.43', '607.82', '569.42', '873.52', '53.02', '40.7', '506.0', '52.0', '607.82', '990', '318.2', '780.94', '362.11', '148.1', '252.08', '569.42', '569.42', '569.42', '378.52', '550.11', '719.49', '506.0', '710.8', '747.69', '959.7', '282.2', '282.2', '34.71', '40.7', '378.52', '351.9', 'V26.52', '282.2', '282.2', '955.3', '52.0', '607.82', '252.08', '447.9', 'V65.3', '40.7', '53.02', '447.9', '959.7', '737.43', '52.0', '148.1', '506.0', '512.2', '719.49', '816.02', '955.3', 'V18.3', '351.9', '38.47', '569.42', '506.0', '378.52', 'V26.52', '351.9', '252.08', '52.0', 'V10.61', '252.08', '785.9', '38.47', '34.71', 'V65.3', '270.6', '737.43', '959.7', 'V18.3', '318.2', '816.02', 'V65.3', '955.1', '998.01', '816.02', '53.02', '990', '790.8', '873.52', '737.43', '955.3', '813.32', '990', '701.0', '40.7', '816.02', 'V10.61', '282.2', '202.82', '955.3', '171.0', '52.0', '719.49', '955.1', '813.32', 'V18.3', '737.43', '701.0', '38.47', '77.7', '958.91', '338.28', '362.11', '955.1', '148.1', '785.9', '710.8', '785.9', 'V26.52', '318.2', '38.47', '477.8', '338.28', '270.6', 'V10.61', '34.71', '710.8', '710.8', '747.69', '737.43', '252.08', '955.1;40.7', '959.7', '506.0', '607.82', '282.2', '40.7', '17.36', '813.32', '958.91', '338.28', '282.2', '790.8', '252.08', '998.01', '999.82', '351.9', '453.50', '477.8', '780.94', '596.89', '701.0', 'V65.3', '701.0', '873.52', '453.50', '453.50', '816.02', '282.2', '202.82', '477.8', '477.8', '955.3', '747.69', '790.8', '550.11', '998.01', '958.91', '318.2', '999.82', '252.08', '53.02', '719.49', 'V10.61', '171.0', '998.01', '362.11', '998.01', '477.8', '77.7', '351.9', '813.32', '362.11', '17.36', '447.9', '176.0', '512.2', '958.91', '77.7', '270.6', 'V26.52', '34.71', '17.36', '506.0', '596.89', '958.91', '447.9', 'V26.52', '512.2', '785.9', '77.7', 'V10.61', '40.7', '148.1', '550.11', '512.2', '378.52', '477.8', '747.69', 'V26.52', '999.82', '447.9', '998.01', '270.6', '999.82', '719.49', '701.0', 
'998.01', '596.89', '202.82', '569.42', '171.0', 'V65.3', '351.9', '999.82', '447.9', '52.0', '596.89', '202.82', '252.08', '790.8', '17.36', '338.28', 'V65.3', '171.0', '780.94', '338.28', '550.11', '737.43', '998.01', '351.9', '747.69', 'V18.3', '77.7', '737.43', '790.8', '990', '453.50', 'V65.3', '780.94', '176.0', '873.52', '77.7', 'V18.3']
dev_ids = ['83728_117847', '83785_161477', '84206_113320', '84650_181406', '85184_153859', '85417_139769', '85417_162413', '85539_196491', '85627_117999', '85940_191980', '86498_176343', '86824_114735', '86866_138801', '87879_155607', '88217_142411', '88409_172502', '88484_141735', '88660_197599', '89092_117191', '89092_144923']
dev_labels = ['447.9', '282.2', '998.01', '477.8', '477.8', 'V26.52', 'V26.52', '477.8', '378.52', '959.7', '999.82', '318.2', '202.82', '202.82', '378.52', '958.91', '282.2', 'V18.3', '710.8', '710.8']
test_ids = ['89303_139801', '89416_100985', '89734_156733', '89760_161352', '89831_149366', '89875_171715', '89929_118735', '90020_181840', '90040_186956', '90208_114419', '90403_103315', '90508_142580', '90540_183305', '90688_162391', '90690_158221', '90776_181396', '90843_198603', '90863_152704', '90926_181337', '90959_136680', '91043_198158', '91101_144211', '91280_114683', '91343_131485', '91376_193978', '91399_152252', '91461_158919', '91579_142486', '91633_167703', '91744_190701', '91855_174847', '91859_146208', '91975_136078', '92137_179704', '92195_171402', '92235_155917', '92381_123670', '92420_117169', '92420_174251', '92464_122116', '92528_110863', '92580_165339', '92590_159846', '92613_160265', '92685_194703', '92800_172885', '92865_131271', '92989_162725', '93078_142712', '93078_174921', '93227_153146', '93265_108360', '93318_153526', '93336_193535', '93541_160567', '93610_164181', '93653_156750', '93831_107720', '93836_197401', '93950_192009', '94025_145724', '94029_111325', '94081_140901', '94164_152085', '94329_147311', '94385_131186', '94401_137634', '94450_182673', '94525_154715', '94665_121127', '94979_153554', '94991_184424', '95011_195773', '95218_127006', '95247_102696', '95390_129275', '95512_111046', '95632_113336', '95637_114653', '95637_121747', '95638_115853', '95803_134466', '95864_172947', '95895_115708', '95895_125448', '95895_128252', '95895_140477', '95895_145111', '95895_149702', '95895_173179', '95895_199262', '96171_102997', '96260_110058', '96443_103219', '96719_156657', '96731_127839', '96741_138697', '96741_189530', '96777_114995', '96777_176399', '96881_153877', '96962_172970', '97178_160879', '97321_169423', '97382_157400', '97395_125435', '97441_123913', '97441_150758', '97441_173649', '97441_192102', '97531_198072', '97605_119699', '97677_169589', '97762_107220', '97783_115153', '97808_137417', '97849_150980', '97924_186334', '98003_188840', '98174_176070', '98177_190914', '98342_107047', '98525_168505', '98649_110990', 
'98673_137415', '98800_191113', '98851_105014', '98905_188430', '98930_143207', '98930_163484', '99020_139178', '99080_173330', '99231_102589', '99231_151778', '99647_109725', '99691_140367', '99712_159835', '99822_146997', '99822_163117', '99822_195871', '99982_183791', '99991_151118']
test_labels = ['V18.3', '318.2', '512.2', '77.7', '477.8;38.47', '710.8', '270.6', '351.9', '171.0', '40.7', '338.28', '477.8', '171.0', '959.7', '998.01', '447.9', '351.9', '351.9', '955.1', '40.7', '607.82', '813.32', '338.28', '52.0', '958.91;34.71', '785.9', '52.0', '252.08', '52.0', '34.71;34.71', '607.82', '202.82', '959.7', '351.9', '453.50', '362.11', '270.6', '378.52', '378.52', '550.11;53.02;53.02', '176.0', '999.82', '955.3', '998.01', '785.9', 'V10.61', '747.69', '17.36', 'V10.61', 'V10.61', '958.91', '40.7', '362.11', '40.7', '701.0', '252.08', '40.7', '477.8', '780.94', '447.9', '816.02', 'V65.3', '816.02', '998.01', '998.01', '569.42', '378.52', '701.0', '569.42', '999.82', '477.8', '958.91', '447.9', '816.02', '999.82', '318.2', '477.8', 'V26.52', '338.28', '569.42', 'V65.3', '719.49', '719.49', '737.43', '737.43', '710.8', '737.43', '737.43', '737.43', '710.8', '710.8', '873.52', '790.8', '747.69', '790.8', '282.2', '282.2', '282.2', '596.89', '596.89', '955.3', '17.36', '719.49', '990', 'V26.52', '990', '506.0', '506.0', '506.0', '506.0', '780.94', '252.08', '990', '747.69', '77.7', '351.9', 'V26.52', '550.11', '512.2', '38.47', '955.1;955.3', '453.50', 'V65.3', '710.8', '813.32', '873.52', '252.08', '202.82', 'V18.3', 'V18.3', 'V65.3', '77.7', '790.8', '790.8', '790.8', '701.0', '282.2', '148.1', '148.1', '148.1', '596.89', '17.36']
def datato_50l(mode, select_ids, select_lables, output_path):
    """Write the JSON and CSV subsets of one split restricted to ``select_ids``.

    Reads ``mimic3_{mode}.json`` and ``{mode}_full.csv`` from ``MIMIC_3_DIR``,
    keeps only the records whose ``<subject_id>_<hadm_id>`` key appears in
    ``select_ids``, overwrites each kept record's label field with the entry
    of ``select_lables`` at the same position, and writes the result to
    ``mimic3-50l_{mode}.json`` and ``{mode}_50l.csv`` under ``output_path``.

    Args:
        mode: Split name interpolated into the input and output file names.
        select_ids: Sequence of ``"<subject_id>_<hadm_id>"`` strings to keep.
            If an id occurs more than once, its first position is used.
        select_lables: Sequence of replacement labels, positionally aligned
            with ``select_ids``.
        output_path: Directory the two output files are written to.

    Raises:
        IndexError: If a kept id sits at a position that ``select_lables``
            does not have.
    """
    # Map each id to the position of its first occurrence. This gives the same
    # answer as select_ids.index(key) but costs one dict lookup per record
    # instead of two linear scans of the id list.
    first_position = {}
    for position, record_id in enumerate(select_ids):
        first_position.setdefault(record_id, position)

    # ---- JSON split ----
    json_in = os.path.join(MIMIC_3_DIR, f"mimic3_{mode}.json")
    json_out = os.path.join(output_path, f"mimic3-50l_{mode}.json")
    with open(json_in, "r") as f:
        records = ujson.load(f)

    kept_records = []
    for record in records:
        key = f"{record['subject_id']}_{record['hadm_id']}"
        position = first_position.get(key)
        if position is None:
            continue
        # The loaded record is updated in place and reused in the output.
        record['LABELS'] = select_lables[position]
        kept_records.append(record)

    with open(json_out, "w") as f:
        json.dump(kept_records, f, indent=4)

    # ---- CSV split ----
    csv_in = os.path.join(MIMIC_3_DIR, f"{mode}_full.csv")
    csv_out = os.path.join(output_path, f"{mode}_50l.csv")

    kept_rows = []
    # newline="" lets the csv module handle line endings (and newlines embedded
    # in quoted fields) itself, as its documentation requires.
    with open(csv_in, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the input header; tolerate an empty file
        for row in reader:
            position = first_position.get(row[0] + "_" + row[1])
            if position is None:
                continue
            row[3] = select_lables[position]  # column 3 is LABELS
            kept_rows.append(row)

    # csv.reader strips quoting on input, so rows must go back out through
    # csv.writer: a plain ",".join(row) would corrupt any field that contains
    # a comma, a quote or a newline. Fields without such characters are
    # written exactly as a plain join would write them.
    with open(csv_out, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["SUBJECT_ID", "HADM_ID", "TEXT", "LABELS", "length"])
        writer.writerows(kept_rows)
# Build the filtered files for every split, in the order test, train, dev.
_SPLITS = (
    ('test', test_ids, test_labels),
    ('train', train_ids, train_labels),
    ('dev', dev_ids, dev_labels),
)
for _mode, _ids, _labels in _SPLITS:
    datato_50l(_mode, _ids, _labels, output_path)
print("Done")