-
Notifications
You must be signed in to change notification settings - Fork 5
/
similarities.py
601 lines (467 loc) · 22.4 KB
/
similarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
from owlready2 import *
from tabulate import tabulate
import itertools
import pandas as pd
import os
import seaborn as sns
import json
import urllib
import pandas as pd
import convert
import time
import sys
sys.setrecursionlimit(100000)
#"C:\Program Files\Java\jdk-11.0.2\bin\javaw.exe"
def _first_annotation(entity, attribute):
    """Return the first value of annotation ``attribute`` of ``entity``.

    None is returned when the value cannot be read for any reason (``entity``
    is None, the annotation is not defined, it offers no ``first()``, ...).
    """
    try:
        value = getattr(entity, attribute).first()
        if isinstance(value, locstr):
            # some ontologies use locstrings to account for different languages
            # NOTE(review): split()[0] keeps only the first whitespace-separated
            # word of a localized label -- confirm that truncating multi-word
            # labels is intended.
            return value.split()[0]
        return value
    except Exception:
        return None


def ontology_classes_loader(ontology):
    """Collect the labelling information of every class of ``ontology``.

    Args:
        ontology: loaded ontology object providing ``classes()``,
            ``search_one(iri=...)`` and ``name``.

    Returns:
        dict mapping each class IRI (as str) to a dict with the keys
        ``"label"``, ``"prefLabel"``, ``"altLabel"`` and ``"name"``; a value
        is None when it could not be read. Returns None (after printing a
        message) when the classes of the ontology could not be listed.
    """
    try:
        class_iris = [cls.iri for cls in ontology.classes()]
    except Exception:
        print("IRIs of ontology " + ontology.name + " not (well) defined and could not be read!")
        return None

    iri_dict = {}
    for iri in class_iris:
        # Look the class up once per IRI instead of once per attribute access.
        try:
            entity = ontology.search_one(iri=iri)
        except Exception:
            entity = None
        try:
            class_name = entity.name
        except Exception:
            class_name = None
        iri_dict[str(iri)] = {
            "label": _first_annotation(entity, "label"),
            "prefLabel": _first_annotation(entity, "prefLabel"),
            "altLabel": _first_annotation(entity, "altLabel"),
            "name": class_name,
        }
    return iri_dict
####
def class_definition_readin(ontology_class):
    """Return the definition annotation of ``ontology_class``.

    The attributes ``definition``, ``hasDefinition``, ``IAO_0000115`` and
    ``comment`` are tried in this order; the first one that exists and is
    non-empty is returned unchanged.

    Args:
        ontology_class: object carrying the annotations. None (or any object
            without these attributes) is accepted.

    Returns:
        The first truthy attribute value found, or an empty list when none of
        the attributes is available.
    """
    for attribute in ("definition", "hasDefinition", "IAO_0000115", "comment"):
        try:
            candidate = getattr(ontology_class, attribute)
            if candidate:
                return candidate
        except Exception:
            # attribute not readable for this class -> try the next one
            continue
    return []
####
def get_ontology_URLs():
    """Return the URLs of the ontology files as {ontology name: URL}.

    The URLs are taken from the metadata delivered by
    convert.load_ontologies_metadata(), entry
    ["References"]["Persistent URI of Ontology File (or perma link to latest Version)"].
    """
    url_field = "Persistent URI of Ontology File (or perma link to latest Version)"
    metadata = convert.load_ontologies_metadata()
    return {
        onto_key: onto_md["References"][url_field]
        for onto_key, onto_md in metadata.items()
    }
####
####
def ttl_to_owl(url):
    """Download a ttl-ontology and convert it to an owl-ontology with ROBOT.

    If ./ontologies/<abbreviation>.owl already exists, nothing is downloaded
    or converted. Otherwise the ttl file is downloaded into ./ontologies/,
    merged in place and converted to owl by calling .\\robot\\robot.

    Args:
        url: URL of the ontology file; it has to be one of the values
            returned by get_ontology_URLs().

    Returns:
        File name (str) of the owl ontology inside the subdirectory
        ./ontologies/, built as "<ontology abbreviation>.owl".

    Raises:
        ValueError: if ``url`` is not contained in get_ontology_URLs().
    """
    # urllib.request is a submodule; a plain "import urllib" does not load it.
    import urllib.request

    filename = url.rpartition('/')[-1]  # last bit of URL after / , e.g. "filename.ttl"
    onto_name = filename.split('.')[0]
    # One single name is used for storing the download AND as ROBOT input, so
    # both agree even if the URL's file name contains additional dots.
    ttl_filename = onto_name + '.ttl'

    # Finds ontology abbreviation based on URL of ontology in ontology URL dictionary
    onto_URLs = get_ontology_URLs()
    onto_abbrev = list(onto_URLs.keys())[list(onto_URLs.values()).index(url)]
    ontology_output_filename = onto_abbrev + '.owl'

    # Only if no owl file is contained in ./ontologies/, the download, merge
    # and convert is executed.
    if not os.path.isfile('./ontologies/' + ontology_output_filename):
        with urllib.request.urlopen(url) as response:
            onto_txt = response.read()
        with open('./ontologies/' + ttl_filename, 'wb') as onto_file:
            onto_file.write(onto_txt)
        # NOTE(review): the command line is built from a URL-derived file name
        # and run through the shell; only use with trusted metadata.
        os.system(".\\robot\\robot merge --input .\\ontologies\\{} --output .\\ontologies\\{}".format(ttl_filename, ttl_filename))
        os.system(".\\robot\\robot convert --input .\\ontologies\\{} --format owl --output .\\ontologies\\{}".format(ttl_filename, ontology_output_filename))
    return ontology_output_filename
####
####
def rdf_to_owl(url):
    """Download an ontology file, store it as .rdf and convert it to owl with ROBOT.

    If ./ontologies/<abbreviation>.owl already exists, nothing is downloaded
    or converted. Otherwise the file behind ``url`` is stored as
    ./ontologies/<name>.rdf and converted to owl by calling .\\robot\\robot.

    Args:
        url: URL of the ontology file; it has to be one of the values
            returned by get_ontology_URLs().

    Returns:
        File name (str) of the owl ontology inside the subdirectory
        ./ontologies/, built as "<ontology abbreviation>.owl".

    Raises:
        ValueError: if ``url`` is not contained in get_ontology_URLs().
    """
    # urllib.request is a submodule; a plain "import urllib" does not load it.
    import urllib.request

    filename = url.rpartition('/')[-1]  # last bit of URL after /
    onto_name = filename.split('.')[0]
    rdf_filename = onto_name + '.rdf'

    # Finds ontology abbreviation based on URL of ontology in ontology URL dictionary
    onto_URLs = get_ontology_URLs()
    onto_abbrev = list(onto_URLs.keys())[list(onto_URLs.values()).index(url)]
    ontology_output_filename = onto_abbrev + '.owl'

    # Only if no owl file is contained in ./ontologies/, the download and
    # convert is executed.
    if not os.path.isfile('./ontologies/' + ontology_output_filename):
        with urllib.request.urlopen(url) as response:
            onto_txt = response.read()
        with open('./ontologies/' + rdf_filename, 'wb') as onto_file:
            onto_file.write(onto_txt)
        # NOTE(review): the command line is built from a URL-derived file name
        # and run through the shell; only use with trusted metadata.
        os.system(".\\robot\\robot convert --input .\\ontologies\\{} --format owl --output .\\ontologies\\{}".format(rdf_filename, ontology_output_filename))
    return ontology_output_filename
####
####
def load_ontology_from_name(onto_name):
    """Load the ontology registered under ``onto_name``.

    Depending on the ontology name and the ending of its URL (taken from
    get_ontology_URLs()) the ontology is loaded
      * from ./ontologies/<onto_name>.owl for 'CHEMINF' and 'M3',
      * from the URL for '.owl' URLs, falling back to
        ./ontologies/<onto_name>.owl if that fails,
      * via ttl_to_owl() for '.ttl' URLs,
      * via rdf_to_owl() for every other ending.

    Args:
        onto_name: key of the ontology in get_ontology_URLs().

    Returns:
        The loaded ontology, or None if loading failed in one of the guarded
        branches.

    Raises:
        KeyError: if ``onto_name`` is unknown to get_ontology_URLs().
        Exception: errors of the '.ttl' branch are not caught here.
    """
    onto_URLs = get_ontology_URLs()
    URL = onto_URLs[onto_name]
    local_owl_path = "./ontologies/" + onto_name + '.owl'
    onto_loaded = None
    if onto_name in ['CHEMINF','M3']:
        # contains deprecated classes and object properties, thus needs to be
        # cleaned and loaded manually, else owlready2 will crash
        try:
            print("Loading Ontology: {} from local path ./ontologies/".format(onto_name))
            onto_loaded = get_ontology(local_owl_path).load()
            print("Successfully loaded Ontology: {}".format(onto_name))
        except Exception:
            print("Need to place file here: ./ontologies/{}.owl".format(onto_name))
            onto_loaded = None
    elif URL.endswith('.owl'):
        try:
            print("Loading Ontology: {}".format(onto_name))
            onto_loaded = get_ontology(URL).load()
            print("Successfully loaded Ontology: {}".format(onto_name))
        except Exception:
            print("Something went wrong, ontology name: {}".format(onto_name))
            # fall back to a manually placed local copy
            try:
                print("Trying to load Ontology: {} from local path ./ontologies/".format(onto_name))
                onto_loaded = get_ontology(local_owl_path).load()
                print("Successfully loaded Ontology: {}".format(onto_name))
            except Exception:
                print("Something went wrong, you need to place the owl-file here: ./ontologies/{}.owl".format(onto_name))
                onto_loaded = None
    elif URL.endswith('.ttl'):
        # NOTE(review): unlike the other branches, failures here propagate to
        # the caller instead of yielding None.
        print("Ontology {} is provided as ttl, searching for owl verison of ontology in subdir ./ontologies/ and converting ontology from ttl to owl if not found".format(URL))
        ontology_in_owl = ttl_to_owl(URL)
        onto_loaded = get_ontology('./ontologies/' + ontology_in_owl).load()
    else:
        #TODO: try to load from ./ontologies/ if there is a manual added version of the OWL, such as for OntoCAPE.
        print("Unknown file-ending for ontology {}, please check the URL!\n URL: {}\n".format(onto_name, URL))
        try:
            ontology_in_owl = rdf_to_owl(URL)
            onto_loaded = get_ontology('./ontologies/' + ontology_in_owl).load()
        except Exception:
            # the placeholder was missing before, so the name was never printed
            print("Something went wrong, ontology name: {}".format(onto_name))
            onto_loaded = None
    return onto_loaded
####
####
def onto_format_validation(onto_name, URL):
    """Check whether the file ending of an ontology URL is supported.

    URLs ending in '.owl' or '.ttl' (the latter will need formatting) are
    accepted. For every other ending a "Non-Conform" message naming the
    ontology and its URL is printed.

    Returns:
        True for '.owl' and '.ttl' URLs, False otherwise.
    """
    is_supported = URL.endswith(('.owl', '.ttl'))
    if not is_supported:
        print("Non-Conform: {}, {} -> not compatible".format(onto_name, URL))
    return is_supported
####
def _collect_matching_paths(dictionary, needle, keys, path):
    """Append to ``keys`` the key path (tuple) of every entry matching ``needle``.

    An entry matches when the lower-cased str() of its value equals
    ``needle``. Nested dicts are descended recursively.
    """
    for key, val in dictionary.items():
        current_path = path + [key]
        if str(val).lower() == needle:
            keys.append(tuple(current_path))
        if isinstance(val, dict):
            _collect_matching_paths(val, needle, keys, current_path)


def search_value_in_nested_dict(dictionary, value, keys=None, path=None):
    """Search a nested dict for entries equal to ``value`` (case-insensitive).

    Values are compared via ``str(...).lower()``, so non-string values are
    compared by their string representation.
    NOTE(review): a None entry is therefore matched by the search string
    "none" -- confirm whether this is wanted.

    Args:
        dictionary: (nested) dict to be searched.
        value: value to be searched for.
        keys: optional list collecting the key paths (tuples) of all matches;
            a passed list is extended in place.
        path: optional list of keys that is prepended to every found path.

    Returns:
        Nested dict that contains only the paths leading to the matches, with
        ``value`` (as passed in) stored at the end of each path. Empty dict if
        nothing matched.
    """
    if keys is None:
        keys = []
    if path is None:
        path = []

    # Normalise the needle once and assemble the result once, instead of doing
    # both again on every level of the recursion.
    _collect_matching_paths(dictionary, str(value).lower(), keys, path)

    result_dict = {}
    for key_path in keys:
        current_dict = result_dict
        for key in key_path[:-1]:
            current_dict = current_dict.setdefault(key, {})
        current_dict[key_path[-1]] = value
    return result_dict
####
####
def class_description_loader():
    """Build the IRI dictionary of all ontologies and store it as JSON.

    Every ontology listed by get_ontology_URLs() except "OntoCAPE" is loaded
    with load_ontology_from_name(); for each ontology that could be loaded the
    result of ontology_classes_loader() is stored under the ontology name.
    The dictionary is written to 'iriDictionary.json' in the working directory.

    Returns:
        dict {ontology name: result of ontology_classes_loader()}; ontologies
        that could not be loaded are not contained.
    """
    onto_URLs = get_ontology_URLs()
    # Filtering instead of list.remove() does not raise if "OntoCAPE" is not
    # part of the metadata.
    ontoNameList_output = [name for name in onto_URLs if name != "OntoCAPE"]
    # ontoNameList_output.remove("EMMO")
    iri_dictionary = {}
    for ontologyname in ontoNameList_output:
        print(ontologyname)
        ontology = load_ontology_from_name(ontologyname)
        if ontology is not None:
            iri_dictionary[ontologyname] = ontology_classes_loader(ontology)
        else:
            print(ontologyname + " was empty!")
    with open('iriDictionary.json', 'w') as fp:
        json.dump(iri_dictionary, fp)
    return iri_dictionary
####
####
#TODO: Clean up the code!
def store_similarities(onto_combination, match_list,export_str="xlsx"):
    """Store the mapping between two ontologies as table in ./mapping/.

    Args:
        onto_combination: tuple of two ontology names.
        match_list: list of matches. An entry is either a pair
            [dict1, dict2] or a list of such pairs (then only the first pair
            is exported). dict1 / dict2 have the respective ontology name of
            ``onto_combination`` as key and {IRI: description} as value; only
            the first IRI of each is exported.
        export_str: "xlsx" or "json"; any other value only prints an error
            message and nothing is written.

    Returns:
        DataFrame with the columns <onto1>_IRI, <onto1>_DESC, <onto2>_IRI,
        <onto2>_DESC and <onto2>_DEF, the latter holding the result of
        class_definition_readin() for the class of the second ontology.
    """
    first_name = onto_combination[0]
    second_name = onto_combination[1]
    # Initialize lists to store the data
    onto1_data1 = []
    onto1_data2 = []
    onto2_data1 = []
    onto2_data2 = []
    def_list = []
    # load second ontology
    onto2 = load_ontology_from_name(second_name)
    for entry in match_list:
        # label matches are stored as list of pairs, IRI matches as plain pair
        pair = entry[0] if isinstance(entry[0], list) else entry
        onto1_classes = pair[0].get(first_name)
        onto1_entry1 = list(onto1_classes.keys())[0]
        onto1_entry2 = list(onto1_classes.values())[0]
        onto2_classes = pair[1].get(second_name)
        onto2_entry1 = list(onto2_classes.keys())[0]
        onto2_entry2 = list(onto2_classes.values())[0]
        onto1_data1.append(onto1_entry1)
        onto1_data2.append(onto1_entry2)
        onto2_data1.append(onto2_entry1)
        onto2_data2.append(onto2_entry2)
        # If the second ontology could not be loaded, no definition can be
        # looked up; class_definition_readin(None) yields an empty list.
        matched_class = onto2.search_one(iri = onto2_entry1) if onto2 is not None else None
        def_list.append(class_definition_readin(matched_class))
    # Create DataFrame from the data
    df = pd.DataFrame({first_name+'_IRI': onto1_data1, first_name+'_DESC':onto1_data2, second_name+'_IRI': onto2_data1, second_name+'_DESC':onto2_data2,second_name+'_DEF':def_list})
    if export_str == "xlsx":
        df.to_excel("./mapping/"+first_name+"_"+second_name+".xlsx")
    elif export_str == "json":
        df.to_json("./mapping/"+first_name+"_"+second_name+".json")
    else:
        print("Error: No export Style chosen for Mapping")
    return df
####
def Ontology_Mapping():
    """Generate MappingHeatmap.xlsx with the number of mappings per ontology pair.

    For each combination of two ontologies (all ontologies of
    get_ontology_URLs() except "OntoCAPE") the classes read from
    ./iriDictionary.json are compared: first, classes with the same IRI are
    matched. Then, for all classes of the first ontology not matched by IRI,
    their label, prefLabel, altLabel and name are searched for in the class
    dictionary of the second ontology. For each combination
    store_similarities() is called to store the mapping as excel-file in the
    subdirectory ./mapping/.

    Returns:
        DataFrame indexed by ontology names in both directions; the cell
        [row = second ontology, column = first ontology] holds the number of
        matched classes of that combination.
    """
    onto_URLs = get_ontology_URLs()
    # Filtering instead of list.remove() does not raise if "OntoCAPE" is not
    # part of the metadata.
    #[ontoNameList_output.remove(key) for key in ontoNameList if not onto_format_validation(key,onto_URLs[key])]
    ontoNameList_output = [name for name in onto_URLs if name != "OntoCAPE"]
    #ontoNameList_output.remove("EMMO")
    df_numbers = pd.DataFrame(index = ontoNameList_output, columns = ontoNameList_output)
    #TODO: substitute with class_description_loader()
    with open("./iriDictionary.json") as f:
        iri_dictionary = json.load(f)
    #onto_combinations = [('AFO', 'BAO')]
    #TODO: transfer to function
    for comb in itertools.combinations(ontoNameList_output, 2):
        onto_dict1 = iri_dictionary.get(comb[0])
        onto_dict2 = iri_dictionary.get(comb[1])
        if onto_dict1 is None or onto_dict2 is None:
            # ontologies that could not be loaded have no usable dictionary
            print("No class dictionary available for combination {} / {}, skipping".format(comb[0], comb[1]))
            continue
        match_list = []
        # search for same iris; everything else is kept for the label search
        unmatched_iris = []
        for iri in onto_dict1:
            if onto_dict2.get(iri):
                match_list.append([{comb[0]:{iri:{'iri':iri}}},{comb[1]:{iri:{'iri':iri}}}])
            else:
                unmatched_iris.append(iri)
        # search for same preflabels, labels, altlabels, names
        for iri in unmatched_iris:
            class_info = onto_dict1[iri]
            string_list = [class_info["label"], class_info["prefLabel"], class_info["altLabel"], class_info["name"]]
            class_matches = []
            for value in string_list:
                if value is None:
                    continue
                value_dict = search_value_in_nested_dict(onto_dict2,value)
                if value_dict:
                    class_matches.append([{comb[0]:{iri:class_info}}, {comb[1]:value_dict}])
            if class_matches:
                match_list.append(class_matches)
        store_similarities(comb,match_list)
        # .loc writes into df_numbers itself; chained df[col][row] assignment
        # may only modify a temporary copy.
        df_numbers.loc[comb[1], comb[0]] = len(match_list)
    #print(df_numbers)
    df_numbers.to_excel("MappingHeatmap.xlsx")
    return df_numbers
####
####
def run():
    """Execute the pairwise ontology mapping; its result table is not returned."""
    # Rebuilding the IRI dictionary beforehand is currently disabled:
    #class_description_loader()
    Ontology_Mapping()
####
####
def Similarity_Search_from_List(input_list,list_name, export_str="xlsx"):
    """Search the ontology collection for classes matching a list of strings.

    Every entry of ``input_list`` is searched for (via
    search_value_in_nested_dict) in the class dictionaries read from
    ./iriDictionary.json, for all ontologies of get_ontology_URLs() except
    "OntoCAPE". Per ontology the matches are stored with store_similarities();
    the number of matching entries is exported as
    "MappingHeatmap_<list_name>.<export_str>".

    Args:
        input_list: list of strings to be searched for.
        list_name: name of the list, used as row label and in file names.
        export_str: "xlsx" or "json"; any other value only prints an error
            message for the export.

    Returns:
        DataFrame with one row (``list_name``) and one column per ontology
        holding the number of list entries found in that ontology.
    """
    onto_URLs = get_ontology_URLs()
    # Filtering instead of list.remove() does not raise if "OntoCAPE" is not
    # part of the metadata.
    #[ontoNameList_output.remove(key) for key in ontoNameList if not onto_format_validation(key,onto_URLs[key])]
    ontoNameList_output = [name for name in onto_URLs if name != "OntoCAPE"]
    #ontoNameList_output.remove("EMMO")
    df_numbers = pd.DataFrame(index = [list_name], columns = ontoNameList_output)
    with open("./iriDictionary.json") as f:
        iri_dictionary = json.load(f)
    for ontology in ontoNameList_output:
        onto_dict1 = iri_dictionary.get(ontology)
        if onto_dict1 is None:
            # ontologies that could not be loaded have no usable dictionary
            print("No class dictionary available for ontology {}, skipping".format(ontology))
            continue
        comb = (list_name, ontology)
        match_list = []
        # search for same preflabels, labels, altlabels, names
        for value in input_list:
            value_dict = search_value_in_nested_dict(onto_dict1,value)
            if value_dict:
                # same entry layout as the label matches of Ontology_Mapping:
                # a list holding one [list side, ontology side] pair
                match_list.append([[{comb[0]:{'no IRI':{value}}}, {comb[1]:value_dict}]])
        store_similarities(comb,match_list,export_str)
        # .loc writes into df_numbers itself; chained df[col][row] assignment
        # may only modify a temporary copy.
        df_numbers.loc[comb[0], comb[1]] = len(match_list)
    #print(df_numbers)
    if export_str == "xlsx":
        df_numbers.to_excel("MappingHeatmap_"+list_name+".xlsx")
    elif export_str == "json":
        df_numbers.to_json("MappingHeatmap_"+list_name+".json")
    else:
        print("Error: No export Style chosen for Mapping")
    return df_numbers
####
def run_similarity_from_vocabulary():
    """Run Similarity_Search_from_List() on the prefLabels of the voc4cat individuals.

    The vocabulary is loaded from './ontologies/voc4cat.owl'; the first
    prefLabel of every individual (as str) forms the list "input_list".
    """
    vocabulary = get_ontology('./ontologies/voc4cat.owl').load()
    individuals = list(vocabulary.individuals())
    #prefList = [[str(i.prefLabel[0]),i.altLabel] for i in ind_list]
    pref_labels = []
    for individual in individuals:
        pref_labels.append(str(individual.prefLabel[0]))
    Similarity_Search_from_List(pref_labels, "input_list")
####
"""
with open("PhotoCatVocabulary.txt") as file:
lines = [line.rstrip("\n") for line in file]
t = time.time()
df = pd.read_excel("Combined Cleaned Vocabulary.xlsx", sheet_name = "Concepts", skiprows=1)
column_data = list(df["Preferred Label*"])
data_frame_numbers = Similarity_Search_from_List(column_data,"Combined Cleaned Vocabulary")
elapsed = time.time() - t
print(elapsed)
"""
"""
df = pd.read_excel("CombinedConcepts_condensed_09-2022_Test_AB.xlsx", sheet_name = "Concepts")
column_data = list(df["Preferred Label*"])
numbers = Similarity_Search_from_List(column_data,"CombinedConcepts_condensed_09-2022_")
with open('combinedVocabulary_nextcloud.txt', 'r') as file:
# Read the contents of the file
contents = file.read()
# Split the contents into individual elements to create a list
my_list = contents.split(',')
Similarity_Search_from_List(my_list,"concept_collection_")
"""
#https://github.com/pysemtec/semantic-python-overview
#https://arrow.apache.org/docs/python/index.html
####
#print out ontologies without proper URLs ->
#TODO: find those links and fix it in MasterTable
#onto_URLs = get_ontology_URLs()
#for i in onto_URLs:
# onto_format_validation(i, onto_URLs[i])
####