-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_labeling_functions.py
82 lines (61 loc) · 2.5 KB
/
create_labeling_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import sys
sys.path.insert(0, '../snorkel')
from snorkel.labeling import LabelingFunction
import itertools
import math
from snorkel.labeling.lf.core import labeling_function
import numpy as np
'''
Useful Functions
'''
def keyword_lookup(x, phrase_to_match: str, label_id: int):
    """Return the label id when the phrase occurs in ``x.text``.

    The check is a case-insensitive substring test: both the phrase and the
    text are lower-cased before comparing.

    Args:
        x: object exposing a ``text`` attribute to search in
        phrase_to_match (str): some phrase that we need to match
        label_id (int): id of label to use for this match
    Returns:
        (int): label id if match or -1 if no match (including when
            ``x.text`` is not a string)
    """
    text = x.text
    # A missing text value shows up as None or a float NaN rather than a str;
    # there is nothing to search, so report "no match" instead of raising
    # AttributeError on ``.lower()``.
    if not isinstance(text, str):
        return -1
    if phrase_to_match.lower() in text.lower():
        return label_id
    return -1
'''
Main Code
'''
def create_labeling_functions(bio_file: "str | pd.DataFrame", bio_rules: "str | pd.DataFrame"):
    """Create a list of keyword labeling functions, one per unique rule phrase.

    For every row of ``bio_file`` the column ``'<function>_rules'`` is looked
    up in ``bio_rules``; labels without such a column are skipped. Each
    non-null cell of that column becomes one ``LabelingFunction`` named
    ``keyword_<label id>_<phrase>`` that calls ``keyword_lookup`` with the
    phrase and the label id.

    Args:
        bio_file: the biomimicry functions, either as a CSV path (anything
            ``pd.read_csv`` accepts) or as an already loaded DataFrame. Must
            have a 'function' column (label name) and a
            'function_enumerated' column (label id).
        bio_rules: the 'rules' for each biomimicry function, as a CSV path or
            a DataFrame, with one '<function>_rules' column per label.
    Returns:
        labeling_function_list: a list of all the labeling function 'rules'
            corresponding to each biomimicry function, ordered by label row
            and then by first occurrence of the phrase in its rules column
    """
    # The parameters were documented as DataFrames but were always passed to
    # read_csv; accept both so existing path-based callers keep working and
    # DataFrame callers no longer fail.
    if not isinstance(bio_file, pd.DataFrame):
        bio_file = pd.read_csv(bio_file)
    if not isinstance(bio_rules, pd.DataFrame):
        bio_rules = pd.read_csv(bio_rules)

    names_used = set()  # function names already emitted (O(1) membership)
    labeling_function_list = []

    for label_name, label_id in zip(bio_file['function'], bio_file['function_enumerated']):
        label_rule_name = label_name + "_rules"
        if label_rule_name not in bio_rules.columns:
            continue

        # Walk the column in file order (rather than through a set) so the
        # resulting list is the same on every run.
        for raw_phrase in bio_rules[label_rule_name]:
            # skip blank cells
            if pd.isnull(raw_phrase):
                continue
            # NOTE(review): the underscored phrase is used both in the name
            # and as the phrase to match, so the searched text presumably has
            # multi-word terms joined by underscores - confirm.
            phrase = raw_phrase.replace(" ", "_")
            function_name = f"keyword_{label_id}_{phrase}"
            # Also catches duplicates that only collide after the underscore
            # replacement (e.g. "a b" and "a_b").
            if function_name in names_used:
                continue
            names_used.add(function_name)
            labeling_function_list.append(
                LabelingFunction(
                    name=function_name,
                    f=keyword_lookup,
                    resources={"phrase_to_match": phrase, "label_id": label_id},
                )
            )
    return labeling_function_list