-
Notifications
You must be signed in to change notification settings - Fork 3
Enrichments
amy wieliczka edited this page Nov 2, 2022
·
14 revisions
There are 41 enrichment functions, listed here with the number of times they are used in an enrichment chain:
- '/required-values-from-collection-registry': 6711,
- '/shred': 5146,
- '/copy_prop': 4161,
- '/move_date_values': 4158,
- '/lookup': 3980,
- '/enrich_location': 3865,
- '/enrich_date': 3288,
- '/dpla_mapper': 2358,
- '/enrich_earliest_date': 2355,
- '/validate_mapv3': 2343,
- '/filter_fields': 2333,
- '/set-ucldc-dataprovider': 2333,
- '/dedupe-sourceresource': 2333,
- '/enrich-type': 2081,
- '/enrich-subject': 2080,
- '/enrich-format': 2080,
- '/set_prop': 2080,
- '/enrich_language': 2079,
- '/cleanup_value': 2076,
- '/select-id': 1720,
- '/strip_html': 1660,
- '/set_context': 994,
- '/capitalize_value': 994,
- '/select-oac-id': 570,
- '/jsonfy-prop': 293,
- '/drop-long-values': 51,
- '/select-cmis-atom-id': 21,
- '/select-preservica-id': 18,
- '/replace_regex': 17,
- '/replace_substring': 16,
- '/validate_mapv3\r\n': 13,
- '/unset_prop': 11,
- 'select-id': 6,
- 'select-oac-id': 4,
- '/geocode': 2,
- '/csl-marc-id': 2,
- '/ucsb-aleph-marc-id': 1,
- '/sfpl-marc-id': 1,
- 'unescape-xhtml-entities': 1,
- '/validate_map': 1,
- '/validate_mapv3,': 1
from collections import Counter
from library_collection.models import Collection
# Gather the enrichment list of every collection returned by the
# `published` manager.
# NOTE(review): assumes each `enrichment_array` is an iterable of enrichment
# strings such as '/shred?prop=...' -- confirm against the Collection model.
rich_collections = Collection.published.all()
enrichments = [rc.enrichment_array for rc in rich_collections]

# Flatten in a single pass. The previous `sum(enrichments, [])` built a new,
# ever-longer list for every collection added.
all_enrichments = [e for chain in enrichments for e in chain]

# Tally by function name only: everything from the first '?' on is dropped,
# so differently-parameterized uses of one function are counted together.
stripped_enrichments = [e.split('?')[0] for e in all_enrichments]
count_stripped_enrichments = Counter(stripped_enrichments)
count_stripped_enrichments  # bare expression: echoes the tally in an interactive shell

# Tally by the full string, parameters included.
count_all_enrichments = Counter(all_enrichments)
count_all_enrichments  # bare expression: echoes the tally in an interactive shell
def group_collections(collection_list, group_by, dict_name='matched'):
    """Group items by the value `group_by` returns for each of them.

    Groups appear in the order their key is first seen, and the items inside
    a group keep their input order.

    Args:
        collection_list: Iterable of items to group.
        group_by: Callable taking one item and returning its group key. It is
            called exactly once per item.
        dict_name: Name of the entry that stores the group key in each
            result dict.

    Returns:
        A list with one dict per distinct key, each shaped like
        ``{dict_name: key, 'collections': [items...], 'count': len(items)}``.
        An empty input yields an empty list.
    """
    keys = []      # distinct group keys, in first-seen order
    members = []   # members[i] holds the items whose key is keys[i]
    position = {}  # hashable key -> index into keys / members

    for collection in collection_list:
        key = group_by(collection)
        try:
            idx = position.get(key)
        except TypeError:
            # Unhashable key (e.g. a list): fall back to an equality scan so
            # such keys are still grouped. Keys are compared only against
            # other keys found this way or via the index, so a hashable key
            # that merely == an unhashable one lands in its own group.
            hashable = False
            idx = next((i for i, seen in enumerate(keys) if seen == key), None)
        else:
            hashable = True

        if idx is None:
            idx = len(keys)
            keys.append(key)
            members.append([])
            if hashable:
                position[key] = idx
        members[idx].append(collection)

    return [
        {dict_name: key, 'collections': matched, 'count': len(matched)}
        for key, matched in zip(keys, members)
    ]
# Group every enrichment string by its function name (the text before '?').
group_enrichments = group_collections(all_enrichments, lambda e: e.split('?')[0])

# For each function, tally its distinct full strings (function + parameters).
sub_group_enrichments = [{
    'enrichment_function': e['matched'],
    'parameters': Counter(e['collections']),
    'count': e['count']
} for e in group_enrichments]

# Most-used functions first; list.sort is stable, so functions with equal
# counts keep the order in which they were first encountered.
sub_group_enrichments.sort(key=lambda e: e['count'], reverse=True)

# Build one markdown table row per function. Pieces are collected and joined
# once instead of re-concatenating a growing string on every iteration.
table_rows = []
for e in sub_group_enrichments:
    list_items = "".join(
        f"<li>{p}: {p_count}</li>" for p, p_count in e['parameters'].items()
    )
    # The per-parameter breakdown is an inline HTML list inside the table cell.
    list_view = f"<ul>{list_items}</ul>"
    table_rows.append(
        f"| {e['enrichment_function']} | {e['count']} | {list_view} |\n"
    )
table_view = "".join(table_rows)

print(
    "| Enrichment Function | Usage Count | Parameterized Usage Count |\n"
    "| --- | --- | --- |\n"
    f"{table_view}"
)
Enrichment Function | Usage Count | Parameterized Usage Count |
---|---|---|
/required-values-from-collection-registry | 6713 |
|
/shred | 5146 |
|
/copy_prop | 4161 |
|
/move_date_values | 4158 |
|
/lookup | 3980 |
|
/enrich_location | 3865 |
|
/enrich_date | 3288 |
|
/enrich_earliest_date | 2355 |
|
/validate_mapv3 | 2343 |
|
/filter_fields | 2333 |
|
/set-ucldc-dataprovider | 2333 |
|
/dedupe-sourceresource | 2333 |
|
/enrich-type | 2081 |
|
/enrich-subject | 2080 |
|
/enrich-format | 2080 |
|
/set_prop | 2080 |
|
/enrich_language | 2079 |
|
/cleanup_value | 2076 |
|
/select-id | 1720 |
|
/strip_html | 1660 |
|
/set_context | 994 |
|
/capitalize_value | 994 |
|
/select-oac-id | 570 |
|
/jsonfy-prop | 293 |
|
/drop-long-values | 51 |
|
/select-cmis-atom-id | 21 |
|
/select-preservica-id | 18 |
|
/replace_regex | 17 |
|
/replace_substring | 16 |
|
/validate_mapv3 | 13 |
|
/unset_prop | 11 |
|
select-id | 6 |
|
select-oac-id | 4 |
|
/geocode | 2 |
|
/csl-marc-id | 2 |
|
/ucsb-aleph-marc-id | 1 |
|
/sfpl-marc-id | 1 |
|
unescape-xhtml-entities | 1 |
|
/validate_map | 1 |
|
/validate_mapv3, | 1 |
|
The `/dpla_mapper` row was removed from the table above and is listed separately here:
Enrichment Function | Usage Count | Parameterized Usage Count |
---|---|---|
/dpla_mapper | 2358 |
|