Skip to content

Enrichments

amy wieliczka edited this page Nov 2, 2022 · 14 revisions

There are 41 enrichment functions, listed here with the number of times they are used in an enrichment chain:

  • '/required-values-from-collection-registry': 6711,
  • '/shred': 5146,
  • '/copy_prop': 4161,
  • '/move_date_values': 4158,
  • '/lookup': 3980,
  • '/enrich_location': 3865,
  • '/enrich_date': 3288,
  • '/dpla_mapper': 2358,
  • '/enrich_earliest_date': 2355,
  • '/validate_mapv3': 2343,
  • '/filter_fields': 2333,
  • '/set-ucldc-dataprovider': 2333,
  • '/dedupe-sourceresource': 2333,
  • '/enrich-type': 2081,
  • '/enrich-subject': 2080,
  • '/enrich-format': 2080,
  • '/set_prop': 2080,
  • '/enrich_language': 2079,
  • '/cleanup_value': 2076,
  • '/select-id': 1720,
  • '/strip_html': 1660,
  • '/set_context': 994,
  • '/capitalize_value': 994,
  • '/select-oac-id': 570,
  • '/jsonfy-prop': 293,
  • '/drop-long-values': 51,
  • '/select-cmis-atom-id': 21,
  • '/select-preservica-id': 18,
  • '/replace_regex': 17,
  • '/replace_substring': 16,
  • '/validate_mapv3\r\n': 13,
  • '/unset_prop': 11,
  • 'select-id': 6,
  • 'select-oac-id': 4,
  • '/geocode': 2,
  • '/csl-marc-id': 2,
  • '/ucsb-aleph-marc-id': 1,
  • '/sfpl-marc-id': 1,
  • 'unescape-xhtml-entities': 1,
  • '/validate_map': 1,
  • '/validate_mapv3,': 1
from collections import Counter
from library_collection.models import Collection
# Gather the enrichment chain of every published collection.
# NOTE(review): assumes each `enrichment_array` is a list of enrichment URL
# strings (function path plus optional "?query") -- confirm against the
# Collection model.
rich_collections = Collection.published.all()
enrichments = [rc.enrichment_array for rc in rich_collections]

# Flatten the per-collection chains into one list. A nested comprehension is
# linear, whereas sum(enrichments, []) builds a brand-new list on every
# addition and is therefore quadratic in the total number of enrichments.
all_enrichments = [e for chain in enrichments for e in chain]

# Usage count per enrichment function: keep only the text before the first "?".
stripped_enrichments = [e.split('?')[0] for e in all_enrichments]
count_stripped_enrichments = Counter(stripped_enrichments)
# Bare expression: only has a visible effect when run in an interactive shell.
count_stripped_enrichments

# Usage count per full enrichment string (function and parameters together).
count_all_enrichments = Counter(all_enrichments)
count_all_enrichments

def group_collections(collection_list, group_by, dict_name='matched'):
    """Group the items of ``collection_list`` by the value of ``group_by``.

    Args:
        collection_list: Iterable of items to group. Any iterable works,
            including one-shot iterators; it is not modified.
        group_by: Callable applied exactly once to each item to obtain its
            grouping key. Keys must be hashable.
        dict_name: Name of the dictionary entry that holds the grouping key
            in each result.

    Returns:
        A list with one dict per distinct key, in order of each key's first
        appearance. Each dict has three entries: ``dict_name`` (the key),
        ``'collections'`` (the matching items, in their original order) and
        ``'count'`` (the number of matching items). An empty input yields an
        empty list.
    """
    # Dicts keep insertion order, so groups come out in first-seen order
    # without rescanning the remaining items once per group.
    groups = {}
    for collection in collection_list:
        groups.setdefault(group_by(collection), []).append(collection)

    return [
        {
            dict_name: match_value,
            'collections': matched,
            'count': len(matched),
        }
        for match_value, matched in groups.items()
    ]

# Bucket every enrichment string under its function path (the text before "?").
group_enrichments = group_collections(
    all_enrichments,
    lambda url: url.split('?')[0],
)

# One summary per function, most-used first. 'parameters' tallies each
# distinct full enrichment string that landed in the bucket.
sub_group_enrichments = sorted(
    (
        {
            'enrichment_function': group['matched'],
            'parameters': Counter(group['collections']),
            'count': group['count'],
        }
        for group in group_enrichments
    ),
    key=lambda summary: summary['count'],
    reverse=True,
)

# Render one markdown table row per function; the third cell is an HTML
# bullet list of "<full enrichment string>: <count>" entries.
table_rows = []
for summary in sub_group_enrichments:
    list_items = "".join(
        f"<li>{url}: {uses}</li>"
        for url, uses in summary['parameters'].items()
    )
    list_view = "<ul>" + list_items + "</ul>"
    table_rows.append(
        f"| {summary['enrichment_function']} | {summary['count']} | {list_view} |\n"
    )
table_view = "".join(table_rows)

print(
    "| Enrichment Function | Usage Count | Parameterized Usage Count |\n"
    "| --- | --- | --- |\n"
    f"{table_view}"
)
Enrichment Function Usage Count Parameterized Usage Count
/required-values-from-collection-registry 6713
  • /required-values-from-collection-registry?field=rights&mode=fill: 2310
  • /required-values-from-collection-registry?field=type&mode=fill: 1961
  • /required-values-from-collection-registry?field=title&mode=fill: 2035
  • /required-values-from-collection-registry?field=type&mode=overwrite: 373
  • /required-values-from-collection-registry?field=rights&mode=overwrite: 34
/shred 5146
  • /shred?prop=sourceResource%2Fspatial&delim=--: 2078
  • /shred?prop=sourceResource%2Fsubject%2Fname: 1021
  • /shred?prop=sourceResource%2Fcreator: 1021
  • /shred?prop=sourceResource%2Ftype: 1021
  • /shred?prop=sourceResource%2Fdescription&delim=%3Cbr%3E: 5
/copy_prop 4161
  • /copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider: 675
  • /copy_prop?prop=provider%2Fname&to_prop=dataProvider&skip_if_exists=True: 1000
  • /copy_prop?prop=originalRecord%2Fcollection&to_prop=dataProvider: 1105
  • /copy_prop?prop=provider%2Fname&to_prop=dataProvider&skip_if_exists=true: 440
  • /copy_prop?prop=sourceResource%4Fpublisher&to_prop=dataProvider: 293
  • /copy_prop?prop=provider%2Fname&to_prop=dataProvider: 86
  • /copy_prop?prop=provider%2Fname&to_prop=dataProvider&no_overwrite=True: 549
  • /copy_prop?prop=sourceResource%2Fspatial&to_prop=sourceResource%2Ftemporal&skip_if_exists=true: 1
  • /copy_prop?prop=sourceResource%2Fdescription&to_prop=sourceResource%2Ftitle: 1
  • /copy_prop?prop=originalRecord%2Ftype&to_prop=sourceResource%2Fformat: 11
/move_date_values 4158
  • /move_date_values?prop=sourceResource%2Fsubject: 2079
  • /move_date_values?prop=sourceResource%2Fspatial: 2079
/lookup 3980
  • /lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fname&substitution=iso639_3: 2031
  • /lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fiso639_3&substitution=iso639_3&inverse=True: 1946
  • /lookup?prop=sourceResource%2Fformat&target=sourceResource%2Fformat&substitution=scdl_fix_format: 3
/enrich_location 3865
  • /enrich_location: 1785
  • /enrich_location?prop=sourceResource%2FstateLocatedIn: 2080
/enrich_date 3288
  • /enrich_date: 2335
  • /enrich_date?prop=sourceResource%2Fdate: 953
/enrich_earliest_date 2355
  • /enrich_earliest_date: 2355
/validate_mapv3 2343
  • /validate_mapv3: 2343
/filter_fields 2333
  • /filter_fields?keys=sourceResource: 2333
/set-ucldc-dataprovider 2333
  • /set-ucldc-dataprovider: 2333
/dedupe-sourceresource 2333
  • /dedupe-sourceresource: 2333
/enrich-type 2081
  • /enrich-type: 2081
/enrich-subject 2080
  • /enrich-subject: 2080
/enrich-format 2080
  • /enrich-format: 2080
/set_prop 2080
  • /set_prop?prop=sourceResource%2FstateLocatedIn&value=California: 2080
/enrich_language 2079
  • /enrich_language: 2079
/cleanup_value 2076
  • /cleanup_value: 2076
/select-id 1720
  • /select-id?prop=uid: 278
  • /select-id?prop=id: 1429
  • /select-id?prop=PID: 3
  • /select-id?prop=metadata/identifier: 7
  • /select-id?prop=identifier: 3
/strip_html 1660
  • /strip_html: 1660
/set_context 994
  • /set_context: 994
/capitalize_value 994
  • /capitalize_value?exclude=sourceResource%2Frelation: 994
/select-oac-id 570
  • /select-oac-id: 570
/jsonfy-prop 293
  • /jsonfy-prop: 293
/drop-long-values 51
  • /drop-long-values?field=description&max_length=150: 42
  • /drop-long-values?field=description&max_length=250: 8
  • /drop-long-values?field=description&max_length=1000: 1
/select-cmis-atom-id 21
  • /select-cmis-atom-id: 21
/select-preservica-id 18
  • /select-preservica-id: 18
/replace_regex 17
  • /replace_regex?prop=sourceResource%2Fpublisher&regex=%5C%24%5CS&new=--: 10
  • /replace_regex?prop=sourceResource%2Fsubject&regex=%5C%24%5CS&new=--: 3
  • /replace_regex?prop=sourceResource%2Fcontributor&regex=%5C%24%5CS&new=--: 3
  • /replace_regex?prop=sourceResource%2Fcreator&regex=%5C%24%5CS&new=--: 1
/replace_substring 16
  • /replace_substring?prop=sourceResource%2Fsubject&old=%5Blcsh%5D&new=: 2
  • /replace_substring?prop=sourceResource%2Fsubject&old=%5Blcna%5D&new=: 1
  • /replace_substring?prop=sourceResource%2Fsubject&old=%5Baacr2%5D&new=: 1
  • /replace_substring?prop=sourceResource%2Ftitle&old=%5Bgraphic%5D&new=: 3
  • /replace_substring?prop=sourceResource%2Ftitle&old=%5Bgraphic&new=: 3
  • /replace_substring?prop=sourceResource%2Ftitle&old=graphic%5D&new=: 3
  • /replace_substring?prop=sourceResource%2Ftitle&old=%5Bgraphic%5B&new=: 3
/validate_mapv3 13
  • /validate_mapv3: 13
/unset_prop 11
  • /unset_prop?prop=sourceResource%2Fspatial,: 1
  • /unset_prop?prop=sourceResource%2Fprovenance: 10
select-id 6
  • select-id?prop=id: 6
select-oac-id 4
  • select-oac-id: 4
/geocode 2
  • /geocode: 2
/csl-marc-id 2
  • /csl-marc-id: 2
/ucsb-aleph-marc-id 1
  • /ucsb-aleph-marc-id: 1
/sfpl-marc-id 1
  • /sfpl-marc-id: 1
unescape-xhtml-entities 1
  • unescape-xhtml-entities?field=sourceResource: 1
/validate_map 1
  • /validate_map: 1
/validate_mapv3, 1
  • /validate_mapv3,: 1

The DPLA mapper is omitted from the table above and listed separately here:

Enrichment Function Usage Count Parameterized Usage Count
/dpla_mapper 2358
  • /dpla_mapper?mapper_type=oac_dc: 571
  • /dpla_mapper?mapper_type=ucd_json: 13
  • /dpla_mapper?mapper_type=ucldc_nuxeo: 278
  • /dpla_mapper?mapper_type=ucsb_aleph_marc: 1
  • /dpla_mapper?mapper_type=ucb_tind_marc: 23
  • /dpla_mapper?mapper_type=ucsc_oai_dpla: 10
  • /dpla_mapper?mapper_type=ucsd_blacklight_dc: 292
  • /dpla_mapper?mapper_type=csa_omeka: 44
  • /dpla_mapper?mapper_type=ucla_solr_dc: 3
  • /dpla_mapper?mapper_type=oac_dc_suppress_publisher: 2
  • /dpla_mapper?mapper_type=quartex_oai: 46
  • /dpla_mapper?mapper_type=sjsu_islandora: 24
  • /dpla_mapper?mapper_type=cca_vault_oai_dc: 4
  • /dpla_mapper?mapper_type=chs_islandora: 36
  • /dpla_mapper?mapper_type=contentdm_oai_dc: 311
  • /dpla_mapper?mapper_type=cmis_atom: 21
  • /dpla_mapper?mapper_type=black_gold_oai: 9
  • /dpla_mapper?mapper_type=calpoly_oai_dc: 77
  • /dpla_mapper?mapper_type=csu_sac_oai_dc: 2
  • /dpla_mapper?mapper_type=csudh_contentdm_oai_dc: 10
  • /dpla_mapper?mapper_type=oac_dc_suppress_desc_2: 1
  • /dpla_mapper?mapper_type=chula_vista_pl_contentdm_oai_dc: 5
  • /dpla_mapper?mapper_type=lapl_oai: 10
  • /dpla_mapper?mapper_type=sfpl_marc: 1
  • /dpla_mapper?mapper_type=lapl_26096: 1
  • /dpla_mapper?mapper_type=ucsf_solr: 2
  • /dpla_mapper?mapper_type=cavpp_islandora: 245
  • /dpla_mapper?mapper_type=up_oai_dc: 19
  • /dpla_mapper?mapper_type=chapman_oai_dc: 48
  • /dpla_mapper?mapper_type=preservica_api: 19
  • /dpla_mapper?mapper_type=csl_marc: 1
  • /dpla_mapper?mapper_type=contentdm_oai_dc_get_sound_thumbs: 1
  • /dpla_mapper?mapper_type=pspl_oai_dc: 8
  • /dpla_mapper?mapper_type=omeka: 28
  • /dpla_mapper?mapper_type=chico_oai_dc: 7
  • /dpla_mapper?mapper_type=ucb_bampfa_solr: 1
  • /dpla_mapper?mapper_type=islandora_oai_dc: 9
  • /dpla_mapper?mapper_type=youtube_video_snippet: 17
  • /dpla_mapper?mapper_type=csuci_mets: 1
  • /dpla_mapper?mapper_type=pastperfect_xml: 5
  • /dpla_mapper?mapper_type=caltech_restrict: 2
  • /dpla_mapper?mapper_type=usc_oai_dc: 75
  • /dpla_mapper?mapper_type=yosemite_oai_dc: 4
  • /dpla_mapper?mapper_type=emuseum_xml: 1
  • /dpla_mapper?mapper_type=csu_dspace_mets: 1
  • /dpla_mapper?mapper_type=flickr_api: 1
  • /dpla_mapper?mapper_type=sierramadre_marc: 1
  • /dpla_mapper?mapper_type=burbank_islandora: 11
  • /dpla_mapper?mapper_type=omeka_nothumb: 3
  • /dpla_mapper?mapper_type=sanjose_pastperfect: 1
  • /dpla_mapper?mapper_type=tv_academy_oai_dc: 1
  • /dpla_mapper?mapper_type=flickr_sdasm: 5
  • /dpla_mapper?mapper_type=flickr_sppl: 33
  • /dpla_mapper?mapper_type=internet_archive: 2
  • /dpla_mapper?mapper_type=arck_oai: 11