Skip to content

Commit

Permalink
add springer_materials notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
tschaume committed Oct 18, 2023
1 parent 360c1c8 commit 5a4d12b
Show file tree
Hide file tree
Showing 2 changed files with 343 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,16 @@
"client.delete_contributions()\n",
"client.submit_contributions(contributions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "varied-condition",
"metadata": {},
"outputs": [],
"source": [
"client.make_public()"
]
}
],
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "native-patrick",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import re\n",
"from glob import glob\n",
"from mpcontribs.client import Client\n",
"from flatten_dict import unflatten, flatten"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "affecting-smooth",
"metadata": {},
"outputs": [],
"source": [
"# create a project - only needed once\n",
"# client = Client()\n",
"# client.create_project(\n",
"# name=\"springer_materials\",\n",
"# title=\"Springer Materials\",\n",
"# authors=\"S. Scherer, S. George, P. Huck\",\n",
"# description=\"Linus Pauling Files from Springer Materials\",\n",
"# url=\"https://materials.springer.com\"\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "strange-living",
"metadata": {},
"outputs": [],
"source": [
"# init client and update project info if needed\n",
"client = Client(project=\"springer_materials\")\n",
"# client.make_public() # needs approval\n",
"# client.update_project(update={\"unique_identifiers\": False}) # allow multiple contributions per identifier/mpid\n",
"# client.update_project(update={\"other\": { # functions as a legend for root-level fields\n",
"# \"springer\": \"main info about springer entry\",\n",
"# \"properties\": \"meta data and availability of property entries\",\n",
"# \"phasediagram\": \"meta data about phase diagram entries\"\n",
"# }})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "reverse-label",
"metadata": {},
"outputs": [],
"source": [
"# load data\n",
"data_dir = \"/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/springer_materials\"\n",
"\n",
"data = {}\n",
"for p in glob(f\"{data_dir}/*.json\"):\n",
" if not p.endswith(\"_example.json\"):\n",
" with open(p) as f:\n",
" k = p.rsplit(\"/\", 1)[-1]\n",
" data[k] = json.load(f)\n",
"\n",
"keys = set(k for docs in data.values() for doc in docs for k in doc)\n",
"len(data), len(keys)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "boxed-shade",
"metadata": {},
"outputs": [],
"source": [
"# define map for column names and their units\n",
"columns_map = {\n",
" # common fields/columns\n",
" \"Document_ID\": {\"name\": \"springer.id\"},\n",
" \"Document_Title\": {\"name\": \"springer.title\"},\n",
" \"Element_System\": {\"name\": \"springer.chemsys\"},\n",
" \"ISP_Distinct_Solid_Phase\": {\"name\": \"springer.phase\"},\n",
" \"Release_Year\": {\"name\": \"springer.released\", \"unit\": \"\"},\n",
" \"URL\": {\"name\": \"springer.url\"},\n",
" # properties\n",
" \"Prototype\": {\"name\": \"properties.prototype\"},\n",
" \"Pearson_Symbol\": {\"name\": \"properties.pearson\"},\n",
" \"Space_Group_Symbol\": {\"name\": \"properties.spacegroup\"},\n",
" \"Sample_Form\": {\"name\": \"properties.sample\"},\n",
" \"Main_Physical_Property\": {\"name\": \"properties.main\"},\n",
" \"Number_of_DataPoints\": {\"name\": \"properties.stats.datapoints\", \"unit\": \"\"},\n",
" \"Number_of_Samples\": {\"name\": \"properties.stats.samples\", \"unit\": \"\"},\n",
" \"Number_of_References\": {\"name\": \"properties.stats.references\", \"unit\": \"\"},\n",
" # phase diagram\n",
" \"Composition\": {\"name\": \"phasediagram.composition\"},\n",
" \"Temperature\": {\"name\": \"phasediagram.temperature\", \"unit\": \"K\"},\n",
" \"Status_of_Phase_Diagram\": {\"name\": \"phasediagram.status\"}\n",
"}\n",
"\n",
"keys - set(columns_map.keys()) # just making sure I didn't miss a key"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fifty-parallel",
"metadata": {},
"outputs": [],
"source": [
"# prep contributions\n",
"contributions = []\n",
"prop_set = set()\n",
"special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}\n",
"CLEANR = re.compile('<.*?>') \n",
"\n",
"def convert_prop(s):\n",
" cleaned = \"\".join([c if c.isalnum() else \" \" for c in s])\n",
" capitalized = \"\".join([w.capitalize() for w in cleaned.split()])\n",
" return capitalized.translate(special_char_map)\n",
"\n",
"def cleanhtml(raw_html):\n",
" return re.sub(CLEANR, '', raw_html)\n",
"\n",
"for fn, docs in data.items():\n",
" category = \"-\".join(fn.replace(\".json\", \"\").rsplit(\"_\", 2)[1:])\n",
" print(category)\n",
" for doc in docs:\n",
" identifier = doc[\"MaterialsProject_ID\"]\n",
" formula = doc[\"Molecular_Formula\"]\n",
" properties = [\n",
" convert_prop(prop)\n",
" for prop in sorted(doc[\"List_of_Physical_Properties\"])\n",
" ]\n",
" contrib = {\n",
" \"identifier\": identifier, \"formula\": formula,\n",
" \"data\": {\"springer.category\": category},\n",
" }\n",
" \n",
" if properties:\n",
" prop_set |= set(properties)\n",
" for prop in properties:\n",
" contrib[\"data\"][f\"properties.available.{prop}\"] = \"Yes\"\n",
"\n",
" for k, v in doc.items():\n",
" if v:\n",
" col = columns_map.get(k)\n",
" if col:\n",
" name = col.get(\"name\")\n",
" if name:\n",
" unit = col.get(\"unit\")\n",
" val = \",\".join(v) if isinstance(v, list) else v\n",
" if unit is None and \"<\" in val:\n",
" val = cleanhtml(val)\n",
"\n",
" contrib[\"data\"][name] = f\"{val} {unit}\" if unit else val \n",
" \n",
" contrib[\"data\"] = unflatten(contrib[\"data\"], splitter=\"dot\")\n",
" contributions.append(contrib)\n",
" \n",
"len(contributions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "crude-incident",
"metadata": {},
"outputs": [],
"source": [
"# init columns\n",
"columns = {v[\"name\"]: v.get(\"unit\") for v in columns_map.values()}\n",
"columns[\"springer.category\"] = None\n",
"\n",
"for prop in sorted(prop_set):\n",
" columns[f\"properties.available.{prop}\"] = None\n",
" \n",
"client.init_columns(columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "taken-drama",
"metadata": {},
"outputs": [],
"source": [
"# submit everything\n",
"client.delete_contributions() # need to delete first due to `unique_identifiers=False`\n",
"client.init_columns(columns) # good practice :)\n",
"client.submit_contributions(contributions)\n",
"client.init_columns(columns) # just to make sure that all columns show up in the intended order"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "digital-approval",
"metadata": {},
"outputs": [],
"source": [
"# list of available query parameters for this project\n",
"client._reinit() # might be needed if new data was just submitted\n",
"client.available_query_params(startswith=(\"data__springer__released\", \"formula\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "later-discharge",
"metadata": {},
"outputs": [],
"source": [
"# count contributions for query:\n",
"# - \"physical-properties\" category\n",
"# - \"elasticity\" as main property\n",
"# - more than 5 samples\n",
"query = {\n",
" \"data__springer__category__exact\": \"physical-properties\",\n",
" \"data__properties__main__exact\": \"elasticity\",\n",
" \"data__properties__stats__samples__value__gt\": 5\n",
"}\n",
"client.count(query=query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "neither-moore",
"metadata": {},
"outputs": [],
"source": [
"# retrieve contributions for query and project out Springer ID and spacegroup fields\n",
"fields = [\"id\", \"identifier\", \"data.springer.id\", \"data.properties.spacegroup\"]\n",
"client.query_contributions(query=query, fields=fields)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "proud-certificate",
"metadata": {},
"outputs": [],
"source": [
"# get mp-id (and other info if needed) from Springer ID\n",
"springer_id = \"ppp_350781a8aa14dc0b19c6c879daff3be2\"\n",
"client.query_contributions(\n",
" query={\"data__springer__id__exact\": springer_id},\n",
" fields=[\"id\", \"identifier\", \"data.springer.id\", \"data.properties.pearson\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "protecting-guidance",
"metadata": {},
"outputs": [],
"source": [
"# count all entries for a list of formulas released before 2023\n",
"client.count(query={\n",
" \"formula__in\": [\"Fe2O3\", \"GaAS\"], \"data__springer__released__value__lt\": 2023\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "detected-pricing",
"metadata": {},
"outputs": [],
"source": [
"# get all entries containing all selected properties\n",
"properties = [\"XRayDiffraction\", \"IsotropicDisplacementParameter\", \"AnisotropicDisplacementParameter\"]\n",
"query = {f\"data__properties__available__{prop}__exact\": \"Yes\" for prop in properties}\n",
"\n",
"client.query_contributions(\n",
" query=query,\n",
" fields=[\"id\", \"identifier\", \"data.springer.id\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "august-farming",
"metadata": {},
"outputs": [],
"source": [
"# query/code to show Springer URLs and available properties under \"External Links\" on MP Details Page\n",
"query = {\n",
" \"identifier\": \"mp-2534\",\n",
" \"data__springer__category__exact\": \"physical-properties\",\n",
"}\n",
"fields = [\"data.springer.url\", \"data.properties.available\"]\n",
"entries = client.query_contributions(query=query, fields=fields).get(\"data\")\n",
"\n",
"# mimick table\n",
"for entry in entries:\n",
" flat = flatten(entry, reducer=\"dot\")\n",
" url = flat.pop(\"data.springer.url\")\n",
" print(url)\n",
" properties = [k.split(\".\")[-1] for k in flat]\n",
" print(\"\\t\", \", \".join(properties))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 5a4d12b

Please sign in to comment.