-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
333 changes: 333 additions & 0 deletions
333
mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,333 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "native-patrick", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"import re\n", | ||
"from glob import glob\n", | ||
"from mpcontribs.client import Client\n", | ||
"from flatten_dict import unflatten, flatten" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "affecting-smooth", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create a project - only needed once\n", | ||
"# client = Client()\n", | ||
"# client.create_project(\n", | ||
"# name=\"springer_materials\",\n", | ||
"# title=\"Springer Materials\",\n", | ||
"# authors=\"S. Scherer, S. George, P. Huck\",\n", | ||
"# description=\"Linus Pauling Files from Springer Materials\",\n", | ||
"# url=\"https://materials.springer.com\"\n", | ||
"# )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "strange-living", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# init client and update project info if needed\n", | ||
"client = Client(project=\"springer_materials\")\n", | ||
"# client.make_public() # needs approval\n", | ||
"# client.update_project(update={\"unique_identifiers\": False}) # allow multiple contributions per identifier/mpid\n", | ||
"# client.update_project(update={\"other\": { # functions as a legend for root-level fields\n", | ||
"# \"springer\": \"main info about springer entry\",\n", | ||
"# \"properties\": \"meta data and availability of property entries\",\n", | ||
"# \"phasediagram\": \"meta data about phase diagram entries\"\n", | ||
"# }})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "reverse-label", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# load data\n", | ||
"data_dir = \"/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/springer_materials\"\n", | ||
"\n", | ||
"data = {}\n", | ||
"for p in glob(f\"{data_dir}/*.json\"):\n", | ||
" if not p.endswith(\"_example.json\"):\n", | ||
" with open(p) as f:\n", | ||
" k = p.rsplit(\"/\", 1)[-1]\n", | ||
" data[k] = json.load(f)\n", | ||
"\n", | ||
"keys = set(k for docs in data.values() for doc in docs for k in doc)\n", | ||
"len(data), len(keys)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "boxed-shade", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# define map for column names and their units\n", | ||
"columns_map = {\n", | ||
" # common fields/columns\n", | ||
" \"Document_ID\": {\"name\": \"springer.id\"},\n", | ||
" \"Document_Title\": {\"name\": \"springer.title\"},\n", | ||
" \"Element_System\": {\"name\": \"springer.chemsys\"},\n", | ||
" \"ISP_Distinct_Solid_Phase\": {\"name\": \"springer.phase\"},\n", | ||
" \"Release_Year\": {\"name\": \"springer.released\", \"unit\": \"\"},\n", | ||
" \"URL\": {\"name\": \"springer.url\"},\n", | ||
" # properties\n", | ||
" \"Prototype\": {\"name\": \"properties.prototype\"},\n", | ||
" \"Pearson_Symbol\": {\"name\": \"properties.pearson\"},\n", | ||
" \"Space_Group_Symbol\": {\"name\": \"properties.spacegroup\"},\n", | ||
" \"Sample_Form\": {\"name\": \"properties.sample\"},\n", | ||
" \"Main_Physical_Property\": {\"name\": \"properties.main\"},\n", | ||
" \"Number_of_DataPoints\": {\"name\": \"properties.stats.datapoints\", \"unit\": \"\"},\n", | ||
" \"Number_of_Samples\": {\"name\": \"properties.stats.samples\", \"unit\": \"\"},\n", | ||
" \"Number_of_References\": {\"name\": \"properties.stats.references\", \"unit\": \"\"},\n", | ||
" # phase diagram\n", | ||
" \"Composition\": {\"name\": \"phasediagram.composition\"},\n", | ||
" \"Temperature\": {\"name\": \"phasediagram.temperature\", \"unit\": \"K\"},\n", | ||
" \"Status_of_Phase_Diagram\": {\"name\": \"phasediagram.status\"}\n", | ||
"}\n", | ||
"\n", | ||
"keys - set(columns_map.keys()) # just making sure I didn't miss a key" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fifty-parallel", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# prep contributions\n", | ||
"contributions = []\n", | ||
"prop_set = set()\n", | ||
"special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}\n", | ||
"CLEANR = re.compile('<.*?>') \n", | ||
"\n", | ||
"def convert_prop(s):\n", | ||
" cleaned = \"\".join([c if c.isalnum() else \" \" for c in s])\n", | ||
" capitalized = \"\".join([w.capitalize() for w in cleaned.split()])\n", | ||
" return capitalized.translate(special_char_map)\n", | ||
"\n", | ||
"def cleanhtml(raw_html):\n", | ||
" return re.sub(CLEANR, '', raw_html)\n", | ||
"\n", | ||
"for fn, docs in data.items():\n", | ||
" category = \"-\".join(fn.replace(\".json\", \"\").rsplit(\"_\", 2)[1:])\n", | ||
" print(category)\n", | ||
" for doc in docs:\n", | ||
" identifier = doc[\"MaterialsProject_ID\"]\n", | ||
" formula = doc[\"Molecular_Formula\"]\n", | ||
" properties = [\n", | ||
" convert_prop(prop)\n", | ||
" for prop in sorted(doc[\"List_of_Physical_Properties\"])\n", | ||
" ]\n", | ||
" contrib = {\n", | ||
" \"identifier\": identifier, \"formula\": formula,\n", | ||
" \"data\": {\"springer.category\": category},\n", | ||
" }\n", | ||
" \n", | ||
" if properties:\n", | ||
" prop_set |= set(properties)\n", | ||
" for prop in properties:\n", | ||
" contrib[\"data\"][f\"properties.available.{prop}\"] = \"Yes\"\n", | ||
"\n", | ||
" for k, v in doc.items():\n", | ||
" if v:\n", | ||
" col = columns_map.get(k)\n", | ||
" if col:\n", | ||
" name = col.get(\"name\")\n", | ||
" if name:\n", | ||
" unit = col.get(\"unit\")\n", | ||
" val = \",\".join(v) if isinstance(v, list) else v\n", | ||
" if unit is None and \"<\" in val:\n", | ||
" val = cleanhtml(val)\n", | ||
"\n", | ||
" contrib[\"data\"][name] = f\"{val} {unit}\" if unit else val \n", | ||
" \n", | ||
" contrib[\"data\"] = unflatten(contrib[\"data\"], splitter=\"dot\")\n", | ||
" contributions.append(contrib)\n", | ||
" \n", | ||
"len(contributions)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "crude-incident", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# init columns\n", | ||
"columns = {v[\"name\"]: v.get(\"unit\") for v in columns_map.values()}\n", | ||
"columns[\"springer.category\"] = None\n", | ||
"\n", | ||
"for prop in sorted(prop_set):\n", | ||
" columns[f\"properties.available.{prop}\"] = None\n", | ||
" \n", | ||
"client.init_columns(columns)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "taken-drama", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# submit everything\n", | ||
"client.delete_contributions() # need to delete first due to `unique_identifiers=False`\n", | ||
"client.init_columns(columns) # good practice :)\n", | ||
"client.submit_contributions(contributions)\n", | ||
"client.init_columns(columns) # just to make sure that all columns show up in the intended order" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "digital-approval", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# list of available query parameters for this project\n", | ||
"client._reinit() # might be needed if new data was just submitted\n", | ||
"client.available_query_params(startswith=(\"data__springer__released\", \"formula\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "later-discharge", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# count contributions for query:\n", | ||
"# - \"physical-properties\" category\n", | ||
"# - \"elasticity\" as main property\n", | ||
"# - more than 5 samples\n", | ||
"query = {\n", | ||
" \"data__springer__category__exact\": \"physical-properties\",\n", | ||
" \"data__properties__main__exact\": \"elasticity\",\n", | ||
" \"data__properties__stats__samples__value__gt\": 5\n", | ||
"}\n", | ||
"client.count(query=query)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "neither-moore", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# retrieve contributions for query and project out Springer ID and spacegroup fields\n", | ||
"fields = [\"id\", \"identifier\", \"data.springer.id\", \"data.properties.spacegroup\"]\n", | ||
"client.query_contributions(query=query, fields=fields)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "proud-certificate", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# get mp-id (and other info if needed) from Springer ID\n", | ||
"springer_id = \"ppp_350781a8aa14dc0b19c6c879daff3be2\"\n", | ||
"client.query_contributions(\n", | ||
" query={\"data__springer__id__exact\": springer_id},\n", | ||
" fields=[\"id\", \"identifier\", \"data.springer.id\", \"data.properties.pearson\"]\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "protecting-guidance", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# count all entries for a list of formulas released before 2023\n", | ||
"client.count(query={\n", | ||
" \"formula__in\": [\"Fe2O3\", \"GaAS\"], \"data__springer__released__value__lt\": 2023\n", | ||
"})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "detected-pricing", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# get all entries containing all selected properties\n", | ||
"properties = [\"XRayDiffraction\", \"IsotropicDisplacementParameter\", \"AnisotropicDisplacementParameter\"]\n", | ||
"query = {f\"data__properties__available__{prop}__exact\": \"Yes\" for prop in properties}\n", | ||
"\n", | ||
"client.query_contributions(\n", | ||
" query=query,\n", | ||
" fields=[\"id\", \"identifier\", \"data.springer.id\"]\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "august-farming", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# query/code to show Springer URLs and available properties under \"External Links\" on MP Details Page\n", | ||
"query = {\n", | ||
" \"identifier\": \"mp-2534\",\n", | ||
" \"data__springer__category__exact\": \"physical-properties\",\n", | ||
"}\n", | ||
"fields = [\"data.springer.url\", \"data.properties.available\"]\n", | ||
"entries = client.query_contributions(query=query, fields=fields).get(\"data\")\n", | ||
"\n", | ||
"# mimick table\n", | ||
"for entry in entries:\n", | ||
" flat = flatten(entry, reducer=\"dot\")\n", | ||
" url = flat.pop(\"data.springer.url\")\n", | ||
" print(url)\n", | ||
" properties = [k.split(\".\")[-1] for k in flat]\n", | ||
" print(\"\\t\", \", \".join(properties))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |