From 5a4d12b9870452aebdd8ff94b0cbaea4d13928d1 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 18 Oct 2023 14:22:12 -0700 Subject: [PATCH] add springer_materials notebook --- .../pycroscopy.ipynb | 10 + .../springer_materials.ipynb | 333 ++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb index 595d6031f..99f10a538 100644 --- a/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb +++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/pycroscopy.ipynb @@ -166,6 +166,16 @@ "client.delete_contributions()\n", "client.submit_contributions(contributions)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "varied-condition", + "metadata": {}, + "outputs": [], + "source": [ + "client.make_public()" + ] } ], "metadata": { diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb new file mode 100644 index 000000000..941be79b3 --- /dev/null +++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/springer_materials.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "native-patrick", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "from glob import glob\n", + "from mpcontribs.client import Client\n", + "from flatten_dict import unflatten, flatten" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "affecting-smooth", + "metadata": {}, + "outputs": [], + "source": [ + "# create a project - only needed once\n", + "# client = Client()\n", + "# client.create_project(\n", + "# name=\"springer_materials\",\n", + "# title=\"Springer Materials\",\n", + "# authors=\"S. Scherer, S. George, P. Huck\",\n", + "# description=\"Linus Pauling Files from Springer Materials\",\n", + "# url=\"https://materials.springer.com\"\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "strange-living", + "metadata": {}, + "outputs": [], + "source": [ + "# init client and update project info if needed\n", + "client = Client(project=\"springer_materials\")\n", + "# client.make_public() # needs approval\n", + "# client.update_project(update={\"unique_identifiers\": False}) # allow multiple contributions per identifier/mpid\n", + "# client.update_project(update={\"other\": { # functions as a legend for root-level fields\n", + "# \"springer\": \"main info about springer entry\",\n", + "# \"properties\": \"meta data and availability of property entries\",\n", + "# \"phasediagram\": \"meta data about phase diagram entries\"\n", + "# }})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "reverse-label", + "metadata": {}, + "outputs": [], + "source": [ + "# load data\n", + "data_dir = \"/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/springer_materials\"\n", + "\n", + "data = {}\n", + "for p in glob(f\"{data_dir}/*.json\"):\n", + " if not p.endswith(\"_example.json\"):\n", + " with open(p) as f:\n", + " k = p.rsplit(\"/\", 1)[-1]\n", + " data[k] = json.load(f)\n", + "\n", + "keys = set(k for docs in data.values() for doc in docs for k in doc)\n", + "len(data), len(keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "boxed-shade", + "metadata": {}, + "outputs": [], + "source": [ + "# define map for column names and their units\n", + "columns_map = {\n", + " # common fields/columns\n", + " \"Document_ID\": {\"name\": \"springer.id\"},\n", + " \"Document_Title\": {\"name\": \"springer.title\"},\n", + " \"Element_System\": {\"name\": \"springer.chemsys\"},\n", + " \"ISP_Distinct_Solid_Phase\": {\"name\": \"springer.phase\"},\n", + " \"Release_Year\": {\"name\": \"springer.released\", \"unit\": \"\"},\n", + " \"URL\": {\"name\": \"springer.url\"},\n", + " # properties\n", + " \"Prototype\": {\"name\": \"properties.prototype\"},\n", + " \"Pearson_Symbol\": {\"name\": \"properties.pearson\"},\n", + " \"Space_Group_Symbol\": {\"name\": \"properties.spacegroup\"},\n", + " \"Sample_Form\": {\"name\": \"properties.sample\"},\n", + " \"Main_Physical_Property\": {\"name\": \"properties.main\"},\n", + " \"Number_of_DataPoints\": {\"name\": \"properties.stats.datapoints\", \"unit\": \"\"},\n", + " \"Number_of_Samples\": {\"name\": \"properties.stats.samples\", \"unit\": \"\"},\n", + " \"Number_of_References\": {\"name\": \"properties.stats.references\", \"unit\": \"\"},\n", + " # phase diagram\n", + " \"Composition\": {\"name\": \"phasediagram.composition\"},\n", + " \"Temperature\": {\"name\": \"phasediagram.temperature\", \"unit\": \"K\"},\n", + " \"Status_of_Phase_Diagram\": {\"name\": \"phasediagram.status\"}\n", + "}\n", + "\n", + "keys - set(columns_map.keys()) # just making sure I didn't miss a key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fifty-parallel", + "metadata": {}, + "outputs": [], + "source": [ + "# prep contributions\n", + "contributions = []\n", + "prop_set = set()\n", + "special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}\n", + "CLEANR = re.compile('<.*?>') \n", + "\n", + "def convert_prop(s):\n", + " cleaned = \"\".join([c if c.isalnum() else \" \" for c in s])\n", + " capitalized = \"\".join([w.capitalize() for w in cleaned.split()])\n", + " return capitalized.translate(special_char_map)\n", + "\n", + "def cleanhtml(raw_html):\n", + " return re.sub(CLEANR, '', raw_html)\n", + "\n", + "for fn, docs in data.items():\n", + " category = \"-\".join(fn.replace(\".json\", \"\").rsplit(\"_\", 2)[1:])\n", + " print(category)\n", + " for doc in docs:\n", + " identifier = doc[\"MaterialsProject_ID\"]\n", + " formula = doc[\"Molecular_Formula\"]\n", + " properties = [\n", + " convert_prop(prop)\n", + " for prop in sorted(doc[\"List_of_Physical_Properties\"])\n", + " ]\n", + " contrib = {\n", + " \"identifier\": identifier, \"formula\": formula,\n", + " \"data\": {\"springer.category\": category},\n", + " }\n", + " \n", + " if properties:\n", + " prop_set |= set(properties)\n", + " for prop in properties:\n", + " contrib[\"data\"][f\"properties.available.{prop}\"] = \"Yes\"\n", + "\n", + " for k, v in doc.items():\n", + " if v:\n", + " col = columns_map.get(k)\n", + " if col:\n", + " name = col.get(\"name\")\n", + " if name:\n", + " unit = col.get(\"unit\")\n", + " val = \",\".join(v) if isinstance(v, list) else v\n", + " if unit is None and \"<\" in val:\n", + " val = cleanhtml(val)\n", + "\n", + " contrib[\"data\"][name] = f\"{val} {unit}\" if unit else val \n", + " \n", + " contrib[\"data\"] = unflatten(contrib[\"data\"], splitter=\"dot\")\n", + " contributions.append(contrib)\n", + " \n", + "len(contributions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "crude-incident", + "metadata": {}, + "outputs": [], + "source": [ + "# init columns\n", + "columns = {v[\"name\"]: v.get(\"unit\") for v in columns_map.values()}\n", + "columns[\"springer.category\"] = None\n", + "\n", + "for prop in sorted(prop_set):\n", + " columns[f\"properties.available.{prop}\"] = None\n", + " \n", + "client.init_columns(columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "taken-drama", + "metadata": {}, + "outputs": [], + "source": [ + "# submit everything\n", + "client.delete_contributions() # need to delete first due to `unique_identifiers=False`\n", + "client.init_columns(columns) # good practice :)\n", + "client.submit_contributions(contributions)\n", + "client.init_columns(columns) # just to make sure that all columns show up in the intended order" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "digital-approval", + "metadata": {}, + "outputs": [], + "source": [ + "# list of available query parameters for this project\n", + "client._reinit() # might be needed if new data was just submitted\n", + "client.available_query_params(startswith=(\"data__springer__released\", \"formula\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "later-discharge", + "metadata": {}, + "outputs": [], + "source": [ + "# count contributions for query:\n", + "# - \"physical-properties\" category\n", + "# - \"elasticity\" as main property\n", + "# - more than 5 samples\n", + "query = {\n", + " \"data__springer__category__exact\": \"physical-properties\",\n", + " \"data__properties__main__exact\": \"elasticity\",\n", + " \"data__properties__stats__samples__value__gt\": 5\n", + "}\n", + "client.count(query=query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "neither-moore", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve contributions for query and project out Springer ID and spacegroup fields\n", + "fields = [\"id\", \"identifier\", \"data.springer.id\", \"data.properties.spacegroup\"]\n", + "client.query_contributions(query=query, fields=fields)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "proud-certificate", + "metadata": {}, + "outputs": [], + "source": [ + "# get mp-id (and other info if needed) from Springer ID\n", + "springer_id = \"ppp_350781a8aa14dc0b19c6c879daff3be2\"\n", + "client.query_contributions(\n", + " query={\"data__springer__id__exact\": springer_id},\n", + " fields=[\"id\", \"identifier\", \"data.springer.id\", \"data.properties.pearson\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "protecting-guidance", + "metadata": {}, + "outputs": [], + "source": [ + "# count all entries for a list of formulas released before 2023\n", + "client.count(query={\n", + " \"formula__in\": [\"Fe2O3\", \"GaAS\"], \"data__springer__released__value__lt\": 2023\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "detected-pricing", + "metadata": {}, + "outputs": [], + "source": [ + "# get all entries containing all selected properties\n", + "properties = [\"XRayDiffraction\", \"IsotropicDisplacementParameter\", \"AnisotropicDisplacementParameter\"]\n", + "query = {f\"data__properties__available__{prop}__exact\": \"Yes\" for prop in properties}\n", + "\n", + "client.query_contributions(\n", + " query=query,\n", + " fields=[\"id\", \"identifier\", \"data.springer.id\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "august-farming", + "metadata": {}, + "outputs": [], + "source": [ + "# query/code to show Springer URLs and available properties under \"External Links\" on MP Details Page\n", + "query = {\n", + " \"identifier\": \"mp-2534\",\n", + " \"data__springer__category__exact\": \"physical-properties\",\n", + "}\n", + "fields = [\"data.springer.url\", \"data.properties.available\"]\n", + "entries = client.query_contributions(query=query, fields=fields).get(\"data\")\n", + "\n", + "# mimick table\n", + "for entry in entries:\n", + " flat = flatten(entry, reducer=\"dot\")\n", + " url = flat.pop(\"data.springer.url\")\n", + " print(url)\n", + " properties = [k.split(\".\")[-1] for k in flat]\n", + " print(\"\\t\", \", \".join(properties))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}