From 77195153effda5dc97e28a47507ab64a52246ffb Mon Sep 17 00:00:00 2001
From: Ihor Radchenko
Date: Tue, 29 Oct 2024 01:05:56 +0100
Subject: [PATCH] VaspDoc.get_incar_tags: Use Mediawiki API (#4141)

* src/pymatgen/io/vasp/help.py (VaspDoc.get_incar_tags): Use the
MediaWiki API instead of parsing the HTML source directly.

The old approach is not stable against changes in the tag list because
of the way the URLs are constructed.  Each pagefrom= parameter starts
the listing from a certain tag, which is not guaranteed to provide the
complete tag list, as new tags may be added before the tag given in
pagefrom=.  At the moment of writing this commit, the PRECFOCK tag is
already missed by the old approach.

Result pages are followed through the API's "continue" mechanism, and
HTTP errors are raised instead of being parsed as JSON.

Following up: https://github.com/materialsproject/pymatgen/issues/4119#issuecomment-2439617604-permalink
---
 src/pymatgen/io/vasp/help.py | 55 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/src/pymatgen/io/vasp/help.py b/src/pymatgen/io/vasp/help.py
index 8491c65ec6b..79ff183e6ce 100644
--- a/src/pymatgen/io/vasp/help.py
+++ b/src/pymatgen/io/vasp/help.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import re
 
 import requests
@@ -68,16 +69,44 @@ def get_help(cls, tag: str, fmt: str = "text") -> str:
     @classmethod
     def get_incar_tags(cls) -> list[str]:
-        """Get a list of all INCAR tags from the VASP wiki."""
-        tags = []
-        for url in (
-            "https://www.vasp.at/wiki/index.php/Category:INCAR_tag",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=LREAL#mw-pages",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=Profiling#mw-pages",
-        ):
-            response = requests.get(url, timeout=60)
-            soup = BeautifulSoup(response.text, features="html.parser")
-            for div in soup.findAll("div", {"class": "mw-category-group"}):
-                children = div.findChildren("li")
-                for child in children:
-                    tags.append(child.text.strip())
+        """Get a list of all INCAR tags from the VASP wiki.
+
+        The tags are the page titles listed in the wiki's
+        ``Category:INCAR_tag``, queried through the MediaWiki API.
+
+        Returns:
+            list[str]: Titles of all category members, in the order
+                returned by the API.
+
+        Raises:
+            requests.HTTPError: If the wiki answers with an HTTP error status.
+        """
+        # Use the MediaWiki API as documented in
+        # https://www.vasp.at/wiki/api.php?action=help&modules=query
+        url = "https://www.vasp.at/wiki/api.php"
+        params = {
+            "action": "query",
+            "list": "categorymembers",
+            "cmtitle": "Category:INCAR_tag",
+            "cmlimit": "500",
+            "format": "json",
+        }
+
+        tags: list[str] = []
+        while True:
+            response = requests.get(url, params=params, timeout=60)
+            # Fail loudly instead of trying to parse an error page as JSON.
+            response.raise_for_status()
+            response_dict = json.loads(response.text)
+            # Every category member carries its page name under "title", see
+            # https://www.vasp.at/wiki/api.php?action=help&modules=query%2Bcategorymembers
+            tags.extend(member["title"] for member in response_dict["query"]["categorymembers"])
+
+            # At most ``cmlimit`` items are returned per request. If more are
+            # available, the response has a "continue" field whose values
+            # must all be sent back with the next request.
+            # See https://www.mediawiki.org/wiki/API:Continue
+            if "continue" not in response_dict:
+                break
+            params.update(response_dict["continue"])
+
         return tags