Skip to content

Commit

Permalink
VaspDoc.get_incar_tags: Use Mediawiki API (#4141)
Browse files Browse the repository at this point in the history
* src/pymatgen/io/vasp/help.py (VaspDoc.get_incar_tags):
Use the MediaWiki API instead of parsing the HTML source directly.  The
old approach is not stable against changes in the tag list because of
the way the URLs are constructed.  Each pagefrom= parameter starts the
listing from a certain tag, which is not guaranteed to yield the complete
tag list once new tags are added before the tag given in pagefrom=.  At
the time of writing this commit, the PRECFOCK tag is already missed by
the old approach.

Following up:  #4119 (comment)
  • Loading branch information
yantar92 authored Oct 29, 2024
1 parent 91a5b65 commit 7719515
Showing 1 changed file with 29 additions and 12 deletions.
41 changes: 29 additions & 12 deletions src/pymatgen/io/vasp/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import json
import re

import requests
Expand Down Expand Up @@ -68,16 +69,32 @@ def get_help(cls, tag: str, fmt: str = "text") -> str:
@classmethod
def get_incar_tags(cls) -> list[str]:
    """Get a list of all INCAR tags from the VASP wiki.

    The tags are the page titles of the members of ``Category:INCAR_tag``,
    fetched through the MediaWiki ``categorymembers`` query rather than by
    scraping the rendered category pages. The API returns at most 500
    members per request, so the query is repeated with the continuation
    parameters the server hands back until no ``continue`` field is left.

    Returns:
        list[str]: Titles of all category members, in the order the API
            returns them.

    Raises:
        requests.HTTPError: If the wiki answers with an HTTP error status.
    """
    # Use Mediawiki API as documented in
    # https://www.vasp.at/wiki/api.php?action=help&modules=query
    # and
    # https://www.vasp.at/wiki/api.php?action=help&modules=query%2Bcategorymembers
    api_url = "https://www.vasp.at/wiki/api.php"
    query = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "Category:INCAR_tag",
        "cmlimit": "500",
        "format": "json",
    }

    tags: list[str] = []
    # Parameters to merge into the next request; empty for the first page.
    continuation: dict[str, str] = {}
    while True:
        # Passing the query via ``params`` lets requests URL-encode the
        # values (continuation tokens may contain characters such as "|").
        response = requests.get(api_url, params={**query, **continuation}, timeout=60)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        data = response.json()

        tags.extend(member["title"] for member in data["query"]["categorymembers"])

        # If there are more items than fit in one response, we will get a
        # 'continue' field whose keys must all be sent back with the next
        # request. See https://www.mediawiki.org/wiki/API:Continue
        if "continue" not in data:
            return tags
        continuation = data["continue"]

0 comments on commit 7719515

Please sign in to comment.