VaspDoc.get_incar_tags: Use Mediawiki API (#4141)

* src/pymatgen/io/vasp/help.py (VaspDoc.get_incar_tags): Use Mediawiki API instead of parsing the HTML source directly. The old approach is not stable against changes in the tag list because of the way URLs are constructed. pagefrom= parameters start from certain tag, which is not guaranteed to provide the complete tag list as the new tags are added before that tag given in pagefrom=. At the moment of writing this commit, PRECFOCK tag is already missed using the old approach. Following up: #4119 (comment)
materialsproject · Oct 29, 2024 · 7719515 · 7719515
1 parent 91a5b65
commit 7719515
Showing 1 changed file with 29 additions and 12 deletions.
diff --git a/src/pymatgen/io/vasp/help.py b/src/pymatgen/io/vasp/help.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import re
 
 import requests
@@ -68,16 +69,32 @@ def get_help(cls, tag: str, fmt: str = "text") -> str:
     @classmethod
     def get_incar_tags(cls) -> list[str]:
         """Get a list of all INCAR tags from the VASP wiki."""
-        tags = []
-        for url in (
-            "https://www.vasp.at/wiki/index.php/Category:INCAR_tag",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=LREAL#mw-pages",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=Profiling#mw-pages",
-        ):
-            response = requests.get(url, timeout=60)
-            soup = BeautifulSoup(response.text, features="html.parser")
-            for div in soup.findAll("div", {"class": "mw-category-group"}):
-                children = div.findChildren("li")
-                for child in children:
-                    tags.append(child.text.strip())
+        # Use Mediawiki API as documented in
+        # https://www.vasp.at/wiki/api.php?action=help&modules=query
+        url = (
+            "https://www.vasp.at/wiki/api.php?"
+            "action=query&list=categorymembers"
+            "&cmtitle=Category:INCAR_tag"
+            "&cmlimit=500&format=json"
+        )
+        response = requests.get(url, timeout=60)
+        response_dict = json.loads(response.text)
+
+        def extract_titles(data):
+            """Extract keywords from from Wikimedia response data.
+            See https://www.vasp.at/wiki/api.php?action=help&modules=query%2Bcategorymembers
+            Returns: List of keywords as strings.
+            """
+            return [category_data["title"] for category_data in data["query"]["categorymembers"]]
+
+        tags = extract_titles(response_dict)
+
+        # If there are more than 500 items in the response, we will
+        # get 'continue' field in the response
+        # See https://www.mediawiki.org/wiki/API:Continue
+        while "continue" in response_dict:
+            response = requests.get(url + f"&cmcontinue={response_dict['continue']['cmcontinue']}", timeout=60)
+            response_dict = json.loads(response.text)
+            tags = tags + extract_titles(response_dict)
+
         return tags