From 77195153effda5dc97e28a47507ab64a52246ffb Mon Sep 17 00:00:00 2001
From: Ihor Radchenko <yantar92@posteo.net>
Date: Tue, 29 Oct 2024 01:05:56 +0100
Subject: [PATCH] VaspDoc.get_incar_tags: Use Mediawiki API (#4141)

* src/pymatgen/io/vasp/help.py (VaspDoc.get_incar_tags):
Use the MediaWiki API instead of parsing the HTML source directly.  The
old approach is not stable against changes in the tag list because of
the way its URLs are constructed.  Each pagefrom= parameter starts the
listing at a specific tag, which does not guarantee a complete tag list
once new tags are added before the tag given in pagefrom=.  At the time
of writing this commit, the PRECFOCK tag is already missed by the old
approach.

Following up:  https://github.com/materialsproject/pymatgen/issues/4119#issuecomment-2439617604-permalink
---
 src/pymatgen/io/vasp/help.py | 41 +++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/pymatgen/io/vasp/help.py b/src/pymatgen/io/vasp/help.py
index 8491c65ec6b..79ff183e6ce 100644
--- a/src/pymatgen/io/vasp/help.py
+++ b/src/pymatgen/io/vasp/help.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import re
 
 import requests
@@ -68,16 +69,32 @@ def get_help(cls, tag: str, fmt: str = "text") -> str:
     @classmethod
     def get_incar_tags(cls) -> list[str]:
         """Get a list of all INCAR tags from the VASP wiki."""
-        tags = []
-        for url in (
-            "https://www.vasp.at/wiki/index.php/Category:INCAR_tag",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=LREAL#mw-pages",
-            "https://www.vasp.at/wiki/index.php?title=Category:INCAR_tag&pagefrom=Profiling#mw-pages",
-        ):
-            response = requests.get(url, timeout=60)
-            soup = BeautifulSoup(response.text, features="html.parser")
-            for div in soup.findAll("div", {"class": "mw-category-group"}):
-                children = div.findChildren("li")
-                for child in children:
-                    tags.append(child.text.strip())
+        # Use Mediawiki API as documented in
+        # https://www.vasp.at/wiki/api.php?action=help&modules=query
+        url = (
+            "https://www.vasp.at/wiki/api.php?"
+            "action=query&list=categorymembers"
+            "&cmtitle=Category:INCAR_tag"
+            "&cmlimit=500&format=json"
+        )
+        response = requests.get(url, timeout=60)
+        response_dict = json.loads(response.text)
+
+        def extract_titles(data):
+            """Extract keywords from MediaWiki response data.
+            See https://www.vasp.at/wiki/api.php?action=help&modules=query%2Bcategorymembers
+            Returns: List of keywords as strings.
+            """
+            return [category_data["title"] for category_data in data["query"]["categorymembers"]]
+
+        tags = extract_titles(response_dict)
+
+        # If the category has more than 500 members, the response
+        # carries a 'continue' field with the token for the next batch.
+        # See https://www.mediawiki.org/wiki/API:Continue
+        while "continue" in response_dict:
+            response = requests.get(url + f"&cmcontinue={response_dict['continue']['cmcontinue']}", timeout=60)
+            response_dict = json.loads(response.text)
+            tags = tags + extract_titles(response_dict)
+
         return tags