extract author locations from affiliations section of xml
koudyk committed Aug 29, 2023
1 parent ede7402 commit f7be191
Showing 6 changed files with 41,173 additions and 0 deletions.
Empty file.
41,002 changes: 41,002 additions & 0 deletions src/pubextract/author_locations/_data/worldcities.csv

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions src/pubextract/author_locations/_guessing_locations.py
@@ -0,0 +1,84 @@
from pathlib import Path
import re

from unidecode import unidecode
import pandas as pd
import numpy as np
import en_core_web_sm

from pubextract.author_locations import _reading_xml


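# Lookup table of world cities and their countries; the matching below relies
# on the "city_ascii" and "country" columns.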
cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
WC = pd.read_csv(cities_path)
WC = WC.dropna()
COUNTRIES = set(WC["country"])
CITIES = set(WC["city_ascii"])
LOCATIONS = COUNTRIES.union(CITIES)
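# Normalize common affiliation spellings to the country names used in worldcities.csv.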
COUNTRY_MAPPING = {
"UK": "United Kingdom",
"USA": "United States",
"South Korea": "Korea, South",
}


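# Strip punctuation, XML-tag fragments, and generic institution words, and
# transliterate to ASCII, so that mostly place names remain.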
def _preprocess_text(text):
    to_remove = [
        "org/1999",
        "/addr-line",
        "/aff",
        "/Affiliation",
        "University",
        "College",
        "Center",
    ]
    text = re.sub(r'[,.;@#?!&$><:="-]+\ *', " ", text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    text = unidecode(text)
    for item in to_remove:
        text = text.replace(item, "")
    return text


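# Candidate entities are spaCy GPE entities plus every unigram and bigram of
# the affiliation string; only candidates present in the worldcities lookup are kept.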
def _get_entities(article_path):
    aff = _reading_xml._get_first_affiliation(article_path)
    aff = _preprocess_text(aff)
    nlp = en_core_web_sm.load()
    doc = nlp(aff)
    items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    unigrams = aff.split(" ")
    items = items + unigrams
    for i, unigram in enumerate(unigrams[:-1]):
        bigram = " ".join([unigram, unigrams[i + 1]])
        items.append(bigram)
    entities = [x for x in items if x in LOCATIONS]
    entities = [x.strip() for x in entities]
    entities = list(set(entities))
    return entities


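# Pick the worldcities row whose city and country both appear among the
# entities; if no country was recognized, fall back to a city-only match.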
def _get_location(ents):
    ents = [COUNTRY_MAPPING.get(x, x) for x in ents]
    cities = CITIES.intersection(set(ents))
    countries = COUNTRIES.intersection(set(ents))
    i_ci = WC[WC["city_ascii"].isin(cities)].index
    i_co = WC[WC["country"].isin(countries)].index
    i = i_ci.intersection(i_co)
    if not countries:
        i = i_ci
    if len(i) > 0:
        # the [0] takes the first matching row
        location = WC.loc[i[0]].to_dict()
    else:
        location = np.nan
    return location

# class Locations:
#     def __init__(self, article_path):
#         self.article_path = article_path
#         self.id = _reading_xml._get_id(article_path)
#         self.affiliation = _reading_xml._get_first_affiliation(article_path)
#         # self.tree = _reading._get_tree(article_path)
#         self.entities = self._get_entities()
#         self.locations = self._get_locations()
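
For reference, the two helpers compose as follows on a single article (a minimal sketch; the path and the surrounding snippet are hypothetical, not part of the commit):

# hypothetical usage sketch -- "some_article/article.xml" is a placeholder path
from pubextract.author_locations import _guessing_locations

ents = _guessing_locations._get_entities("some_article/article.xml")
location = _guessing_locations._get_location(ents)  # a worldcities row as a dict, or np.nan
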
37 changes: 37 additions & 0 deletions src/pubextract/author_locations/_pubget.py
@@ -0,0 +1,37 @@
import logging
from pathlib import Path

import pandas as pd

from pubextract.author_locations import _guessing_locations, _reading_xml


_STEP_NAME = "extract_author_locations"
_STEP_DESCRIPTION = "Extract author locations from studies' text."
_LOG = logging.getLogger(_STEP_NAME)


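# Walk a pubget articles directory, guess one location per article from its
# first affiliation, and write the matched worldcities rows to author_locations.csv.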
def _extract_from_articles_dir(articles_dir, output_dir=None):
    if output_dir is None:
        output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)
    ids = []
    locations = []
    entss = []
    article_paths = list(articles_dir.glob("**/article.xml"))
    for i_article, article_path in enumerate(article_paths):
        print("Processing article %d/%d" % (i_article + 1, len(article_paths)), end="\r")
        ents = _guessing_locations._get_entities(article_path)
        location = _guessing_locations._get_location(ents)

        if not pd.isna(location):
            ids.append(_reading_xml._get_id(article_path))
            entss.append("; ".join(ents))
            locations.append(location)
    df = pd.DataFrame.from_records(locations)
    df["entities"] = entss
    df["id"] = ids
    df.to_csv(output_dir / "author_locations.csv")
35 changes: 35 additions & 0 deletions src/pubextract/author_locations/_reading_xml.py
@@ -0,0 +1,35 @@
from pathlib import Path
import re
from typing import List, Optional, Union, Tuple, Any, NewType
import dataclasses

from unidecode import unidecode
from lxml import etree
import pandas as pd
import en_core_web_sm


def _get_tree(article_path):
    parser = etree.XMLParser(remove_blank_text=True)
    return etree.parse(article_path, parser)


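# Prefer the PMC id from JATS XML; fall back to the PubMed id (PMID) otherwise.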
def _get_id(article_path):
    tree = _get_tree(article_path)
    try:
        pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
        article_id = "PMC%s" % pmcid
    except AttributeError:
        pmid = tree.xpath("//PMID/text()")[0]
        article_id = "Pubmed%s" % pmid
    return article_id


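# Return the first <aff> (PMC/JATS) or <Affiliation> (PubMed) element,
# serialized as a unicode string; parsing stops at the first match.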
def _get_first_affiliation(article_path):
    aff = ""
    for event, element in etree.iterparse(article_path):
        if element.tag == "aff" or element.tag == "Affiliation":
            aff = etree.tostring(element, with_tail=False, encoding="unicode")
        if aff:
            break
    return aff
15 changes: 15 additions & 0 deletions src/pubextract/author_locations/test.py
@@ -0,0 +1,15 @@
from pathlib import Path

from pubextract.author_locations import _pubget


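# Point this at a local pubget download; the parents[5] hop and the query
# folder name are specific to this repository's local data layout.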
articles_dir = (
    Path(__file__).resolve().parents[5]
    / "data"
    / "pubget_data"
    / "review-neuro-meta-analyses_2023-06-29"
    / "query_a84b639ed7c2cc2d04c773db7c22905d"
    / "articles"
)

_pubget._extract_from_articles_dir(articles_dir)
