extract author locations from affiliations section of xml
koudyk committed Aug 29, 2023
1 parent ede7402 commit f7be191
Showing 6 changed files with 41,173 additions and 0 deletions.
Empty file.
41,002 changes: 41,002 additions & 0 deletions src/pubextract/author_locations/_data/worldcities.csv

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions src/pubextract/author_locations/_guessing_locations.py
@@ -0,0 +1,84 @@
from pathlib import Path
import re

from unidecode import unidecode
import pandas as pd
import numpy as np
import en_core_web_sm

from pubextract.author_locations import _reading_xml


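# Lookup table of world cities and their countries; the matching below relies
# on the "city_ascii" and "country" columns.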
cities_path = Path(__file__).parent / "_data" / "worldcities.csv"
WC = pd.read_csv(cities_path)
WC = WC.dropna()
COUNTRIES = set(WC["country"])
CITIES = set(WC["city_ascii"])
LOCATIONS = COUNTRIES.union(CITIES)
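# Normalize common affiliation spellings to the country names used in worldcities.csv.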
COUNTRY_MAPPING = {
"UK": "United Kingdom",
"USA": "United States",
"South Korea": "Korea, South",
}


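# Strip punctuation, XML-tag fragments, and generic institution words, and
# transliterate to ASCII, so that mostly place names remain.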
def _preprocess_text(text):
    to_remove = [
        "org/1999",
        "/addr-line",
        "/aff",
        "/Affiliation",
        "University",
        "College",
        "Center",
    ]
    text = re.sub(r'[,.;@#?!&$><:="-]+\ *', " ", text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    text = unidecode(text)
    for item in to_remove:
        text = text.replace(item, "")
    return text


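# Candidate entities are spaCy GPE entities plus every unigram and bigram of
# the affiliation string; only candidates present in the worldcities lookup are kept.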
def _get_entities(article_path):
    aff = _reading_xml._get_first_affiliation(article_path)
    aff = _preprocess_text(aff)
    nlp = en_core_web_sm.load()
    doc = nlp(aff)
    items = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    unigrams = aff.split(" ")
    items = items + unigrams
    for i, unigram in enumerate(unigrams[:-1]):
        bigram = " ".join([unigram, unigrams[i + 1]])
        items.append(bigram)
    entities = [x for x in items if x in LOCATIONS]
    entities = [x.strip() for x in entities]
    entities = list(set(entities))
    return entities


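# Pick the worldcities row whose city and country both appear among the
# entities; if no country was recognized, fall back to a city-only match.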
def _get_location(ents):
    ents = [COUNTRY_MAPPING.get(x, x) for x in ents]
    cities = CITIES.intersection(set(ents))
    countries = COUNTRIES.intersection(set(ents))
    i_ci = WC[WC["city_ascii"].isin(cities)].index
    i_co = WC[WC["country"].isin(countries)].index
    i = i_ci.intersection(i_co)
    if not countries:
        i = i_ci
    if len(i) > 0:
        # the [0] takes the first matching row
        location = WC.loc[i[0]].to_dict()
    else:
        location = np.nan
    return location

# class Locations:
#     def __init__(self, article_path):
#         self.article_path = article_path
#         self.id = _reading_xml._get_id(article_path)
#         self.affiliation = _reading_xml._get_first_affiliation(article_path)
#         # self.tree = _reading._get_tree(article_path)
#         self.entities = self._get_entities()
#         self.locations = self._get_locations()
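
For reference, the two helpers compose as follows on a single article (a minimal sketch; the path and the surrounding snippet are hypothetical, not part of the commit):

# hypothetical usage sketch -- "some_article/article.xml" is a placeholder path
from pubextract.author_locations import _guessing_locations

ents = _guessing_locations._get_entities("some_article/article.xml")
location = _guessing_locations._get_location(ents)  # a worldcities row as a dict, or np.nan
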
37 changes: 37 additions & 0 deletions src/pubextract/author_locations/_pubget.py
@@ -0,0 +1,37 @@
import logging
from pathlib import Path

import pandas as pd

from pubextract.author_locations import _guessing_locations, _reading_xml


_STEP_NAME = "extract_author_locations"
_STEP_DESCRIPTION = "Extract author locations from studies' text."
_LOG = logging.getLogger(_STEP_NAME)


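# Walk a pubget articles directory, guess one location per article from its
# first affiliation, and write the matched worldcities rows to author_locations.csv.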
def _extract_from_articles_dir(articles_dir, output_dir=None):
    if output_dir is None:
        output_dir = articles_dir.parent / "subset_allArticles_authorLocations"
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)
    ids = []
    locations = []
    entss = []
    article_paths = list(articles_dir.glob("**/article.xml"))
    for i_article, article_path in enumerate(article_paths):
        print("Processing article %d/%d" % (i_article + 1, len(article_paths)), end="\r")
        ents = _guessing_locations._get_entities(article_path)
        location = _guessing_locations._get_location(ents)

        if not pd.isna(location):
            ids.append(_reading_xml._get_id(article_path))
            entss.append("; ".join(ents))
            locations.append(location)
    df = pd.DataFrame.from_records(locations)
    df["entities"] = entss
    df["id"] = ids
    df.to_csv(output_dir / "author_locations.csv")
35 changes: 35 additions & 0 deletions src/pubextract/author_locations/_reading_xml.py
@@ -0,0 +1,35 @@
from pathlib import Path
import re
from typing import List, Optional, Union, Tuple, Any, NewType
import dataclasses

from unidecode import unidecode
from lxml import etree
import pandas as pd
import en_core_web_sm


def _get_tree(article_path):
    parser = etree.XMLParser(remove_blank_text=True)
    return etree.parse(article_path, parser)


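# Prefer the PMC id from JATS XML; fall back to the PubMed id (PMID) otherwise.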
def _get_id(article_path):
    tree = _get_tree(article_path)
    try:
        pmcid = tree.find("front/article-meta/article-id[@pub-id-type='pmc']").text
        article_id = "PMC%s" % pmcid
    except AttributeError:
        pmid = tree.xpath("//PMID/text()")[0]
        article_id = "Pubmed%s" % pmid
    return article_id


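# Return the first <aff> (PMC/JATS) or <Affiliation> (PubMed) element,
# serialized as a unicode string; parsing stops at the first match.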
def _get_first_affiliation(article_path):
    aff = ""
    for event, element in etree.iterparse(article_path):
        if element.tag == "aff" or element.tag == "Affiliation":
            aff = etree.tostring(element, with_tail=False, encoding="unicode")
        if aff:
            break
    return aff
15 changes: 15 additions & 0 deletions src/pubextract/author_locations/test.py
@@ -0,0 +1,15 @@
from pathlib import Path

from pubextract.author_locations import _pubget


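# Point this at a local pubget download; the parents[5] hop and the query
# folder name are specific to this repository's local data layout.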
articles_dir = (
    Path(__file__).resolve().parents[5]
    / "data"
    / "pubget_data"
    / "review-neuro-meta-analyses_2023-06-29"
    / "query_a84b639ed7c2cc2d04c773db7c22905d"
    / "articles"
)

_pubget._extract_from_articles_dir(articles_dir)
