From db3ad351febfbff116d280e9ce6e0a36a96e7d27 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Tue, 3 Sep 2024 12:10:26 +0200 Subject: [PATCH 1/2] #134 Parse version_id and version_date in parse_xml_web() --- pubmed_parser/pubmed_web_parser.py | 11 ++++++++++- tests/test_pubmed_web_parser.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pubmed_parser/pubmed_web_parser.py b/pubmed_parser/pubmed_web_parser.py index 8e06c0b..c26835c 100644 --- a/pubmed_parser/pubmed_web_parser.py +++ b/pubmed_parser/pubmed_web_parser.py @@ -149,6 +149,13 @@ def parse_pubmed_web_tree(tree): language = language[0].text except IndexError: language = None + + medline_citation = tree.xpath('//medlinecitation') + try: + version_id = medline_citation[0].attrib.get('versionid') + version_date = medline_citation[0].attrib.get('versiondate') + except IndexError: + version_id, version_date = None, None dict_out = { "title": title, @@ -160,7 +167,9 @@ def parse_pubmed_web_tree(tree): "doi": doi, "pii": pii, "year": year, - "language": language + "language": language, + "version_id": version_id, + "version_date": version_date, } return dict_out diff --git a/tests/test_pubmed_web_parser.py b/tests/test_pubmed_web_parser.py index 120fe71..b01db07 100644 --- a/tests/test_pubmed_web_parser.py +++ b/tests/test_pubmed_web_parser.py @@ -19,6 +19,8 @@ def test_pubmed_web_parser_all_fields_content(): "year": "2024", "language": "ger", "pmid": "38218666", + "version_id": None, + "version_date": None, }, 23340801: { "title": "E. coli as an all-rounder: the thin line between commensalism and pathogenicity.", @@ -32,6 +34,8 @@ def test_pubmed_web_parser_all_fields_content(): "year": "2013", "language": "eng", "pmid": "23340801", + "version_id": None, + "version_date": None, }, } @@ -54,6 +58,8 @@ def test_pubmed_web_parser_all_fields_existence(): "pii", "year", "language", + "version_id", + "version_date", "pmid", ] pubmed_dict = pp.parse_xml_web(random_id, save_xml=False) @@ -80,3 +86,14 @@ def test_pii(): """Test the correct parsing of the pii.""" pubmed_dict = pp.parse_xml_web("32145645", save_xml=False) assert pubmed_dict['pii'] == "S0223-5234(20)30153-7" + + +def test_version(): + """Test the correct parsing of the version.""" + xml_20029612 = pp.parse_xml_web('20029612') + assert xml_20029612['version_id'] == '4' + assert xml_20029612['version_date'] == '2011/01/03' + + xml_21113338 = pp.parse_xml_web('21113338') + assert xml_21113338['version_id'] == '3' + assert xml_21113338['version_date'] is None From d50f670dbae31c929c015cd3c533150d8ae4b779 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Wed, 4 Sep 2024 09:35:48 +0200 Subject: [PATCH 2/2] #134 Requested changes: Update docstrings --- pubmed_parser/pubmed_web_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pubmed_parser/pubmed_web_parser.py b/pubmed_parser/pubmed_web_parser.py index c26835c..6227b12 100644 --- a/pubmed_parser/pubmed_web_parser.py +++ b/pubmed_parser/pubmed_web_parser.py @@ -60,7 +60,7 @@ def parse_pubmed_web_tree(tree): 'title', 'abstract', 'journal', 'affliation' (string of affiliation with ';' separated), 'authors' (string with ';' separated), 'keywords' (keywords and MeSH terms from an XML -- if MeSH term it will be 'MeSH descriptor':'MeSH name') - 'doi', 'year' + 'doi', 'pii', 'year', 'language', 'version_id', 'version_date' """ if len(tree.xpath("//articletitle")) != 0: title = " ".join([title.text for title in tree.xpath("//articletitle")]) @@ -208,6 +208,8 @@ def parse_xml_web(pmid, sleep=None, save_xml=False): 'keywords': 'D000818:Animals;D005075:Biological Evolution;...', 'doi': '10.1126/science.1060852', 'year': '2001', + 'version_id': None, + 'version_date': None, 'pmid': '11360989' } """