Skip to content

Commit

Permalink
HDXDSYS-926 add option to convert xlsx to csv before processing (#33)
Browse files Browse the repository at this point in the history
* Add xlsx2csv option with test

* Remove fill_merged_cells if present when using xlsx2csv

* Test sheet id and name with xlsx2csv
  • Loading branch information
mcarans authored Jul 29, 2024
1 parent 253025c commit 79a641f
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 9 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies = [
"ratelimit",
"requests-file",
"ruamel.yaml",
"xlsx2csv",
]
dynamic = ["version"]

Expand Down
18 changes: 10 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
coverage==7.5.4
coverage==7.6.0
# via pytest-cov
distlib==0.3.8
# via virtualenv
Expand All @@ -36,9 +36,9 @@ frictionless==5.17.0
# via hdx-python-utilities (pyproject.toml)
html5lib==1.1
# via hdx-python-utilities (pyproject.toml)
humanize==4.9.0
humanize==4.10.0
# via frictionless
identify==2.5.36
identify==2.6.0
# via pre-commit
idna==3.7
# via
Expand All @@ -54,7 +54,7 @@ jinja2==3.1.4
# via frictionless
jsonlines==4.0.0
# via hdx-python-utilities (pyproject.toml)
jsonschema==4.22.0
jsonschema==4.23.0
# via
# frictionless
# tableschema-to-template
Expand Down Expand Up @@ -84,15 +84,15 @@ platformdirs==4.2.2
# via virtualenv
pluggy==1.5.0
# via pytest
pre-commit==3.7.1
pre-commit==3.8.0
# via hdx-python-utilities (pyproject.toml)
pydantic==2.8.2
# via frictionless
pydantic-core==2.20.1
# via pydantic
pygments==2.18.0
# via rich
pytest==8.2.2
pytest==8.3.2
# via
# hdx-python-utilities (pyproject.toml)
# pytest-cov
Expand Down Expand Up @@ -127,7 +127,7 @@ rfc3986==2.0.0
# via frictionless
rich==13.7.1
# via typer
rpds-py==0.18.1
rpds-py==0.19.1
# via
# jsonschema
# referencing
Expand Down Expand Up @@ -164,14 +164,16 @@ typing-extensions==4.12.2
# typer
urllib3==2.2.2
# via requests
validators==0.30.0
validators==0.33.0
# via frictionless
virtualenv==20.26.3
# via pre-commit
webencodings==0.5.1
# via html5lib
xlrd==2.0.1
# via hdx-python-utilities (pyproject.toml)
xlsx2csv==0.8.3
# via hdx-python-utilities (pyproject.toml)
xlsxwriter==3.2.0
# via tableschema-to-template
xlwt==1.3.0
Expand Down
19 changes: 19 additions & 0 deletions src/hdx/utilities/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ratelimit import RateLimitDecorator, sleep_and_retry
from requests import Request
from ruamel.yaml import YAML
from xlsx2csv import Xlsx2csv

from .base_downloader import BaseDownload, DownloadError
from .frictionless_wrapper import get_frictionless_tableresource
Expand Down Expand Up @@ -669,6 +670,7 @@ def _get_tabular_rows(
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
xlsx2csv (bool): Whether to convert xlsx files to csv before processing. Defaults to False.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
Expand All @@ -690,6 +692,20 @@ def _get_tabular_rows(
"""
if headers is None:
raise DownloadError("Argument headers cannot be None!")
xlsx2csv = kwargs.pop("xlsx2csv", False)
if xlsx2csv:
path = self.download_file(url)
outpath = path.replace(".xlsx", ".csv")
sheet = kwargs.pop("sheet", 1)
if isinstance(sheet, int):
sheet_args = {"sheetid": sheet}
else:
sheet_args = {"sheetname": sheet}
Xlsx2csv(path).convert(outpath, **sheet_args)
url = outpath
kwargs["format"] = "csv" # format takes precedence over file_type
kwargs.pop("fill_merged_cells", None)

resource = self.get_frictionless_tableresource(
url,
ignore_blank_rows=ignore_blank_rows,
Expand Down Expand Up @@ -771,6 +787,7 @@ def get_tabular_rows(
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
xlsx2csv (bool): Whether to convert xlsx files to csv before processing. Defaults to False.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
Expand Down Expand Up @@ -875,6 +892,7 @@ def get_tabular_rows_as_list(
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
xlsx2csv (bool): Whether to convert xlsx files to csv before processing. Defaults to False.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
Expand Down Expand Up @@ -950,6 +968,7 @@ def get_tabular_rows_as_dict(
**kwargs:
format (Optional[str]): Type of file. Defaults to inferring.
file_type (Optional[str]): Type of file. Defaults to inferring.
xlsx2csv (bool): Whether to convert xlsx files to csv before processing. Defaults to False.
encoding (Optional[str]): Type of encoding. Defaults to inferring.
compression (Optional[str]): Type of compression. Defaults to inferring.
delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
Expand Down
2 changes: 1 addition & 1 deletion src/hdx/utilities/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def get_tabular_rows(
filename (Optional[str]): Filename of saved file. Defaults to getting from url.
logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
fallback (bool): Whether to use static fallback if download fails. Defaults to False.
**kwargs: Parameters to pass to download_file call
**kwargs: Parameters to pass to download_file and get_tabular_rows calls
Returns:
Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)
Expand Down
76 changes: 76 additions & 0 deletions tests/hdx/utilities/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,82 @@ def test_get_tabular_rows_xlsx(self, fixturexlsxurl):
"Tulkarm",
]

def test_get_tabular_rows_xlsx2csv(self, fixtureurlexcel):
    """Read an xlsx fixture through the xlsx2csv conversion option.

    The fixture is fetched twice with ``xlsx2csv=True``: first without a
    ``sheet`` argument and then with the sheet selected by name
    ("test_data"). Each read is checked against the same expected headers
    and the same expected first row.
    """
    want_headers = [
        "GWNO",
        "EVENT_ID_CNTY",
        "EVENT_ID_NO_CNTY",
        "EVENT_DATE",
        "YEAR",
        "TIME_PRECISION",
        "EVENT_TYPE",
        "ACTOR1",
        "ALLY_ACTOR_1",
        "INTER1",
        "ACTOR2",
        "ALLY_ACTOR_2",
        "INTER2",
        "INTERACTION",
        "COUNTRY",
        "ADMIN1",
        "ADMIN2",
        "ADMIN3",
        "LOCATION",
        "LATITUDE",
        "LONGITUDE",
        "GEO_PRECISION",
        "SOURCE",
        "NOTES",
        "FATALITIES",
    ]

    # Every populated cell is compared as a string; empty cells as None.
    want_first_row = [
        "615",
        "1416RTA",
        None,
        "18/04/2001",
        "2001",
        "1",
        "Violence against civilians",
        "Police Forces of Algeria (1999-)",
        None,
        "1",
        "Civilians (Algeria)",
        "Berber Ethnic Group (Algeria)",
        "7",
        "17",
        "Algeria",
        "Tizi Ouzou",
        "Beni-Douala",
        None,
        "Beni Douala",
        "36.61954",
        "4.08282",
        "1",
        "Associated Press Online",
        "A Berber student was shot while in police custody at a police station in "
        "Beni Douala. He later died on Apr.21.",
        "1",
    ]

    # First pass passes no sheet argument; second pass picks the sheet by name.
    sheet_variants = ({}, {"sheet": "test_data"})

    with Download() as downloader:
        for sheet_kwargs in sheet_variants:
            headers, iterator = downloader.get_tabular_rows(
                fixtureurlexcel,
                format="xlsx",
                xlsx2csv=True,
                **sheet_kwargs,
            )
            assert headers == want_headers
            rows = list(iterator)
            assert rows[0] == want_first_row

def test_get_tabular_rows_json(self, fixturejsonurl):
with Download() as downloader:
headers, iterator = downloader.get_tabular_rows(
Expand Down

0 comments on commit 79a641f

Please sign in to comment.