HDXDSYS-926 add option to convert xlsx to csv before processing (#33)

* Add xlsx2csv option with test
* Remove fill_merged_cells if present when using xlsx2csv
* Test sheet id and name with xlsx2csv
OCHA-DAP · Jul 29, 2024 · 79a641f · 79a641f
1 parent 253025c
commit 79a641f
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 9 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     "ratelimit",
     "requests-file",
     "ruamel.yaml",
+    "xlsx2csv",
 ]
 dynamic = ["version"]
 

diff --git a/requirements.txt b/requirements.txt
@@ -20,7 +20,7 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via typer
-coverage==7.5.4
+coverage==7.6.0
     # via pytest-cov
 distlib==0.3.8
     # via virtualenv
@@ -36,9 +36,9 @@ frictionless==5.17.0
     # via hdx-python-utilities (pyproject.toml)
 html5lib==1.1
     # via hdx-python-utilities (pyproject.toml)
-humanize==4.9.0
+humanize==4.10.0
     # via frictionless
-identify==2.5.36
+identify==2.6.0
     # via pre-commit
 idna==3.7
     # via
@@ -54,7 +54,7 @@ jinja2==3.1.4
     # via frictionless
 jsonlines==4.0.0
     # via hdx-python-utilities (pyproject.toml)
-jsonschema==4.22.0
+jsonschema==4.23.0
     # via
     #   frictionless
     #   tableschema-to-template
@@ -84,15 +84,15 @@ platformdirs==4.2.2
     # via virtualenv
 pluggy==1.5.0
     # via pytest
-pre-commit==3.7.1
+pre-commit==3.8.0
     # via hdx-python-utilities (pyproject.toml)
 pydantic==2.8.2
     # via frictionless
 pydantic-core==2.20.1
     # via pydantic
 pygments==2.18.0
     # via rich
-pytest==8.2.2
+pytest==8.3.2
     # via
     #   hdx-python-utilities (pyproject.toml)
     #   pytest-cov
@@ -127,7 +127,7 @@ rfc3986==2.0.0
     # via frictionless
 rich==13.7.1
     # via typer
-rpds-py==0.18.1
+rpds-py==0.19.1
     # via
     #   jsonschema
     #   referencing
@@ -164,14 +164,16 @@ typing-extensions==4.12.2
     #   typer
 urllib3==2.2.2
     # via requests
-validators==0.30.0
+validators==0.33.0
     # via frictionless
 virtualenv==20.26.3
     # via pre-commit
 webencodings==0.5.1
     # via html5lib
 xlrd==2.0.1
     # via hdx-python-utilities (pyproject.toml)
+xlsx2csv==0.8.3
+    # via hdx-python-utilities (pyproject.toml)
 xlsxwriter==3.2.0
     # via tableschema-to-template
 xlwt==1.3.0

diff --git a/src/hdx/utilities/downloader.py b/src/hdx/utilities/downloader.py
@@ -15,6 +15,7 @@
 from ratelimit import RateLimitDecorator, sleep_and_retry
 from requests import Request
 from ruamel.yaml import YAML
+from xlsx2csv import Xlsx2csv
 
 from .base_downloader import BaseDownload, DownloadError
 from .frictionless_wrapper import get_frictionless_tableresource
@@ -669,6 +670,7 @@ def _get_tabular_rows(
             **kwargs:
             format (Optional[str]): Type of file. Defaults to inferring.
             file_type (Optional[str]): Type of file. Defaults to inferring.
+            xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
             encoding (Optional[str]): Type of encoding. Defaults to inferring.
             compression (Optional[str]): Type of compression. Defaults to inferring.
             delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -690,6 +692,20 @@ def _get_tabular_rows(
         """
         if headers is None:
             raise DownloadError("Argument headers cannot be None!")
+        xlsx2csv = kwargs.pop("xlsx2csv", False)
+        if xlsx2csv:
+            path = self.download_file(url)
+            outpath = path.replace(".xlsx", ".csv")
+            sheet = kwargs.pop("sheet", 1)
+            if isinstance(sheet, int):
+                sheet_args = {"sheetid": sheet}
+            else:
+                sheet_args = {"sheetname": sheet}
+            Xlsx2csv(path).convert(outpath, **sheet_args)
+            url = outpath
+            kwargs["format"] = "csv"  # format takes precedence over file_type
+            kwargs.pop("fill_merged_cells", None)
+
         resource = self.get_frictionless_tableresource(
             url,
             ignore_blank_rows=ignore_blank_rows,
@@ -771,6 +787,7 @@ def get_tabular_rows(
             **kwargs:
             format (Optional[str]): Type of file. Defaults to inferring.
             file_type (Optional[str]): Type of file. Defaults to inferring.
+            xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
             encoding (Optional[str]): Type of encoding. Defaults to inferring.
             compression (Optional[str]): Type of compression. Defaults to inferring.
             delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -875,6 +892,7 @@ def get_tabular_rows_as_list(
             **kwargs:
             format (Optional[str]): Type of file. Defaults to inferring.
             file_type (Optional[str]): Type of file. Defaults to inferring.
+            xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
             encoding (Optional[str]): Type of encoding. Defaults to inferring.
             compression (Optional[str]): Type of compression. Defaults to inferring.
             delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.
@@ -950,6 +968,7 @@ def get_tabular_rows_as_dict(
             **kwargs:
             format (Optional[str]): Type of file. Defaults to inferring.
             file_type (Optional[str]): Type of file. Defaults to inferring.
+            xlsx2csv (bool): Whether to convert xlsx files. Defaults to False.
             encoding (Optional[str]): Type of encoding. Defaults to inferring.
             compression (Optional[str]): Type of compression. Defaults to inferring.
             delimiter (Optional[str]): Delimiter for values in csv rows. Defaults to inferring.

diff --git a/src/hdx/utilities/retriever.py b/src/hdx/utilities/retriever.py
@@ -418,7 +418,7 @@ def get_tabular_rows(
             filename (Optional[str]): Filename of saved file. Defaults to getting from url.
             logstr (Optional[str]): Text to use in log string to describe download. Defaults to filename.
             fallback (bool): Whether to use static fallback if download fails. Defaults to False.
-            **kwargs: Parameters to pass to download_file call
+            **kwargs: Parameters to pass to download_file and get_tabular_rows calls
 
         Returns:
             Tuple[List[str],Iterator[ListDict]]: Tuple (headers, iterator where each row is a list or dictionary)

diff --git a/tests/hdx/utilities/test_downloader.py b/tests/hdx/utilities/test_downloader.py
@@ -1220,6 +1220,82 @@ def test_get_tabular_rows_xlsx(self, fixturexlsxurl):
                 "Tulkarm",
             ]
 
+    def test_get_tabular_rows_xlsx2csv(self, fixtureurlexcel):
+        expected_headers = [
+            "GWNO",
+            "EVENT_ID_CNTY",
+            "EVENT_ID_NO_CNTY",
+            "EVENT_DATE",
+            "YEAR",
+            "TIME_PRECISION",
+            "EVENT_TYPE",
+            "ACTOR1",
+            "ALLY_ACTOR_1",
+            "INTER1",
+            "ACTOR2",
+            "ALLY_ACTOR_2",
+            "INTER2",
+            "INTERACTION",
+            "COUNTRY",
+            "ADMIN1",
+            "ADMIN2",
+            "ADMIN3",
+            "LOCATION",
+            "LATITUDE",
+            "LONGITUDE",
+            "GEO_PRECISION",
+            "SOURCE",
+            "NOTES",
+            "FATALITIES",
+        ]
+
+        expected_row = [
+            "615",
+            "1416RTA",
+            None,
+            "18/04/2001",
+            "2001",
+            "1",
+            "Violence against civilians",
+            "Police Forces of Algeria (1999-)",
+            None,
+            "1",
+            "Civilians (Algeria)",
+            "Berber Ethnic Group (Algeria)",
+            "7",
+            "17",
+            "Algeria",
+            "Tizi Ouzou",
+            "Beni-Douala",
+            None,
+            "Beni Douala",
+            "36.61954",
+            "4.08282",
+            "1",
+            "Associated Press Online",
+            "A Berber student was shot while in police custody at a police station in "
+            "Beni Douala. He later died on Apr.21.",
+            "1",
+        ]
+
+        with Download() as downloader:
+            headers, iterator = downloader.get_tabular_rows(
+                fixtureurlexcel,
+                format="xlsx",
+                xlsx2csv=True,
+            )
+            assert headers == expected_headers
+            assert list(iterator)[0] == expected_row
+
+            headers, iterator = downloader.get_tabular_rows(
+                fixtureurlexcel,
+                format="xlsx",
+                xlsx2csv=True,
+                sheet="test_data",
+            )
+            assert headers == expected_headers
+            assert list(iterator)[0] == expected_row
+
     def test_get_tabular_rows_json(self, fixturejsonurl):
         with Download() as downloader:
             headers, iterator = downloader.get_tabular_rows(