diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index d4e8f04587fd..234021da7493 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -432,7 +432,7 @@ jobs: cfitsio freexl geotiff libjpeg-turbo libpq libspatialite libwebp-base pcre pcre2 postgresql \ sqlite tiledb zstd cryptopp cgal doxygen librttopo libkml openssl xz \ openjdk ant qhull armadillo blas blas-devel libblas libcblas liblapack liblapacke blosc libarchive \ - arrow-cpp pyarrow libaec libheif libavif cmake + arrow-cpp pyarrow libaec libheif libavif cmake fsspec - name: Check CMake version shell: bash -l {0} run: | diff --git a/.github/workflows/ubuntu_24.04/Dockerfile.ci b/.github/workflows/ubuntu_24.04/Dockerfile.ci index 9d4f4137480d..0c217be9b185 100644 --- a/.github/workflows/ubuntu_24.04/Dockerfile.ci +++ b/.github/workflows/ubuntu_24.04/Dockerfile.ci @@ -149,3 +149,5 @@ RUN python3 -m pip install -U --break-system-packages -r /tmp/requirements.txt # cfchecker requires udunits2 RUN apt-get install -y --allow-unauthenticated libudunits2-0 libudunits2-data RUN python3 -m pip install --break-system-packages cfchecker + +RUN python3 -m pip install --break-system-packages fsspec diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a0215f92638f..b7f88f67772b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,8 @@ repos: - id: black exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| autotest/ogr/data/ ) - repo: https://github.com/timothycrosley/isort @@ -14,7 +15,8 @@ repos: - id: isort exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| autotest/ogr/data/ ) - repo: https://github.com/pycqa/flake8 @@ -23,7 +25,8 @@ repos: - id: flake8 exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| examples/| autotest/ogr/data/ ) 
diff --git a/autotest/gcore/test_gdal_fsspec.py b/autotest/gcore/test_gdal_fsspec.py
new file mode 100644
index 000000000000..ac227a57db10
--- /dev/null
+++ b/autotest/gcore/test_gdal_fsspec.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env pytest
+# -*- coding: utf-8 -*-
+###############################################################################
+# Project: GDAL/OGR Test Suite
+# Purpose: Test gdal_fsspec module
+# Author: Even Rouault
+#
+###############################################################################
+# Copyright (c) 2024, Even Rouault
+#
+# SPDX-License-Identifier: MIT
+###############################################################################
+
+import pytest
+
+from osgeo import gdal
+
+fsspec = pytest.importorskip("fsspec")  # skip this module if fsspec is missing
+pytest.importorskip("fsspec.spec")
+
+from osgeo import gdal_fsspec  # NOQA
+
+
+def test_gdal_fsspec_open_read():
+
+    with fsspec.open("vsi://data/byte.tif") as f:
+        assert len(f.read()) == gdal.VSIStatL("data/byte.tif").size
+
+
+def test_gdal_fsspec_info_file():
+
+    fs = fsspec.filesystem("vsi")
+    info = fs.info("data/byte.tif")
+    assert "mtime" in info
+    del info["mtime"]  # value depends on the checkout, only check presence
+    assert (info["mode"] & 32768) != 0  # 0o100000 (S_IFREG): regular file
+    del info["mode"]
+    assert info == {
+        "name": "data/byte.tif",
+        "size": 736,
+        "type": "file",
+    }
+
+
+def test_gdal_fsspec_info_dir():
+
+    fs = fsspec.filesystem("vsi")
+    info = fs.info("data")
+    assert (info["mode"] & 16384) != 0  # 0o040000 (S_IFDIR): directory
+    del info["mode"]
+    assert info == {
+        "name": "data",
+        "size": 0,
+        "type": "directory",
+    }
+
+
+def test_gdal_fsspec_info_error():
+
+    fs = fsspec.filesystem("vsi")
+    with pytest.raises(FileNotFoundError):
+        fs.info("/i/do/not/exist")
+
+
+def test_gdal_fsspec_ls():
+
+    fs = fsspec.filesystem("vsi")
+    ret = fs.ls("data")
+    assert len(ret) > 2
+    item_of_interest = None
+    for item in ret:
+        if item["name"] == "data/byte.tif":
+            item_of_interest = item
+            break
+    assert item_of_interest
+    assert "mtime" in item_of_interest
+    del item_of_interest["mtime"]  # only check presence, not the value
+    assert item_of_interest == {
+        "name": "data/byte.tif",
+        "size": 736,
+        "type": "file",
+    }
+
+
+def test_gdal_fsspec_ls_file():
+
+    fs = fsspec.filesystem("vsi")
+    ret = fs.ls("data/byte.tif")
+    assert ret == ["data/byte.tif"]  # ls() on a file gives just its name
+
+
+def test_gdal_fsspec_ls_error():
+
+    fs = fsspec.filesystem("vsi")
+    with pytest.raises(FileNotFoundError):
+        fs.ls("vsi://i/do/not/exist")
+
+
+def test_gdal_fsspec_modified():
+
+    fs = fsspec.filesystem("vsi")
+    modified = fs.modified("data/byte.tif")
+    assert modified is not None
+    import datetime
+
+    assert isinstance(modified, datetime.datetime)
+
+
+def test_gdal_fsspec_modified_error():
+
+    fs = fsspec.filesystem("vsi")
+    with pytest.raises(FileNotFoundError):
+        fs.modified("vsi://i/do/not/exist")
+
+
+def test_gdal_fsspec_rm():
+
+    with fsspec.open("vsimem:///foo.bin", "wb") as f:
+        f.write(b"""bar""")
+    fs = fsspec.filesystem("vsimem")
+    fs.info("/foo.bin")  # must not raise: the file exists at this point
+    fs.rm("/foo.bin")
+    with pytest.raises(FileNotFoundError):
+        fs.info("/foo.bin")
+
+
+def test_gdal_fsspec_rm_error():
+
+    fs = fsspec.filesystem("vsimem")
+    with pytest.raises(FileNotFoundError):
+        fs.rm("/foo.bin")
+
+
+def test_gdal_fsspec_copy():
+
+    with fsspec.open("vsimem://foo.bin", "wb") as f:
+        f.write(b"""bar""")
+    fs = fsspec.filesystem("vsimem")
+    fs.copy("/foo.bin", "/bar.bin")
+    assert fs.info("/bar.bin")["size"] == 3
+    assert fs.info("/foo.bin")["size"] == 3  # the source is still there
+    fs.rm("/foo.bin")
+    fs.rm("/bar.bin")
+
+
+def test_gdal_fsspec_copy_error():
+
+    fs = fsspec.filesystem("vsimem")
+    with pytest.raises(FileNotFoundError):
+        fs.copy("/foo.bin", "/bar.bin")
+
+
+def test_gdal_fsspec_mv():
+
+    with fsspec.open("vsimem://foo.bin", "wb") as f:
+        f.write(b"""bar""")
+    fs = fsspec.filesystem("vsimem")
+    fs.mv("/foo.bin", "/bar.bin")
+    assert fs.info("/bar.bin")["size"] == 3
+    with pytest.raises(FileNotFoundError):  # the source is gone after a move
+        fs.info("/foo.bin")
+    fs.rm("/bar.bin")
+
+
+def test_gdal_fsspec_mv_error():
+
+    fs = fsspec.filesystem("vsimem")
+    with pytest.raises(FileNotFoundError):
+        fs.mv("/foo.bin", "/bar.bin")
+
+
+def test_gdal_fsspec_mkdir(tmp_path):
+
+    fs = fsspec.filesystem("vsi")
+
+    my_path = str(tmp_path) + "/my_dir"
+
+    fs.mkdir(my_path)
+    assert fs.info(my_path)["type"] == "directory"
+    with pytest.raises(FileExistsError):
+        fs.mkdir(my_path)
+    fs.rmdir(my_path)
+
+    fs.mkdir(my_path + "/my_subdir")  # default call also creates my_path
+    assert fs.info(my_path)["type"] == "directory"
+    assert fs.info(my_path + "/my_subdir")["type"] == "directory"
+    fs.rmdir(my_path + "/my_subdir")
+    fs.rmdir(my_path)
+    with pytest.raises(FileNotFoundError):
+        fs.info(my_path)
+
+    fs = fsspec.filesystem("vsi")
+    with pytest.raises(Exception):  # the parent directory no longer exists
+        fs.mkdir(my_path + "/my_subdir", create_parents=False)
+    with pytest.raises(FileNotFoundError):
+        fs.info(my_path)
+
+
+def test_gdal_fsspec_makedirs(tmp_path):
+
+    fs = fsspec.filesystem("vsi")
+
+    my_path = str(tmp_path) + "/my_dir"
+    fs.makedirs(my_path)
+    assert fs.info(my_path)["type"] == "directory"
+    with pytest.raises(FileExistsError):
+        fs.makedirs(my_path)
+    fs.makedirs(my_path, exist_ok=True)  # must not raise
+    fs.rmdir(my_path)
+
+
+def test_gdal_fsspec_usable_by_pyarrow_dataset(tmp_vsimem):
+
+    ds = pytest.importorskip("pyarrow.dataset")
+
+    tmp_vsimem_file = str(tmp_vsimem / "tmp.parquet")
+    with open("../ogr/data/parquet/test.parquet", "rb") as f:  # closes the handle
+        parquet_content = f.read()
+    gdal.FileFromMemBuffer(tmp_vsimem_file, parquet_content)
+
+    fs_vsimem = fsspec.filesystem("vsimem")
+
+    assert (
+        ds.dataset(tmp_vsimem_file[len("/vsimem") :], filesystem=fs_vsimem) is not None
+    )
+
+    assert (
+        ds.dataset(str(tmp_vsimem)[len("/vsimem") :], filesystem=fs_vsimem) is not None
+    )
diff --git a/doc/requirements.txt b/doc/requirements.txt index e4378d870530..34b6230cfd8d 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,5 +1,6 @@ # This file may be used to create an environment using: # $ pip install --upgrade -r +fsspec numpy sphinx breathe diff --git a/doc/source/api/python/general.rst b/doc/source/api/python/general.rst index 7c7cb58a19ab..a77c90f3844c 100644 --- a/doc/source/api/python/general.rst +++
b/doc/source/api/python/general.rst @@ -95,6 +95,26 @@ Error Handling File Management --------------- +osgeo.gdal_fsspec module +++++++++++++++++++++++++ + +.. automodule:: osgeo.gdal_fsspec + :members: + :undoc-members: + :show-inheritance: + :noindex: + +osgeo.gdal.VSIFile class +++++++++++++++++++++++++ + +.. autoclass:: osgeo.gdal.VSIFile + :members: + :undoc-members: + :noindex: + +Low level functions ++++++++++++++++++++ + .. autofunction:: osgeo.gdal.CloseDir .. autofunction:: osgeo.gdal.CopyFile diff --git a/doc/source/api/python/osgeo.gdal_fsspec.rst b/doc/source/api/python/osgeo.gdal_fsspec.rst new file mode 100644 index 000000000000..2b1bbe35b138 --- /dev/null +++ b/doc/source/api/python/osgeo.gdal_fsspec.rst @@ -0,0 +1,12 @@ +.. + The documentation displayed on this page is automatically generated from + Python docstrings. See https://gdal.org/development/dev_documentation.html + for information on updating this content. + +osgeo.gdal_fsspec module +======================== + +.. 
automodule:: osgeo.gdal_fsspec + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/python/osgeo.rst b/doc/source/api/python/osgeo.rst index d10dfdded4ba..5cf9bcd6e46a 100644 --- a/doc/source/api/python/osgeo.rst +++ b/doc/source/api/python/osgeo.rst @@ -12,6 +12,7 @@ Submodules osgeo.gdal osgeo.gdal_array + osgeo.gdal_fsspec osgeo.gdalconst osgeo.gnm osgeo.ogr diff --git a/swig/include/cpl.i b/swig/include/cpl.i index 0e8efca98794..e1890997737b 100644 --- a/swig/include/cpl.i +++ b/swig/include/cpl.i @@ -742,6 +742,14 @@ void CopyFileRestartable(const char* pszSource, } +%rename (MoveFile) wrapper_MoveFile; +%inline { +int wrapper_MoveFile(const char* pszSource, const char* pszTarget) +{ + return CPLMoveFile(pszTarget, pszSource); /* not swapped: CPLMoveFile() takes the new path first, then the old one */ +} +} + %clear (const char* pszSource); %clear (const char* pszTarget); diff --git a/swig/python/CMakeLists.txt b/swig/python/CMakeLists.txt index d694c71c208b..b0a12f19c704 100644 --- a/swig/python/CMakeLists.txt +++ b/swig/python/CMakeLists.txt @@ -106,6 +106,14 @@ set(GDAL_PYTHON_CSOURCES list(APPEND GDAL_PYTHON_PYSOURCES "${CMAKE_CURRENT_BINARY_DIR}/osgeo/__init__.py") endif() + if (NOT "${CMAKE_BINARY_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/osgeo/gdal_fsspec.py" "${CMAKE_CURRENT_BINARY_DIR}/osgeo" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/osgeo/gdal_fsspec.py") + list(APPEND GDAL_PYTHON_PYSOURCES "${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py") + endif() + foreach(_file IN ITEMS ${GDAL_PYTHON_CSOURCES}) add_custom_command( OUTPUT ${_file} @@ -544,6 +552,7 @@ elseif (Python_Development_FOUND) ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalconst.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalnumeric.py + ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gnm.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/ogr.py
${CMAKE_CURRENT_BINARY_DIR}/osgeo/osr.py @@ -554,6 +563,7 @@ elseif (Python_Development_FOUND) ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalconst.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalnumeric.py + ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gnm.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/ogr.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/osr.py
diff --git a/swig/python/osgeo/gdal_fsspec.py b/swig/python/osgeo/gdal_fsspec.py
new file mode 100644
index 000000000000..5debc910ec36
--- /dev/null
+++ b/swig/python/osgeo/gdal_fsspec.py
@@ -0,0 +1,288 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2024, Even Rouault
+
+"""Module exposing GDAL Virtual File Systems (VSI) as fsspec implementations.
+
+    Importing "osgeo.gdal_fsspec" requires the Python "fsspec"
+    (https://filesystem-spec.readthedocs.io/en/latest/) module to be available.
+
+    A generic "vsi" fsspec protocol is available. All GDAL VSI file names must be
+    simply prefixed with "vsi://". For example:
+
+    - "vsi://data/byte.tif" to access relative file "data/byte.tif"
+    - "vsi:///home/user/byte.tif" to access absolute file "/home/user/byte.tif"
+    - "vsi:///vsimem/byte.tif" (note the 3 slashes) to access VSIMem file "/vsimem/byte.tif"
+
+    Each VSI file system is also registered as a distinct fsspec protocol, such
+    as "vsimem", "vsicurl", "vsizip", "vsitar", etc.
+
+    Examples:
+
+    - "vsimem://byte.tif" to access file "/vsimem/byte.tif"
+    - "vsicurl://http://example.com/foo" to access file "/vsicurl/http://example.com/foo"
+    - "vsis3://my_bucket/byte.tif" to access file "/vsis3/my_bucket/byte.tif"
+    - "vsizip:///home/user/my.zip/foo.tif" (note the 3 slashes to indicate absolute path)
+      to access (absolute) file "/vsizip//home/user/my.zip/foo.tif"
+    - "vsizip://my.zip/foo.tif" to access (relative) file "/vsizip/my.zip/foo.tif"
+
+    :since: GDAL 3.11
+"""
+
+from pathlib import PurePath
+
+from fsspec.registry import register_implementation
+from fsspec.spec import AbstractFileSystem
+from fsspec.utils import stringify_path
+
+from osgeo import gdal
+
+
+class VSIFileSystem(AbstractFileSystem):
+    """Implementation of AbstractFileSystem for a GDAL Virtual File System"""
+
+    @classmethod
+    def _get_gdal_path(cls, path):
+        """Return a GDAL compatible file from a fsspec file name.
+
+        For the file system using the generic "vsi" protocol,
+        remove the leading vsi:// if found (normally, it should be there,
+        but most AbstractFileSystem implementations seem to be ready to remove
+        it if found)
+
+        For specialized file systems, like vsimem://, etc., for an input
+        like "vsimem:///foo", return "/vsimem/foo". And for an input like
+        "/foo" or "foo" also return "/vsimem/foo".
+        """
+
+        if isinstance(path, PurePath):
+            path = stringify_path(path)
+
+        if cls.protocol == "vsi":
+            # "vsi://something" just becomes "something"
+            if path.startswith("vsi://"):
+                return path[len("vsi://") :]
+
+            return path
+
+        else:
+            # Deal with paths like "vsis3://foo"
+            full_protocol = cls.protocol + "://"
+            if path.startswith(full_protocol):
+                path = path[len(full_protocol) :]
+
+            # Deal with paths like "foo" or "my.zip/foo.tif": the "/vsiXXX" prefix
+            # and the rest of the name must always be separated by a slash,
+            # whatever the protocol is ("vsizip://my.zip/foo.tif" must give
+            # "/vsizip/my.zip/foo.tif", not "/vsizipmy.zip/foo.tif")
+            if not path.startswith("/"):
+                path = "/" + path
+
+            return "/" + cls.protocol + path
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Implements AbstractFileSystem._open()"""
+
+        path = self._get_gdal_path(path)
+        return gdal.VSIFile(path, mode)
+
+    def info(self, path, **kwargs):
+        """Implements AbstractFileSystem.info()"""
+
+        gdal_path = self._get_gdal_path(path)
+        stat = gdal.VSIStatL(gdal_path)
+        if stat is None:
+            raise FileNotFoundError(path)
+        if stat.IsDirectory():
+            ret = {
+                "name": self._strip_protocol(path),
+                "size": 0,
+                "type": "directory",
+            }
+        else:
+            ret = {
+                "name": self._strip_protocol(path),
+                "size": stat.size,
+                "type": "file",
+                "mtime": stat.mtime,
+            }
+        if stat.mode:  # only reported when non-zero
+            ret["mode"] = stat.mode
+        return ret
+
+    def modified(self, path):
+        """Implements AbstractFileSystem.modified()"""
+
+        gdal_path = self._get_gdal_path(path)
+        stat = gdal.VSIStatL(gdal_path)
+        if stat is None:
+            raise FileNotFoundError(path)
+        import datetime
+
+        return datetime.datetime.fromtimestamp(stat.mtime)
+
+    def ls(self, path, detail=True, **kwargs):
+        """Implements AbstractFileSystem.ls()
+
+        If detail is True (the default), return a list of dictionaries with
+        "name" and "type" keys, plus "size" for files and "mtime" when known.
+        If detail is False, return only the list of the entry names.
+
+        If path cannot be opened as a directory but exists, a list with only
+        its name is returned, whatever the value of detail.
+        """
+
+        fs_path = self._strip_protocol(path)
+        gdal_path = self._get_gdal_path(path)
+        ret = []
+        directory = gdal.OpenDir(gdal_path)
+        if directory is None:
+            stat = gdal.VSIStatL(gdal_path)
+            if stat is None:
+                raise FileNotFoundError(path)
+            return [fs_path]
+
+        try:
+            while True:
+                entry = gdal.GetNextDirEntry(directory)
+                if entry is None:
+                    break
+
+                ret_entry = {
+                    "name": fs_path + "/" + entry.name,
+                    "type": (
+                        "file"
+                        if (entry.mode & 32768) != 0  # 0o100000 (S_IFREG)
+                        else "directory"
+                        if (entry.mode & 16384) != 0  # 0o040000 (S_IFDIR)
+                        else None
+                    ),
+                }
+                if ret_entry["type"] == "file":
+                    ret_entry["size"] = entry.size if entry.sizeKnown else None
+                if entry.mtimeKnown:
+                    ret_entry["mtime"] = entry.mtime
+                ret.append(ret_entry)
+        finally:
+            gdal.CloseDir(directory)
+        if not detail:
+            return [item["name"] for item in ret]
+        return ret
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """Implements AbstractFileSystem.mkdir()"""
+
+        # Base fs.makedirs() may call us with "/vsimem"
+        prefixes = gdal.GetFileSystemsPrefixes()
+        if path + "/" in prefixes or path in prefixes:
+            return
+
+        gdal_path = self._get_gdal_path(path)
+        if gdal.VSIStatL(gdal_path):
+            raise FileExistsError(path)
+        if create_parents:
+            ret = gdal.MkdirRecursive(gdal_path, 0o755)
+        else:
+            ret = gdal.Mkdir(gdal_path, 0o755)
+        if ret != 0:
+            raise IOError(path)
+
+    def makedirs(self, path, exist_ok=False):
+        """Implements AbstractFileSystem.makedirs()"""
+
+        gdal_path = self._get_gdal_path(path)
+        if gdal.VSIStatL(gdal_path):
+            if not exist_ok:
+                raise FileExistsError(path)
+            return
+
+        self.mkdir(path, create_parents=True)
+
+    def _rm(self, path):
+        """Implements AbstractFileSystem._rm()"""
+
+        gdal_path = self._get_gdal_path(path)
+        ret = -1  # stays -1 (failure) if Unlink() raises
+        try:
+            ret = gdal.Unlink(gdal_path)
+        except Exception:
+            pass
+        if ret != 0:
+            if gdal.VSIStatL(gdal_path) is None:
+                raise FileNotFoundError(path)
+            raise IOError(path)
+
+    def rmdir(self, path):
+        """Implements AbstractFileSystem.rmdir()"""
+
+        gdal_path = self._get_gdal_path(path)
+        ret = -1  # stays -1 (failure) if Rmdir() raises
+        try:
+            ret = gdal.Rmdir(gdal_path)
+        except Exception:
+            pass
+        if ret != 0:
+            if gdal.VSIStatL(gdal_path) is None:
+                raise FileNotFoundError(path)
+            raise IOError(path)
+
+    def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
+        """Implements AbstractFileSystem.mv()"""
+
+        old_path = self._get_gdal_path(path1)
+        new_path = self._get_gdal_path(path2)
+        try:
+            if gdal.MoveFile(old_path, new_path) != 0:
+                if gdal.VSIStatL(old_path) is None:
+                    raise FileNotFoundError(path1)
+                raise IOError(f"Cannot move from {path1} to {path2}")
+        except Exception:
+            if gdal.VSIStatL(old_path) is None:
+                raise FileNotFoundError(path1)
+            raise
+
+    def copy(
+        self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
+    ):
+        """Implements AbstractFileSystem.copy()"""
+
+        old_path = self._get_gdal_path(path1)
+        new_path = self._get_gdal_path(path2)
+        try:
+            if gdal.CopyFile(old_path, new_path) != 0:
+                if gdal.VSIStatL(old_path) is None:
+                    raise FileNotFoundError(path1)
+                raise IOError(f"Cannot copy from {path1} to {path2}")
+        except Exception:
+            if gdal.VSIStatL(old_path) is None:
+                raise FileNotFoundError(path1)
+            raise
+
+
+def register_vsi_implementations():
+    """Register a generic "vsi" protocol and "vsimem", "vsitar", etc.
+    This method is automatically called on osgeo.gdal_fsspec import.
+    """
+    register_implementation("vsi", VSIFileSystem)
+    for vsi_prefix in gdal.GetFileSystemsPrefixes():
+        if vsi_prefix.startswith("/vsi") and not vsi_prefix.endswith("?"):
+            assert vsi_prefix.endswith("/")
+            protocol = vsi_prefix[1:-1]
+            # We need to duplicate the base class for each protocol, so that
+            # each class has a distinct "protocol" member.
+            new_class = type(
+                "VSIFileSystem_" + protocol,
+                VSIFileSystem.__bases__,
+                dict(VSIFileSystem.__dict__),
+            )
+            register_implementation(protocol, new_class)
+
+
+register_vsi_implementations()