From 0dede5a742530db7905bfb23090c00cbfaddc40e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 9 Oct 2024 23:09:43 +0200 Subject: [PATCH 1/3] Python bindings; Add osgeo.gdal.VSIFile class (originating from autotest) --- autotest/pymod/gdaltest.py | 81 +------------------------ doc/source/spelling_wordlist.txt | 1 + swig/include/python/gdal_python.i | 99 +++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 80 deletions(-) diff --git a/autotest/pymod/gdaltest.py b/autotest/pymod/gdaltest.py index 417a0768181e..e02254740f9d 100755 --- a/autotest/pymod/gdaltest.py +++ b/autotest/pymod/gdaltest.py @@ -2102,87 +2102,8 @@ def reopen(ds, update=False, open_options=None): ) -# VSIFile helper class - - -class VSIFile: - def __init__(self, path, mode, encoding="utf-8"): - self._path = path - self._mode = mode - - self._binary = "b" in mode - self._encoding = encoding - - self._fp = gdal.VSIFOpenExL(self._path, self._mode, True) - if self._fp is None: - raise OSError(gdal.VSIGetLastErrorMsg()) - - self._closed = False - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def __iter__(self): - return self - - def __next__(self): - line = gdal.CPLReadLineL(self._fp) - if line is None: - raise StopIteration - if self._binary: - return line.encode() - return line - - def close(self): - if self._closed: - return - - self._closed = True - gdal.VSIFCloseL(self._fp) - - def read(self, size=-1): - if size == -1: - pos = self.tell() - self.seek(0, 2) - size = self.tell() - self.seek(pos) - - raw = gdal.VSIFReadL(1, size, self._fp) - - if self._binary: - return bytes(raw) - else: - return raw.decode(self._encoding) - - def write(self, x): - - if self._binary: - assert type(x) in (bytes, bytearray, memoryview) - else: - assert type(x) is str - x = x.encode(self._encoding) - - planned_write = len(x) - actual_write = gdal.VSIFWriteL(x, 1, planned_write, self._fp) - - if planned_write != actual_write: - raise OSError( - f"Expected to 
write {planned_write} bytes but {actual_write} were written" - ) - - def seek(self, offset, whence=0): - if gdal.VSIFSeekL(self._fp, offset, whence) != 0: - raise OSError(gdal.VSIGetLastErrorMsg()) - - def tell(self): - return gdal.VSIFTellL(self._fp) - - def vsi_open(path, mode="r"): - return VSIFile(path, mode) + return gdal.VSIFile(path, mode) def vrt_has_open_support(): diff --git a/doc/source/spelling_wordlist.txt b/doc/source/spelling_wordlist.txt index 77d3e4e14e6c..4c4ea2d861af 100644 --- a/doc/source/spelling_wordlist.txt +++ b/doc/source/spelling_wordlist.txt @@ -3455,6 +3455,7 @@ vsiaz vsicached vsicrypt vsicurl +VSIFile VSIFOpen vsigs vsigz diff --git a/swig/include/python/gdal_python.i b/swig/include/python/gdal_python.i index 902aa5d6bf50..d6354b63a47e 100644 --- a/swig/include/python/gdal_python.i +++ b/swig/include/python/gdal_python.i @@ -5012,3 +5012,102 @@ def InterpolateAtPoint(self, *args, **kwargs): else: return ret[1] %} + +%pythoncode %{ + +# VSIFile: Copyright (c) 2024, Dan Baston + +from io import BytesIO + +class VSIFile(BytesIO): + """Class wrapping a GDAL VSILFILE instance as a Python BytesIO instance + + :since: GDAL 3.11 + """ + + def __init__(self, path, mode, encoding="utf-8"): + self._path = path + self._mode = mode + + self._binary = "b" in mode + self._encoding = encoding + + self._fp = VSIFOpenExL(self._path, self._mode, True) + if self._fp is None: + raise OSError(VSIGetLastErrorMsg()) + + self._closed = False + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __iter__(self): + return self + + def __next__(self): + line = CPLReadLineL(self._fp) + if line is None: + raise StopIteration + if self._binary: + return line.encode() + return line + + def close(self): + if self._closed: + return + + self._closed = True + VSIFCloseL(self._fp) + + def read(self, size=-1): + if size == -1: + pos = self.tell() + self.seek(0, 2) + size = self.tell() + self.seek(pos) + + raw = VSIFReadL(1, size, 
self._fp) + + if self._binary: + return bytes(raw) + else: + return raw.decode(self._encoding) + + def write(self, x): + + if self._binary: + assert type(x) in (bytes, bytearray, memoryview) + else: + assert type(x) is str + x = x.encode(self._encoding) + + planned_write = len(x) + actual_write = VSIFWriteL(x, 1, planned_write, self._fp) + + if planned_write != actual_write: + raise OSError( + f"Expected to write {planned_write} bytes but {actual_write} were written" + ) + + def seek(self, offset, whence=0): + # We redefine the docstring since otherwise breathe would complain on the one coming from BytesIO.seek() + """Change stream position. + + Seek to byte offset pos relative to position indicated by whence: + + - 0: Start of stream (the default). pos should be >= 0; + - 1: Current position - pos may be negative; + - 2: End of stream - pos usually negative. + + Returns the new absolute position. + """ + + if VSIFSeekL(self._fp, offset, whence) != 0: + raise OSError(VSIGetLastErrorMsg()) + + def tell(self): + return VSIFTellL(self._fp) +%} From 2210300b4a11eaf99edfce0fc6017661d6bc61d3 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 9 Oct 2024 23:11:26 +0200 Subject: [PATCH 2/3] Python bindings: add a osgeo.gdal_fsspec module that on import will register GDAL VSI file system handlers as fsspec AbstractFileSystem --- .github/workflows/cmake_builds.yml | 2 +- .github/workflows/ubuntu_24.04/Dockerfile.ci | 2 + .pre-commit-config.yaml | 9 +- autotest/gcore/test_gdal_fsspec.py | 229 +++++++++++++++ doc/requirements.txt | 1 + doc/source/api/python/general.rst | 20 ++ doc/source/api/python/osgeo.gdal_fsspec.rst | 12 + doc/source/api/python/osgeo.rst | 1 + swig/include/cpl.i | 8 + swig/python/CMakeLists.txt | 10 + swig/python/osgeo/gdal_fsspec.py | 292 +++++++++++++++++++ 11 files changed, 582 insertions(+), 4 deletions(-) create mode 100644 autotest/gcore/test_gdal_fsspec.py create mode 100644 doc/source/api/python/osgeo.gdal_fsspec.rst create mode 100644 
swig/python/osgeo/gdal_fsspec.py diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index d4e8f04587fd..234021da7493 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -432,7 +432,7 @@ jobs: cfitsio freexl geotiff libjpeg-turbo libpq libspatialite libwebp-base pcre pcre2 postgresql \ sqlite tiledb zstd cryptopp cgal doxygen librttopo libkml openssl xz \ openjdk ant qhull armadillo blas blas-devel libblas libcblas liblapack liblapacke blosc libarchive \ - arrow-cpp pyarrow libaec libheif libavif cmake + arrow-cpp pyarrow libaec libheif libavif cmake fsspec - name: Check CMake version shell: bash -l {0} run: | diff --git a/.github/workflows/ubuntu_24.04/Dockerfile.ci b/.github/workflows/ubuntu_24.04/Dockerfile.ci index 9d4f4137480d..0c217be9b185 100644 --- a/.github/workflows/ubuntu_24.04/Dockerfile.ci +++ b/.github/workflows/ubuntu_24.04/Dockerfile.ci @@ -149,3 +149,5 @@ RUN python3 -m pip install -U --break-system-packages -r /tmp/requirements.txt # cfchecker requires udunits2 RUN apt-get install -y --allow-unauthenticated libudunits2-0 libudunits2-data RUN python3 -m pip install --break-system-packages cfchecker + +RUN python3 -m pip install --break-system-packages fsspec diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a0215f92638f..b7f88f67772b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,8 @@ repos: - id: black exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| autotest/ogr/data/ ) - repo: https://github.com/timothycrosley/isort @@ -14,7 +15,8 @@ repos: - id: isort exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| autotest/ogr/data/ ) - repo: https://github.com/pycqa/flake8 @@ -23,7 +25,8 @@ repos: - id: flake8 exclude: > (?x)^( - swig/python/osgeo/| + swig/python/osgeo/__init__.py| + swig/python/osgeo/gdalnumeric.py| 
examples/| autotest/ogr/data/ ) diff --git a/autotest/gcore/test_gdal_fsspec.py b/autotest/gcore/test_gdal_fsspec.py new file mode 100644 index 000000000000..ac227a57db10 --- /dev/null +++ b/autotest/gcore/test_gdal_fsspec.py @@ -0,0 +1,229 @@ +#!/usr/bin/env pytest +# -*- coding: utf-8 -*- +############################################################################### +# Project: GDAL/OGR Test Suite +# Purpose: Test gdal_fsspec module +# Author: Even Rouault +# +############################################################################### +# Copyright (c) 2024, Even Rouault +# +# SPDX-License-Identifier: MIT +############################################################################### + +import pytest + +from osgeo import gdal + +fsspec = pytest.importorskip("fsspec") +pytest.importorskip("fsspec.spec") + +from osgeo import gdal_fsspec # NOQA + + +def test_gdal_fsspec_open_read(): + + with fsspec.open("vsi://data/byte.tif") as f: + assert len(f.read()) == gdal.VSIStatL("data/byte.tif").size + + +def test_gdal_fsspec_info_file(): + + fs = fsspec.filesystem("vsi") + info = fs.info("data/byte.tif") + assert "mtime" in info + del info["mtime"] + assert (info["mode"] & 32768) != 0 + del info["mode"] + assert info == { + "name": "data/byte.tif", + "size": 736, + "type": "file", + } + + +def test_gdal_fsspec_info_dir(): + + fs = fsspec.filesystem("vsi") + info = fs.info("data") + assert (info["mode"] & 16384) != 0 + del info["mode"] + assert info == { + "name": "data", + "size": 0, + "type": "directory", + } + + +def test_gdal_fsspec_info_error(): + + fs = fsspec.filesystem("vsi") + with pytest.raises(FileNotFoundError): + fs.info("/i/do/not/exist") + + +def test_gdal_fsspec_ls(): + + fs = fsspec.filesystem("vsi") + ret = fs.ls("data") + assert len(ret) > 2 + item_of_interest = None + for item in ret: + if item["name"] == "data/byte.tif": + item_of_interest = item + break + assert item_of_interest + assert "mtime" in item_of_interest + del 
item_of_interest["mtime"] + assert item_of_interest == { + "name": "data/byte.tif", + "size": 736, + "type": "file", + } + + +def test_gdal_fsspec_ls_file(): + + fs = fsspec.filesystem("vsi") + ret = fs.ls("data/byte.tif") + assert ret == ["data/byte.tif"] + + +def test_gdal_fsspec_ls_error(): + + fs = fsspec.filesystem("vsi") + with pytest.raises(FileNotFoundError): + fs.ls("vsi://i/do/not/exist") + + +def test_gdal_fsspec_modified(): + + fs = fsspec.filesystem("vsi") + modified = fs.modified("data/byte.tif") + assert modified is not None + import datetime + + assert isinstance(modified, datetime.datetime) + + +def test_gdal_fsspec_modified_error(): + + fs = fsspec.filesystem("vsi") + with pytest.raises(FileNotFoundError): + fs.modified("vsi://i/do/not/exist") + + +def test_gdal_fsspec_rm(): + + with fsspec.open("vsimem:///foo.bin", "wb") as f: + f.write(b"""bar""") + fs = fsspec.filesystem("vsimem") + fs.info("/foo.bin") + fs.rm("/foo.bin") + with pytest.raises(FileNotFoundError): + fs.info("/foo.bin") + + +def test_gdal_fsspec_rm_error(): + + fs = fsspec.filesystem("vsimem") + with pytest.raises(FileNotFoundError): + fs.rm("/foo.bin") + + +def test_gdal_fsspec_copy(): + + with fsspec.open("vsimem://foo.bin", "wb") as f: + f.write(b"""bar""") + fs = fsspec.filesystem("vsimem") + fs.copy("/foo.bin", "/bar.bin") + assert fs.info("/bar.bin")["size"] == 3 + assert fs.info("/foo.bin")["size"] == 3 + fs.rm("/foo.bin") + fs.rm("/bar.bin") + + +def test_gdal_fsspec_copy_error(): + + fs = fsspec.filesystem("vsimem") + with pytest.raises(FileNotFoundError): + fs.copy("/foo.bin", "/bar.bin") + + +def test_gdal_fsspec_mv(): + + with fsspec.open("vsimem://foo.bin", "wb") as f: + f.write(b"""bar""") + fs = fsspec.filesystem("vsimem") + fs.mv("/foo.bin", "/bar.bin") + assert fs.info("/bar.bin")["size"] == 3 + with pytest.raises(FileNotFoundError): + fs.info("/foo.bin") + fs.rm("/bar.bin") + + +def test_gdal_fsspec_mv_error(): + + fs = fsspec.filesystem("vsimem") + with 
pytest.raises(FileNotFoundError): + fs.mv("/foo.bin", "/bar.bin") + + +def test_gdal_fsspec_mkdir(tmp_path): + + fs = fsspec.filesystem("vsi") + + my_path = str(tmp_path) + "/my_dir" + + fs.mkdir(my_path) + assert fs.info(my_path)["type"] == "directory" + with pytest.raises(FileExistsError): + fs.mkdir(my_path) + fs.rmdir(my_path) + + fs.mkdir(my_path + "/my_subdir") + assert fs.info(my_path)["type"] == "directory" + assert fs.info(my_path + "/my_subdir")["type"] == "directory" + fs.rmdir(my_path + "/my_subdir") + fs.rmdir(my_path) + with pytest.raises(FileNotFoundError): + fs.info(my_path) + + fs = fsspec.filesystem("vsi") + with pytest.raises(Exception): + fs.mkdir(my_path + "/my_subdir", create_parents=False) + with pytest.raises(FileNotFoundError): + fs.info(my_path) + + +def test_gdal_fsspec_makedirs(tmp_path): + + fs = fsspec.filesystem("vsi") + + my_path = str(tmp_path) + "/my_dir" + fs.makedirs(my_path) + assert fs.info(my_path)["type"] == "directory" + with pytest.raises(FileExistsError): + fs.makedirs(my_path) + fs.makedirs(my_path, exist_ok=True) + fs.rmdir(my_path) + + +def test_gdal_fsspec_usable_by_pyarrow_dataset(tmp_vsimem): + + ds = pytest.importorskip("pyarrow.dataset") + + tmp_vsimem_file = str(tmp_vsimem / "tmp.parquet") + gdal.FileFromMemBuffer( + tmp_vsimem_file, open("../ogr/data/parquet/test.parquet", "rb").read() + ) + + fs_vsimem = fsspec.filesystem("vsimem") + + assert ( + ds.dataset(tmp_vsimem_file[len("/vsimem") :], filesystem=fs_vsimem) is not None + ) + + assert ( + ds.dataset(str(tmp_vsimem)[len("/vsimem") :], filesystem=fs_vsimem) is not None + ) diff --git a/doc/requirements.txt b/doc/requirements.txt index e4378d870530..34b6230cfd8d 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,5 +1,6 @@ # This file may be used to create an environment using: # $ pip install --upgrade -r +fsspec numpy sphinx breathe diff --git a/doc/source/api/python/general.rst b/doc/source/api/python/general.rst index 
7c7cb58a19ab..a77c90f3844c 100644 --- a/doc/source/api/python/general.rst +++ b/doc/source/api/python/general.rst @@ -95,6 +95,26 @@ Error Handling File Management --------------- +osgeo.gdal_fsspec module +++++++++++++++++++++++++ + +.. automodule:: osgeo.gdal_fsspec + :members: + :undoc-members: + :show-inheritance: + :noindex: + +osgeo.gdal.VSIFile class +++++++++++++++++++++++++ + +.. autoclass:: osgeo.gdal.VSIFile + :members: + :undoc-members: + :noindex: + +Low level functions ++++++++++++++++++++ + .. autofunction:: osgeo.gdal.CloseDir .. autofunction:: osgeo.gdal.CopyFile diff --git a/doc/source/api/python/osgeo.gdal_fsspec.rst b/doc/source/api/python/osgeo.gdal_fsspec.rst new file mode 100644 index 000000000000..2b1bbe35b138 --- /dev/null +++ b/doc/source/api/python/osgeo.gdal_fsspec.rst @@ -0,0 +1,12 @@ +.. + The documentation displayed on this page is automatically generated from + Python docstrings. See https://gdal.org/development/dev_documentation.html + for information on updating this content. + +osgeo.gdal_fsspec module +======================== + +.. 
automodule:: osgeo.gdal_fsspec + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/python/osgeo.rst b/doc/source/api/python/osgeo.rst index d10dfdded4ba..5cf9bcd6e46a 100644 --- a/doc/source/api/python/osgeo.rst +++ b/doc/source/api/python/osgeo.rst @@ -12,6 +12,7 @@ Submodules osgeo.gdal osgeo.gdal_array + osgeo.gdal_fsspec osgeo.gdalconst osgeo.gnm osgeo.ogr diff --git a/swig/include/cpl.i b/swig/include/cpl.i index 0e8efca98794..e1890997737b 100644 --- a/swig/include/cpl.i +++ b/swig/include/cpl.i @@ -742,6 +742,14 @@ void CopyFileRestartable(const char* pszSource, } +%rename (MoveFile) wrapper_MoveFile; +%inline { +int wrapper_MoveFile(const char* pszSource, const char* pszTarget) +{ + return CPLMoveFile(pszTarget, pszSource); +} +} + %clear (const char* pszSource); %clear (const char* pszTarget); diff --git a/swig/python/CMakeLists.txt b/swig/python/CMakeLists.txt index d694c71c208b..b0a12f19c704 100644 --- a/swig/python/CMakeLists.txt +++ b/swig/python/CMakeLists.txt @@ -106,6 +106,14 @@ set(GDAL_PYTHON_CSOURCES list(APPEND GDAL_PYTHON_PYSOURCES "${CMAKE_CURRENT_BINARY_DIR}/osgeo/__init__.py") endif() + if (NOT "${CMAKE_BINARY_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/osgeo/gdal_fsspec.py" "${CMAKE_CURRENT_BINARY_DIR}/osgeo" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/osgeo/gdal_fsspec.py") + list(APPEND GDAL_PYTHON_PYSOURCES "${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py") + endif() + foreach(_file IN ITEMS ${GDAL_PYTHON_CSOURCES}) add_custom_command( OUTPUT ${_file} @@ -544,6 +552,7 @@ elseif (Python_Development_FOUND) ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalconst.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalnumeric.py + ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gnm.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/ogr.py 
${CMAKE_CURRENT_BINARY_DIR}/osgeo/osr.py @@ -554,6 +563,7 @@ elseif (Python_Development_FOUND) ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalconst.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdalnumeric.py + ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gdal_fsspec.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/gnm.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/ogr.py ${CMAKE_CURRENT_BINARY_DIR}/osgeo/osr.py diff --git a/swig/python/osgeo/gdal_fsspec.py b/swig/python/osgeo/gdal_fsspec.py new file mode 100644 index 000000000000..5debc910ec36 --- /dev/null +++ b/swig/python/osgeo/gdal_fsspec.py @@ -0,0 +1,292 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024, Even Rouault + +"""Module exposing GDAL Virtual File Systems (VSI) as fsspec implementations. + + Importing "osgeo.gdal_fsspec" requires the Python "fsspec" + (https://filesystem-spec.readthedocs.io/en/latest/) module to be available. + + A generic "vsi" fsspec protocol is available. All GDAL VSI file names must be + simply prefixed with "vsi://". For example: + + - "vsi://data/byte.tif" to access relative file "data/byte.tif" + - "vsi:///home/user/byte.tif" to access absolute file "/home/user/byte.tif" + - "vsi:///vsimem/byte.tif" (note the 3 slashes) to access VSIMem file "/vsimem/byte.tif" + + Each VSI file system is also registered as a distinct fsspec protocol, such + as "vsimem", "vsicurl", "vsizip", "vsitar", etc. 
+ + Examples: + + - "vsimem://byte.tif" to access file "/vsimem/byte.tif" + - "vsicurl://http://example.com/foo" to access file "/vsicurl/http://example.com/foo" + - "vsis3://my_bucket/byte.tif" to access file "/vsis3/my_bucket/byte.tif" + - "vsizip:///home/user/my.zip/foo.tif" (note the 3 slashes to indicate absolute path) + to access (absolute) file "/vsizip//home/user/my.zip/foo.tif" + - "vsizip://my.zip/foo.tif" to access (relative) file "/vsizip/my.zip/foo.tif" + + :since: GDAL 3.11 +""" + +from pathlib import PurePath + +from fsspec.registry import register_implementation +from fsspec.spec import AbstractFileSystem +from fsspec.utils import stringify_path + +from osgeo import gdal + + +class VSIFileSystem(AbstractFileSystem): + """Implementation of AbstractFileSystem for a GDAL Virtual File System""" + + @classmethod + def _get_gdal_path(cls, path): + """Return a GDAL compatible file from a fsspec file name. + + For the file system using the generic "vsi" protocol, + remove the leading vsi:// if found (normally, it should be there, + but most AbstractFileSystem implementations seem to be ready to remove + it if found) + + For specialized file systems, like vsimem://, etc., for an input + like "vsimem:///foo", return "/vsimem/foo". And for an input like + "/foo" also return "/vsimem/foo". 
+ """ + + if isinstance(path, PurePath): + path = stringify_path(path) + + if cls.protocol == "vsi": + # "vsi://something" just becomes "something" + if path.startswith("vsi://"): + return path[len("vsi://") :] + + return path + + else: + list_protocols_that_need_leeding_slash = [ + "vsis3", + "vsigs", + "vsiaz", + "vsioss", + "vsiswift", + ] + list_protocols_that_need_leeding_slash += [ + item + "_streaming" for item in list_protocols_that_need_leeding_slash + ] + list_protocols_that_need_leeding_slash.append("vsimem") + + # Deal with paths like "vsis3://foo" + full_protocol = cls.protocol + "://" + if path.startswith(full_protocol): + path = path[len(full_protocol) :] + + # Deal with paths like "/foo" with a VSIFileSystem that is something like "vsis3" + if ( + cls.protocol in list_protocols_that_need_leeding_slash + and not path.startswith("/") + ): + path = "/" + path + + return "/" + cls.protocol + path + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + """Implements AbstractFileSystem._open()""" + + path = self._get_gdal_path(path) + return gdal.VSIFile(path, mode) + + def info(self, path, **kwargs): + """Implements AbstractFileSystem.info()""" + + gdal_path = self._get_gdal_path(path) + stat = gdal.VSIStatL(gdal_path) + if stat is None: + raise FileNotFoundError(path) + if stat.IsDirectory(): + ret = { + "name": self._strip_protocol(path), + "size": 0, + "type": "directory", + } + else: + ret = { + "name": self._strip_protocol(path), + "size": stat.size, + "type": "file", + "mtime": stat.mtime, + } + if stat.mode: + ret["mode"] = stat.mode + return ret + + def modified(self, path): + """Implements AbstractFileSystem.modified()""" + + gdal_path = self._get_gdal_path(path) + stat = gdal.VSIStatL(gdal_path) + if stat is None: + raise FileNotFoundError(path) + import datetime + + return datetime.datetime.fromtimestamp(stat.mtime) + + def ls(self, path, detail=True, **kwargs): + """Implements 
AbstractFileSystem.ls()""" + + fs_path = self._strip_protocol(path) + gdal_path = self._get_gdal_path(path) + ret = [] + directory = gdal.OpenDir(gdal_path) + if directory is None: + stat = gdal.VSIStatL(gdal_path) + if stat is None: + raise FileNotFoundError(path) + return [fs_path] + + try: + while True: + entry = gdal.GetNextDirEntry(directory) + if entry is None: + break + + ret_entry = { + "name": fs_path + "/" + entry.name, + "type": ( + "file" + if (entry.mode & 32768) != 0 + else "directory" + if (entry.mode & 16384) != 0 + else None + ), + } + if ret_entry["type"] == "file": + ret_entry["size"] = entry.size if entry.sizeKnown else None + if entry.mtimeKnown: + ret_entry["mtime"] = entry.mtime + ret.append(ret_entry) + finally: + gdal.CloseDir(directory) + return ret + + def mkdir(self, path, create_parents=True, **kwargs): + """Implements AbstractFileSystem.mkdir()""" + + # Base fs.makedirs() may call us with "/vsimem" + if ( + path + "/" in gdal.GetFileSystemsPrefixes() + or path in gdal.GetFileSystemsPrefixes() + ): + return + + gdal_path = self._get_gdal_path(path) + if gdal.VSIStatL(gdal_path): + raise FileExistsError(path) + if create_parents: + ret = gdal.MkdirRecursive(gdal_path, 0o755) + else: + ret = gdal.Mkdir(gdal_path, 0o755) + if ret != 0: + raise IOError(path) + + def makedirs(self, path, exist_ok=False): + """Implements AbstractFileSystem.makedirs()""" + + gdal_path = self._get_gdal_path(path) + if gdal.VSIStatL(gdal_path): + if not exist_ok: + raise FileExistsError(path) + return + + self.mkdir(path, create_parents=True) + + def _rm(self, path): + """Implements AbstractFileSystem._rm()""" + + gdal_path = self._get_gdal_path(path) + ret = -1 + try: + ret = gdal.Unlink(gdal_path) + except Exception: + pass + if ret != 0: + if gdal.VSIStatL(gdal_path) is None: + raise FileNotFoundError(path) + raise IOError(path) + + def rmdir(self, path): + """Implements AbstractFileSystem.rmdir()""" + + gdal_path = self._get_gdal_path(path) + ret = -1 + try: 
+ ret = gdal.Rmdir(gdal_path) + except Exception: + pass + if ret != 0: + if gdal.VSIStatL(gdal_path) is None: + raise FileNotFoundError(path) + raise IOError(path) + + def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs): + """Implements AbstractFileSystem.mv()""" + + old_path = self._get_gdal_path(path1) + new_path = self._get_gdal_path(path2) + try: + if gdal.MoveFile(old_path, new_path) != 0: + if gdal.VSIStatL(old_path) is None: + raise FileNotFoundError(path1) + raise IOError(f"Cannot move from {path1} to {path2}") + except Exception: + if gdal.VSIStatL(old_path) is None: + raise FileNotFoundError(path1) + raise + + def copy( + self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs + ): + """Implements AbstractFileSystem.copy()""" + + old_path = self._get_gdal_path(path1) + new_path = self._get_gdal_path(path2) + try: + if gdal.CopyFile(old_path, new_path) != 0: + if gdal.VSIStatL(old_path) is None: + raise FileNotFoundError(path1) + raise IOError(f"Cannot copy from {path1} to {path2}") + except Exception: + if gdal.VSIStatL(old_path) is None: + raise FileNotFoundError(path1) + raise + + +def register_vsi_implementations(): + """Register a generic "vsi" protocol and "vsimem", "vsitar", etc. + This method is automatically called on osgeo.gdal_fsspec import. + """ + register_implementation("vsi", VSIFileSystem) + for vsi_prefix in gdal.GetFileSystemsPrefixes(): + if vsi_prefix.startswith("/vsi") and not vsi_prefix.endswith("?"): + assert vsi_prefix.endswith("/") + protocol = vsi_prefix[1:-1] + # We need to duplicate the base class for each protocol, so that + # each class has a distinct "protocol" member. 
+ new_class = type( + "VSIFileSystem_" + protocol, + VSIFileSystem.__bases__, + dict(VSIFileSystem.__dict__), + ) + register_implementation(protocol, new_class) + + +register_vsi_implementations() From 0b07ea7db3c95b57e779a4ef0c59d0b08bf9f552 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 12 Oct 2024 15:10:11 +0200 Subject: [PATCH 3/3] Python bindings: simplify previous commit to have a single 'gdalvsi' protocol --- autotest/gcore/test_gdal_fsspec.py | 86 ++++++++++++++---------------- doc/source/spelling_wordlist.txt | 1 + swig/python/osgeo/gdal_fsspec.py | 84 +++++------------------------ 3 files changed, 56 insertions(+), 115 deletions(-) diff --git a/autotest/gcore/test_gdal_fsspec.py b/autotest/gcore/test_gdal_fsspec.py index ac227a57db10..38deb8da07be 100644 --- a/autotest/gcore/test_gdal_fsspec.py +++ b/autotest/gcore/test_gdal_fsspec.py @@ -23,13 +23,13 @@ def test_gdal_fsspec_open_read(): - with fsspec.open("vsi://data/byte.tif") as f: + with fsspec.open("gdalvsi://data/byte.tif") as f: assert len(f.read()) == gdal.VSIStatL("data/byte.tif").size def test_gdal_fsspec_info_file(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") info = fs.info("data/byte.tif") assert "mtime" in info del info["mtime"] @@ -44,7 +44,7 @@ def test_gdal_fsspec_info_file(): def test_gdal_fsspec_info_dir(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") info = fs.info("data") assert (info["mode"] & 16384) != 0 del info["mode"] @@ -57,14 +57,14 @@ def test_gdal_fsspec_info_dir(): def test_gdal_fsspec_info_error(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): fs.info("/i/do/not/exist") def test_gdal_fsspec_ls(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") ret = fs.ls("data") assert len(ret) > 2 item_of_interest = None @@ -84,21 +84,21 @@ def test_gdal_fsspec_ls(): def test_gdal_fsspec_ls_file(): - fs = fsspec.filesystem("vsi") + fs = 
fsspec.filesystem("gdalvsi") ret = fs.ls("data/byte.tif") assert ret == ["data/byte.tif"] def test_gdal_fsspec_ls_error(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): - fs.ls("vsi://i/do/not/exist") + fs.ls("gdalvsi://i/do/not/exist") def test_gdal_fsspec_modified(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") modified = fs.modified("data/byte.tif") assert modified is not None import datetime @@ -108,70 +108,70 @@ def test_gdal_fsspec_modified(): def test_gdal_fsspec_modified_error(): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): - fs.modified("vsi://i/do/not/exist") + fs.modified("gdalvsi://i/do/not/exist") def test_gdal_fsspec_rm(): - with fsspec.open("vsimem:///foo.bin", "wb") as f: + with fsspec.open("gdalvsi:///vsimem/foo.bin", "wb") as f: f.write(b"""bar""") - fs = fsspec.filesystem("vsimem") - fs.info("/foo.bin") - fs.rm("/foo.bin") + fs = fsspec.filesystem("gdalvsi") + fs.info("/vsimem/foo.bin") + fs.rm("/vsimem/foo.bin") with pytest.raises(FileNotFoundError): - fs.info("/foo.bin") + fs.info("/vsimem/foo.bin") def test_gdal_fsspec_rm_error(): - fs = fsspec.filesystem("vsimem") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): - fs.rm("/foo.bin") + fs.rm("/vsimem/foo.bin") def test_gdal_fsspec_copy(): - with fsspec.open("vsimem://foo.bin", "wb") as f: + with fsspec.open("gdalvsi:///vsimem/foo.bin", "wb") as f: f.write(b"""bar""") - fs = fsspec.filesystem("vsimem") - fs.copy("/foo.bin", "/bar.bin") - assert fs.info("/bar.bin")["size"] == 3 - assert fs.info("/foo.bin")["size"] == 3 - fs.rm("/foo.bin") - fs.rm("/bar.bin") + fs = fsspec.filesystem("gdalvsi") + fs.copy("/vsimem/foo.bin", "/vsimem/bar.bin") + assert fs.info("/vsimem/bar.bin")["size"] == 3 + assert fs.info("/vsimem/foo.bin")["size"] == 3 + fs.rm("/vsimem/foo.bin") + fs.rm("/vsimem/bar.bin") def test_gdal_fsspec_copy_error(): - 
fs = fsspec.filesystem("vsimem") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): - fs.copy("/foo.bin", "/bar.bin") + fs.copy("/vsimem/foo.bin", "/vsimem/bar.bin") def test_gdal_fsspec_mv(): - with fsspec.open("vsimem://foo.bin", "wb") as f: + with fsspec.open("gdalvsi:///vsimem/foo.bin", "wb") as f: f.write(b"""bar""") - fs = fsspec.filesystem("vsimem") - fs.mv("/foo.bin", "/bar.bin") - assert fs.info("/bar.bin")["size"] == 3 + fs = fsspec.filesystem("gdalvsi") + fs.mv("/vsimem/foo.bin", "/vsimem/bar.bin") + assert fs.info("/vsimem/bar.bin")["size"] == 3 with pytest.raises(FileNotFoundError): - fs.info("/foo.bin") - fs.rm("/bar.bin") + fs.info("/vsimem/foo.bin") + fs.rm("/vsimem/bar.bin") def test_gdal_fsspec_mv_error(): - fs = fsspec.filesystem("vsimem") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(FileNotFoundError): - fs.mv("/foo.bin", "/bar.bin") + fs.mv("/vsimem/foo.bin", "/bar.bin") def test_gdal_fsspec_mkdir(tmp_path): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") my_path = str(tmp_path) + "/my_dir" @@ -189,7 +189,7 @@ def test_gdal_fsspec_mkdir(tmp_path): with pytest.raises(FileNotFoundError): fs.info(my_path) - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") with pytest.raises(Exception): fs.mkdir(my_path + "/my_subdir", create_parents=False) with pytest.raises(FileNotFoundError): @@ -198,7 +198,7 @@ def test_gdal_fsspec_mkdir(tmp_path): def test_gdal_fsspec_makedirs(tmp_path): - fs = fsspec.filesystem("vsi") + fs = fsspec.filesystem("gdalvsi") my_path = str(tmp_path) + "/my_dir" fs.makedirs(my_path) @@ -218,12 +218,8 @@ def test_gdal_fsspec_usable_by_pyarrow_dataset(tmp_vsimem): tmp_vsimem_file, open("../ogr/data/parquet/test.parquet", "rb").read() ) - fs_vsimem = fsspec.filesystem("vsimem") + fs_vsimem = fsspec.filesystem("gdalvsi") - assert ( - ds.dataset(tmp_vsimem_file[len("/vsimem") :], filesystem=fs_vsimem) is not None - ) + assert ds.dataset(tmp_vsimem_file, 
filesystem=fs_vsimem) is not None - assert ( - ds.dataset(str(tmp_vsimem)[len("/vsimem") :], filesystem=fs_vsimem) is not None - ) + assert ds.dataset(str(tmp_vsimem), filesystem=fs_vsimem) is not None diff --git a/doc/source/spelling_wordlist.txt b/doc/source/spelling_wordlist.txt index 4c4ea2d861af..484b6fd812ea 100644 --- a/doc/source/spelling_wordlist.txt +++ b/doc/source/spelling_wordlist.txt @@ -1018,6 +1018,7 @@ GDALThreadLocalDatasetCache gdaltindex gdaltransform gdalvirtualmem +gdalvsi gdalwarp gdalwmscache gdb diff --git a/swig/python/osgeo/gdal_fsspec.py b/swig/python/osgeo/gdal_fsspec.py index 5debc910ec36..5861422095dc 100644 --- a/swig/python/osgeo/gdal_fsspec.py +++ b/swig/python/osgeo/gdal_fsspec.py @@ -1,29 +1,18 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2024, Even Rouault -"""Module exposing GDAL Virtual File Systems (VSI) as fsspec implementations. +"""Module exposing GDAL Virtual File Systems (VSI) as a "gdalvsi" fsspec implementation. Importing "osgeo.gdal_fsspec" requires the Python "fsspec" (https://filesystem-spec.readthedocs.io/en/latest/) module to be available. - A generic "vsi" fsspec protocol is available. All GDAL VSI file names must be - simply prefixed with "vsi://". For example: + A generic "gdalvsi" fsspec protocol is available. All GDAL VSI file names must be + simply prefixed with "gdalvsi://". For example: - - "vsi://data/byte.tif" to access relative file "data/byte.tif" - - "vsi:///home/user/byte.tif" to access absolute file "/home/user/byte.tif" - - "vsi:///vsimem/byte.tif" (note the 3 slashes) to access VSIMem file "/vsimem/byte.tif" - - Each VSI file system is also registered as a distinct fsspec protocol, such - as "vsimem", "vsicurl", "vsizip", "vsitar", etc. 
- - Examples: - - - "vsimem://byte.tif" to access file "/vsimem/byte.tif" - - "vsicurl://http://example.com/foo" to access file "/vsicurl/http://example.com/foo" - - "vsis3://my_bucket/byte.tif" to access file "/vsis3/my_bucket/byte.tif" - - "vsizip:///home/user/my.zip/foo.tif" (note the 3 slashes to indicate absolute path) - to access (absolute) file "/vsizip//home/user/my.zip/foo.tif" - - "vsizip://my.zip/foo.tif" to access (relative) file "/vsizip/my.zip/foo.tif" + - "gdalvsi://data/byte.tif" to access relative file "data/byte.tif" + - "gdalvsi:///home/user/byte.tif" to access absolute file "/home/user/byte.tif" + - "gdalvsi:///vsimem/byte.tif" (note the 3 slashes) to access VSIMem file "/vsimem/byte.tif" + - "gdalvsi:///vsicurl/https://example.com/byte.tif" (note the 3 slashes) to access "https://example.com/byte.tif" through /vsicurl/ :since: GDAL 3.11 """ @@ -44,52 +33,19 @@ class VSIFileSystem(AbstractFileSystem): def _get_gdal_path(cls, path): """Return a GDAL compatible file from a fsspec file name. - For the file system using the generic "vsi" protocol, - remove the leading vsi:// if found (normally, it should be there, + Remove the leading gdalvsi:// if found (normally, it should be there, but most AbstractFileSystem implementations seem to be ready to remove it if found) - - For specialized file systems, like vsimem://, etc., for an input - like "vsimem:///foo", return "/vsimem/foo". And for an input like - "/foo" also return "/vsimem/foo".
""" if isinstance(path, PurePath): path = stringify_path(path) - if cls.protocol == "vsi": - # "vsi://something" just becomes "something" - if path.startswith("vsi://"): - return path[len("vsi://") :] - - return path + # "gdalvsi://something" just becomes "something" + if path.startswith("gdalvsi://"): + return path[len("gdalvsi://") :] - else: - list_protocols_that_need_leeding_slash = [ - "vsis3", - "vsigs", - "vsiaz", - "vsioss", - "vsiswift", - ] - list_protocols_that_need_leeding_slash += [ - item + "_streaming" for item in list_protocols_that_need_leeding_slash - ] - list_protocols_that_need_leeding_slash.append("vsimem") - - # Deal with paths like "vsis3://foo" - full_protocol = cls.protocol + "://" - if path.startswith(full_protocol): - path = path[len(full_protocol) :] - - # Deal with paths like "/foo" with a VSIFileSystem that is something like "vsis3" - if ( - cls.protocol in list_protocols_that_need_leeding_slash - and not path.startswith("/") - ): - path = "/" + path - - return "/" + cls.protocol + path + return path def _open( self, @@ -271,22 +227,10 @@ def copy( def register_vsi_implementations(): - """Register a generic "vsi" protocol and "vsimem", "vsitar", etc. + """Register a generic "gdalvsi" protocol. This method is automatically called on osgeo.gdal_fsspec import. """ - register_implementation("vsi", VSIFileSystem) - for vsi_prefix in gdal.GetFileSystemsPrefixes(): - if vsi_prefix.startswith("/vsi") and not vsi_prefix.endswith("?"): - assert vsi_prefix.endswith("/") - protocol = vsi_prefix[1:-1] - # We need to duplicate the base class for each protocol, so that - # each class has a distinct "protocol" member. - new_class = type( - "VSIFileSystem_" + protocol, - VSIFileSystem.__bases__, - dict(VSIFileSystem.__dict__), - ) - register_implementation(protocol, new_class) + register_implementation("gdalvsi", VSIFileSystem) register_vsi_implementations()