Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

References #90

Draft
wants to merge 13 commits into
base: dev
Choose a base branch
from
4 changes: 4 additions & 0 deletions .conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ requirements:
- numpy
- scipy
- ruamel_yaml
- pandas
- pyarrow
- pyyaml
- pathlib # [py2k]
- enum34 # [py2k]
Expand All @@ -34,6 +36,8 @@ test:
- pytest
- pytest-benchmark
- h5py
- pyarrow
- pandas
- six
- coverage
- codecov
Expand Down
6 changes: 5 additions & 1 deletion exdir/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from . import core
from . import plugin_interface
from . import plugins
from .core import File, validation, Attribute, Dataset, Group, Raw, Object
from .core import (
File, validation, Attribute, Dataset, Group, Raw, Object, SoftLink,
ExternalLink, ref_dtype, regionref_dtype, Reference, RegionReference,
special_dtype, check_dtype
)

# TODO remove versioneer
from ._version import get_versions
Expand Down
2 changes: 2 additions & 0 deletions exdir/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@
from .dataset import Dataset
from .group import Group
from .raw import Raw
from .links import SoftLink, ExternalLink, Reference, RegionReference
from .dtype import ref_dtype, regionref_dtype, special_dtype, check_dtype
6 changes: 6 additions & 0 deletions exdir/core/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ def filename(self):
def __len__(self):
    """Return the number of attributes (the number of keys)."""
    return len(self.keys())

def get(self, name, default=None):
    """Return the attribute ``name``, or ``default`` if it is not set.

    Mirrors ``dict.get``. ``default`` defaults to ``None``, preserving the
    previous behavior of returning ``None`` for a missing attribute.

    Parameters
    ----------
    name :
        Key of the attribute to look up.
    default :
        Value returned when ``name`` is not present.
    """
    # Membership test first: __getitem__ may raise for missing keys.
    if name in self:
        return self[name]
    return default

def update(self, value):
"""
Update the Attribute with the key/value pairs from :code:`value`, overwriting existing keys.
Expand Down
9 changes: 9 additions & 0 deletions exdir/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
EXDIR_METANAME = "exdir"
TYPE_METANAME = "type"
VERSION_METANAME = "version"
LINK_METANAME = "link"
TARGET_METANAME = "target"

# link metadata keys
LINK_TYPENAME = "link"
LINK_TARGETNAME = "target"
LINK_EXTERNALNAME = "external"
LINK_SOFTNAME = "soft"
LINK_FILENAME = "file"

# filenames
META_FILENAME = "exdir.yaml"
Expand Down
145 changes: 110 additions & 35 deletions exdir/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import numbers
import numpy as np
import pyarrow.feather as feather
import pandas as pd
import exdir

from . import exdir_object as exob
from .links import Reference, RegionReference
from .mode import assert_file_open, OpenMode, assert_file_writable

# On-disk suffixes for the two dataset storage backends:
# plain NumPy arrays (.npy) and Feather-serialized pandas DataFrames.
NUMPY_SUFFIX = '.npy'
FEATHER_SUFFIX = '.feather'


def _prepare_write(data, plugins, attrs, meta):
for plugin in plugins:
dataset_data = exdir.plugin_interface.DatasetData(
Expand All @@ -25,7 +32,14 @@ def _prepare_write(data, plugins, attrs, meta):


def _dataset_filename(dataset_directory):
    """Locate the data file inside ``dataset_directory``.

    Returns
    -------
    (filename, is_numpy) : tuple
        The Feather file path with ``is_numpy=False`` when a Feather file
        exists; otherwise the NumPy ``.npy`` path with ``is_numpy=True``
        (whether or not that file exists yet).
    """
    base = dataset_directory / "data"
    feather_file = base.with_suffix(FEATHER_SUFFIX)
    if feather_file.exists():
        return feather_file, False
    # Default backend is NumPy; also chosen when neither file exists yet.
    return base.with_suffix(NUMPY_SUFFIX), True


class Dataset(exob.Object):
Expand All @@ -44,17 +58,30 @@ def __init__(self, root_directory, parent_path, object_name, file):
object_name=object_name,
file=file
)
self._data_memmap = None
self._data_loaded = None
self.plugin_manager = file.plugin_manager
self.data_filename = str(_dataset_filename(self.directory))

def __getitem__(self, args):
assert_file_open(self.file)

if self.meta.get('has_ref'):
refs = self._data.values[args]
Refs = []
for ref in refs:
if Reference.base in ref:
Refs.append(Reference(ref))
elif RegionReference.base in ref:
Refs.append(RegionReference(ref))
if len(Refs) == 1:
return Refs[0]
return Refs

if len(self._data.shape) == 0:
values = self._data
else:
values = self._data[args]


enabled_plugins = [plugin_module.name for plugin_module in self.plugin_manager.plugins]

data = values
Expand All @@ -75,9 +102,8 @@ def __getitem__(self, args):
meta = self.meta.to_dict()
atts = self.attrs.to_dict()

dataset_data = exdir.plugin_interface.DatasetData(data=values,
attrs=self.attrs.to_dict(),
meta=meta)
dataset_data = exdir.plugin_interface.DatasetData(
data=values, attrs=self.attrs.to_dict(), meta=meta)
for plugin in plugins:
dataset_data = plugin.prepare_read(dataset_data)

Expand All @@ -88,56 +114,92 @@ def __getitem__(self, args):
def __setitem__(self, args, value):
    """Write ``value`` into the dataset at the position(s) given by ``args``.

    Reference-typed datasets (``meta['has_ref']``) store the target's
    ``ref`` string instead of the value itself. Feather-backed (DataFrame)
    data is not memory-mapped, so it is flushed to disk explicitly after
    the assignment.
    """
    assert_file_writable(self.file)

    if self.meta.get('has_ref'):
        self._data.values[args] = value.ref
        self.flush()
        return

    _, is_numpy = _dataset_filename(self.directory)

    value, attrs, meta = _prepare_write(
        data=value,
        plugins=self.plugin_manager.dataset_plugins.write_order,
        attrs=self.attrs.to_dict(),
        meta=self.meta.to_dict()
    )
    self._data[args] = value
    if not is_numpy:
        # Feather data lives fully in memory; persist the change to disk.
        self.flush()
    self.attrs = attrs
    self.meta._set_data(meta)

def flush(self):
    """Write the current in-memory data back to disk via the ``data`` setter."""
    self.data = self._data

def _reload_data(self):
    """(Re)load the on-disk data into ``self._data_loaded``.

    NumPy data is memory-mapped ("r" when the file is read-only, "r+"
    otherwise); Feather data is read fully into a pandas DataFrame.

    Raises
    ------
    IOError
        When the data file is a Git LFS placeholder rather than real data.
    ValueError
        Re-raised from ``np.load`` for any other unreadable content.
    """
    assert_file_open(self.file)
    data_filename, is_numpy = _dataset_filename(self.directory)
    for plugin in self.plugin_manager.dataset_plugins.write_order:
        plugin.before_load(str(data_filename))

    if self.file.io_mode == OpenMode.READ_ONLY:
        mmap_mode = "r"
    else:
        mmap_mode = "r+"

    try:
        if is_numpy:
            self._data_loaded = np.load(
                str(data_filename),
                mmap_mode=mmap_mode, allow_pickle=False)
        else:
            self._data_loaded = feather.read_feather(str(data_filename))
        self.file._open_datasets[self.name] = self
    except ValueError as e:
        # np.load raises ValueError for non-npy content. A common cause is
        # a Git LFS placeholder file; detect that and give a clear error.
        with open(str(data_filename), "r") as f:
            test_string = "version https://git-lfs.github.com/spec/v1"
            contents = f.read(len(test_string))
        if contents == test_string:
            raise IOError("The file '{}' is a Git LFS placeholder. "
                          "Open the Exdir File with the Git LFS plugin or run"
                          " `git lfs fetch` first. ".format(str(data_filename)))
        else:
            raise e

def _reset_data(self, value, attrs, meta):
assert_file_open(self.file)
self._data_memmap = np.lib.format.open_memmap(
self.data_filename,
mode="w+",
dtype=value.dtype,
shape=value.shape
)

if len(value.shape) == 0:
# scalars need to be set with itemset
self._data_memmap.itemset(value)
data_filename, _ = _dataset_filename(self.directory)
if isinstance(value, pd.DataFrame):
feather.write_feather(
value, str(data_filename.with_suffix(FEATHER_SUFFIX)))
if data_filename.with_suffix(NUMPY_SUFFIX).exists():
data_filename.with_suffix(NUMPY_SUFFIX).unlink()
else:
# replace the contents with the value
self._data_memmap[:] = value
self._data_loaded = np.lib.format.open_memmap(
str(data_filename.with_suffix(NUMPY_SUFFIX)),
mode="w+",
dtype=value.dtype,
shape=value.shape
)

if len(value.shape) == 0:
# scalars need to be set with itemset
self._data_loaded.itemset(value)
else:
# replace the contents with the value
self._data_loaded[:] = value

if data_filename.with_suffix(FEATHER_SUFFIX).exists():
data_filename.with_suffix(FEATHER_SUFFIX).unlink()

# update attributes and plugin metadata
if attrs:
Expand Down Expand Up @@ -177,17 +239,30 @@ def data(self):
@data.setter
def data(self, value):
    """Replace the dataset contents.

    DataFrames always go through ``_reset_data`` (Feather backend). Array
    values reuse the existing memmap in place when shape and dtype match;
    otherwise the backing file is recreated with the new shape/dtype.
    """
    assert_file_open(self.file)

    if isinstance(value, pd.DataFrame):
        value, attrs, meta = _prepare_write(
            data=value,
            plugins=self.plugin_manager.dataset_plugins.write_order,
            attrs=self.attrs.to_dict(),
            meta=self.meta.to_dict()
        )
        self._reset_data(value, attrs, meta)
        return

    if hasattr(self._data, 'dtype'):
        dtype_changed = self._data.dtype != value.dtype
    else:
        dtype_changed = True  # current data is a DataFrame: feather -> numpy
    if self._data.shape != value.shape or dtype_changed:
        value, attrs, meta = _prepare_write(
            data=value,
            plugins=self.plugin_manager.dataset_plugins.write_order,
            attrs=self.attrs.to_dict(),
            meta=self.meta.to_dict()
        )
        self._reset_data(value, attrs, meta)
        return

    self[:] = value

@property
def shape(self):
Expand Down Expand Up @@ -261,21 +336,21 @@ def __iter__(self):
if len(self.shape) == 0:
raise TypeError("Can't iterate over a scalar dataset")

for i in range(self.shape[0]):
yield self[i]
for val in self.data:
yield val

def __str__(self):
    """String form of the dataset: identical to printing the underlying data."""
    return str(self.data)

def __repr__(self):
    """Short description of the dataset; safe to call on a closed file."""
    if self.file.io_mode == OpenMode.FILE_CLOSED:
        return "<Closed Exdir Dataset>"
    # dtype is intentionally omitted: DataFrame-backed datasets have no
    # single dtype attribute.
    return "<Exdir Dataset {} shape {}>".format(
        self.name, self.shape)

@property
def _data(self):
    """Lazily-loaded underlying data (NumPy memmap or pandas DataFrame).

    Loads from disk on first access; subsequent accesses return the cached
    ``_data_loaded`` object.
    """
    assert_file_open(self.file)
    if self._data_loaded is None:
        self._reload_data()
    return self._data_loaded
Loading