Skip to content

Commit

Permalink
implement option to not append STATS.nc files (#193)
Browse files Browse the repository at this point in the history
* implement option to not append STATS.nc files

* Update pyproject.toml

* add combine_stats_netcdf_files

* better indexing of STATS to be incremental over multiple raw images
  • Loading branch information
emlynjdavies authored Aug 20, 2024
1 parent b581f6c commit 7d586cb
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 15 deletions.
3 changes: 2 additions & 1 deletion docs/notebooks/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ pixel_size = 24 # pixel size of imaging system in microns

[steps.output]
pipeline_class = 'pyopia.io.StatsToDisc'
output_datafile = './test' # prefix path for output nc file
output_datafile = './test' # prefix path for output nc file
append = true
2 changes: 1 addition & 1 deletion pyopia/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.1.11'
__version__ = '1.1.12'
62 changes: 50 additions & 12 deletions pyopia/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from pyopia import __version__ as pyopia_version


def write_stats(
stats,
datafilename,
settings=None,
export_name_len=40,
dataformat='nc'):
def write_stats(stats,
datafilename,
settings=None,
export_name_len=40,
dataformat='nc',
append=True):
'''
Writes particle stats into the output file.
Appends if file already exists.
Expand All @@ -27,6 +27,12 @@ def write_stats(
datafilename (str): filename prefix for -STATS.h5 file that may or may not include a path
stats_all (DataFrame): stats dataframe returned from processImage()
export_name_len (int): max number of chars allowed for col 'export name'
append (bool): Append all processed data into one nc file.
Defaults to True.
If False, then one nc file will be generated per raw image,
which can be loaded using :func:`pyopia.io.combine_stats_netcdf_files`
This is useful for larger datasets, where appending causes substantial slowdown
as the dataset gets larger.
'''

if len(stats) == 0: # to avoid issue with wrong time datatypes in xarray
Expand All @@ -51,12 +57,15 @@ def write_stats(
meta.attrs['Pipeline steps'] = settings
elif dataformat == 'nc':
xstats = make_xstats(stats, settings)
if os.path.isfile(datafilename + '-STATS.nc'):
if append and os.path.isfile(datafilename + '-STATS.nc'):
existing_stats = load_stats(datafilename + '-STATS.nc')
xstats = xarray.concat([existing_stats, xstats], 'index')

xstats.index.values[:] = range(0, xstats.index.size)
elif not append:
xstats = xstats.set_index(index="index")
datafilename += ('-Image-D' +
str(xstats['timestamp'][0].values).replace('-', '').replace(':', '').replace('.', '-'))
encoding = {k: {'dtype': 'str'} for k in ['export name', 'holo_filename'] if k in xstats.data_vars}

xstats.to_netcdf(datafilename + '-STATS.nc', encoding=encoding)


Expand Down Expand Up @@ -94,7 +103,7 @@ def load_stats(datafilename):
Returns
-------
DataFrame
STATS SataFrame
STATS DataFrame / xarray dataset
'''

if datafilename.endswith('.nc'):
Expand All @@ -110,6 +119,25 @@ def load_stats(datafilename):
return stats


def combine_stats_netcdf_files(path_to_data):
    '''Combine a multi-file directory of STATS.nc files into one 'stats' xarray dataset.

    The files are those created by :func:`pyopia.io.write_stats` when using 'append = false'.

    Parameters
    ----------
    path_to_data : str
        Folder name containing nc files with pattern '*Image-D*-STATS.nc'

    Returns
    -------
    xarray.Dataset
        STATS xarray dataset, concatenated along 'index' and renumbered from 0
    '''
    file_pattern = os.path.join(path_to_data, '*Image-D*-STATS.nc')
    xstats = xarray.open_mfdataset(file_pattern, combine='nested', concat_dim='index')

    # Renumber 'index' so it runs 0..N-1 over the combined dataset.
    # Dataset.set_index() expects a mapping of index names to coordinate names,
    # so new index *values* have to be assigned as a coordinate instead.
    xstats = xstats.assign_coords(index=range(xstats.index.size))
    return xstats


def load_stats_as_dataframe(stats_file):
'''A loading function for stats files that forces stats into a pandas DataFrame
Expand Down Expand Up @@ -159,6 +187,12 @@ class StatsToDisc():
dataformat (str): either 'nc' or 'h5
append (bool): if to allow append to an existing STATS file. Defaults to True
export_name_len (int): max number of chars allowed for col 'export name'. Defaults to 40
append (bool): Append all processed data into one nc file.
Defaults to True.
If False, then one nc file will be generated per raw image,
which can be loaded using :func:`pyopia.io.combine_stats_netcdf_files`
This is useful for larger datasets, where appending causes substantial slowdown
as the dataset gets larger.
Returns:
data (dict): data from pipeline
Expand All @@ -170,21 +204,25 @@ class StatsToDisc():
[steps.output]
pipeline_class = 'pyopia.io.StatsToDisc'
output_datafile = './test' # prefix path for output nc file
append = true
'''
def __init__(self,
output_datafile='data',
dataformat='nc',
export_name_len=40):
export_name_len=40,
append=True):

self.output_datafile = output_datafile
self.dataformat = dataformat
self.export_name_len = export_name_len
self.append = append

def __call__(self, data):
write_stats(data['stats'], self.output_datafile,
settings=data['settings'],
dataformat=self.dataformat,
export_name_len=self.export_name_len)
export_name_len=self.export_name_len,
append=self.append)

return data

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ urllib3 = "<2.0"
gdown = "^4.7.1"
cmocean = "^3.0.3"
toml = "^0.10.2"
xarray = "^2023.8.0"
xarray = "^2023.12.0"
typer = {extras = ["all"], version = "^0.9.0"}
pandas = {version = "^2.1.1", extras = ["computation"]}
h5py = "^3.9.0"
Expand All @@ -47,6 +47,7 @@ tensorflow-cpu = {version = "2.11.0", optional = true}
tensorflow-io-gcs-filesystem = [
{version = ">=0.31.0", optional=true}
]
dask = ">=2024.8.1"

[tool.poetry.extras]
classification-arm64 = ["tensorflow-io-gcs-filesystem", "tensorflow-macos"]
Expand Down

0 comments on commit 7d586cb

Please sign in to comment.