diff --git a/rsciio/mrc/_api.py b/rsciio/mrc/_api.py
index 5370d4d7..9735bb8e 100644
--- a/rsciio/mrc/_api.py
+++ b/rsciio/mrc/_api.py
@@ -140,7 +140,7 @@ def get_data_type(mode):
         raise ValueError(f"Unrecognised mode '{mode}'.")
 
 
-def read_de_metadata_file(filename, nav_shape=None):
+def read_de_metadata_file(filename, nav_shape=None, scan_pos_file=None):
     """This reads the metadata ".txt" file that is saved alongside a DE .mrc file.
 
     There are 3 ways in which the files are saved:
@@ -163,20 +163,37 @@ def read_de_metadata_file(filename, nav_shape=None):
     original_metadata = {}
     with open(filename) as metadata:
         for line in metadata.readlines():
-            key, value = line.split("=")
-            key = key.strip()
-            value = value.strip()
-            original_metadata[key] = value
-
+            try:
+                key, value = line.split("=")
+                key = key.strip()
+                value = value.strip()
+                original_metadata[key] = value
+            except ValueError:
+                _logger.warning(
+                    f"Could not parse line: {line} in metadata file {filename}. "
+                    f"Each line should be in the form 'key = value'."
+                )
     # -1 -> Not read from TEM Channel 0 -> TEM 1 -> STEM
     in_stem_mode = int(original_metadata.get("Instrument Project TEMorSTEM Mode", -1))
     scanning = original_metadata.get("Scan - Enable", "Disable") == "Enable"
     raster = original_metadata.get("Scan - Type", "Raster") == "Raster"
-    if not raster:  # pragma: no cover
-        _logger.warning(
-            "Non-raster scans are not fully supported yet. Please raise an issue on GitHub"
-            " if you need this feature."
-        )
+
+    if scanning and not raster:
+        if scan_pos_file is None:
+            raise ValueError("Scan position file is required for non-raster scans.")
+        positions = np.loadtxt(scan_pos_file, delimiter=",", dtype=int)
+        nav_shape = (positions[:, 1].max() + 1, positions[:, 0].max() + 1)
+        unique_pos, counts = np.unique(positions, axis=0, return_counts=True)
+        repeats = np.max(counts)
+        if repeats > 1:
+            nav_shape = (
+                nav_shape[0],
+                nav_shape[1],
+                repeats,
+            )  # extra axis for repeated scan positions
+    else:
+        positions = None
+
     if in_stem_mode == -1:
         in_stem_mode = scanning
     elif in_stem_mode == 0:  # 0 -> TEM Mode
@@ -191,7 +208,9 @@
         has_camera_length != -1 or in_stem_mode
     )  # Force diffracting if in STEM mode
 
-    if in_stem_mode and scanning and raster or nav_shape is not None:
+    if (
+        in_stem_mode and scanning and raster or nav_shape is not None
+    ):  # nav_shape is not None for position files
         axes_scales = np.array(
             [
                 float(
@@ -292,7 +311,7 @@
         },
         "General": {"timestamp": timestamp},
     }
-    return original_metadata, metadata, axes, nav_shape
+    return original_metadata, metadata, axes, nav_shape, positions
 
 
 def file_reader(
@@ -306,6 +325,7 @@
     metadata_file="auto",
     virtual_images=None,
     external_images=None,
+    scan_file=None,
 ):
     """
     File reader for the MRC format for tomographic and 4D-STEM data.
@@ -328,6 +348,9 @@
     external_images : list
         A list of filenames of external images (e.g. external detectors) to be loaded
        alongside the main data. For DE movies these are automatically loaded.
+    scan_file : str
+        The filename of the scan position file. This is only used for DE movies to
+        determine the scan order for custom (non-raster) scans.
 
     %s
     """
@@ -342,7 +365,10 @@
         suffix = "_".join(split[2:-1])
     else:
         suffix = ""
+    if len(dir_name) == 0:
+        dir_name = "."
     metadata = glob.glob(dir_name + "/" + unique_id + suffix + "_info.txt")
+
     virtual_images = glob.glob(
         dir_name + "/" + unique_id + suffix + "_[0-4]_*.mrc"
     )
@@ -354,6 +380,16 @@
         ):  # Not a DE movie or File Naming Convention is not followed
             _logger.warning("Could not find metadata file for DE movie.")
             metadata = []
+        try:
+            scan_file = glob.glob(
+                f"{dir_name}/{unique_id}_{suffix}*coordinates.csv"
+            )[0]
+        except IndexError:
+            _logger.warning(
+                f"Could not find scan file [{dir_name}/{unique_id}_{suffix}*coordinates.csv]"
+                f" for DE movie {filename}."
+            )
+            scan_file = None
     else:
         metadata = []
     if len(metadata) == 1:
@@ -368,7 +404,10 @@
             metadata,
             metadata_axes,
             _navigation_shape,
-        ) = read_de_metadata_file(metadata_file, nav_shape=navigation_shape)
+            positions,
+        ) = read_de_metadata_file(
+            metadata_file, nav_shape=navigation_shape, scan_pos_file=scan_file
+        )
         if navigation_shape is None:
             navigation_shape = _navigation_shape
         original_metadata = {"de_metadata": de_metadata}
@@ -376,6 +415,7 @@
         original_metadata = {}
         metadata = {"General": {}, "Signal": {}}
         metadata_axes = None
+        positions = None
 
     metadata["General"]["original_filename"] = os.path.split(filename)[1]
     metadata["Signal"]["signal_type"] = ""
@@ -412,12 +452,14 @@
             shape = (NX[0], NY[0], NZ[0])
         if navigation_shape is not None:
             shape = shape[:2] + navigation_shape
-        if distributed:
+
+        if distributed or positions is not None:
             data = memmap_distributed(
                 filename,
                 offset=f.tell(),
                 shape=shape[::-1],
                 dtype=get_data_type(std_header["MODE"]),
+                positions=positions,
                 chunks=chunks,
             )
             if not lazy:
diff --git a/rsciio/utils/distributed.py b/rsciio/utils/distributed.py
index f880a9fa..b2fb4659 100644
--- a/rsciio/utils/distributed.py
+++ b/rsciio/utils/distributed.py
@@ -73,7 +73,55 @@ def get_chunk_slice(
     return da.from_array(slices, chunks=(1,) * len(shape) + slices.shape[-2:]), chunks
 
 
-def slice_memmap(slices, file, dtypes, shape, key=None, **kwargs):
+def get_arbitrary_chunk_slice(
+    positions,
+    shape,
+    chunks="auto",
+    block_size_limit=None,
+    dtype=None,
+):
+    """
+    Get chunk slices for the :func:`rsciio.utils.distributed.slice_memmap` function from
+    arbitrary positions given by a list of x, y coordinates.
+
+    Parameters
+    ----------
+    positions : array-like
+        A numpy array in the form [[x1, y1], [x2, y2], ...] where x, y map the frame to the
+        real space coordinate of the data.
+    shape : tuple
+        Shape of the dataset: navigation dimensions followed by the two signal dimensions.
+    chunks : tuple, optional
+        Chunk shape. The default is "auto".
+    block_size_limit : int, optional
+        Maximum size of a block in bytes. The default is None. This is passed
+        to the :py:func:`dask.array.core.normalize_chunks` function when chunks == "auto".
+    dtype : numpy.dtype, optional
+        Data type. The default is None. This is passed to the
+        :py:func:`dask.array.core.normalize_chunks` function when chunks == "auto".
+    """
+    if not isinstance(positions, np.ndarray):
+        positions = np.array(positions)
+    if chunks == "auto":
+        chunks = ("auto",) * (len(shape) - 2) + (-1, -1)
+    elif chunks[-2:] != (-1, -1):
+        raise ValueError("Last two dimensions of chunks must be -1")
+    chunks = da.core.normalize_chunks(
+        chunks=chunks, shape=shape, limit=block_size_limit, dtype=dtype
+    )
+    pos_mapping = np.zeros(shape=shape[:-2] + (1, 1), dtype=int)
+
+    for i, p in enumerate(positions):
+        pos_mapping[p[1], p[0]] = i + 1
+    pos_mapping = pos_mapping - 1  # 0-based indexing; -1 marks empty frames
+
+    # Now we chunk the pos_mapping array. In this case each frame remains in a single chunk
+    # and we only chunk the navigation dimensions. Later, when we populate the data, we use
+    # the pos_mapping array to map each frame index to its position within a dense array.
+    return da.from_array(pos_mapping, chunks=chunks[:-2] + (1, 1)), chunks
+
+
+def slice_memmap(slices, file, dtypes, shape, key=None, positions=False, **kwargs):
     """
     Slice a memory mapped file using a tuple of slices.
 
@@ -108,13 +156,17 @@
     data = np.memmap(file, dtypes, shape=shape, **kwargs)
     if key is not None:
         data = data[key]
-    slices_ = tuple([slice(s[0], s[1]) for s in slices_])
-    return data[slices_]
+    if positions:
+        return data[slices_]
+    else:
+        slices_ = tuple([slice(s[0], s[1]) for s in slices_])
+        return data[slices_]
 
 
 def memmap_distributed(
     filename,
     dtype,
+    positions=None,
     offset=0,
     shape=None,
     order="C",
@@ -177,15 +229,32 @@
         shape = int(os.path.getsize(filename) / unit_size)
     if not isinstance(shape, tuple):
         shape = (shape,)
-
-    # Separates slices into appropriately sized chunks.
-    chunked_slices, data_chunks = get_chunk_slice(
-        shape=shape + sub_array_shape,
-        chunks=chunks,
-        block_size_limit=block_size_limit,
-        dtype=array_dtype,
-    )
     num_dim = len(shape)
+    if positions is not None:
+        # We have arbitrary positions.
+        chunked_slices, data_chunks = get_arbitrary_chunk_slice(
+            positions=positions,
+            shape=shape + sub_array_shape,
+            chunks=chunks,
+            block_size_limit=block_size_limit,
+            dtype=array_dtype,
+        )
+        drop_axes = None
+        use_positions = True
+        shape = (len(positions),) + shape[-2:]  # update the shape to be linear
+    else:
+        # Separates slices into appropriately sized chunks.
+        chunked_slices, data_chunks = get_chunk_slice(
+            shape=shape + sub_array_shape,
+            chunks=chunks,
+            block_size_limit=block_size_limit,
+            dtype=array_dtype,
+        )
+        drop_axes = (
+            num_dim,
+            num_dim + 1,
+        )  # Dask 2021.10.0 minimum to use negative indexing
+        use_positions = False
@@ -197,10 +266,8 @@
         dtypes=dtype,
         offset=offset,
         chunks=data_chunks,
-        drop_axis=(
-            num_dim,
-            num_dim + 1,
-        ),  # Dask 2021.10.0 minimum to use negative indexing
+        drop_axis=drop_axes,
+        positions=use_positions,
         key=key,
     )
     return data
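
A usage sketch of the new reader path (the file names below are hypothetical and only
illustrate the DE naming convention; the reader returns the usual RosettaSciIO list of
signal dictionaries):

    from rsciio.mrc import file_reader

    # Load a DE movie acquired with a custom (non-raster) scan. The frame -> position
    # mapping comes from the "*coordinates.csv" file, which is either discovered
    # automatically from the file naming convention or passed explicitly via `scan_file`.
    signal_list = file_reader(
        "20240101_00001_movie.mrc",  # hypothetical DE movie file
        lazy=True,
        scan_file="20240101_00001_coordinates.csv",  # x, y coordinate of each frame
    )
    data = signal_list[0]["data"]  # dask array mapped onto the dense scan grid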
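And a minimal sketch of the distributed side, assuming a small raw file written by the
snippet itself (the file name and frame order are made up to mimic a custom scan):

    import numpy as np
    from rsciio.utils.distributed import memmap_distributed

    # Four 8x8 float32 frames written to disk in acquisition order; the scan visits
    # (x, y) = (0, 0), (1, 1), (1, 0), (0, 1), i.e. not raster order.
    frames = np.arange(4 * 8 * 8, dtype=np.float32).reshape(4, 8, 8)
    frames.tofile("custom_scan.raw")
    positions = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])

    # `shape` is navigation shape + signal shape; passing `positions` switches
    # memmap_distributed over to get_arbitrary_chunk_slice, whose frame-index map is
    # used by slice_memmap to place each frame at its real-space coordinate.
    data = memmap_distributed(
        "custom_scan.raw",
        dtype=np.dtype("float32"),
        shape=(2, 2, 8, 8),
        positions=positions,
    )
    dense = data.compute()  # (2, 2, 8, 8); frame 2 lands at row 0, column 1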