diff --git a/changelog.d/20241126_094511_msarniak_fix_creation_time.md b/changelog.d/20241126_094511_msarniak_fix_creation_time.md
new file mode 100644
index 000000000000..7bb1eab6be62
--- /dev/null
+++ b/changelog.d/20241126_094511_msarniak_fix_creation_time.md
@@ -0,0 +1,4 @@
+### Fixed
+
+- Task creation performance for > 50k frames jobs
+  ()
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
index 9bee5fbee4cb..6fde4824f4ca 100644
--- a/cvat/apps/engine/media_extractors.py
+++ b/cvat/apps/engine/media_extractors.py
@@ -208,6 +208,7 @@ def __init__(
         dimension: DimensionType = DimensionType.DIM_2D
     ):
         self._source_path = source_path
+        self._source_path_set = set(source_path) if source_path else set()
 
         self._step = step
 
@@ -312,7 +313,7 @@ def __iter__(self):
             yield (self.get_image(i), self.get_path(i), i)
 
     def __contains__(self, media_file):
-        return media_file in self._source_path
+        return media_file in self._source_path_set
 
     def filter(self, callback):
         source_path = list(filter(callback, self._source_path))
diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py
index 1158bfa700a6..b4d554838817 100644
--- a/cvat/apps/engine/task.py
+++ b/cvat/apps/engine/task.py
@@ -250,7 +250,11 @@ def _count_files(data):
         if not os.path.dirname(v[0]).startswith(v[1])]
 
     # we need to keep the original sequence of files
-    data['server_files'] = [f for f in server_files if f in without_extra_dirs]
+    # Convert without_extra_dirs to a set for O(1) lookups
+    without_extra_dirs_set = set(without_extra_dirs) if without_extra_dirs else set()
+
+    # Filter server_files based on whether they exist in the set
+    data['server_files'] = [f for f in server_files if f in without_extra_dirs_set]
 
     def count_files(file_mapping, counter):
         for rel_path, full_path in file_mapping.items():
@@ -722,9 +726,12 @@ def _update_status(msg: str) -> None:
 
     # We only need to process the files specified in job_file_mapping
     if job_file_mapping is not None:
+        # Convert data['server_files'] to a set for O(1) membership checks
+        server_files_set = set(data['server_files']) if data['server_files'] else set()
+
         filtered_files = []
         for f in itertools.chain.from_iterable(job_file_mapping):
-            if f not in data['server_files']:
+            if f not in server_files_set:
                 raise ValidationError(f"Job mapping file {f} is not specified in input files")
             filtered_files.append(f)
         data['server_files'] = filtered_files
diff --git a/utils/dataset_manifest/core.py b/utils/dataset_manifest/core.py
index c50c3c470cc6..98bea214cf5b 100644
--- a/utils/dataset_manifest/core.py
+++ b/utils/dataset_manifest/core.py
@@ -682,10 +682,16 @@ def data(self):
     def get_subset(self, subset_names):
         index_list = []
         subset = []
+        # Map each name to the index of its FIRST occurrence, which is what
+        # the subset_names.index() call being replaced here returned.
+        name_to_index = {}
+        for index, name in enumerate(subset_names or ()):
+            name_to_index.setdefault(name, index)
+
         for _, image in self:
             image_name = f"{image.full_name}"
-            if image_name in subset_names:
-                index_list.append(subset_names.index(image_name))
+            if image_name in name_to_index:
+                index_list.append(name_to_index[image_name])
                 properties = {
                     "name": f"{image['name']}",
                     "extension": f"{image['extension']}",