-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Task/bulk sms poor csv performance (#350)
* Add decorator to process iterables in parallel * Bump utils + waffles version * Add auto scaling to parallelizing decorator - Added a few more tests * Add exception handling and logging - Tweaked control_chunk_and_worker_size to ensure that the chunk sizes fit evenly with the number of available workers so long as the chunk size does not exceed the max chunk size. This should mean fewer instances of where a few items from the iterable are left over and a final thread is spawned to process them after the initial batch * Refine break_condition type hinting Co-authored-by: Jimmy Royer <[email protected]> * Add is_atomic flag to control behaviour - Code cleanups --------- Co-authored-by: Jumana B <[email protected]> Co-authored-by: Jimmy Royer <[email protected]>
- Loading branch information
1 parent
195b462
commit 0fb484b
Showing
6 changed files
with
235 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,4 @@ docopt==0.6.2 | |
Flask==2.3.3 | ||
markupsafe==2.1.5 | ||
setuptools==75.6.0 # required for distutils in Python 3.12 | ||
git+https://github.com/cds-snc/[email protected].1#egg=notifications-utils | ||
git+https://github.com/cds-snc/[email protected].2#egg=notifications-utils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Helper function to chunk a list | ||
from itertools import islice | ||
from typing import Generator, Iterable | ||
|
||
|
||
def chunk_iterable(iterable_collection: Iterable, chunk_size: int) -> Generator:
    """Yield successive fixed-size chunks from an iterable collection.

    Prepares data for parallel processing by slicing the input into lists
    of at most ``chunk_size`` items; the final chunk may be shorter.

    Args:
        iterable_collection (Iterable): The collection to be chunked
        chunk_size (int): The maximum size of each chunk

    Yields:
        list: The next chunk of the iterable
    """
    source = iter(iterable_collection)
    # islice consumes up to chunk_size items per pass; an empty list means exhausted
    while batch := list(islice(source, chunk_size)):
        yield batch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import math | ||
import multiprocessing | ||
|
||
|
||
def control_chunk_and_worker_size(data_size=None, chunk_size=None, max_workers=None):
    """Attempts to optimize the chunk size and number of workers based on the size of the data to be processed. The following rules are applied:
    - Max concurrently allowed workers defaults to the number of CPU cores
    - 1 worker is used when data sets <= 1000
    - Chunk sizes are capped at 10,000 (unless overridden) and derived from `data_size / max_workers`
    - For chunk sizes below the cap, worker counts are scaled up
    - For chunk sizes above 80% of the cap, the concurrent workers scale down to half the max to limit CPU context switching
    Args:
        data_size (int): Size of the iterable being chunked. Required; kept
            keyword-optional for backward compatibility with existing callers.
        chunk_size (int, optional): Overrides default max chunk size of 10000. Defaults to None.
        max_workers (int, optional): Overrides default max workers of `multiprocessing.cpu_count()`. Defaults to None.
    Returns:
        tuple[int, int]: The optimized chunk size and number of workers to execute in parallel.
    Raises:
        ValueError: If data_size is None or not positive (previously this
            surfaced as an opaque TypeError from the `<=` comparison).
    """
    if data_size is None or data_size <= 0:
        raise ValueError("data_size must be a positive integer")

    MIN_CHUNK_SIZE = 1000
    MAX_CHUNK_SIZE = 10000 if not chunk_size else chunk_size
    MAX_WORKERS = multiprocessing.cpu_count() if not max_workers else max_workers

    # Small data sets are not worth parallelizing: single worker, minimum chunk.
    if data_size <= MIN_CHUNK_SIZE:
        return MIN_CHUNK_SIZE, 1

    # Initial chunk size: spread the data evenly across the max workers,
    # clamped to the [MIN_CHUNK_SIZE, MAX_CHUNK_SIZE] range.
    ideal_chunk_size = max(data_size // MAX_WORKERS, MIN_CHUNK_SIZE)
    ideal_chunk_size = min(ideal_chunk_size, MAX_CHUNK_SIZE)

    # Adjust the chunk size so chunks divide evenly among workers (fewer leftovers).
    worker_count = min(math.ceil(data_size / ideal_chunk_size), MAX_WORKERS)
    chunk_size = math.ceil(data_size / worker_count)

    # Ensure chunk size remains within min and max chunk size bounds.
    chunk_size = max(MIN_CHUNK_SIZE, min(chunk_size, MAX_CHUNK_SIZE))
    worker_count = math.ceil(data_size / chunk_size)

    # Suppress workers for larger chunks to avoid memory and/or context switching overhead.
    if chunk_size > MAX_CHUNK_SIZE * 0.8:
        worker_count = min(worker_count, MAX_WORKERS // 2)
    else:
        worker_count = min(worker_count, MAX_WORKERS)

    return chunk_size, worker_count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters