diff --git a/backup/dictionary.pkl b/backup/dictionary.pkl index 4d24649..1316889 100644 Binary files a/backup/dictionary.pkl and b/backup/dictionary.pkl differ diff --git a/backup/processing_progress.txt b/backup/processing_progress.txt index b85671a..4b0ad8d 100644 --- a/backup/processing_progress.txt +++ b/backup/processing_progress.txt @@ -1 +1 @@ -19885937,18351999999 \ No newline at end of file +20392261,18823999999 \ No newline at end of file diff --git a/dictionary.msgpack b/dictionary.msgpack index b2eba50..1559b9f 100644 Binary files a/dictionary.msgpack and b/dictionary.msgpack differ diff --git a/lib/merge_batches.py b/lib/merge_batches.py index b6aaf6c..e566da7 100644 --- a/lib/merge_batches.py +++ b/lib/merge_batches.py @@ -278,15 +278,23 @@ def finish_merge(): # Delete all files in training/processed_batches shutil.rmtree('training/processed_batches', ignore_errors=True) + shutil.rmtree('training/copy_of_batches_being_processed_in_this_round', ignore_errors=True) + shutil.copy('training/dictionary.pkl', 'training/batches') def main(): # If training/batches has more than one file, run the function with the first two files + os.makedirs('training/copy_of_batches_being_processed_in_this_round', exist_ok=True) shutil.rmtree('training/batches_to_process', ignore_errors=True) os.makedirs('training/batches_to_process', exist_ok=True) threads = [] for file in os.listdir('training/batches'): - thread = threading.Thread(target=perform_file_operation, args=(f'training/batches/{file}', f'training/batches_to_process/{file}', 'copy')) + thread = threading.Thread(target=perform_file_operation, args=(f'training/batches/{file}', f'training/copy_of_batches_being_processed_in_this_round/{file}', 'move')) + threads.append(thread) + thread.start() + + for file in os.listdir('training/copy_of_batches_being_processed_in_this_round'): + thread = threading.Thread(target=perform_file_operation, args=(f'training/copy_of_batches_being_processed_in_this_round/{file}', f'training/batches_to_process/{file}', 'copy')) threads.append(thread) thread.start() diff --git a/tokens.msgpack b/tokens.msgpack index 2883909..4dc24d2 100644 Binary files a/tokens.msgpack and b/tokens.msgpack differ