From 7c0978bef778073224a31ad74ed41b37a7a9f8a9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 22 Oct 2024 16:59:56 -0400 Subject: [PATCH 1/8] Refactor build.yaml workflow: Update Python setup and dependencies --- README.md | 8 +- simpler_whisper/whisper.py | 95 ++++++++++ src/whisper_wrapper.cpp | 345 +++++++++++++++++++++++++++++++++++-- test_simpler_whisper.py | 36 +++- 4 files changed, 456 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 51206c2..a18c115 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![Build and Test](https://img.shields.io/github/actions/workflow/status/locaal-ai/simpler-whisper/build.yaml) -A zero-dependency simple Python wrapper for [whisper.cpp](https://github.com/ggerganov/whisper.cpp), providing an easy-to-use interface for speech recognition using the Whisper model. +A zero-dependency simple Python wrapper for [whisper.cpp](https://github.com/ggerganov/whisper.cpp), providing an easy-to-use interface for speech recognition using the Whisper model. Why is it better than [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [pywhispercpp](https://github.com/abdeladim-s/pywhispercpp): - Zero-dependency: Everything is shipped with the built wheel, no Python dependency (on `av` or `ctranslate2` etc.) except for `numpy`. @@ -30,7 +30,7 @@ pip install simpler-whisper import simpler_whisper.whisper import numpy as np -# Load the model file. +# Load the model file. # It's on you to download one from https://huggingface.co/ggerganov/whisper.cpp model = simpler_whisper.whisper.load_model("path/to/model.bin") @@ -79,7 +79,7 @@ Simpler Whisper supports various build configurations to optimize for different ### Example: Building for Windows with CUDA acceleration ```powershell -$env:SIMPLER_WHISPER_ACCELERATION=cuda +$env:SIMPLER_WHISPER_ACCELERATION="cuda" pip install . ``` @@ -91,4 +91,4 @@ SIMPLER_WHISPER_PLATFORM=arm64 pip install . ## License -This project is licensed under the MIT License - see the LICENSE file for details. \ No newline at end of file +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 2defc93..605c365 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -21,10 +21,105 @@ def __del__(self): del self.model +class ThreadedWhisperModel: + def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000): + """ + Initialize a threaded Whisper model for continuous audio processing. + + Args: + model_path (str): Path to the Whisper model file + use_gpu (bool): Whether to use GPU acceleration + max_duration_sec (float): Maximum duration in seconds before finalizing a segment + sample_rate (int): Audio sample rate (default: 16000) + """ + self.model = _whisper_cpp.ThreadedWhisperModel( + model_path, use_gpu, max_duration_sec, sample_rate + ) + self._is_running = False + + def start(self, callback, result_check_interval_ms=100): + """ + Start the processing threads with a callback for results. 
+ + Args: + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (list): List of transcribed text segments + - is_partial (bool): Whether this is a partial result + result_check_interval_ms (int): How often to check for results + """ + if self._is_running: + return + + self.model.start(callback, result_check_interval_ms) + self._is_running = True + + def stop(self): + """ + Stop processing and clean up resources. + Any remaining audio will be processed as a final segment. + """ + if not self._is_running: + return + + self.model.stop() + self._is_running = False + + def queue_audio(self, audio): + """ + Queue audio for processing. + + Args: + audio: Audio samples as numpy array or array-like object. + Will be converted to float32. + + Returns: + chunk_id (int): Unique identifier for this audio chunk + """ + # Ensure audio is a numpy array of float32 + audio = np.array(audio, dtype=np.float32) + return self.model.queue_audio(audio) + + def set_max_duration(self, max_duration_sec, sample_rate=16000): + """ + Change the maximum duration for partial segments. + + Args: + max_duration_sec (float): New maximum duration in seconds + sample_rate (int): Audio sample rate (default: 16000) + """ + self.model.set_max_duration(max_duration_sec, sample_rate) + + def __del__(self): + # Ensure threads are stopped and resources cleaned up + if hasattr(self, "model"): + if self._is_running: + self.stop() + del self.model + + def load_model(model_path: str, use_gpu=False) -> WhisperModel: return WhisperModel(model_path, use_gpu) +def load_threaded_model( + model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 +) -> ThreadedWhisperModel: + """ + Load a threaded Whisper model for continuous audio processing. + + Args: + model_path (str): Path to the Whisper model file + use_gpu (bool): Whether to use GPU acceleration + max_duration_sec (float): Maximum duration in seconds before finalizing a segment + sample_rate (int): Audio sample rate (default: 16000) + + Returns: + ThreadedWhisperModel: A model instance ready for processing + """ + return ThreadedWhisperModel(model_path, use_gpu, max_duration_sec, sample_rate) + + def set_log_callback(callback): """ Set a custom logging callback function. 
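
Note: for orientation, a minimal usage sketch of the threaded API as it stands after this patch. The model path and audio chunks are placeholders, and the callback signature matches the docstring above at this point in the series (later patches in the series move the callback into the constructor and deliver a single concatenated string instead of a list of segments):

```python
import numpy as np
import simpler_whisper.whisper as whisper

def on_result(chunk_id, segments, is_partial):
    # At this stage of the series the callback receives a list of segment strings.
    kind = "partial" if is_partial else "final"
    print(f"[{kind}] chunk {chunk_id}: {' '.join(segments)}")

# "path/to/model.bin" is a placeholder; download a ggml model yourself
# (see the README section above).
model = whisper.load_threaded_model("path/to/model.bin", use_gpu=False, max_duration_sec=10.0)
model.start(on_result, result_check_interval_ms=100)

# Queue 1-second chunks of 16 kHz mono float32 audio (silence as a stand-in).
for _ in range(5):
    chunk_id = model.queue_audio(np.zeros(16000, dtype=np.float32))

model.stop()  # any remaining audio is processed as a final segment
```
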
diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 63506f8..4c8e374 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -2,9 +2,37 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include namespace py = pybind11; +// Global variable to store the Python callback function +py::function g_py_log_callback; + +// C++ callback function that will be passed to whisper_log_set +void cpp_log_callback(ggml_log_level level, const char *text, void *) +{ + if (!g_py_log_callback.is_none()) + { + g_py_log_callback(level, text); + } +} + +// Function to set the log callback +void set_log_callback(py::function callback) +{ + g_py_log_callback = callback; + whisper_log_set(cpp_log_callback, nullptr); + ggml_log_set(cpp_log_callback, nullptr); +} + +// Original synchronous implementation class WhisperModel { public: @@ -12,6 +40,7 @@ class WhisperModel { whisper_context_params ctx_params = whisper_context_default_params(); ctx_params.use_gpu = use_gpu; + std::cout << "WhisperModel c'tor Loading model from path: " << model_path << std::endl; ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { @@ -21,6 +50,7 @@ class WhisperModel ~WhisperModel() { + std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; if (ctx) { whisper_free(ctx); @@ -33,20 +63,37 @@ class WhisperModel float *audio_data = static_cast(audio_buffer.ptr); int n_samples = audio_buffer.size; - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + std::vector segments = transcribe_raw_audio(audio_data, n_samples); + + py::list result; + for (const auto &segment : segments) + { + result.append(segment); + } + return result; + } + + std::vector transcribe_raw_audio(const float *audio_data, int n_samples) + { + std::cout << "Transcribing audio with " << n_samples << " samples" << std::endl; + std::cout << "first sample: " << audio_data[0] << std::endl; + std::cout << "last sample: " << audio_data[n_samples - 1] << std::endl; + + whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { + std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } int n_segments = whisper_full_n_segments(ctx); - py::list transcription; + std::vector transcription; for (int i = 0; i < n_segments; i++) { const char *text = whisper_full_get_segment_text(ctx, i); - transcription.append(py::str(text)); + transcription.push_back(std::string(text)); } return transcription; @@ -56,32 +103,294 @@ class WhisperModel whisper_context *ctx; }; -// Global variable to store the Python callback function -py::function g_py_log_callback; +struct AudioChunk +{ + std::vector data; + size_t id; +}; -// C++ callback function that will be passed to whisper_log_set -void cpp_log_callback(ggml_log_level level, const char *text, void *) +struct TranscriptionResult { - if (!g_py_log_callback.is_none()) + size_t chunk_id; + std::vector segments; + bool is_partial; +}; + +class ThreadedWhisperModel +{ +public: + ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, + float max_duration_sec = 10.0f, int sample_rate = 16000) + : running(false), next_chunk_id(0), + max_samples(static_cast(max_duration_sec * sample_rate)), + accumulated_samples(0), current_chunk_id(0), model_path(model_path), + use_gpu(use_gpu) { - g_py_log_callback(level, text); } -} -// Function to set the log callback 
-void set_log_callback(py::function callback) -{ - g_py_log_callback = callback; - whisper_log_set(cpp_log_callback, nullptr); - ggml_log_set(cpp_log_callback, nullptr); -} + ~ThreadedWhisperModel() + { + stop(); + } + + void start(py::function callback, int result_check_interval_ms = 100) + { + if (running) + return; + + running = true; + result_callback = callback; + + process_thread = std::thread(&ThreadedWhisperModel::processThread, this); + result_thread = std::thread(&ThreadedWhisperModel::resultThread, this, + result_check_interval_ms); + } + + void stop() + { + if (!running) + return; + running = false; + + { + std::lock_guard lock(input_mutex); + input_cv.notify_one(); + } + + { + std::lock_guard lock(result_mutex); + result_cv.notify_one(); + } + + if (process_thread.joinable()) + process_thread.join(); + if (result_thread.joinable()) + result_thread.join(); + + // Clear accumulated buffer + { + std::lock_guard lock(buffer_mutex); + accumulated_buffer.clear(); + accumulated_samples = 0; + } + } + + size_t queueAudio(py::array_t audio) + { + auto buffer = audio.request(); + float *data = static_cast(buffer.ptr); + size_t n_samples = buffer.size; + + AudioChunk chunk; + chunk.data.assign(data, data + n_samples); + chunk.id = next_chunk_id++; + + { + std::lock_guard lock(input_mutex); + input_queue.push(std::move(chunk)); + input_cv.notify_one(); + } + + return chunk.id; + } + + void setMaxDuration(float max_duration_sec, int sample_rate = 16000) + { + max_samples = static_cast(max_duration_sec * sample_rate); + } + +private: + void processAccumulatedAudio(WhisperModel &model, bool force_final = false) + { + std::vector process_buffer; + size_t current_id; + + { + std::lock_guard lock(buffer_mutex); + std::cout << "Processing accumulated audio with size: " << accumulated_buffer.size() << std::endl; + if (accumulated_buffer.empty()) + return; + + process_buffer = accumulated_buffer; + current_id = current_chunk_id; + + // Only clear the buffer if we're processing a final result + if (force_final || accumulated_samples >= max_samples) + { + accumulated_buffer.clear(); + accumulated_samples = 0; + } + } + + // Process audio + std::cout << "Processing audio with size: " << process_buffer.size() << std::endl; + std::cout << "Pointer to first sample: " << process_buffer.data() << std::endl; + std::cout << "First sample: " << process_buffer[0] << std::endl; + std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + + TranscriptionResult result; + result.chunk_id = current_id; + result.segments = segments; + // Set partial flag based on whether this is a final result + result.is_partial = !(force_final || process_buffer.size() >= max_samples); + + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(std::move(result)); + result_cv.notify_one(); + } + } + + void processThread() + { + std::cout << "Starting process thread." << std::endl; + + std::cout << "Loading model from path: " << this->model_path << std::endl; + WhisperModel model(this->model_path, this->use_gpu); + std::cout << "Model loaded." << std::endl; + + while (running) + { + AudioChunk chunk; + bool has_chunk = false; + + // Get next chunk from input queue + { + std::unique_lock lock(input_mutex); + input_cv.wait(lock, [this] + { return !input_queue.empty() || !running; }); + + if (!running) + { + // Process any remaining audio as final before shutting down + std::cout << "Shutting down, processing remaining audio as final." 
<< std::endl; + processAccumulatedAudio(model, true); + break; + } + + if (!input_queue.empty()) + { + chunk = std::move(input_queue.front()); + input_queue.pop(); + has_chunk = true; + std::cout << "Got chunk with ID: " << chunk.id << " and size: " << chunk.data.size() << std::endl; + } + } + + if (has_chunk) + { + // Add to accumulated buffer + { + std::lock_guard lock(buffer_mutex); + size_t old_size = accumulated_buffer.size(); + accumulated_buffer.resize(old_size + chunk.data.size()); + std::copy(chunk.data.begin(), chunk.data.end(), + accumulated_buffer.begin() + old_size); + + accumulated_samples += chunk.data.size(); + current_chunk_id = chunk.id; + std::cout << "Accumulated buffer size: " << accumulated_buffer.size() << std::endl; + } + + // Process the accumulated audio + std::cout << "Processing accumulated audio." << std::endl; + processAccumulatedAudio(model, false); + } + } + std::cout << "Exiting process thread." << std::endl; + } + + void resultThread(int check_interval_ms) + { + while (running) + { + std::vector results; + + { + std::unique_lock lock(result_mutex); + result_cv.wait_for(lock, + std::chrono::milliseconds(check_interval_ms), + [this] + { return !result_queue.empty() || !running; }); + + if (!running && result_queue.empty()) + break; + + while (!result_queue.empty()) + { + results.push_back(std::move(result_queue.front())); + result_queue.pop(); + } + } + + if (!results.empty()) + { + py::gil_scoped_acquire gil; + for (const auto &result : results) + { + py::list segments; + for (const auto &segment : result.segments) + { + segments.append(segment); + } + result_callback(result.chunk_id, segments, result.is_partial); + } + } + } + } + + whisper_context *ctx; + std::atomic running; + std::atomic next_chunk_id; + size_t current_chunk_id; + + // Audio accumulation + std::vector accumulated_buffer; + size_t accumulated_samples; + size_t max_samples; + std::mutex buffer_mutex; + + std::thread process_thread; + std::thread result_thread; + + std::queue input_queue; + std::mutex input_mutex; + std::condition_variable input_cv; + + std::queue result_queue; + std::mutex result_mutex; + std::condition_variable result_cv; + + py::function result_callback; + + std::string model_path; + bool use_gpu; +}; PYBIND11_MODULE(_whisper_cpp, m) { + // Expose synchronous model py::class_(m, "WhisperModel") .def(py::init()) .def("transcribe", &WhisperModel::transcribe); + py::class_(m, "ThreadedWhisperModel") + .def(py::init(), + py::arg("model_path"), + py::arg("use_gpu") = false, + py::arg("max_duration_sec") = 10.0f, + py::arg("sample_rate") = 16000) + .def("start", &ThreadedWhisperModel::start, + py::arg("callback"), + py::arg("result_check_interval_ms") = 100) + .def("stop", &ThreadedWhisperModel::stop) + .def("queue_audio", &ThreadedWhisperModel::queueAudio) + .def("set_max_duration", &ThreadedWhisperModel::setMaxDuration, + py::arg("max_duration_sec"), + py::arg("sample_rate") = 16000); + + // Expose logging functionality m.def("set_log_callback", &set_log_callback, "Set the log callback function"); py::enum_(m, "LogLevel") @@ -89,4 +398,4 @@ PYBIND11_MODULE(_whisper_cpp, m) .value("WARN", GGML_LOG_LEVEL_WARN) .value("INFO", GGML_LOG_LEVEL_INFO) .export_values(); -} \ No newline at end of file +} diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 48667ec..384efc7 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -6,19 +6,17 @@ import numpy as np import time -from simpler_whisper.whisper import load_model, set_log_callback, LogLevel +from 
simpler_whisper.whisper import load_model, set_log_callback, LogLevel, ThreadedWhisperModel def my_log_callback(level, message): log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") +# Path to your Whisper model file +model_path = R"ggml-tiny.en-q5_1.bin" def test_simpler_whisper(): - # Path to your Whisper model file - # Replace this with the path to your actual model file - model_path = R"ggml-model-whisper-tiny.en.bin" - try: set_log_callback(my_log_callback) @@ -62,6 +60,32 @@ def test_simpler_whisper(): except Exception as e: print(f"An error occurred: {str(e)}") +def test_threaded_whisper(): + def handle_result(chunk_id, segments, is_partial): + print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") + for segment in segments: + print(f" {segment}") + + # Create model with 10-second max duration + model = ThreadedWhisperModel(model_path=model_path, use_gpu=True, + max_duration_sec=10.0) + + # Start processing with callback + print("Starting threaded Whisper model...") + model.start(callback=handle_result) + + for i in range(15): + print(f"Queueing audio chunk {i + 1}") + # Queue some audio (will get partial results until 10 seconds accumulate) + chunk_id = model.queue_audio(np.random.rand(16000).astype(np.float32)) + print(f" Queued chunk {i + 1} with ID {chunk_id}") + # sleep for 1 seconds + time.sleep(1) + + # When done + print("Stopping threaded Whisper model...") + model.stop() # Will process any remaining audio as final if __name__ == "__main__": - test_simpler_whisper() + # test_simpler_whisper() + test_threaded_whisper() From 963a6133f7056c4fd3c7472512d7c1b1452ea062 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 23 Oct 2024 01:47:41 -0400 Subject: [PATCH 2/8] Refactor build.yaml workflow: Update Python setup and dependencies --- simpler_whisper/whisper.py | 11 ++++++++--- src/whisper_wrapper.cpp | 27 +++++++++++++++++++++------ test_simpler_whisper.py | 21 +++++++++++++++------ 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 605c365..3b7ac07 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -1,4 +1,5 @@ import numpy as np +from typing import Callable, List from . import _whisper_cpp @@ -22,7 +23,9 @@ def __del__(self): class ThreadedWhisperModel: - def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000): + def __init__( + self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 + ): """ Initialize a threaded Whisper model for continuous audio processing. @@ -37,14 +40,16 @@ def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample ) self._is_running = False - def start(self, callback, result_check_interval_ms=100): + def start( + self, callback: Callable[[int, str, bool], None], result_check_interval_ms=100 + ): """ Start the processing threads with a callback for results. 
Args: callback: Function that takes three arguments: - chunk_id (int): Unique identifier for the audio chunk - - segments (list): List of transcribed text segments + - segments (str): Transcribed text for the audio chunk - is_partial (bool): Whether this is a partial result result_check_interval_ms (int): How often to check for results """ diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 4c8e374..e193274 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -44,15 +44,16 @@ class WhisperModel ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { + std::cout << "Failed to initialize whisper context" << std::endl; throw std::runtime_error("Failed to initialize whisper context"); } } ~WhisperModel() { - std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; if (ctx) { + std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; whisper_free(ctx); } } @@ -86,15 +87,16 @@ class WhisperModel std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } + std::cout << "Whisper inference succeeded" << std::endl; int n_segments = whisper_full_n_segments(ctx); std::vector transcription; - for (int i = 0; i < n_segments; i++) { const char *text = whisper_full_get_segment_text(ctx, i); transcription.push_back(std::string(text)); } + std::cout << "num segments: " << n_segments << std::endl; return transcription; } @@ -211,6 +213,13 @@ class ThreadedWhisperModel if (accumulated_buffer.empty()) return; + // check if buffer has less than 1 second of audio + if (accumulated_samples < 16000) + { + std::cout << "Not enough audio to process" << std::endl; + return; + } + process_buffer = accumulated_buffer; current_id = current_chunk_id; @@ -237,7 +246,7 @@ class ThreadedWhisperModel // Add result to output queue { std::lock_guard lock(result_mutex); - result_queue.push(std::move(result)); + result_queue.push(result); result_cv.notify_one(); } } @@ -326,15 +335,21 @@ class ThreadedWhisperModel if (!results.empty()) { + std::cout << "Got " << results.size() << " results." 
<< std::endl; py::gil_scoped_acquire gil; for (const auto &result : results) { - py::list segments; + // concatenate segments into a single string + std::string full_text; for (const auto &segment : result.segments) { - segments.append(segment); + full_text += segment; + } + std::cout << "Calling result callback with ID: " << result.chunk_id << std::endl; + if (result_callback) + { + result_callback(result.chunk_id, full_text, result.is_partial); } - result_callback(result.chunk_id, segments, result.is_partial); } } } diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 384efc7..96e5fb6 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -6,16 +6,23 @@ import numpy as np import time -from simpler_whisper.whisper import load_model, set_log_callback, LogLevel, ThreadedWhisperModel +from simpler_whisper.whisper import ( + load_model, + set_log_callback, + LogLevel, + ThreadedWhisperModel, +) def my_log_callback(level, message): log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") + # Path to your Whisper model file model_path = R"ggml-tiny.en-q5_1.bin" + def test_simpler_whisper(): try: set_log_callback(my_log_callback) @@ -60,15 +67,16 @@ def test_simpler_whisper(): except Exception as e: print(f"An error occurred: {str(e)}") + def test_threaded_whisper(): - def handle_result(chunk_id, segments, is_partial): + def handle_result(chunk_id, text, is_partial): print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") - for segment in segments: - print(f" {segment}") + print(f" {text}") # Create model with 10-second max duration - model = ThreadedWhisperModel(model_path=model_path, use_gpu=True, - max_duration_sec=10.0) + model = ThreadedWhisperModel( + model_path=model_path, use_gpu=True, max_duration_sec=10.0 + ) # Start processing with callback print("Starting threaded Whisper model...") @@ -86,6 +94,7 @@ def handle_result(chunk_id, segments, is_partial): print("Stopping threaded Whisper model...") model.stop() # Will process any remaining audio as final + if __name__ == "__main__": # test_simpler_whisper() test_threaded_whisper() From ac714b3caf00031af5247ccc183af46ccf74921a Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 23 Oct 2024 09:52:21 -0400 Subject: [PATCH 3/8] Refactor whisper_wrapper.cpp: Remove debug print statements and unused variables --- src/whisper_wrapper.cpp | 46 ++++++----------------------------------- test_simpler_whisper.py | 29 +++++++++++++++++++------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index e193274..8312c80 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -40,11 +40,9 @@ class WhisperModel { whisper_context_params ctx_params = whisper_context_default_params(); ctx_params.use_gpu = use_gpu; - std::cout << "WhisperModel c'tor Loading model from path: " << model_path << std::endl; ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { - std::cout << "Failed to initialize whisper context" << std::endl; throw std::runtime_error("Failed to initialize whisper context"); } } @@ -53,7 +51,6 @@ class WhisperModel { if (ctx) { - std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; whisper_free(ctx); } } @@ -77,17 +74,11 @@ class WhisperModel std::vector transcribe_raw_audio(const float *audio_data, int n_samples) { - std::cout << "Transcribing audio with " << 
n_samples << " samples" << std::endl; - std::cout << "first sample: " << audio_data[0] << std::endl; - std::cout << "last sample: " << audio_data[n_samples - 1] << std::endl; - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { - std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } - std::cout << "Whisper inference succeeded" << std::endl; int n_segments = whisper_full_n_segments(ctx); std::vector transcription; @@ -96,7 +87,6 @@ class WhisperModel const char *text = whisper_full_get_segment_text(ctx, i); transcription.push_back(std::string(text)); } - std::cout << "num segments: " << n_segments << std::endl; return transcription; } @@ -125,7 +115,7 @@ class ThreadedWhisperModel float max_duration_sec = 10.0f, int sample_rate = 16000) : running(false), next_chunk_id(0), max_samples(static_cast(max_duration_sec * sample_rate)), - accumulated_samples(0), current_chunk_id(0), model_path(model_path), + current_chunk_id(0), model_path(model_path), use_gpu(use_gpu) { } @@ -173,7 +163,6 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); accumulated_buffer.clear(); - accumulated_samples = 0; } } @@ -209,34 +198,24 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); - std::cout << "Processing accumulated audio with size: " << accumulated_buffer.size() << std::endl; - if (accumulated_buffer.empty()) + if (accumulated_buffer.empty() || accumulated_buffer.size() < 16000) return; - // check if buffer has less than 1 second of audio - if (accumulated_samples < 16000) - { - std::cout << "Not enough audio to process" << std::endl; - return; - } - process_buffer = accumulated_buffer; current_id = current_chunk_id; // Only clear the buffer if we're processing a final result - if (force_final || accumulated_samples >= max_samples) + if (force_final || accumulated_buffer.size() >= max_samples) { accumulated_buffer.clear(); - accumulated_samples = 0; } } // Process audio - std::cout << "Processing audio with size: " << process_buffer.size() << std::endl; - std::cout << "Pointer to first sample: " << process_buffer.data() << std::endl; - std::cout << "First sample: " << process_buffer[0] << std::endl; std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + std::cout << "Transcription: " << segments[0] << std::endl; + TranscriptionResult result; result.chunk_id = current_id; result.segments = segments; @@ -253,11 +232,7 @@ class ThreadedWhisperModel void processThread() { - std::cout << "Starting process thread." << std::endl; - - std::cout << "Loading model from path: " << this->model_path << std::endl; WhisperModel model(this->model_path, this->use_gpu); - std::cout << "Model loaded." << std::endl; while (running) { @@ -273,7 +248,6 @@ class ThreadedWhisperModel if (!running) { // Process any remaining audio as final before shutting down - std::cout << "Shutting down, processing remaining audio as final." 
<< std::endl; processAccumulatedAudio(model, true); break; } @@ -283,7 +257,6 @@ class ThreadedWhisperModel chunk = std::move(input_queue.front()); input_queue.pop(); has_chunk = true; - std::cout << "Got chunk with ID: " << chunk.id << " and size: " << chunk.data.size() << std::endl; } } @@ -297,17 +270,13 @@ class ThreadedWhisperModel std::copy(chunk.data.begin(), chunk.data.end(), accumulated_buffer.begin() + old_size); - accumulated_samples += chunk.data.size(); current_chunk_id = chunk.id; - std::cout << "Accumulated buffer size: " << accumulated_buffer.size() << std::endl; } // Process the accumulated audio - std::cout << "Processing accumulated audio." << std::endl; processAccumulatedAudio(model, false); } } - std::cout << "Exiting process thread." << std::endl; } void resultThread(int check_interval_ms) @@ -335,7 +304,6 @@ class ThreadedWhisperModel if (!results.empty()) { - std::cout << "Got " << results.size() << " results." << std::endl; py::gil_scoped_acquire gil; for (const auto &result : results) { @@ -345,10 +313,9 @@ class ThreadedWhisperModel { full_text += segment; } - std::cout << "Calling result callback with ID: " << result.chunk_id << std::endl; if (result_callback) { - result_callback(result.chunk_id, full_text, result.is_partial); + result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); } } } @@ -362,7 +329,6 @@ class ThreadedWhisperModel // Audio accumulation std::vector accumulated_buffer; - size_t accumulated_samples; size_t max_samples; std::mutex buffer_mutex; diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 96e5fb6..05b8027 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -5,6 +5,7 @@ import numpy as np import time +import resampy from simpler_whisper.whisper import ( load_model, @@ -69,7 +70,7 @@ def test_simpler_whisper(): def test_threaded_whisper(): - def handle_result(chunk_id, text, is_partial): + def handle_result(chunk_id: int, text: str, is_partial: bool): print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") print(f" {text}") @@ -78,17 +79,31 @@ def handle_result(chunk_id, text, is_partial): model_path=model_path, use_gpu=True, max_duration_sec=10.0 ) + # load audio from file with av + import av + container = av.open(R"C:\Users\roysh\Downloads\1847363777395929088.mp4") + audio = container.streams.audio[0] + print(audio) + frame_generator = container.decode(audio) + # Start processing with callback print("Starting threaded Whisper model...") model.start(callback=handle_result) - for i in range(15): - print(f"Queueing audio chunk {i + 1}") + for i, frame in enumerate(frame_generator): + # print(f"Queueing audio chunk {i + 1}") + # Read audio chunk + # resample to 16kHz + samples = resampy.resample(frame.to_ndarray().mean(axis=0), frame.rate, 16000) + # Queue some audio (will get partial results until 10 seconds accumulate) - chunk_id = model.queue_audio(np.random.rand(16000).astype(np.float32)) - print(f" Queued chunk {i + 1} with ID {chunk_id}") - # sleep for 1 seconds - time.sleep(1) + chunk_id = model.queue_audio(samples) + # print(f" Queued chunk {i + 1} with ID {chunk_id} size {len(samples)}") + # sleep for the size of the audio chunk + time.sleep(len(samples) / 16000) + + # close the container + container.close() # When done print("Stopping threaded Whisper model...") From fddebc49cadfa6a600315b796dbcc4914b7e86a7 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 24 Oct 2024 10:13:12 -0400 Subject: [PATCH 4/8] Refactor whisper_wrapper.cpp: Add trim function 
and handle exceptions in transcription and result callback --- src/whisper_wrapper.cpp | 45 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 8312c80..b4ded28 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -12,6 +12,17 @@ namespace py = pybind11; +std::string trim(const std::string &str) +{ + size_t start = str.find_first_not_of(" \t\n\r"); + size_t end = str.find_last_not_of(" \t\n\r"); + + if (start == std::string::npos) // handles empty string "" and all-whitespace strings like " " + return ""; + + return str.substr(start, end - start + 1); +} + // Global variable to store the Python callback function py::function g_py_log_callback; @@ -212,7 +223,19 @@ class ThreadedWhisperModel } // Process audio - std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + std::vector segments; + try + { + segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception during transcription" << std::endl; + } std::cout << "Transcription: " << segments[0] << std::endl; @@ -307,15 +330,33 @@ class ThreadedWhisperModel py::gil_scoped_acquire gil; for (const auto &result : results) { + if (result.segments.empty()) + continue; + // concatenate segments into a single string std::string full_text; for (const auto &segment : result.segments) { full_text += segment; } + full_text = trim(full_text); + if (full_text.empty()) + continue; + if (result_callback) { - result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + try + { + result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + } + catch (const std::exception &e) + { + std::cerr << "Exception in result callback: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception in result callback" << std::endl; + } } } } From 56d27b4717dfc4c0ae0af3ee0a9b2c08274a1f02 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 09:53:04 -0400 Subject: [PATCH 5/8] Refactor whisper_wrapper.cpp: Add trim function and handle exceptions in transcription and result callback --- simpler_whisper/whisper.py | 18 +++++++++++++----- src/whisper_wrapper.cpp | 8 ++++++-- test_simpler_whisper.py | 28 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 3b7ac07..a3a0cfe 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -24,7 +24,12 @@ def __del__(self): class ThreadedWhisperModel: def __init__( - self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 + self, + model_path: str, + callback: Callable[[int, str, bool], None], + use_gpu=False, + max_duration_sec=10.0, + sample_rate=16000, ): """ Initialize a threaded Whisper model for continuous audio processing. 
@@ -39,10 +44,13 @@ def __init__( model_path, use_gpu, max_duration_sec, sample_rate ) self._is_running = False + self.callback = callback - def start( - self, callback: Callable[[int, str, bool], None], result_check_interval_ms=100 - ): + def handle_result(self, chunk_id: int, text: str, is_partial: bool): + if self.callback is not None: + self.callback(chunk_id, text, is_partial) + + def start(self, result_check_interval_ms=100): """ Start the processing threads with a callback for results. @@ -56,7 +64,7 @@ def start( if self._is_running: return - self.model.start(callback, result_check_interval_ms) + self.model.start(self.handle_result, result_check_interval_ms) self._is_running = True def stop(self): diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index b4ded28..f27cc33 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -56,6 +56,7 @@ class WhisperModel { throw std::runtime_error("Failed to initialize whisper context"); } + params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); } ~WhisperModel() @@ -85,7 +86,6 @@ class WhisperModel std::vector transcribe_raw_audio(const float *audio_data, int n_samples) { - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { throw std::runtime_error("Whisper inference failed"); @@ -104,6 +104,7 @@ class WhisperModel private: whisper_context *ctx; + whisper_full_params params; }; struct AudioChunk @@ -237,7 +238,10 @@ class ThreadedWhisperModel std::cerr << "Unknown exception during transcription" << std::endl; } - std::cout << "Transcription: " << segments[0] << std::endl; + if (segments.empty()) + { + return; + } TranscriptionResult result; result.chunk_id = current_id; diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 05b8027..192d248 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -71,36 +71,46 @@ def test_simpler_whisper(): def test_threaded_whisper(): def handle_result(chunk_id: int, text: str, is_partial: bool): - print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") - print(f" {text}") + print( + f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" + ) # Create model with 10-second max duration model = ThreadedWhisperModel( - model_path=model_path, use_gpu=True, max_duration_sec=10.0 + model_path=model_path, + callback=handle_result, + use_gpu=True, + max_duration_sec=10.0, ) # load audio from file with av import av - container = av.open(R"C:\Users\roysh\Downloads\1847363777395929088.mp4") + + container = av.open( + R"local_path_to_audio_file" + ) audio = container.streams.audio[0] print(audio) frame_generator = container.decode(audio) # Start processing with callback print("Starting threaded Whisper model...") - model.start(callback=handle_result) + model.start() for i, frame in enumerate(frame_generator): - # print(f"Queueing audio chunk {i + 1}") # Read audio chunk + incoming_audio = frame.to_ndarray().mean(axis=0) + incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] # resample to 16kHz - samples = resampy.resample(frame.to_ndarray().mean(axis=0), frame.rate, 16000) + samples = resampy.resample(incoming_audio, frame.rate, 16000) # Queue some audio (will get partial results until 10 seconds accumulate) chunk_id = model.queue_audio(samples) - # print(f" Queued chunk {i + 1} with ID {chunk_id} size {len(samples)}") # sleep for the size of the audio chunk - time.sleep(len(samples) / 16000) + try: + 
time.sleep(len(samples) / 16000) + except: + break # close the container container.close() From 33e2a3c92daf4366cfb9f0c9c17acbb9ba7b7a48 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 12:39:58 -0400 Subject: [PATCH 6/8] Refactor setup.py: Add extension suffix and create extension directory --- setup.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 67c07d6..f032daf 100644 --- a/setup.py +++ b/setup.py @@ -23,14 +23,22 @@ def run(self): self.build_extension(ext) def build_extension(self, ext): + # This is the critical change - we need to get the proper extension suffix + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + + # Get the full path where the extension should be placed extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - + + # Ensure the extension directory exists + os.makedirs(extdir, exist_ok=True) + # Get acceleration and platform from environment variables acceleration = os.environ.get('SIMPLER_WHISPER_ACCELERATION', 'cpu') target_platform = os.environ.get('SIMPLER_WHISPER_PLATFORM', platform.machine()) - + cmake_args = [ f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}', + f'-DPYTHON_EXTENSION_SUFFIX={ext_suffix}', # Pass the extension suffix to CMake f'-DACCELERATION={acceleration}', ] @@ -38,8 +46,12 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS - cmake_args += [f'-DCMAKE_OSX_ARCHITECTURES={target_platform}'] - # add MACOS_ARCH env variable to specify the target platform + cmake_args += [ + f'-DCMAKE_OSX_ARCHITECTURES={target_platform}', + '-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON', + '-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON', + f'-DCMAKE_INSTALL_NAME_DIR=@rpath' + ] env["MACOS_ARCH"] = target_platform cfg = 'Debug' if self.debug else 'Release' @@ -55,13 +67,14 @@ def build_extension(self, ext): build_args += ['--', '-j2'] env['CXXFLAGS'] = f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' - + if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) - + print("CMake args:", cmake_args) print("Build args:", build_args) - + print(f"Extension will be built in: {extdir}") + subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) @@ -75,4 +88,5 @@ def build_extension(self, ext): ext_modules=[CMakeExtension('simpler_whisper._whisper_cpp')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, -) \ No newline at end of file + packages=['simpler_whisper'], # Add this line to ensure the package directory is created +) From e771682a8e218509fbd813bf6b7982b00c6fac0f Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 14:09:08 -0400 Subject: [PATCH 7/8] Refactor version numbers in pyproject.toml and setup.py --- pyproject.toml | 4 ++-- setup.py | 2 +- src/whisper_wrapper.cpp | 27 ++++++++++++++-------- test_simpler_whisper.py | 51 ++++++++++++++++++++++++++++------------- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 61cfc82..0baf19e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "simpler-whisper" -version = "0.1.0" +version = "0.2.0" authors = [ {name = "Roy Shilkrot", email = "roy.shil@gmail.com"}, ] @@ -33,4 +33,4 @@ dependencies = [ packages = ["simpler_whisper"] [tool.setuptools.package-data] 
-simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] \ No newline at end of file +simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] diff --git a/setup.py b/setup.py index f032daf..143d780 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ def build_extension(self, ext): setup( name='simpler-whisper', - version='0.1.0', + version='0.2.0', author='Roy Shilkrot', author_email='roy.shil@gmail.com', description='A simple Python wrapper for whisper.cpp', diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index f27cc33..fc73ef0 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -29,9 +29,10 @@ py::function g_py_log_callback; // C++ callback function that will be passed to whisper_log_set void cpp_log_callback(ggml_log_level level, const char *text, void *) { - if (!g_py_log_callback.is_none()) + if (!g_py_log_callback.is_none() && text != nullptr && strlen(text) > 0) { - g_py_log_callback(level, text); + py::gil_scoped_acquire gil; + g_py_log_callback(level, std::string(text)); } } @@ -263,7 +264,7 @@ class ThreadedWhisperModel while (running) { - AudioChunk chunk; + AudioChunk all_chunks; bool has_chunk = false; // Get next chunk from input queue @@ -279,10 +280,13 @@ class ThreadedWhisperModel break; } - if (!input_queue.empty()) + // take all chunks from the queue and create a single chunk + while (!input_queue.empty()) { - chunk = std::move(input_queue.front()); + AudioChunk chunk = std::move(input_queue.front()); input_queue.pop(); + all_chunks.data.insert(all_chunks.data.end(), chunk.data.begin(), chunk.data.end()); + all_chunks.id = chunk.id; has_chunk = true; } } @@ -293,11 +297,11 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); size_t old_size = accumulated_buffer.size(); - accumulated_buffer.resize(old_size + chunk.data.size()); - std::copy(chunk.data.begin(), chunk.data.end(), + accumulated_buffer.resize(old_size + all_chunks.data.size()); + std::copy(all_chunks.data.begin(), all_chunks.data.end(), accumulated_buffer.begin() + old_size); - current_chunk_id = chunk.id; + current_chunk_id = all_chunks.id; } // Process the accumulated audio @@ -420,8 +424,11 @@ PYBIND11_MODULE(_whisper_cpp, m) m.def("set_log_callback", &set_log_callback, "Set the log callback function"); py::enum_(m, "LogLevel") - .value("ERROR", GGML_LOG_LEVEL_ERROR) - .value("WARN", GGML_LOG_LEVEL_WARN) + .value("NONE", GGML_LOG_LEVEL_NONE) .value("INFO", GGML_LOG_LEVEL_INFO) + .value("WARN", GGML_LOG_LEVEL_WARN) + .value("ERROR", GGML_LOG_LEVEL_ERROR) + .value("DEBUG", GGML_LOG_LEVEL_DEBUG) + .value("CONT", GGML_LOG_LEVEL_CONT) .export_values(); } diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 192d248..3e60637 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -1,3 +1,4 @@ +import argparse import sys # Remove the current directory from sys.path to avoid conflicts with the installed package @@ -15,13 +16,23 @@ ) +log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} + + def my_log_callback(level, message): - log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} - print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") + if message is not None and len(message.strip()) > 0: + print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") # Path to your Whisper model file -model_path = R"ggml-tiny.en-q5_1.bin" +# Parse command-line arguments +parser = argparse.ArgumentParser(description="Test simpler-whisper model.") 
+parser.add_argument("model_path", type=str, help="Path to the Whisper model file") +parser.add_argument("audio_file", type=str, help="Path to the audio file") +args = parser.parse_args() + +model_path = args.model_path +audio_file = args.audio_file def test_simpler_whisper(): @@ -70,6 +81,8 @@ def test_simpler_whisper(): def test_threaded_whisper(): + set_log_callback(my_log_callback) + def handle_result(chunk_id: int, text: str, is_partial: bool): print( f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" @@ -86,9 +99,7 @@ def handle_result(chunk_id: int, text: str, is_partial: bool): # load audio from file with av import av - container = av.open( - R"local_path_to_audio_file" - ) + container = av.open(audio_file) audio = container.streams.audio[0] print(audio) frame_generator = container.decode(audio) @@ -98,17 +109,25 @@ def handle_result(chunk_id: int, text: str, is_partial: bool): model.start() for i, frame in enumerate(frame_generator): - # Read audio chunk - incoming_audio = frame.to_ndarray().mean(axis=0) - incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] - # resample to 16kHz - samples = resampy.resample(incoming_audio, frame.rate, 16000) - - # Queue some audio (will get partial results until 10 seconds accumulate) - chunk_id = model.queue_audio(samples) - # sleep for the size of the audio chunk try: - time.sleep(len(samples) / 16000) + # Read audio chunk + incoming_audio = frame.to_ndarray() + # check if stereo + if incoming_audio.shape[0] == 2: + incoming_audio = incoming_audio.mean(axis=0) + # check if the type is int16 or float32 + if incoming_audio.dtype == np.int16: + incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] + # resample to 16kHz if needed + if frame.rate != 16000: + samples = resampy.resample(incoming_audio, frame.rate, 16000) + else: + samples = incoming_audio + + # Queue some audio (will get partial results until 10 seconds accumulate) + chunk_id = model.queue_audio(samples) + # sleep for the size of the audio chunk + time.sleep(float(len(samples)) / float(16000)) except: break From 96f019da60602e51f10536ccd7e6b60af202f95d Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 14:23:30 -0400 Subject: [PATCH 8/8] Refactor build.yaml, CMakeLists.txt, and setup.py --- .github/workflows/build.yaml | 16 +++---- CMakeLists.txt | 4 +- setup.py | 85 ++++++++++++++++++++++-------------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index cbe7286..87336bc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,17 +74,18 @@ jobs: env: SIMPLER_WHISPER_ACCELERATION: ${{ matrix.acceleration }} SIMPLER_WHISPER_PLATFORM: ${{ matrix.platform }} + SIMPLER_WHISPER_PYTHON_VERSION: ${{ matrix.python-version }} run: | python setup.py build_ext --inplace python -m build --wheel - + - name: Install built wheel Non-Windows if: startsWith(matrix.os, 'windows') == false run: | pip install dist/*.whl - + - name: Install built wheel Windows - if: startsWith(matrix.os, 'windows') == true + if: startsWith(matrix.os, 'windows') == true shell: pwsh run: | $wheelFile = Get-ChildItem dist/*.whl | Select-Object -First 1 @@ -104,15 +105,15 @@ jobs: run: | import os import glob - + wheel_file = glob.glob('dist/*.whl')[0] base_name = os.path.basename(wheel_file) name_parts = base_name.split('-') - + # Insert acceleration and platform before the last part (which is like 'any.whl') new_name_parts = name_parts[:-1] + ['${{ 
matrix.acceleration }}', '${{ matrix.platform }}'] + [name_parts[-1]] new_name = '-'.join(new_name_parts) - + new_path = os.path.join('dist', new_name) os.rename(wheel_file, new_path) print(f"Renamed {base_name} to {new_name}") @@ -125,10 +126,9 @@ jobs: $wheelName += "-${{ matrix.acceleration }}" } echo "WHEEL_NAME=$wheelName" >> $env:GITHUB_ENV - + - name: Upload wheel uses: actions/upload-artifact@v4 with: name: ${{ env.WHEEL_NAME }} path: dist/*.whl - \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fb5f0d..48a9fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Find Python -find_package(Python COMPONENTS Interpreter Development NumPy REQUIRED) +find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development NumPy REQUIRED) # Fetch pybind11 include(FetchContent) @@ -40,4 +40,4 @@ if(WIN32 OR APPLE) $ ) endforeach() -endif() \ No newline at end of file +endif() diff --git a/setup.py b/setup.py index 143d780..a2eb7f5 100644 --- a/setup.py +++ b/setup.py @@ -6,25 +6,29 @@ import platform import sysconfig + class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): + def __init__(self, name, sourcedir=""): Extension.__init__(self, name, sources=[]) self.sourcedir = os.path.abspath(sourcedir) + class CMakeBuild(build_ext): def run(self): try: - out = subprocess.check_output(['cmake', '--version']) + out = subprocess.check_output(["cmake", "--version"]) except OSError: - raise RuntimeError("CMake must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) + raise RuntimeError( + "CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions) + ) for ext in self.extensions: self.build_extension(ext) def build_extension(self, ext): # This is the critical change - we need to get the proper extension suffix - ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") # Get the full path where the extension should be placed extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) @@ -33,13 +37,18 @@ def build_extension(self, ext): os.makedirs(extdir, exist_ok=True) # Get acceleration and platform from environment variables - acceleration = os.environ.get('SIMPLER_WHISPER_ACCELERATION', 'cpu') - target_platform = os.environ.get('SIMPLER_WHISPER_PLATFORM', platform.machine()) + acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "cpu") + target_platform = os.environ.get("SIMPLER_WHISPER_PLATFORM", platform.machine()) + python_version = os.environ.get( + "SIMPLER_WHISPER_PYTHON_VERSION", + f"{sys.version_info.major}.{sys.version_info.minor}", + ) cmake_args = [ - f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}', - f'-DPYTHON_EXTENSION_SUFFIX={ext_suffix}', # Pass the extension suffix to CMake - f'-DACCELERATION={acceleration}', + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPYTHON_EXTENSION_SUFFIX={ext_suffix}", # Pass the extension suffix to CMake + f"-DACCELERATION={acceleration}", + f"-DPYTHON_VERSION={python_version}", ] env = os.environ.copy() @@ -47,26 +56,28 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS cmake_args += [ - f'-DCMAKE_OSX_ARCHITECTURES={target_platform}', - '-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON', - '-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON', - f'-DCMAKE_INSTALL_NAME_DIR=@rpath' + 
f"-DCMAKE_OSX_ARCHITECTURES={target_platform}", + "-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON", + "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON", + f"-DCMAKE_INSTALL_NAME_DIR=@rpath", ] env["MACOS_ARCH"] = target_platform - cfg = 'Debug' if self.debug else 'Release' - build_args = ['--config', cfg] + cfg = "Debug" if self.debug else "Release" + build_args = ["--config", cfg] if platform.system() == "Windows": - cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}'] + cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] if sys.maxsize > 2**32: - cmake_args += ['-A', 'x64'] - build_args += ['--', '/m'] + cmake_args += ["-A", "x64"] + build_args += ["--", "/m"] else: - cmake_args += [f'-DCMAKE_BUILD_TYPE={cfg}'] - build_args += ['--', '-j2'] + cmake_args += [f"-DCMAKE_BUILD_TYPE={cfg}"] + build_args += ["--", "-j2"] - env['CXXFLAGS'] = f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' + env["CXXFLAGS"] = ( + f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' + ) if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) @@ -74,19 +85,29 @@ def build_extension(self, ext): print("CMake args:", cmake_args) print("Build args:", build_args) print(f"Extension will be built in: {extdir}") + print( + f"Building for Python {python_version} on {target_platform} with acceleration: {acceleration}" + ) + + subprocess.check_call( + ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env + ) + subprocess.check_call( + ["cmake", "--build", "."] + build_args, cwd=self.build_temp + ) - subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) setup( - name='simpler-whisper', - version='0.2.0', - author='Roy Shilkrot', - author_email='roy.shil@gmail.com', - description='A simple Python wrapper for whisper.cpp', - long_description='', - ext_modules=[CMakeExtension('simpler_whisper._whisper_cpp')], + name="simpler-whisper", + version="0.2.0", + author="Roy Shilkrot", + author_email="roy.shil@gmail.com", + description="A simple Python wrapper for whisper.cpp", + long_description="", + ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, - packages=['simpler_whisper'], # Add this line to ensure the package directory is created + packages=[ + "simpler_whisper" + ], # Add this line to ensure the package directory is created )