From 7c0978bef778073224a31ad74ed41b37a7a9f8a9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 22 Oct 2024 16:59:56 -0400 Subject: [PATCH 1/8] Refactor build.yaml workflow: Update Python setup and dependencies --- README.md | 8 +- simpler_whisper/whisper.py | 95 ++++++++++ src/whisper_wrapper.cpp | 345 +++++++++++++++++++++++++++++++++++-- test_simpler_whisper.py | 36 +++- 4 files changed, 456 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 51206c2..a18c115 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![Build and Test](https://img.shields.io/github/actions/workflow/status/locaal-ai/simpler-whisper/build.yaml) -A zero-dependency simple Python wrapper for [whisper.cpp](https://github.com/ggerganov/whisper.cpp), providing an easy-to-use interface for speech recognition using the Whisper model. +A zero-dependency simple Python wrapper for [whisper.cpp](https://github.com/ggerganov/whisper.cpp), providing an easy-to-use interface for speech recognition using the Whisper model. Why is it better than [faster-whisper](https://github.com/SYSTRAN/faster-whisper) and [pywhispercpp](https://github.com/abdeladim-s/pywhispercpp): - Zero-dependency: Everything is shipped with the built wheel, no Python dependency (on `av` or `ctranslate2` etc.) except for `numpy`. @@ -30,7 +30,7 @@ pip install simpler-whisper import simpler_whisper.whisper import numpy as np -# Load the model file. +# Load the model file. # It's on you to download one from https://huggingface.co/ggerganov/whisper.cpp model = simpler_whisper.whisper.load_model("path/to/model.bin") @@ -79,7 +79,7 @@ Simpler Whisper supports various build configurations to optimize for different ### Example: Building for Windows with CUDA acceleration ```powershell -$env:SIMPLER_WHISPER_ACCELERATION=cuda +$env:SIMPLER_WHISPER_ACCELERATION="cuda" pip install . ``` @@ -91,4 +91,4 @@ SIMPLER_WHISPER_PLATFORM=arm64 pip install . ## License -This project is licensed under the MIT License - see the LICENSE file for details. \ No newline at end of file +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 2defc93..605c365 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -21,10 +21,105 @@ def __del__(self): del self.model +class ThreadedWhisperModel: + def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000): + """ + Initialize a threaded Whisper model for continuous audio processing. + + Args: + model_path (str): Path to the Whisper model file + use_gpu (bool): Whether to use GPU acceleration + max_duration_sec (float): Maximum duration in seconds before finalizing a segment + sample_rate (int): Audio sample rate (default: 16000) + """ + self.model = _whisper_cpp.ThreadedWhisperModel( + model_path, use_gpu, max_duration_sec, sample_rate + ) + self._is_running = False + + def start(self, callback, result_check_interval_ms=100): + """ + Start the processing threads with a callback for results. 
+ + Args: + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (list): List of transcribed text segments + - is_partial (bool): Whether this is a partial result + result_check_interval_ms (int): How often to check for results + """ + if self._is_running: + return + + self.model.start(callback, result_check_interval_ms) + self._is_running = True + + def stop(self): + """ + Stop processing and clean up resources. + Any remaining audio will be processed as a final segment. + """ + if not self._is_running: + return + + self.model.stop() + self._is_running = False + + def queue_audio(self, audio): + """ + Queue audio for processing. + + Args: + audio: Audio samples as numpy array or array-like object. + Will be converted to float32. + + Returns: + chunk_id (int): Unique identifier for this audio chunk + """ + # Ensure audio is a numpy array of float32 + audio = np.array(audio, dtype=np.float32) + return self.model.queue_audio(audio) + + def set_max_duration(self, max_duration_sec, sample_rate=16000): + """ + Change the maximum duration for partial segments. + + Args: + max_duration_sec (float): New maximum duration in seconds + sample_rate (int): Audio sample rate (default: 16000) + """ + self.model.set_max_duration(max_duration_sec, sample_rate) + + def __del__(self): + # Ensure threads are stopped and resources cleaned up + if hasattr(self, "model"): + if self._is_running: + self.stop() + del self.model + + def load_model(model_path: str, use_gpu=False) -> WhisperModel: return WhisperModel(model_path, use_gpu) +def load_threaded_model( + model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 +) -> ThreadedWhisperModel: + """ + Load a threaded Whisper model for continuous audio processing. + + Args: + model_path (str): Path to the Whisper model file + use_gpu (bool): Whether to use GPU acceleration + max_duration_sec (float): Maximum duration in seconds before finalizing a segment + sample_rate (int): Audio sample rate (default: 16000) + + Returns: + ThreadedWhisperModel: A model instance ready for processing + """ + return ThreadedWhisperModel(model_path, use_gpu, max_duration_sec, sample_rate) + + def set_log_callback(callback): """ Set a custom logging callback function. 
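
Note: for orientation, a minimal usage sketch of the threaded API as it stands after this patch. The model path and audio chunks are placeholders, and the callback signature matches the docstring above at this point in the series (later patches in the series move the callback into the constructor and deliver a single concatenated string instead of a list of segments):

```python
import numpy as np
import simpler_whisper.whisper as whisper

def on_result(chunk_id, segments, is_partial):
    # At this stage of the series the callback receives a list of segment strings.
    kind = "partial" if is_partial else "final"
    print(f"[{kind}] chunk {chunk_id}: {' '.join(segments)}")

# "path/to/model.bin" is a placeholder; download a ggml model yourself
# (see the README section above).
model = whisper.load_threaded_model("path/to/model.bin", use_gpu=False, max_duration_sec=10.0)
model.start(on_result, result_check_interval_ms=100)

# Queue 1-second chunks of 16 kHz mono float32 audio (silence as a stand-in).
for _ in range(5):
    chunk_id = model.queue_audio(np.zeros(16000, dtype=np.float32))

model.stop()  # any remaining audio is processed as a final segment
```
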
diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 63506f8..4c8e374 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -2,9 +2,37 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include namespace py = pybind11; +// Global variable to store the Python callback function +py::function g_py_log_callback; + +// C++ callback function that will be passed to whisper_log_set +void cpp_log_callback(ggml_log_level level, const char *text, void *) +{ + if (!g_py_log_callback.is_none()) + { + g_py_log_callback(level, text); + } +} + +// Function to set the log callback +void set_log_callback(py::function callback) +{ + g_py_log_callback = callback; + whisper_log_set(cpp_log_callback, nullptr); + ggml_log_set(cpp_log_callback, nullptr); +} + +// Original synchronous implementation class WhisperModel { public: @@ -12,6 +40,7 @@ class WhisperModel { whisper_context_params ctx_params = whisper_context_default_params(); ctx_params.use_gpu = use_gpu; + std::cout << "WhisperModel c'tor Loading model from path: " << model_path << std::endl; ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { @@ -21,6 +50,7 @@ class WhisperModel ~WhisperModel() { + std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; if (ctx) { whisper_free(ctx); @@ -33,20 +63,37 @@ class WhisperModel float *audio_data = static_cast(audio_buffer.ptr); int n_samples = audio_buffer.size; - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + std::vector segments = transcribe_raw_audio(audio_data, n_samples); + + py::list result; + for (const auto &segment : segments) + { + result.append(segment); + } + return result; + } + + std::vector transcribe_raw_audio(const float *audio_data, int n_samples) + { + std::cout << "Transcribing audio with " << n_samples << " samples" << std::endl; + std::cout << "first sample: " << audio_data[0] << std::endl; + std::cout << "last sample: " << audio_data[n_samples - 1] << std::endl; + + whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { + std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } int n_segments = whisper_full_n_segments(ctx); - py::list transcription; + std::vector transcription; for (int i = 0; i < n_segments; i++) { const char *text = whisper_full_get_segment_text(ctx, i); - transcription.append(py::str(text)); + transcription.push_back(std::string(text)); } return transcription; @@ -56,32 +103,294 @@ class WhisperModel whisper_context *ctx; }; -// Global variable to store the Python callback function -py::function g_py_log_callback; +struct AudioChunk +{ + std::vector data; + size_t id; +}; -// C++ callback function that will be passed to whisper_log_set -void cpp_log_callback(ggml_log_level level, const char *text, void *) +struct TranscriptionResult { - if (!g_py_log_callback.is_none()) + size_t chunk_id; + std::vector segments; + bool is_partial; +}; + +class ThreadedWhisperModel +{ +public: + ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, + float max_duration_sec = 10.0f, int sample_rate = 16000) + : running(false), next_chunk_id(0), + max_samples(static_cast(max_duration_sec * sample_rate)), + accumulated_samples(0), current_chunk_id(0), model_path(model_path), + use_gpu(use_gpu) { - g_py_log_callback(level, text); } -} -// Function to set the log callback 
-void set_log_callback(py::function callback) -{ - g_py_log_callback = callback; - whisper_log_set(cpp_log_callback, nullptr); - ggml_log_set(cpp_log_callback, nullptr); -} + ~ThreadedWhisperModel() + { + stop(); + } + + void start(py::function callback, int result_check_interval_ms = 100) + { + if (running) + return; + + running = true; + result_callback = callback; + + process_thread = std::thread(&ThreadedWhisperModel::processThread, this); + result_thread = std::thread(&ThreadedWhisperModel::resultThread, this, + result_check_interval_ms); + } + + void stop() + { + if (!running) + return; + running = false; + + { + std::lock_guard lock(input_mutex); + input_cv.notify_one(); + } + + { + std::lock_guard lock(result_mutex); + result_cv.notify_one(); + } + + if (process_thread.joinable()) + process_thread.join(); + if (result_thread.joinable()) + result_thread.join(); + + // Clear accumulated buffer + { + std::lock_guard lock(buffer_mutex); + accumulated_buffer.clear(); + accumulated_samples = 0; + } + } + + size_t queueAudio(py::array_t audio) + { + auto buffer = audio.request(); + float *data = static_cast(buffer.ptr); + size_t n_samples = buffer.size; + + AudioChunk chunk; + chunk.data.assign(data, data + n_samples); + chunk.id = next_chunk_id++; + + { + std::lock_guard lock(input_mutex); + input_queue.push(std::move(chunk)); + input_cv.notify_one(); + } + + return chunk.id; + } + + void setMaxDuration(float max_duration_sec, int sample_rate = 16000) + { + max_samples = static_cast(max_duration_sec * sample_rate); + } + +private: + void processAccumulatedAudio(WhisperModel &model, bool force_final = false) + { + std::vector process_buffer; + size_t current_id; + + { + std::lock_guard lock(buffer_mutex); + std::cout << "Processing accumulated audio with size: " << accumulated_buffer.size() << std::endl; + if (accumulated_buffer.empty()) + return; + + process_buffer = accumulated_buffer; + current_id = current_chunk_id; + + // Only clear the buffer if we're processing a final result + if (force_final || accumulated_samples >= max_samples) + { + accumulated_buffer.clear(); + accumulated_samples = 0; + } + } + + // Process audio + std::cout << "Processing audio with size: " << process_buffer.size() << std::endl; + std::cout << "Pointer to first sample: " << process_buffer.data() << std::endl; + std::cout << "First sample: " << process_buffer[0] << std::endl; + std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + + TranscriptionResult result; + result.chunk_id = current_id; + result.segments = segments; + // Set partial flag based on whether this is a final result + result.is_partial = !(force_final || process_buffer.size() >= max_samples); + + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(std::move(result)); + result_cv.notify_one(); + } + } + + void processThread() + { + std::cout << "Starting process thread." << std::endl; + + std::cout << "Loading model from path: " << this->model_path << std::endl; + WhisperModel model(this->model_path, this->use_gpu); + std::cout << "Model loaded." << std::endl; + + while (running) + { + AudioChunk chunk; + bool has_chunk = false; + + // Get next chunk from input queue + { + std::unique_lock lock(input_mutex); + input_cv.wait(lock, [this] + { return !input_queue.empty() || !running; }); + + if (!running) + { + // Process any remaining audio as final before shutting down + std::cout << "Shutting down, processing remaining audio as final." 
<< std::endl; + processAccumulatedAudio(model, true); + break; + } + + if (!input_queue.empty()) + { + chunk = std::move(input_queue.front()); + input_queue.pop(); + has_chunk = true; + std::cout << "Got chunk with ID: " << chunk.id << " and size: " << chunk.data.size() << std::endl; + } + } + + if (has_chunk) + { + // Add to accumulated buffer + { + std::lock_guard lock(buffer_mutex); + size_t old_size = accumulated_buffer.size(); + accumulated_buffer.resize(old_size + chunk.data.size()); + std::copy(chunk.data.begin(), chunk.data.end(), + accumulated_buffer.begin() + old_size); + + accumulated_samples += chunk.data.size(); + current_chunk_id = chunk.id; + std::cout << "Accumulated buffer size: " << accumulated_buffer.size() << std::endl; + } + + // Process the accumulated audio + std::cout << "Processing accumulated audio." << std::endl; + processAccumulatedAudio(model, false); + } + } + std::cout << "Exiting process thread." << std::endl; + } + + void resultThread(int check_interval_ms) + { + while (running) + { + std::vector results; + + { + std::unique_lock lock(result_mutex); + result_cv.wait_for(lock, + std::chrono::milliseconds(check_interval_ms), + [this] + { return !result_queue.empty() || !running; }); + + if (!running && result_queue.empty()) + break; + + while (!result_queue.empty()) + { + results.push_back(std::move(result_queue.front())); + result_queue.pop(); + } + } + + if (!results.empty()) + { + py::gil_scoped_acquire gil; + for (const auto &result : results) + { + py::list segments; + for (const auto &segment : result.segments) + { + segments.append(segment); + } + result_callback(result.chunk_id, segments, result.is_partial); + } + } + } + } + + whisper_context *ctx; + std::atomic running; + std::atomic next_chunk_id; + size_t current_chunk_id; + + // Audio accumulation + std::vector accumulated_buffer; + size_t accumulated_samples; + size_t max_samples; + std::mutex buffer_mutex; + + std::thread process_thread; + std::thread result_thread; + + std::queue input_queue; + std::mutex input_mutex; + std::condition_variable input_cv; + + std::queue result_queue; + std::mutex result_mutex; + std::condition_variable result_cv; + + py::function result_callback; + + std::string model_path; + bool use_gpu; +}; PYBIND11_MODULE(_whisper_cpp, m) { + // Expose synchronous model py::class_(m, "WhisperModel") .def(py::init()) .def("transcribe", &WhisperModel::transcribe); + py::class_(m, "ThreadedWhisperModel") + .def(py::init(), + py::arg("model_path"), + py::arg("use_gpu") = false, + py::arg("max_duration_sec") = 10.0f, + py::arg("sample_rate") = 16000) + .def("start", &ThreadedWhisperModel::start, + py::arg("callback"), + py::arg("result_check_interval_ms") = 100) + .def("stop", &ThreadedWhisperModel::stop) + .def("queue_audio", &ThreadedWhisperModel::queueAudio) + .def("set_max_duration", &ThreadedWhisperModel::setMaxDuration, + py::arg("max_duration_sec"), + py::arg("sample_rate") = 16000); + + // Expose logging functionality m.def("set_log_callback", &set_log_callback, "Set the log callback function"); py::enum_(m, "LogLevel") @@ -89,4 +398,4 @@ PYBIND11_MODULE(_whisper_cpp, m) .value("WARN", GGML_LOG_LEVEL_WARN) .value("INFO", GGML_LOG_LEVEL_INFO) .export_values(); -} \ No newline at end of file +} diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 48667ec..384efc7 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -6,19 +6,17 @@ import numpy as np import time -from simpler_whisper.whisper import load_model, set_log_callback, LogLevel +from 
simpler_whisper.whisper import load_model, set_log_callback, LogLevel, ThreadedWhisperModel def my_log_callback(level, message): log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") +# Path to your Whisper model file +model_path = R"ggml-tiny.en-q5_1.bin" def test_simpler_whisper(): - # Path to your Whisper model file - # Replace this with the path to your actual model file - model_path = R"ggml-model-whisper-tiny.en.bin" - try: set_log_callback(my_log_callback) @@ -62,6 +60,32 @@ def test_simpler_whisper(): except Exception as e: print(f"An error occurred: {str(e)}") +def test_threaded_whisper(): + def handle_result(chunk_id, segments, is_partial): + print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") + for segment in segments: + print(f" {segment}") + + # Create model with 10-second max duration + model = ThreadedWhisperModel(model_path=model_path, use_gpu=True, + max_duration_sec=10.0) + + # Start processing with callback + print("Starting threaded Whisper model...") + model.start(callback=handle_result) + + for i in range(15): + print(f"Queueing audio chunk {i + 1}") + # Queue some audio (will get partial results until 10 seconds accumulate) + chunk_id = model.queue_audio(np.random.rand(16000).astype(np.float32)) + print(f" Queued chunk {i + 1} with ID {chunk_id}") + # sleep for 1 seconds + time.sleep(1) + + # When done + print("Stopping threaded Whisper model...") + model.stop() # Will process any remaining audio as final if __name__ == "__main__": - test_simpler_whisper() + # test_simpler_whisper() + test_threaded_whisper() From 963a6133f7056c4fd3c7472512d7c1b1452ea062 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 23 Oct 2024 01:47:41 -0400 Subject: [PATCH 2/8] Refactor build.yaml workflow: Update Python setup and dependencies --- simpler_whisper/whisper.py | 11 ++++++++--- src/whisper_wrapper.cpp | 27 +++++++++++++++++++++------ test_simpler_whisper.py | 21 +++++++++++++++------ 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 605c365..3b7ac07 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -1,4 +1,5 @@ import numpy as np +from typing import Callable, List from . import _whisper_cpp @@ -22,7 +23,9 @@ def __del__(self): class ThreadedWhisperModel: - def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000): + def __init__( + self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 + ): """ Initialize a threaded Whisper model for continuous audio processing. @@ -37,14 +40,16 @@ def __init__(self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample ) self._is_running = False - def start(self, callback, result_check_interval_ms=100): + def start( + self, callback: Callable[[int, str, bool], None], result_check_interval_ms=100 + ): """ Start the processing threads with a callback for results. 
Args: callback: Function that takes three arguments: - chunk_id (int): Unique identifier for the audio chunk - - segments (list): List of transcribed text segments + - segments (str): Transcribed text for the audio chunk - is_partial (bool): Whether this is a partial result result_check_interval_ms (int): How often to check for results """ diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 4c8e374..e193274 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -44,15 +44,16 @@ class WhisperModel ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { + std::cout << "Failed to initialize whisper context" << std::endl; throw std::runtime_error("Failed to initialize whisper context"); } } ~WhisperModel() { - std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; if (ctx) { + std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; whisper_free(ctx); } } @@ -86,15 +87,16 @@ class WhisperModel std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } + std::cout << "Whisper inference succeeded" << std::endl; int n_segments = whisper_full_n_segments(ctx); std::vector transcription; - for (int i = 0; i < n_segments; i++) { const char *text = whisper_full_get_segment_text(ctx, i); transcription.push_back(std::string(text)); } + std::cout << "num segments: " << n_segments << std::endl; return transcription; } @@ -211,6 +213,13 @@ class ThreadedWhisperModel if (accumulated_buffer.empty()) return; + // check if buffer has less than 1 second of audio + if (accumulated_samples < 16000) + { + std::cout << "Not enough audio to process" << std::endl; + return; + } + process_buffer = accumulated_buffer; current_id = current_chunk_id; @@ -237,7 +246,7 @@ class ThreadedWhisperModel // Add result to output queue { std::lock_guard lock(result_mutex); - result_queue.push(std::move(result)); + result_queue.push(result); result_cv.notify_one(); } } @@ -326,15 +335,21 @@ class ThreadedWhisperModel if (!results.empty()) { + std::cout << "Got " << results.size() << " results." 
<< std::endl; py::gil_scoped_acquire gil; for (const auto &result : results) { - py::list segments; + // concatenate segments into a single string + std::string full_text; for (const auto &segment : result.segments) { - segments.append(segment); + full_text += segment; + } + std::cout << "Calling result callback with ID: " << result.chunk_id << std::endl; + if (result_callback) + { + result_callback(result.chunk_id, full_text, result.is_partial); } - result_callback(result.chunk_id, segments, result.is_partial); } } } diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 384efc7..96e5fb6 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -6,16 +6,23 @@ import numpy as np import time -from simpler_whisper.whisper import load_model, set_log_callback, LogLevel, ThreadedWhisperModel +from simpler_whisper.whisper import ( + load_model, + set_log_callback, + LogLevel, + ThreadedWhisperModel, +) def my_log_callback(level, message): log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") + # Path to your Whisper model file model_path = R"ggml-tiny.en-q5_1.bin" + def test_simpler_whisper(): try: set_log_callback(my_log_callback) @@ -60,15 +67,16 @@ def test_simpler_whisper(): except Exception as e: print(f"An error occurred: {str(e)}") + def test_threaded_whisper(): - def handle_result(chunk_id, segments, is_partial): + def handle_result(chunk_id, text, is_partial): print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") - for segment in segments: - print(f" {segment}") + print(f" {text}") # Create model with 10-second max duration - model = ThreadedWhisperModel(model_path=model_path, use_gpu=True, - max_duration_sec=10.0) + model = ThreadedWhisperModel( + model_path=model_path, use_gpu=True, max_duration_sec=10.0 + ) # Start processing with callback print("Starting threaded Whisper model...") @@ -86,6 +94,7 @@ def handle_result(chunk_id, segments, is_partial): print("Stopping threaded Whisper model...") model.stop() # Will process any remaining audio as final + if __name__ == "__main__": # test_simpler_whisper() test_threaded_whisper() From ac714b3caf00031af5247ccc183af46ccf74921a Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 23 Oct 2024 09:52:21 -0400 Subject: [PATCH 3/8] Refactor whisper_wrapper.cpp: Remove debug print statements and unused variables --- src/whisper_wrapper.cpp | 46 ++++++----------------------------------- test_simpler_whisper.py | 29 +++++++++++++++++++------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index e193274..8312c80 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -40,11 +40,9 @@ class WhisperModel { whisper_context_params ctx_params = whisper_context_default_params(); ctx_params.use_gpu = use_gpu; - std::cout << "WhisperModel c'tor Loading model from path: " << model_path << std::endl; ctx = whisper_init_from_file_with_params(model_path.c_str(), ctx_params); if (!ctx) { - std::cout << "Failed to initialize whisper context" << std::endl; throw std::runtime_error("Failed to initialize whisper context"); } } @@ -53,7 +51,6 @@ class WhisperModel { if (ctx) { - std::cout << "WhisperModel d'tor Freeing whisper context" << std::endl; whisper_free(ctx); } } @@ -77,17 +74,11 @@ class WhisperModel std::vector transcribe_raw_audio(const float *audio_data, int n_samples) { - std::cout << "Transcribing audio with " << 
n_samples << " samples" << std::endl; - std::cout << "first sample: " << audio_data[0] << std::endl; - std::cout << "last sample: " << audio_data[n_samples - 1] << std::endl; - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { - std::cout << "Whisper inference failed" << std::endl; throw std::runtime_error("Whisper inference failed"); } - std::cout << "Whisper inference succeeded" << std::endl; int n_segments = whisper_full_n_segments(ctx); std::vector transcription; @@ -96,7 +87,6 @@ class WhisperModel const char *text = whisper_full_get_segment_text(ctx, i); transcription.push_back(std::string(text)); } - std::cout << "num segments: " << n_segments << std::endl; return transcription; } @@ -125,7 +115,7 @@ class ThreadedWhisperModel float max_duration_sec = 10.0f, int sample_rate = 16000) : running(false), next_chunk_id(0), max_samples(static_cast(max_duration_sec * sample_rate)), - accumulated_samples(0), current_chunk_id(0), model_path(model_path), + current_chunk_id(0), model_path(model_path), use_gpu(use_gpu) { } @@ -173,7 +163,6 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); accumulated_buffer.clear(); - accumulated_samples = 0; } } @@ -209,34 +198,24 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); - std::cout << "Processing accumulated audio with size: " << accumulated_buffer.size() << std::endl; - if (accumulated_buffer.empty()) + if (accumulated_buffer.empty() || accumulated_buffer.size() < 16000) return; - // check if buffer has less than 1 second of audio - if (accumulated_samples < 16000) - { - std::cout << "Not enough audio to process" << std::endl; - return; - } - process_buffer = accumulated_buffer; current_id = current_chunk_id; // Only clear the buffer if we're processing a final result - if (force_final || accumulated_samples >= max_samples) + if (force_final || accumulated_buffer.size() >= max_samples) { accumulated_buffer.clear(); - accumulated_samples = 0; } } // Process audio - std::cout << "Processing audio with size: " << process_buffer.size() << std::endl; - std::cout << "Pointer to first sample: " << process_buffer.data() << std::endl; - std::cout << "First sample: " << process_buffer[0] << std::endl; std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + std::cout << "Transcription: " << segments[0] << std::endl; + TranscriptionResult result; result.chunk_id = current_id; result.segments = segments; @@ -253,11 +232,7 @@ class ThreadedWhisperModel void processThread() { - std::cout << "Starting process thread." << std::endl; - - std::cout << "Loading model from path: " << this->model_path << std::endl; WhisperModel model(this->model_path, this->use_gpu); - std::cout << "Model loaded." << std::endl; while (running) { @@ -273,7 +248,6 @@ class ThreadedWhisperModel if (!running) { // Process any remaining audio as final before shutting down - std::cout << "Shutting down, processing remaining audio as final." 
<< std::endl; processAccumulatedAudio(model, true); break; } @@ -283,7 +257,6 @@ class ThreadedWhisperModel chunk = std::move(input_queue.front()); input_queue.pop(); has_chunk = true; - std::cout << "Got chunk with ID: " << chunk.id << " and size: " << chunk.data.size() << std::endl; } } @@ -297,17 +270,13 @@ class ThreadedWhisperModel std::copy(chunk.data.begin(), chunk.data.end(), accumulated_buffer.begin() + old_size); - accumulated_samples += chunk.data.size(); current_chunk_id = chunk.id; - std::cout << "Accumulated buffer size: " << accumulated_buffer.size() << std::endl; } // Process the accumulated audio - std::cout << "Processing accumulated audio." << std::endl; processAccumulatedAudio(model, false); } } - std::cout << "Exiting process thread." << std::endl; } void resultThread(int check_interval_ms) @@ -335,7 +304,6 @@ class ThreadedWhisperModel if (!results.empty()) { - std::cout << "Got " << results.size() << " results." << std::endl; py::gil_scoped_acquire gil; for (const auto &result : results) { @@ -345,10 +313,9 @@ class ThreadedWhisperModel { full_text += segment; } - std::cout << "Calling result callback with ID: " << result.chunk_id << std::endl; if (result_callback) { - result_callback(result.chunk_id, full_text, result.is_partial); + result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); } } } @@ -362,7 +329,6 @@ class ThreadedWhisperModel // Audio accumulation std::vector accumulated_buffer; - size_t accumulated_samples; size_t max_samples; std::mutex buffer_mutex; diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 96e5fb6..05b8027 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -5,6 +5,7 @@ import numpy as np import time +import resampy from simpler_whisper.whisper import ( load_model, @@ -69,7 +70,7 @@ def test_simpler_whisper(): def test_threaded_whisper(): - def handle_result(chunk_id, text, is_partial): + def handle_result(chunk_id: int, text: str, is_partial: bool): print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") print(f" {text}") @@ -78,17 +79,31 @@ def handle_result(chunk_id, text, is_partial): model_path=model_path, use_gpu=True, max_duration_sec=10.0 ) + # load audio from file with av + import av + container = av.open(R"C:\Users\roysh\Downloads\1847363777395929088.mp4") + audio = container.streams.audio[0] + print(audio) + frame_generator = container.decode(audio) + # Start processing with callback print("Starting threaded Whisper model...") model.start(callback=handle_result) - for i in range(15): - print(f"Queueing audio chunk {i + 1}") + for i, frame in enumerate(frame_generator): + # print(f"Queueing audio chunk {i + 1}") + # Read audio chunk + # resample to 16kHz + samples = resampy.resample(frame.to_ndarray().mean(axis=0), frame.rate, 16000) + # Queue some audio (will get partial results until 10 seconds accumulate) - chunk_id = model.queue_audio(np.random.rand(16000).astype(np.float32)) - print(f" Queued chunk {i + 1} with ID {chunk_id}") - # sleep for 1 seconds - time.sleep(1) + chunk_id = model.queue_audio(samples) + # print(f" Queued chunk {i + 1} with ID {chunk_id} size {len(samples)}") + # sleep for the size of the audio chunk + time.sleep(len(samples) / 16000) + + # close the container + container.close() # When done print("Stopping threaded Whisper model...") From fddebc49cadfa6a600315b796dbcc4914b7e86a7 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 24 Oct 2024 10:13:12 -0400 Subject: [PATCH 4/8] Refactor whisper_wrapper.cpp: Add trim function 
and handle exceptions in transcription and result callback --- src/whisper_wrapper.cpp | 45 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 8312c80..b4ded28 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -12,6 +12,17 @@ namespace py = pybind11; +std::string trim(const std::string &str) +{ + size_t start = str.find_first_not_of(" \t\n\r"); + size_t end = str.find_last_not_of(" \t\n\r"); + + if (start == std::string::npos) // handles empty string "" and all-whitespace strings like " " + return ""; + + return str.substr(start, end - start + 1); +} + // Global variable to store the Python callback function py::function g_py_log_callback; @@ -212,7 +223,19 @@ class ThreadedWhisperModel } // Process audio - std::vector segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + std::vector segments; + try + { + segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception during transcription" << std::endl; + } std::cout << "Transcription: " << segments[0] << std::endl; @@ -307,15 +330,33 @@ class ThreadedWhisperModel py::gil_scoped_acquire gil; for (const auto &result : results) { + if (result.segments.empty()) + continue; + // concatenate segments into a single string std::string full_text; for (const auto &segment : result.segments) { full_text += segment; } + full_text = trim(full_text); + if (full_text.empty()) + continue; + if (result_callback) { - result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + try + { + result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + } + catch (const std::exception &e) + { + std::cerr << "Exception in result callback: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception in result callback" << std::endl; + } } } } From 56d27b4717dfc4c0ae0af3ee0a9b2c08274a1f02 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 09:53:04 -0400 Subject: [PATCH 5/8] Refactor whisper_wrapper.cpp: Add trim function and handle exceptions in transcription and result callback --- simpler_whisper/whisper.py | 18 +++++++++++++----- src/whisper_wrapper.cpp | 8 ++++++-- test_simpler_whisper.py | 28 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 3b7ac07..a3a0cfe 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -24,7 +24,12 @@ def __del__(self): class ThreadedWhisperModel: def __init__( - self, model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 + self, + model_path: str, + callback: Callable[[int, str, bool], None], + use_gpu=False, + max_duration_sec=10.0, + sample_rate=16000, ): """ Initialize a threaded Whisper model for continuous audio processing. 
@@ -39,10 +44,13 @@ def __init__( model_path, use_gpu, max_duration_sec, sample_rate ) self._is_running = False + self.callback = callback - def start( - self, callback: Callable[[int, str, bool], None], result_check_interval_ms=100 - ): + def handle_result(self, chunk_id: int, text: str, is_partial: bool): + if self.callback is not None: + self.callback(chunk_id, text, is_partial) + + def start(self, result_check_interval_ms=100): """ Start the processing threads with a callback for results. @@ -56,7 +64,7 @@ def start( if self._is_running: return - self.model.start(callback, result_check_interval_ms) + self.model.start(self.handle_result, result_check_interval_ms) self._is_running = True def stop(self): diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index b4ded28..f27cc33 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -56,6 +56,7 @@ class WhisperModel { throw std::runtime_error("Failed to initialize whisper context"); } + params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); } ~WhisperModel() @@ -85,7 +86,6 @@ class WhisperModel std::vector transcribe_raw_audio(const float *audio_data, int n_samples) { - whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); if (whisper_full(ctx, params, audio_data, n_samples) != 0) { throw std::runtime_error("Whisper inference failed"); @@ -104,6 +104,7 @@ class WhisperModel private: whisper_context *ctx; + whisper_full_params params; }; struct AudioChunk @@ -237,7 +238,10 @@ class ThreadedWhisperModel std::cerr << "Unknown exception during transcription" << std::endl; } - std::cout << "Transcription: " << segments[0] << std::endl; + if (segments.empty()) + { + return; + } TranscriptionResult result; result.chunk_id = current_id; diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 05b8027..192d248 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -71,36 +71,46 @@ def test_simpler_whisper(): def test_threaded_whisper(): def handle_result(chunk_id: int, text: str, is_partial: bool): - print(f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}):") - print(f" {text}") + print( + f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" + ) # Create model with 10-second max duration model = ThreadedWhisperModel( - model_path=model_path, use_gpu=True, max_duration_sec=10.0 + model_path=model_path, + callback=handle_result, + use_gpu=True, + max_duration_sec=10.0, ) # load audio from file with av import av - container = av.open(R"C:\Users\roysh\Downloads\1847363777395929088.mp4") + + container = av.open( + R"local_path_to_audio_file" + ) audio = container.streams.audio[0] print(audio) frame_generator = container.decode(audio) # Start processing with callback print("Starting threaded Whisper model...") - model.start(callback=handle_result) + model.start() for i, frame in enumerate(frame_generator): - # print(f"Queueing audio chunk {i + 1}") # Read audio chunk + incoming_audio = frame.to_ndarray().mean(axis=0) + incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] # resample to 16kHz - samples = resampy.resample(frame.to_ndarray().mean(axis=0), frame.rate, 16000) + samples = resampy.resample(incoming_audio, frame.rate, 16000) # Queue some audio (will get partial results until 10 seconds accumulate) chunk_id = model.queue_audio(samples) - # print(f" Queued chunk {i + 1} with ID {chunk_id} size {len(samples)}") # sleep for the size of the audio chunk - time.sleep(len(samples) / 16000) + try: + 
time.sleep(len(samples) / 16000) + except: + break # close the container container.close() From 33e2a3c92daf4366cfb9f0c9c17acbb9ba7b7a48 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 12:39:58 -0400 Subject: [PATCH 6/8] Refactor setup.py: Add extension suffix and create extension directory --- setup.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 67c07d6..f032daf 100644 --- a/setup.py +++ b/setup.py @@ -23,14 +23,22 @@ def run(self): self.build_extension(ext) def build_extension(self, ext): + # This is the critical change - we need to get the proper extension suffix + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + + # Get the full path where the extension should be placed extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - + + # Ensure the extension directory exists + os.makedirs(extdir, exist_ok=True) + # Get acceleration and platform from environment variables acceleration = os.environ.get('SIMPLER_WHISPER_ACCELERATION', 'cpu') target_platform = os.environ.get('SIMPLER_WHISPER_PLATFORM', platform.machine()) - + cmake_args = [ f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}', + f'-DPYTHON_EXTENSION_SUFFIX={ext_suffix}', # Pass the extension suffix to CMake f'-DACCELERATION={acceleration}', ] @@ -38,8 +46,12 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS - cmake_args += [f'-DCMAKE_OSX_ARCHITECTURES={target_platform}'] - # add MACOS_ARCH env variable to specify the target platform + cmake_args += [ + f'-DCMAKE_OSX_ARCHITECTURES={target_platform}', + '-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON', + '-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON', + f'-DCMAKE_INSTALL_NAME_DIR=@rpath' + ] env["MACOS_ARCH"] = target_platform cfg = 'Debug' if self.debug else 'Release' @@ -55,13 +67,14 @@ def build_extension(self, ext): build_args += ['--', '-j2'] env['CXXFLAGS'] = f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' - + if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) - + print("CMake args:", cmake_args) print("Build args:", build_args) - + print(f"Extension will be built in: {extdir}") + subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) @@ -75,4 +88,5 @@ def build_extension(self, ext): ext_modules=[CMakeExtension('simpler_whisper._whisper_cpp')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, -) \ No newline at end of file + packages=['simpler_whisper'], # Add this line to ensure the package directory is created +) From e771682a8e218509fbd813bf6b7982b00c6fac0f Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 14:09:08 -0400 Subject: [PATCH 7/8] Refactor version numbers in pyproject.toml and setup.py --- pyproject.toml | 4 ++-- setup.py | 2 +- src/whisper_wrapper.cpp | 27 ++++++++++++++-------- test_simpler_whisper.py | 51 ++++++++++++++++++++++++++++------------- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 61cfc82..0baf19e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "simpler-whisper" -version = "0.1.0" +version = "0.2.0" authors = [ {name = "Roy Shilkrot", email = "roy.shil@gmail.com"}, ] @@ -33,4 +33,4 @@ dependencies = [ packages = ["simpler_whisper"] [tool.setuptools.package-data] 
-simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] \ No newline at end of file +simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] diff --git a/setup.py b/setup.py index f032daf..143d780 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ def build_extension(self, ext): setup( name='simpler-whisper', - version='0.1.0', + version='0.2.0', author='Roy Shilkrot', author_email='roy.shil@gmail.com', description='A simple Python wrapper for whisper.cpp', diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index f27cc33..fc73ef0 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -29,9 +29,10 @@ py::function g_py_log_callback; // C++ callback function that will be passed to whisper_log_set void cpp_log_callback(ggml_log_level level, const char *text, void *) { - if (!g_py_log_callback.is_none()) + if (!g_py_log_callback.is_none() && text != nullptr && strlen(text) > 0) { - g_py_log_callback(level, text); + py::gil_scoped_acquire gil; + g_py_log_callback(level, std::string(text)); } } @@ -263,7 +264,7 @@ class ThreadedWhisperModel while (running) { - AudioChunk chunk; + AudioChunk all_chunks; bool has_chunk = false; // Get next chunk from input queue @@ -279,10 +280,13 @@ class ThreadedWhisperModel break; } - if (!input_queue.empty()) + // take all chunks from the queue and create a single chunk + while (!input_queue.empty()) { - chunk = std::move(input_queue.front()); + AudioChunk chunk = std::move(input_queue.front()); input_queue.pop(); + all_chunks.data.insert(all_chunks.data.end(), chunk.data.begin(), chunk.data.end()); + all_chunks.id = chunk.id; has_chunk = true; } } @@ -293,11 +297,11 @@ class ThreadedWhisperModel { std::lock_guard lock(buffer_mutex); size_t old_size = accumulated_buffer.size(); - accumulated_buffer.resize(old_size + chunk.data.size()); - std::copy(chunk.data.begin(), chunk.data.end(), + accumulated_buffer.resize(old_size + all_chunks.data.size()); + std::copy(all_chunks.data.begin(), all_chunks.data.end(), accumulated_buffer.begin() + old_size); - current_chunk_id = chunk.id; + current_chunk_id = all_chunks.id; } // Process the accumulated audio @@ -420,8 +424,11 @@ PYBIND11_MODULE(_whisper_cpp, m) m.def("set_log_callback", &set_log_callback, "Set the log callback function"); py::enum_(m, "LogLevel") - .value("ERROR", GGML_LOG_LEVEL_ERROR) - .value("WARN", GGML_LOG_LEVEL_WARN) + .value("NONE", GGML_LOG_LEVEL_NONE) .value("INFO", GGML_LOG_LEVEL_INFO) + .value("WARN", GGML_LOG_LEVEL_WARN) + .value("ERROR", GGML_LOG_LEVEL_ERROR) + .value("DEBUG", GGML_LOG_LEVEL_DEBUG) + .value("CONT", GGML_LOG_LEVEL_CONT) .export_values(); } diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 192d248..3e60637 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -1,3 +1,4 @@ +import argparse import sys # Remove the current directory from sys.path to avoid conflicts with the installed package @@ -15,13 +16,23 @@ ) +log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} + + def my_log_callback(level, message): - log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"} - print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") + if message is not None and len(message.strip()) > 0: + print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}") # Path to your Whisper model file -model_path = R"ggml-tiny.en-q5_1.bin" +# Parse command-line arguments +parser = argparse.ArgumentParser(description="Test simpler-whisper model.") 
+parser.add_argument("model_path", type=str, help="Path to the Whisper model file") +parser.add_argument("audio_file", type=str, help="Path to the audio file") +args = parser.parse_args() + +model_path = args.model_path +audio_file = args.audio_file def test_simpler_whisper(): @@ -70,6 +81,8 @@ def test_simpler_whisper(): def test_threaded_whisper(): + set_log_callback(my_log_callback) + def handle_result(chunk_id: int, text: str, is_partial: bool): print( f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" @@ -86,9 +99,7 @@ def handle_result(chunk_id: int, text: str, is_partial: bool): # load audio from file with av import av - container = av.open( - R"local_path_to_audio_file" - ) + container = av.open(audio_file) audio = container.streams.audio[0] print(audio) frame_generator = container.decode(audio) @@ -98,17 +109,25 @@ def handle_result(chunk_id: int, text: str, is_partial: bool): model.start() for i, frame in enumerate(frame_generator): - # Read audio chunk - incoming_audio = frame.to_ndarray().mean(axis=0) - incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] - # resample to 16kHz - samples = resampy.resample(incoming_audio, frame.rate, 16000) - - # Queue some audio (will get partial results until 10 seconds accumulate) - chunk_id = model.queue_audio(samples) - # sleep for the size of the audio chunk try: - time.sleep(len(samples) / 16000) + # Read audio chunk + incoming_audio = frame.to_ndarray() + # check if stereo + if incoming_audio.shape[0] == 2: + incoming_audio = incoming_audio.mean(axis=0) + # check if the type is int16 or float32 + if incoming_audio.dtype == np.int16: + incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1] + # resample to 16kHz if needed + if frame.rate != 16000: + samples = resampy.resample(incoming_audio, frame.rate, 16000) + else: + samples = incoming_audio + + # Queue some audio (will get partial results until 10 seconds accumulate) + chunk_id = model.queue_audio(samples) + # sleep for the size of the audio chunk + time.sleep(float(len(samples)) / float(16000)) except: break From 96f019da60602e51f10536ccd7e6b60af202f95d Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 25 Oct 2024 14:23:30 -0400 Subject: [PATCH 8/8] Refactor build.yaml, CMakeLists.txt, and setup.py --- .github/workflows/build.yaml | 16 +++---- CMakeLists.txt | 4 +- setup.py | 85 ++++++++++++++++++++++-------------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index cbe7286..87336bc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,17 +74,18 @@ jobs: env: SIMPLER_WHISPER_ACCELERATION: ${{ matrix.acceleration }} SIMPLER_WHISPER_PLATFORM: ${{ matrix.platform }} + SIMPLER_WHISPER_PYTHON_VERSION: ${{ matrix.python-version }} run: | python setup.py build_ext --inplace python -m build --wheel - + - name: Install built wheel Non-Windows if: startsWith(matrix.os, 'windows') == false run: | pip install dist/*.whl - + - name: Install built wheel Windows - if: startsWith(matrix.os, 'windows') == true + if: startsWith(matrix.os, 'windows') == true shell: pwsh run: | $wheelFile = Get-ChildItem dist/*.whl | Select-Object -First 1 @@ -104,15 +105,15 @@ jobs: run: | import os import glob - + wheel_file = glob.glob('dist/*.whl')[0] base_name = os.path.basename(wheel_file) name_parts = base_name.split('-') - + # Insert acceleration and platform before the last part (which is like 'any.whl') new_name_parts = name_parts[:-1] + ['${{ 
matrix.acceleration }}', '${{ matrix.platform }}'] + [name_parts[-1]] new_name = '-'.join(new_name_parts) - + new_path = os.path.join('dist', new_name) os.rename(wheel_file, new_path) print(f"Renamed {base_name} to {new_name}") @@ -125,10 +126,9 @@ jobs: $wheelName += "-${{ matrix.acceleration }}" } echo "WHEEL_NAME=$wheelName" >> $env:GITHUB_ENV - + - name: Upload wheel uses: actions/upload-artifact@v4 with: name: ${{ env.WHEEL_NAME }} path: dist/*.whl - \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fb5f0d..48a9fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Find Python -find_package(Python COMPONENTS Interpreter Development NumPy REQUIRED) +find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development NumPy REQUIRED) # Fetch pybind11 include(FetchContent) @@ -40,4 +40,4 @@ if(WIN32 OR APPLE) $ ) endforeach() -endif() \ No newline at end of file +endif() diff --git a/setup.py b/setup.py index 143d780..a2eb7f5 100644 --- a/setup.py +++ b/setup.py @@ -6,25 +6,29 @@ import platform import sysconfig + class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): + def __init__(self, name, sourcedir=""): Extension.__init__(self, name, sources=[]) self.sourcedir = os.path.abspath(sourcedir) + class CMakeBuild(build_ext): def run(self): try: - out = subprocess.check_output(['cmake', '--version']) + out = subprocess.check_output(["cmake", "--version"]) except OSError: - raise RuntimeError("CMake must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) + raise RuntimeError( + "CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions) + ) for ext in self.extensions: self.build_extension(ext) def build_extension(self, ext): # This is the critical change - we need to get the proper extension suffix - ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') + ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") # Get the full path where the extension should be placed extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) @@ -33,13 +37,18 @@ def build_extension(self, ext): os.makedirs(extdir, exist_ok=True) # Get acceleration and platform from environment variables - acceleration = os.environ.get('SIMPLER_WHISPER_ACCELERATION', 'cpu') - target_platform = os.environ.get('SIMPLER_WHISPER_PLATFORM', platform.machine()) + acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "cpu") + target_platform = os.environ.get("SIMPLER_WHISPER_PLATFORM", platform.machine()) + python_version = os.environ.get( + "SIMPLER_WHISPER_PYTHON_VERSION", + f"{sys.version_info.major}.{sys.version_info.minor}", + ) cmake_args = [ - f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}', - f'-DPYTHON_EXTENSION_SUFFIX={ext_suffix}', # Pass the extension suffix to CMake - f'-DACCELERATION={acceleration}', + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPYTHON_EXTENSION_SUFFIX={ext_suffix}", # Pass the extension suffix to CMake + f"-DACCELERATION={acceleration}", + f"-DPYTHON_VERSION={python_version}", ] env = os.environ.copy() @@ -47,26 +56,28 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS cmake_args += [ - f'-DCMAKE_OSX_ARCHITECTURES={target_platform}', - '-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON', - '-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON', - f'-DCMAKE_INSTALL_NAME_DIR=@rpath' + 
f"-DCMAKE_OSX_ARCHITECTURES={target_platform}", + "-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON", + "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON", + f"-DCMAKE_INSTALL_NAME_DIR=@rpath", ] env["MACOS_ARCH"] = target_platform - cfg = 'Debug' if self.debug else 'Release' - build_args = ['--config', cfg] + cfg = "Debug" if self.debug else "Release" + build_args = ["--config", cfg] if platform.system() == "Windows": - cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}'] + cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] if sys.maxsize > 2**32: - cmake_args += ['-A', 'x64'] - build_args += ['--', '/m'] + cmake_args += ["-A", "x64"] + build_args += ["--", "/m"] else: - cmake_args += [f'-DCMAKE_BUILD_TYPE={cfg}'] - build_args += ['--', '-j2'] + cmake_args += [f"-DCMAKE_BUILD_TYPE={cfg}"] + build_args += ["--", "-j2"] - env['CXXFLAGS'] = f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' + env["CXXFLAGS"] = ( + f'{env.get("CXXFLAGS", "")} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' + ) if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) @@ -74,19 +85,29 @@ def build_extension(self, ext): print("CMake args:", cmake_args) print("Build args:", build_args) print(f"Extension will be built in: {extdir}") + print( + f"Building for Python {python_version} on {target_platform} with acceleration: {acceleration}" + ) + + subprocess.check_call( + ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env + ) + subprocess.check_call( + ["cmake", "--build", "."] + build_args, cwd=self.build_temp + ) - subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) setup( - name='simpler-whisper', - version='0.2.0', - author='Roy Shilkrot', - author_email='roy.shil@gmail.com', - description='A simple Python wrapper for whisper.cpp', - long_description='', - ext_modules=[CMakeExtension('simpler_whisper._whisper_cpp')], + name="simpler-whisper", + version="0.2.0", + author="Roy Shilkrot", + author_email="roy.shil@gmail.com", + description="A simple Python wrapper for whisper.cpp", + long_description="", + ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, - packages=['simpler_whisper'], # Add this line to ensure the package directory is created + packages=[ + "simpler_whisper" + ], # Add this line to ensure the package directory is created )