Skip to content

Commit

Permalink
Bump version numbers in pyproject.toml and setup.py
Browse files Browse the repository at this point in the history
  • Loading branch information
royshil committed Oct 25, 2024
1 parent 33e2a3c commit e771682
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 29 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "simpler-whisper"
version = "0.1.0"
version = "0.2.0"
authors = [
{name = "Roy Shilkrot", email = "[email protected]"},
]
Expand Down Expand Up @@ -33,4 +33,4 @@ dependencies = [
packages = ["simpler_whisper"]

[tool.setuptools.package-data]
simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"]
simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def build_extension(self, ext):

setup(
name='simpler-whisper',
version='0.1.0',
version='0.2.0',
author='Roy Shilkrot',
author_email='[email protected]',
description='A simple Python wrapper for whisper.cpp',
Expand Down
27 changes: 17 additions & 10 deletions src/whisper_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ py::function g_py_log_callback;
// C++ callback function that will be passed to whisper_log_set
void cpp_log_callback(ggml_log_level level, const char *text, void *)
{
if (!g_py_log_callback.is_none())
if (!g_py_log_callback.is_none() && text != nullptr && strlen(text) > 0)
{
g_py_log_callback(level, text);
py::gil_scoped_acquire gil;
g_py_log_callback(level, std::string(text));
}
}

Expand Down Expand Up @@ -263,7 +264,7 @@ class ThreadedWhisperModel

while (running)
{
AudioChunk chunk;
AudioChunk all_chunks;
bool has_chunk = false;

// Get next chunk from input queue
Expand All @@ -279,10 +280,13 @@ class ThreadedWhisperModel
break;
}

if (!input_queue.empty())
// take all chunks from the queue and create a single chunk
while (!input_queue.empty())
{
chunk = std::move(input_queue.front());
AudioChunk chunk = std::move(input_queue.front());
input_queue.pop();
all_chunks.data.insert(all_chunks.data.end(), chunk.data.begin(), chunk.data.end());
all_chunks.id = chunk.id;
has_chunk = true;
}
}
Expand All @@ -293,11 +297,11 @@ class ThreadedWhisperModel
{
std::lock_guard<std::mutex> lock(buffer_mutex);
size_t old_size = accumulated_buffer.size();
accumulated_buffer.resize(old_size + chunk.data.size());
std::copy(chunk.data.begin(), chunk.data.end(),
accumulated_buffer.resize(old_size + all_chunks.data.size());
std::copy(all_chunks.data.begin(), all_chunks.data.end(),
accumulated_buffer.begin() + old_size);

current_chunk_id = chunk.id;
current_chunk_id = all_chunks.id;
}

// Process the accumulated audio
Expand Down Expand Up @@ -420,8 +424,11 @@ PYBIND11_MODULE(_whisper_cpp, m)
m.def("set_log_callback", &set_log_callback, "Set the log callback function");

py::enum_<ggml_log_level>(m, "LogLevel")
.value("ERROR", GGML_LOG_LEVEL_ERROR)
.value("WARN", GGML_LOG_LEVEL_WARN)
.value("NONE", GGML_LOG_LEVEL_NONE)
.value("INFO", GGML_LOG_LEVEL_INFO)
.value("WARN", GGML_LOG_LEVEL_WARN)
.value("ERROR", GGML_LOG_LEVEL_ERROR)
.value("DEBUG", GGML_LOG_LEVEL_DEBUG)
.value("CONT", GGML_LOG_LEVEL_CONT)
.export_values();
}
51 changes: 35 additions & 16 deletions test_simpler_whisper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import argparse
import sys

# Remove the current directory from sys.path to avoid conflicts with the installed package
Expand All @@ -15,13 +16,23 @@
)


log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"}


def my_log_callback(level, message):
log_levels = {LogLevel.ERROR: "ERROR", LogLevel.WARN: "WARN", LogLevel.INFO: "INFO"}
print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}")
if message is not None and len(message.strip()) > 0:
print(f"whisper.cpp [{log_levels.get(level, 'UNKNOWN')}] {message.strip()}")


# Path to your Whisper model file
model_path = R"ggml-tiny.en-q5_1.bin"
# Parse command-line arguments
parser = argparse.ArgumentParser(description="Test simpler-whisper model.")
parser.add_argument("model_path", type=str, help="Path to the Whisper model file")
parser.add_argument("audio_file", type=str, help="Path to the audio file")
args = parser.parse_args()

model_path = args.model_path
audio_file = args.audio_file


def test_simpler_whisper():
Expand Down Expand Up @@ -70,6 +81,8 @@ def test_simpler_whisper():


def test_threaded_whisper():
set_log_callback(my_log_callback)

def handle_result(chunk_id: int, text: str, is_partial: bool):
print(
f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}"
Expand All @@ -86,9 +99,7 @@ def handle_result(chunk_id: int, text: str, is_partial: bool):
# load audio from file with av
import av

container = av.open(
R"local_path_to_audio_file"
)
container = av.open(audio_file)
audio = container.streams.audio[0]
print(audio)
frame_generator = container.decode(audio)
Expand All @@ -98,17 +109,25 @@ def handle_result(chunk_id: int, text: str, is_partial: bool):
model.start()

for i, frame in enumerate(frame_generator):
# Read audio chunk
incoming_audio = frame.to_ndarray().mean(axis=0)
incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1]
# resample to 16kHz
samples = resampy.resample(incoming_audio, frame.rate, 16000)

# Queue some audio (will get partial results until 10 seconds accumulate)
chunk_id = model.queue_audio(samples)
# sleep for the size of the audio chunk
try:
time.sleep(len(samples) / 16000)
# Read audio chunk
incoming_audio = frame.to_ndarray()
# check if stereo
if incoming_audio.shape[0] == 2:
incoming_audio = incoming_audio.mean(axis=0)
# check if the type is int16 or float32
if incoming_audio.dtype == np.int16:
incoming_audio = incoming_audio / 32768.0 # normalize to [-1, 1]
# resample to 16kHz if needed
if frame.rate != 16000:
samples = resampy.resample(incoming_audio, frame.rate, 16000)
else:
samples = incoming_audio

# Queue some audio (will get partial results until 10 seconds accumulate)
chunk_id = model.queue_audio(samples)
# sleep for the size of the audio chunk
time.sleep(float(len(samples)) / float(16000))
except:
break

Expand Down

0 comments on commit e771682

Please sign in to comment.