From c67d60917573da5e2e46e55b9cf9e2bdb37a49bc Mon Sep 17 00:00:00 2001
From: Eric
Date: Thu, 5 Sep 2019 06:46:49 -0500
Subject: [PATCH] add metadata JSON output and POST client APIs

- add a POST API for uploading a wav file
- add an enhanced-metadata output option to the WebSocket and POST endpoints
- expand README with additional examples and a changelog
- not a complete fix, but partially addresses issue #2 (request for package) by adding an API

---
 README.md               |  85 ++++++++++++++++++-
 client.py               |   1 +
 requirements-server.txt |   1 +
 server.py               | 182 +++++++++++++++++++++++++++++++---------
 4 files changed, 224 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index dc9b411..d653203 100644
--- a/README.md
+++ b/README.md
@@ -14,12 +14,66 @@ Work in progress. Developed to quickly test new models running DeepSpeech in [Wi
   - Streaming inference via DeepSpeech v0.2+
   - Multi-user (only decodes one stream at a time, but can block until decoding is available)
   - Tested and works with DeepSpeech v0.5.1 on Windows
+  - Optional JSON output mode with enhanced/rich metadata, including the timing of each word
 * Client
   - Streams raw audio data from microphone to server via WebSocket
   - Voice activity detection (VAD) to ignore noise and segment microphone input into separate utterances
   - Hypnotizing spinner to indicate voice activity is detected!
   - Option to automatically save each utterance to a separate .wav file, for later testing
   - Need to pause/unpause listening? [See here](https://github.com/daanzu/deepspeech-websocket-server/issues/6).
+  - A POST endpoint to push files directly (warning: limited file upload size)
+
+
+### Server Endpoints
+
+Functionality has been expanded with a few additional endpoints around the same server wrapper.
+
+* `/recognize` - WebSocket-based traditional recognition (plain-text result)
+* `/recognize_meta` - WebSocket-based enhanced recognition that returns JSON results with probability, timing, etc. (see the client sketch after this list)
+  - example JSON result:
+    ```json
+    {
+      "probability": 53.0922,
+      "text": "your power is sufficient i said",
+      "duration": 1.38,
+      "items": [
+        {
+          "text": "your",
+          "start": 0.68,
+          "duration": 0.24
+        },
+        {
+          "text": "power",
+          "start": 0.92,
+          "duration": 0.32
+        },
+        {
+          "text": "is",
+          "start": 1.24,
+          "duration": 0.14
+        },
+        {
+          "text": "sufficient",
+          "start": 1.38,
+          "duration": 0.48
+        },
+        {
+          "text": "i",
+          "start": 1.86,
+          "duration": 0.18
+        },
+        {
+          "text": "said",
+          "start": 2.04,
+          "duration": 0.02
+        }
+      ],
+      "start": 0.68
+    }
+    ```
+* `/recognize_file` - POST recognition of an uploaded file, returning either enhanced (JSON) or text-only (string) results (see [Audio File Processing](#audio-file-processing))
+  - accepts web-form or URL parameters: `file` (a wav file) and `enhanced` (integer `0` or `1`)
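+
+For example, a minimal client sketch for `/recognize_meta`. This is illustrative only: it assumes the third-party `websocket-client` package, a 16 kHz mono 16-bit wav file, and a made-up file path.
+
+```python
+import json
+import wave
+
+from websocket import create_connection  # pip install websocket-client
+
+ws = create_connection("ws://localhost:8787/recognize_meta")
+with wave.open("utterance.wav", "rb") as wav:
+    while True:
+        frames = wav.readframes(1600)  # 100 ms of 16 kHz 16-bit audio
+        if not frames:
+            break
+        ws.send_binary(frames)  # raw PCM, same as the microphone client sends
+ws.send("EOS")  # end of utterance; the server replies with the JSON result
+result = json.loads(ws.recv())
+print(result["text"], result["probability"])
+ws.close()
+```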
+
 
 ## Installation
 
@@ -48,7 +102,7 @@ On MacOS, try installing portaudio with brew: `brew install portaudio` .
 
 ## Server
 
-```
+```bash
 > python server.py --model ../models/daanzu-6h-512l-0001lr-425dr/ -l -t
 Initializing model...
 2018-10-06 AM 05:55:16.357: __main__: INFO: (): args.model: ../models/daanzu-6h-512l-0001lr-425dr/output_graph.pb
 Hit Ctrl-C to quit.
 ^CKeyboardInterrupt
 ```
 
-```
+```bash
 > python server.py -h
 usage: server.py [-h] -m MODEL [-a [ALPHABET]] [-l [LM]] [-t [TRIE]]
                  [--lw LW] [--vwcw VWCW] [--bw BW] [-p PORT]
 
@@ -99,7 +153,7 @@ optional arguments:
 
 ## Client
 
-```
+```bash
 λ py client.py
 Listening...
 Recognized: alpha bravo charlie
 Recognized: delta echo foxtrot
 ^C
 ```
 
-```
+```bash
 λ py client.py -h
 usage: client.py [-h] [-s SERVER] [-a AGGRESSIVENESS] [--nospinner]
                  [-w SAVEWAV] [-d DEVICE] [-v]
 
 optional arguments:
 ```
 
@@ -133,9 +187,32 @@ optional arguments:
 
+### Audio File Processing
+
+Want to send a file directly to the server instead of streaming from a live source?
+
+```bash
+# process a single file, returning plain text; must be a wav file
+curl -X POST -F file=@../audio/8455-210777-0068.wav http://localhost:8787/recognize_file
+
+# process a single file with the enhanced (JSON) return; must be a wav file
+curl -X POST -F file=@../audio/8455-210777-0068.wav -F enhanced=1 http://localhost:8787/recognize_file
+
+# the same enhanced request, passing enhanced as a URL parameter instead
+# (quote the URL so the shell does not mangle the query string)
+curl -X POST -F file=@../audio/8455-210777-0068.wav "http://localhost:8787/recognize_file?enhanced=1"
+```
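+
+Or from Python (a sketch assuming the third-party `requests` package; the file path is illustrative):
+
+```python
+import requests
+
+# enhanced=1 asks for the JSON metadata result; omit it for plain text
+with open("../audio/8455-210777-0068.wav", "rb") as f:
+    resp = requests.post(
+        "http://localhost:8787/recognize_file",
+        files={"file": f},
+        data={"enhanced": 1},
+    )
+print(resp.json()["text"])  # use resp.text when enhanced is omitted
+```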
+
 ## Contributions
 
 Pull requests welcome.
 
 Contributors:
 
 * [@Zeddy913](https://github.com/Zeddy913)
+
+
+## Changes
+
+A coarse log of significant modifications as they land.
+
+- 190905 - add a POST API for the file endpoint; add an enhanced mode for server returns; launch the server on `0.0.0.0` instead of localhost
+- 190903 - add a device index for pyaudio so other loopback devices can be used (e.g. [MacOS Soundflower](https://github.com/mattingalls/Soundflower))

diff --git a/client.py b/client.py
index 0c0ce2a..b247d57 100644
--- a/client.py
+++ b/client.py
@@ -57,6 +57,7 @@ def read(self):
         else:
             return None
 
+
     def read_loop(self, callback):
         """Block looping reading, repeatedly passing a block of audio data to callback."""
         for block in iter(self):

diff --git a/requirements-server.txt b/requirements-server.txt
index 703f4fd..8e118e4 100644
--- a/requirements-server.txt
+++ b/requirements-server.txt
@@ -1,3 +1,4 @@
 numpy>=1.15.1
 bottle>=0.12.13
 bottle-websocket>=0.2.9
+scipy>=0.12.0

diff --git a/server.py b/server.py
index 5b0841f..6134d7f 100644
--- a/server.py
+++ b/server.py
@@ -1,13 +1,17 @@
 import argparse, logging, os.path
 from time import time
 
-from bottle import get, run, template
+from bottle import get, run, template, post, request
 from bottle.ext.websocket import GeventWebSocketServer
 from bottle.ext.websocket import websocket
 from gevent.lock import BoundedSemaphore
 
+from scipy import signal
+from scipy.io.wavfile import read as wav_read
+
 import deepspeech
 import numpy as np
+import json
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=20,
@@ -38,7 +42,12 @@
 logging.getLogger().setLevel(int(ARGS.debuglevel))
 
-gSem = BoundedSemaphore(1)  # Only one Deepspeech instance available at a time
+server_state = {
+    "start_time": None,
+    "semaphore_acquired": False,
+    "sctx": None,  # stream context/state
+    "semaphore": BoundedSemaphore(1)  # Only one Deepspeech instance available at a time
+}
 
 if os.path.isdir(ARGS.model):
     model_dir = ARGS.model
@@ -52,6 +61,7 @@
 BEAM_WIDTH = ARGS.bw
 N_FEATURES = 26
 N_CONTEXT = 9
+AUDIO_RATE = 16000  # DeepSpeech expects 16 kHz audio
 
 print('Initializing model...')
 logger.info("ARGS.model: %s", ARGS.model)
@@ -66,59 +76,149 @@
         ARGS.trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
+logger.info("Model load complete.")
+
+# flatten the DeepSpeech metadata struct into a python dict for easy serialization
+def regularize_metadata(metadata):
+    # https://github.com/mozilla/DeepSpeech/blob/4c14c6b78b3daf90b67f840035a991bb94d9e1fa/native_client/deepspeech.h#L26
+    return_data = {"probability": round(metadata.probability, 4), "text": [],
+                   "start": 0.0, "duration": 0.0, "items": []}
+    word_data = {"text": [], "start": None}
+
+    def promote_word(return_data, word_data, time_eos=None):
+        # a word ends at the space that follows it; for the final word
+        # (no trailing space) assume one more 20 ms chunk
+        word_text = "".join(word_data["text"])
+        end_time = time_eos if time_eos is not None else word_data["start"] + 0.02
+        return_data["items"].append({"text": word_text, "start": word_data["start"],
+                                     "duration": round(end_time - word_data["start"], 3)})
+        return_data["text"].append(word_text)
+
+    for item_idx in range(metadata.num_items):
+        new_item = metadata.items[item_idx]
+        start_item = round(new_item.start_time, 3)  # timings come in 20 ms chunks
+        if new_item.character == " ":  # found the end of a word, promote it
+            if word_data["text"]:
+                promote_word(return_data, word_data, start_item)
+            word_data = {"text": [], "start": None}
+        else:
+            if word_data["start"] is None:
+                word_data["start"] = start_item
+            word_data["text"].append(new_item.character)
+
+    if word_data["text"]:  # promote the last word left in the queue
+        promote_word(return_data, word_data)
+    if return_data["items"]:  # overall span runs from the first word to the end of the last
+        first, last = return_data["items"][0], return_data["items"][-1]
+        return_data["start"] = first["start"]
+        return_data["duration"] = round(last["start"] + last["duration"] - first["start"], 3)
+    return_data["text"] = " ".join(return_data["text"])
+    return return_data
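+
+# Illustrative shape of the flattened result (values invented for this comment):
+#   {"probability": 12.3456, "text": "hello world", "start": 0.1, "duration": 0.52,
+#    "items": [{"text": "hello", "start": 0.1, "duration": 0.2},
+#              {"text": "world", "start": 0.38, "duration": 0.24}]}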
server_state["semaphore_acquired"] = False + # logger.debug("lock released") + return return_str @get('/recognize', apply=[websocket]) def recognize(ws): logger.debug("new websocket") - start_time = None - gSem_acquired = False while True: data = ws.receive() - # logger.log(5, "got websocket data: %r", data) - - if isinstance(data, bytearray): - # Receive stream data - if not start_time: - # Start of stream (utterance) - start_time = time() - sctx = model.setupStream() - assert not gSem_acquired - # logger.debug("acquiring lock for deepspeech ...") - gSem.acquire(blocking=True) - gSem_acquired = True - # logger.debug("lock acquired") - model.feedAudioContent(sctx, np.frombuffer(data, np.int16)) - - elif isinstance(data, str) and data == 'EOS': - # End of stream (utterance) - eos_time = time() - text = model.finishStream(sctx) - logger.info("recognized: %r", text) - logger.info(" time: total=%s post_eos=%s", time()-start_time, time()-eos_time) - ws.send(text) - # FIXME: handle ConnectionResetError & geventwebsocket.exceptions.WebSocketError - # logger.debug("releasing lock ...") - gSem.release() - gSem_acquired = False - # logger.debug("lock released") - start_time = None - + return_str = data_process(data, False) + if return_str is None: + break else: - # Lost connection - logger.debug("dead websocket") - if gSem_acquired: - # logger.debug("releasing lock ...") - gSem.release() - gSem_acquired = False - # logger.debug("lock released") + ws.send(return_str) + +@get('/recognize_meta', apply=[websocket]) +def recognize_meta(ws): + logger.debug("new websocket") + + while True: + data = ws.receive() + return_str = data_process(data, False) + if return_str is None: break + else: + ws.send(return_str) + +@post('/recognize_file') +def recognize_file(): + enhanced = request.params.get('enhanced', default=False, type=int) + enhanced = enhanced != False and int(enhanced) != 0 # input normalize + data_file = request.files.get('file') + data = data_resample(data_file.file) if data_file is not None else None + logger.debug(f"file: file: {'(missing file)' if not data_file else data_file.filename}, " + \ + f"len: {0 if not data_file else data_file.content_length}, " + \ + f"len-audio: {0 if not data else len(data)}, enhanced: {enhanced}") + return_str = "" + if data: # only if the data was non-zero + return_str = data_process(bytearray(data), enhanced) + return_str = data_process("EOS", enhanced) # send immediate EOS + return return_str + @get('/') def index(): return template('index') -run(host='127.0.0.1', port=ARGS.port, server=GeventWebSocketServer) +run(host='0.0.0.0', port=ARGS.port, server=GeventWebSocketServer) # python server.py --model ../models/daanzu-30330/output_graph.pb --alphabet ../models/daanzu-30330/alphabet.txt --lm ../models/daanzu-30330/lm.binary --trie ../models/daanzu-30330/trie # python server.py --model ../models/daanzu-30330.2/output_graph.pb --alphabet ../models/daanzu-30330.2/alphabet.txt --lm ../models/daanzu-30330.2/lm.binary --trie ../models/daanzu-30330.2/trie