# audio_recorder.py

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Interface to asynchronously capture continuous audio from PyAudio.


This module requires pyaudio. See here for installation instructions:
http://people.csail.mit.edu/hubert/pyaudio/

This module provides one class, AudioRecorder, which buffers chunks of audio
from PyAudio.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging

import math
import time

import numpy as np
import pyaudio
import queue

# Module-level logger, named after this module via __name__ so its output can
# be filtered or configured separately from other loggers.
logger = logging.getLogger(__name__)


class TimeoutError(Exception):
  """Raised when buffered audio samples do not arrive from pyaudio in time.

  NOTE: this name intentionally mirrors the original module API; within this
  module it takes precedence over any builtin of the same name.
  """


class AudioRecorder(object):
  """Asynchronously record and buffer audio using pyaudio.

  This class wraps the pyaudio interface. It contains a queue.Queue object to
  hold chunks of raw audio, and a callback function _enqueue_raw_audio() which
  places raw audio into this queue. This allows the pyaudio.Stream object to
  record asynchronously at low latency.

  The class acts as a context manager. When entering the context it creates a
  pyaudio.Stream object and starts recording; it stops recording on exit. The
  Stream saves all of its audio to the Queue as two-tuples of
  (raw_audio, timestamp). The audio is available from the queue as a numpy
  array using the get_audio() function.

  This class uses the term "frame" in the same sense that PortAudio does, so
  "frame" means something different here than elsewhere in the daredevil stack.
  A frame in PortAudio is one audio sample across all channels, so one frame of
  16-bit stereo audio is four bytes of data as two 16-bit integers.
  """
  # Sample format requested from PyAudio and the matching numpy dtype used to
  # decode the raw bytes. These two must be kept in agreement.
  pyaudio_format = pyaudio.paInt16
  numpy_format = np.int16
  num_channels = 1

  # How many frames of audio PyAudio will fetch at once.
  # Higher numbers will increase the latency.
  frames_per_chunk = 2**9

  # Limit queue to this number of audio chunks.
  max_queue_chunks = 1200

  # Timeout if we can't get a chunk from the queue for timeout_factor times the
  # chunk duration.
  timeout_factor = 4

  def __init__(self, raw_audio_sample_rate_hz=48000,
               downsample_factor=3,
               device_index=None):
    """Set up the chunk queue and PyAudio, and list the input devices.

    Args:
      raw_audio_sample_rate_hz: sample rate requested from the audio device.
      downsample_factor: keep every downsample_factor-th captured frame in
        get_audio(); audio_sample_rate_hz is the raw rate floor-divided by
        this factor.
      device_index: index of the input device to open, or None to use the
        default input device when the context is entered.
    """
    self._downsample_factor = downsample_factor
    self._raw_audio_sample_rate_hz = raw_audio_sample_rate_hz
    self.audio_sample_rate_hz = (
        self._raw_audio_sample_rate_hz // self._downsample_factor)
    self._raw_audio_queue = queue.Queue(self.max_queue_chunks)
    self._device_index = device_index
    # No stream exists until __enter__; is_active reports False until then.
    self._stream = None
    self._audio = pyaudio.PyAudio()
    self._print_input_devices()

  def __enter__(self):
    """Open the input stream and start recording into the queue.

    Returns:
      This AudioRecorder.

    Raises:
      ValueError: if the selected device offers fewer input channels than
        num_channels.
    """
    if self._device_index is None:
      self._device_index = self._audio.get_default_input_device_info()["index"]
    # Look the device up by the same index that is handed to open() below, so
    # the device that gets validated and reported is the one actually opened
    # (and matches get_audio_device_info()).
    device_info = self._audio.get_device_info_by_index(self._device_index)
    if device_info.get("maxInputChannels", 0) < self.num_channels:
      raise ValueError("Audio device has insufficient input channels.")
    print("Using audio device '%s' for index %d" % (
        device_info["name"], device_info["index"]))
    self._stream = self._audio.open(
        format=self.pyaudio_format,
        channels=self.num_channels,
        rate=self._raw_audio_sample_rate_hz,
        input=True,
        output=False,
        frames_per_buffer=self.frames_per_chunk,
        start=True,
        stream_callback=self._enqueue_raw_audio,
        input_device_index=self._device_index)
    logger.info("Started audio stream.")
    return self

  def __exit__(self, exception_type, exception_value, traceback):
    """Stop and close the stream; exceptions from the body are not suppressed."""
    try:
      self._stream.stop_stream()
    finally:
      # Always release the stream, even if stopping it failed.
      self._stream.close()
    logger.info("Stopped and closed audio stream.")

  def __del__(self):
    """Terminate PyAudio if it was successfully created."""
    # __init__ may have raised before _audio was assigned; in that case there
    # is nothing to terminate.
    audio = getattr(self, "_audio", None)
    if audio is None:
      return
    audio.terminate()
    logger.info("Terminated PyAudio/PortAudio.")

  @property
  def is_active(self):
    """Whether the stream reports itself active; False before __enter__."""
    if self._stream is None:
      return False
    return self._stream.is_active()

  @property
  def bytes_per_sample(self):
    """Sample size reported by pyaudio for pyaudio_format."""
    return pyaudio.get_sample_size(self.pyaudio_format)

  @property
  def _chunk_duration_seconds(self):
    """Duration of one chunk: frames_per_chunk at the raw sample rate."""
    return self.frames_per_chunk / self._raw_audio_sample_rate_hz

  def _print_input_devices(self):
    """Print the devices of host API 0 that have at least one input channel."""
    info = self._audio.get_host_api_info_by_index(0)
    print("\nInput microphone devices:")
    for i in range(info.get("deviceCount")):
      device_info = self._audio.get_device_info_by_host_api_device_index(0, i)
      if device_info.get("maxInputChannels") <= 0:
        continue
      print("  ID: ", i, " - ", device_info.get("name"))

  def _enqueue_raw_audio(self, in_data, *_):  # unused args to match expected
    """Stream callback: queue (in_data, time.time()) without blocking.

    Args:
      in_data: raw audio bytes handed over by the stream.
      *_: remaining callback arguments, ignored.

    Returns:
      (None, pyaudio.paContinue) so that the stream keeps recording.

    Raises:
      TimeoutError: if the queue already holds max_queue_chunks chunks.
    """
    try:
      self._raw_audio_queue.put((in_data, time.time()), block=False)
    except queue.Full:
      # NOTE(review): this is raised from the stream callback rather than from
      # the consumer; how pyaudio surfaces it is not visible here -- confirm.
      error_message = "Raw audio buffer full."
      logger.critical(error_message)
      raise TimeoutError(error_message)
    return None, pyaudio.paContinue

  def _get_chunk(self, timeout=None):
    """Pop one chunk from the queue and decode it.

    Args:
      timeout: seconds to wait for a chunk; None waits indefinitely.

    Returns:
      A tuple (array, timestamp) where array has dtype numpy_format and shape
      (frames, num_channels), and timestamp is the time.time() value stored
      when the chunk was queued.

    Raises:
      queue.Empty: if no chunk arrives within timeout.
    """
    raw_data, timestamp = self._raw_audio_queue.get(timeout=timeout)
    # np.frombuffer replaces the deprecated binary mode of np.fromstring. It
    # returns a read-only view of the bytes; get_audio() copies via
    # np.concatenate before handing data to callers.
    array_data = np.frombuffer(raw_data, dtype=self.numpy_format).reshape(
        -1, self.num_channels)
    return array_data, timestamp

  def get_audio_device_info(self):
    """Return pyaudio's info dict for the configured (or default) device."""
    if self._device_index is None:
      return self._audio.get_default_input_device_info()
    return self._audio.get_device_info_by_index(self._device_index)

  def sample_duration_seconds(self, num_samples):
    """Return num_samples / audio_sample_rate_hz / num_channels."""
    return num_samples / self.audio_sample_rate_hz / self.num_channels

  def clear_queue(self):
    """Discard every chunk currently waiting in the queue."""
    logger.debug("Purging %d chunks from queue.", self._raw_audio_queue.qsize())
    # get_nowait() never blocks, unlike an empty()-then-get() sequence.
    try:
      while True:
        self._raw_audio_queue.get_nowait()
    except queue.Empty:
      pass

  def get_audio(self, num_audio_frames):
    """Grab at least num_audio_frames frames of audio.

    Record at least num_audio_frames of audio and transform it into a
    numpy array. The term "frame" is in the sense used by PortAudio; see the
    class docstring for details. Enough whole chunks are taken from the queue
    that, after keeping every downsample_factor-th frame, at least
    num_audio_frames frames remain; at least one chunk is always taken.

    Audio returned will be the earliest audio in the queue; it could be from
    before this function was called.

    Args:
      num_audio_frames: minimum number of (downsampled) frames to grab.

    Returns:
      A tuple of (audio, first_timestamp, last_timestamp). audio is the
      downsampled samples multiplied by 0.5 (hence a floating-point array) with
      shape (frames, num_channels). The timestamps are the time.time() values
      recorded when the first and last consumed chunks were queued.

    Raises:
      TimeoutError: if any chunk fails to arrive within timeout_factor chunk
        durations. Chunks already taken from the queue are discarded.
    """
    # Clamp before logging so the reported count is the one actually used.
    num_audio_chunks = max(1, int(math.ceil(
        num_audio_frames * self._downsample_factor / self.frames_per_chunk)))
    logger.debug("Capturing %d chunks to get at least %d frames.",
                 num_audio_chunks, num_audio_frames)

    timeout = self.timeout_factor * self._chunk_duration_seconds
    try:
      captured = [self._get_chunk(timeout=timeout)
                  for _ in range(num_audio_chunks)]
    except queue.Empty:
      error_message = "Audio capture timed out after %.1f seconds." % timeout
      logger.critical(error_message)
      raise TimeoutError(error_message)
    chunks, timestamps = zip(*captured)

    logger.debug("Got %d chunks. Chunk 0 has shape %s and dtype %s.",
                 len(chunks), chunks[0].shape, chunks[0].dtype)
    # Escalate to a warning once the backlog exceeds 80% of queue capacity.
    remaining = self._raw_audio_queue.qsize()
    if remaining > (0.8 * self.max_queue_chunks):
      logger.warning("%d chunks remain in the queue.", remaining)
    else:
      logger.debug("%d chunks remain in the queue.", remaining)

    audio = np.concatenate(chunks)
    if self._downsample_factor != 1:
      # Plain decimation: keep every downsample_factor-th frame.
      audio = audio[::self._downsample_factor]
    logger.debug("Audio array has shape %s and dtype %s.", audio.shape,
                 audio.dtype)
    # NOTE(review): the 0.5 scale factor is kept from the original code; its
    # purpose is not documented here -- confirm with consumers before changing.
    return audio * 0.5, timestamps[0], timestamps[-1]