Mozilla deepspeech: python recording example

This example is not complete (too laggy for production use), but it is a simple demonstration of using deepspeech for mic-to-text.
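
Dependencies, inferred from the imports below: the deepspeech python package (the 0.5.1 model files are referenced in the configuration), plus pyaudio, resampy, wavio, and numpy for recording and resampling.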

#!/usr/bin/env python
""" Record Audio in one thread, Interpret Audio as text within another thread.

Notes:
    This is super CPU-dependent. On my (slower) laptop, the time between sample
    recordings inflated, resulting in samples being cut off strangely
    (hi there, how are you --> hi). My guess is that this is because I am
    using threads instead of independent processes (I have not tested this guess).

    On my (faster) desktop computer, the extracted audio samples perfectly encapsulated
    simple sentences -- but there were still issues with lag, and interpretation:

        a) the lag was still quite heavy -- process-pools would help here a lot.
           If this cannot catch up to realtime, the lag will continually get worse.

        b) the untrained deepspeech model was starting to show its weakness in interpreting speech.
           * training the model would likely help
           * The SpeechRecognition python module has a filter that would
             likely help clean up audio samples before passing them to the model.

"""
from __future__ import absolute_import, division, print_function
from deepspeech import client
import deepspeech
import logging
import multiprocessing
import numpy
import os
import pprint
import pyaudio
import resampy
import threading
import time
import wavio


# =======
# globals
# =======

logger = logging.getLogger(__name__)     # logger setup
pa = pyaudio.PyAudio()                   # init portaudio system
sample_queue = multiprocessing.Queue()   # queue shared by producer/consumer


# =============
# configuration
# =============

# device settings
# ---------------
# the first device whose name contains this text will be chosen
# as the Input Device for recording
# audio_device_match = 'HDA Intel PCH: ALC3232 Analog'
audio_device_match = 'HD Pro'

# samples whose amplitude stays between (sample_max * silence_threshold)
# and (sample_max * silence_threshold * -1) are counted as silence,
# where sample_max is 32767 (the largest 16-bit signed PCM value).
silence_threshold = 0.1


# audio settings
# --------------
channels = 1
target_rate = 16000                # mozilla-deepspeech requires a 16000 Hz (16 kHz) samplerate
pyaudio_fmt = pyaudio.paInt16      # sample-width
chunksize_samplerate_ratio = 0.1   # samples will be processed as this fraction-of-a-second

# audio must be silent for this many seconds before a command
# counts as complete, and is processed by the deepspeech model
end_pause_seconds = 1
end_pause_frames = target_rate * end_pause_seconds
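# e.g. with the defaults above, 16000 silent frames (1s at 16 kHz) must
# accumulate before a command is interpreted -- roughly 10 consecutive
# silent chunks, since each 0.1s chunk contributes int(16000 * 0.1) = 1600
# resampled frames.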

# deepspeech settings
# -------------------
deepspeech_models_dir = '/home/will/Downloads/deepspeech-0.5.1-models'


# ===========
# device info
# ===========


def print_input_devices():
    """ Prints all audio devices with their device-indexes.

    Notes:
        To help you determine which audio device you want to record with.
    """
    print('------------')
    for i in range(pa.get_device_count()):
        device_info = pa.get_device_info_by_index(i)
        if device_info['maxInputChannels']:
            print('{name}: in({maxInputChannels})'.format(**device_info))


def get_input_device_index(device_text):
    """ Get device index from part of it's device-name.

    Args:
        device_text (str): ``(ex: 'HDA Intel PCH: ALC3232 Analog')``
            A string with enough of your desired recording device's `name` field text
            to uniquely select it.

    See Also:
        * :py:func:`print_input_devices`

    Returns:
        int:
            index of audio device
    """
    for i in range(pa.get_device_count()):
        device_info = pa.get_device_info_by_index(i)
        if device_text in device_info['name']:
            pprint.pprint(device_info, indent=2)
            return i
    raise RuntimeError('Unable to find an input device matching: {}'.format(device_text))


def get_device_samplerate(device_index):
    """ Gets default samplerate for device.

    Returns:
        int: samplerate in Hz
    """
    device_info = pa.get_device_info_by_index(device_index)
    return int(device_info['defaultSampleRate'])


# =================
# Producer/Consumer
# =================


class RecordAudioThread(threading.Thread):
    """ Thread that records audio, queueing audio samples in chunks.
    """
    def __init__(self, input_device_index, input_samplerate):
        super(RecordAudioThread, self).__init__()
        self._device_index = input_device_index
        self._input_rate = input_samplerate

    def run(self):
        """ Records audio on a loop, dumping each chunk into `sample_queue` to be
        processed somewhere else.
        """
        # record at the device's native rate (commonly 44100 Hz) -- few input
        # devices support recording at 16000 Hz directly
        chunksize = int(self._input_rate * chunksize_samplerate_ratio)
        stream = pa.open(format=pyaudio_fmt,
                         channels=channels,
                         rate=self._input_rate,
                         input=True,
                         frames_per_buffer=chunksize,
                         stream_callback=self._add_wavdata_to_samplequeue)
        try:
            # the callback queues chunks in the background; keep this thread
            # alive (without busy-looping on `start_stream`) while recording
            stream.start_stream()
            while stream.is_active():
                time.sleep(0.1)
        finally:
            stream.stop_stream()
            stream.close()

    def _add_wavdata_to_samplequeue(self, raw_wavdata, frame_count, time_info, status):
        """ Dumps each recorded chunk into sample_queue (to be processed by consumer thread).
        """
        sample_queue.put(raw_wavdata)
        return (raw_wavdata, pyaudio.paContinue)


class AudioConsumerThread(threading.Thread):
    """ Retrieves samples from `sample_queue` , assembling non-silent audio until `silence_threshold` is reached.
    Then interprets audio using deepspeech, and prints it as text.

    Notes:
        this could benefit from being a processpool, to make it more responsive.
    """
    def __init__(self, input_samplerate):
        super(AudioConsumerThread, self).__init__()

        self._input_rate = input_samplerate

        self._model = deepspeech.Model(
            aModelPath='{}/output_graph.pb'.format(deepspeech_models_dir),
            aNCep=client.N_FEATURES,       # 26
            aNContext=client.N_CONTEXT,    # 9
            aAlphabetConfigPath='{}/alphabet.txt'.format(deepspeech_models_dir),
            aBeamWidth=client.BEAM_WIDTH,  # 500
        )

    def run(self):
        """ Run in separate thread, interprets speech.
        """
        command_chunks = []
        pause_frames = 0
        i = 0
        chunks_processed = False
        logger.info('Processing Samples Started')

        while True:
            wavdata = sample_queue.get()
            wav_array = self._wavdata_to_16bit_nparray(wavdata, self._input_rate)

            # if sample is not silent, append to command_chunks, and reset
            # the silence counter (a short mid-sentence pause should not
            # carry over towards `end_pause_frames`)
            if not self._check_wavarray_is_silent(wav_array):
                logger.debug('Sample Contains Sound')
                command_chunks.append(wav_array)
                pause_frames = 0

            # if sample is silent, continue
            else:
                # if no audio recorded to date, ignore and continue
                # processing queue.
                if len(command_chunks) == 0:
                    if chunks_processed is False:
                        logger.debug('First Sample Without Sound, No Text Queued Yet')
                    pause_frames = 0
                    chunks_processed = True
                    continue

                # if audio has been recorded, count frames of silence to
                # determine whether we should interpret the command, or
                # continue listening to samples.
                pause_frames += int(target_rate * chunksize_samplerate_ratio)
                logger.debug('Sample Without Sound (pause-{}/{}) (queuesize-{})'.format(pause_frames, end_pause_frames, sample_queue.qsize()))

                if pause_frames >= end_pause_frames:
                    command_array = numpy.concatenate(command_chunks)
                    logger.debug('Command Details: Num Samples: {}, Total Frames: {}'\
                                 .format(len(command_chunks), command_array.size))

                    self._write_audiosample(command_num=i, wav_array=command_array)
                    spoken_text = self._model.stt(command_array, target_rate)
                    logger.info('Spoken Text: {}'.format(spoken_text))

                    # reset command state, increment command number
                    command_chunks = []
                    pause_frames = 0
                    i += 1

            chunks_processed = True

    def _wavdata_to_16bit_nparray(self, raw_wavdata, input_rate):
        """ Converts raw wavdata recorded at samplerate `input_rate` to
        a 1-dimensional numpy array representing wavedata at 16000Hz (deepspeech requirement).

        Returns:
            numpy.array
        """
        nparray_wavdata = numpy.frombuffer(raw_wavdata, dtype=numpy.int16)
        resampled_nparray_wavdata = resampy.resample(nparray_wavdata, input_rate, target_rate)
        return resampled_nparray_wavdata

    def _check_wavarray_is_silent(self, wav_array):
        """
        Returns:
            bool: True if wavarray is determined to be silent.
        """
        # 16-bit signed int PCM ranges from -32,768 to 32,767
        # https://stackoverflow.com/questions/28632721/does-16bit-integer-pcm-data-mean-its-signed-or-unsigned
        sample_max = 32767

        # silent if no sample rises above/below the threshold
        above_threshold = sample_max * silence_threshold
        below_threshold = sample_max * silence_threshold * -1
        is_sound = (wav_array > above_threshold) | (wav_array < below_threshold)
        return not numpy.any(is_sound)

    def _write_audiosample(self, command_num, wav_array):
        """ Writes wav-array to ``/var/tmp/speechexe/{}.wav`` , to help debug program,
        and possibly in future, train deepspeech model.
        """
        if not os.path.isdir('/var/tmp/speechexe'):
            os.makedirs('/var/tmp/speechexe')
        wavio.write('/var/tmp/speechexe/{}.wav'.format(str(command_num)), wav_array, rate=target_rate)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    def run_listen():
        # choose device, and samplerate
        device_index = get_input_device_index(audio_device_match)
        input_samplerate = get_device_samplerate(device_index)

        # begin recording/interpreting
        producer = RecordAudioThread(device_index, input_samplerate)
        consumer = AudioConsumerThread(input_samplerate)
        consumer.start()
        producer.start()


    # print_input_devices()   # uncomment to list input devices by name
    run_listen()
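
The docstring above mentions that the SpeechRecognition python module could help clean up audio before it reaches the model. Below is a minimal, untested sketch of that idea. One caveat: rather than filtering samples, SpeechRecognition's energy-threshold phrase detection (adjust_for_ambient_noise + listen) would replace this script's hand-rolled silence detection entirely. The calls used (sr.Microphone, adjust_for_ambient_noise, listen, get_raw_data) are real SpeechRecognition APIs, but wiring this into the producer/consumer threads above is left open.

import numpy
import speech_recognition as sr


def listen_and_transcribe(model):
    """ Capture one spoken phrase using SpeechRecognition's energy-based
    phrase detection, then transcribe it with a deepspeech 0.5.x model.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        # calibrate the energy threshold against current background noise
        recognizer.adjust_for_ambient_noise(source, duration=1)
        # blocks until a phrase is heard, returning once silence follows it
        audio = recognizer.listen(source)

    # convert the captured audio to the 16 kHz / 16-bit PCM deepspeech expects
    raw = audio.get_raw_data(convert_rate=16000, convert_width=2)
    wav_array = numpy.frombuffer(raw, dtype=numpy.int16)
    return model.stt(wav_array, 16000)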