Mozilla deepspeech: python recording example
From wikinotes
This example is not complete (too laggy for production), but it's a simple implementation of deepspeech's usage, mic-to-text.
#!/usr/bin/env python
""" Record Audio in one thread, Interpret Audio as text within another thread.
Notes:
This is super CPU-dependent. On my (slower) laptop, the time between sample
recordings inflated, resulting in samples being cut-off strangely (hi there, how are you --> hi).
My guess is that this is because I am using threads instead of independent processes (I
have not tested this guess).
On my (faster) desktop computer, the extracted audio samples perfectly encapsulated
simple sentences -- but there were still issues with lag, and interpretation:
a) the lag was still quite heavy -- process-pools would help here a lot.
If this cannot catch-up to realtime, the lag will continually get worse.
b) the untrained deepspeech model was starting to show its weakness in interpreting text.
* training the model would likely help
* The SpeechRecognition python module has a filter that would
likely help clean up audio files before passing it to the model.
"""
from __future__ import absolute_import, division, print_function
from deepspeech import client
import deepspeech
import logging
import multiprocessing
import numpy
import os
import pprint
import pyaudio
import resampy
import threading
import wave
import wavio
import time
# =======
# globals
# =======
logger = logging.getLogger(__name__) # logger setup
pa = pyaudio.PyAudio() # init portaudio system
sample_queue = multiprocessing.Queue() # queue shared by producer/consumer
# =============
# configuration
# =============
# device settings
# ---------------
# first device containing this text in it's name will be chosen
# as the Input Device for recording
audio_device_match = 'HDA Intel PCH: ALC3232 Analog'
audio_device_match = 'HD Pro'
# audio with volume registered between following will be counted as silence.
# (samplewidth * silence_threshold)
# and (samplewidth * silence_threshold * -1)
silence_threshold = 0.1
# audio settings
# --------------
channels = 1
target_rate = 16000 # mozilla-deepspeech requires samplerate-16000 MHz
pyaudio_fmt = pyaudio.paInt16 # sample-width
chunksize_samplerate_ratio = 0.1 # samples will be processed as this fraction-of-a-second
# seconds audio must be blank for N seconds before a command
# counts as complete, and it is processed by the deepspeech model
end_pause_seconds = 1
end_pause_frames = target_rate * end_pause_seconds
# deepspeech settings
# -------------------
deepspeech_models_dir = '/home/will/Downloads/deepspeech-0.5.1-models'
# ===========
# device info
# ===========
def print_input_devices():
def get_input_device_index(device_text):
""" Get device index from part of it's device-name.
Args:
device_text (str): ``(ex: 'HDA Intel PCM: ALC3232 Analog')``
A string with enough of your desired recording device's `name` field text
to uniquely select it.
See Also:
* :py:func:`print_input_devices`
Returns:
int:
index of audio device
"""
for i in range(pa.get_device_count()):
device_info = pa.get_device_info_by_index(i)
if device_text in device_info['name']:
pprint.pprint(device_info, indent=2)
return i
raise RuntimeError('Unable to find known webcam')
def get_device_samplerate(device_index):
""" Gets default samplerate for device.
Returns:
int: samplerate in Hz
"""
device_info = pa.get_device_info_by_index(device_index)
return int(device_info['defaultSampleRate'])
# =================
# Producer/Consumer
# =================
class RecordAudioThread(threading.Thread):
""" Thread that records audio, queueing audio samples in chunks.
"""
def __init__(self, input_device_index, input_samplerate):
super(RecordAudioThread, self).__init__()
self._device_index = input_device_index
self._input_rate = input_samplerate
def run(self):
""" Records audio on a loop, dumping each chunk into `sample_queue` to be
processed somewhere else.
"""
try:
# record at 44100 MHz (few input devices support recording at 16000 Mhz)
chunksize = int(self._input_rate * chunksize_samplerate_ratio)
stream = pa.open(format=pyaudio_fmt,
channels=channels,
rate=self._input_rate,
input=True,
frames_per_buffer=chunksize,
stream_callback=self._add_wavdata_to_samplequeue)
# record indefinitely
while True:
stream.start_stream()
finally:
stream.stop_stream()
stream.close()
def _add_wavdata_to_samplequeue(self, raw_wavdata, frame_count, time_info, status):
""" Dumps each recorded chunk into sample_queue (to be processed by consumer thread).
"""
sample_queue.put(raw_wavdata)
return (raw_wavdata, pyaudio.paContinue)
class AudioConsumerThread(threading.Thread):
""" Retrieves samples from `sample_queue` , assembling non-silent audio until `silence_threshold` is reached.
Then interprets audio using deepspeech, and prints it as text.
Notes:
this could benefit from being a processpool, to make it more responsive.
"""
def __init__(self, input_samplerate):
super(AudioConsumerThread, self).__init__()
self._input_rate = input_samplerate
self._model = deepspeech.Model(
aModelPath='{}/output_graph.pb'.format(deepspeech_models_dir),
aNCep=client.N_FEATURES, # 26
aNContext=client.N_CONTEXT, # 9
aAlphabetConfigPath='{}/alphabet.txt'.format(deepspeech_models_dir),
aBeamWidth=client.BEAM_WIDTH, # 500
)
def run(self):
""" Run in separate thread, interprets speech.
"""
command_chunks = []
pause_frames = 0
i = 0
chunks_processed = False
logger.info('Processing Samples Started')
while True:
wavdata = sample_queue.get()
wav_array = self._wavdata_to_16bit_nparray(wavdata, self._input_rate)
# if sample is not silent, append to command_chunks
if not self._check_wavarray_is_silent(wav_array):
logger.debug('Sample Contains Sound')
command_chunks.append(wav_array)
# if sample is silent, continue
else:
# if no audio recorded to date, ignore and continue
# processing queue.
if len(command_chunks) == 0:
if chunks_processed is False:
logger.debug('First Sample Without Sound, No Text Queued Yet')
pause_frames = 0
chunks_processed = True
continue
# if audio has been recorded, count seconds of silence to
# determine we should interpret the command, or continue
# listening to samples.
pause_frames += int(target_rate * chunksize_samplerate_ratio)
logger.debug('Sample Without Sound (pause-{}/{}) (queuesize-{})'.format(pause_frames, end_pause_frames, sample_queue.qsize()))
if pause_frames >= end_pause_frames:
command_array = numpy.concatenate(command_chunks)
logger.debug('Command Details: Num Samples: {}, Total Frames: {}'\
.format(len(command_chunks), command_array.size))
self._write_audiosample(command_num=i, wav_array=command_array)
spoken_text = self._model.stt(command_array, target_rate)
logger.info('Spoken Text: {}'.format(spoken_text))
# reset command, increment command no#
command_chunks = []
i += 1
chunks_processed = True
def _wavdata_to_16bit_nparray(self, raw_wavdata, input_rate):
""" Converts raw wavdata recorded at samplerate `input_rate` to
a 1-dimensional numpy array representing wavedata at 16000Hz (deepspeech requirement).
Returns:
numpy.array
"""
nparray_wavdata = numpy.frombuffer(raw_wavdata, dtype=numpy.int16)
resampled_nparray_wavdata = resampy.resample(nparray_wavdata, input_rate, target_rate)
return resampled_nparray_wavdata
def _check_wavarray_is_silent(self, wav_array):
"""
Returns:
bool: True if wavarray is determined to be silent.
"""
# 16bit int (unsigned) (-32,768 to 32,767)
# https://stackoverflow.com/questions/28632721/does-16bit-integer-pcm-data-mean-its-signed-or-unsigned
sample_max = 32767
# skip processing if sample has no sound above/below a specific threshold
above_threshold = sample_max * silence_threshold
below_threshold = sample_max * silence_threshold * -1
wav_array_wo_silence = numpy.array(list(filter(
lambda x: x > above_threshold or x < below_threshold,
wav_array
)))
return wav_array_wo_silence.size == 0
def _write_audiosample(self, command_num, wav_array):
""" Writes wav-array to ``/var/tmp/speechexe/{}.wav`` , to help debug program,
and possibly in future, train deepspeech model.
"""
if not os.path.isdir('/var/tmp/speechexe'):
os.makedirs('/var/tmp/speechexe')
wavio.write('/var/tmp/speechexe/{}.wav'.format(str(command_num)), wav_array, rate=target_rate)
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
def run_listen():
# choose device, and samplerate
device_index = get_input_device_index(audio_device_match)
input_samplerate = get_device_samplerate(device_index)
# begin recording/interpreting
producer = RecordAudioThread(device_index, input_samplerate)
consumer = AudioConsumerThread(input_samplerate)
consumer.start()
producer.start()
#print_input_devices()
run_listen()