diff --git a/programs/speechRecognition/speechRecognition.py b/programs/speechRecognition/speechRecognition.py
index 5c41060..b340c27 100644
--- a/programs/speechRecognition/speechRecognition.py
+++ b/programs/speechRecognition/speechRecognition.py
@@ -1,243 +1,380 @@
 #!/usr/bin/env python3
-##
-# @ingroup speech-programs
-# \defgroup speechRecognition speechRecognition.py
-# @brief Provides basic ASR capabilities.
-#
-# @section speechRecognition_legal Legal
-#
-# Copyright: 2016-present (c) edits by Santiago Morante, Juan G Victores and Raul de Santos; past-2008 (c) Carnegie Mellon University.
-#
-# CopyPolicy: You may modify and redistribute this file under the same terms as the CMU Sphinx system. See
-# http://cmusphinx.sourceforge.net/html/LICENSE for more information.
-#
-# @section speechRecognition_running Running (assuming correct installation)
-#
-# First we must run a YARP name server if it is not running in our current namespace:
-#
-# \verbatim
-# [on terminal 1] yarp server
-# \endverbatim
-#
-# Now launch the program:
-#
-# \verbatim
-# [on terminal 2] speechRecognition
-# \endverbatim
-#
-# You can launch a 3rd terminal to read what is published via YARP port:
-#
-# \verbatim
-# [on terminal 3] yarp read ... /speechRecognition:o
-# \endverbatim
-#
-# or you can send commands to configure the speech recognition app. To know how to do that, use "help":
-# \verbatim
-# [on terminal 4] yarp rpc /speechRecognition/rpc:s
-# [on terminal 4] >>help
-# \endverbatim
-#
-# For instance, you can load the 20k words Spanish dictionary with:
-# \verbatim
-# [on terminal 4] >>setDictionary 20k es
-# \endverbatim
-#
-# @section speechRecognition_troubleshooting Troubleshooting
-#
-# 1. gst-inspect-1.0 pocketsphinx
-#
-# 2. Check operating system sound settings
-#
-# 3. Check hardware such as cables, and physical volume controls and on/off switches
-
-import gi
-
-gi.require_version('Gst', '1.0')
-from gi.repository import GObject, Gst
-GObject.threads_init()
-Gst.init(None)
-
-gst = Gst
-
-from gi import pygtkcompat
-print('Using pygtkcompat and Gst from gi')
-
-pygtkcompat.enable()
-pygtkcompat.enable_gtk(version='3.0')
+# ##
+# # @ingroup speech-programs
+# # \defgroup speechRecognition speechRecognition.py
+# # @brief Provides basic ASR capabilities.
+# #
+# # @section speechRecognition_legal Legal
+# #
+# # Copyright: 2016-present (c) edits by Santiago Morante, Juan G Victores and Raul de Santos; past-2008 (c) Carnegie Mellon University.
+# #
+# # CopyPolicy: You may modify and redistribute this file under the same terms as the CMU Sphinx system. See
+# # http://cmusphinx.sourceforge.net/html/LICENSE for more information.
+# #
+# # @section speechRecognition_running Running (assuming correct installation)
+# #
+# # First we must run a YARP name server if it is not running in our current namespace:
+# #
+# # \verbatim
+# # [on terminal 1] yarp server
+# # \endverbatim
+# #
+# # Now launch the program:
+# #
+# # \verbatim
+# # [on terminal 2] speechRecognition
+# # \endverbatim
+# #
+# # You can launch a 3rd terminal to read what is published via YARP port:
+# #
+# # \verbatim
+# # [on terminal 3] yarp read ... /speechRecognition:o
+# # \endverbatim
+# #
+# # or you can send commands to configure the speech recognition app. To know how to do that, use "help":
+# # \verbatim
+# # [on terminal 4] yarp rpc /speechRecognition/rpc:s
+# # [on terminal 4] >>help
+# # \endverbatim
+# #
+# # For instance, you can load the 20k words Spanish dictionary with:
+# # \verbatim
+# # [on terminal 4] >>setDictionary 20k es
+# # \endverbatim
+# #
+# # @section speechRecognition_troubleshooting Troubleshooting
+# #
+# # 1. gst-inspect-1.0 pocketsphinx
+# #
+# # 2. Check operating system sound settings
+# #
+# # 3. Check hardware such as cables, and physical volume controls and on/off switches
+
+# import gi
+
+# gi.require_version('Gst', '1.0')
+# from gi.repository import GObject, Gst
+# GObject.threads_init()
+# Gst.init(None)
+
+# gst = Gst
+
+# from gi import pygtkcompat
+# print('Using pygtkcompat and Gst from gi')
+
+# pygtkcompat.enable()
+# pygtkcompat.enable_gtk(version='3.0')
+
+# import yarp
+# import os.path
+
+# # For alsaaudio
+# import sys
+# import alsaaudio
+
+# import roboticslab_speech
+
+
+# ##
+# # @ingroup speechRecognition
+# # @brief Speech Recognition callback class
+# class SpeechRecognitionResponder(roboticslab_speech.SpeechRecognitionIDL):
+#     def __init__(self, owner):
+#         super().__init__()
+#         self.owner = owner
+#         self.mixer = None
+
+#     def setDictionary(self, dictionary, language):
+#         print('Setting dictionary to %s (language: %s)' % (dictionary, language))
+
+#         lm_path = 'dictionary/%s-%s.lm' % (dictionary, language)
+#         dic_path = 'dictionary/%s-%s.dic' % (dictionary, language)
+#         model_path = 'model/' + language
+
+#         if not self.owner.setDictionary(lm_path, dic_path, model_path):
+#             print('Unable to set dictionary')
+#             return False
+
+#         return True
+
+#     def muteMicrophone(self):
+#         print('Muting microphone')
+
+#         if not self._initMixer():
+#             return False
+
+#         self.mixer.setrec(0, alsaaudio.MIXER_CHANNEL_ALL)
+#         return True
+
+#     def unmuteMicrophone(self):
+#         print('Unmuting microphone')
+
+#         if not self._initMixer():
+#             return False
+
+#         self.mixer.setrec(1, alsaaudio.MIXER_CHANNEL_ALL)
+#         return True
+
+#     def _initMixer(self):
+#         if self.mixer is None:
+#             try:
+#                 self.mixer = alsaaudio.Mixer('Capture') # Capture / Master
+#             except alsaaudio.ALSAAudioError:
+#                 print('No such mixer: %s' % sys.stderr)
+#                 return False
+
+#         return True
+
+
+# ##
+# # @ingroup speechRecognition
+# # @brief Speech Recognition main class.
+# class SpeechRecognition(object):
+#     """Based on GStreamer/PocketSphinx Demo Application"""
+
+#     def __init__(self, rf):
+#         """Initialize a SpeechRecognition object"""
+#         self.rf = rf
+#         self.my_lm = self.rf.findFileByName('dictionary/follow-me-en-us.lm')
+#         self.my_dic = self.rf.findFileByName('dictionary/follow-me-en-us.dic')
+#         self.my_model = self.rf.findPath('model/en-us/')
+#         self.outPort = yarp.BufferedPortBottle()
+#         self.configPort = yarp.RpcServer()
+#         self.dataProcessor = SpeechRecognitionResponder(self)
+#         self.outPort.open('/speechRecognition:o')
+#         self.configPort.open('/speechRecognition/rpc:s')
+#         self.dataProcessor.yarp().attachAsServer(self.configPort)
+#         self.init_gst()
 
-import yarp
-import os.path
+#     def init_gst(self):
+#         """Initialize the speech components"""
+#         #self.pipeline = gst.parse_launch('gconfaudiosrc ! audioconvert ! audioresample '
+#         #                                 + '! vader name=vad auto-threshold=true '
+#         #                                 + '! pocketsphinx name=asr ! fakesink')
 
-# For alsaaudio
-import sys
-import alsaaudio
+#         """ Configuring the decoder and improving accuracy """
+#         self.pipeline = gst.parse_launch('autoaudiosrc ! audioconvert ! audioresample '
+#                                          + '! pocketsphinx name=asr beam=1e-20 ! fakesink')
+#         asr = self.pipeline.get_by_name('asr')
+#         # asr.connect('result', self.asr_result) (it's not running with Gstreamer 1.0)
+#         asr.set_property('lm', self.my_lm )
+#         asr.set_property('dict', self.my_dic )
+#         asr.set_property('hmm', self.my_model )
+#         #asr.set_property('configured', "true")
 
-import roboticslab_speech
+#         bus = self.pipeline.get_bus()
+#         bus.add_signal_watch()
+#         bus.connect('message::element', self.element_message) # new
+
+#         self.pipeline.set_state(gst.State.PLAYING)
 
-##
-# @ingroup speechRecognition
-# @brief Speech Recognition callback class
-class SpeechRecognitionResponder(roboticslab_speech.SpeechRecognitionIDL):
-    def __init__(self, owner):
-        super().__init__()
-        self.owner = owner
-        self.mixer = None
 
+#     def element_message(self, bus, msg):
+#         """Receive element messages from the bus."""
+#         print('---')
+#         msgtype = msg.get_structure().get_name()
+#         print(msgtype) # pocketsphinx
 
-    def setDictionary(self, dictionary, language):
-        print('Setting dictionary to %s (language: %s)' % (dictionary, language))
 
+#         if msgtype != 'pocketsphinx':
+#             return
 
-        lm_path = 'dictionary/%s-%s.lm' % (dictionary, language)
-        dic_path = 'dictionary/%s-%s.dic' % (dictionary, language)
-        model_path = 'model/' + language
 
+#         print("hypothesis= '%s' confidence=%s final=%s\n" % (
+#             msg.get_structure().get_value('hypothesis'),
+#             msg.get_structure().get_value('confidence'),
+#             msg.get_structure().get_value('final')
+#         ))
 
-        if not self.owner.setDictionary(lm_path, dic_path, model_path):
-            print('Unable to set dictionary')
-            return False
 
+#         if msg.get_structure().get_value('final') is True:
+#             text = msg.get_structure().get_value('hypothesis')
 
-        return True
 
+#             if text != '':
+#                 print(text.lower())
+#                 b = self.outPort.prepare()
+#                 b.clear()
+#                 b.addString(text.lower())
+#                 self.outPort.write()
 
-    def muteMicrophone(self):
-        print('Muting microphone')
 
+#     def setDictionary(self, lm, dic, hmm):
+#         lm = self.rf.findFileByName(lm)
+#         dic = self.rf.findFileByName(dic)
+#         model = self.rf.findFileByName(hmm)
 
-        if not self._initMixer():
-            return False
 
+#         if lm == '' or dic == '' or model == '':
+#             return False
 
-        self.mixer.setrec(0, alsaaudio.MIXER_CHANNEL_ALL)
-        return True
 
+#         self.my_lm = lm
+#         self.my_dic = dic
+#         self.my_model = model
 
-    def unmuteMicrophone(self):
-        print('Unmuting microphone')
 
+#         self.pipeline.set_state(gst.State.NULL)
+#         self.pipeline = gst.parse_launch('autoaudiosrc ! audioconvert ! audioresample '
+#                                          + '! pocketsphinx name=asr beam=1e-20 ! fakesink')
 
-        if not self._initMixer():
-            return False
+#         asr = self.pipeline.get_by_name('asr')
+#         asr.set_property('lm', self.my_lm)
+#         asr.set_property('dict', self.my_dic)
+#         asr.set_property('hmm', self.my_model )
+
+#         print('-------------------------------')
+#         print('Dictionary changed successfully (%s) (%s) (%s)' % (self.my_lm, self.my_dic, self.my_model))
+
+#         bus = self.pipeline.get_bus()
+#         bus.add_signal_watch()
+#         bus.connect('message::element', self.element_message)
 
-        self.mixer.setrec(1, alsaaudio.MIXER_CHANNEL_ALL)
-        return True
+#         self.pipeline.set_state(gst.State.PLAYING)
+#         return True
 
-    def _initMixer(self):
-        if self.mixer is None:
-            try:
-                self.mixer = alsaaudio.Mixer('Capture') # Capture / Master
-            except alsaaudio.ALSAAudioError:
-                print('No such mixer: %s' % sys.stderr)
-                return False
-
-        return True
+
+# rf = yarp.ResourceFinder()
+# rf.setDefaultContext('speechRecognition')
+# rf.setDefaultConfigFile('speechRecognition.ini')
+
+# yarp.Network.init()
+
+# if not yarp.Network.checkNetwork():
+#     print('[asr] error: found no yarp network (try running "yarpserver &")')
+#     raise SystemExit
+
+# app = SpeechRecognition(rf)
+
+# # enter into a mainloop
+# loop = GObject.MainLoop()
+# loop.run()
+
+
+
+
+# borrowed from https://github.com/alphacep/vosk-api/blob/12f29a3/python/example/test_microphone.py
+# and https://github.com/cmusphinx/pocketsphinx/blob/b16c631/examples/live.py
+
+import argparse
+import queue
+import roboticslab_speech
+import sounddevice as sd
+import yarp
+
+from pocketsphinx import Endpointer, Decoder, set_loglevel
+
+
+class PocketSphinxSpeechRecognitionResponder(roboticslab_speech.SpeechRecognitionIDL):
+    def __init__(self, rf, device, model, samplerate, stream):
+        super().__init__()
+
+        if samplerate is None:
+            device_info = sd.query_devices(device, 'input')
+            # sounddevice reports a float, but an int sample rate is expected here:
+            self.samplerate = int(device_info['default_samplerate'])
+        else:
+            self.samplerate = samplerate
+
+        self.ep = Endpointer(sample_rate=self.samplerate)
+        self.decoder = Decoder(samprate=self.ep.sample_rate)
+        self.stream = stream
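+
+    # Illustrative sketch (not wired in): the stream block size hardcoded further
+    # below could instead be derived from the endpointer, assuming the Endpointer
+    # is constructed before the audio stream is opened:
+    #
+    #     ep = Endpointer(sample_rate=16000)
+    #     blocksize = int(ep.frame_bytes / 2)  # int16 mono: two bytes per sample
+    #     sd.RawInputStream(samplerate=16000, blocksize=blocksize, dtype='int16', channels=1, ...)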
 
-##
-# @ingroup speechRecognition
-# @brief Speech Recognition main class.
-class SpeechRecognition(object):
-    """Based on GStreamer/PocketSphinx Demo Application"""
-
-    def __init__(self, rf):
-        """Initialize a SpeechRecognition object"""
-        self.rf = rf
-        self.my_lm = self.rf.findFileByName('dictionary/follow-me-en-us.lm')
-        self.my_dic = self.rf.findFileByName('dictionary/follow-me-en-us.dic')
-        self.my_model = self.rf.findPath('model/en-us/')
-        self.outPort = yarp.BufferedPortBottle()
-        self.configPort = yarp.RpcServer()
-        self.dataProcessor = SpeechRecognitionResponder(self)
-        self.outPort.open('/speechRecognition:o')
-        self.configPort.open('/speechRecognition/rpc:s')
-        self.dataProcessor.yarp().attachAsServer(self.configPort)
-        self.init_gst()
-
-    def init_gst(self):
-        """Initialize the speech components"""
-        #self.pipeline = gst.parse_launch('gconfaudiosrc ! audioconvert ! audioresample '
-        #                                 + '! vader name=vad auto-threshold=true '
-        #                                 + '! pocketsphinx name=asr ! fakesink')
-
-        """ Configuring the decoder and improving accuracy """
-        self.pipeline = gst.parse_launch('autoaudiosrc ! audioconvert ! audioresample '
-                                         + '! pocketsphinx name=asr beam=1e-20 ! fakesink')
-        asr = self.pipeline.get_by_name('asr')
-        # asr.connect('result', self.asr_result) (it's not running with Gstreamer 1.0)
-        asr.set_property('lm', self.my_lm )
-        asr.set_property('dict', self.my_dic )
-        asr.set_property('hmm', self.my_model )
-        #asr.set_property('configured', "true")
-
-        bus = self.pipeline.get_bus()
-        bus.add_signal_watch()
-        bus.connect('message::element', self.element_message) # new
-
-        self.pipeline.set_state(gst.State.PLAYING)
-
-    def element_message(self, bus, msg):
-        """Receive element messages from the bus."""
-        print('---')
-        msgtype = msg.get_structure().get_name()
-        print(msgtype) # pocketsphinx
-
-        if msgtype != 'pocketsphinx':
-            return
-
-        print("hypothesis= '%s' confidence=%s final=%s\n" % (
-            msg.get_structure().get_value('hypothesis'),
-            msg.get_structure().get_value('confidence'),
-            msg.get_structure().get_value('final')
-        ))
-
-        if msg.get_structure().get_value('final') is True:
-            text = msg.get_structure().get_value('hypothesis')
-
-            if text != '':
-                print(text.lower())
-                b = self.outPort.prepare()
-                b.clear()
-                b.addString(text.lower())
-                self.outPort.write()
-
-    def setDictionary(self, lm, dic, hmm):
-        lm = self.rf.findFileByName(lm)
-        dic = self.rf.findFileByName(dic)
-        model = self.rf.findFileByName(hmm)
-
-        if lm == '' or dic == '' or model == '':
-            return False
-
-        self.my_lm = lm
-        self.my_dic = dic
-        self.my_model = model
-
-        self.pipeline.set_state(gst.State.NULL)
-        self.pipeline = gst.parse_launch('autoaudiosrc ! audioconvert ! audioresample '
-                                         + '! pocketsphinx name=asr beam=1e-20 ! fakesink')
-
-        asr = self.pipeline.get_by_name('asr')
-        asr.set_property('lm', self.my_lm)
-        asr.set_property('dict', self.my_dic)
-        asr.set_property('hmm', self.my_model )
-
-        print('-------------------------------')
-        print('Dictionary changed successfully (%s) (%s) (%s)' % (self.my_lm, self.my_dic, self.my_model))
-
-        bus = self.pipeline.get_bus()
-        bus.add_signal_watch()
-        bus.connect('message::element', self.element_message)
-
-        self.pipeline.set_state(gst.State.PLAYING)
+    def setDictionary(self, dictionary, language):
+        # TODO: dictionary switching is not implemented in this PocketSphinx backend yet
+        return False
+
+    def muteMicrophone(self):
+        print('Muting microphone')
+        self.stream.abort()
         return True
 
+    def unmuteMicrophone(self):
+        print('Unmuting microphone')
+        self.stream.start()
+        return True
 
-rf = yarp.ResourceFinder()
-rf.setDefaultContext('speechRecognition')
-rf.setDefaultConfigFile('speechRecognition.ini')
+    def transcribe(self, frame):
+        prev_in_speech = self.ep.in_speech
+        speech = self.ep.process(frame)
+
+        if speech is not None:
+            if not prev_in_speech:
+                self.decoder.start_utt()
+
+            self.decoder.process_raw(speech)
+
+            if not self.ep.in_speech:
+                # end of utterance: always close it, even if no hypothesis was produced
+                self.decoder.end_utt()
+                hyp = self.decoder.hyp()
+                return False, hyp.hypstr if hyp is not None else None
+
+            hyp = self.decoder.hyp()
+
+            if hyp is not None:
+                # partial result so far
+                return True, hyp.hypstr
+
+        return False, None
+
+def int_or_str(text):
+    """Helper function for argument parsing."""
+    try:
+        return int(text)
+    except ValueError:
+        return text
+
+parser = argparse.ArgumentParser(description='ASR service using PocketSphinx and YARP', add_help=False)
+parser.add_argument('--list-devices', '-l', action='store_true', help='list available audio devices and exit')
+args, remaining = parser.parse_known_args()
+
+if args.list_devices:
+    print(sd.query_devices())
+    raise SystemExit
+
+parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, parents=[parser])
+parser.add_argument('--device', '-d', type=int_or_str, help='input device (numeric ID or substring)')
+parser.add_argument('--samplerate', '-r', type=int, help='sampling rate')
+parser.add_argument('--model', '-m', type=str, help='language model, e.g. small-es-0.42')
+args = parser.parse_args(remaining)
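+
+# Typical invocations (illustrative only; device IDs and sample rates are machine-specific):
+#
+#     speechRecognition.py --list-devices
+#     speechRecognition.py --device 2 --samplerate 16000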
 
 yarp.Network.init()
 
 if not yarp.Network.checkNetwork():
-    print('[asr] error: found no yarp network (try running "yarpserver &")')
+    print('no YARP network available (try running "yarpserver &")')
+    raise SystemExit
+
+asrPort = yarp.BufferedPortBottle()
+configPort = yarp.RpcServer()
+
+if not asrPort.open('/speechRecognition:o'):
+    print('Unable to open output port')
     raise SystemExit
 
-app = SpeechRecognition(rf)
+if not configPort.open('/speechRecognition/rpc:s'):
+    print('Unable to open RPC port')
+    raise SystemExit
+
+rf = yarp.ResourceFinder()
+rf.setDefaultContext('speechRecognition')
+rf.setDefaultConfigFile('speechRecognition.ini')
 
-# enter into a mainloop
-loop = GObject.MainLoop()
-loop.run()
+try:
+    q = queue.Queue()
+
+    with sd.RawInputStream(samplerate=args.samplerate,
+                           #blocksize=int(dataProcessor.ep.frame_bytes / 2), # each frame is 2 bytes wide
+                           blocksize=int(2880 / 2), # FIXME: hardcoded
+                           device=args.device,
+                           dtype='int16',
+                           channels=1,
+                           callback=lambda indata, frames, time, status: q.put(bytes(indata))) as stream:
+        dataProcessor = PocketSphinxSpeechRecognitionResponder(rf, args.device, args.model, args.samplerate, stream)
+        dataProcessor.yarp().attachAsServer(configPort)
+
+        while True:
+            frame = q.get()
+            isPartial, transcription = dataProcessor.transcribe(frame)
+
+            if transcription:
+                if not isPartial:
+                    print('result: %s' % transcription)
+                    b = asrPort.prepare()
+                    b.clear()
+                    b.addString(transcription)
+                    asrPort.write()
+                else:
+                    print('partial: %s' % transcription)
+except KeyboardInterrupt:
+    asrPort.interrupt()
+    asrPort.close()
+    configPort.interrupt()
+    configPort.close()
+    parser.exit(0)
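+
+# Note: only final hypotheses are published to /speechRecognition:o; partial results
+# are just printed locally. A quick smoke test from other terminals (hypothetical session):
+#
+#     yarp read ... /speechRecognition:o
+#     yarp rpc /speechRecognition/rpc:s   (then type: help)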