forked from zaf/asterisk-speech-recog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
speech-recog.agi
executable file
·306 lines (265 loc) · 8.59 KB
/
speech-recog.agi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/env perl
#
# AGI script that renders speech to text using Google's Cloud Speech API.
#
# Copyright (C) 2011 - 2016, Lefteris Zafiris <[email protected]>
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
# -----
# Usage
# -----
# agi(speech-recog.agi,[lang],[timeout],[intkey],[NOBEEP],[rtimeout],[speechContexts])
# Records from the current channel until 2 seconds of silence are detected
# (this can be set by the user by the 'timeout' argument, -1 for no timeout) or the
# interrupt key (# by default) is pressed. If NOBEEP is set, no beep sound is played
# back to the user to indicate the start of the recording. If 'rtimeout' is set,
# it overrides the absolute recording timeout. 'speechContexts' provides hints to
# favor specific words and phrases in the results. Usage: [Agamemnon,Midas]
# The recorded sound is sent over to Google speech recognition service and the
# returned text string is assigned as the value of the channel variable 'utterance'.
# The script sets the following channel variables:
# utterance : The generated text string.
# confidence : A value between 0 and 1 indicating how 'confident' the recognition engine
# feels about the result. Values bigger than 0.95 usually mean that the
# resulted text is correct.
#
# User defined parameters:
# Speech API key from Google:
# $key
#
# Default language:
# $language
#
# Default timeout:
# $timeout (value in seconds of silence before recording is stopped)
#
# Default interrupt key:
# $intkey (can be any digit from 0 to 9 or # and *, or a combination of them)
#
# Sample rate:
# $samplerate (value in Hz. 0 for automatic detection per channel/call, 16000 for
# use with wideband codecs, 8000 for traditional codecs.)
#
# Profanity filter:
# $pro_filter ('false':disable, 'true': remove profanities)
#
use warnings;
use strict;
use File::Copy qw(move);
use File::Temp qw(tempfile);
use LWP::UserAgent;
use JSON;
use Encode qw(encode);
use MIME::Base64;
# Flush STDOUT after every write; AGI commands are printed to STDOUT below. #
$| = 1;

# ----------------------------- #
#   User defined parameters:    #
# ----------------------------- #
# Speech API key                #
my $key = "";

# Default language              #
my $language = "en-US";

# Default max silence timeout   #
my $timeout = 2;

# Absolute Recording timeout    #
my $abs_timeout = -1;

# Default interrupt key         #
my $intkey = "#";

# Input audio sample rate       #
# Leave blank to auto-detect    #
my $samplerate = "";

# Profanity filter              #
my $pro_filter = "false";

# Verbose debugging messages    #
my $debug = 0;

# ----------------------------- #
# Internal state; not meant to be edited by the user. #
my %AGI;                      # AGI arguments (arg_N) and agi_* environment values
my $format;                   # Asterisk recording format name (sln, sln16, ...)
my @result;                   # last (result, data) pair returned by checkresponse()
my $silence;                  # "s=<seconds>" option for RECORD FILE, or "" for none
my $results    = 1;           # NOTE(review): not referenced in the visible script
my $beep       = "BEEP";      # cleared when the caller passes NOBEEP
my $comp_level = -8;          # passed to flac as its first command line switch
my $ua_timeout = 30;          # HTTP timeout handed to LWP::UserAgent
my $tmpdir     = "/tmp";      # directory for the temporary recording
my $url        = "https://speech.googleapis.com/v1/speech";
my $phrases    = "";          # raw speech context argument

# Parsed speech context hints. This must start out EMPTY: the previous
# initialiser "= []" stored one array reference as the first element, which
# ended up in the request as a bogus hint whenever no hints were supplied.
my @phrases = ();
# Store AGI input #
# The first six command line arguments become arg_1 .. arg_6; arguments that
# were not supplied are stored as undef.
@AGI{qw(arg_1 arg_2 arg_3 arg_4 arg_5 arg_6)} = @ARGV;

# Read "agi_<key>: <value>" lines from STDIN until the first empty line and
# store each value under <key>.
while (my $line = <STDIN>) {
    chomp($line);
    last if ($line eq "");
    if ($line =~ /^agi_(\w+)\:\s+(.*)$/) {
        $AGI{$1} = $2;
    }
}
# Prefix for every diagnostic message, built from the agi_request value. #
my $name = " -- $AGI{request}:";

# Reset variables. #
# Both channel variables are set to -1 before any work is done.
if ($debug) {
    warn "$name Clearing channel variables.\n";
}
my %response = (
    utterance  => -1,
    confidence => -1,
);
set_channel_vars(%response);

# Abort if key is missing or required programs not found. #
unless ($key) {
    # Report the problem through Asterisk's VERBOSE command before dying.
    print "VERBOSE \"API key is missing. Aborting.\" 3\n";
    checkresponse();
    die "$name API key is missing. Aborting.\n";
}

# Locate the flac encoder; empty output from which(1) is treated as "missing".
my $flac = `/usr/bin/which flac`;
unless ($flac) {
    die "$name flac is missing. Aborting.\n";
}
chomp($flac);
if ($debug) {
    warn "$name Found flac in: $flac\n";
}
# Setting language, timeout, interrupt keys and BEEP indication #
# Every argument is optional. A value that fails validation is ignored and the
# configured default stays in effect. Patterns are anchored with \A and \z so a
# trailing newline can never leak into an AGI command line.

# arg_1: language code such as "en" or "en-US".
if (defined $AGI{arg_1} && $AGI{arg_1} =~ /\A[a-z]{2}(-[a-zA-Z]{2,6})?\z/) {
    $language = $AGI{arg_1};
}

# arg_2: seconds of silence that end the recording, -1 to disable detection.
# Compared as text so a non-numeric value no longer triggers an
# "isn't numeric" warning from the == operator.
$silence = "s=$timeout";
if (defined $AGI{arg_2} && length($AGI{arg_2})) {
    if ($AGI{arg_2} =~ /\A\s*-1\s*\z/) {
        $silence = "";
    }
    elsif ($AGI{arg_2} =~ /\A\d+\z/) {
        $silence = "s=$AGI{arg_2}";
    }
}

# arg_3: interrupt digits, or the word "any" for every key.
if (defined $AGI{arg_3} && length($AGI{arg_3})) {
    if ($AGI{arg_3} eq "any") {
        $intkey = "0123456789#*";
    }
    elsif ($AGI{arg_3} =~ /\A[0-9*#]+\z/) {
        $intkey = $AGI{arg_3};
    }
}

# arg_4: the literal NOBEEP suppresses the start-of-recording beep.
if (defined $AGI{arg_4} && $AGI{arg_4} eq "NOBEEP") {
    $beep = "";
}

# arg_5: absolute recording timeout. The value is interpolated into the
# RECORD FILE command, so only a plain (optionally negative) integer is
# accepted; anything else would corrupt the command line.
if (defined $AGI{arg_5} && length($AGI{arg_5})) {
    if ($AGI{arg_5} =~ /\A-?\d+\z/) {
        $abs_timeout = $AGI{arg_5};
    }
    else {
        warn "$name Ignoring invalid recording timeout: $AGI{arg_5}\n";
    }
}

# arg_6: speech context hints, written as [phrase one,phrase two].
# The brackets are removed only when they are really there (the old
# substr(...,1,-1) silently ate the first and last character otherwise),
# surrounding blanks are trimmed and empty entries are dropped.
if (defined $AGI{arg_6} && length($AGI{arg_6})) {
    $phrases = $AGI{arg_6};
    $phrases =~ s/\A\s*\[//;
    $phrases =~ s/\]\s*\z//;
    @phrases = ();
    foreach my $hint (split(',', $phrases)) {
        $hint =~ s/\A\s+//;
        $hint =~ s/\s+\z//;
        push(@phrases, $hint) if (length($hint));
    }
}
# Answer channel if not already answered #
# Ask Asterisk for the channel status and send ANSWER only when the reported
# status is 4.
# NOTE(review): 4 is presumably the "line is ringing" state - confirm against
# the Asterisk AGI documentation.
if ($debug) {
    warn "$name Checking channel status.\n";
}
print "CHANNEL STATUS\n";
@result = checkresponse();
if ($result[0] == 4) {
    if ($debug) {
        warn "$name Answering channel.\n";
    }
    print "ANSWER\n";
    @result = checkresponse();
    # Any result other than 0 is treated as a failed ANSWER.
    die "$name Failed to answer channel.\n" unless ($result[0] == 0);
}
# Setting recording file format according to sample rate. #
# An unset (false) sample rate is detected from the channel; a configured one
# is looked up in the table below, and any rate not listed there falls back to
# plain "sln" at 8000 Hz.
if (!$samplerate) {
    ($format, $samplerate) = detect_format();
}
else {
    my @known_rates = (
        [12000, "sln12"],
        [16000, "sln16"],
        [32000, "sln32"],
        [44100, "sln44"],
        [48000, "sln48"],
    );
    my $matched;
    foreach my $entry (@known_rates) {
        next unless ($samplerate == $entry->[0]);
        $matched = $entry->[1];
        last;
    }
    if (defined $matched) {
        $format = $matched;
    }
    else {
        ($format, $samplerate) = ("sln", 8000);
    }
}
# Initialize User agent #
# HTTPS client with hostname verification switched on, proxy settings taken
# from the environment and the configured request timeout.
my %ssl_options = (verify_hostname => 1);
my $ua = LWP::UserAgent->new(ssl_opts => \%ssl_options);
$ua->agent("Asterisk AGI speech recognition script");
$ua->env_proxy;
$ua->timeout($ua_timeout);

# Handle interrupts #
# INT and HUP both end the script through int_handler.
foreach my $signal ('INT', 'HUP') {
    $SIG{$signal} = \&int_handler;
}
# Record file #
# tempfile() reserves a unique base name in $tmpdir; the RECORD FILE command is
# given that base name together with the format, interrupt keys, absolute
# timeout, beep flag and silence option.
my ($fh, $tmpname) = tempfile("stt_XXXXXX", DIR => $tmpdir, UNLINK => 1);
my $record_command = "RECORD FILE $tmpname $format \"$intkey\" \"$abs_timeout\" $beep \"$silence\"\n";
print $record_command;
@result = checkresponse();
if ($result[0] == -1) {
    die "$name Failed to record file, aborting...\n";
}
warn "$name Recording Format: $format, Rate: $samplerate Hz, ",
    "Language: $language, ", "$silence, Interrupt keys: $intkey\n"
    if ($debug);
# Encode audio data to flac #
# Byte order probe: the native short 1 is packed and dumped as hex digits, low
# nybble first. That gives "1000" when the low byte comes first and "0010"
# when it comes last, so seeing "01" means a big-endian host.
my $endian = "little";
if (unpack("h*", pack("s", 1)) =~ /01/) {
    $endian = "big";
}

# flac is run in list form (no shell) on the raw recording.
my @flac_command = (
    $flac, $comp_level,
    "--totally-silent",
    "--channels=1",
    "--endian=$endian",
    "--sign=signed",
    "--bps=16",
    "--force-raw-format",
    "--sample-rate=$samplerate",
    "$tmpname.$format",
);
if (system(@flac_command) != 0) {
    die "$name $flac failed: $?\n";
}

# Slurp the encoded file in one read; $fh is re-used for it.
open($fh, "<", "$tmpname.flac") or die "Can't read file: $!";
my $audio;
{
    local $/;
    $audio = <$fh>;
}
close($fh);
# Build the recognition request #
# Only real, non-empty hint strings are forwarded. The hints arrive as raw
# bytes from the command line while encode_json() expects character strings,
# so they are decoded first; otherwise non-ASCII phrases would be encoded twice.
my @hints = map { Encode::decode('UTF-8', $_) }
    grep { defined($_) && !ref($_) && length($_) } @phrases;

my %config = (
    "encoding"        => "FLAC",
    # Adding 0 makes the encoder emit a JSON number rather than a string.
    "sampleRateHertz" => $samplerate + 0,
    "languageCode"    => $language,
    # \1 and \0 are serialised as the JSON booleans true and false; the
    # setting itself is the string 'true' or 'false'.
    "profanityFilter" => (($pro_filter eq "true") ? \1 : \0),
);

# speechContexts is a LIST of context objects and is left out entirely when
# there are no hints to send.
if (@hints) {
    $config{"speechContexts"} = [ { "phrases" => \@hints } ];
}

# The audio travels inline as base64 without line breaks.
my %audio = ( "content" => encode_base64($audio, "") );
my %json = (
    "config" => \%config,
    "audio"  => \%audio,
);

# Send audio data for analysis #
my $uaresponse = $ua->post(
    "$url:recognize?key=$key",
    Content_Type => "application/json",
    Content      => encode_json(\%json),
);
# Evaluate the reply of the recognition service #
warn "$name The response was:\n", $uaresponse->content if ($debug);
if (!$uaresponse->is_success) {
    print "VERBOSE \"Unable to get recognition data.\" 3\n";
    checkresponse();
    die "$name Unable to get recognition data.\n";
}

# decode_json() dies on malformed input; catch that so the failure is reported
# the same way as a failed request instead of as a raw parser error.
my $jdata = eval { decode_json($uaresponse->content) };
if (ref($jdata) ne 'HASH') {
    print "VERBOSE \"Unable to parse recognition data.\" 3\n";
    checkresponse();
    die "$name Unable to parse recognition data.\n";
}

# Walk down to the first alternative of the first result without
# autovivifying anything; the list is empty or absent when nothing was
# recognised.
my $best;
my $results_list = $jdata->{"results"};
if (ref($results_list) eq 'ARRAY' && ref($results_list->[0]) eq 'HASH') {
    my $alternatives = $results_list->[0]->{"alternatives"};
    if (ref($alternatives) eq 'ARRAY' && ref($alternatives->[0]) eq 'HASH') {
        $best = $alternatives->[0];
    }
}

# A missing field yields an empty channel variable, as before, but without the
# "uninitialized value" warnings.
$response{utterance} = (defined $best && defined $best->{"transcript"})
    ? encode('utf8', $best->{"transcript"})
    : "";
$response{confidence} = (defined $best && defined $best->{"confidence"})
    ? $best->{"confidence"}
    : "";
set_channel_vars(%response);
exit;
# Set one Asterisk channel variable per key/value pair.
#
# Parameters: a flat list of name => value pairs.
# Returns:    nothing meaningful; each SET VARIABLE reply is consumed through
#             checkresponse().
#
# The values are taken from the arguments. The previous version unpacked its
# arguments and then read the global %response instead, which only worked
# because every caller happened to pass that very hash.
sub set_channel_vars {
    my %resp = @_;
    foreach my $var (keys %resp) {
        my $value = defined $resp{$var} ? $resp{$var} : "";

        # An AGI command is a single line with double-quoted arguments:
        # fold line breaks into blanks and backslash-escape quotes and
        # backslashes so the value cannot terminate the argument early.
        $value =~ s/[\r\n]+/ /g;
        $value =~ s/(["\\])/\\$1/g;

        warn "$name Setting variable: $var = $value\n" if ($debug);
        print "SET VARIABLE \"$var\" \"$value\"\n";
        checkresponse();
    }
}
# Read Asterisk's reply to the AGI command that was just printed.
#
# Returns a two element list:
#   ("<result>", "<data>")  for a "200 result=<n> <data>" line,
#   (-1, -1)                for anything else, including end of input.
sub checkresponse {
    my $input = <STDIN>;

    # End of input: there is no line to parse. Report it and fail like any
    # other unexpected reply instead of running chomp and a regex on undef.
    if (!defined $input) {
        warn "$name Unexpected result: no response (end of input)\n";
        return (-1, -1);
    }
    chomp $input;

    if ($input =~ /^200 result=(-?\d+)\s?(.*)$/) {
        warn "$name Command returned: $input\n" if ($debug);
        return ("$1", "$2");
    }

    # A "520-Invalid ..." reply continues on following lines up to a line that
    # starts with "520 ". Consume all of them; reading just one would leave
    # the rest to be mistaken for the reply to the next command.
    if ($input =~ /^520-Invalid/) {
        while (defined(my $more = <STDIN>)) {
            chomp $more;
            $input .= "\n$more";
            last if ($more =~ /^520 /);
        }
    }
    warn "$name Unexpected result: $input\n";
    return (-1, -1);
}
# Detect the sound format used #
# Asks Asterisk for the channel's native audio format and maps it to a
# signed-linear recording format.
#
# Returns: (format name, sample rate in Hz); ("sln", 8000) when the reply is
#          missing or names no known wideband format.
sub detect_format {
    my @format;
    print "GET FULL VARIABLE \${CHANNEL(audionativeformat)}\n";
    my @reply = checkresponse();
    my $native = defined $reply[1] ? $reply[1] : "";

    # The 12 kHz test used to read (silk|sln)12, which can never match
    # "slin12" and sent such channels to the 8 kHz fallback. Both spellings
    # are accepted now.
    if    ($native =~ /(silk|slin|sln)12/)               { @format = ("sln12", 12000); }
    elsif ($native =~ /(speex|slin|silk)16|g722|siren7/) { @format = ("sln16", 16000); }
    elsif ($native =~ /(speex|slin|celt)32|siren14/)     { @format = ("sln32", 32000); }
    elsif ($native =~ /(celt|slin)44/)                   { @format = ("sln44", 44100); }
    elsif ($native =~ /(celt|slin)48/)                   { @format = ("sln48", 48000); }
    else                                                 { @format = ("sln", 8000); }
    return @format;
}
# Signal handler: stop the script with a diagnostic message.
# Dying (instead of exiting silently) still lets the END block run.
sub int_handler {
    my $message = "$name Interrupt signal received, terminating...\n";
    die $message;
}
# Remove every file that shares the temporary base name (the recording and its
# flac encoding). Nothing happens when no temporary name was ever created.
END {
    if ($tmpname) {
        if ($debug) {
            warn "$name Cleaning temp files.\n";
        }
        my @leftovers = glob "$tmpname.*";
        unlink @leftovers;
    }
}