Skip to content

Commit

Permalink
Merge branch 'testing'. Many changes regarding precision and reliability. After updating you need to re-train!
Browse files Browse the repository at this point in the history
  • Loading branch information
bishoph committed Aug 26, 2017
2 parents eb3ea4b + 2d1fe29 commit 9103ba0
Show file tree
Hide file tree
Showing 13 changed files with 216 additions and 129 deletions.
13 changes: 11 additions & 2 deletions sopare.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import sopare.util as util
import sopare.recorder as recorder
import sopare.err_logger as err_logger
import sopare.hatch as hatch
from sopare.version import __version__

def main(argv):
Expand Down Expand Up @@ -83,8 +84,16 @@ def main(argv):
if opt in ("-d", "--delete"):
delete_word(arg, debug)
sys.exit(0)
recorder.recorder(endless_loop, debug, plot, wave, outfile,
infile, dict)

hatched = hatch.hatch()
hatched.add("endless_loop", endless_loop)
hatched.add("debug", debug)
hatched.add("plot", plot)
hatched.add("wave", wave)
hatched.add("outfile", outfile)
hatched.add("infile",infile )
hatched.add("dict", dict)
recorder.recorder(hatched)

def recreate_dict(debug):
print ("recreating dictionary from raw input files...")
Expand Down
16 changes: 10 additions & 6 deletions sopare/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from operator import itemgetter
import characteristics
import config
import stm
import path
import util
import imp
Expand All @@ -33,6 +34,7 @@ def __init__(self, debug):
self.util = util.util(debug)
self.learned_dict = self.util.getDICT()
self.dict_analysis = self.util.compile_analysis(self.learned_dict)
self.stm = stm.short_term_memory(debug)
self.plugins = [ ]
self.load_plugins()
self.last_results = None
Expand All @@ -45,6 +47,7 @@ def do_analysis(self, results, data, rawbuf):
self.debug_info += ''.join([str(results), '\n\n'])
matches = self.deep_search(framing, data)
readable_results = self.get_match(matches)
readable_results = self.stm.get_results(readable_results)
if (self.debug):
print (self.debug_info)
if (readable_results != None):
Expand Down Expand Up @@ -91,8 +94,7 @@ def deep_search(self, framing, data):
if (x < len(sorted_framing_match)):
best_match.append(sorted_framing_match[x])
sorted_best_match = sorted(best_match, key=lambda x: (x[1] + x[2], -x[0]))
if (self.debug):
self.debug_info += str(sorted_best_match).join(['sorted_best_match: ', '\n\n'])
self.debug_info += str(sorted_best_match).join(['sorted_best_match: ', '\n\n'])
for i, best in enumerate(sorted_best_match):
if (best[0] >= config.MIN_CROSS_SIMILARITY and best[1] <= config.MIN_LEFT_DISTANCE and best[2] <= config.MIN_RIGHT_DISTANCE):
for x in range(best[3], best[3] + best[4]):
Expand Down Expand Up @@ -128,7 +130,7 @@ def deep_inspection(self, id, startpos, data):
token_sim[1] = token_sim[1] / c
token_sim[2] = token_sim[2] / c
token_sim[4] = int(c)
if ((config.STRICT_LENGTH_CHECK == False and c > 1 ) or c >= self.dict_analysis[id]['min_tokens']):
if ((config.STRICT_LENGTH_CHECK == False and c > 1 ) or c >= self.dict_analysis[id]['min_tokens'] - config.STRICT_LENGTH_UNDERMINING):
word_sim.append(token_sim)
return word_sim

Expand All @@ -143,14 +145,16 @@ def get_match(self, framing):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
elif (x == len(framing)-1):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
if (match_results.count('') > len(match_results) / 2):
return [ 0 ] * len(match_results)
if (framing.count('') > len(framing) * config.FILL_RESULT_PERCENTAGE):
if (self.debug):
self.debug_info += 'Results contain too many empty tokens. ' + str(framing.count('')) + ' / ' + str(len(framing)) + ' Eliminating results'
return [ ] * len(match_results)
return match_results

def validate_match_result(self, result, start, end, match_results):
if (len(result) == 0 or result[0] == '' or end-start < 2):
return match_results
if (config.STRICT_LENGTH_CHECK == True and (len(result) < self.dict_analysis[result[0]]['min_tokens'] or len(result) > self.dict_analysis[result[0]]['max_tokens'])):
if (config.STRICT_LENGTH_CHECK == True and (len(result) < self.dict_analysis[result[0]]['min_tokens'] - config.STRICT_LENGTH_UNDERMINING or len(result) > self.dict_analysis[result[0]]['max_tokens'])):
if (self.debug):
self.debug_info += 'STRICT_LENGTH_CHECK failed for '+result[0] + ': ' + str(self.dict_analysis[result[0]]['min_tokens']) + ' > ' + str(len(result)) + ' < ' + str(self.dict_analysis[result[0]]['max_tokens']) + '\n'
return match_results
Expand Down
22 changes: 8 additions & 14 deletions sopare/buffering.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,41 +19,35 @@

import multiprocessing
import processing
import hatch

class buffering(multiprocessing.Process):

def __init__(self, queue, endless_loop, debug, plot, wave, outfile, dict):
def __init__(self, hatch, queue):
multiprocessing.Process.__init__(self, name="buffering queue")
self.hatch = hatch
self.queue = queue
self.endless_loop = endless_loop
self.debug = debug
self.plot = plot
self.outfile = outfile
self.proc = processing.processor(endless_loop, debug, plot, wave, outfile, dict, self)
self.proc = processing.processor(hatch, self)
self.PROCESS_ROUND_DONE = False
self.test_counter = 0
self.start()

def run(self):
if (self.debug):
if (self.hatch.get('debug') == True):
print ("buffering queue runner")
while True:
buf = self.queue.get()
if ((self.endless_loop == False or self.outfile != None) and self.PROCESS_ROUND_DONE):
if ((self.hatch.get('endless_loop') == False or self.hatch.get('outfile') != None) and self.PROCESS_ROUND_DONE):
break
self.proc.check_silence(buf)
if (self.debug):
if (self.hatch.get('debug') == True):
print ("terminating queue runner")

def flush(self, message):
self.proc.stop(message)

def stop(self):
if (self.debug):
if (self.hatch.get('debug') == True):
print ("stop buffering")
self.PROCESS_ROUND_DONE = True





17 changes: 3 additions & 14 deletions sopare/characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,14 @@

import numpy
import config
import hatch

class characteristic:

def __init__(self, debug):
self.debug = debug
def __init__(self, hatch):
self.hatch = hatch

def getcharacteristic(self, fft, chunked_norm, meta):

#chunked_norm = [ ]
#progessive = 1
#i = config.MIN_PROGRESSIVE_STEP
#for x in range(0, len(norm), i):
# if (hasattr(config, 'START_PROGRESSIVE_FACTOR') and x >= config.START_PROGRESSIVE_FACTOR):
# progessive += progessive * config.PROGRESSIVE_FACTOR
# i += int(progessive)
# if (i > config.MAX_PROGRESSIVE_STEP):
# i = config.MAX_PROGRESSIVE_STEP
# chunked_norm.append(round(sum(norm[x:x+i]), 2))

fft = numpy.abs(fft)
df = numpy.argmax(fft)
dfm = int(numpy.amax(fft))
Expand Down
40 changes: 24 additions & 16 deletions sopare/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,16 @@
THRESHOLD = 400

# Silence time in seconds when analysis is called
MAX_SILENCE_AFTER_START = 3
MAX_SILENCE_AFTER_START = 1

# Time in seconds after the analysis is forced
MAX_TIME = 3.2

# Counter to stop processing and prepare more data
# Should be > LONG_SILENCE
SILENCE_COUNTER = 42

# Start the analysis after reaching LONG_SILENCE
LONG_SILENCE = 40
LONG_SILENCE = 30

# Characteristic length
CHUNKS = 2048


#########################################################
Expand All @@ -36,15 +34,15 @@
# Progressive value is used if you want to pack not
# so relevant frequencies
PROGRESSIVE_FACTOR = 0
START_PROGRESSIVE_FACTOR = 600
START_PROGRESSIVE_FACTOR = 1000
MIN_PROGRESSIVE_STEP = 25
MAX_PROGRESSIVE_STEP = 25

# Specifies freq ranges that are kept for further
# analysis. Freq outside of the ranges are set to zero.
# Human language can be found between 20 and 5000.
LOW_FREQ = 20
HIGH_FREQ = 600
HIGH_FREQ = 1000

# Make use of Hann window function
HANNING = True
Expand All @@ -62,31 +60,41 @@
MIN_START_TOKENS = 3

# Min. value for potential beginning of a word
MARGINAL_VALUE = 0.8
MARGINAL_VALUE = 0.7

# Minimal similarity across all comparison to
# identify a complete word across all tokens
MIN_CROSS_SIMILARITY = 0.7
MIN_CROSS_SIMILARITY = 0.85

# Calculation basis or token/word comparison
SIMILARITY_NORM = 0.6
SIMILARITY_HEIGHT = 0.4
SIMILARITY_NORM = 1
SIMILARITY_HEIGHT = 0
SIMILARITY_DOMINANT_FREQUENCY = 0

# Number of best matches to consider.
# Value must be > 0
# If not specified or value < 1 value is set to 1
NUMBER_OF_BEST_MATCHES = 1
NUMBER_OF_BEST_MATCHES = 2

# Min. distance to keep a word
MIN_LEFT_DISTANCE = 0.3
MIN_LEFT_DISTANCE = 0.4
MIN_RIGHT_DISTANCE = 0.3


# Use given number as results to assembly result
# 0 for all predictions
MAX_WORD_START_RESULTS = 2
MAX_TOP_RESULTS = 3

# Enable or disable strict length check for words
STRICT_LENGTH_CHECK = False
STRICT_LENGTH_CHECK = True
# Value to soften the strict length check a bit to still
# get quite precise results but to be less strict
STRICT_LENGTH_UNDERMINING = 0

# Short term memory retention time in seconds. Zero to disable STM
STM_RETENTION = 0.8

# Fill result percentage
# 0.5 means that half of the values can be empty and still yield valid results
# A lower value should theoretically avoid false positives
FILL_RESULT_PERCENTAGE = 0.2
10 changes: 5 additions & 5 deletions sopare/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@
import worker
import config
import characteristics
import hatch

class filtering():

def __init__(self, debug, plot, dict, wave):
self.debug = debug
self.plot = plot
def __init__(self, hatch):
self.hatch = hatch
self.first = True
self.queue = multiprocessing.Queue()
self.characteristic = characteristics.characteristic(debug)
self.worker = worker.worker(self.queue, debug, plot, dict, wave)
self.characteristic = characteristics.characteristic(self.hatch)
self.worker = worker.worker(self.hatch, self.queue)

def stop(self):
self.queue.put({ 'action': 'stop' })
Expand Down
40 changes: 40 additions & 0 deletions sopare/hatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright (C) 2015 - 2017 Martin Kauss ([email protected])
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this file except in compliance with the License. You may obtain
a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations
under the License.
"""

import config

class hatch():
    """Simple shared key/value container plus a plot-data cache.

    Instances are filled via add() and passed to other sopare
    components, which read settings back with get() and accumulate
    raw sample data for plotting via extend_plot_cache().
    """

    def __init__(self):
        # List of raw data points accumulated for plotting.
        self.plot_cache = [ ]
        # Backing store for the add()/get() key/value API.
        self.key_value_store = { }

    def add(self, key, value):
        """Store value under key, overwriting any previous entry."""
        self.key_value_store[key] = value

    def get(self, key):
        """Return the value stored under key, or None if absent."""
        # dict.get performs a single lookup and returns None for
        # missing keys, replacing the membership test + subscript.
        return self.key_value_store.get(key)

    def extend_plot_cache(self, data):
        """Append the elements of data to the plot cache."""
        self.plot_cache.extend(data)

    def get_plot_cache(self):
        """Return the accumulated plot cache list (not a copy)."""
        return self.plot_cache
23 changes: 10 additions & 13 deletions sopare/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,15 @@
import visual
import util
import config
import hatch

class preparing():

def __init__(self, debug, plot, wave, dict):
self.debug = debug
self.plot = plot
self.wave = wave
self.dict = dict
def __init__(self, hatch):
self.hatch = hatch
self.visual = visual.visual()
self.util = util.util(debug)
self.filter = filter.filtering(debug, plot, dict, wave)
self.util = util.util(self.hatch.get('debug'))
self.filter = filter.filtering(self.hatch)
self.silence = 0
self.force = False
self.counter = 0
Expand All @@ -45,7 +43,6 @@ def __init__(self, debug, plot, wave, dict):
self.token_peaks = [ ]
self.last_low_pos = 0
self.force = False
self.plot_buffer = [ ]
self.entered_silence = False

def tokenize(self, meta):
Expand All @@ -68,10 +65,10 @@ def valid_token(self, meta):
return True

def stop(self):
if (self.hatch.get('plot') == True):
self.visual.create_sample(self.hatch.get_plot_cache(), 'sample.png')
self.tokenize([{ 'token': 'stop' }])
self.filter.stop()
if (self.plot):
self.visual.create_sample(self.plot_buffer, 'sample.png')
self.filter_reset()
self.reset()

Expand All @@ -97,8 +94,8 @@ def force_tokenizer(self):

def prepare(self, buf, volume):
data = numpy.fromstring(buf, dtype=numpy.int16)
if (self.plot):
self.plot_buffer.extend(data)
if (self.hatch.get('plot') == True and self.hatch.get('endless_loop') == False):
self.hatch.extend_plot_cache(data)
self.buffer.extend(data)
self.counter += 1
abs_data = abs(data)
Expand All @@ -120,7 +117,7 @@ def prepare(self, buf, volume):
self.entered_silence = False
self.silence = 0

if (len(self.buffer) == 4096): # TODO: Make configurable
if (len(self.buffer) == config.CHUNKS):
self.new_token = True
meta.append({ 'token': 'token', 'silence': self.silence, 'pos': self.counter, 'adapting': adaptive, 'volume': volume, 'token_peaks': self.token_peaks })

Expand Down
Loading

0 comments on commit 9103ba0

Please sign in to comment.