Merge branch 'testing', v1.2
bishoph committed Mar 18, 2017
2 parents 5927015 + cbcfc6f commit eb3ea4b
Showing 8 changed files with 156 additions and 230 deletions.
72 changes: 0 additions & 72 deletions plugins/tweet/__init__.py

This file was deleted.

196 changes: 74 additions & 122 deletions sopare/analyze.py
@@ -17,6 +17,7 @@
under the License.
"""

from operator import itemgetter
import characteristics
import config
import path
@@ -52,13 +53,18 @@ def do_analysis(self, results, data, rawbuf):

def framing(self, results, data_length):
framing = { }
arr = [ ]
for id in results:
framing[id] = [ ]
for i, row in enumerate(results[id]):
row = self.row_validation(row, id)
row_result = sum(row[0:len(row)]) / self.dict_analysis[id]['min_tokens']
if (row_result >= config.MARGINAL_VALUE):
framing[id].append(i)
arr.append([row_result, i, id])
sorted_arr = sorted(arr, key=itemgetter(0), reverse = True)
for el in sorted_arr:
if (el[1] not in framing[el[2]] and (config.MAX_WORD_START_RESULTS == 0 or len(framing[el[2]]) < config.MAX_WORD_START_RESULTS)):
framing[el[2]].append(el[1])
return framing

def row_validation(self, row, id):
@@ -69,140 +75,86 @@ def row_validation(self, row, id):
def deep_search(self, framing, data):
framing_match = [ ]
match_results = [ '' ] * len(data)
high_results = [ 0 ] * len(data)
for id in framing:
for startpos in framing[id]:
xsim, xtsim, word_length = self.deep_inspection(id, startpos, data)
framing_match.append([id, startpos, xsim, xtsim, word_length])
for frame in framing_match:
xpos = 0
for x in range(frame[1], frame[1] + frame[4]):
if (x < len(high_results) and frame[2] > high_results[x]):
high_results[x] = frame[2]
match_results[x] = frame[0]
xpos += 1
result_set = set(match_results)
self.debug_info += ''.join([str(framing), '\n'])
self.debug_info += ''.join([str(framing_match), '\n'])
self.debug_info += ''.join([str(match_results), '\n'])
self.debug_info += ''.join([str(high_results), '\n'])
check_length = 0
for result in result_set:
if (result != ''):
check_length += self.dict_analysis[id]['max_tokens'] + 4 # TODO: Cross check and eventually make configurable
if (check_length < len(match_results)):
if (self.debug):
print ('length check failed :'+str(check_length) + '/' + str(len(match_results)))
return [ '' ] * len(data)
word_sim = self.deep_inspection(id, startpos, data)
if (len(word_sim) > 0):
framing_match.append(word_sim)
self.debug_info += str(framing_match).join(['framing_match: ', '\n\n'])
best_match = [ ]
for match in framing_match:
sorted_framing_match = sorted(match, key=lambda x: (x[1] + x[2], -x[0]))
nobm = 1
if (hasattr(config, 'NUMBER_OF_BEST_MATCHES') and config.NUMBER_OF_BEST_MATCHES > 0):
nobm = config.NUMBER_OF_BEST_MATCHES
for x in range(0, nobm):
if (x < len(sorted_framing_match)):
best_match.append(sorted_framing_match[x])
sorted_best_match = sorted(best_match, key=lambda x: (x[1] + x[2], -x[0]))
if (self.debug):
self.debug_info += str(sorted_best_match).join(['sorted_best_match: ', '\n\n'])
for i, best in enumerate(sorted_best_match):
if (best[0] >= config.MIN_CROSS_SIMILARITY and best[1] <= config.MIN_LEFT_DISTANCE and best[2] <= config.MIN_RIGHT_DISTANCE):
for x in range(best[3], best[3] + best[4]):
if (match_results[x] == ''):
match_results[x] = best[5]
if (config.MAX_TOP_RESULTS > 0 and i > config.MAX_TOP_RESULTS):
break
self.debug_info += str(match_results).join(['match_results: ', '\n\n'])
return match_results

def deep_inspection(self, id, startpos, data):
if (startpos + (self.dict_analysis[id]['min_tokens']) > len(data)):
if (self.debug):
print ('deep_inspection failed for '+id+'/'+str(startpos))
return 0, 0, 0
high_sim = 0
high_token_sim = [ ]
bias = [ ]
word_length = 0
word_sim = [ ]
for dict_entries in self.learned_dict['dict']:
if (id == dict_entries['id']):
dict_characteristic = dict_entries['characteristic']
word_sim = 0
bias_tokens = [ ]
token_sim = [ 0 ] * len(dict_characteristic)
c = 0.0
token_sim = [ 0, 0, 0, startpos, 0, id ]
c = 0
for i, dcharacteristic in enumerate(dict_characteristic):
bias_obj = { }
currentpos = startpos + i
if (currentpos < len(data)):
do = data[currentpos]
if (startpos + i < len(data)):
do = data[startpos + i]
characteristic, _ = do
sim = 0
ll = len(characteristic['peaks'])
bias_obj['i'] = i
if (ll > 0):
sim_peaks = self.util.similarity(characteristic['norm'], dcharacteristic['norm']) * config.SIMILARITY_PEAKS
sim_token_peaks = self.util.similarity(characteristic['token_peaks'], dcharacteristic['token_peaks']) * config.SIMILARITY_HEIGHT
sim_df = self.util.single_similarity(characteristic['df'], dcharacteristic['df']) * config.SIMILARITY_DOMINANT_FREQUENCY
sim = sim_peaks + sim_token_peaks + sim_df
if (config.BIAS > 0):
if (characteristic['df'] in self.dict_analysis[id]['df'][i]):
bias_obj['df'] = 1
else:
bias_obj['df'] = 0
xmin = min(characteristic['peaks'])
xmax = max(characteristic['peaks'])
if (xmin >= self.dict_analysis[id]['minp'][i] and xmax <= self.dict_analysis[id]['maxp'][i]):
sim = sim + 0.1
bias_obj['sr'] = 1
else:
bias_obj['sr'] = 0
if (ll >= self.dict_analysis[id]['mincp'][i] and ll <= self.dict_analysis[id]['maxcp'][i]):
sim = sim + 0.1
bias_obj['sl'] = 1
else:
bias_obj['sl'] = 0
token_sim[i] = sim
word_sim += sim
bias_tokens.append(bias_obj)
c += 1
word_sim = word_sim / c
bias.append(bias_tokens)
if (word_sim > 1):
word_sim = 1
if (word_sim > high_sim and word_sim > config.MIN_CROSS_SIMILARITY):
bc = self.calculate_bias(bias)
if (bc >= config.BIAS):
high_sim = word_sim
word_length = int(c)
high_token_sim.append(token_sim)
consolidated_high_token_sim = [ ]
for hts in high_token_sim:
for i, ts in enumerate(hts):
if (i == len(consolidated_high_token_sim)):
consolidated_high_token_sim.append(ts)
elif (ts > consolidated_high_token_sim[i]):
consolidated_high_token_sim[i] = ts
if (len(consolidated_high_token_sim) > 0):
consolidated_high_sim = sum(consolidated_high_token_sim) / len(consolidated_high_token_sim)
return consolidated_high_sim, consolidated_high_token_sim, word_length
else:
return high_sim, high_token_sim, word_length

@staticmethod
def calculate_bias(bias):
bc = 0
cc = 0
for b in bias:
for e in b:
if (len(e) == 4):
bc += e['df'] + e['sr'] + e['sl']
cc += 3.0
if (cc > 0):
return bc/cc
return 0
sim_norm = self.util.similarity(characteristic['norm'], dcharacteristic['norm']) * config.SIMILARITY_NORM
sim_token_peaks = self.util.similarity(characteristic['token_peaks'], dcharacteristic['token_peaks']) * config.SIMILARITY_HEIGHT
sim_df = self.util.single_similarity(characteristic['df'], dcharacteristic['df']) * config.SIMILARITY_DOMINANT_FREQUENCY
sim = sim_norm + sim_token_peaks + sim_df
sl, sr = self.util.manhatten_distance(characteristic['norm'], dcharacteristic['norm'])
token_sim[0] += sim
token_sim[1] += sl
token_sim[2] += sr
c += 1.0
if (c > 0):
token_sim[0] = token_sim[0] / c
token_sim[1] = token_sim[1] / c
token_sim[2] = token_sim[2] / c
token_sim[4] = int(c)
if ((config.STRICT_LENGTH_CHECK == False and c > 1 ) or c >= self.dict_analysis[id]['min_tokens']):
word_sim.append(token_sim)
return word_sim

def get_match(self, framing):
match_results = [ ]
counter = 0
last = ''
for check in framing:
if (check == last):
counter += 1
else:
if (last != ''):
if (counter >= self.dict_analysis[last]['min_tokens']-1 and counter <= self.dict_analysis[last]['max_tokens']+1): # TODO: x-check
match_results.append(last)
elif (self.debug):
print ('length check failed for :'+last+' from results. ' + str(self.dict_analysis[last]['min_tokens']-1) + ' ' + str(counter) + ' ' + str(self.dict_analysis[last]['max_tokens']+1))
counter = 1
last = check
if (last != ''):
if (counter >= self.dict_analysis[last]['min_tokens']-1 and counter <= self.dict_analysis[last]['max_tokens']+1): # TODO: x-check
match_results.append(last)
elif (self.debug):
print ('length check failed for :'+last+' from results. ' + str(self.dict_analysis[last]['min_tokens']-1) + ' ' + str(counter) + ' ' + str(self.dict_analysis[last]['max_tokens']+1))
s = 0
for x in range(0, len(framing)):
if (x > 0 and framing[x] != framing[x-1]):
match_results = self.validate_match_result(framing[s:x], s, x, match_results)
s = x
if (x == len(framing)-1):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
elif (x == len(framing)-1):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
if (match_results.count('') > len(match_results) / 2):
return [ 0 ] * len(match_results)
return match_results

def validate_match_result(self, result, start, end, match_results):
if (len(result) == 0 or result[0] == '' or end-start < 2):
return match_results
if (config.STRICT_LENGTH_CHECK == True and (len(result) < self.dict_analysis[result[0]]['min_tokens'] or len(result) > self.dict_analysis[result[0]]['max_tokens'])):
if (self.debug):
self.debug_info += 'STRICT_LENGTH_CHECK failed for '+result[0] + ': ' + str(self.dict_analysis[result[0]]['min_tokens']) + ' > ' + str(len(result)) + ' < ' + str(self.dict_analysis[result[0]]['max_tokens']) + '\n'
return match_results
match_results.append(result[0])
return match_results

def load_plugins(self):
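The most substantial change in analyze.py is the reworked candidate selection in deep_search: deep_inspection now returns per-word entries of the form [similarity, left distance, right distance, start position, length, id], and deep_search ranks them by combined left/right distance (ascending) and similarity (descending), keeps at most NUMBER_OF_BEST_MATCHES of them, and fills result slots only where the thresholds pass and the slot is still empty. The sketch below is illustrative only; it collapses the per-start-position grouping into a single candidate pool, and the candidate values and config numbers are invented, not taken from the project.

# Minimal sketch of the new candidate selection (not the project's code).
# Each candidate is [similarity, left_dist, right_dist, startpos, length, id].
MIN_CROSS_SIMILARITY = 0.7    # assumed placeholder values
MIN_LEFT_DISTANCE = 0.9
MIN_RIGHT_DISTANCE = 0.9
NUMBER_OF_BEST_MATCHES = 2

candidates = [
    [0.82, 0.30, 0.25, 0, 4, 'light_on'],
    [0.75, 0.20, 0.15, 0, 4, 'light_off'],
    [0.60, 0.80, 0.90, 5, 3, 'music'],
]

# Sort by combined left/right distance (ascending), then by similarity
# (descending), and keep only the top N candidates.
ranked = sorted(candidates, key=lambda x: (x[1] + x[2], -x[0]))
best = ranked[:NUMBER_OF_BEST_MATCHES]

# Fill result slots only for candidates that pass all thresholds,
# never overwriting a slot that is already taken.
match_results = [''] * 8
for sim, ldist, rdist, start, length, word_id in best:
    if sim >= MIN_CROSS_SIMILARITY and ldist <= MIN_LEFT_DISTANCE and rdist <= MIN_RIGHT_DISTANCE:
        for x in range(start, start + length):
            if x < len(match_results) and match_results[x] == '':
                match_results[x] = word_id

print(match_results)  # ['light_off', 'light_off', 'light_off', 'light_off', '', '', '', '']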
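get_match is also rewritten: instead of tracking repeats with a running counter, it splits the per-token result list into consecutive runs and hands each run to validate_match_result, which drops runs that are too short and, with STRICT_LENGTH_CHECK enabled, runs whose length falls outside the word's min_tokens/max_tokens range. A simplified, standalone version of that idea (with a made-up dictionary entry standing in for self.dict_analysis) could look like this:

# Illustrative sketch of run-based validation, not the project's code.
dict_analysis = {'light_on': {'min_tokens': 3, 'max_tokens': 6}}
STRICT_LENGTH_CHECK = True

def validate_run(run, results):
    if not run or run[0] == '' or len(run) < 2:
        return results
    word = run[0]
    limits = dict_analysis[word]
    if STRICT_LENGTH_CHECK and not (limits['min_tokens'] <= len(run) <= limits['max_tokens']):
        return results            # run too short or too long for this word
    results.append(word)
    return results

def get_match(framing):
    results, start = [], 0
    for x in range(1, len(framing) + 1):
        # a run ends when the value changes or the list is exhausted
        if x == len(framing) or framing[x] != framing[x - 1]:
            results = validate_run(framing[start:x], results)
            start = x
    return results

print(get_match(['', '', 'light_on', 'light_on', 'light_on', 'light_on', '', '']))
# -> ['light_on']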
30 changes: 17 additions & 13 deletions sopare/characteristics.py
@@ -25,23 +25,27 @@ class characteristic:
def __init__(self, debug):
self.debug = debug

def getcharacteristic(self, fft, norm, meta):
chunked_norm = [ ]
progessive = 1
i = config.MIN_PROGRESSIVE_STEP
for x in range(0, len(norm), i):
if (hasattr(config, 'START_PROGRESSIVE_FACTOR') and x >= config.START_PROGRESSIVE_FACTOR):
progessive += progessive * config.PROGRESSIVE_FACTOR
i += int(progessive)
if (i > config.MAX_PROGRESSIVE_STEP):
i = config.MAX_PROGRESSIVE_STEP
chunked_norm.append(round(sum(norm[x:x+i]), 2))
def getcharacteristic(self, fft, chunked_norm, meta):

#chunked_norm = [ ]
#progessive = 1
#i = config.MIN_PROGRESSIVE_STEP
#for x in range(0, len(norm), i):
# if (hasattr(config, 'START_PROGRESSIVE_FACTOR') and x >= config.START_PROGRESSIVE_FACTOR):
# progessive += progessive * config.PROGRESSIVE_FACTOR
# i += int(progessive)
# if (i > config.MAX_PROGRESSIVE_STEP):
# i = config.MAX_PROGRESSIVE_STEP
# chunked_norm.append(round(sum(norm[x:x+i]), 2))

fft = numpy.abs(fft)
df = numpy.argmax(fft)
dfm = int(numpy.amax(fft))
fc = 0
where_range = numpy.mean(chunked_norm) / config.PEAK_FACTOR
peaks = list(numpy.array(numpy.where(chunked_norm > where_range))[0])
peaks = [ ]
if (len(chunked_norm) > 0):
where_range = numpy.mean(chunked_norm) / config.PEAK_FACTOR
peaks = list(numpy.array(numpy.where(chunked_norm > where_range))[0])
where_range = numpy.mean(chunked_norm)
npeaks = numpy.array(numpy.where(chunked_norm > where_range))
fc = round(numpy.sum(numpy.sqrt(npeaks)), 1)
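In characteristics.py, the progressive chunking moves out of getcharacteristic (the new signature takes chunked_norm instead of norm, and the old chunking loop is left commented out), and the peak detection is guarded so an empty chunked_norm never reaches numpy.mean. A minimal, self-contained sketch of that guard, with an assumed PEAK_FACTOR value:

# Illustrative sketch only; PEAK_FACTOR is a placeholder, not the project's value.
import numpy

PEAK_FACTOR = 0.7

def find_peaks(chunked_norm):
    chunked_norm = numpy.asarray(chunked_norm)
    peaks = []
    if len(chunked_norm) > 0:
        # only compute the threshold when there is data to average
        threshold = numpy.mean(chunked_norm) / PEAK_FACTOR
        peaks = numpy.where(chunked_norm > threshold)[0].tolist()
    return peaks

print(find_peaks([0.1, 2.5, 0.3, 3.1]))  # -> [1, 3] with the assumed PEAK_FACTOR
print(find_peaks([]))                    # -> [] instead of a numpy warning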
