Merge branch 'testing', v1.2
bishoph committed Mar 18, 2017
2 parents 5927015 + cbcfc6f commit eb3ea4b
Showing 8 changed files with 156 additions and 230 deletions.
72 changes: 0 additions & 72 deletions plugins/tweet/__init__.py

This file was deleted.

196 changes: 74 additions & 122 deletions sopare/analyze.py
@@ -17,6 +17,7 @@
under the License.
"""

from operator import itemgetter
import characteristics
import config
import path
@@ -52,13 +53,18 @@ def do_analysis(self, results, data, rawbuf):

def framing(self, results, data_length):
framing = { }
arr = [ ]
for id in results:
framing[id] = [ ]
for i, row in enumerate(results[id]):
row = self.row_validation(row, id)
row_result = sum(row[0:len(row)]) / self.dict_analysis[id]['min_tokens']
if (row_result >= config.MARGINAL_VALUE):
framing[id].append(i)
arr.append([row_result, i, id])
sorted_arr = sorted(arr, key=itemgetter(0), reverse = True)
for el in sorted_arr:
if (el[1] not in framing[el[2]] and (config.MAX_WORD_START_RESULTS == 0 or len(framing[el[2]]) < config.MAX_WORD_START_RESULTS)):
framing[el[2]].append(el[1])
return framing

def row_validation(self, row, id):
@@ -69,140 +75,86 @@ def row_validation(self, row, id):
def deep_search(self, framing, data):
framing_match = [ ]
match_results = [ '' ] * len(data)
high_results = [ 0 ] * len(data)
for id in framing:
for startpos in framing[id]:
xsim, xtsim, word_length = self.deep_inspection(id, startpos, data)
framing_match.append([id, startpos, xsim, xtsim, word_length])
for frame in framing_match:
xpos = 0
for x in range(frame[1], frame[1] + frame[4]):
if (x < len(high_results) and frame[2] > high_results[x]):
high_results[x] = frame[2]
match_results[x] = frame[0]
xpos += 1
result_set = set(match_results)
self.debug_info += ''.join([str(framing), '\n'])
self.debug_info += ''.join([str(framing_match), '\n'])
self.debug_info += ''.join([str(match_results), '\n'])
self.debug_info += ''.join([str(high_results), '\n'])
check_length = 0
for result in result_set:
if (result != ''):
check_length += self.dict_analysis[id]['max_tokens'] + 4 # TODO: Cross check and eventually make configurable
if (check_length < len(match_results)):
if (self.debug):
print ('length check failed :'+str(check_length) + '/' + str(len(match_results)))
return [ '' ] * len(data)
word_sim = self.deep_inspection(id, startpos, data)
if (len(word_sim) > 0):
framing_match.append(word_sim)
self.debug_info += str(framing_match).join(['framing_match: ', '\n\n'])
best_match = [ ]
for match in framing_match:
sorted_framing_match = sorted(match, key=lambda x: (x[1] + x[2], -x[0]))
nobm = 1
if (hasattr(config, 'NUMBER_OF_BEST_MATCHES') and config.NUMBER_OF_BEST_MATCHES > 0):
nobm = config.NUMBER_OF_BEST_MATCHES
for x in range(0, nobm):
if (x < len(sorted_framing_match)):
best_match.append(sorted_framing_match[x])
sorted_best_match = sorted(best_match, key=lambda x: (x[1] + x[2], -x[0]))
if (self.debug):
self.debug_info += str(sorted_best_match).join(['sorted_best_match: ', '\n\n'])
for i, best in enumerate(sorted_best_match):
if (best[0] >= config.MIN_CROSS_SIMILARITY and best[1] <= config.MIN_LEFT_DISTANCE and best[2] <= config.MIN_RIGHT_DISTANCE):
for x in range(best[3], best[3] + best[4]):
if (match_results[x] == ''):
match_results[x] = best[5]
if (config.MAX_TOP_RESULTS > 0 and i > config.MAX_TOP_RESULTS):
break
self.debug_info += str(match_results).join(['match_results: ', '\n\n'])
return match_results

def deep_inspection(self, id, startpos, data):
if (startpos + (self.dict_analysis[id]['min_tokens']) > len(data)):
if (self.debug):
print ('deep_inspection failed for '+id+'/'+str(startpos))
return 0, 0, 0
high_sim = 0
high_token_sim = [ ]
bias = [ ]
word_length = 0
word_sim = [ ]
for dict_entries in self.learned_dict['dict']:
if (id == dict_entries['id']):
dict_characteristic = dict_entries['characteristic']
word_sim = 0
bias_tokens = [ ]
token_sim = [ 0 ] * len(dict_characteristic)
c = 0.0
token_sim = [ 0, 0, 0, startpos, 0, id ]
c = 0
for i, dcharacteristic in enumerate(dict_characteristic):
bias_obj = { }
currentpos = startpos + i
if (currentpos < len(data)):
do = data[currentpos]
if (startpos + i < len(data)):
do = data[startpos + i]
characteristic, _ = do
sim = 0
ll = len(characteristic['peaks'])
bias_obj['i'] = i
if (ll > 0):
sim_peaks = self.util.similarity(characteristic['norm'], dcharacteristic['norm']) * config.SIMILARITY_PEAKS
sim_token_peaks = self.util.similarity(characteristic['token_peaks'], dcharacteristic['token_peaks']) * config.SIMILARITY_HEIGHT
sim_df = self.util.single_similarity(characteristic['df'], dcharacteristic['df']) * config.SIMILARITY_DOMINANT_FREQUENCY
sim = sim_peaks + sim_token_peaks + sim_df
if (config.BIAS > 0):
if (characteristic['df'] in self.dict_analysis[id]['df'][i]):
bias_obj['df'] = 1
else:
bias_obj['df'] = 0
xmin = min(characteristic['peaks'])
xmax = max(characteristic['peaks'])
if (xmin >= self.dict_analysis[id]['minp'][i] and xmax <= self.dict_analysis[id]['maxp'][i]):
sim = sim + 0.1
bias_obj['sr'] = 1
else:
bias_obj['sr'] = 0
if (ll >= self.dict_analysis[id]['mincp'][i] and ll <= self.dict_analysis[id]['maxcp'][i]):
sim = sim + 0.1
bias_obj['sl'] = 1
else:
bias_obj['sl'] = 0
token_sim[i] = sim
word_sim += sim
bias_tokens.append(bias_obj)
c += 1
word_sim = word_sim / c
bias.append(bias_tokens)
if (word_sim > 1):
word_sim = 1
if (word_sim > high_sim and word_sim > config.MIN_CROSS_SIMILARITY):
bc = self.calculate_bias(bias)
if (bc >= config.BIAS):
high_sim = word_sim
word_length = int(c)
high_token_sim.append(token_sim)
consolidated_high_token_sim = [ ]
for hts in high_token_sim:
for i, ts in enumerate(hts):
if (i == len(consolidated_high_token_sim)):
consolidated_high_token_sim.append(ts)
elif (ts > consolidated_high_token_sim[i]):
consolidated_high_token_sim[i] = ts
if (len(consolidated_high_token_sim) > 0):
consolidated_high_sim = sum(consolidated_high_token_sim) / len(consolidated_high_token_sim)
return consolidated_high_sim, consolidated_high_token_sim, word_length
else:
return high_sim, high_token_sim, word_length

@staticmethod
def calculate_bias(bias):
bc = 0
cc = 0
for b in bias:
for e in b:
if (len(e) == 4):
bc += e['df'] + e['sr'] + e['sl']
cc += 3.0
if (cc > 0):
return bc/cc
return 0
sim_norm = self.util.similarity(characteristic['norm'], dcharacteristic['norm']) * config.SIMILARITY_NORM
sim_token_peaks = self.util.similarity(characteristic['token_peaks'], dcharacteristic['token_peaks']) * config.SIMILARITY_HEIGHT
sim_df = self.util.single_similarity(characteristic['df'], dcharacteristic['df']) * config.SIMILARITY_DOMINANT_FREQUENCY
sim = sim_norm + sim_token_peaks + sim_df
sl, sr = self.util.manhatten_distance(characteristic['norm'], dcharacteristic['norm'])
token_sim[0] += sim
token_sim[1] += sl
token_sim[2] += sr
c += 1.0
if (c > 0):
token_sim[0] = token_sim[0] / c
token_sim[1] = token_sim[1] / c
token_sim[2] = token_sim[2] / c
token_sim[4] = int(c)
if ((config.STRICT_LENGTH_CHECK == False and c > 1 ) or c >= self.dict_analysis[id]['min_tokens']):
word_sim.append(token_sim)
return word_sim

def get_match(self, framing):
match_results = [ ]
counter = 0
last = ''
for check in framing:
if (check == last):
counter += 1
else:
if (last != ''):
if (counter >= self.dict_analysis[last]['min_tokens']-1 and counter <= self.dict_analysis[last]['max_tokens']+1): # TODO: x-check
match_results.append(last)
elif (self.debug):
print ('length check failed for :'+last+' from results. ' + str(self.dict_analysis[last]['min_tokens']-1) + ' ' + str(counter) + ' ' + str(self.dict_analysis[last]['max_tokens']+1))
counter = 1
last = check
if (last != ''):
if (counter >= self.dict_analysis[last]['min_tokens']-1 and counter <= self.dict_analysis[last]['max_tokens']+1): # TODO: x-check
match_results.append(last)
elif (self.debug):
print ('length check failed for :'+last+' from results. ' + str(self.dict_analysis[last]['min_tokens']-1) + ' ' + str(counter) + ' ' + str(self.dict_analysis[last]['max_tokens']+1))
s = 0
for x in range(0, len(framing)):
if (x > 0 and framing[x] != framing[x-1]):
match_results = self.validate_match_result(framing[s:x], s, x, match_results)
s = x
if (x == len(framing)-1):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
elif (x == len(framing)-1):
match_results = self.validate_match_result(framing[s:], s, x, match_results)
if (match_results.count('') > len(match_results) / 2):
return [ 0 ] * len(match_results)
return match_results

def validate_match_result(self, result, start, end, match_results):
if (len(result) == 0 or result[0] == '' or end-start < 2):
return match_results
if (config.STRICT_LENGTH_CHECK == True and (len(result) < self.dict_analysis[result[0]]['min_tokens'] or len(result) > self.dict_analysis[result[0]]['max_tokens'])):
if (self.debug):
self.debug_info += 'STRICT_LENGTH_CHECK failed for '+result[0] + ': ' + str(self.dict_analysis[result[0]]['min_tokens']) + ' > ' + str(len(result)) + ' < ' + str(self.dict_analysis[result[0]]['max_tokens']) + '\n'
return match_results
match_results.append(result[0])
return match_results

def load_plugins(self):
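The most substantial change in analyze.py is the reworked candidate selection in deep_search: deep_inspection now returns per-word entries of the form [similarity, left distance, right distance, start position, length, id], and deep_search ranks them by combined left/right distance (ascending) and similarity (descending), keeps at most NUMBER_OF_BEST_MATCHES of them, and fills result slots only where the thresholds pass and the slot is still empty. The sketch below is illustrative only; it collapses the per-start-position grouping into a single candidate pool, and the candidate values and config numbers are invented, not taken from the project.

# Minimal sketch of the new candidate selection (not the project's code).
# Each candidate is [similarity, left_dist, right_dist, startpos, length, id].
MIN_CROSS_SIMILARITY = 0.7    # assumed placeholder values
MIN_LEFT_DISTANCE = 0.9
MIN_RIGHT_DISTANCE = 0.9
NUMBER_OF_BEST_MATCHES = 2

candidates = [
    [0.82, 0.30, 0.25, 0, 4, 'light_on'],
    [0.75, 0.20, 0.15, 0, 4, 'light_off'],
    [0.60, 0.80, 0.90, 5, 3, 'music'],
]

# Sort by combined left/right distance (ascending), then by similarity
# (descending), and keep only the top N candidates.
ranked = sorted(candidates, key=lambda x: (x[1] + x[2], -x[0]))
best = ranked[:NUMBER_OF_BEST_MATCHES]

# Fill result slots only for candidates that pass all thresholds,
# never overwriting a slot that is already taken.
match_results = [''] * 8
for sim, ldist, rdist, start, length, word_id in best:
    if sim >= MIN_CROSS_SIMILARITY and ldist <= MIN_LEFT_DISTANCE and rdist <= MIN_RIGHT_DISTANCE:
        for x in range(start, start + length):
            if x < len(match_results) and match_results[x] == '':
                match_results[x] = word_id

print(match_results)  # ['light_off', 'light_off', 'light_off', 'light_off', '', '', '', '']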
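get_match is also rewritten: instead of tracking repeats with a running counter, it splits the per-token result list into consecutive runs and hands each run to validate_match_result, which drops runs that are too short and, with STRICT_LENGTH_CHECK enabled, runs whose length falls outside the word's min_tokens/max_tokens range. A simplified, standalone version of that idea (with a made-up dictionary entry standing in for self.dict_analysis) could look like this:

# Illustrative sketch of run-based validation, not the project's code.
dict_analysis = {'light_on': {'min_tokens': 3, 'max_tokens': 6}}
STRICT_LENGTH_CHECK = True

def validate_run(run, results):
    if not run or run[0] == '' or len(run) < 2:
        return results
    word = run[0]
    limits = dict_analysis[word]
    if STRICT_LENGTH_CHECK and not (limits['min_tokens'] <= len(run) <= limits['max_tokens']):
        return results            # run too short or too long for this word
    results.append(word)
    return results

def get_match(framing):
    results, start = [], 0
    for x in range(1, len(framing) + 1):
        # a run ends when the value changes or the list is exhausted
        if x == len(framing) or framing[x] != framing[x - 1]:
            results = validate_run(framing[start:x], results)
            start = x
    return results

print(get_match(['', '', 'light_on', 'light_on', 'light_on', 'light_on', '', '']))
# -> ['light_on']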
30 changes: 17 additions & 13 deletions sopare/characteristics.py
@@ -25,23 +25,27 @@ class characteristic:
def __init__(self, debug):
self.debug = debug

def getcharacteristic(self, fft, norm, meta):
chunked_norm = [ ]
progessive = 1
i = config.MIN_PROGRESSIVE_STEP
for x in range(0, len(norm), i):
if (hasattr(config, 'START_PROGRESSIVE_FACTOR') and x >= config.START_PROGRESSIVE_FACTOR):
progessive += progessive * config.PROGRESSIVE_FACTOR
i += int(progessive)
if (i > config.MAX_PROGRESSIVE_STEP):
i = config.MAX_PROGRESSIVE_STEP
chunked_norm.append(round(sum(norm[x:x+i]), 2))
def getcharacteristic(self, fft, chunked_norm, meta):

#chunked_norm = [ ]
#progessive = 1
#i = config.MIN_PROGRESSIVE_STEP
#for x in range(0, len(norm), i):
# if (hasattr(config, 'START_PROGRESSIVE_FACTOR') and x >= config.START_PROGRESSIVE_FACTOR):
# progessive += progessive * config.PROGRESSIVE_FACTOR
# i += int(progessive)
# if (i > config.MAX_PROGRESSIVE_STEP):
# i = config.MAX_PROGRESSIVE_STEP
# chunked_norm.append(round(sum(norm[x:x+i]), 2))

fft = numpy.abs(fft)
df = numpy.argmax(fft)
dfm = int(numpy.amax(fft))
fc = 0
where_range = numpy.mean(chunked_norm) / config.PEAK_FACTOR
peaks = list(numpy.array(numpy.where(chunked_norm > where_range))[0])
peaks = [ ]
if (len(chunked_norm) > 0):
where_range = numpy.mean(chunked_norm) / config.PEAK_FACTOR
peaks = list(numpy.array(numpy.where(chunked_norm > where_range))[0])
where_range = numpy.mean(chunked_norm)
npeaks = numpy.array(numpy.where(chunked_norm > where_range))
fc = round(numpy.sum(numpy.sqrt(npeaks)), 1)
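In characteristics.py, the progressive chunking moves out of getcharacteristic (the new signature takes chunked_norm instead of norm, and the old chunking loop is left commented out), and the peak detection is guarded so an empty chunked_norm never reaches numpy.mean. A minimal, self-contained sketch of that guard, with an assumed PEAK_FACTOR value:

# Illustrative sketch only; PEAK_FACTOR is a placeholder, not the project's value.
import numpy

PEAK_FACTOR = 0.7

def find_peaks(chunked_norm):
    chunked_norm = numpy.asarray(chunked_norm)
    peaks = []
    if len(chunked_norm) > 0:
        # only compute the threshold when there is data to average
        threshold = numpy.mean(chunked_norm) / PEAK_FACTOR
        peaks = numpy.where(chunked_norm > threshold)[0].tolist()
    return peaks

print(find_peaks([0.1, 2.5, 0.3, 3.1]))  # -> [1, 3] with the assumed PEAK_FACTOR
print(find_peaks([]))                    # -> [] instead of a numpy warning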
