Return JSON objects and display DBASE operations in deeper detail #45

Open · wants to merge 16 commits into base: master
6 changes: 6 additions & 0 deletions .gitignore
@@ -55,3 +55,9 @@ docs/_build/
fpstats
fpmstats
tmp.fpdb

# Dev Stuff
venv/
tests/
precomppkdir/
queries/
60 changes: 30 additions & 30 deletions Makefile
@@ -13,70 +13,70 @@ test: test_onecore test_onecore_precomp test_onecore_newmerge test_onecore_preco
rm -f fpdbase*.pklz

test_onecore: fpdbase.pklz
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3

test_remove: fpdbase.pklz
${AUDFPRINT} remove --dbase fpdbase.pklz Nine_Lives/05-Full_Circle.mp3 Nine_Lives/01-Nine_Lives.mp3
${AUDFPRINT} remove --dbase fpdbase.pklz tests/data/Nine_Lives/05-Full_Circle.mp3 tests/data/Nine_Lives/01-Nine_Lives.mp3
${AUDFPRINT} list --dbase fpdbase.pklz
${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/01-Nine_Lives.mp3 Nine_Lives/05-Full_Circle.mp3
${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/01-Nine_Lives.mp3 tests/data/Nine_Lives/05-Full_Circle.mp3
${AUDFPRINT} list --dbase fpdbase.pklz
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3

fpdbase.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
${AUDFPRINT} new --dbase fpdbase.pklz Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/1*.mp3
${AUDFPRINT} new --dbase fpdbase.pklz tests/data/Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/1*.mp3

test_onecore_precomp: precompdir
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz
${AUDFPRINT} match --dbase fpdbase1.pklz precompdir/query.afpt
${AUDFPRINT} match --dbase fpdbase1.pklz precompdir/tests/data/query.afpt

test_onecore_newmerge: precompdir
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1*
rm -f fpdbase2.pklz
${AUDFPRINT} newmerge --dbase fpdbase2.pklz fpdbase0.pklz fpdbase1.pklz
${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/query.afpt
${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/tests/data/query.afpt

precompdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precompdir
mkdir precompdir
${AUDFPRINT} precompute --precompdir precompdir Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompdir precompdir --shifts 4 query.mp3
${AUDFPRINT} precompute --precompdir precompdir tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompdir precompdir --shifts 4 tests/data/query.mp3

test_onecore_precomppk: precomppkdir
${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz
${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/query.afpk
${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/tests/data/query.afpk
rm -rf precomppkdir

precomppkdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precomppkdir
mkdir precomppkdir
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 query.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 tests/data/query.mp3

test_mucore: fpdbase_mu.pklz
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 query.mp3
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 tests/data/query.mp3

fpdbase_mu.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/1*.mp3
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/1*.mp3

test_mucore_precomp: precompdir_mu
${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase_mu.pklz fpdbase_mu0.pklz
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt

precompdir_mu: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precompdir_mu
mkdir precompdir_mu
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu Nine_Lives/*.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 query.mp3 query.mp3 query.mp3 query.mp3 query.mp3 query.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3

test_hash_mask:
${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 Nine_Lives/*.mp3
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
test_hash_mask:
${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 tests/data/Nine_Lives/*.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3
3 changes: 1 addition & 2 deletions README.md
@@ -46,6 +46,7 @@ Options:
-v <val>, --verbose <val> Verbosity level [default: 1]
-I, --illustrate Make a plot showing the match
-J, --illustrate-hpf Plot the match, using onset enhancement
-O, --json Return json object instead of string
-W <dir>, --wavdir <dir> Find sound files under this dir [default: ]
-V <ext>, --wavext <ext> Extension to add to wav file names [default: ]
--version Report version number
@@ -145,5 +146,3 @@ Scaling
The fingerprint database records 2^20 (~1M) distinct fingerprints, with (by default) 100 entries for each fingerprint bucket. When the bucket fills, track entries are dropped at random; since matching depends only on making a minimum number of matches, but no particular match, dropping some of the more popular ones does not prevent matching. The Matlab version has been successfully used for databases of 100k+ tracks. Reducing the hash density (`--density`) leads to smaller reference database size, and the capacity to record more reference items before buckets begin to fill; a density of 7.0 works well.

Times (in units of 256 samples, i.e., 23 ms at the default 11kHz sampling rate) are stored in the bottom 14 bits of each database entry, meaning that times larger than 2^14*0.023 = 380 sec, or about 6 mins, are aliased. If you want to correctly identify time offsets in tracks longer than this, you need to use a larger `--maxtimebits`; e.g. `--maxtimebits 16` increases the time range to 65,536 frames, or about 25 minutes at 11 kHz. The trade-off is that the remaining bits in each 32 bit entry (i.e., 18 bits for the default 14 bit times) are used to store the track ID. Thus, by default, the database can only remember 2^18 = 262k tracks; using a larger `--maxtimebits` will reduce this; similarly, you can increase the number of distinct tracks by reducing `--maxtimebits`, which doesn't prevent matching tracks, but progressively reduces discrimination as the number of distinct time slots reduces (and can make the reported time offsets, and time ranges for `--find-time-ranges`, completely wrong for longer tracks).
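As a rough check on the arithmetic in the two paragraphs above, the sketch below recomputes the quoted figures. It is an illustrative aside, not part of this diff; the constants are assumptions taken from the README defaults (2^20 buckets, 100-entry bucket depth, ~23 ms frames at an assumed 11025 Hz rate, density 20 hashes/sec).

```python
# Illustrative sketch: capacity and time/track-ID trade-offs described above.
n_buckets = 2 ** 20          # distinct fingerprint hashes
bucket_depth = 100           # entries kept per hash before random dropping starts
density = 20.0               # assumed default hashes/sec; --density 7.0 fills slower
frame_sec = 256 / 11025.0    # one stored time unit, ~23 ms at 11 kHz

total_entries = n_buckets * bucket_depth
hours = total_entries / density / 3600
print("entries before buckets fill: %d (~%.0f hours of audio at density %.1f)"
      % (total_entries, hours, density))

for maxtimebits in (14, 16):
    trackbits = 32 - maxtimebits   # remaining bits of each 32-bit entry hold the track ID
    print("maxtimebits=%d: times alias after %.0f s, up to %d distinct tracks"
          % (maxtimebits, (2 ** maxtimebits) * frame_sec, 2 ** trackbits))
```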


26 changes: 18 additions & 8 deletions audfprint.py
@@ -27,12 +27,16 @@ def filename_list_iterator(filelist, wavdir, wavext, listflag):
""" Iterator to yeild all the filenames, possibly interpreting them
as list files, prepending wavdir """
if not listflag:
# print(filelist)
for filename in filelist:
yield os.path.join(wavdir, filename + wavext)
# print('listflag', os.path.join(wavdir, filename + wavext))
yield filename#os.path.join(wavdir, filename + wavext)
else:
# print(filelist)
for listfilename in filelist:
with open(listfilename, 'r') as f:
for filename in f:
# print('|', os.path.join(wavdir, filename.rstrip('\n') + wavext))
yield os.path.join(wavdir, filename.rstrip('\n') + wavext)


@@ -122,7 +126,7 @@ def make_ht_from_list(analyzer, filelist, hashbits, depth, maxtime, pipe=None):
# Add in the files
for filename in filelist:
hashes = analyzer.wavfile2hashes(filename)
ht.store(filename, hashes)
ht.store(filename, hashes, analyzer.density)
# Pass back to caller
if pipe:
pipe.send(ht)
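This commit passes the analyzer's density through to store(); the matching change to hash_table.py is not included in this excerpt, and later in this diff glob2hashtable reads a densityperid attribute from the table. A minimal sketch of how such a signature might look under those assumptions follows; the names and internals here are hypothetical, not the library's actual code.

```python
# Hypothetical sketch only: hash_table.py is not shown in this diff, so the
# real store() signature and storage layout may differ. This just illustrates
# one way a per-track density could be recorded alongside the hashes.
class HashTable:
    def __init__(self):
        self.names = []          # track id -> source filename
        self.densityperid = []   # track id -> density used when fingerprinting
        self.table = {}          # hash -> list of (track_id, time) entries

    def store(self, name, hashes, density=None):
        track_id = len(self.names)
        self.names.append(name)
        self.densityperid.append(density)
        for time_, hash_ in hashes:  # wavfile2hashes yields (time, hash) rows
            self.table.setdefault(hash_, []).append((track_id, time_))
        return track_id
```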
@@ -153,17 +157,21 @@ def do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher, outdir, type, report
elif cmd == 'match':
# Running query, single-core mode
for num, filename in enumerate(filename_iter):
msgs = matcher.file_match_to_msgs(analyzer, hash_tab, filename, num)
report(msgs)
if matcher.json:
obj = matcher.file_match_to_objs(analyzer, hash_tab, filename, num)
print(obj)
else:
msgs = matcher.file_match_to_msgs(analyzer, hash_tab, filename, num)
report(msgs)

elif cmd == 'new' or cmd == 'add':
# Adding files
tothashes = 0
ix = 0
for filename in filename_iter:
report([time.ctime() + " ingesting #" + str(ix) + ": "
+ filename + " ..."])
dur, nhash = analyzer.ingest(hash_tab, filename)
# report([time.ctime() + " ingesting #" + str(ix) +" : "+ filename + " "+ str(hash_table.track_duration(filename))+"s ..."+str(nhash/dur)+"hashes/s"])
report(["ingesting # {} : track: {}, duration[sec]: {}, density[hashes/sec]: {} ".format(str(ix), filename, str(hash_table.track_duration(filename)), str(nhash//dur))])
tothashes += nhash
ix += 1
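
With the new -O/--json option added in this hunk, the match branch above prints the object returned by file_match_to_objs instead of the formatted report lines. A hedged usage sketch follows; it assumes the flag is driven from the command line and that the printed object may or may not be strict JSON (file_match_to_objs is not shown in this diff), so the parse is guarded.

```python
# Hypothetical usage sketch, not part of this PR: run a match with --json and
# try to parse whatever line of stdout is valid JSON; otherwise keep raw text.
import json
import subprocess

proc = subprocess.run(
    ["python", "audfprint.py", "match", "--dbase", "fpdbase.pklz",
     "--json", "tests/data/query.mp3"],
    capture_output=True, text=True, check=True)

result = None
for line in proc.stdout.splitlines():
    try:
        result = json.loads(line)   # keep the last parseable line
    except ValueError:
        continue
print(result if result is not None else proc.stdout)
```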

@@ -280,7 +288,7 @@ def setup_analyzer(args):
# set default value for shifts depending on mode
if analyzer.shifts == 0:
# Default shift is 4 for match, otherwise 1
analyzer.shifts = 4 if args['match'] else 1
analyzer.shifts = 4 if args.get('match') else 1
analyzer.fail_on_error = not args['--continue-on-error']
return analyzer

@@ -297,6 +305,7 @@ def setup_matcher(args):
matcher.exact_count = args['--exact-count'] | args['--illustrate'] | args['--illustrate-hpf']
matcher.illustrate = args['--illustrate'] | args['--illustrate-hpf']
matcher.illustrate_hpf = args['--illustrate-hpf']
matcher.json = args.get('--json')
matcher.verbose = args['--verbose']
matcher.find_time_range = args['--find-time-range']
matcher.time_quantile = float(args['--time-quantile'])
@@ -366,6 +375,7 @@ def report(msglist):
-v <val>, --verbose <val> Verbosity level [default: 1]
-I, --illustrate Make a plot showing the match
-J, --illustrate-hpf Plot the match, using onset enhancement
-O, --json Return json object instead of string
-W <dir>, --wavdir <dir> Find sound files under this dir [default: ]
-V <ext>, --wavext <ext> Extension to add to wav file names [default: ]
--version Report version number
@@ -473,7 +483,7 @@ def main(argv):
strip_prefix=args['--wavdir'])

elapsedtime = time.clock() - initticks
if analyzer and analyzer.soundfiletotaldur > 0.:
if analyzer and analyzer.soundfiletotaldur > 0. and not args['--verbose']:
print("Processed "
+ "%d files (%.1f s total dur) in %.1f s sec = %.3f x RT" \
% (analyzer.soundfilecount, analyzer.soundfiletotaldur,
11 changes: 6 additions & 5 deletions audfprint_analyze.py
@@ -268,6 +268,7 @@ def find_peaks(self, d, sr):
n_hop/sr secs), second is the FFT bin (in units of sr/n_fft
Hz).
"""

if len(d) == 0:
return []

@@ -345,7 +346,9 @@ def wavfile2peaks(self, filename, shifts=None):
list of (time, bin) pairs. If specified, resample to sr first.
shifts > 1 causes hashes to be extracted from multiple shifts of
waveform, to reduce frame effects. """
ext = os.path.splitext(filename)[1]

_, ext = os.path.splitext(filename)

if ext == PRECOMPPKEXT:
# short-circuit - precomputed fingerprint file
peaks = peaks_load(filename)
@@ -419,8 +422,6 @@ def wavfile2hashes(self, filename):
]).astype(np.int32)
hashes = unique_hashes
# Or simply np.unique(query_hashes, axis=0) for numpy >= 1.13

# print("wavfile2hashes: read", len(hashes), "hashes from", filename)
return hashes

# ########## functions to link to actual hash table index database ###### #
Expand Down Expand Up @@ -448,7 +449,7 @@ def ingest(self, hashtable, filename):
# n_fft=n_fft,
# n_hop=n_hop)))
hashes = self.wavfile2hashes(filename)
hashtable.store(filename, hashes)
hashtable.store(filename, hashes, self.density)
# return (len(d)/float(sr), len(hashes))
# return (np.max(hashes, axis=0)[0]*n_hop/float(sr), len(hashes))
# soundfiledur is set up in wavfile2hashes, use result here
Expand Down Expand Up @@ -567,7 +568,7 @@ def glob2hashtable(pattern, density=20.0):
totdur = 0.0
tothashes = 0
for ix, file_ in enumerate(filelist):
print(time.ctime(), "ingesting #", ix, ":", file_, "...")
print(time.ctime(), "ingesting #", ix, ":", file_, track_duration(ix), ht.densityperid[ix], "...")
dur, nhash = g2h_analyzer.ingest(ht, file_)
totdur += dur
tothashes += nhash