diff --git a/trackeval/__init__.py b/trackeval/__init__.py deleted file mode 100644 index dce62da..0000000 --- a/trackeval/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .eval import Evaluator -from . import datasets -from . import metrics -from . import plotting -from . import utils diff --git a/trackeval/_timing.py b/trackeval/_timing.py deleted file mode 100644 index ffb3caa..0000000 --- a/trackeval/_timing.py +++ /dev/null @@ -1,70 +0,0 @@ -from functools import wraps -from time import perf_counter -import inspect - -DO_TIMING = False -DISPLAY_LESS_PROGRESS = False -timer_dict = {} -counter = 0 - - -def time(f): - @wraps(f) - def wrap(*args, **kw): - if DO_TIMING: - # Run function with timing - ts = perf_counter() - result = f(*args, **kw) - te = perf_counter() - tt = te-ts - - # Get function name - arg_names = inspect.getfullargspec(f)[0] - if arg_names[0] == 'self' and DISPLAY_LESS_PROGRESS: - return result - elif arg_names[0] == 'self': - method_name = type(args[0]).__name__ + '.' + f.__name__ - else: - method_name = f.__name__ - - # Record accumulative time in each function for analysis - if method_name in timer_dict.keys(): - timer_dict[method_name] += tt - else: - timer_dict[method_name] = tt - - # If code is finished, display timing summary - if method_name == "Evaluator.evaluate": - print("") - print("Timing analysis:") - for key, value in timer_dict.items(): - print('%-70s %2.4f sec' % (key, value)) - else: - # Get function argument values for printing special arguments of interest - arg_titles = ['tracker', 'cls'] # , 'seq' - arg_vals = [] - for i, a in enumerate(arg_names): - if a == 'seq': - if isinstance(args[i], tuple): - arg_vals.append(args[i][0] + '_' + args[i][1]) - else: - arg_vals.append(args[i]) - if a in arg_titles: - arg_vals.append(args[i]) - arg_text = '(' + ', '.join(arg_vals) + ')' - - # Display methods and functions with different indentation. - if arg_names[0] == 'self': - print('%-74s %2.4f sec' % (' '*4 + method_name + arg_text, tt)) - elif arg_names[0] == 'test': - pass - else: - global counter - counter += 1 - print('%i %-70s %2.4f sec' % (counter, method_name + arg_text, tt)) - - return result - else: - # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing. - return f(*args, **kw) - return wrap diff --git a/trackeval/baselines/__init__.py b/trackeval/baselines/__init__.py deleted file mode 100644 index ddc9864..0000000 --- a/trackeval/baselines/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import baseline_utils -import stp -import non_overlap -import pascal_colormap -import thresholder -import vizualize \ No newline at end of file diff --git a/trackeval/baselines/baseline_utils.py b/trackeval/baselines/baseline_utils.py deleted file mode 100644 index b6c88fd..0000000 --- a/trackeval/baselines/baseline_utils.py +++ /dev/null @@ -1,321 +0,0 @@ - -import os -import csv -import numpy as np -from copy import deepcopy -from PIL import Image -from pycocotools import mask as mask_utils -from scipy.optimize import linear_sum_assignment -from trackeval.baselines.pascal_colormap import pascal_colormap - - -def load_seq(file_to_load): - """ Load input data from file in RobMOTS format (e.g. provided detections). 
- Returns: Data object with the following structure (see STP : - data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - """ - fp = open(file_to_load) - dialect = csv.Sniffer().sniff(fp.readline(), delimiters=' ') - dialect.skipinitialspace = True - fp.seek(0) - reader = csv.reader(fp, dialect) - read_data = {} - num_timesteps = 0 - for i, row in enumerate(reader): - if row[-1] in '': - row = row[:-1] - t = int(row[0]) - cid = row[1] - c = int(row[2]) - s = row[3] - h = row[4] - w = row[5] - rle = row[6] - - if t >= num_timesteps: - num_timesteps = t + 1 - - if c in read_data.keys(): - if t in read_data[c].keys(): - read_data[c][t]['ids'].append(cid) - read_data[c][t]['scores'].append(s) - read_data[c][t]['im_hs'].append(h) - read_data[c][t]['im_ws'].append(w) - read_data[c][t]['mask_rles'].append(rle) - else: - read_data[c][t] = {} - read_data[c][t]['ids'] = [cid] - read_data[c][t]['scores'] = [s] - read_data[c][t]['im_hs'] = [h] - read_data[c][t]['im_ws'] = [w] - read_data[c][t]['mask_rles'] = [rle] - else: - read_data[c] = {t: {}} - read_data[c][t]['ids'] = [cid] - read_data[c][t]['scores'] = [s] - read_data[c][t]['im_hs'] = [h] - read_data[c][t]['im_ws'] = [w] - read_data[c][t]['mask_rles'] = [rle] - fp.close() - - data = {} - for c in read_data.keys(): - data[c] = [{} for _ in range(num_timesteps)] - for t in range(num_timesteps): - if t in read_data[c].keys(): - data[c][t]['ids'] = np.atleast_1d(read_data[c][t]['ids']).astype(int) - data[c][t]['scores'] = np.atleast_1d(read_data[c][t]['scores']).astype(float) - data[c][t]['im_hs'] = np.atleast_1d(read_data[c][t]['im_hs']).astype(int) - data[c][t]['im_ws'] = np.atleast_1d(read_data[c][t]['im_ws']).astype(int) - data[c][t]['mask_rles'] = np.atleast_1d(read_data[c][t]['mask_rles']).astype(str) - else: - data[c][t]['ids'] = np.empty(0).astype(int) - data[c][t]['scores'] = np.empty(0).astype(float) - data[c][t]['im_hs'] = np.empty(0).astype(int) - data[c][t]['im_ws'] = np.empty(0).astype(int) - data[c][t]['mask_rles'] = np.empty(0).astype(str) - return data - - -def threshold(tdata, thresh): - """ Removes detections below a certian threshold ('thresh') score. """ - new_data = {} - to_keep = tdata['scores'] > thresh - for field in ['ids', 'scores', 'im_hs', 'im_ws', 'mask_rles']: - new_data[field] = tdata[field][to_keep] - return new_data - - -def create_coco_mask(mask_rles, im_hs, im_ws): - """ Converts mask as rle text (+ height and width) to encoded version used by pycocotools. """ - coco_masks = [{'size': [h, w], 'counts': m.encode(encoding='UTF-8')} - for h, w, m in zip(im_hs, im_ws, mask_rles)] - return coco_masks - - -def mask_iou(mask_rles1, mask_rles2, im_hs, im_ws, do_ioa=0): - """ Calculate mask IoU between two masks. - Further allows 'intersection over area' instead of IoU (over the area of mask_rle1). - Allows either to pass in 1 boolean for do_ioa for all mask_rles2 or also one for each mask_rles2. - It is recommended that mask_rles1 is a detection and mask_rles2 is a groundtruth. 
- """ - coco_masks1 = create_coco_mask(mask_rles1, im_hs, im_ws) - coco_masks2 = create_coco_mask(mask_rles2, im_hs, im_ws) - - if not hasattr(do_ioa, "__len__"): - do_ioa = [do_ioa]*len(coco_masks2) - assert(len(coco_masks2) == len(do_ioa)) - if len(coco_masks1) == 0 or len(coco_masks2) == 0: - iou = np.zeros(len(coco_masks1), len(coco_masks2)) - else: - iou = mask_utils.iou(coco_masks1, coco_masks2, do_ioa) - return iou - - -def sort_by_score(t_data): - """ Sorts data by score """ - sort_index = np.argsort(t_data['scores'])[::-1] - for k in t_data.keys(): - t_data[k] = t_data[k][sort_index] - return t_data - - -def mask_NMS(t_data, nms_threshold=0.5, already_sorted=False): - """ Remove redundant masks by performing non-maximum suppression (NMS) """ - - # Sort by score - if not already_sorted: - t_data = sort_by_score(t_data) - - # Calculate the mask IoU between all detections in the timestep. - mask_ious_all = mask_iou(t_data['mask_rles'], t_data['mask_rles'], t_data['im_hs'], t_data['im_ws']) - - # Determine which masks NMS should remove - # (those overlapping greater than nms_threshold with another mask that has a higher score) - num_dets = len(t_data['mask_rles']) - to_remove = [False for _ in range(num_dets)] - for i in range(num_dets): - if not to_remove[i]: - for j in range(i + 1, num_dets): - if mask_ious_all[i, j] > nms_threshold: - to_remove[j] = True - - # Remove detections which should be removed - to_keep = np.logical_not(to_remove) - for k in t_data.keys(): - t_data[k] = t_data[k][to_keep] - - return t_data - - -def non_overlap(t_data, already_sorted=False): - """ Enforces masks to be non-overlapping in an image, does this by putting masks 'on top of one another', - such that higher score masks 'occlude' and thus remove parts of lower scoring masks. - - Help wanted: if anyone knows a way to do this WITHOUT converting the RLE to the np.array let me know, because that - would be MUCH more efficient. (I have tried, but haven't yet had success). - """ - - # Sort by score - if not already_sorted: - t_data = sort_by_score(t_data) - - # Get coco masks - coco_masks = create_coco_mask(t_data['mask_rles'], t_data['im_hs'], t_data['im_ws']) - - # Create a single np.array to hold all of the non-overlapping mask - masks_array = np.zeros((t_data['im_hs'][0], t_data['im_ws'][0]), 'uint8') - - # Decode each mask into a np.array, and place it into the overall array for the whole frame. - # Since masks with the lowest score are placed first, they are 'partially overridden' by masks with a higher score - # if they overlap. - for i, mask in enumerate(coco_masks[::-1]): - masks_array[mask_utils.decode(mask).astype('bool')] = i + 1 - - # Encode the resulting np.array back into a set of coco_masks which are now non-overlapping. - num_dets = len(coco_masks) - for i, j in enumerate(range(1, num_dets + 1)[::-1]): - coco_masks[i] = mask_utils.encode(np.asfortranarray(masks_array == j, dtype=np.uint8)) - - # Convert from coco_mask back into our mask_rle format. - t_data['mask_rles'] = [m['counts'].decode("utf-8") for m in coco_masks] - - return t_data - - -def masks2boxes(mask_rles, im_hs, im_ws): - """ Extracts bounding boxes which surround a set of masks. """ - coco_masks = create_coco_mask(mask_rles, im_hs, im_ws) - boxes = np.array([mask_utils.toBbox(x) for x in coco_masks]) - if len(boxes) == 0: - boxes = np.empty((0, 4)) - return boxes - - -def box_iou(bboxes1, bboxes2, box_format='xywh', do_ioa=False, do_giou=False): - """ Calculates the IOU (intersection over union) between two arrays of boxes. 
- Allows variable box formats ('xywh' and 'x0y0x1y1'). - If do_ioa (intersection over area), then calculates the intersection over the area of boxes1 - this is commonly - used to determine if detections are within crowd ignore region. - If do_giou (generalized intersection over union, then calculates giou. - """ - if len(bboxes1) == 0 or len(bboxes2) == 0: - ious = np.zeros((len(bboxes1), len(bboxes2))) - return ious - if box_format in 'xywh': - # layout: (x0, y0, w, h) - bboxes1 = deepcopy(bboxes1) - bboxes2 = deepcopy(bboxes2) - - bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2] - bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3] - bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2] - bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3] - elif box_format not in 'x0y0x1y1': - raise (Exception('box_format %s is not implemented' % box_format)) - - # layout: (x0, y0, x1, y1) - min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) - max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) - intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(min_[..., 3] - max_[..., 1], 0) - area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) - - if do_ioa: - ioas = np.zeros_like(intersection) - valid_mask = area1 > 0 + np.finfo('float').eps - ioas[valid_mask, :] = intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis] - - return ioas - else: - area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) - union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection - intersection[area1 <= 0 + np.finfo('float').eps, :] = 0 - intersection[:, area2 <= 0 + np.finfo('float').eps] = 0 - intersection[union <= 0 + np.finfo('float').eps] = 0 - union[union <= 0 + np.finfo('float').eps] = 1 - ious = intersection / union - - if do_giou: - enclosing_area = np.maximum(max_[..., 2] - min_[..., 0], 0) * np.maximum(max_[..., 3] - min_[..., 1], 0) - eps = 1e-7 - # giou - ious = ious - ((enclosing_area - union) / (enclosing_area + eps)) - - return ious - - -def match(match_scores): - match_rows, match_cols = linear_sum_assignment(-match_scores) - return match_rows, match_cols - - -def write_seq(output_data, out_file): - out_loc = os.path.dirname(out_file) - if not os.path.exists(out_loc): - os.makedirs(out_loc, exist_ok=True) - fp = open(out_file, 'w', newline='') - writer = csv.writer(fp, delimiter=' ') - for row in output_data: - writer.writerow(row) - fp.close() - - -def combine_classes(data): - """ Converts data from a class-separated to a class-combined format. 
- Input format: data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - Output format: data[t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles', 'cls'} - """ - output_data = [{} for _ in list(data.values())[0]] - for cls, cls_data in data.items(): - for timestep, t_data in enumerate(cls_data): - for k in t_data.keys(): - if k in output_data[timestep].keys(): - output_data[timestep][k] += list(t_data[k]) - else: - output_data[timestep][k] = list(t_data[k]) - if 'cls' in output_data[timestep].keys(): - output_data[timestep]['cls'] += [cls]*len(output_data[timestep]['ids']) - else: - output_data[timestep]['cls'] = [cls]*len(output_data[timestep]['ids']) - - for timestep, t_data in enumerate(output_data): - for k in t_data.keys(): - output_data[timestep][k] = np.array(output_data[timestep][k]) - - return output_data - - -def save_as_png(t_data, out_file, im_h, im_w): - """ Save a set of segmentation masks into a PNG format, the same as used for the DAVIS dataset.""" - - if len(t_data['mask_rles']) > 0: - coco_masks = create_coco_mask(t_data['mask_rles'], t_data['im_hs'], t_data['im_ws']) - - list_of_np_masks = [mask_utils.decode(mask) for mask in coco_masks] - - png = np.zeros((t_data['im_hs'][0], t_data['im_ws'][0])) - for mask, c_id in zip(list_of_np_masks, t_data['ids']): - png[mask.astype("bool")] = c_id + 1 - else: - png = np.zeros((im_h, im_w)) - - if not os.path.exists(os.path.dirname(out_file)): - os.makedirs(os.path.dirname(out_file)) - - colmap = (np.array(pascal_colormap) * 255).round().astype("uint8") - palimage = Image.new('P', (16, 16)) - palimage.putpalette(colmap) - im = Image.fromarray(np.squeeze(png.astype("uint8"))) - im2 = im.quantize(palette=palimage) - im2.save(out_file) - - -def get_frame_size(data): - """ Gets frame height and width from data. """ - for cls, cls_data in data.items(): - for timestep, t_data in enumerate(cls_data): - if len(t_data['im_hs'] > 0): - im_h = t_data['im_hs'][0] - im_w = t_data['im_ws'][0] - return im_h, im_w - return None diff --git a/trackeval/baselines/non_overlap.py b/trackeval/baselines/non_overlap.py deleted file mode 100644 index 43b131d..0000000 --- a/trackeval/baselines/non_overlap.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Non-Overlap: Code to take in a set of raw detections and produce a set of non-overlapping detections from it. - -Author: Jonathon Luiten -""" - -import os -import sys -from multiprocessing.pool import Pool -from multiprocessing import freeze_support - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) -from trackeval.baselines import baseline_utils as butils -from trackeval.utils import get_code_path - -code_path = get_code_path() -config = { - 'INPUT_FOL': os.path.join(code_path, 'data/detections/rob_mots/{split}/raw_supplied/data/'), - 'OUTPUT_FOL': os.path.join(code_path, 'data/detections/rob_mots/{split}/non_overlap_supplied/data/'), - 'SPLIT': 'train', # valid: 'train', 'val', 'test'. - 'Benchmarks': None, # If None, all benchmarks in SPLIT. - - 'Num_Parallel_Cores': None, # If None, run without parallel. - - 'THRESHOLD_NMS_MASK_IOU': 0.5, -} - - -def do_sequence(seq_file): - - # Load input data from file (e.g. provided detections) - # data format: data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - data = butils.load_seq(seq_file) - - # Converts data from a class-separated to a class-combined format. 
- # data[t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles', 'cls'} - data = butils.combine_classes(data) - - # Where to accumulate output data for writing out - output_data = [] - - # Run for each timestep. - for timestep, t_data in enumerate(data): - - # Remove redundant masks by performing non-maximum suppression (NMS) - t_data = butils.mask_NMS(t_data, nms_threshold=config['THRESHOLD_NMS_MASK_IOU']) - - # Perform non-overlap, to get non_overlapping masks. - t_data = butils.non_overlap(t_data, already_sorted=True) - - # Save result in output format to write to file later. - # Output Format = [timestep ID class score im_h im_w mask_RLE] - for i in range(len(t_data['ids'])): - row = [timestep, int(t_data['ids'][i]), t_data['cls'][i], t_data['scores'][i], t_data['im_hs'][i], - t_data['im_ws'][i], t_data['mask_rles'][i]] - output_data.append(row) - - # Write results to file - out_file = seq_file.replace(config['INPUT_FOL'].format(split=config['SPLIT']), - config['OUTPUT_FOL'].format(split=config['SPLIT'])) - butils.write_seq(output_data, out_file) - - print('DONE:', seq_file) - - -if __name__ == '__main__': - - # Required to fix bug in multiprocessing on windows. - freeze_support() - - # Obtain list of sequences to run tracker for. - if config['Benchmarks']: - benchmarks = config['Benchmarks'] - else: - benchmarks = ['davis_unsupervised', 'kitti_mots', 'youtube_vis', 'ovis', 'bdd_mots', 'tao'] - if config['SPLIT'] != 'train': - benchmarks += ['waymo', 'mots_challenge'] - seqs_todo = [] - for bench in benchmarks: - bench_fol = os.path.join(config['INPUT_FOL'].format(split=config['SPLIT']), bench) - seqs_todo += [os.path.join(bench_fol, seq) for seq in os.listdir(bench_fol)] - - # Run in parallel - if config['Num_Parallel_Cores']: - with Pool(config['Num_Parallel_Cores']) as pool: - results = pool.map(do_sequence, seqs_todo) - - # Run in series - else: - for seq_todo in seqs_todo: - do_sequence(seq_todo) - diff --git a/trackeval/baselines/pascal_colormap.py b/trackeval/baselines/pascal_colormap.py deleted file mode 100644 index b31f348..0000000 --- a/trackeval/baselines/pascal_colormap.py +++ /dev/null @@ -1,257 +0,0 @@ -pascal_colormap = [ - 0 , 0, 0, - 0.5020, 0, 0, - 0, 0.5020, 0, - 0.5020, 0.5020, 0, - 0, 0, 0.5020, - 0.5020, 0, 0.5020, - 0, 0.5020, 0.5020, - 0.5020, 0.5020, 0.5020, - 0.2510, 0, 0, - 0.7529, 0, 0, - 0.2510, 0.5020, 0, - 0.7529, 0.5020, 0, - 0.2510, 0, 0.5020, - 0.7529, 0, 0.5020, - 0.2510, 0.5020, 0.5020, - 0.7529, 0.5020, 0.5020, - 0, 0.2510, 0, - 0.5020, 0.2510, 0, - 0, 0.7529, 0, - 0.5020, 0.7529, 0, - 0, 0.2510, 0.5020, - 0.5020, 0.2510, 0.5020, - 0, 0.7529, 0.5020, - 0.5020, 0.7529, 0.5020, - 0.2510, 0.2510, 0, - 0.7529, 0.2510, 0, - 0.2510, 0.7529, 0, - 0.7529, 0.7529, 0, - 0.2510, 0.2510, 0.5020, - 0.7529, 0.2510, 0.5020, - 0.2510, 0.7529, 0.5020, - 0.7529, 0.7529, 0.5020, - 0, 0, 0.2510, - 0.5020, 0, 0.2510, - 0, 0.5020, 0.2510, - 0.5020, 0.5020, 0.2510, - 0, 0, 0.7529, - 0.5020, 0, 0.7529, - 0, 0.5020, 0.7529, - 0.5020, 0.5020, 0.7529, - 0.2510, 0, 0.2510, - 0.7529, 0, 0.2510, - 0.2510, 0.5020, 0.2510, - 0.7529, 0.5020, 0.2510, - 0.2510, 0, 0.7529, - 0.7529, 0, 0.7529, - 0.2510, 0.5020, 0.7529, - 0.7529, 0.5020, 0.7529, - 0, 0.2510, 0.2510, - 0.5020, 0.2510, 0.2510, - 0, 0.7529, 0.2510, - 0.5020, 0.7529, 0.2510, - 0, 0.2510, 0.7529, - 0.5020, 0.2510, 0.7529, - 0, 0.7529, 0.7529, - 0.5020, 0.7529, 0.7529, - 0.2510, 0.2510, 0.2510, - 0.7529, 0.2510, 0.2510, - 0.2510, 0.7529, 0.2510, - 0.7529, 0.7529, 0.2510, - 0.2510, 0.2510, 0.7529, - 0.7529, 0.2510, 0.7529, - 
0.2510, 0.7529, 0.7529, - 0.7529, 0.7529, 0.7529, - 0.1255, 0, 0, - 0.6275, 0, 0, - 0.1255, 0.5020, 0, - 0.6275, 0.5020, 0, - 0.1255, 0, 0.5020, - 0.6275, 0, 0.5020, - 0.1255, 0.5020, 0.5020, - 0.6275, 0.5020, 0.5020, - 0.3765, 0, 0, - 0.8784, 0, 0, - 0.3765, 0.5020, 0, - 0.8784, 0.5020, 0, - 0.3765, 0, 0.5020, - 0.8784, 0, 0.5020, - 0.3765, 0.5020, 0.5020, - 0.8784, 0.5020, 0.5020, - 0.1255, 0.2510, 0, - 0.6275, 0.2510, 0, - 0.1255, 0.7529, 0, - 0.6275, 0.7529, 0, - 0.1255, 0.2510, 0.5020, - 0.6275, 0.2510, 0.5020, - 0.1255, 0.7529, 0.5020, - 0.6275, 0.7529, 0.5020, - 0.3765, 0.2510, 0, - 0.8784, 0.2510, 0, - 0.3765, 0.7529, 0, - 0.8784, 0.7529, 0, - 0.3765, 0.2510, 0.5020, - 0.8784, 0.2510, 0.5020, - 0.3765, 0.7529, 0.5020, - 0.8784, 0.7529, 0.5020, - 0.1255, 0, 0.2510, - 0.6275, 0, 0.2510, - 0.1255, 0.5020, 0.2510, - 0.6275, 0.5020, 0.2510, - 0.1255, 0, 0.7529, - 0.6275, 0, 0.7529, - 0.1255, 0.5020, 0.7529, - 0.6275, 0.5020, 0.7529, - 0.3765, 0, 0.2510, - 0.8784, 0, 0.2510, - 0.3765, 0.5020, 0.2510, - 0.8784, 0.5020, 0.2510, - 0.3765, 0, 0.7529, - 0.8784, 0, 0.7529, - 0.3765, 0.5020, 0.7529, - 0.8784, 0.5020, 0.7529, - 0.1255, 0.2510, 0.2510, - 0.6275, 0.2510, 0.2510, - 0.1255, 0.7529, 0.2510, - 0.6275, 0.7529, 0.2510, - 0.1255, 0.2510, 0.7529, - 0.6275, 0.2510, 0.7529, - 0.1255, 0.7529, 0.7529, - 0.6275, 0.7529, 0.7529, - 0.3765, 0.2510, 0.2510, - 0.8784, 0.2510, 0.2510, - 0.3765, 0.7529, 0.2510, - 0.8784, 0.7529, 0.2510, - 0.3765, 0.2510, 0.7529, - 0.8784, 0.2510, 0.7529, - 0.3765, 0.7529, 0.7529, - 0.8784, 0.7529, 0.7529, - 0, 0.1255, 0, - 0.5020, 0.1255, 0, - 0, 0.6275, 0, - 0.5020, 0.6275, 0, - 0, 0.1255, 0.5020, - 0.5020, 0.1255, 0.5020, - 0, 0.6275, 0.5020, - 0.5020, 0.6275, 0.5020, - 0.2510, 0.1255, 0, - 0.7529, 0.1255, 0, - 0.2510, 0.6275, 0, - 0.7529, 0.6275, 0, - 0.2510, 0.1255, 0.5020, - 0.7529, 0.1255, 0.5020, - 0.2510, 0.6275, 0.5020, - 0.7529, 0.6275, 0.5020, - 0, 0.3765, 0, - 0.5020, 0.3765, 0, - 0, 0.8784, 0, - 0.5020, 0.8784, 0, - 0, 0.3765, 0.5020, - 0.5020, 0.3765, 0.5020, - 0, 0.8784, 0.5020, - 0.5020, 0.8784, 0.5020, - 0.2510, 0.3765, 0, - 0.7529, 0.3765, 0, - 0.2510, 0.8784, 0, - 0.7529, 0.8784, 0, - 0.2510, 0.3765, 0.5020, - 0.7529, 0.3765, 0.5020, - 0.2510, 0.8784, 0.5020, - 0.7529, 0.8784, 0.5020, - 0, 0.1255, 0.2510, - 0.5020, 0.1255, 0.2510, - 0, 0.6275, 0.2510, - 0.5020, 0.6275, 0.2510, - 0, 0.1255, 0.7529, - 0.5020, 0.1255, 0.7529, - 0, 0.6275, 0.7529, - 0.5020, 0.6275, 0.7529, - 0.2510, 0.1255, 0.2510, - 0.7529, 0.1255, 0.2510, - 0.2510, 0.6275, 0.2510, - 0.7529, 0.6275, 0.2510, - 0.2510, 0.1255, 0.7529, - 0.7529, 0.1255, 0.7529, - 0.2510, 0.6275, 0.7529, - 0.7529, 0.6275, 0.7529, - 0, 0.3765, 0.2510, - 0.5020, 0.3765, 0.2510, - 0, 0.8784, 0.2510, - 0.5020, 0.8784, 0.2510, - 0, 0.3765, 0.7529, - 0.5020, 0.3765, 0.7529, - 0, 0.8784, 0.7529, - 0.5020, 0.8784, 0.7529, - 0.2510, 0.3765, 0.2510, - 0.7529, 0.3765, 0.2510, - 0.2510, 0.8784, 0.2510, - 0.7529, 0.8784, 0.2510, - 0.2510, 0.3765, 0.7529, - 0.7529, 0.3765, 0.7529, - 0.2510, 0.8784, 0.7529, - 0.7529, 0.8784, 0.7529, - 0.1255, 0.1255, 0, - 0.6275, 0.1255, 0, - 0.1255, 0.6275, 0, - 0.6275, 0.6275, 0, - 0.1255, 0.1255, 0.5020, - 0.6275, 0.1255, 0.5020, - 0.1255, 0.6275, 0.5020, - 0.6275, 0.6275, 0.5020, - 0.3765, 0.1255, 0, - 0.8784, 0.1255, 0, - 0.3765, 0.6275, 0, - 0.8784, 0.6275, 0, - 0.3765, 0.1255, 0.5020, - 0.8784, 0.1255, 0.5020, - 0.3765, 0.6275, 0.5020, - 0.8784, 0.6275, 0.5020, - 0.1255, 0.3765, 0, - 0.6275, 0.3765, 0, - 0.1255, 0.8784, 0, - 0.6275, 0.8784, 0, - 0.1255, 0.3765, 0.5020, - 0.6275, 
0.3765, 0.5020, - 0.1255, 0.8784, 0.5020, - 0.6275, 0.8784, 0.5020, - 0.3765, 0.3765, 0, - 0.8784, 0.3765, 0, - 0.3765, 0.8784, 0, - 0.8784, 0.8784, 0, - 0.3765, 0.3765, 0.5020, - 0.8784, 0.3765, 0.5020, - 0.3765, 0.8784, 0.5020, - 0.8784, 0.8784, 0.5020, - 0.1255, 0.1255, 0.2510, - 0.6275, 0.1255, 0.2510, - 0.1255, 0.6275, 0.2510, - 0.6275, 0.6275, 0.2510, - 0.1255, 0.1255, 0.7529, - 0.6275, 0.1255, 0.7529, - 0.1255, 0.6275, 0.7529, - 0.6275, 0.6275, 0.7529, - 0.3765, 0.1255, 0.2510, - 0.8784, 0.1255, 0.2510, - 0.3765, 0.6275, 0.2510, - 0.8784, 0.6275, 0.2510, - 0.3765, 0.1255, 0.7529, - 0.8784, 0.1255, 0.7529, - 0.3765, 0.6275, 0.7529, - 0.8784, 0.6275, 0.7529, - 0.1255, 0.3765, 0.2510, - 0.6275, 0.3765, 0.2510, - 0.1255, 0.8784, 0.2510, - 0.6275, 0.8784, 0.2510, - 0.1255, 0.3765, 0.7529, - 0.6275, 0.3765, 0.7529, - 0.1255, 0.8784, 0.7529, - 0.6275, 0.8784, 0.7529, - 0.3765, 0.3765, 0.2510, - 0.8784, 0.3765, 0.2510, - 0.3765, 0.8784, 0.2510, - 0.8784, 0.8784, 0.2510, - 0.3765, 0.3765, 0.7529, - 0.8784, 0.3765, 0.7529, - 0.3765, 0.8784, 0.7529, - 0.8784, 0.8784, 0.7529] \ No newline at end of file diff --git a/trackeval/baselines/stp.py b/trackeval/baselines/stp.py deleted file mode 100644 index c1c9d1e..0000000 --- a/trackeval/baselines/stp.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -STP: Simplest Tracker Possible - -Author: Jonathon Luiten - -This simple tracker, simply assigns track IDs which maximise the 'bounding box IoU' between previous tracks and current -detections. It is also able to match detections to tracks at more than one timestep previously. -""" - -import os -import sys -import numpy as np -from multiprocessing.pool import Pool -from multiprocessing import freeze_support - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) -from trackeval.baselines import baseline_utils as butils -from trackeval.utils import get_code_path - -code_path = get_code_path() -config = { - 'INPUT_FOL': os.path.join(code_path, 'data/detections/rob_mots/{split}/non_overlap_supplied/data/'), - 'OUTPUT_FOL': os.path.join(code_path, 'data/trackers/rob_mots/{split}/STP/data/'), - 'SPLIT': 'train', # valid: 'train', 'val', 'test'. - 'Benchmarks': None, # If None, all benchmarks in SPLIT. - - 'Num_Parallel_Cores': None, # If None, run without parallel. - - 'DETECTION_THRESHOLD': 0.5, - 'ASSOCIATION_THRESHOLD': 1e-10, - 'MAX_FRAMES_SKIP': 7 -} - - -def track_sequence(seq_file): - - # Load input data from file (e.g. provided detections) - # data format: data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - data = butils.load_seq(seq_file) - - # Where to accumulate output data for writing out - output_data = [] - - # To ensure IDs are unique per object across all classes. - curr_max_id = 0 - - # Run tracker for each class. - for cls, cls_data in data.items(): - - # Initialize container for holding previously tracked objects. - prev = {'boxes': np.empty((0, 4)), - 'ids': np.array([], np.int), - 'timesteps': np.array([])} - - # Run tracker for each timestep. - for timestep, t_data in enumerate(cls_data): - - # Threshold detections. - t_data = butils.threshold(t_data, config['DETECTION_THRESHOLD']) - - # Convert mask dets to bounding boxes. - boxes = butils.masks2boxes(t_data['mask_rles'], t_data['im_hs'], t_data['im_ws']) - - # Calculate IoU between previous and current frame dets. - ious = butils.box_iou(prev['boxes'], boxes) - - # Score which decreases quickly for previous dets depending on how many timesteps before they come from. 
- prev_timestep_scores = np.power(10, -1 * prev['timesteps']) - - # Matching score is such that it first tries to match 'most recent timesteps', - # and within each timestep maximised IoU. - match_scores = prev_timestep_scores[:, np.newaxis] * ious - - # Find best matching between current dets and previous tracks. - match_rows, match_cols = butils.match(match_scores) - - # Remove matches that have an IoU below a certain threshold. - actually_matched_mask = ious[match_rows, match_cols] > config['ASSOCIATION_THRESHOLD'] - match_rows = match_rows[actually_matched_mask] - match_cols = match_cols[actually_matched_mask] - - # Assign the prev track ID to the current dets if they were matched. - ids = np.nan * np.ones((len(boxes),), np.int) - ids[match_cols] = prev['ids'][match_rows] - - # Create new track IDs for dets that were not matched to previous tracks. - num_not_matched = len(ids) - len(match_cols) - new_ids = np.arange(curr_max_id + 1, curr_max_id + num_not_matched + 1) - ids[np.isnan(ids)] = new_ids - - # Update maximum ID to ensure future added tracks have a unique ID value. - curr_max_id += num_not_matched - - # Drop tracks from 'previous tracks' if they have not been matched in the last MAX_FRAMES_SKIP frames. - unmatched_rows = [i for i in range(len(prev['ids'])) if - i not in match_rows and (prev['timesteps'][i] + 1 <= config['MAX_FRAMES_SKIP'])] - - # Update the set of previous tracking results to include the newly tracked detections. - prev['ids'] = np.concatenate((ids, prev['ids'][unmatched_rows]), axis=0) - prev['boxes'] = np.concatenate((np.atleast_2d(boxes), np.atleast_2d(prev['boxes'][unmatched_rows])), axis=0) - prev['timesteps'] = np.concatenate((np.zeros((len(ids),)), prev['timesteps'][unmatched_rows] + 1), axis=0) - - # Save result in output format to write to file later. - # Output Format = [timestep ID class score im_h im_w mask_RLE] - for i in range(len(t_data['ids'])): - row = [timestep, int(ids[i]), cls, t_data['scores'][i], t_data['im_hs'][i], t_data['im_ws'][i], - t_data['mask_rles'][i]] - output_data.append(row) - - # Write results to file - out_file = seq_file.replace(config['INPUT_FOL'].format(split=config['SPLIT']), - config['OUTPUT_FOL'].format(split=config['SPLIT'])) - butils.write_seq(output_data, out_file) - - print('DONE:', seq_file) - - -if __name__ == '__main__': - - # Required to fix bug in multiprocessing on windows. - freeze_support() - - # Obtain list of sequences to run tracker for. - if config['Benchmarks']: - benchmarks = config['Benchmarks'] - else: - benchmarks = ['davis_unsupervised', 'kitti_mots', 'youtube_vis', 'ovis', 'bdd_mots', 'tao'] - if config['SPLIT'] != 'train': - benchmarks += ['waymo', 'mots_challenge'] - seqs_todo = [] - for bench in benchmarks: - bench_fol = os.path.join(config['INPUT_FOL'].format(split=config['SPLIT']), bench) - seqs_todo += [os.path.join(bench_fol, seq) for seq in os.listdir(bench_fol)] - - # Run in parallel - if config['Num_Parallel_Cores']: - with Pool(config['Num_Parallel_Cores']) as pool: - results = pool.map(track_sequence, seqs_todo) - - # Run in series - else: - for seq_todo in seqs_todo: - track_sequence(seq_todo) - diff --git a/trackeval/baselines/thresholder.py b/trackeval/baselines/thresholder.py deleted file mode 100644 index c589e10..0000000 --- a/trackeval/baselines/thresholder.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Thresholder - -Author: Jonathon Luiten - -Simply reads in a set of detection, thresholds them at a certain score threshold, and writes them out again. 
-""" - -import os -import sys -from multiprocessing.pool import Pool -from multiprocessing import freeze_support - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) -from trackeval.baselines import baseline_utils as butils -from trackeval.utils import get_code_path - -THRESHOLD = 0.2 - -code_path = get_code_path() -config = { - 'INPUT_FOL': os.path.join(code_path, 'data/detections/rob_mots/{split}/non_overlap_supplied/data/'), - 'OUTPUT_FOL': os.path.join(code_path, 'data/detections/rob_mots/{split}/threshold_' + str(100*THRESHOLD) + '/data/'), - 'SPLIT': 'train', # valid: 'train', 'val', 'test'. - 'Benchmarks': None, # If None, all benchmarks in SPLIT. - - 'Num_Parallel_Cores': None, # If None, run without parallel. - - 'DETECTION_THRESHOLD': THRESHOLD, -} - - -def do_sequence(seq_file): - - # Load input data from file (e.g. provided detections) - # data format: data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - data = butils.load_seq(seq_file) - - # Where to accumulate output data for writing out - output_data = [] - - # Run for each class. - for cls, cls_data in data.items(): - - # Run for each timestep. - for timestep, t_data in enumerate(cls_data): - - # Threshold detections. - t_data = butils.threshold(t_data, config['DETECTION_THRESHOLD']) - - # Save result in output format to write to file later. - # Output Format = [timestep ID class score im_h im_w mask_RLE] - for i in range(len(t_data['ids'])): - row = [timestep, int(t_data['ids'][i]), cls, t_data['scores'][i], t_data['im_hs'][i], - t_data['im_ws'][i], t_data['mask_rles'][i]] - output_data.append(row) - - # Write results to file - out_file = seq_file.replace(config['INPUT_FOL'].format(split=config['SPLIT']), - config['OUTPUT_FOL'].format(split=config['SPLIT'])) - butils.write_seq(output_data, out_file) - - print('DONE:', seq_todo) - - -if __name__ == '__main__': - - # Required to fix bug in multiprocessing on windows. - freeze_support() - - # Obtain list of sequences to run tracker for. - if config['Benchmarks']: - benchmarks = config['Benchmarks'] - else: - benchmarks = ['davis_unsupervised', 'kitti_mots', 'youtube_vis', 'ovis', 'bdd_mots', 'tao'] - if config['SPLIT'] != 'train': - benchmarks += ['waymo', 'mots_challenge'] - seqs_todo = [] - for bench in benchmarks: - bench_fol = os.path.join(config['INPUT_FOL'].format(split=config['SPLIT']), bench) - seqs_todo += [os.path.join(bench_fol, seq) for seq in os.listdir(bench_fol)] - - # Run in parallel - if config['Num_Parallel_Cores']: - with Pool(config['Num_Parallel_Cores']) as pool: - results = pool.map(do_sequence, seqs_todo) - - # Run in series - else: - for seq_todo in seqs_todo: - do_sequence(seq_todo) - diff --git a/trackeval/baselines/vizualize.py b/trackeval/baselines/vizualize.py deleted file mode 100644 index 568a303..0000000 --- a/trackeval/baselines/vizualize.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Vizualize: Code which converts .txt rle tracking results into a visual .png format. 
- -Author: Jonathon Luiten -""" - -import os -import sys -from multiprocessing.pool import Pool -from multiprocessing import freeze_support - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) -from trackeval.baselines import baseline_utils as butils -from trackeval.utils import get_code_path -from trackeval.datasets.rob_mots_classmap import cls_id_to_name - -code_path = get_code_path() -config = { - # Tracker format: - 'INPUT_FOL': os.path.join(code_path, 'data/trackers/rob_mots/{split}/STP/data/{bench}'), - 'OUTPUT_FOL': os.path.join(code_path, 'data/viz/rob_mots/{split}/STP/data/{bench}'), - # GT format: - # 'INPUT_FOL': os.path.join(code_path, 'data/gt/rob_mots/{split}/{bench}/data/'), - # 'OUTPUT_FOL': os.path.join(code_path, 'data/gt_viz/rob_mots/{split}/{bench}/'), - 'SPLIT': 'train', # valid: 'train', 'val', 'test'. - 'Benchmarks': None, # If None, all benchmarks in SPLIT. - 'Num_Parallel_Cores': None, # If None, run without parallel. -} - - -def do_sequence(seq_file): - # Folder to save resulting visualization in - out_fol = seq_file.replace(config['INPUT_FOL'].format(split=config['SPLIT'], bench=bench), - config['OUTPUT_FOL'].format(split=config['SPLIT'], bench=bench)).replace('.txt', '') - - # Load input data from file (e.g. provided detections) - # data format: data['cls'][t] = {'ids', 'scores', 'im_hs', 'im_ws', 'mask_rles'} - data = butils.load_seq(seq_file) - - # Get frame size for visualizing empty frames - im_h, im_w = butils.get_frame_size(data) - - # First run for each class. - for cls, cls_data in data.items(): - - if cls >= 100: - continue - - # Run for each timestep. - for timestep, t_data in enumerate(cls_data): - # Save out visualization - out_file = os.path.join(out_fol, cls_id_to_name[cls], str(timestep).zfill(5) + '.png') - butils.save_as_png(t_data, out_file, im_h, im_w) - - - # Then run for all classes combined - # Converts data from a class-separated to a class-combined format. - data = butils.combine_classes(data) - - # Run for each timestep. - for timestep, t_data in enumerate(data): - # Save out visualization - out_file = os.path.join(out_fol, 'all_classes', str(timestep).zfill(5) + '.png') - butils.save_as_png(t_data, out_file, im_h, im_w) - - print('DONE:', seq_file) - - -if __name__ == '__main__': - - # Required to fix bug in multiprocessing on windows. - freeze_support() - - # Obtain list of sequences to run tracker for. 
- if config['Benchmarks']: - benchmarks = config['Benchmarks'] - else: - benchmarks = ['davis_unsupervised', 'kitti_mots', 'youtube_vis', 'ovis', 'bdd_mots', 'tao'] - if config['SPLIT'] != 'train': - benchmarks += ['waymo', 'mots_challenge'] - seqs_todo = [] - for bench in benchmarks: - bench_fol = config['INPUT_FOL'].format(split=config['SPLIT'], bench=bench) - seqs_todo += [os.path.join(bench_fol, seq) for seq in os.listdir(bench_fol)] - - # Run in parallel - if config['Num_Parallel_Cores']: - with Pool(config['Num_Parallel_Cores']) as pool: - results = pool.map(do_sequence, seqs_todo) - - # Run in series - else: - for seq_todo in seqs_todo: - do_sequence(seq_todo) diff --git a/trackeval/datasets/__init__.py b/trackeval/datasets/__init__.py deleted file mode 100644 index 6012d1c..0000000 --- a/trackeval/datasets/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .tao import TAO -from .sem_track import SemTrack \ No newline at end of file diff --git a/trackeval/datasets/_base_dataset.py b/trackeval/datasets/_base_dataset.py deleted file mode 100644 index 64bf9fc..0000000 --- a/trackeval/datasets/_base_dataset.py +++ /dev/null @@ -1,326 +0,0 @@ -import csv -import io -import zipfile -import os -import traceback -import numpy as np -from copy import deepcopy -from abc import ABC, abstractmethod -from .. import _timing -from ..utils import TrackEvalException - - -class _BaseDataset(ABC): - @abstractmethod - def __init__(self): - self.tracker_list = None - self.seq_list = None - self.class_list = None - self.output_fol = None - self.output_sub_fol = None - self.should_classes_combine = True - self.use_super_categories = False - - # Functions to implement: - - @staticmethod - @abstractmethod - def get_default_dataset_config(): - ... - - @abstractmethod - def _load_raw_file(self, tracker, seq, is_gt): - ... - - @_timing.time - @abstractmethod - def get_preprocessed_seq_data(self, raw_data, cls): - ... - - @abstractmethod - def _calculate_similarities(self, gt_dets_t, tracker_dets_t): - ... - - # Helper functions for all datasets: - - @classmethod - def get_class_name(cls): - return cls.__name__ - - def get_name(self): - return self.get_class_name() - - def get_output_fol(self, tracker): - return os.path.join(self.output_fol, tracker, self.output_sub_fol) - - def get_display_name(self, tracker): - """ Can be overwritten if the trackers name (in files) is different to how it should be displayed. - By default this method just returns the trackers name as is. - """ - return tracker - - def get_eval_info(self): - """Return info about the dataset needed for the Evaluator""" - return self.tracker_list, self.seq_list, self.class_list - - @_timing.time - def get_raw_seq_data(self, tracker, seq): - """ Loads raw data (tracker and ground-truth) for a single tracker on a single sequence. - Raw data includes all of the information needed for both preprocessing and evaluation, for all classes. - A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for - the evaluation of each class. - - This returns a dict which contains the fields: - [num_timesteps]: integer - [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]: - list (for each timestep) of 1D NDArrays (for each det). - [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections. - [similarity_scores]: list (for each timestep) of 2D NDArrays. - [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det). 
- - gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels. - - Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are - independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation - masks vs 2D boxes vs 3D boxes). - We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and - we don't wish to calculate this twice. - We calculate similarity between all gt and tracker classes (not just each class individually) to allow for - calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low. - """ - # Load raw data. - raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True) - raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False) - raw_data = {**raw_tracker_data, **raw_gt_data} # Merges dictionaries - - # Calculate similarities for each timestep. - similarity_scores = [] - for t, (gt_dets_t, tracker_dets_t) in enumerate(zip(raw_data['gt_dets'], raw_data['tracker_dets'])): - ious = self._calculate_similarities(gt_dets_t, tracker_dets_t) - similarity_scores.append(ious) - raw_data['similarity_scores'] = similarity_scores - return raw_data - - @staticmethod - def _load_simple_text_file(file, time_col=0, id_col=None, remove_negative_ids=False, valid_filter=None, - crowd_ignore_filter=None, convert_filter=None, is_zipped=False, zip_file=None, - force_delimiters=None): - """ Function that loads data which is in a commonly used text file format. - Assumes each det is given by one row of a text file. - There is no limit to the number or meaning of each column, - however one column needs to give the timestep of each det (time_col) which is default col 0. - - The file dialect (deliminator, num cols, etc) is determined automatically. - This function automatically separates dets by timestep, - and is much faster than alternatives such as np.loadtext or pandas. - - If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded. - These are not excluded from ignore data. - - valid_filter can be used to only include certain classes. - It is a dict with ints as keys, and lists as values, - such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict. - If None, all classes are included. - - crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter. - - convert_filter can be used to convert value read to another format. - This is used most commonly to convert classes given as string to a class id. - This is a dict such that the key is the column to convert, and the value is another dict giving the mapping. - - Optionally, input files could be a zip of multiple text files for storage efficiency. - - Returns read_data and ignore_data. - Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values). - Note that all data is returned as strings, and must be converted to float/int later if needed. 
- Note that timesteps will not be present in the returned dict keys if there are no dets for them - """ - - if remove_negative_ids and id_col is None: - raise TrackEvalException('remove_negative_ids is True, but id_col is not given.') - if crowd_ignore_filter is None: - crowd_ignore_filter = {} - if convert_filter is None: - convert_filter = {} - try: - if is_zipped: # Either open file directly or within a zip. - if zip_file is None: - raise TrackEvalException('is_zipped set to True, but no zip_file is given.') - archive = zipfile.ZipFile(os.path.join(zip_file), 'r') - fp = io.TextIOWrapper(archive.open(file, 'r')) - else: - fp = open(file) - read_data = {} - crowd_ignore_data = {} - fp.seek(0, os.SEEK_END) - # check if file is empty - if fp.tell(): - fp.seek(0) - dialect = csv.Sniffer().sniff(fp.readline(), delimiters=force_delimiters) # Auto determine structure. - dialect.skipinitialspace = True # Deal with extra spaces between columns - fp.seek(0) - reader = csv.reader(fp, dialect) - for row in reader: - try: - # Deal with extra trailing spaces at the end of rows - if row[-1] in '': - row = row[:-1] - timestep = str(int(float(row[time_col]))) - # Read ignore regions separately. - is_ignored = False - for ignore_key, ignore_value in crowd_ignore_filter.items(): - if row[ignore_key].lower() in ignore_value: - # Convert values in one column (e.g. string to id) - for convert_key, convert_value in convert_filter.items(): - row[convert_key] = convert_value[row[convert_key].lower()] - # Save data separated by timestep. - if timestep in crowd_ignore_data.keys(): - crowd_ignore_data[timestep].append(row) - else: - crowd_ignore_data[timestep] = [row] - is_ignored = True - if is_ignored: # if det is an ignore region, it cannot be a normal det. - continue - # Exclude some dets if not valid. - if valid_filter is not None: - for key, value in valid_filter.items(): - if row[key].lower() not in value: - continue - if remove_negative_ids: - if int(float(row[id_col])) < 0: - continue - # Convert values in one column (e.g. string to id) - for convert_key, convert_value in convert_filter.items(): - row[convert_key] = convert_value[row[convert_key].lower()] - # Save data separated by timestep. - if timestep in read_data.keys(): - read_data[timestep].append(row) - else: - read_data[timestep] = [row] - except Exception: - exc_str_init = 'In file %s the following line cannot be read correctly: \n' % os.path.basename( - file) - exc_str = ' '.join([exc_str_init]+row) - raise TrackEvalException(exc_str) - fp.close() - except Exception: - print('Error loading file: %s, printing traceback.' % file) - traceback.print_exc() - raise TrackEvalException( - 'File %s cannot be read because it is either not present or invalidly formatted' % os.path.basename( - file)) - return read_data, crowd_ignore_data - - @staticmethod - def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False): - """ Calculates the IOU (intersection over union) between two arrays of segmentation masks. - If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy - arrays of the shape (num_masks, height, width) is assumed and the encoding is performed. - If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly - used to determine if detections are within crowd ignore region. 
- :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded, - else pycocotools rle encoded format) - :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded, - else pycocotools rle encoded format) - :param is_encoded: whether the input is in pycocotools rle encoded format - :param do_ioa: whether to perform IoA computation - :return: the IoU/IoA scores - """ - - # Only loaded when run to reduce minimum requirements - from pycocotools import mask as mask_utils - - # use pycocotools for run length encoding of masks - if not is_encoded: - masks1 = mask_utils.encode(np.array(np.transpose(masks1, (1, 2, 0)), order='F')) - masks2 = mask_utils.encode(np.array(np.transpose(masks2, (1, 2, 0)), order='F')) - - # use pycocotools for iou computation of rle encoded masks - ious = mask_utils.iou(masks1, masks2, [do_ioa]*len(masks2)) - if len(masks1) == 0 or len(masks2) == 0: - ious = np.asarray(ious).reshape(len(masks1), len(masks2)) - assert (ious >= 0 - np.finfo('float').eps).all() - assert (ious <= 1 + np.finfo('float').eps).all() - - return ious - - @staticmethod - def _calculate_box_ious(bboxes1, bboxes2, box_format='xywh', do_ioa=False): - """ Calculates the IOU (intersection over union) between two arrays of boxes. - Allows variable box formats ('xywh' and 'x0y0x1y1'). - If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly - used to determine if detections are within crowd ignore region. - """ - if box_format in 'xywh': - # layout: (x0, y0, w, h) - bboxes1 = deepcopy(bboxes1) - bboxes2 = deepcopy(bboxes2) - - bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2] - bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3] - bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2] - bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3] - elif box_format not in 'x0y0x1y1': - raise (TrackEvalException('box_format %s is not implemented' % box_format)) - - # layout: (x0, y0, x1, y1) - min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) - max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :]) - intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(min_[..., 3] - max_[..., 1], 0) - area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) - - if do_ioa: - ioas = np.zeros_like(intersection) - valid_mask = area1 > 0 + np.finfo('float').eps - ioas[valid_mask, :] = intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis] - - return ioas - else: - area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) - union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection - intersection[area1 <= 0 + np.finfo('float').eps, :] = 0 - intersection[:, area2 <= 0 + np.finfo('float').eps] = 0 - intersection[union <= 0 + np.finfo('float').eps] = 0 - union[union <= 0 + np.finfo('float').eps] = 1 - ious = intersection / union - return ious - - @staticmethod - def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0): - """ Calculates the euclidean distance between two sets of detections, and then converts this into a similarity - measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance). - The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity - threshold corresponds to a 1m distance threshold for TPs. 
- """ - dist = np.linalg.norm(dets1[:, np.newaxis]-dets2[np.newaxis, :], axis=2) - sim = np.maximum(0, 1 - dist/zero_distance) - return sim - - @staticmethod - def _check_unique_ids(data, after_preproc=False): - """Check the requirement that the tracker_ids and gt_ids are unique per timestep""" - gt_ids = data['gt_ids'] - tracker_ids = data['tracker_ids'] - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)): - if len(tracker_ids_t) > 0: - unique_ids, counts = np.unique(tracker_ids_t, return_counts=True) - if np.max(counts) != 1: - duplicate_ids = unique_ids[counts > 1] - exc_str_init = 'Tracker predicts the same ID more than once in a single timestep ' \ - '(seq: %s, frame: %i, ids:' % (data['seq'], t+1) - exc_str = ' '.join([exc_str_init] + [str(d) for d in duplicate_ids]) + ')' - if after_preproc: - exc_str_init += '\n Note that this error occurred after preprocessing (but not before), ' \ - 'so ids may not be as in file, and something seems wrong with preproc.' - raise TrackEvalException(exc_str) - if len(gt_ids_t) > 0: - unique_ids, counts = np.unique(gt_ids_t, return_counts=True) - if np.max(counts) != 1: - duplicate_ids = unique_ids[counts > 1] - exc_str_init = 'Ground-truth has the same ID more than once in a single timestep ' \ - '(seq: %s, frame: %i, ids:' % (data['seq'], t+1) - exc_str = ' '.join([exc_str_init] + [str(d) for d in duplicate_ids]) + ')' - if after_preproc: - exc_str_init += '\n Note that this error occurred after preprocessing (but not before), ' \ - 'so ids may not be as in file, and something seems wrong with preproc.' - raise TrackEvalException(exc_str) diff --git a/trackeval/datasets/sem_track.py b/trackeval/datasets/sem_track.py deleted file mode 100644 index 6e56104..0000000 --- a/trackeval/datasets/sem_track.py +++ /dev/null @@ -1,690 +0,0 @@ -import os -import numpy as np -import json -import itertools -from collections import defaultdict -from scipy.optimize import linear_sum_assignment -from ..utils import TrackEvalException -from ._base_dataset import _BaseDataset -from .. import utils -from .. 
import _timing - - -class SemTrack(_BaseDataset): - """Dataset class for TAO tracking""" - - @staticmethod - def get_default_dataset_config(): - """Default class config values""" - code_path = utils.get_code_path() - default_config = { - 'GT_FOLDER': os.path.join(code_path, 'data/gt/tao/tao_training'), # Location of GT data - 'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/tao/tao_training'), # Trackers location - 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER) - 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder) - 'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes) - 'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val' - 'PRINT_CONFIG': True, # Whether to print current config - 'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - 'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL - 'MAX_DETECTIONS': 300, # Number of maximal allowed detections per image (0 for unlimited) - } - return default_config - - def __init__(self, config=None): - """Initialise dataset, checking that all required files are present""" - super().__init__() - # Fill non-given config values with defaults - self.config = utils.init_config(config, self.get_default_dataset_config(), self.get_name()) - self.gt_fol = self.config['GT_FOLDER'] - self.tracker_fol = self.config['TRACKERS_FOLDER'] - self.should_classes_combine = True - self.use_super_categories = False - - self.tracker_sub_fol = self.config['TRACKER_SUB_FOLDER'] - self.output_fol = self.config['OUTPUT_FOLDER'] - if self.output_fol is None: - self.output_fol = self.tracker_fol - self.output_sub_fol = self.config['OUTPUT_SUB_FOLDER'] - - gt_dir_files = [file for file in os.listdir(self.gt_fol) if file.endswith('.json')] - if len(gt_dir_files) != 1: - raise TrackEvalException(self.gt_fol + ' does not contain exactly one json file.') - - with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f: - self.gt_data = json.load(f) - - # merge categories marked with a merged tag in TAO dataset - self._merge_categories(self.gt_data['annotations'] + self.gt_data['tracks']) - - # Get sequences to eval and sequence information - # self.seq_list = [vid['name'].replace('/', '-') for vid in self.gt_data['videos']] - self.seq_list = [] - self.seq_name_to_seq_id = {} - for vid in self.gt_data['videos']: - name = vid['name'] - lang = vid['lang'] - seq_name = name.split('/')[-1] + '_' + lang.replace(' ', '_') - self.seq_list.append(seq_name) - self.seq_name_to_seq_id[seq_name] = vid['id'] - # self.seq_name_to_seq_id = {vid['name'].replace('/', '-'): vid['id'] for vid in self.gt_data['videos']} - # compute mappings from videos to annotation data - self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(self.gt_data['annotations']) - # compute sequence lengths - self.seq_lengths = {vid['id']: 0 for vid in self.gt_data['videos']} - for img in self.gt_data['images']: - self.seq_lengths[img['video_id']] += 1 - self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings() - self.seq_to_classes = {vid['id']: {'pos_cat_ids': list({track['category_id'] for track - in self.videos_to_gt_tracks[vid['id']]}), - 'neg_cat_ids': vid['neg_category_ids'], - 'not_exhaustively_labeled_cat_ids': vid['not_exhaustive_category_ids']} - for vid in self.gt_data['videos']} - - # Get classes to 
eval - considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list] - seen_cats = set([cat_id for vid_id in considered_vid_ids for cat_id - in self.seq_to_classes[vid_id]['pos_cat_ids']]) - # only classes with ground truth are evaluated in TAO - self.valid_classes = [cls['name'] for cls in self.gt_data['categories'] if cls['id'] in seen_cats] - cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']} - - if self.config['CLASSES_TO_EVAL']: - self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None - for cls in self.config['CLASSES_TO_EVAL']] - if not all(self.class_list): - raise TrackEvalException('Attempted to evaluate an invalid class. Only classes ' + - ', '.join(self.valid_classes) + - ' are valid (classes present in ground truth data).') - else: - self.class_list = [cls for cls in self.valid_classes] - self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list} - - # Get trackers to eval - if self.config['TRACKERS_TO_EVAL'] is None: - self.tracker_list = os.listdir(self.tracker_fol) - else: - self.tracker_list = self.config['TRACKERS_TO_EVAL'] - - if self.config['TRACKER_DISPLAY_NAMES'] is None: - self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list)) - elif (self.config['TRACKERS_TO_EVAL'] is not None) and ( - len(self.config['TRACKER_DISPLAY_NAMES']) == len(self.tracker_list)): - self.tracker_to_disp = dict(zip(self.tracker_list, self.config['TRACKER_DISPLAY_NAMES'])) - else: - raise TrackEvalException('List of tracker files and tracker display names do not match.') - - self.tracker_data = {tracker: dict() for tracker in self.tracker_list} - - for tracker in self.tracker_list: - # tr_dir_files = [file for file in os.listdir(os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)) - # if file.endswith('.json')] - tr_dir_files = tracker + '.json' - if not os.path.isfile(os.path.join(self.tracker_fol, tr_dir_files)): - raise TrackEvalException(os.path.join(self.tracker_fol, tr_dir_files) - + ' does not exist.') - with open(os.path.join(self.tracker_fol, tr_dir_files)) as f: - curr_data = json.load(f) - - # limit detections if MAX_DETECTIONS > 0 - if self.config['MAX_DETECTIONS']: - curr_data = self._limit_dets_per_image(curr_data) - - # fill missing video ids - self._fill_video_ids_inplace(curr_data) - - # make track ids unique over whole evaluation set - self._make_track_ids_unique(curr_data) - - # merge categories marked with a merged tag in TAO dataset - self._merge_categories(curr_data) - - # get tracker sequence information - curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = self._compute_vid_mappings(curr_data) - self.tracker_data[tracker]['vids_to_tracks'] = curr_videos_to_tracker_tracks - self.tracker_data[tracker]['vids_to_images'] = curr_videos_to_tracker_images - - def get_display_name(self, tracker): - return self.tracker_to_disp[tracker] - - @_timing.time - def get_raw_seq_data(self, tracker, seq): - """ Loads raw data (tracker and ground-truth) for a single tracker on a single sequence. - Raw data includes all of the information needed for both preprocessing and evaluation, for all classes. - A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for - the evaluation of each class. 
- - This returns a dict which contains the fields: - [num_timesteps]: integer - [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]: - list (for each timestep) of 1D NDArrays (for each det). - [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections. - [similarity_scores]: list (for each timestep) of 2D NDArrays. - [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det). - - gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels. - - Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are - independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation - masks vs 2D boxes vs 3D boxes). - We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and - we don't wish to calculate this twice. - We calculate similarity between all gt and tracker classes (not just each class individually) to allow for - calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low. - """ - # Load raw data. - raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True) - raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False) - raw_data = {**raw_tracker_data, **raw_gt_data} # Merges dictionaries - - # Calculate similarities for each timestep. - similarity_scores = [] - for t, (gt_dets_t, tracker_dets_t) in enumerate(zip(raw_data['gt_dets'], raw_data['tracker_dets'])): - ious = self._calculate_similarities(gt_dets_t, tracker_dets_t) - similarity_scores.append(ious) - raw_data['similarity_scores'] = similarity_scores - return raw_data - - def _load_raw_file(self, tracker, seq, is_gt): - """Load a file (gt or tracker) in the TAO format - - If is_gt, this returns a dict which contains the fields: - [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det). - [gt_dets]: list (for each timestep) of lists of detections. - [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as - keys and corresponding segmentations as values) for each track - [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values - as keys and lists (for each track) as values - - if not is_gt, this returns a dict which contains the fields: - [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det). - [tracker_dets]: list (for each timestep) of lists of detections. 
- [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as - keys and corresponding segmentations as values) for each track - [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values - as keys and lists as values - [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values - """ - seq_id = self.seq_name_to_seq_id[seq] - # File location - if is_gt: - imgs = self.videos_to_gt_images[seq_id] - else: - imgs = self.tracker_data[tracker]['vids_to_images'][seq_id] - - # Convert data to required format - num_timesteps = self.seq_lengths[seq_id] - img_to_timestep = self.seq_to_images_to_timestep[seq_id] - data_keys = ['ids', 'classes', 'dets'] - if not is_gt: - data_keys += ['tracker_confidences', 'is_main_track', 'rel_sub_list', 'rel_obj_list'] - else: - data_keys += ['is_main_gt', 'rel_sub_class', 'rel_obj_class'] - raw_data = {key: [None] * num_timesteps for key in data_keys} - for img in imgs: - # some tracker data contains images without any ground truth information, these are ignored - try: - t = img_to_timestep[img['id']] - except KeyError: - continue - annotations = img['annotations'] - raw_data['dets'][t] = np.atleast_2d([ann['bbox'] for ann in annotations]).astype(float) - raw_data['ids'][t] = np.atleast_1d([ann['track_id'] for ann in annotations]).astype(int) - raw_data['classes'][t] = np.atleast_1d([ann['category_id'] for ann in annotations]).astype(int) - if not is_gt: - raw_data['tracker_confidences'][t] = np.atleast_1d([ann['score'] for ann in annotations]).astype(float) - raw_data['is_main_track'][t] = np.atleast_1d([ann['is_main'] for ann in annotations]).astype(int) - raw_data['rel_sub_list'][t] = np.atleast_2d([ann['relation_sub_category_id'] for ann in annotations]).astype(int) - raw_data['rel_obj_list'][t] = np.atleast_2d([ann['relation_obj_category_id'] for ann in annotations]).astype(int) - else: - raw_data['is_main_gt'][t] = np.atleast_1d([ann['main_object'] for ann in annotations]).astype(int) - raw_data['rel_sub_class'][t] = np.atleast_1d([ann['relation_sub_category_id'] for ann in annotations]).astype(int) - raw_data['rel_obj_class'][t] = np.atleast_1d([ann['relation_obj_category_id'] for ann in annotations]).astype(int) - - for t, d in enumerate(raw_data['dets']): - if d is None: - raw_data['dets'][t] = np.empty((0, 4)).astype(float) - raw_data['ids'][t] = np.empty(0).astype(int) - raw_data['classes'][t] = np.empty(0).astype(int) - if not is_gt: - raw_data['tracker_confidences'][t] = np.empty(0) - - if is_gt: - key_map = {'ids': 'gt_ids', - 'classes': 'gt_classes', - 'dets': 'gt_dets'} - else: - key_map = {'ids': 'tracker_ids', - 'classes': 'tracker_classes', - 'dets': 'tracker_dets'} - for k, v in key_map.items(): - raw_data[v] = raw_data.pop(k) - - all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list] - if is_gt: - classes_to_consider = all_classes - all_tracks = self.videos_to_gt_tracks[seq_id] - else: - classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \ - + self.seq_to_classes[seq_id]['neg_cat_ids'] - all_tracks = self.tracker_data[tracker]['vids_to_tracks'][seq_id] - - classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls] - if cls in classes_to_consider else [] for cls in all_classes} - - # mapping from classes to track information - raw_data['classes_to_tracks'] = {cls: [{det['image_id']: np.atleast_1d(det['bbox']) - for det in track['annotations']} 
for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_ids'] = {cls: [track['id'] for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_areas'] = {cls: [track['area'] for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_lengths'] = {cls: [len(track['annotations']) for track in tracks] - for cls, tracks in classes_to_tracks.items()} - - if not is_gt: - raw_data['classes_to_dt_track_scores'] = {cls: np.array([np.mean([float(x['score']) - for x in track['annotations']]) - for track in tracks]) - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_dt_track_is_main'] = {cls: [{det['image_id']: det['is_main'] - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_dt_track_rel_sub_list'] = {cls: [{det['image_id']: np.atleast_1d(det['relation_sub_category_id']) - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_dt_track_rel_obj_list'] = {cls: [{det['image_id']: np.atleast_1d(det['relation_obj_category_id']) - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - else: - raw_data['classes_to_gt_track_is_main'] = {cls: [{det['image_id']: det['main_object'] - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_gt_track_rel_sub_class'] = {cls: [{det['image_id']: det['relation_sub_category_id'] - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_gt_track_rel_obj_class'] = {cls: [{det['image_id']: det['relation_obj_category_id'] - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - - if is_gt: - key_map = {'classes_to_tracks': 'classes_to_gt_tracks', - 'classes_to_track_ids': 'classes_to_gt_track_ids', - 'classes_to_track_lengths': 'classes_to_gt_track_lengths', - 'classes_to_track_areas': 'classes_to_gt_track_areas'} - else: - key_map = {'classes_to_tracks': 'classes_to_dt_tracks', - 'classes_to_track_ids': 'classes_to_dt_track_ids', - 'classes_to_track_lengths': 'classes_to_dt_track_lengths', - 'classes_to_track_areas': 'classes_to_dt_track_areas',} - for k, v in key_map.items(): - raw_data[v] = raw_data.pop(k) - - raw_data['num_timesteps'] = num_timesteps - raw_data['neg_cat_ids'] = self.seq_to_classes[seq_id]['neg_cat_ids'] - raw_data['not_exhaustively_labeled_cls'] = self.seq_to_classes[seq_id]['not_exhaustively_labeled_cat_ids'] - raw_data['seq'] = seq - return raw_data - - @_timing.time - def get_preprocessed_seq_data(self, raw_data, cls): - """ Preprocess data for a single sequence for a single class ready for evaluation. - Inputs: - - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data(). - - cls is the class to be evaluated. - Outputs: - - data is a dict containing all of the information that metrics need to perform evaluation. - It contains the following fields: - [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers. - [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det). - [gt_dets, tracker_dets]: list (for each timestep) of lists of detections. - [similarity_scores]: list (for each timestep) of 2D NDArrays. 
- Notes: - General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps. - 1) Extract only detections relevant for the class to be evaluated (including distractor detections). - 2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a - distractor class, or otherwise marked as to be removed. - 3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain - other criteria (e.g. are too small). - 4) Remove gt dets that were only useful for preprocessing and not for actual evaluation. - After the above preprocessing steps, this function also calculates the number of gt and tracker detections - and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are - unique within each timestep. - TAO: - In TAO, the 4 preproc steps are as follow: - 1) All classes present in the ground truth data are evaluated separately. - 2) No matched tracker detections are removed. - 3) Unmatched tracker detections are removed if there is not ground truth data and the class does not - belong to the categories marked as negative for this sequence. Additionally, unmatched tracker - detections for classes which are marked as not exhaustively labeled are removed. - 4) No gt detections are removed. - Further, for TrackMAP computation track representations for the given class are accessed from a dictionary - and the tracks from the tracker data are sorted according to the tracker confidence. - """ - cls_id = self.class_name_to_class_id[cls] - is_not_exhaustively_labeled = cls_id in raw_data['not_exhaustively_labeled_cls'] - is_neg_category = cls_id in raw_data['neg_cat_ids'] - - data_keys = ['gt_ids', 'tracker_ids', 'gt_dets', 'tracker_dets', 'tracker_confidences', 'similarity_scores', 'is_main_gt', 'rel_sub_class', 'rel_obj_class', 'is_main_track', 'rel_sub_list', 'rel_obj_list'] - data = {key: [None] * raw_data['num_timesteps'] for key in data_keys} - unique_gt_ids = [] - unique_tracker_ids = [] - num_gt_dets = 0 - num_tracker_dets = 0 - - for t in range(raw_data['num_timesteps']): - if raw_data['is_main_gt'][t] is None: - raw_data['is_main_gt'][t] = np.array([], dtype=np.int64) - if raw_data['rel_sub_class'][t] is None: - raw_data['rel_sub_class'][t] = np.array([[]], dtype=np.int64) - if raw_data['rel_obj_class'][t] is None: - raw_data['rel_obj_class'][t] = np.array([[]], dtype=np.int64) - if raw_data['is_main_track'][t] is None: - raw_data['is_main_track'][t] = np.array([], dtype=np.int64) - if raw_data['rel_sub_list'][t] is None: - raw_data['rel_sub_list'][t] = np.array([[]], dtype=np.int64) - if raw_data['rel_obj_list'][t] is None: - raw_data['rel_obj_list'][t] = np.array([[]], dtype=np.int64) - - for t in range(raw_data['num_timesteps']): - - # Only extract relevant dets for this class for preproc and eval (cls) - gt_class_mask = np.atleast_1d(raw_data['gt_classes'][t] == cls_id) - gt_class_mask = gt_class_mask.astype(np.bool_) - gt_ids = raw_data['gt_ids'][t][gt_class_mask] - gt_dets = raw_data['gt_dets'][t][gt_class_mask] - is_main_gt = raw_data['is_main_gt'][t][gt_class_mask] - rel_sub_class = raw_data['rel_sub_class'][t][gt_class_mask] - rel_obj_class = raw_data['rel_obj_class'][t][gt_class_mask] - - tracker_class_mask = np.atleast_1d(raw_data['tracker_classes'][t] == cls_id) - tracker_class_mask = tracker_class_mask.astype(np.bool_) - tracker_ids = raw_data['tracker_ids'][t][tracker_class_mask] - tracker_dets = raw_data['tracker_dets'][t][tracker_class_mask] - 
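For context, the per-class extraction in this loop is plain boolean masking, applied to every per-timestep array and to both axes of the similarity matrix. A minimal self-contained sketch of the pattern; the toy arrays and values below are illustrative only and not taken from the dataset code:

    import numpy as np

    # Toy per-timestep data: 3 gt detections, 2 tracker detections.
    gt_classes = np.array([1, 2, 1])
    tracker_classes = np.array([2, 1])
    similarity = np.array([[0.8, 0.1],
                           [0.2, 0.9],
                           [0.0, 0.7]])   # rows: gt dets, cols: tracker dets

    cls_id = 1
    gt_mask = np.atleast_1d(gt_classes == cls_id).astype(bool)
    tracker_mask = np.atleast_1d(tracker_classes == cls_id).astype(bool)

    # Rows follow the gt mask, columns follow the tracker mask.
    sim_cls = similarity[gt_mask, :][:, tracker_mask]
    print(sim_cls.shape)   # (2, 1): gt dets 0 and 2 against tracker det 1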
tracker_confidences = raw_data['tracker_confidences'][t][tracker_class_mask] - similarity_scores = raw_data['similarity_scores'][t][gt_class_mask, :][:, tracker_class_mask] - # if raw_data['is_main_track'][t] is None: - # is_main_track = None - # else: - # is_main_track = raw_data['is_main_track'][t][tracker_class_mask] - # if raw_data['rel_sub_list'][t] is None: - # rel_sub_list = None - # else: - # rel_sub_list = raw_data['rel_sub_list'][t][tracker_class_mask] - # if raw_data['rel_obj_list'][t] is None: - # rel_obj_list = None - # else: - # rel_obj_list = raw_data['rel_obj_list'][t][tracker_class_mask] - is_main_track = raw_data['is_main_track'][t][tracker_class_mask] - rel_sub_list = raw_data['rel_sub_list'][t][tracker_class_mask] - rel_obj_list = raw_data['rel_obj_list'][t][tracker_class_mask] - - # Match tracker and gt dets (with hungarian algorithm). - unmatched_indices = np.arange(tracker_ids.shape[0]) - if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0: - matching_scores = similarity_scores.copy() - matching_scores[matching_scores < 0.5 - np.finfo('float').eps] = 0 - match_rows, match_cols = linear_sum_assignment(-matching_scores) - actually_matched_mask = matching_scores[match_rows, match_cols] > 0 + np.finfo('float').eps - match_cols = match_cols[actually_matched_mask] - unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0) - - if gt_ids.shape[0] == 0 and not is_neg_category: - to_remove_tracker = unmatched_indices - elif is_not_exhaustively_labeled: - to_remove_tracker = unmatched_indices - else: - to_remove_tracker = np.array([], dtype=np.int64) - - # remove all unwanted unmatched tracker detections - data['tracker_ids'][t] = np.delete(tracker_ids, to_remove_tracker, axis=0) - data['tracker_dets'][t] = np.delete(tracker_dets, to_remove_tracker, axis=0) - data['tracker_confidences'][t] = np.delete(tracker_confidences, to_remove_tracker, axis=0) - similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1) - data['is_main_track'][t] = np.delete(is_main_track, to_remove_tracker, axis=0) - data['rel_sub_list'][t] = np.delete(rel_sub_list, to_remove_tracker, axis=0) - data['rel_obj_list'][t] = np.delete(rel_obj_list, to_remove_tracker, axis=0) - - data['gt_ids'][t] = gt_ids - data['gt_dets'][t] = gt_dets - data['similarity_scores'][t] = similarity_scores - data['is_main_gt'][t] = is_main_gt - data['rel_sub_class'][t] = rel_sub_class - data['rel_obj_class'][t] = rel_obj_class - - unique_gt_ids += list(np.unique(data['gt_ids'][t])) - unique_tracker_ids += list(np.unique(data['tracker_ids'][t])) - num_tracker_dets += len(data['tracker_ids'][t]) - num_gt_dets += len(data['gt_ids'][t]) - - # Re-label IDs such that there are no empty IDs - if len(unique_gt_ids) > 0: - unique_gt_ids = np.unique(unique_gt_ids) - gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1)) - gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids)) - for t in range(raw_data['num_timesteps']): - if len(data['gt_ids'][t]) > 0: - data['gt_ids'][t] = gt_id_map[data['gt_ids'][t]].astype(np.int64) - if len(unique_tracker_ids) > 0: - unique_tracker_ids = np.unique(unique_tracker_ids) - tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1)) - tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids)) - for t in range(raw_data['num_timesteps']): - if len(data['tracker_ids'][t]) > 0: - data['tracker_ids'][t] = tracker_id_map[data['tracker_ids'][t]].astype(np.int64) - - # Record overview statistics. 
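The matching above zeroes out similarities below a 0.5 IoU floor, runs the Hungarian algorithm on the negated scores, and keeps only assignments with a strictly positive score; the tracker columns that remain unmatched are the removal candidates. A standalone sketch of that logic under the same convention (the helper name and toy matrix are illustrative, not part of the original file):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def unmatched_tracker_columns(similarity, iou_floor=0.5):
        """Illustrative helper: indices of tracker dets left unmatched after
        Hungarian matching on similarities thresholded at iou_floor."""
        unmatched = np.arange(similarity.shape[1])
        if similarity.shape[0] == 0 or similarity.shape[1] == 0:
            return unmatched
        scores = similarity.copy()
        scores[scores < iou_floor - np.finfo('float').eps] = 0
        match_rows, match_cols = linear_sum_assignment(-scores)   # maximise total similarity
        actually_matched = scores[match_rows, match_cols] > 0 + np.finfo('float').eps
        return np.delete(unmatched, match_cols[actually_matched], axis=0)

    sim = np.array([[0.9, 0.2, 0.0],
                    [0.1, 0.6, 0.3]])
    print(unmatched_tracker_columns(sim))   # -> [2]: the third tracker det has no match >= 0.5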
- data['num_tracker_dets'] = num_tracker_dets - data['num_gt_dets'] = num_gt_dets - data['num_tracker_ids'] = len(unique_tracker_ids) - data['num_gt_ids'] = len(unique_gt_ids) - data['num_timesteps'] = raw_data['num_timesteps'] - data['seq'] = raw_data['seq'] - - # get track representations - data['gt_tracks'] = raw_data['classes_to_gt_tracks'][cls_id] - data['gt_track_ids'] = raw_data['classes_to_gt_track_ids'][cls_id] - data['gt_track_lengths'] = raw_data['classes_to_gt_track_lengths'][cls_id] - data['gt_track_areas'] = raw_data['classes_to_gt_track_areas'][cls_id] - data['dt_tracks'] = raw_data['classes_to_dt_tracks'][cls_id] - data['dt_track_ids'] = raw_data['classes_to_dt_track_ids'][cls_id] - data['dt_track_lengths'] = raw_data['classes_to_dt_track_lengths'][cls_id] - data['dt_track_areas'] = raw_data['classes_to_dt_track_areas'][cls_id] - data['dt_track_scores'] = raw_data['classes_to_dt_track_scores'][cls_id] - data['not_exhaustively_labeled'] = is_not_exhaustively_labeled - data['iou_type'] = 'bbox' - data['gt_track_is_main'] = raw_data['classes_to_gt_track_is_main'][cls_id] - data['gt_track_rel_sub_class'] = raw_data['classes_to_gt_track_rel_sub_class'][cls_id] - data['gt_track_rel_obj_class'] = raw_data['classes_to_gt_track_rel_obj_class'][cls_id] - data['dt_track_is_main'] = raw_data['classes_to_dt_track_is_main'][cls_id] - data['dt_track_rel_sub_list'] = raw_data['classes_to_dt_track_rel_sub_list'][cls_id] - data['dt_track_rel_obj_list'] = raw_data['classes_to_dt_track_rel_obj_list'][cls_id] - - # sort tracker data tracks by tracker confidence scores - if data['dt_tracks']: - idx = np.argsort([-score for score in data['dt_track_scores']], kind="mergesort") - data['dt_track_scores'] = [data['dt_track_scores'][i] for i in idx] - data['dt_tracks'] = [data['dt_tracks'][i] for i in idx] - data['dt_track_ids'] = [data['dt_track_ids'][i] for i in idx] - data['dt_track_lengths'] = [data['dt_track_lengths'][i] for i in idx] - data['dt_track_areas'] = [data['dt_track_areas'][i] for i in idx] - # Ensure that ids are unique per timestep. - # tracker_ids = data['tracker_ids'] - # for t, tracker_ids_t in enumerate(tracker_ids): - # if len(tracker_ids_t) > 1: - # tracker_ids[t] = np.array([tracker_ids_t[0]]) - self._check_unique_ids(data) - - return data - - def _calculate_similarities(self, gt_dets_t, tracker_dets_t): - similarity_scores = self._calculate_box_ious(gt_dets_t, tracker_dets_t) - return similarity_scores - - def _merge_categories(self, annotations): - """ - Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset - :param annotations: the annotations in which the classes should be merged - :return: None - """ - merge_map = {} - for category in self.gt_data['categories']: - if 'merged' in category: - for to_merge in category['merged']: - merge_map[to_merge['id']] = category['id'] - - for ann in annotations: - ann['category_id'] = merge_map.get(ann['category_id'], ann['category_id']) - - def _compute_vid_mappings(self, annotations): - """ - Computes mappings from Videos to corresponding tracks and images. 
- :param annotations: the annotations for which the mapping should be generated - :return: the video-to-track-mapping, the video-to-image-mapping - """ - vids_to_tracks = {} - vids_to_imgs = {} - vid_ids = [vid['id'] for vid in self.gt_data['videos']] - - # compute an mapping from image IDs to images - images = {} - for image in self.gt_data['images']: - images[image['id']] = image - - for ann in annotations: - ann["area"] = ann["bbox"][2] * ann["bbox"][3] - - vid = ann["video_id"] - if ann["video_id"] not in vids_to_tracks.keys(): - vids_to_tracks[ann["video_id"]] = list() - if ann["video_id"] not in vids_to_imgs.keys(): - vids_to_imgs[ann["video_id"]] = list() - - # Fill in vids_to_tracks - tid = ann["track_id"] - exist_tids = [track["id"] for track in vids_to_tracks[vid]] - try: - index1 = exist_tids.index(tid) - except ValueError: - index1 = -1 - if tid not in exist_tids: - curr_track = {"id": tid, "category_id": ann['category_id'], - "video_id": vid, "annotations": [ann]} - vids_to_tracks[vid].append(curr_track) - else: - vids_to_tracks[vid][index1]["annotations"].append(ann) - - # Fill in vids_to_imgs - img_id = ann['image_id'] - exist_img_ids = [img["id"] for img in vids_to_imgs[vid]] - try: - index2 = exist_img_ids.index(img_id) - except ValueError: - index2 = -1 - if index2 == -1: - curr_img = {"id": img_id, "annotations": [ann]} - vids_to_imgs[vid].append(curr_img) - else: - vids_to_imgs[vid][index2]["annotations"].append(ann) - - # sort annotations by frame index and compute track area - for vid, tracks in vids_to_tracks.items(): - for track in tracks: - track["annotations"] = sorted( - track['annotations'], - key=lambda x: images[x['image_id']]['frame_index']) - # Computer average area - track["area"] = (sum(x['area'] for x in track['annotations']) / len(track['annotations'])) - - # Ensure all videos are present - for vid_id in vid_ids: - if vid_id not in vids_to_tracks.keys(): - vids_to_tracks[vid_id] = [] - if vid_id not in vids_to_imgs.keys(): - vids_to_imgs[vid_id] = [] - - return vids_to_tracks, vids_to_imgs - - def _compute_image_to_timestep_mappings(self): - """ - Computes a mapping from images to the corresponding timestep in the sequence. - :return: the image-to-timestep-mapping - """ - images = {} - for image in self.gt_data['images']: - images[image['id']] = image - - seq_to_imgs_to_timestep = {vid['id']: dict() for vid in self.gt_data['videos']} - for vid in seq_to_imgs_to_timestep: - curr_imgs = [img['id'] for img in self.videos_to_gt_images[vid]] - curr_imgs = sorted(curr_imgs, key=lambda x: images[x]['frame_index']) - seq_to_imgs_to_timestep[vid] = {curr_imgs[i]: i for i in range(len(curr_imgs))} - - return seq_to_imgs_to_timestep - - def _limit_dets_per_image(self, annotations): - """ - Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from - https://github.com/TAO-Dataset/ - :param annotations: the annotations in which the detections should be limited - :return: the annotations with limited detections - """ - max_dets = self.config['MAX_DETECTIONS'] - img_ann = defaultdict(list) - for ann in annotations: - img_ann[ann["image_id"]].append(ann) - - for img_id, _anns in img_ann.items(): - if len(_anns) <= max_dets: - continue - _anns = sorted(_anns, key=lambda x: x["score"], reverse=True) - img_ann[img_id] = _anns[:max_dets] - - return [ann for anns in img_ann.values() for ann in anns] - - def _fill_video_ids_inplace(self, annotations): - """ - Fills in missing video IDs inplace. 
Adapted from https://github.com/TAO-Dataset/ - :param annotations: the annotations for which the video IDs should be filled inplace - :return: None - """ - missing_video_id = [x for x in annotations if 'video_id' not in x] - if missing_video_id: - image_id_to_video_id = { - x['id']: x['video_id'] for x in self.gt_data['images'] - } - for x in missing_video_id: - x['video_id'] = image_id_to_video_id[x['image_id']] - - @staticmethod - def _make_track_ids_unique(annotations): - """ - Makes the track IDs unique over the whole annotation set. Adapted from https://github.com/TAO-Dataset/ - :param annotations: the annotation set - :return: the number of updated IDs - """ - track_id_videos = {} - track_ids_to_update = set() - max_track_id = 0 - for ann in annotations: - t = ann['track_id'] - if t not in track_id_videos: - track_id_videos[t] = ann['video_id'] - - if ann['video_id'] != track_id_videos[t]: - # Track id is assigned to multiple videos - track_ids_to_update.add(t) - max_track_id = max(max_track_id, t) - - if track_ids_to_update: - print('true') - next_id = itertools.count(max_track_id + 1) - new_track_ids = defaultdict(lambda: next(next_id)) - for ann in annotations: - t = ann['track_id'] - v = ann['video_id'] - if t in track_ids_to_update: - ann['track_id'] = new_track_ids[t, v] - return len(track_ids_to_update) diff --git a/trackeval/datasets/tao.py b/trackeval/datasets/tao.py deleted file mode 100644 index e846167..0000000 --- a/trackeval/datasets/tao.py +++ /dev/null @@ -1,566 +0,0 @@ -import os -import numpy as np -import json -import itertools -from collections import defaultdict -from scipy.optimize import linear_sum_assignment -from ..utils import TrackEvalException -from ._base_dataset import _BaseDataset -from .. import utils -from .. 
import _timing - - -class TAO(_BaseDataset): - """Dataset class for TAO tracking""" - - @staticmethod - def get_default_dataset_config(): - """Default class config values""" - code_path = utils.get_code_path() - default_config = { - 'GT_FOLDER': os.path.join(code_path, 'data/gt/tao/tao_training'), # Location of GT data - 'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/tao/tao_training'), # Trackers location - 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER) - 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder) - 'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes) - 'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val' - 'PRINT_CONFIG': True, # Whether to print current config - 'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - 'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL - 'MAX_DETECTIONS': 300, # Number of maximal allowed detections per image (0 for unlimited) - } - return default_config - - def __init__(self, config=None): - """Initialise dataset, checking that all required files are present""" - super().__init__() - # Fill non-given config values with defaults - self.config = utils.init_config(config, self.get_default_dataset_config(), self.get_name()) - self.gt_fol = self.config['GT_FOLDER'] - self.tracker_fol = self.config['TRACKERS_FOLDER'] - self.should_classes_combine = True - self.use_super_categories = False - - self.tracker_sub_fol = self.config['TRACKER_SUB_FOLDER'] - self.output_fol = self.config['OUTPUT_FOLDER'] - if self.output_fol is None: - self.output_fol = self.tracker_fol - self.output_sub_fol = self.config['OUTPUT_SUB_FOLDER'] - - gt_dir_files = [file for file in os.listdir(self.gt_fol) if file.endswith('.json')] - if len(gt_dir_files) != 1: - raise TrackEvalException(self.gt_fol + ' does not contain exactly one json file.') - - with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f: - self.gt_data = json.load(f) - - # merge categories marked with a merged tag in TAO dataset - self._merge_categories(self.gt_data['annotations'] + self.gt_data['tracks']) - - # Get sequences to eval and sequence information - self.seq_list = [vid['name'].replace('/', '-') for vid in self.gt_data['videos']] - self.seq_name_to_seq_id = {vid['name'].replace('/', '-'): vid['id'] for vid in self.gt_data['videos']} - # compute mappings from videos to annotation data - self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(self.gt_data['annotations']) - # compute sequence lengths - self.seq_lengths = {vid['id']: 0 for vid in self.gt_data['videos']} - for img in self.gt_data['images']: - self.seq_lengths[img['video_id']] += 1 - self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings() - self.seq_to_classes = {vid['id']: {'pos_cat_ids': list({track['category_id'] for track - in self.videos_to_gt_tracks[vid['id']]}), - 'neg_cat_ids': vid['neg_category_ids'], - 'not_exhaustively_labeled_cat_ids': vid['not_exhaustive_category_ids']} - for vid in self.gt_data['videos']} - - # Get classes to eval - considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list] - seen_cats = set([cat_id for vid_id in considered_vid_ids for cat_id - in self.seq_to_classes[vid_id]['pos_cat_ids']]) - # only classes with ground truth are evaluated in TAO - self.valid_classes = 
[cls['name'] for cls in self.gt_data['categories'] if cls['id'] in seen_cats] - cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']} - - if self.config['CLASSES_TO_EVAL']: - self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None - for cls in self.config['CLASSES_TO_EVAL']] - if not all(self.class_list): - raise TrackEvalException('Attempted to evaluate an invalid class. Only classes ' + - ', '.join(self.valid_classes) + - ' are valid (classes present in ground truth data).') - else: - self.class_list = [cls for cls in self.valid_classes] - self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list} - - # Get trackers to eval - if self.config['TRACKERS_TO_EVAL'] is None: - self.tracker_list = os.listdir(self.tracker_fol) - else: - self.tracker_list = self.config['TRACKERS_TO_EVAL'] - - if self.config['TRACKER_DISPLAY_NAMES'] is None: - self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list)) - elif (self.config['TRACKERS_TO_EVAL'] is not None) and ( - len(self.config['TRACKER_DISPLAY_NAMES']) == len(self.tracker_list)): - self.tracker_to_disp = dict(zip(self.tracker_list, self.config['TRACKER_DISPLAY_NAMES'])) - else: - raise TrackEvalException('List of tracker files and tracker display names do not match.') - - self.tracker_data = {tracker: dict() for tracker in self.tracker_list} - - for tracker in self.tracker_list: - tr_dir_files = [file for file in os.listdir(os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)) - if file.endswith('.json')] - if len(tr_dir_files) != 1: - raise TrackEvalException(os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol) - + ' does not contain exactly one json file.') - with open(os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0])) as f: - curr_data = json.load(f) - - # limit detections if MAX_DETECTIONS > 0 - if self.config['MAX_DETECTIONS']: - curr_data = self._limit_dets_per_image(curr_data) - - # fill missing video ids - self._fill_video_ids_inplace(curr_data) - - # make track ids unique over whole evaluation set - self._make_track_ids_unique(curr_data) - - # merge categories marked with a merged tag in TAO dataset - self._merge_categories(curr_data) - - # get tracker sequence information - curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = self._compute_vid_mappings(curr_data) - self.tracker_data[tracker]['vids_to_tracks'] = curr_videos_to_tracker_tracks - self.tracker_data[tracker]['vids_to_images'] = curr_videos_to_tracker_images - - def get_display_name(self, tracker): - return self.tracker_to_disp[tracker] - - def _load_raw_file(self, tracker, seq, is_gt): - """Load a file (gt or tracker) in the TAO format - - If is_gt, this returns a dict which contains the fields: - [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det). - [gt_dets]: list (for each timestep) of lists of detections. - [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as - keys and corresponding segmentations as values) for each track - [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values - as keys and lists (for each track) as values - - if not is_gt, this returns a dict which contains the fields: - [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det). - [tracker_dets]: list (for each timestep) of lists of detections. 
- [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as - keys and corresponding segmentations as values) for each track - [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values - as keys and lists as values - [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values - """ - seq_id = self.seq_name_to_seq_id[seq] - # File location - if is_gt: - imgs = self.videos_to_gt_images[seq_id] - else: - imgs = self.tracker_data[tracker]['vids_to_images'][seq_id] - - # Convert data to required format - num_timesteps = self.seq_lengths[seq_id] - img_to_timestep = self.seq_to_images_to_timestep[seq_id] - data_keys = ['ids', 'classes', 'dets'] - if not is_gt: - data_keys += ['tracker_confidences'] - raw_data = {key: [None] * num_timesteps for key in data_keys} - for img in imgs: - # some tracker data contains images without any ground truth information, these are ignored - try: - t = img_to_timestep[img['id']] - except KeyError: - continue - annotations = img['annotations'] - raw_data['dets'][t] = np.atleast_2d([ann['bbox'] for ann in annotations]).astype(float) - raw_data['ids'][t] = np.atleast_1d([ann['track_id'] for ann in annotations]).astype(int) - raw_data['classes'][t] = np.atleast_1d([ann['category_id'] for ann in annotations]).astype(int) - if not is_gt: - raw_data['tracker_confidences'][t] = np.atleast_1d([ann['score'] for ann in annotations]).astype(float) - - for t, d in enumerate(raw_data['dets']): - if d is None: - raw_data['dets'][t] = np.empty((0, 4)).astype(float) - raw_data['ids'][t] = np.empty(0).astype(int) - raw_data['classes'][t] = np.empty(0).astype(int) - if not is_gt: - raw_data['tracker_confidences'][t] = np.empty(0) - - if is_gt: - key_map = {'ids': 'gt_ids', - 'classes': 'gt_classes', - 'dets': 'gt_dets'} - else: - key_map = {'ids': 'tracker_ids', - 'classes': 'tracker_classes', - 'dets': 'tracker_dets'} - for k, v in key_map.items(): - raw_data[v] = raw_data.pop(k) - - all_classes = [self.class_name_to_class_id[cls] for cls in self.class_list] - if is_gt: - classes_to_consider = all_classes - all_tracks = self.videos_to_gt_tracks[seq_id] - else: - classes_to_consider = self.seq_to_classes[seq_id]['pos_cat_ids'] \ - + self.seq_to_classes[seq_id]['neg_cat_ids'] - all_tracks = self.tracker_data[tracker]['vids_to_tracks'][seq_id] - - classes_to_tracks = {cls: [track for track in all_tracks if track['category_id'] == cls] - if cls in classes_to_consider else [] for cls in all_classes} - - # mapping from classes to track information - raw_data['classes_to_tracks'] = {cls: [{det['image_id']: np.atleast_1d(det['bbox']) - for det in track['annotations']} for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_ids'] = {cls: [track['id'] for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_areas'] = {cls: [track['area'] for track in tracks] - for cls, tracks in classes_to_tracks.items()} - raw_data['classes_to_track_lengths'] = {cls: [len(track['annotations']) for track in tracks] - for cls, tracks in classes_to_tracks.items()} - - if not is_gt: - raw_data['classes_to_dt_track_scores'] = {cls: np.array([np.mean([float(x['score']) - for x in track['annotations']]) - for track in tracks]) - for cls, tracks in classes_to_tracks.items()} - - if is_gt: - key_map = {'classes_to_tracks': 'classes_to_gt_tracks', - 'classes_to_track_ids': 
'classes_to_gt_track_ids', - 'classes_to_track_lengths': 'classes_to_gt_track_lengths', - 'classes_to_track_areas': 'classes_to_gt_track_areas'} - else: - key_map = {'classes_to_tracks': 'classes_to_dt_tracks', - 'classes_to_track_ids': 'classes_to_dt_track_ids', - 'classes_to_track_lengths': 'classes_to_dt_track_lengths', - 'classes_to_track_areas': 'classes_to_dt_track_areas'} - for k, v in key_map.items(): - raw_data[v] = raw_data.pop(k) - - raw_data['num_timesteps'] = num_timesteps - raw_data['neg_cat_ids'] = self.seq_to_classes[seq_id]['neg_cat_ids'] - raw_data['not_exhaustively_labeled_cls'] = self.seq_to_classes[seq_id]['not_exhaustively_labeled_cat_ids'] - raw_data['seq'] = seq - return raw_data - - @_timing.time - def get_preprocessed_seq_data(self, raw_data, cls): - """ Preprocess data for a single sequence for a single class ready for evaluation. - Inputs: - - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data(). - - cls is the class to be evaluated. - Outputs: - - data is a dict containing all of the information that metrics need to perform evaluation. - It contains the following fields: - [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers. - [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det). - [gt_dets, tracker_dets]: list (for each timestep) of lists of detections. - [similarity_scores]: list (for each timestep) of 2D NDArrays. - Notes: - General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps. - 1) Extract only detections relevant for the class to be evaluated (including distractor detections). - 2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a - distractor class, or otherwise marked as to be removed. - 3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain - other criteria (e.g. are too small). - 4) Remove gt dets that were only useful for preprocessing and not for actual evaluation. - After the above preprocessing steps, this function also calculates the number of gt and tracker detections - and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are - unique within each timestep. - TAO: - In TAO, the 4 preproc steps are as follow: - 1) All classes present in the ground truth data are evaluated separately. - 2) No matched tracker detections are removed. - 3) Unmatched tracker detections are removed if there is not ground truth data and the class does not - belong to the categories marked as negative for this sequence. Additionally, unmatched tracker - detections for classes which are marked as not exhaustively labeled are removed. - 4) No gt detections are removed. - Further, for TrackMAP computation track representations for the given class are accessed from a dictionary - and the tracks from the tracker data are sorted according to the tracker confidence. 
- """ - cls_id = self.class_name_to_class_id[cls] - is_not_exhaustively_labeled = cls_id in raw_data['not_exhaustively_labeled_cls'] - is_neg_category = cls_id in raw_data['neg_cat_ids'] - - data_keys = ['gt_ids', 'tracker_ids', 'gt_dets', 'tracker_dets', 'tracker_confidences', 'similarity_scores'] - data = {key: [None] * raw_data['num_timesteps'] for key in data_keys} - unique_gt_ids = [] - unique_tracker_ids = [] - num_gt_dets = 0 - num_tracker_dets = 0 - for t in range(raw_data['num_timesteps']): - - # Only extract relevant dets for this class for preproc and eval (cls) - gt_class_mask = np.atleast_1d(raw_data['gt_classes'][t] == cls_id) - gt_class_mask = gt_class_mask.astype(np.bool) - gt_ids = raw_data['gt_ids'][t][gt_class_mask] - gt_dets = raw_data['gt_dets'][t][gt_class_mask] - - tracker_class_mask = np.atleast_1d(raw_data['tracker_classes'][t] == cls_id) - tracker_class_mask = tracker_class_mask.astype(np.bool) - tracker_ids = raw_data['tracker_ids'][t][tracker_class_mask] - tracker_dets = raw_data['tracker_dets'][t][tracker_class_mask] - tracker_confidences = raw_data['tracker_confidences'][t][tracker_class_mask] - similarity_scores = raw_data['similarity_scores'][t][gt_class_mask, :][:, tracker_class_mask] - - # Match tracker and gt dets (with hungarian algorithm). - unmatched_indices = np.arange(tracker_ids.shape[0]) - if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0: - matching_scores = similarity_scores.copy() - matching_scores[matching_scores < 0.5 - np.finfo('float').eps] = 0 - match_rows, match_cols = linear_sum_assignment(-matching_scores) - actually_matched_mask = matching_scores[match_rows, match_cols] > 0 + np.finfo('float').eps - match_cols = match_cols[actually_matched_mask] - unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0) - - if gt_ids.shape[0] == 0 and not is_neg_category: - to_remove_tracker = unmatched_indices - elif is_not_exhaustively_labeled: - to_remove_tracker = unmatched_indices - else: - to_remove_tracker = np.array([], dtype=np.int) - - # remove all unwanted unmatched tracker detections - data['tracker_ids'][t] = np.delete(tracker_ids, to_remove_tracker, axis=0) - data['tracker_dets'][t] = np.delete(tracker_dets, to_remove_tracker, axis=0) - data['tracker_confidences'][t] = np.delete(tracker_confidences, to_remove_tracker, axis=0) - similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1) - - data['gt_ids'][t] = gt_ids - data['gt_dets'][t] = gt_dets - data['similarity_scores'][t] = similarity_scores - - unique_gt_ids += list(np.unique(data['gt_ids'][t])) - unique_tracker_ids += list(np.unique(data['tracker_ids'][t])) - num_tracker_dets += len(data['tracker_ids'][t]) - num_gt_dets += len(data['gt_ids'][t]) - - # Re-label IDs such that there are no empty IDs - if len(unique_gt_ids) > 0: - unique_gt_ids = np.unique(unique_gt_ids) - gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1)) - gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids)) - for t in range(raw_data['num_timesteps']): - if len(data['gt_ids'][t]) > 0: - data['gt_ids'][t] = gt_id_map[data['gt_ids'][t]].astype(np.int) - if len(unique_tracker_ids) > 0: - unique_tracker_ids = np.unique(unique_tracker_ids) - tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1)) - tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids)) - for t in range(raw_data['num_timesteps']): - if len(data['tracker_ids'][t]) > 0: - data['tracker_ids'][t] = tracker_id_map[data['tracker_ids'][t]].astype(np.int) - - # Record overview statistics. 
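The relabelling step just above maps the sparse original ids onto a contiguous 0..N-1 range through a lookup array indexed by the old ids. A compact standalone sketch of the same idea (the id values are illustrative only):

    import numpy as np

    # Hypothetical ids collected over all timesteps (sparse and non-contiguous).
    unique_ids = np.unique([7, 3, 7, 11])                # -> [ 3  7 11]
    id_map = np.nan * np.ones(int(np.max(unique_ids)) + 1)
    id_map[unique_ids] = np.arange(len(unique_ids))      # 3 -> 0, 7 -> 1, 11 -> 2

    ids_t = np.array([11, 7, 3])                         # ids seen in one timestep
    print(id_map[ids_t].astype(np.int64))                # -> [2 1 0]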
- data['num_tracker_dets'] = num_tracker_dets - data['num_gt_dets'] = num_gt_dets - data['num_tracker_ids'] = len(unique_tracker_ids) - data['num_gt_ids'] = len(unique_gt_ids) - data['num_timesteps'] = raw_data['num_timesteps'] - data['seq'] = raw_data['seq'] - - # get track representations - data['gt_tracks'] = raw_data['classes_to_gt_tracks'][cls_id] - data['gt_track_ids'] = raw_data['classes_to_gt_track_ids'][cls_id] - data['gt_track_lengths'] = raw_data['classes_to_gt_track_lengths'][cls_id] - data['gt_track_areas'] = raw_data['classes_to_gt_track_areas'][cls_id] - data['dt_tracks'] = raw_data['classes_to_dt_tracks'][cls_id] - data['dt_track_ids'] = raw_data['classes_to_dt_track_ids'][cls_id] - data['dt_track_lengths'] = raw_data['classes_to_dt_track_lengths'][cls_id] - data['dt_track_areas'] = raw_data['classes_to_dt_track_areas'][cls_id] - data['dt_track_scores'] = raw_data['classes_to_dt_track_scores'][cls_id] - data['not_exhaustively_labeled'] = is_not_exhaustively_labeled - data['iou_type'] = 'bbox' - - # sort tracker data tracks by tracker confidence scores - if data['dt_tracks']: - idx = np.argsort([-score for score in data['dt_track_scores']], kind="mergesort") - data['dt_track_scores'] = [data['dt_track_scores'][i] for i in idx] - data['dt_tracks'] = [data['dt_tracks'][i] for i in idx] - data['dt_track_ids'] = [data['dt_track_ids'][i] for i in idx] - data['dt_track_lengths'] = [data['dt_track_lengths'][i] for i in idx] - data['dt_track_areas'] = [data['dt_track_areas'][i] for i in idx] - # Ensure that ids are unique per timestep. - self._check_unique_ids(data) - - return data - - def _calculate_similarities(self, gt_dets_t, tracker_dets_t): - similarity_scores = self._calculate_box_ious(gt_dets_t, tracker_dets_t) - return similarity_scores - - def _merge_categories(self, annotations): - """ - Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset - :param annotations: the annotations in which the classes should be merged - :return: None - """ - merge_map = {} - for category in self.gt_data['categories']: - if 'merged' in category: - for to_merge in category['merged']: - merge_map[to_merge['id']] = category['id'] - - for ann in annotations: - ann['category_id'] = merge_map.get(ann['category_id'], ann['category_id']) - - def _compute_vid_mappings(self, annotations): - """ - Computes mappings from Videos to corresponding tracks and images. 
- :param annotations: the annotations for which the mapping should be generated - :return: the video-to-track-mapping, the video-to-image-mapping - """ - vids_to_tracks = {} - vids_to_imgs = {} - vid_ids = [vid['id'] for vid in self.gt_data['videos']] - - # compute an mapping from image IDs to images - images = {} - for image in self.gt_data['images']: - images[image['id']] = image - - for ann in annotations: - ann["area"] = ann["bbox"][2] * ann["bbox"][3] - - vid = ann["video_id"] - if ann["video_id"] not in vids_to_tracks.keys(): - vids_to_tracks[ann["video_id"]] = list() - if ann["video_id"] not in vids_to_imgs.keys(): - vids_to_imgs[ann["video_id"]] = list() - - # Fill in vids_to_tracks - tid = ann["track_id"] - exist_tids = [track["id"] for track in vids_to_tracks[vid]] - try: - index1 = exist_tids.index(tid) - except ValueError: - index1 = -1 - if tid not in exist_tids: - curr_track = {"id": tid, "category_id": ann['category_id'], - "video_id": vid, "annotations": [ann]} - vids_to_tracks[vid].append(curr_track) - else: - vids_to_tracks[vid][index1]["annotations"].append(ann) - - # Fill in vids_to_imgs - img_id = ann['image_id'] - exist_img_ids = [img["id"] for img in vids_to_imgs[vid]] - try: - index2 = exist_img_ids.index(img_id) - except ValueError: - index2 = -1 - if index2 == -1: - curr_img = {"id": img_id, "annotations": [ann]} - vids_to_imgs[vid].append(curr_img) - else: - vids_to_imgs[vid][index2]["annotations"].append(ann) - - # sort annotations by frame index and compute track area - for vid, tracks in vids_to_tracks.items(): - for track in tracks: - track["annotations"] = sorted( - track['annotations'], - key=lambda x: images[x['image_id']]['frame_index']) - # Computer average area - track["area"] = (sum(x['area'] for x in track['annotations']) / len(track['annotations'])) - - # Ensure all videos are present - for vid_id in vid_ids: - if vid_id not in vids_to_tracks.keys(): - vids_to_tracks[vid_id] = [] - if vid_id not in vids_to_imgs.keys(): - vids_to_imgs[vid_id] = [] - - return vids_to_tracks, vids_to_imgs - - def _compute_image_to_timestep_mappings(self): - """ - Computes a mapping from images to the corresponding timestep in the sequence. - :return: the image-to-timestep-mapping - """ - images = {} - for image in self.gt_data['images']: - images[image['id']] = image - - seq_to_imgs_to_timestep = {vid['id']: dict() for vid in self.gt_data['videos']} - for vid in seq_to_imgs_to_timestep: - curr_imgs = [img['id'] for img in self.videos_to_gt_images[vid]] - curr_imgs = sorted(curr_imgs, key=lambda x: images[x]['frame_index']) - seq_to_imgs_to_timestep[vid] = {curr_imgs[i]: i for i in range(len(curr_imgs))} - - return seq_to_imgs_to_timestep - - def _limit_dets_per_image(self, annotations): - """ - Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from - https://github.com/TAO-Dataset/ - :param annotations: the annotations in which the detections should be limited - :return: the annotations with limited detections - """ - max_dets = self.config['MAX_DETECTIONS'] - img_ann = defaultdict(list) - for ann in annotations: - img_ann[ann["image_id"]].append(ann) - - for img_id, _anns in img_ann.items(): - if len(_anns) <= max_dets: - continue - _anns = sorted(_anns, key=lambda x: x["score"], reverse=True) - img_ann[img_id] = _anns[:max_dets] - - return [ann for anns in img_ann.values() for ann in anns] - - def _fill_video_ids_inplace(self, annotations): - """ - Fills in missing video IDs inplace. 
Adapted from https://github.com/TAO-Dataset/ - :param annotations: the annotations for which the videos IDs should be filled inplace - :return: None - """ - missing_video_id = [x for x in annotations if 'video_id' not in x] - if missing_video_id: - image_id_to_video_id = { - x['id']: x['video_id'] for x in self.gt_data['images'] - } - for x in missing_video_id: - x['video_id'] = image_id_to_video_id[x['image_id']] - - @staticmethod - def _make_track_ids_unique(annotations): - """ - Makes the track IDs unqiue over the whole annotation set. Adapted from https://github.com/TAO-Dataset/ - :param annotations: the annotation set - :return: the number of updated IDs - """ - track_id_videos = {} - track_ids_to_update = set() - max_track_id = 0 - for ann in annotations: - t = ann['track_id'] - if t not in track_id_videos: - track_id_videos[t] = ann['video_id'] - - if ann['video_id'] != track_id_videos[t]: - # Track id is assigned to multiple videos - track_ids_to_update.add(t) - max_track_id = max(max_track_id, t) - - if track_ids_to_update: - print('true') - next_id = itertools.count(max_track_id + 1) - new_track_ids = defaultdict(lambda: next(next_id)) - for ann in annotations: - t = ann['track_id'] - v = ann['video_id'] - if t in track_ids_to_update: - ann['track_id'] = new_track_ids[t, v] - return len(track_ids_to_update) diff --git a/trackeval/eval.py b/trackeval/eval.py deleted file mode 100644 index d672ba2..0000000 --- a/trackeval/eval.py +++ /dev/null @@ -1,302 +0,0 @@ -import time -import traceback -from multiprocessing.pool import Pool -from functools import partial -import os -from . import utils -from .utils import TrackEvalException -from . import _timing -from .metrics import Count - -try: - import tqdm - TQDM_IMPORTED = True -except ImportError as _: - TQDM_IMPORTED = False - - -class Evaluator: - """Evaluator class for evaluating different metrics for different datasets""" - - @staticmethod - def get_default_eval_config(): - """Returns the default config values for evaluation""" - code_path = utils.get_code_path() - default_config = { - 'USE_PARALLEL': False, - 'NUM_PARALLEL_CORES': 8, - 'BREAK_ON_ERROR': True, # Raises exception and exits with error - 'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error - 'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file. - - 'PRINT_RESULTS': True, - 'PRINT_ONLY_COMBINED': False, - 'PRINT_CONFIG': True, - 'TIME_PROGRESS': True, - 'DISPLAY_LESS_PROGRESS': True, - - 'OUTPUT_SUMMARY': True, - 'OUTPUT_EMPTY_CLASSES': True, # If False, summary files are not output for classes with no detections - 'OUTPUT_DETAILED': True, - 'PLOT_CURVES': True, - } - return default_config - - def __init__(self, config=None): - """Initialise the evaluator with a config file""" - self.config = utils.init_config(config, self.get_default_eval_config(), 'Eval') - # Only run timing analysis if not run in parallel. 
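(The flags set just below keep the timing analysis active only for serial runs.) For context on how these defaults are consumed, the evaluator is normally driven by a small script that builds one config per component and passes dataset and metric instances to evaluate(). The sketch below assumes the package removed in this diff is importable, that trackeval.datasets re-exports the TAO class, and that a tracker folder named 'my_tracker' exists; all three are assumptions, not taken from this file:

    import trackeval  # assumes the package deleted in this diff is installed

    eval_config = trackeval.Evaluator.get_default_eval_config()
    eval_config['USE_PARALLEL'] = False                    # serial run keeps timing analysis on
    dataset_config = trackeval.datasets.TAO.get_default_dataset_config()
    dataset_config['TRACKERS_TO_EVAL'] = ['my_tracker']    # hypothetical tracker folder name

    evaluator = trackeval.Evaluator(eval_config)
    dataset_list = [trackeval.datasets.TAO(dataset_config)]
    metrics_list = [trackeval.metrics.HOTA(), trackeval.metrics.TrackMAP()]
    output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)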
- if self.config['TIME_PROGRESS'] and not self.config['USE_PARALLEL']: - _timing.DO_TIMING = True - if self.config['DISPLAY_LESS_PROGRESS']: - _timing.DISPLAY_LESS_PROGRESS = True - - @_timing.time - def evaluate(self, dataset_list, metrics_list, show_progressbar=False): - """Evaluate a set of metrics on a set of datasets""" - config = self.config - metrics_list = metrics_list + [Count()] # Count metrics are always run - metric_names = utils.validate_metrics_list(metrics_list) - dataset_names = [dataset.get_name() for dataset in dataset_list] - output_res = {} - output_msg = {} - - for dataset, dataset_name in zip(dataset_list, dataset_names): - # Get dataset info about what to evaluate - output_res[dataset_name] = {} - output_msg[dataset_name] = {} - tracker_list, seq_list, class_list = dataset.get_eval_info() - print('\nEvaluating %i tracker(s) on %i sequence(s) for %i class(es) on %s dataset using the following ' - 'metrics: %s\n' % (len(tracker_list), len(seq_list), len(class_list), dataset_name, - ', '.join(metric_names))) - - # Evaluate each tracker - for tracker in tracker_list: - # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking - try: - # Evaluate each sequence in parallel or in series. - # returns a nested dict (res), indexed like: res[seq][class][metric_name][sub_metric field] - # e.g. res[seq_0001][pedestrian][hota][DetA] - print('\nEvaluating %s\n' % tracker) - time_start = time.time() - if config['USE_PARALLEL']: - print('not implemented!!!') - if show_progressbar and TQDM_IMPORTED: - seq_list_sorted = sorted(seq_list) - - with Pool(config['NUM_PARALLEL_CORES']) as pool, tqdm.tqdm(total=len(seq_list)) as pbar: - _eval_sequence = partial(eval_sequence, dataset=dataset, tracker=tracker, - class_list=class_list, metrics_list=metrics_list, - metric_names=metric_names) - results = [] - for r in pool.imap(_eval_sequence, seq_list_sorted, - chunksize=20): - results.append(r) - pbar.update() - res = dict(zip(seq_list_sorted, results)) - - else: - with Pool(config['NUM_PARALLEL_CORES']) as pool: - _eval_sequence = partial(eval_sequence, dataset=dataset, tracker=tracker, - class_list=class_list, metrics_list=metrics_list, - metric_names=metric_names) - results = pool.map(_eval_sequence, seq_list) - res = dict(zip(seq_list, results)) - else: - res = {} - if show_progressbar and TQDM_IMPORTED: - seq_list_sorted = sorted(seq_list) - for curr_seq in tqdm.tqdm(seq_list_sorted): - if isinstance(curr_seq, tuple): - curr_seq_name = curr_seq[0] + '_' + curr_seq[1] - else: - curr_seq_name = curr_seq - res[curr_seq_name] = eval_sequence(curr_seq, dataset, tracker, class_list, metrics_list, - metric_names) - else: - for curr_seq in sorted(seq_list): - if isinstance(curr_seq, tuple): - curr_seq_name = curr_seq[0] + '_' + curr_seq[1] - else: - curr_seq_name = curr_seq - res[curr_seq_name] = eval_sequence(curr_seq, dataset, tracker, class_list, metrics_list, - metric_names) - - # Combine results over all sequences and then over all classes - - # collecting combined cls keys (cls averaged, det averaged, super classes) - combined_cls_keys = [] - res['COMBINED_SEQ'] = {} - # combine sequences for each class - for c_cls in class_list: - res['COMBINED_SEQ'][c_cls] = {} - for metric, metric_name in zip(metrics_list, metric_names): - curr_res = {seq_key: seq_value[c_cls][metric_name] for seq_key, seq_value in res.items() if - seq_key != 'COMBINED_SEQ'} - res['COMBINED_SEQ'][c_cls][metric_name] = metric.combine_sequences(curr_res) - # # --------------------------------------- - # 
import json - # rel_to_seq = json.load(open("rel_class_to_seq.json")) - # rel_class_list = rel_to_seq.keys() - # out = {} - # out['COMBINED_SEQ'] = {} - # for idx, r_cls in enumerate(rel_class_list): - # out['COMBINED_SEQ'][r_cls] = {} - # curr_res = {} - - # for metric, metric_name in zip(metrics_list, metric_names): - # if metric_name not in ['STMAP', 'Count']: continue - # for seq_dict in rel_to_seq[r_cls]: - # for seq, o_classes in seq_dict.items(): - # for o_cls in o_classes: - # if o_cls not in class_list: continue - # seq_ = seq + '_' + o_cls - # curr_res[seq_] = res[seq][o_cls][metric_name] - # try: - # # out['COMBINED_SEQ'][r_cls][metric_name] = metric.combine_sequences(curr_res) - # out_ = metric.combine_sequences(curr_res) - # for k, v in out_.items(): - # out_[k] = v.tolist() - # out['COMBINED_SEQ'][r_cls][metric_name] = out_ - # except: - # pass - - # # --------------------------------------- - # combine classes - if dataset.should_classes_combine: - combined_cls_keys += ['cls_comb_cls_av', 'cls_comb_det_av', 'all'] - res['COMBINED_SEQ']['cls_comb_cls_av'] = {} - res['COMBINED_SEQ']['cls_comb_det_av'] = {} - for metric, metric_name in zip(metrics_list, metric_names): - cls_res = {cls_key: cls_value[metric_name] for cls_key, cls_value in - res['COMBINED_SEQ'].items() if cls_key not in combined_cls_keys} - res['COMBINED_SEQ']['cls_comb_cls_av'][metric_name] = \ - metric.combine_classes_class_averaged(cls_res) - res['COMBINED_SEQ']['cls_comb_det_av'][metric_name] = \ - metric.combine_classes_det_averaged(cls_res) - # combine classes to super classes - if dataset.use_super_categories: - for cat, sub_cats in dataset.super_categories.items(): - combined_cls_keys.append(cat) - res['COMBINED_SEQ'][cat] = {} - for metric, metric_name in zip(metrics_list, metric_names): - cat_res = {cls_key: cls_value[metric_name] for cls_key, cls_value in - res['COMBINED_SEQ'].items() if cls_key in sub_cats} - res['COMBINED_SEQ'][cat][metric_name] = metric.combine_classes_det_averaged(cat_res) - - # Print and output results in various formats - if config['TIME_PROGRESS']: - print('\nAll sequences for %s finished in %.2f seconds' % (tracker, time.time() - time_start)) - output_fol = dataset.get_output_fol(tracker) - tracker_display_name = dataset.get_display_name(tracker) - # for c_cls in rel_class_list: # class_list + combined classes if calculated - # summaries = [] - # details = [] - # num_dets = out['COMBINED_SEQ'][c_cls]['Count']['Dets'] - # if config['OUTPUT_EMPTY_CLASSES'] or num_dets > 0: - # for metric, metric_name in zip(metrics_list, metric_names): - # if metric_name not in ['STMAP', 'Count']: continue - # # for combined classes there is no per sequence evaluation - # if c_cls in combined_cls_keys: - # table_res = {'COMBINED_SEQ': out['COMBINED_SEQ'][c_cls][metric_name]} - # else: - # table_res = {seq_key: seq_value[c_cls][metric_name] for seq_key, seq_value - # in out.items()} - - # if config['PRINT_RESULTS'] and config['PRINT_ONLY_COMBINED']: - # dont_print = dataset.should_classes_combine and c_cls not in combined_cls_keys - # if not dont_print: - # metric.print_table({'COMBINED_SEQ': table_res['COMBINED_SEQ']}, - # tracker_display_name, c_cls) - # elif config['PRINT_RESULTS']: - # metric.print_table(table_res, tracker_display_name, c_cls) - # if config['OUTPUT_SUMMARY']: - # summaries.append(metric.summary_results(table_res)) - # if config['OUTPUT_DETAILED']: - # details.append(metric.detailed_results(table_res)) - # if config['PLOT_CURVES']: - # 
metric.plot_single_tracker_results(table_res, tracker_display_name, c_cls, - # output_fol) - # if config['OUTPUT_SUMMARY']: - # utils.write_summary_results(summaries, c_cls, output_fol) - # if config['OUTPUT_DETAILED']: - # utils.write_detailed_results(details, c_cls, output_fol) - - # # ------ - for c_cls in res['COMBINED_SEQ'].keys(): # class_list + combined classes if calculated - summaries = [] - details = [] - num_dets = res['COMBINED_SEQ'][c_cls]['Count']['Dets'] - if config['OUTPUT_EMPTY_CLASSES'] or num_dets > 0: - for metric, metric_name in zip(metrics_list, metric_names): - # for combined classes there is no per sequence evaluation - if c_cls in combined_cls_keys: - table_res = {'COMBINED_SEQ': res['COMBINED_SEQ'][c_cls][metric_name]} - else: - table_res = {seq_key: seq_value[c_cls][metric_name] for seq_key, seq_value - in res.items()} - - if config['PRINT_RESULTS'] and config['PRINT_ONLY_COMBINED']: - dont_print = dataset.should_classes_combine and c_cls not in combined_cls_keys - if not dont_print: - metric.print_table({'COMBINED_SEQ': table_res['COMBINED_SEQ']}, - tracker_display_name, c_cls) - elif config['PRINT_RESULTS']: - metric.print_table(table_res, tracker_display_name, c_cls) - if config['OUTPUT_SUMMARY']: - summaries.append(metric.summary_results(table_res)) - if config['OUTPUT_DETAILED']: - details.append(metric.detailed_results(table_res)) - if config['PLOT_CURVES']: - metric.plot_single_tracker_results(table_res, tracker_display_name, c_cls, - output_fol) - if config['OUTPUT_SUMMARY']: - utils.write_summary_results(summaries, c_cls, output_fol) - if config['OUTPUT_DETAILED']: - utils.write_detailed_results(details, c_cls, output_fol) - # ------- - - # Output for returning from function - output_res[dataset_name][tracker] = res - output_msg[dataset_name][tracker] = 'Success' - import pickle - with open(output_fol + '/output_res.pkl', 'wb') as f: - pickle.dump(output_res, f, pickle.HIGHEST_PROTOCOL) - - - except Exception as err: - output_res[dataset_name][tracker] = None - if type(err) == TrackEvalException: - output_msg[dataset_name][tracker] = str(err) - else: - output_msg[dataset_name][tracker] = 'Unknown error occurred.' - print('Tracker %s was unable to be evaluated.' 
% tracker) - print(err) - traceback.print_exc() - if config['LOG_ON_ERROR'] is not None: - with open(config['LOG_ON_ERROR'], 'a') as f: - print(dataset_name, file=f) - print(tracker, file=f) - print(traceback.format_exc(), file=f) - print('\n\n\n', file=f) - if config['BREAK_ON_ERROR']: - raise err - elif config['RETURN_ON_ERROR']: - return output_res, output_msg - - return output_res, output_msg - - -@_timing.time -def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names): - """Function for evaluating a single sequence""" - - raw_data = dataset.get_raw_seq_data(tracker, seq) - # raw_data = dataset.get_raw_seq_data(tracker, seq[0], seq[1]) - seq_res = {} - for cls in class_list: - seq_res[cls] = {} - data = dataset.get_preprocessed_seq_data(raw_data, cls) - for metric, met_name in zip(metrics_list, metric_names): - seq_res[cls][met_name] = metric.eval_sequence(data) - return seq_res diff --git a/trackeval/metrics/__init__.py b/trackeval/metrics/__init__.py deleted file mode 100644 index 69c9fba..0000000 --- a/trackeval/metrics/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .hota import HOTA -from .clear import CLEAR -from .identity import Identity -from .count import Count -from .j_and_f import JAndF -from .track_map import TrackMAP -from .vace import VACE -from .ideucl import IDEucl -from .st_map import STMAP -from .clear_tr import CLEARTR \ No newline at end of file diff --git a/trackeval/metrics/_base_metric.py b/trackeval/metrics/_base_metric.py deleted file mode 100644 index ea48885..0000000 --- a/trackeval/metrics/_base_metric.py +++ /dev/null @@ -1,133 +0,0 @@ - -import numpy as np -from abc import ABC, abstractmethod -from .. import _timing -from ..utils import TrackEvalException - - -class _BaseMetric(ABC): - @abstractmethod - def __init__(self): - self.plottable = False - self.integer_fields = [] - self.float_fields = [] - self.array_labels = [] - self.integer_array_fields = [] - self.float_array_fields = [] - self.fields = [] - self.summary_fields = [] - self.registered = False - - ##################################################################### - # Abstract functions for subclasses to implement - - @_timing.time - @abstractmethod - def eval_sequence(self, data): - ... - - @abstractmethod - def combine_sequences(self, all_res): - ... - - @abstractmethod - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - ... - - @ abstractmethod - def combine_classes_det_averaged(self, all_res): - ... 
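# Illustrative sketch of the result nesting the evaluator builds above, using a
# hypothetical 'ToyCount' metric and made-up counts: per-sequence results are
# stored as res[seq][cls][metric_name] and then reduced into res['COMBINED_SEQ']
# via each metric's combine_sequences().

def combine_sequences(per_seq_results):
    # Sum integer fields over sequences, mirroring the _combine_sum helper below.
    return {field: sum(r[field] for r in per_seq_results.values())
            for field in ('TP', 'FN', 'FP')}

res = {
    'seq_0001': {'pedestrian': {'ToyCount': {'TP': 10, 'FN': 2, 'FP': 1}}},
    'seq_0002': {'pedestrian': {'ToyCount': {'TP': 7, 'FN': 5, 'FP': 3}}},
}

res['COMBINED_SEQ'] = {'pedestrian': {}}
curr_res = {seq: v['pedestrian']['ToyCount']
            for seq, v in res.items() if seq != 'COMBINED_SEQ'}
res['COMBINED_SEQ']['pedestrian']['ToyCount'] = combine_sequences(curr_res)
print(res['COMBINED_SEQ']['pedestrian']['ToyCount'])  # {'TP': 17, 'FN': 7, 'FP': 4}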
- - def plot_single_tracker_results(self, all_res, tracker, output_folder, cls): - """Plot results of metrics, only valid for metrics with self.plottable""" - if self.plottable: - raise NotImplementedError('plot_results is not implemented for metric %s' % self.get_name()) - else: - pass - - ##################################################################### - # Helper functions which are useful for all metrics: - - @classmethod - def get_name(cls): - return cls.__name__ - - @staticmethod - def _combine_sum(all_res, field): - """Combine sequence results via sum""" - return sum([all_res[k][field] for k in all_res.keys()]) - - @staticmethod - def _combine_weighted_av(all_res, field, comb_res, weight_field): - """Combine sequence results via weighted average""" - return sum([all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]) / np.maximum(1.0, comb_res[ - weight_field]) - - def print_table(self, table_res, tracker, cls): - """Prints table of results for all sequences""" - print('') - metric_name = self.get_name() - self._row_print([metric_name + ': ' + tracker + '-' + cls] + self.summary_fields) - for seq, results in sorted(table_res.items()): - if seq == 'COMBINED_SEQ': - continue - summary_res = self._summary_row(results) - self._row_print([seq] + summary_res) - summary_res = self._summary_row(table_res['COMBINED_SEQ']) - self._row_print(['COMBINED'] + summary_res) - - def _summary_row(self, results_): - vals = [] - for h in self.summary_fields: - if h in self.float_array_fields: - vals.append("{0:1.5g}".format(100 * np.mean(results_[h]))) - elif h in self.float_fields: - vals.append("{0:1.5g}".format(100 * float(results_[h]))) - elif h in self.integer_fields: - vals.append("{0:d}".format(int(results_[h]))) - else: - raise NotImplementedError("Summary function not implemented for this field type.") - return vals - - @staticmethod - def _row_print(*argv): - """Prints results in an evenly spaced rows, with more space in first row""" - if len(argv) == 1: - argv = argv[0] - to_print = '%-35s' % argv[0] - for v in argv[1:]: - to_print += '%-10s' % str(v) - print(to_print) - - def summary_results(self, table_res): - """Returns a simple summary of final results for a tracker""" - return dict(zip(self.summary_fields, self._summary_row(table_res['COMBINED_SEQ']))) - - def detailed_results(self, table_res): - """Returns detailed final results for a tracker""" - # Get detailed field information - detailed_fields = self.float_fields + self.integer_fields - for h in self.float_array_fields + self.integer_array_fields: - for alpha in [int(100*x) for x in self.array_labels]: - detailed_fields.append(h + '___' + str(alpha)) - detailed_fields.append(h + '___AUC') - - # Get detailed results - detailed_results = {} - for seq, res in table_res.items(): - detailed_row = self._detailed_row(res) - if len(detailed_row) != len(detailed_fields): - raise TrackEvalException( - 'Field names and data have different sizes (%i and %i)' % (len(detailed_row), len(detailed_fields))) - detailed_results[seq] = dict(zip(detailed_fields, detailed_row)) - return detailed_results - - def _detailed_row(self, res): - detailed_row = [] - for h in self.float_fields + self.integer_fields: - detailed_row.append(res[h]) - for h in self.float_array_fields + self.integer_array_fields: - for i, alpha in enumerate([int(100 * x) for x in self.array_labels]): - detailed_row.append(res[h][i]) - detailed_row.append(np.mean(res[h])) - return detailed_row diff --git a/trackeval/metrics/clear.py b/trackeval/metrics/clear.py 
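import numpy as np

# Standalone restatement, for illustration only, of the two helpers every metric
# reuses: _combine_sum and _combine_weighted_av. The field names ('LocA',
# 'HOTA_TP') and values are made up, chosen to mirror how HOTA later calls the
# weighted average with HOTA_TP as the weight.

def combine_sum(all_res, field):
    return sum(all_res[k][field] for k in all_res)

def combine_weighted_av(all_res, field, comb_res, weight_field):
    return sum(all_res[k][field] * all_res[k][weight_field]
               for k in all_res) / np.maximum(1.0, comb_res[weight_field])

all_res = {
    'seq_a': {'LocA': 0.90, 'HOTA_TP': 100},
    'seq_b': {'LocA': 0.70, 'HOTA_TP': 300},
}
comb = {'HOTA_TP': combine_sum(all_res, 'HOTA_TP')}                   # 400
comb['LocA'] = combine_weighted_av(all_res, 'LocA', comb, 'HOTA_TP')
print(comb['LocA'])  # 0.75: sequences with more TPs dominate the average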
deleted file mode 100644 index 8b5e291..0000000 --- a/trackeval/metrics/clear.py +++ /dev/null @@ -1,186 +0,0 @@ - -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing -from .. import utils - -class CLEAR(_BaseMetric): - """Class which implements the CLEAR metrics""" - - @staticmethod - def get_default_config(): - """Default class config values""" - default_config = { - 'THRESHOLD': 0.5, # Similarity score threshold required for a TP match. Default 0.5. - 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: False. - } - return default_config - - def __init__(self, config=None): - super().__init__() - main_integer_fields = ['CLR_TP', 'CLR_FN', 'CLR_FP', 'IDSW', 'MT', 'PT', 'ML', 'Frag'] - extra_integer_fields = ['CLR_Frames'] - self.integer_fields = main_integer_fields + extra_integer_fields - main_float_fields = ['MOTA', 'MOTP', 'MODA', 'CLR_Re', 'CLR_Pr', 'MTR', 'PTR', 'MLR', 'sMOTA'] - extra_float_fields = ['CLR_F1', 'FP_per_frame', 'MOTAL', 'MOTP_sum'] - self.float_fields = main_float_fields + extra_float_fields - self.fields = self.float_fields + self.integer_fields - self.summed_fields = self.integer_fields + ['MOTP_sum'] - self.summary_fields = main_float_fields + main_integer_fields - - # Configuration options: - self.config = utils.init_config(config, self.get_default_config(), self.get_name()) - self.threshold = float(self.config['THRESHOLD']) - - - @_timing.time - def eval_sequence(self, data): - """Calculates CLEAR metrics for one sequence""" - # Initialise results - res = {} - for field in self.fields: - res[field] = 0 - - # Return result quickly if tracker or gt sequence is empty - if data['num_tracker_dets'] == 0: - res['CLR_FN'] = data['num_gt_dets'] - res['ML'] = data['num_gt_ids'] - res['MLR'] = 1.0 - return res - if data['num_gt_dets'] == 0: - res['CLR_FP'] = data['num_tracker_dets'] - res['MLR'] = 1.0 - return res - - # Variables counting global association - num_gt_ids = data['num_gt_ids'] - gt_id_count = np.zeros(num_gt_ids) # For MT/ML/PT - gt_matched_count = np.zeros(num_gt_ids) # For MT/ML/PT - gt_frag_count = np.zeros(num_gt_ids) # For Frag - - # Note that IDSWs are counted based on the last time each gt_id was present (any number of frames previously), - # but are only used in matching to continue current tracks based on the gt_id in the single previous timestep. - prev_tracker_id = np.nan * np.zeros(num_gt_ids) # For scoring IDSW - prev_timestep_tracker_id = np.nan * np.zeros(num_gt_ids) # For matching IDSW - - # Calculate scores for each timestep - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - # Deal with the case that there are no gt_det/tracker_det in a timestep. 
- if len(gt_ids_t) == 0: - res['CLR_FP'] += len(tracker_ids_t) - continue - if len(tracker_ids_t) == 0: - res['CLR_FN'] += len(gt_ids_t) - gt_id_count[gt_ids_t] += 1 - continue - - # Calc score matrix to first minimise IDSWs from previous frame, and then maximise MOTP secondarily - similarity = data['similarity_scores'][t] - score_mat = (tracker_ids_t[np.newaxis, :] == prev_timestep_tracker_id[gt_ids_t[:, np.newaxis]]) - score_mat = 1000 * score_mat + similarity - score_mat[similarity < self.threshold - np.finfo('float').eps] = 0 - - # Hungarian algorithm to find best matches - match_rows, match_cols = linear_sum_assignment(-score_mat) - actually_matched_mask = score_mat[match_rows, match_cols] > 0 + np.finfo('float').eps - match_rows = match_rows[actually_matched_mask] - match_cols = match_cols[actually_matched_mask] - - matched_gt_ids = gt_ids_t[match_rows] - matched_tracker_ids = tracker_ids_t[match_cols] - - # Calc IDSW for MOTA - prev_matched_tracker_ids = prev_tracker_id[matched_gt_ids] - is_idsw = (np.logical_not(np.isnan(prev_matched_tracker_ids))) & ( - np.not_equal(matched_tracker_ids, prev_matched_tracker_ids)) - res['IDSW'] += np.sum(is_idsw) - - # Update counters for MT/ML/PT/Frag and record for IDSW/Frag for next timestep - gt_id_count[gt_ids_t] += 1 - gt_matched_count[matched_gt_ids] += 1 - not_previously_tracked = np.isnan(prev_timestep_tracker_id) - prev_tracker_id[matched_gt_ids] = matched_tracker_ids - prev_timestep_tracker_id[:] = np.nan - prev_timestep_tracker_id[matched_gt_ids] = matched_tracker_ids - currently_tracked = np.logical_not(np.isnan(prev_timestep_tracker_id)) - gt_frag_count += np.logical_and(not_previously_tracked, currently_tracked) - - # Calculate and accumulate basic statistics - num_matches = len(matched_gt_ids) - res['CLR_TP'] += num_matches - res['CLR_FN'] += len(gt_ids_t) - num_matches - res['CLR_FP'] += len(tracker_ids_t) - num_matches - if num_matches > 0: - res['MOTP_sum'] += sum(similarity[match_rows, match_cols]) - - # Calculate MT/ML/PT/Frag/MOTP - tracked_ratio = gt_matched_count[gt_id_count > 0] / gt_id_count[gt_id_count > 0] - res['MT'] = np.sum(np.greater(tracked_ratio, 0.8)) - res['PT'] = np.sum(np.greater_equal(tracked_ratio, 0.2)) - res['MT'] - res['ML'] = num_gt_ids - res['MT'] - res['PT'] - res['Frag'] = np.sum(np.subtract(gt_frag_count[gt_frag_count > 0], 1)) - res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP']) - - res['CLR_Frames'] = data['num_timesteps'] - - # Calculate final CLEAR scores - res = self._compute_final_fields(res) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.summed_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self.summed_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. 
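import numpy as np
from scipy.optimize import linear_sum_assignment

# Small numeric sketch of the CLEAR matching trick above: adding 1000 to pairs
# that keep the tracker id from the previous timestep makes the Hungarian solver
# prefer ID continuity first and only then maximise overlap. The ids and
# similarity values below are made up; the threshold is the default 0.5.

similarity = np.array([[0.60, 0.55],
                       [0.50, 0.70]])
gt_ids_t = np.array([0, 1])
tracker_ids_t = np.array([7, 9])
prev_timestep_tracker_id = np.array([9.0, np.nan])  # gt 0 was tracked by id 9 last frame

score_mat = (tracker_ids_t[np.newaxis, :] == prev_timestep_tracker_id[gt_ids_t[:, np.newaxis]])
score_mat = 1000 * score_mat + similarity
score_mat[similarity < 0.5 - np.finfo('float').eps] = 0

match_rows, match_cols = linear_sum_assignment(-score_mat)
print([(int(r), int(c)) for r, c in zip(match_rows, match_cols)])
# [(0, 1), (1, 0)]: gt 0 stays with tracker 9 even though its overlap there is lower.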
- """ - res = {} - for field in self.integer_fields: - if ignore_empty_classes: - res[field] = self._combine_sum( - {k: v for k, v in all_res.items() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0}, field) - else: - res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field) - for field in self.float_fields: - if ignore_empty_classes: - res[field] = np.mean( - [v[field] for v in all_res.values() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0], axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - @staticmethod - def _compute_final_fields(res): - """Calculate sub-metric ('field') values which only depend on other sub-metric values. - This function is used both for both per-sequence calculation, and in combining values across sequences. - """ - num_gt_ids = res['MT'] + res['ML'] + res['PT'] - res['MTR'] = res['MT'] / np.maximum(1.0, num_gt_ids) - res['MLR'] = res['ML'] / np.maximum(1.0, num_gt_ids) - res['PTR'] = res['PT'] / np.maximum(1.0, num_gt_ids) - res['CLR_Re'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['CLR_Pr'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FP']) - res['MODA'] = (res['CLR_TP'] - res['CLR_FP']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['MOTA'] = (res['CLR_TP'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP']) - res['sMOTA'] = (res['MOTP_sum'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - - res['CLR_F1'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + 0.5*res['CLR_FN'] + 0.5*res['CLR_FP']) - res['FP_per_frame'] = res['CLR_FP'] / np.maximum(1.0, res['CLR_Frames']) - safe_log_idsw = np.log10(res['IDSW']) if res['IDSW'] > 0 else res['IDSW'] - res['MOTAL'] = (res['CLR_TP'] - res['CLR_FP'] - safe_log_idsw) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - return res diff --git a/trackeval/metrics/clear_tr.py b/trackeval/metrics/clear_tr.py deleted file mode 100644 index eda19fc..0000000 --- a/trackeval/metrics/clear_tr.py +++ /dev/null @@ -1,205 +0,0 @@ - -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing -from .. import utils - -class CLEARTR(_BaseMetric): - """Class which implements the CLEARTR metrics""" - - @staticmethod - def get_default_config(): - """Default class config values""" - default_config = { - 'THRESHOLD': 0.5, # Similarity score threshold required for a TP match. Default 0.5. - 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: False. 
- } - return default_config - - def __init__(self, config=None): - super().__init__() - main_integer_fields = ['CLR_TP', 'CLR_FN', 'CLR_FP', 'IDSW', 'MT', 'PT', 'ML', 'Frag', 'MTI', 'INT_True', 'INT_False'] - extra_integer_fields = ['CLR_Frames'] - self.integer_fields = main_integer_fields + extra_integer_fields - main_float_fields = ['MOTA', 'MOTP', 'MODA', 'CLR_Re', 'CLR_Pr', 'MTR', 'PTR', 'MLR', 'sMOTA', 'MTIR'] - extra_float_fields = ['CLR_F1', 'FP_per_frame', 'MOTAL', 'MOTP_sum'] - self.float_fields = main_float_fields + extra_float_fields - self.fields = self.float_fields + self.integer_fields - self.summed_fields = self.integer_fields + ['MOTP_sum'] - self.summary_fields = main_float_fields + main_integer_fields - - # Configuration options: - self.config = utils.init_config(config, self.get_default_config(), self.get_name()) - self.threshold = float(self.config['THRESHOLD']) - - - @_timing.time - def eval_sequence(self, data): - """Calculates CLEARTR metrics for one sequence""" - # Initialise results - res = {} - for field in self.fields: - res[field] = 0 - - # Return result quickly if tracker or gt sequence is empty - if data['num_tracker_dets'] == 0: - res['CLR_FN'] = data['num_gt_dets'] - res['ML'] = data['num_gt_ids'] - res['MLR'] = 1.0 - return res - if data['num_gt_dets'] == 0: - res['CLR_FP'] = data['num_tracker_dets'] - res['MLR'] = 1.0 - return res - - # Variables counting global association - num_gt_ids = data['num_gt_ids'] - gt_id_count = np.zeros(num_gt_ids) # For MT/ML/PT/MTI - gt_matched_count = np.zeros(num_gt_ids) # For MT/ML/PT/MTI - gt_frag_count = np.zeros(num_gt_ids) # For Frag - rel_correct_count = np.zeros(num_gt_ids) # For INT_True/INT_False - rel_wrong_count = np.zeros(num_gt_ids) # For INT_True/INT_False - - # Note that IDSWs are counted based on the last time each gt_id was present (any number of frames previously), - # but are only used in matching to continue current tracks based on the gt_id in the single previous timestep. - prev_tracker_id = np.nan * np.zeros(num_gt_ids) # For scoring IDSW - prev_timestep_tracker_id = np.nan * np.zeros(num_gt_ids) # For matching IDSW - - # Calculate scores for each timestep - for t, (gt_ids_t, tracker_ids_t, main_gt_t, main_track_t, rel_sub_class_t, rel_sub_list_t, rel_obj_class_t, rel_obj_list_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'], data['is_main_gt'], data['is_main_track'], data['rel_sub_class'], data['rel_sub_list'], data['rel_obj_class'], data['rel_obj_list'])): - # Deal with the case that there are no gt_det/tracker_det in a timestep. 
- if len(gt_ids_t) == 0: - res['CLR_FP'] += len(tracker_ids_t) - continue - if len(tracker_ids_t) == 0: - res['CLR_FN'] += len(gt_ids_t) - gt_id_count[gt_ids_t] += 1 - continue - - # Calc score matrix to first minimise IDSWs from previous frame, and then maximise MOTP secondarily - similarity = data['similarity_scores'][t] - score_mat = (tracker_ids_t[np.newaxis, :] == prev_timestep_tracker_id[gt_ids_t[:, np.newaxis]]) - score_mat = 1000 * score_mat + similarity - score_mat[similarity < self.threshold - np.finfo('float').eps] = 0 - - # Hungarian algorithm to find best matches - match_rows, match_cols = linear_sum_assignment(-score_mat) - actually_matched_mask = score_mat[match_rows, match_cols] > 0 + np.finfo('float').eps - match_rows = match_rows[actually_matched_mask] - match_cols = match_cols[actually_matched_mask] - - matched_gt_ids = gt_ids_t[match_rows] - matched_tracker_ids = tracker_ids_t[match_cols] - - # Calc IDSW for MOTA - prev_matched_tracker_ids = prev_tracker_id[matched_gt_ids] - is_idsw = (np.logical_not(np.isnan(prev_matched_tracker_ids))) & ( - np.not_equal(matched_tracker_ids, prev_matched_tracker_ids)) - res['IDSW'] += np.sum(is_idsw) - - # Update counters for MT/ML/PT/Frag/MII and record for IDSW/Frag for next timestep - gt_id_count[gt_ids_t] += 1 - gt_matched_count[matched_gt_ids] += 1 - not_previously_tracked = np.isnan(prev_timestep_tracker_id) - prev_tracker_id[matched_gt_ids] = matched_tracker_ids - prev_timestep_tracker_id[:] = np.nan - prev_timestep_tracker_id[matched_gt_ids] = matched_tracker_ids - currently_tracked = np.logical_not(np.isnan(prev_timestep_tracker_id)) - gt_frag_count += np.logical_and(not_previously_tracked, currently_tracked) - - if main_gt_t in main_track_t: - if rel_sub_class_t in rel_sub_list_t or rel_obj_class_t in rel_obj_list_t: - rel_correct_count[matched_gt_ids] += 1 - else: - rel_wrong_count[matched_gt_ids] += 1 - elif (main_gt_t in main_track_t and (rel_obj_class_t in rel_sub_list_t) or (rel_sub_class_t in rel_obj_list_t)): - rel_correct_count[matched_gt_ids] += 1 - else: - rel_wrong_count[matched_gt_ids] += 1 - - # Calculate and accumulate basic statistics - num_matches = len(matched_gt_ids) - res['CLR_TP'] += num_matches - res['CLR_FN'] += len(gt_ids_t) - num_matches - res['CLR_FP'] += len(tracker_ids_t) - num_matches - if num_matches > 0: - res['MOTP_sum'] += sum(similarity[match_rows, match_cols]) - - - # Calculate MT/ML/PT/Frag/MOTP/MTI - tracked_ratio = gt_matched_count[gt_id_count > 0] / gt_id_count[gt_id_count > 0] - res['MT'] = np.sum(np.greater(tracked_ratio, 0.8)) - res['PT'] = np.sum(np.greater_equal(tracked_ratio, 0.2)) - res['MT'] - res['ML'] = num_gt_ids - res['MT'] - res['PT'] - res['Frag'] = np.sum(np.subtract(gt_frag_count[gt_frag_count > 0], 1)) - res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP']) - - res['CLR_Frames'] = data['num_timesteps'] - - rel_correct_ratio = rel_correct_count[gt_id_count > 0] / (rel_correct_count[gt_id_count > 0] + rel_wrong_count[gt_id_count > 0]) - for i, (tr_rat, rel_rat) in enumerate(zip(tracked_ratio, rel_correct_ratio)): - if tr_rat > 0.8 and rel_rat > 0.7: - res['MTI'] += 1 - - # Calculate final CLEARTR scores - res = self._compute_final_fields(res) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.summed_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics 
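import numpy as np

# Sketch of the MTI criterion used above: a gt track counts towards MTI when it
# is both mostly tracked (tracked_ratio > 0.8) and its relation is predicted
# correctly often enough (rel_correct_ratio > 0.7). All counts below are made up.

gt_id_count       = np.array([50, 40, 30])   # frames each gt id appears in
gt_matched_count  = np.array([45, 20, 29])   # frames it was matched
rel_correct_count = np.array([40, 15, 10])   # frames with a correct relation
rel_wrong_count   = np.array([ 5,  5, 19])   # frames with a wrong relation

tracked_ratio = gt_matched_count / gt_id_count
rel_correct_ratio = rel_correct_count / (rel_correct_count + rel_wrong_count)

mti = int(np.sum((tracked_ratio > 0.8) & (rel_correct_ratio > 0.7)))
mtir = mti / np.maximum(1.0, len(gt_id_count))
print(mti, mtir)  # 1 0.333...: only the first track satisfies both thresholds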
across all classes by averaging over the detection values""" - res = {} - for field in self.summed_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. - """ - res = {} - for field in self.integer_fields: - if ignore_empty_classes: - res[field] = self._combine_sum( - {k: v for k, v in all_res.items() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0}, field) - else: - res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field) - for field in self.float_fields: - if ignore_empty_classes: - res[field] = np.mean( - [v[field] for v in all_res.values() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0], axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - @staticmethod - def _compute_final_fields(res): - """Calculate sub-metric ('field') values which only depend on other sub-metric values. - This function is used both for both per-sequence calculation, and in combining values across sequences. - """ - num_gt_ids = res['MT'] + res['ML'] + res['PT'] - res['MTR'] = res['MT'] / np.maximum(1.0, num_gt_ids) - res['MLR'] = res['ML'] / np.maximum(1.0, num_gt_ids) - res['PTR'] = res['PT'] / np.maximum(1.0, num_gt_ids) - res['CLR_Re'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['CLR_Pr'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FP']) - res['MODA'] = (res['CLR_TP'] - res['CLR_FP']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['MOTA'] = (res['CLR_TP'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP']) - res['sMOTA'] = (res['MOTP_sum'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - - res['CLR_F1'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + 0.5*res['CLR_FN'] + 0.5*res['CLR_FP']) - res['FP_per_frame'] = res['CLR_FP'] / np.maximum(1.0, res['CLR_Frames']) - safe_log_idsw = np.log10(res['IDSW']) if res['IDSW'] > 0 else res['IDSW'] - res['MOTAL'] = (res['CLR_TP'] - res['CLR_FP'] - safe_log_idsw) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN']) - res['MTIR'] = res['MTI'] / np.maximum(1.0, num_gt_ids) - return res diff --git a/trackeval/metrics/count.py b/trackeval/metrics/count.py deleted file mode 100644 index 49049b1..0000000 --- a/trackeval/metrics/count.py +++ /dev/null @@ -1,44 +0,0 @@ - -from ._base_metric import _BaseMetric -from .. 
import _timing - - -class Count(_BaseMetric): - """Class which simply counts the number of tracker and gt detections and ids.""" - def __init__(self, config=None): - super().__init__() - self.integer_fields = ['Dets', 'GT_Dets', 'IDs', 'GT_IDs'] - self.fields = self.integer_fields - self.summary_fields = self.fields - - @_timing.time - def eval_sequence(self, data): - """Returns counts for one sequence""" - # Get results - res = {'Dets': data['num_tracker_dets'], - 'GT_Dets': data['num_gt_dets'], - 'IDs': data['num_tracker_ids'], - 'GT_IDs': data['num_gt_ids'], - 'Frames': data['num_timesteps']} - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.integer_fields: - res[field] = self._combine_sum(all_res, field) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None): - """Combines metrics across all classes by averaging over the class values""" - res = {} - for field in self.integer_fields: - res[field] = self._combine_sum(all_res, field) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self.integer_fields: - res[field] = self._combine_sum(all_res, field) - return res diff --git a/trackeval/metrics/hota.py b/trackeval/metrics/hota.py deleted file mode 100644 index f551b76..0000000 --- a/trackeval/metrics/hota.py +++ /dev/null @@ -1,203 +0,0 @@ - -import os -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing - - -class HOTA(_BaseMetric): - """Class which implements the HOTA metrics. - See: https://link.springer.com/article/10.1007/s11263-020-01375-2 - """ - - def __init__(self, config=None): - super().__init__() - self.plottable = True - self.array_labels = np.arange(0.05, 0.99, 0.05) - self.integer_array_fields = ['HOTA_TP', 'HOTA_FN', 'HOTA_FP'] - self.float_array_fields = ['HOTA', 'DetA', 'AssA', 'DetRe', 'DetPr', 'AssRe', 'AssPr', 'LocA', 'OWTA'] - self.float_fields = ['HOTA(0)', 'LocA(0)', 'HOTALocA(0)'] - self.fields = self.float_array_fields + self.integer_array_fields + self.float_fields - self.summary_fields = self.float_array_fields + self.float_fields - - @_timing.time - def eval_sequence(self, data): - """Calculates the HOTA metrics for one sequence""" - - # Initialise results - res = {} - for field in self.float_array_fields + self.integer_array_fields: - res[field] = np.zeros((len(self.array_labels)), dtype=np.float) - for field in self.float_fields: - res[field] = 0 - - # Return result quickly if tracker or gt sequence is empty - if data['num_tracker_dets'] == 0: - res['HOTA_FN'] = data['num_gt_dets'] * np.ones((len(self.array_labels)), dtype=np.float) - res['LocA'] = np.ones((len(self.array_labels)), dtype=np.float) - res['LocA(0)'] = 1.0 - return res - if data['num_gt_dets'] == 0: - res['HOTA_FP'] = data['num_tracker_dets'] * np.ones((len(self.array_labels)), dtype=np.float) - res['LocA'] = np.ones((len(self.array_labels)), dtype=np.float) - res['LocA(0)'] = 1.0 - return res - - # Variables counting global association - potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids'])) - gt_id_count = np.zeros((data['num_gt_ids'], 1)) - tracker_id_count = np.zeros((1, data['num_tracker_ids'])) - - # First loop through each timestep and accumulate global track information. 
- for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - # Count the potential matches between ids in each timestep - # These are normalised, weighted by the match similarity. - similarity = data['similarity_scores'][t] - sim_iou_denom = similarity.sum(0)[np.newaxis, :] + similarity.sum(1)[:, np.newaxis] - similarity - sim_iou = np.zeros_like(similarity) - sim_iou_mask = sim_iou_denom > 0 + np.finfo('float').eps - sim_iou[sim_iou_mask] = similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask] - potential_matches_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += sim_iou - - # Calculate the total number of dets for each gt_id and tracker_id. - gt_id_count[gt_ids_t] += 1 - tracker_id_count[0, tracker_ids_t] += 1 - - # Calculate overall jaccard alignment score (before unique matching) between IDs - global_alignment_score = potential_matches_count / (gt_id_count + tracker_id_count - potential_matches_count) - matches_counts = [np.zeros_like(potential_matches_count) for _ in self.array_labels] - - # Calculate scores for each timestep - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - # Deal with the case that there are no gt_det/tracker_det in a timestep. - if len(gt_ids_t) == 0: - for a, alpha in enumerate(self.array_labels): - res['HOTA_FP'][a] += len(tracker_ids_t) - continue - if len(tracker_ids_t) == 0: - for a, alpha in enumerate(self.array_labels): - res['HOTA_FN'][a] += len(gt_ids_t) - continue - - # Get matching scores between pairs of dets for optimizing HOTA - similarity = data['similarity_scores'][t] - score_mat = global_alignment_score[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] * similarity - - # Hungarian algorithm to find best matches - match_rows, match_cols = linear_sum_assignment(-score_mat) - - # Calculate and accumulate basic statistics - for a, alpha in enumerate(self.array_labels): - actually_matched_mask = similarity[match_rows, match_cols] >= alpha - np.finfo('float').eps - alpha_match_rows = match_rows[actually_matched_mask] - alpha_match_cols = match_cols[actually_matched_mask] - num_matches = len(alpha_match_rows) - res['HOTA_TP'][a] += num_matches - res['HOTA_FN'][a] += len(gt_ids_t) - num_matches - res['HOTA_FP'][a] += len(tracker_ids_t) - num_matches - if num_matches > 0: - res['LocA'][a] += sum(similarity[alpha_match_rows, alpha_match_cols]) - matches_counts[a][gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]] += 1 - - # Calculate association scores (AssA, AssRe, AssPr) for the alpha value. - # First calculate scores per gt_id/tracker_id combo and then average over the number of detections. 
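import numpy as np

# Sketch of the per-frame normalisation used by HOTA above: raw similarities are
# rescaled into a soft IoU (sim_iou) so that a detection overlapping several
# others is down-weighted before being accumulated into potential_matches_count.
# The 2x2 similarity matrix (2 gt ids x 2 tracker ids in one frame) is made up.

similarity = np.array([[0.8, 0.2],
                       [0.0, 0.6]])

sim_iou_denom = similarity.sum(0)[np.newaxis, :] + similarity.sum(1)[:, np.newaxis] - similarity
sim_iou = np.zeros_like(similarity)
sim_iou_mask = sim_iou_denom > 0 + np.finfo('float').eps
sim_iou[sim_iou_mask] = similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]

print(np.round(sim_iou, 3))
# [[0.8   0.125]
#  [0.    0.75 ]]
# gt 0 / tracker 1 drops from 0.2 to 0.125 because both already overlap other ids.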
- for a, alpha in enumerate(self.array_labels): - matches_count = matches_counts[a] - ass_a = matches_count / np.maximum(1, gt_id_count + tracker_id_count - matches_count) - res['AssA'][a] = np.sum(matches_count * ass_a) / np.maximum(1, res['HOTA_TP'][a]) - ass_re = matches_count / np.maximum(1, gt_id_count) - res['AssRe'][a] = np.sum(matches_count * ass_re) / np.maximum(1, res['HOTA_TP'][a]) - ass_pr = matches_count / np.maximum(1, tracker_id_count) - res['AssPr'][a] = np.sum(matches_count * ass_pr) / np.maximum(1, res['HOTA_TP'][a]) - - # Calculate final scores - res['LocA'] = np.maximum(1e-10, res['LocA']) / np.maximum(1e-10, res['HOTA_TP']) - res = self._compute_final_fields(res) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.integer_array_fields: - res[field] = self._combine_sum(all_res, field) - for field in ['AssRe', 'AssPr', 'AssA']: - res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP') - loca_weighted_sum = sum([all_res[k]['LocA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()]) - res['LocA'] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(1e-10, res['HOTA_TP']) - res = self._compute_final_fields(res) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. - """ - res = {} - for field in self.integer_array_fields: - if ignore_empty_classes: - res[field] = self._combine_sum( - {k: v for k, v in all_res.items() - if (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()}, field) - else: - res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field) - - for field in self.float_fields + self.float_array_fields: - if ignore_empty_classes: - res[field] = np.mean([v[field] for v in all_res.values() if - (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()], - axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self.integer_array_fields: - res[field] = self._combine_sum(all_res, field) - for field in ['AssRe', 'AssPr', 'AssA']: - res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP') - loca_weighted_sum = sum([all_res[k]['LocA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()]) - res['LocA'] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(1e-10, res['HOTA_TP']) - res = self._compute_final_fields(res) - return res - - @staticmethod - def _compute_final_fields(res): - """Calculate sub-metric ('field') values which only depend on other sub-metric values. - This function is used both for both per-sequence calculation, and in combining values across sequences. 
- """ - res['DetRe'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FN']) - res['DetPr'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FP']) - res['DetA'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FN'] + res['HOTA_FP']) - res['HOTA'] = np.sqrt(res['DetA'] * res['AssA']) - res['OWTA'] = np.sqrt(res['DetRe'] * res['AssA']) - - res['HOTA(0)'] = res['HOTA'][0] - res['LocA(0)'] = res['LocA'][0] - res['HOTALocA(0)'] = res['HOTA(0)']*res['LocA(0)'] - return res - - def plot_single_tracker_results(self, table_res, tracker, cls, output_folder): - """Create plot of results""" - - # Only loaded when run to reduce minimum requirements - from matplotlib import pyplot as plt - - res = table_res['COMBINED_SEQ'] - styles_to_plot = ['r', 'b', 'g', 'b--', 'b:', 'g--', 'g:', 'm'] - for name, style in zip(self.float_array_fields, styles_to_plot): - plt.plot(self.array_labels, res[name], style) - plt.xlabel('alpha') - plt.ylabel('score') - plt.title(tracker + ' - ' + cls) - plt.axis([0, 1, 0, 1]) - legend = [] - for name in self.float_array_fields: - legend += [name + ' (' + str(np.round(np.mean(res[name]), 2)) + ')'] - plt.legend(legend, loc='lower left') - out_file = os.path.join(output_folder, cls + '_plot.pdf') - os.makedirs(os.path.dirname(out_file), exist_ok=True) - plt.savefig(out_file) - plt.savefig(out_file.replace('.pdf', '.png')) - plt.clf() diff --git a/trackeval/metrics/identity.py b/trackeval/metrics/identity.py deleted file mode 100644 index c8c6c80..0000000 --- a/trackeval/metrics/identity.py +++ /dev/null @@ -1,135 +0,0 @@ -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing -from .. import utils - - -class Identity(_BaseMetric): - """Class which implements the ID metrics""" - - @staticmethod - def get_default_config(): - """Default class config values""" - default_config = { - 'THRESHOLD': 0.5, # Similarity score threshold required for a IDTP match. Default 0.5. - 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: False. - } - return default_config - - def __init__(self, config=None): - super().__init__() - self.integer_fields = ['IDTP', 'IDFN', 'IDFP'] - self.float_fields = ['IDF1', 'IDR', 'IDP'] - self.fields = self.float_fields + self.integer_fields - self.summary_fields = self.fields - - # Configuration options: - self.config = utils.init_config(config, self.get_default_config(), self.get_name()) - self.threshold = float(self.config['THRESHOLD']) - - @_timing.time - def eval_sequence(self, data): - """Calculates ID metrics for one sequence""" - # Initialise results - res = {} - for field in self.fields: - res[field] = 0 - - # Return result quickly if tracker or gt sequence is empty - if data['num_tracker_dets'] == 0: - res['IDFN'] = data['num_gt_dets'] - return res - if data['num_gt_dets'] == 0: - res['IDFP'] = data['num_tracker_dets'] - return res - - # Variables counting global association - potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids'])) - gt_id_count = np.zeros(data['num_gt_ids']) - tracker_id_count = np.zeros(data['num_tracker_ids']) - - # First loop through each timestep and accumulate global track information. 
- for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - # Count the potential matches between ids in each timestep - matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold) - match_idx_gt, match_idx_tracker = np.nonzero(matches_mask) - potential_matches_count[gt_ids_t[match_idx_gt], tracker_ids_t[match_idx_tracker]] += 1 - - # Calculate the total number of dets for each gt_id and tracker_id. - gt_id_count[gt_ids_t] += 1 - tracker_id_count[tracker_ids_t] += 1 - - # Calculate optimal assignment cost matrix for ID metrics - num_gt_ids = data['num_gt_ids'] - num_tracker_ids = data['num_tracker_ids'] - fp_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids)) - fn_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids)) - fp_mat[num_gt_ids:, :num_tracker_ids] = 1e10 - fn_mat[:num_gt_ids, num_tracker_ids:] = 1e10 - for gt_id in range(num_gt_ids): - fn_mat[gt_id, :num_tracker_ids] = gt_id_count[gt_id] - fn_mat[gt_id, num_tracker_ids + gt_id] = gt_id_count[gt_id] - for tracker_id in range(num_tracker_ids): - fp_mat[:num_gt_ids, tracker_id] = tracker_id_count[tracker_id] - fp_mat[tracker_id + num_gt_ids, tracker_id] = tracker_id_count[tracker_id] - fn_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count - fp_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count - - # Hungarian algorithm - match_rows, match_cols = linear_sum_assignment(fn_mat + fp_mat) - - # Accumulate basic statistics - res['IDFN'] = fn_mat[match_rows, match_cols].sum().astype(np.int) - res['IDFP'] = fp_mat[match_rows, match_cols].sum().astype(np.int) - res['IDTP'] = (gt_id_count.sum() - res['IDFN']).astype(np.int) - - # Calculate final ID scores - res = self._compute_final_fields(res) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. - """ - res = {} - for field in self.integer_fields: - if ignore_empty_classes: - res[field] = self._combine_sum({k: v for k, v in all_res.items() - if v['IDTP'] + v['IDFN'] + v['IDFP'] > 0 + np.finfo('float').eps}, - field) - else: - res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field) - for field in self.float_fields: - if ignore_empty_classes: - res[field] = np.mean([v[field] for v in all_res.values() - if v['IDTP'] + v['IDFN'] + v['IDFP'] > 0 + np.finfo('float').eps], axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self.integer_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.integer_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - @staticmethod - def _compute_final_fields(res): - """Calculate sub-metric ('field') values which only depend on other sub-metric values. - This function is used both for both per-sequence calculation, and in combining values across sequences. 
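import numpy as np
from scipy.optimize import linear_sum_assignment

# Small numeric sketch of the ID-metric assignment above, for 1 gt id and
# 1 tracker id (all counts made up): the (num_gt + num_tracker)-sized fn/fp
# matrices let the solver either match the pair or leave both identities unmatched.
num_gt_ids, num_tracker_ids = 1, 1
gt_id_count = np.array([10.0])                 # gt id appears in 10 frames
tracker_id_count = np.array([8.0])             # tracker id appears in 8 frames
potential_matches_count = np.array([[6.0]])    # 6 frames overlap above the threshold

fp_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids))
fn_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids))
fp_mat[num_gt_ids:, :num_tracker_ids] = 1e10
fn_mat[:num_gt_ids, num_tracker_ids:] = 1e10
for gt_id in range(num_gt_ids):
    fn_mat[gt_id, :num_tracker_ids] = gt_id_count[gt_id]
    fn_mat[gt_id, num_tracker_ids + gt_id] = gt_id_count[gt_id]
for tracker_id in range(num_tracker_ids):
    fp_mat[:num_gt_ids, tracker_id] = tracker_id_count[tracker_id]
    fp_mat[tracker_id + num_gt_ids, tracker_id] = tracker_id_count[tracker_id]
fn_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count
fp_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count

match_rows, match_cols = linear_sum_assignment(fn_mat + fp_mat)
IDFN = int(fn_mat[match_rows, match_cols].sum())   # 4
IDFP = int(fp_mat[match_rows, match_cols].sum())   # 2
IDTP = int(gt_id_count.sum()) - IDFN               # 6
print(IDFN, IDFP, IDTP)  # IDF1 = 6 / (6 + 0.5*2 + 0.5*4) = 0.667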
- """ - res['IDR'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFN']) - res['IDP'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFP']) - res['IDF1'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + 0.5 * res['IDFP'] + 0.5 * res['IDFN']) - return res diff --git a/trackeval/metrics/ideucl.py b/trackeval/metrics/ideucl.py deleted file mode 100644 index db9b57b..0000000 --- a/trackeval/metrics/ideucl.py +++ /dev/null @@ -1,135 +0,0 @@ -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing -from collections import defaultdict -from .. import utils - - -class IDEucl(_BaseMetric): - """Class which implements the ID metrics""" - - @staticmethod - def get_default_config(): - """Default class config values""" - default_config = { - 'THRESHOLD': 0.4, # Similarity score threshold required for a IDTP match. 0.4 for IDEucl. - 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: False. - } - return default_config - - def __init__(self, config=None): - super().__init__() - self.fields = ['IDEucl'] - self.float_fields = self.fields - self.summary_fields = self.fields - - # Configuration options: - self.config = utils.init_config(config, self.get_default_config(), self.get_name()) - self.threshold = float(self.config['THRESHOLD']) - - - @_timing.time - def eval_sequence(self, data): - """Calculates IDEucl metrics for all frames""" - # Initialise results - res = {'IDEucl' : 0} - - # Return result quickly if tracker or gt sequence is empty - if data['num_tracker_dets'] == 0 or data['num_gt_dets'] == 0.: - return res - - data['centroid'] = [] - for t, gt_det in enumerate(data['gt_dets']): - # import pdb;pdb.set_trace() - data['centroid'].append(self._compute_centroid(gt_det)) - - oid_hid_cent = defaultdict(list) - oid_cent = defaultdict(list) - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold) - - # I hope the orders of ids and boxes are maintained in `data` - for ind, gid in enumerate(gt_ids_t): - oid_cent[gid].append(data['centroid'][t][ind]) - - match_idx_gt, match_idx_tracker = np.nonzero(matches_mask) - for m_gid, m_tid in zip(match_idx_gt, match_idx_tracker): - oid_hid_cent[gt_ids_t[m_gid], tracker_ids_t[m_tid]].append(data['centroid'][t][m_gid]) - - oid_hid_dist = {k : np.sum(np.linalg.norm(np.diff(np.array(v), axis=0), axis=1)) for k, v in oid_hid_cent.items()} - oid_dist = {int(k) : np.sum(np.linalg.norm(np.diff(np.array(v), axis=0), axis=1)) for k, v in oid_cent.items()} - - unique_oid = np.unique([i[0] for i in oid_hid_dist.keys()]).tolist() - unique_hid = np.unique([i[1] for i in oid_hid_dist.keys()]).tolist() - o_len = len(unique_oid) - h_len = len(unique_hid) - dist_matrix = np.zeros((o_len, h_len)) - for ((oid, hid), dist) in oid_hid_dist.items(): - oid_ind = unique_oid.index(oid) - hid_ind = unique_hid.index(hid) - dist_matrix[oid_ind, hid_ind] = dist - - # opt_hyp_dist contains GT ID : max dist covered by track - opt_hyp_dist = dict.fromkeys(oid_dist.keys(), 0.) 
- cost_matrix = np.max(dist_matrix) - dist_matrix - rows, cols = linear_sum_assignment(cost_matrix) - for (row, col) in zip(rows, cols): - value = dist_matrix[row, col] - opt_hyp_dist[int(unique_oid[row])] = value - - assert len(opt_hyp_dist.keys()) == len(oid_dist.keys()) - hyp_length = np.sum(list(opt_hyp_dist.values())) - gt_length = np.sum(list(oid_dist.values())) - id_eucl =np.mean([np.divide(a, b, out=np.zeros_like(a), where=b!=0) for a, b in zip(opt_hyp_dist.values(), oid_dist.values())]) - res['IDEucl'] = np.divide(hyp_length, gt_length, out=np.zeros_like(hyp_length), where=gt_length!=0) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. - """ - res = {} - - for field in self.float_fields: - if ignore_empty_classes: - res[field] = np.mean([v[field] for v in all_res.values() - if v['IDEucl'] > 0 + np.finfo('float').eps], axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self.float_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res, len(all_res)) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for field in self.float_fields: - res[field] = self._combine_sum(all_res, field) - res = self._compute_final_fields(res, len(all_res)) - return res - - - @staticmethod - def _compute_centroid(box): - box = np.array(box) - if len(box.shape) == 1: - centroid = (box[0:2] + box[2:4])/2 - else: - centroid = (box[:, 0:2] + box[:, 2:4])/2 - return np.flip(centroid, axis=1) - - - @staticmethod - def _compute_final_fields(res, res_len): - """ - Exists only to match signature with the original Identiy class. - - """ - return {k:v/res_len for k,v in res.items()} diff --git a/trackeval/metrics/j_and_f.py b/trackeval/metrics/j_and_f.py deleted file mode 100644 index 1b18f04..0000000 --- a/trackeval/metrics/j_and_f.py +++ /dev/null @@ -1,310 +0,0 @@ - -import numpy as np -import math -from scipy.optimize import linear_sum_assignment -from ..utils import TrackEvalException -from ._base_metric import _BaseMetric -from .. 
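import numpy as np

# Toy sketch of the IDEucl idea above for a single gt identity: compare the
# Euclidean distance the gt travels with the distance covered while it is
# matched to its (optimally assigned) tracker identity. Centroids are made up.

gt_centroids = np.array([[0., 0.], [0., 10.], [0., 20.], [0., 30.]])   # gt path, length 30
matched_centroids = gt_centroids[:3]                                   # matched for 3 of 4 steps

def path_length(cents):
    return np.sum(np.linalg.norm(np.diff(cents, axis=0), axis=1))

gt_length = path_length(gt_centroids)          # 30.0
hyp_length = path_length(matched_centroids)    # 20.0
ideucl = np.divide(hyp_length, gt_length, out=np.zeros_like(hyp_length), where=gt_length != 0)
print(ideucl)  # 0.666...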
import _timing - - -class JAndF(_BaseMetric): - """Class which implements the J&F metrics""" - def __init__(self, config=None): - super().__init__() - self.integer_fields = ['num_gt_tracks'] - self.float_fields = ['J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay', 'J&F'] - self.fields = self.float_fields + self.integer_fields - self.summary_fields = self.float_fields - self.optim_type = 'J' # possible values J, J&F - - @_timing.time - def eval_sequence(self, data): - """Returns J&F metrics for one sequence""" - - # Only loaded when run to reduce minimum requirements - from pycocotools import mask as mask_utils - - num_timesteps = data['num_timesteps'] - num_tracker_ids = data['num_tracker_ids'] - num_gt_ids = data['num_gt_ids'] - gt_dets = data['gt_dets'] - tracker_dets = data['tracker_dets'] - gt_ids = data['gt_ids'] - tracker_ids = data['tracker_ids'] - - # get shape of frames - frame_shape = None - if num_gt_ids > 0: - for t in range(num_timesteps): - if len(gt_ids[t]) > 0: - frame_shape = gt_dets[t][0]['size'] - break - elif num_tracker_ids > 0: - for t in range(num_timesteps): - if len(tracker_ids[t]) > 0: - frame_shape = tracker_dets[t][0]['size'] - break - - if frame_shape: - # append all zero masks for timesteps in which tracks do not have a detection - zero_padding = np.zeros((frame_shape), order= 'F').astype(np.uint8) - padding_mask = mask_utils.encode(zero_padding) - for t in range(num_timesteps): - gt_id_det_mapping = {gt_ids[t][i]: gt_dets[t][i] for i in range(len(gt_ids[t]))} - gt_dets[t] = [gt_id_det_mapping[index] if index in gt_ids[t] else padding_mask for index - in range(num_gt_ids)] - tracker_id_det_mapping = {tracker_ids[t][i]: tracker_dets[t][i] for i in range(len(tracker_ids[t]))} - tracker_dets[t] = [tracker_id_det_mapping[index] if index in tracker_ids[t] else padding_mask for index - in range(num_tracker_ids)] - # also perform zero padding if number of tracker IDs < number of ground truth IDs - if num_tracker_ids < num_gt_ids: - diff = num_gt_ids - num_tracker_ids - for t in range(num_timesteps): - tracker_dets[t] = tracker_dets[t] + [padding_mask for _ in range(diff)] - num_tracker_ids += diff - - j = self._compute_j(gt_dets, tracker_dets, num_gt_ids, num_tracker_ids, num_timesteps) - - # boundary threshold for F computation - bound_th = 0.008 - - # perform matching - if self.optim_type == 'J&F': - f = np.zeros_like(j) - for k in range(num_tracker_ids): - for i in range(num_gt_ids): - f[k, i, :] = self._compute_f(gt_dets, tracker_dets, k, i, bound_th) - optim_metrics = (np.mean(j, axis=2) + np.mean(f, axis=2)) / 2 - row_ind, col_ind = linear_sum_assignment(- optim_metrics) - j_m = j[row_ind, col_ind, :] - f_m = f[row_ind, col_ind, :] - elif self.optim_type == 'J': - optim_metrics = np.mean(j, axis=2) - row_ind, col_ind = linear_sum_assignment(- optim_metrics) - j_m = j[row_ind, col_ind, :] - f_m = np.zeros_like(j_m) - for i, (tr_ind, gt_ind) in enumerate(zip(row_ind, col_ind)): - f_m[i] = self._compute_f(gt_dets, tracker_dets, tr_ind, gt_ind, bound_th) - else: - raise TrackEvalException('Unsupported optimization type %s for J&F metric.' 
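import numpy as np
from pycocotools import mask as mask_utils

# Sketch of the zero-padding step used by J&F above: timesteps where a track has
# no detection are filled with an all-zero RLE mask so every track has exactly
# one mask per frame. The frame shape below is made up.
frame_shape = (4, 6)
zero_padding = np.zeros(frame_shape, order='F').astype(np.uint8)
padding_mask = mask_utils.encode(zero_padding)
print(mask_utils.area(padding_mask))   # 0: an empty mask contributes nothing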
% self.optim_type) - - # append zeros for false negatives - if j_m.shape[0] < data['num_gt_ids']: - diff = data['num_gt_ids'] - j_m.shape[0] - j_m = np.concatenate((j_m, np.zeros((diff, j_m.shape[1]))), axis=0) - f_m = np.concatenate((f_m, np.zeros((diff, f_m.shape[1]))), axis=0) - - # compute the metrics for each ground truth track - res = { - 'J-Mean': [np.nanmean(j_m[i, :]) for i in range(j_m.shape[0])], - 'J-Recall': [np.nanmean(j_m[i, :] > 0.5 + np.finfo('float').eps) for i in range(j_m.shape[0])], - 'F-Mean': [np.nanmean(f_m[i, :]) for i in range(f_m.shape[0])], - 'F-Recall': [np.nanmean(f_m[i, :] > 0.5 + np.finfo('float').eps) for i in range(f_m.shape[0])], - 'J-Decay': [], - 'F-Decay': [] - } - n_bins = 4 - ids = np.round(np.linspace(1, data['num_timesteps'], n_bins + 1) + 1e-10) - 1 - ids = ids.astype(np.uint8) - - for k in range(j_m.shape[0]): - d_bins_j = [j_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)] - res['J-Decay'].append(np.nanmean(d_bins_j[0]) - np.nanmean(d_bins_j[3])) - for k in range(f_m.shape[0]): - d_bins_f = [f_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)] - res['F-Decay'].append(np.nanmean(d_bins_f[0]) - np.nanmean(d_bins_f[3])) - - # count number of tracks for weighting of the result - res['num_gt_tracks'] = len(res['J-Mean']) - for field in ['J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay']: - res[field] = np.mean(res[field]) - res['J&F'] = (res['J-Mean'] + res['F-Mean']) / 2 - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')} - for field in self.summary_fields: - res[field] = self._combine_weighted_av(all_res, field, res, weight_field='num_gt_tracks') - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): - """Combines metrics across all classes by averaging over the class values - 'ignore empty classes' is not yet implemented here. - """ - res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')} - for field in self.float_fields: - res[field] = np.mean([v[field] for v in all_res.values()]) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')} - for field in self.float_fields: - res[field] = np.mean([v[field] for v in all_res.values()]) - return res - - @staticmethod - def _seg2bmap(seg, width=None, height=None): - """ - From a segmentation, compute a binary boundary map with 1 pixel wide - boundaries. The boundary pixels are offset by 1/2 pixel towards the - origin from the actual segment boundary. - Arguments: - seg : Segments labeled from 1..k. - width : Width of desired bmap <= seg.shape[1] - height : Height of desired bmap <= seg.shape[0] - Returns: - bmap (ndarray): Binary boundary map. - David Martin - January 2003 - """ - - seg = seg.astype(np.bool) - seg[seg > 0] = 1 - - assert np.atleast_3d(seg).shape[2] == 1 - - width = seg.shape[1] if width is None else width - height = seg.shape[0] if height is None else height - - h, w = seg.shape[:2] - - ar1 = float(width) / float(height) - ar2 = float(w) / float(h) - - assert not ( - width > w | height > h | abs(ar1 - ar2) > 0.01 - ), "Can" "t convert %dx%d seg to %dx%d bmap." 
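import numpy as np

# Sketch of the J/F "Decay" computation above: a track's per-frame scores are
# split into 4 temporal bins, and decay is the drop from the first to the last
# bin. The per-frame scores below are made up.
num_timesteps = 20
j_track = np.linspace(0.9, 0.5, num_timesteps)       # quality slowly degrades

n_bins = 4
ids = np.round(np.linspace(1, num_timesteps, n_bins + 1) + 1e-10) - 1
ids = ids.astype(np.uint8)
d_bins = [j_track[ids[i]:ids[i + 1] + 1] for i in range(n_bins)]
decay = np.nanmean(d_bins[0]) - np.nanmean(d_bins[3])
print(round(decay, 3))  # ~0.295: early frames score higher than late ones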
% (w, h, width, height) - - e = np.zeros_like(seg) - s = np.zeros_like(seg) - se = np.zeros_like(seg) - - e[:, :-1] = seg[:, 1:] - s[:-1, :] = seg[1:, :] - se[:-1, :-1] = seg[1:, 1:] - - b = seg ^ e | seg ^ s | seg ^ se - b[-1, :] = seg[-1, :] ^ e[-1, :] - b[:, -1] = seg[:, -1] ^ s[:, -1] - b[-1, -1] = 0 - - if w == width and h == height: - bmap = b - else: - bmap = np.zeros((height, width)) - for x in range(w): - for y in range(h): - if b[y, x]: - j = 1 + math.floor((y - 1) + height / h) - i = 1 + math.floor((x - 1) + width / h) - bmap[j, i] = 1 - - return bmap - - @staticmethod - def _compute_f(gt_data, tracker_data, tracker_data_id, gt_id, bound_th): - """ - Perform F computation for a given gt and a given tracker ID. Adapted from - https://github.com/davisvideochallenge/davis2017-evaluation - :param gt_data: the encoded gt masks - :param tracker_data: the encoded tracker masks - :param tracker_data_id: the tracker ID - :param gt_id: the ground truth ID - :param bound_th: boundary threshold parameter - :return: the F value for the given tracker and gt ID - """ - - # Only loaded when run to reduce minimum requirements - from pycocotools import mask as mask_utils - from skimage.morphology import disk - import cv2 - - f = np.zeros(len(gt_data)) - - for t, (gt_masks, tracker_masks) in enumerate(zip(gt_data, tracker_data)): - curr_tracker_mask = mask_utils.decode(tracker_masks[tracker_data_id]) - curr_gt_mask = mask_utils.decode(gt_masks[gt_id]) - - bound_pix = bound_th if bound_th >= 1 - np.finfo('float').eps else \ - np.ceil(bound_th * np.linalg.norm(curr_tracker_mask.shape)) - - # Get the pixel boundaries of both masks - fg_boundary = JAndF._seg2bmap(curr_tracker_mask) - gt_boundary = JAndF._seg2bmap(curr_gt_mask) - - # fg_dil = binary_dilation(fg_boundary, disk(bound_pix)) - fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8)) - # gt_dil = binary_dilation(gt_boundary, disk(bound_pix)) - gt_dil = cv2.dilate(gt_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8)) - - # Get the intersection - gt_match = gt_boundary * fg_dil - fg_match = fg_boundary * gt_dil - - # Area of the intersection - n_fg = np.sum(fg_boundary) - n_gt = np.sum(gt_boundary) - - # % Compute precision and recall - if n_fg == 0 and n_gt > 0: - precision = 1 - recall = 0 - elif n_fg > 0 and n_gt == 0: - precision = 0 - recall = 1 - elif n_fg == 0 and n_gt == 0: - precision = 1 - recall = 1 - else: - precision = np.sum(fg_match) / float(n_fg) - recall = np.sum(gt_match) / float(n_gt) - - # Compute F measure - if precision + recall == 0: - f_val = 0 - else: - f_val = 2 * precision * recall / (precision + recall) - - f[t] = f_val - - return f - - @staticmethod - def _compute_j(gt_data, tracker_data, num_gt_ids, num_tracker_ids, num_timesteps): - """ - Computation of J value for all ground truth IDs and all tracker IDs in the given sequence. 
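# Sketch of the boundary F computation above: precision/recall of matched
# boundary pixels, combined into an F measure. The pixel counts are made up.
n_fg, n_gt = 100, 120            # boundary pixels in the tracker / gt masks
fg_match, gt_match = 80, 90      # boundary pixels matched within the dilation radius

precision = fg_match / float(n_fg)   # 0.80
recall = gt_match / float(n_gt)      # 0.75
f_val = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
print(round(f_val, 4))  # 0.7742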
Adapted from - https://github.com/davisvideochallenge/davis2017-evaluation - :param gt_data: the ground truth masks - :param tracker_data: the tracker masks - :param num_gt_ids: the number of ground truth IDs - :param num_tracker_ids: the number of tracker IDs - :param num_timesteps: the number of timesteps - :return: the J values - """ - - # Only loaded when run to reduce minimum requirements - from pycocotools import mask as mask_utils - - j = np.zeros((num_tracker_ids, num_gt_ids, num_timesteps)) - - for t, (time_gt, time_data) in enumerate(zip(gt_data, tracker_data)): - # run length encoded masks with pycocotools - area_gt = mask_utils.area(time_gt) - time_data = list(time_data) - area_tr = mask_utils.area(time_data) - - area_tr = np.repeat(area_tr[:, np.newaxis], len(area_gt), axis=1) - area_gt = np.repeat(area_gt[np.newaxis, :], len(area_tr), axis=0) - - # mask iou computation with pycocotools - ious = np.atleast_2d(mask_utils.iou(time_data, time_gt, [0]*len(time_gt))) - # set iou to 1 if both masks are close to 0 (no ground truth and no predicted mask in timestep) - ious[np.isclose(area_tr, 0) & np.isclose(area_gt, 0)] = 1 - assert (ious >= 0 - np.finfo('float').eps).all() - assert (ious <= 1 + np.finfo('float').eps).all() - - j[..., t] = ious - - return j diff --git a/trackeval/metrics/st_map.py b/trackeval/metrics/st_map.py deleted file mode 100644 index 44917be..0000000 --- a/trackeval/metrics/st_map.py +++ /dev/null @@ -1,578 +0,0 @@ -import numpy as np -from ._base_metric import _BaseMetric -from .. import _timing -from functools import partial -from .. import utils -from ..utils import TrackEvalException - - -class STMAP(_BaseMetric): - """Class which implements the STMAP metrics""" - - @staticmethod - def get_default_metric_config(): - """Default class config values""" - default_config = { - 'USE_AREA_RANGES': True, # whether to evaluate for certain area ranges - 'AREA_RANGES': [[0 ** 2, 32 ** 2], # additional area range sets for which STMAP is evaluated - [32 ** 2, 96 ** 2], # (all area range always included), default values for TAO - [96 ** 2, 1e5 ** 2]], # evaluation - 'AREA_RANGE_LABELS': ["area_s", "area_m", "area_l"], # the labels for the area ranges - 'USE_TIME_RANGES': True, # whether to evaluate for certain time ranges (length of tracks) - 'TIME_RANGES': [[0, 3], [3, 10], [10, 1e5]], # additional time range sets for which STMAP is evaluated - # (all time range always included) , default values for TAO evaluation - 'TIME_RANGE_LABELS': ["time_s", "time_m", "time_l"], # the labels for the time ranges - 'IOU_THRESHOLDS': np.arange(0.05, 0.99, 0.05), # np.arange(0.3, 0.96, 0.05), # the IoU thresholds - # 'IOU_THRESHOLDS': np.arange(0.05, 0.69, 0.05), # the IoU thresholds - # 'IOU_THRESHOLDS': np.arange(0.05, 0.49, 0.05), # the IoU thresholds - # 'IOU_THRESHOLDS': np.arange(0.05, 0.29, 0.05), # the IoU thresholds - 'RECALL_THRESHOLDS': np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01) + 1), endpoint=True), - 'rel_pos_rate': 0.7, # the relative positive rate for the relation positive/negative matching - # recall thresholds at which precision is evaluated - 'MAX_DETECTIONS': 0, # limit the maximum number of considered tracks per sequence (0 for unlimited) - 'PRINT_CONFIG': True - } - return default_config - - def __init__(self, config=None): - super().__init__() - self.config = utils.init_config(config, self.get_default_metric_config(), self.get_name()) - - self.num_ig_masks = 1 - self.lbls = ['all'] - self.use_area_rngs = self.config['USE_AREA_RANGES'] - if 
self.use_area_rngs: - self.area_rngs = self.config['AREA_RANGES'] - self.area_rng_lbls = self.config['AREA_RANGE_LABELS'] - self.num_ig_masks += len(self.area_rng_lbls) - self.lbls += self.area_rng_lbls - - self.use_time_rngs = self.config['USE_TIME_RANGES'] - if self.use_time_rngs: - self.time_rngs = self.config['TIME_RANGES'] - self.time_rng_lbls = self.config['TIME_RANGE_LABELS'] - self.num_ig_masks += len(self.time_rng_lbls) - self.lbls += self.time_rng_lbls - - self.array_labels = self.config['IOU_THRESHOLDS'] - self.rec_thrs = self.config['RECALL_THRESHOLDS'] - - self.maxDet = self.config['MAX_DETECTIONS'] - self.float_array_fields = ['TI_AP_' + lbl for lbl in self.lbls] + ['TI_AR_' + lbl for lbl in self.lbls] - self.fields = self.float_array_fields - self.summary_fields = self.float_array_fields - - @_timing.time - def eval_sequence(self, data): - """Calculates GT and Tracker matches for one sequence for STMAP metrics. Adapted from - https://github.com/TAO-Dataset/""" - - # Initialise results to zero for each sequence as the fields are only defined over the set of all sequences - res = {} - for field in self.fields: - res[field] = [0 for _ in self.array_labels] - - gt_ids, dt_ids = data['gt_track_ids'], data['dt_track_ids'] - - if len(gt_ids) == 0 and len(dt_ids) == 0: - for idx in range(self.num_ig_masks): - res[idx] = None - return res - - # get track data - gt_tr_areas = data.get('gt_track_areas', None) if self.use_area_rngs else None - gt_tr_lengths = data.get('gt_track_lengths', None) if self.use_time_rngs else None - gt_tr_iscrowd = data.get('gt_track_iscrowd', None) - dt_tr_areas = data.get('dt_track_areas', None) if self.use_area_rngs else None - dt_tr_lengths = data.get('dt_track_lengths', None) if self.use_time_rngs else None - is_nel = data.get('not_exhaustively_labeled', False) - - # compute ignore masks for different track sets to eval - gt_ig_masks = self._compute_track_ig_masks(len(gt_ids), track_lengths=gt_tr_lengths, track_areas=gt_tr_areas, - iscrowd=gt_tr_iscrowd) - dt_ig_masks = self._compute_track_ig_masks(len(dt_ids), track_lengths=dt_tr_lengths, track_areas=dt_tr_areas, - is_not_exhaustively_labeled=is_nel, is_gt=False) - - boxformat = data.get('boxformat', 'xywh') - ious = self._compute_track_ious(data['dt_tracks'], data['gt_tracks'], iou_function=data['iou_type'], - boxformat=boxformat) - rel_pos, rel_neg = self._compute_track_rel(data['gt_track_is_main'], data['gt_track_rel_sub_class'], data['gt_track_rel_obj_class'], - data['dt_track_is_main'], data['dt_track_rel_sub_list'], data['dt_track_rel_obj_list']) - - num_thrs = len(self.array_labels) - num_recalls = len(self.rec_thrs) - - # -1 for absent categories - precision = -np.ones( - (num_thrs, num_recalls, self.num_ig_masks) - ) - recall = -np.ones((num_thrs, self.num_ig_masks)) - - for mask_idx in range(self.num_ig_masks): - gt_ig_mask = gt_ig_masks[mask_idx] - - # Sort gt ignore last - gt_idx = np.argsort([g for g in gt_ig_mask], kind="mergesort") - gt_ids = [gt_ids[i] for i in gt_idx] - - ious_sorted = ious[:, gt_idx] if len(ious) > 0 else ious - rel_pos_sorted = rel_pos[:, gt_idx] if len(rel_pos) > 0 else rel_pos - rel_neg_sorted = rel_neg[:, gt_idx] if len(rel_neg) > 0 else rel_neg - - num_thrs = len(self.array_labels) - num_gt = len(gt_ids) - num_dt = len(dt_ids) - - # Array to store the "id" of the matched dt/gt - gt_m = np.zeros((num_thrs, num_gt)) - 1 - dt_m = np.zeros((num_thrs, num_dt)) - 1 - - gt_ig = np.array([gt_ig_mask[idx] for idx in gt_idx]) - dt_ig = np.zeros((num_thrs, num_dt)) - - 
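For intuition, the ignore masks built by _compute_track_ig_masks mark a track with 1 whenever it falls outside the area or time range currently being evaluated, and with 0 otherwise. A toy illustration with hypothetical track areas and the default TAO-style area ranges from the config above:

import numpy as np

eps = np.finfo('float').eps
track_areas = [500.0, 4000.0, 50000.0]                 # hypothetical mean areas of three gt tracks
area_rngs = [[0, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]

# One mask per range: 0 = evaluate the track for this range, 1 = ignore it.
masks = [[0 if rng[0] - eps <= a <= rng[1] + eps else 1 for a in track_areas]
         for rng in area_rngs]
# masks == [[0, 1, 1],   # 'area_s': only the 500-pixel track is kept
#           [1, 0, 1],   # 'area_m'
#           [1, 1, 0]]   # 'area_l'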
rel_pos_sel = 0 - rel_neg_sel = 0 - for iou_thr_idx, iou_thr in enumerate(self.array_labels): - if len(ious_sorted) == 0: - break - - for dt_idx, _dt in enumerate(dt_ids): - iou = min([iou_thr, 1 - 1e-10]) - # information about best match so far (m=-1 -> unmatched) - # store the gt_idx which matched for _dt - m = -1 - for gt_idx, _ in enumerate(gt_ids): - # if this gt already matched continue - if gt_m[iou_thr_idx, gt_idx] > 0: - continue - # if _dt matched to reg gt, and on ignore gt, stop - if m > -1 and gt_ig[m] == 0 and gt_ig[gt_idx] == 1: - break - # continue to next gt unless better match made - if ious_sorted[dt_idx, gt_idx] < iou - np.finfo('float').eps: - continue - # if match successful and best so far, store appropriately - iou = ious_sorted[dt_idx, gt_idx] - rel_pos_sel = rel_pos_sorted[dt_idx, gt_idx] - rel_neg_sel = rel_neg_sorted[dt_idx, gt_idx] - m = gt_idx - - # No match found for _dt, go to next _dt - if m == -1: - continue - - # if gt to ignore for some reason update dt_ig. - # Should not be used in evaluation. - dt_ig[iou_thr_idx, dt_idx] = gt_ig[m] - # _dt match found, update gt_m, and dt_m with "id" - dt_m[iou_thr_idx, dt_idx] = gt_ids[m] - gt_m[iou_thr_idx, m] = _dt - - if rel_pos_sel + rel_neg_sel > 0 and rel_pos_sel/(rel_pos_sel + rel_neg_sel) < self.config['rel_pos_rate']: - dt_m[iou_thr_idx, dt_idx] = -1 - - dt_ig_mask = dt_ig_masks[mask_idx] - - dt_ig_mask = np.array(dt_ig_mask).reshape((1, num_dt)) # 1 X num_dt - dt_ig_mask = np.repeat(dt_ig_mask, num_thrs, 0) # num_thrs X num_dt - - # Based on dt_ig_mask ignore any unmatched detection by updating dt_ig - dt_ig = np.logical_or(dt_ig, np.logical_and(dt_m == -1, dt_ig_mask)) - # store results for given video and category - res[mask_idx] = { - "dt_ids": dt_ids, - "gt_ids": gt_ids, - "dt_matches": dt_m, - "gt_matches": gt_m, - "dt_scores": data['dt_track_scores'], - "gt_ignore": gt_ig, - "dt_ignore": dt_ig, - "rel_pos_sel": rel_pos_sel, - "rel_neg_sel": rel_neg_sel - } - - # -------------------------------------------------- - tps = np.logical_and(dt_m != -1, np.logical_not(dt_ig)) - fps = np.logical_and(dt_m == -1, np.logical_not(dt_ig)) - - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) - - for iou_thr_idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): - tp = np.array(tp) - fp = np.array(fp) - num_tp = len(tp) - rc = tp / num_gt - if num_tp: - recall[iou_thr_idx, mask_idx] = rc[-1] - else: - recall[iou_thr_idx, mask_idx] = 0 - - # np.spacing(1) ~= eps - pr = tp / (fp + tp + np.spacing(1)) - pr = pr.tolist() - - # Ensure precision values are monotonically decreasing - for i in range(num_tp - 1, 0, -1): - if pr[i] > pr[i - 1]: - pr[i - 1] = pr[i] - - # find indices at the predefined recall values - rec_thrs_insert_idx = np.searchsorted(rc, self.rec_thrs, side="left") - - pr_at_recall = [0.0] * num_recalls - - try: - for _idx, pr_idx in enumerate(rec_thrs_insert_idx): - pr_at_recall[_idx] = pr[pr_idx] - except IndexError: - pass - - precision[iou_thr_idx, :, mask_idx] = (np.array(pr_at_recall)) - - # res = {'precision': precision, 'recall': recall} - - for a_id, alpha in enumerate(self.array_labels): - for lbl_idx, lbl in enumerate(self.lbls): - p = precision[a_id, :, lbl_idx] - if len(p[p > -1]) == 0: - mean_p = -1 - else: - mean_p = np.mean(p[p > -1]) - res['TI_AP_' + lbl][a_id] = mean_p - res['TI_AR_' + lbl][a_id] = recall[a_id, lbl_idx] - - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences. 
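In plain terms, the loop above matches each detection greedily to the not-yet-matched ground-truth track with the highest IoU at the current threshold, preferring non-ignored ground truth, and then discards the match when the per-frame relation agreement falls below rel_pos_rate. A minimal sketch of that veto, with hypothetical agreement counts:

rel_pos_rate = 0.7            # default from the config above

# Hypothetical counts of frames where the predicted relation agrees / disagrees
# with the ground-truth relation for one matched dt/gt pair.
rel_pos_sel, rel_neg_sel = 5, 4

agreement = rel_pos_sel / (rel_pos_sel + rel_neg_sel)    # ~0.56
keep_match = agreement >= rel_pos_rate                    # False: dt_m is reset to -1,
                                                          # so this track counts as a false positive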
Computes precision and recall values based on track matches. - Adapted from https://github.com/TAO-Dataset/ - """ - num_thrs = len(self.array_labels) - num_recalls = len(self.rec_thrs) - - # -1 for absent categories - precision = -np.ones( - (num_thrs, num_recalls, self.num_ig_masks) - ) - recall = -np.ones((num_thrs, self.num_ig_masks)) - - for ig_idx in range(self.num_ig_masks): - ig_idx_results = [res[ig_idx] for res in all_res.values() if res[ig_idx] is not None] - - # Remove elements which are None - if len(ig_idx_results) == 0: - continue - - # Append all scores: shape (N,) - # limit considered tracks for each sequence if maxDet > 0 - if self.maxDet == 0: - dt_scores = np.concatenate([res["dt_scores"] for res in ig_idx_results], axis=0) - - dt_idx = np.argsort(-dt_scores, kind="mergesort") - - dt_m = np.concatenate([e["dt_matches"] for e in ig_idx_results], - axis=1)[:, dt_idx] - dt_ig = np.concatenate([e["dt_ignore"] for e in ig_idx_results], - axis=1)[:, dt_idx] - elif self.maxDet > 0: - dt_scores = np.concatenate([res["dt_scores"][0:self.maxDet] for res in ig_idx_results], axis=0) - - dt_idx = np.argsort(-dt_scores, kind="mergesort") - - dt_m = np.concatenate([e["dt_matches"][:, 0:self.maxDet] for e in ig_idx_results], - axis=1)[:, dt_idx] - dt_ig = np.concatenate([e["dt_ignore"][:, 0:self.maxDet] for e in ig_idx_results], - axis=1)[:, dt_idx] - else: - raise Exception("Number of maximum detections must be >= 0, but is set to %i" % self.maxDet) - - gt_ig = np.concatenate([res["gt_ignore"] for res in ig_idx_results]) - # num gt anns to consider - num_gt = np.count_nonzero(gt_ig == 0) - - if num_gt == 0: - continue - - tps = np.logical_and(dt_m != -1, np.logical_not(dt_ig)) - fps = np.logical_and(dt_m == -1, np.logical_not(dt_ig)) - - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) - - for iou_thr_idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): - tp = np.array(tp) - fp = np.array(fp) - num_tp = len(tp) - rc = tp / num_gt - if num_tp: - recall[iou_thr_idx, ig_idx] = rc[-1] - else: - recall[iou_thr_idx, ig_idx] = 0 - - # np.spacing(1) ~= eps - pr = tp / (fp + tp + np.spacing(1)) - pr = pr.tolist() - - # Ensure precision values are monotonically decreasing - for i in range(num_tp - 1, 0, -1): - if pr[i] > pr[i - 1]: - pr[i - 1] = pr[i] - - # find indices at the predefined recall values - rec_thrs_insert_idx = np.searchsorted(rc, self.rec_thrs, side="left") - - pr_at_recall = [0.0] * num_recalls - - try: - for _idx, pr_idx in enumerate(rec_thrs_insert_idx): - pr_at_recall[_idx] = pr[pr_idx] - except IndexError: - pass - - precision[iou_thr_idx, :, ig_idx] = (np.array(pr_at_recall)) - - res = {'precision': precision, 'recall': recall} - - # compute the precision and recall averages for the respective alpha thresholds and ignore masks - for lbl in self.lbls: - res['TI_AP_' + lbl] = np.zeros((len(self.array_labels)), dtype=np.float) - res['TI_AR_' + lbl] = np.zeros((len(self.array_labels)), dtype=np.float) - - for a_id, alpha in enumerate(self.array_labels): - for lbl_idx, lbl in enumerate(self.lbls): - p = precision[a_id, :, lbl_idx] - if len(p[p > -1]) == 0: - mean_p = -1 - else: - mean_p = np.mean(p[p > -1]) - res['TI_AP_' + lbl][a_id] = mean_p - res['TI_AR_' + lbl][a_id] = recall[a_id, lbl_idx] - - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=True): - """Combines metrics across all classes by averaging over the class values - Note mAP is not well defined for 'empty classes' so 
'ignore empty classes' is always true here. - """ - res = {} - for field in self.fields: - res[field] = np.zeros((len(self.array_labels)), dtype=np.float) - field_stacked = np.array([res[field] for res in all_res.values()]) - - for a_id, alpha in enumerate(self.array_labels): - values = field_stacked[:, a_id] - if len(values[values > -1]) == 0: - mean = -1 - else: - mean = np.mean(values[values > -1]) - res[field][a_id] = mean - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - - res = {} - for field in self.fields: - res[field] = np.zeros((len(self.array_labels)), dtype=np.float) - field_stacked = np.array([res[field] for res in all_res.values()]) - - for a_id, alpha in enumerate(self.array_labels): - values = field_stacked[:, a_id] - if len(values[values > -1]) == 0: - mean = -1 - else: - mean = np.mean(values[values > -1]) - res[field][a_id] = mean - return res - - def _compute_track_ig_masks(self, num_ids, track_lengths=None, track_areas=None, iscrowd=None, - is_not_exhaustively_labeled=False, is_gt=True): - """ - Computes ignore masks for different track sets to evaluate - :param num_ids: the number of track IDs - :param track_lengths: the lengths of the tracks (number of timesteps) - :param track_areas: the average area of a track - :param iscrowd: whether a track is marked as crowd - :param is_not_exhaustively_labeled: whether the track category is not exhaustively labeled - :param is_gt: whether it is gt - :return: the track ignore masks - """ - # for TAO tracks for classes which are not exhaustively labeled are not evaluated - if not is_gt and is_not_exhaustively_labeled: - track_ig_masks = [[1 for _ in range(num_ids)] for i in range(self.num_ig_masks)] - else: - # consider all tracks - track_ig_masks = [[0 for _ in range(num_ids)]] - - # consider tracks with certain area - if self.use_area_rngs: - for rng in self.area_rngs: - track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= area <= rng[1] + np.finfo('float').eps - else 1 for area in track_areas]) - - # consider tracks with certain duration - if self.use_time_rngs: - for rng in self.time_rngs: - track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= length - <= rng[1] + np.finfo('float').eps else 1 for length in track_lengths]) - - # for YouTubeVIS evaluation tracks with crowd tag are not evaluated - if is_gt and iscrowd: - track_ig_masks = [np.logical_or(mask, iscrowd) for mask in track_ig_masks] - - return track_ig_masks - - @staticmethod - def _compute_bb_track_iou(dt_track, gt_track, boxformat='xywh'): - """ - Calculates the track IoU for one detected track and one ground truth track for bounding boxes - :param dt_track: the detected track (format: dictionary with frame index as keys and - numpy arrays as values) - :param gt_track: the ground truth track (format: dictionary with frame index as keys and - numpy array as values) - :param boxformat: the format of the boxes - :return: the track IoU - """ - intersect = 0 - union = 0 - image_ids = set(gt_track.keys()) | set(dt_track.keys()) - for image in image_ids: - g = gt_track.get(image, None) - d = dt_track.get(image, None) - if boxformat == 'xywh': - if d is not None and g is not None: - dx, dy, dw, dh = d - gx, gy, gw, gh = g - w = max(min(dx + dw, gx + gw) - max(dx, gx), 0) - h = max(min(dy + dh, gy + gh) - max(dy, gy), 0) - i = w * h - u = dw * dh + gw * gh - i - intersect += i - union += u - elif d is None and g is not None: - union += g[2] * g[3] - elif d 
is not None and g is None: - union += d[2] * d[3] - elif boxformat == 'x0y0x1y1': - if d is not None and g is not None: - dx0, dy0, dx1, dy1 = d - gx0, gy0, gx1, gy1 = g - w = max(min(dx1, gx1) - max(dx0, gx0), 0) - h = max(min(dy1, gy1) - max(dy0, gy0), 0) - i = w * h - u = (dx1 - dx0) * (dy1 - dy0) + (gx1 - gx0) * (gy1 - gy0) - i - intersect += i - union += u - elif d is None and g is not None: - union += (g[2] - g[0]) * (g[3] - g[1]) - elif d is not None and g is None: - union += (d[2] - d[0]) * (d[3] - d[1]) - else: - raise TrackEvalException('BoxFormat not implemented') - if intersect > union: - raise TrackEvalException("Intersection value > union value. Are the box values corrupted?") - return intersect / union if union > 0 else 0 - - @staticmethod - def _compute_mask_track_iou(dt_track, gt_track): - """ - Calculates the track IoU for one detected track and one ground truth track for segmentation masks - :param dt_track: the detected track (format: dictionary with frame index as keys and - pycocotools rle encoded masks as values) - :param gt_track: the ground truth track (format: dictionary with frame index as keys and - pycocotools rle encoded masks as values) - :return: the track IoU - """ - # only loaded when needed to reduce minimum requirements - from pycocotools import mask as mask_utils - - intersect = .0 - union = .0 - image_ids = set(gt_track.keys()) | set(dt_track.keys()) - for image in image_ids: - g = gt_track.get(image, None) - d = dt_track.get(image, None) - if d and g: - intersect += mask_utils.area(mask_utils.merge([d, g], True)) - union += mask_utils.area(mask_utils.merge([d, g], False)) - elif not d and g: - union += mask_utils.area(g) - elif d and not g: - union += mask_utils.area(d) - if union < 0.0 - np.finfo('float').eps: - raise TrackEvalException("Union value < 0. Are the segmentaions corrupted?") - if intersect > union: - raise TrackEvalException("Intersection value > union value. 
Are the segmentations corrupted?") - iou = intersect / union if union > 0.0 + np.finfo('float').eps else 0.0 - return iou - - @staticmethod - def _compute_track_ious(dt, gt, iou_function='bbox', boxformat='xywh'): - """ - Calculate track IoUs for a set of ground truth tracks and a set of detected tracks - """ - - if len(gt) == 0 and len(dt) == 0: - return [] - - if iou_function == 'bbox': - track_iou_function = partial(STMAP._compute_bb_track_iou, boxformat=boxformat) - elif iou_function == 'mask': - track_iou_function = partial(STMAP._compute_mask_track_iou) - else: - raise Exception('IoU function not implemented') - - ious = np.zeros([len(dt), len(gt)]) - for i, j in np.ndindex(ious.shape): - ious[i, j] = track_iou_function(dt[i], gt[j]) - return ious - - @staticmethod - def _compute_track_rel(gt_track_is_main, gt_track_rel_sub_class, gt_track_rel_obj_class, - dt_track_is_main, dt_track_rel_sub_list, dt_track_rel_obj_list): - """ - Calculate the correct number of relation for a set of ground truth tracks and a set of detected tracks - """ - rel_stat = [] - if len(gt_track_rel_sub_class) == 0 and len(gt_track_rel_obj_class) == 0 and len(dt_track_rel_sub_list) == 0 and len(dt_track_rel_obj_list) == 0: - return [] - rel_pos = np.zeros([len(dt_track_is_main), len(gt_track_is_main)]) - rel_neg = np.zeros([len(dt_track_is_main), len(gt_track_is_main)]) - for i, j in np.ndindex(rel_pos.shape): - image_ids = set(gt_track_rel_sub_class[j].keys()) | set(dt_track_rel_sub_list[i].keys()) - for image in image_ids: - dt_sub = dt_track_rel_sub_list[i].get(image, None) - dt_obj = dt_track_rel_obj_list[i].get(image, None) - dt_is_main = dt_track_is_main[i].get(image, None) - gt_sub = gt_track_rel_sub_class[j].get(image, None) - gt_obj = gt_track_rel_obj_class[j].get(image, None) - gt_is_main = gt_track_is_main[j].get(image, None) - print() - # if dt_sub is None or dt_obj is None or gt_sub is None or gt_obj is None or dt_is_main is None or gt_is_main is None: - if (dt_sub is None and gt_sub is not None) or (dt_obj is None and gt_obj is not None): - rel_neg[i, j] += 1 - continue - if dt_is_main == gt_is_main: - if gt_sub in dt_sub or gt_obj in dt_obj: - rel_pos[i, j] += 1 - else: - rel_neg[i, j] += 1 - elif (dt_is_main != gt_is_main and (gt_obj in dt_sub) or (gt_sub in dt_obj)): - rel_pos[i, j] += 1 - else: - rel_neg[i, j] += 1 - # else: - # rel_neg[i, j] += 1 - - return rel_pos, rel_neg - - @staticmethod - def _row_print(*argv): - """Prints results in an evenly spaced rows, with more space in first row""" - if len(argv) == 1: - argv = argv[0] - to_print = '%-40s' % argv[0] - for v in argv[1:]: - to_print += '%-12s' % str(v) - print(to_print) diff --git a/trackeval/metrics/track_map.py b/trackeval/metrics/track_map.py deleted file mode 100644 index 9054a58..0000000 --- a/trackeval/metrics/track_map.py +++ /dev/null @@ -1,462 +0,0 @@ -import numpy as np -from ._base_metric import _BaseMetric -from .. import _timing -from functools import partial -from .. 
import utils -from ..utils import TrackEvalException - - -class TrackMAP(_BaseMetric): - """Class which implements the TrackMAP metrics""" - - @staticmethod - def get_default_metric_config(): - """Default class config values""" - default_config = { - 'USE_AREA_RANGES': True, # whether to evaluate for certain area ranges - 'AREA_RANGES': [[0 ** 2, 32 ** 2], # additional area range sets for which TrackMAP is evaluated - [32 ** 2, 96 ** 2], # (all area range always included), default values for TAO - [96 ** 2, 1e5 ** 2]], # evaluation - 'AREA_RANGE_LABELS': ["area_s", "area_m", "area_l"], # the labels for the area ranges - 'USE_TIME_RANGES': True, # whether to evaluate for certain time ranges (length of tracks) - 'TIME_RANGES': [[0, 3], [3, 10], [10, 1e5]], # additional time range sets for which TrackMAP is evaluated - # (all time range always included) , default values for TAO evaluation - 'TIME_RANGE_LABELS': ["time_s", "time_m", "time_l"], # the labels for the time ranges - 'IOU_THRESHOLDS': np.arange(0.05, 0.99, 0.05), # the IoU thresholds - 'RECALL_THRESHOLDS': np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01) + 1), endpoint=True), - # recall thresholds at which precision is evaluated - 'MAX_DETECTIONS': 0, # limit the maximum number of considered tracks per sequence (0 for unlimited) - 'PRINT_CONFIG': True - } - return default_config - - def __init__(self, config=None): - super().__init__() - self.config = utils.init_config(config, self.get_default_metric_config(), self.get_name()) - - self.num_ig_masks = 1 - self.lbls = ['all'] - self.use_area_rngs = self.config['USE_AREA_RANGES'] - if self.use_area_rngs: - self.area_rngs = self.config['AREA_RANGES'] - self.area_rng_lbls = self.config['AREA_RANGE_LABELS'] - self.num_ig_masks += len(self.area_rng_lbls) - self.lbls += self.area_rng_lbls - - self.use_time_rngs = self.config['USE_TIME_RANGES'] - if self.use_time_rngs: - self.time_rngs = self.config['TIME_RANGES'] - self.time_rng_lbls = self.config['TIME_RANGE_LABELS'] - self.num_ig_masks += len(self.time_rng_lbls) - self.lbls += self.time_rng_lbls - - self.array_labels = self.config['IOU_THRESHOLDS'] - self.rec_thrs = self.config['RECALL_THRESHOLDS'] - - self.maxDet = self.config['MAX_DETECTIONS'] - self.float_array_fields = ['AP_' + lbl for lbl in self.lbls] + ['AR_' + lbl for lbl in self.lbls] - self.fields = self.float_array_fields - self.summary_fields = self.float_array_fields - - @_timing.time - def eval_sequence(self, data): - """Calculates GT and Tracker matches for one sequence for TrackMAP metrics. 
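The RECALL_THRESHOLDS above drive a COCO/TAO-style interpolated AP: precision is first made monotonically non-increasing along the recall axis and is then sampled at the 101 fixed recall points, with recalls never reached contributing 0. A small self-contained sketch of that interpolation (the tp/fp flags below are made up):

import numpy as np

rec_thrs = np.linspace(0.0, 1.0, 101)
tp = np.array([1, 1, 0, 1, 0], dtype=float)   # hypothetical matches of score-sorted tracks
fp = 1 - tp
num_gt = 4

rc = np.cumsum(tp) / num_gt
pr = np.cumsum(tp) / (np.cumsum(tp) + np.cumsum(fp) + np.spacing(1))

# Make precision monotonically non-increasing (the precision "envelope").
for i in range(len(pr) - 1, 0, -1):
    pr[i - 1] = max(pr[i - 1], pr[i])

# Sample the envelope at the fixed recall thresholds.
idx = np.searchsorted(rc, rec_thrs, side="left")
pr_at_recall = np.where(idx < len(pr), pr[np.minimum(idx, len(pr) - 1)], 0.0)
ap = pr_at_recall.mean()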
Adapted from - https://github.com/TAO-Dataset/""" - - # Initialise results to zero for each sequence as the fields are only defined over the set of all sequences - res = {} - for field in self.fields: - res[field] = [0 for _ in self.array_labels] - - gt_ids, dt_ids = data['gt_track_ids'], data['dt_track_ids'] - - if len(gt_ids) == 0 and len(dt_ids) == 0: - for idx in range(self.num_ig_masks): - res[idx] = None - return res - - # get track data - gt_tr_areas = data.get('gt_track_areas', None) if self.use_area_rngs else None - gt_tr_lengths = data.get('gt_track_lengths', None) if self.use_time_rngs else None - gt_tr_iscrowd = data.get('gt_track_iscrowd', None) - dt_tr_areas = data.get('dt_track_areas', None) if self.use_area_rngs else None - dt_tr_lengths = data.get('dt_track_lengths', None) if self.use_time_rngs else None - is_nel = data.get('not_exhaustively_labeled', False) - - # compute ignore masks for different track sets to eval - gt_ig_masks = self._compute_track_ig_masks(len(gt_ids), track_lengths=gt_tr_lengths, track_areas=gt_tr_areas, - iscrowd=gt_tr_iscrowd) - dt_ig_masks = self._compute_track_ig_masks(len(dt_ids), track_lengths=dt_tr_lengths, track_areas=dt_tr_areas, - is_not_exhaustively_labeled=is_nel, is_gt=False) - - boxformat = data.get('boxformat', 'xywh') - ious = self._compute_track_ious(data['dt_tracks'], data['gt_tracks'], iou_function=data['iou_type'], - boxformat=boxformat) - - for mask_idx in range(self.num_ig_masks): - gt_ig_mask = gt_ig_masks[mask_idx] - - # Sort gt ignore last - gt_idx = np.argsort([g for g in gt_ig_mask], kind="mergesort") - gt_ids = [gt_ids[i] for i in gt_idx] - - ious_sorted = ious[:, gt_idx] if len(ious) > 0 else ious - - num_thrs = len(self.array_labels) - num_gt = len(gt_ids) - num_dt = len(dt_ids) - - # Array to store the "id" of the matched dt/gt - gt_m = np.zeros((num_thrs, num_gt)) - 1 - dt_m = np.zeros((num_thrs, num_dt)) - 1 - - gt_ig = np.array([gt_ig_mask[idx] for idx in gt_idx]) - dt_ig = np.zeros((num_thrs, num_dt)) - - for iou_thr_idx, iou_thr in enumerate(self.array_labels): - if len(ious_sorted) == 0: - break - - for dt_idx, _dt in enumerate(dt_ids): - iou = min([iou_thr, 1 - 1e-10]) - # information about best match so far (m=-1 -> unmatched) - # store the gt_idx which matched for _dt - m = -1 - for gt_idx, _ in enumerate(gt_ids): - # if this gt already matched continue - if gt_m[iou_thr_idx, gt_idx] > 0: - continue - # if _dt matched to reg gt, and on ignore gt, stop - if m > -1 and gt_ig[m] == 0 and gt_ig[gt_idx] == 1: - break - # continue to next gt unless better match made - if ious_sorted[dt_idx, gt_idx] < iou - np.finfo('float').eps: - continue - # if match successful and best so far, store appropriately - iou = ious_sorted[dt_idx, gt_idx] - m = gt_idx - - # No match found for _dt, go to next _dt - if m == -1: - continue - - # if gt to ignore for some reason update dt_ig. - # Should not be used in evaluation. 
- dt_ig[iou_thr_idx, dt_idx] = gt_ig[m] - # _dt match found, update gt_m, and dt_m with "id" - dt_m[iou_thr_idx, dt_idx] = gt_ids[m] - gt_m[iou_thr_idx, m] = _dt - - dt_ig_mask = dt_ig_masks[mask_idx] - - dt_ig_mask = np.array(dt_ig_mask).reshape((1, num_dt)) # 1 X num_dt - dt_ig_mask = np.repeat(dt_ig_mask, num_thrs, 0) # num_thrs X num_dt - - # Based on dt_ig_mask ignore any unmatched detection by updating dt_ig - dt_ig = np.logical_or(dt_ig, np.logical_and(dt_m == -1, dt_ig_mask)) - # store results for given video and category - res[mask_idx] = { - "dt_ids": dt_ids, - "gt_ids": gt_ids, - "dt_matches": dt_m, - "gt_matches": gt_m, - "dt_scores": data['dt_track_scores'], - "gt_ignore": gt_ig, - "dt_ignore": dt_ig, - } - - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences. Computes precision and recall values based on track matches. - Adapted from https://github.com/TAO-Dataset/ - """ - num_thrs = len(self.array_labels) - num_recalls = len(self.rec_thrs) - - # -1 for absent categories - precision = -np.ones( - (num_thrs, num_recalls, self.num_ig_masks) - ) - recall = -np.ones((num_thrs, self.num_ig_masks)) - - for ig_idx in range(self.num_ig_masks): - ig_idx_results = [res[ig_idx] for res in all_res.values() if res[ig_idx] is not None] - - # Remove elements which are None - if len(ig_idx_results) == 0: - continue - - # Append all scores: shape (N,) - # limit considered tracks for each sequence if maxDet > 0 - if self.maxDet == 0: - dt_scores = np.concatenate([res["dt_scores"] for res in ig_idx_results], axis=0) - - dt_idx = np.argsort(-dt_scores, kind="mergesort") - - dt_m = np.concatenate([e["dt_matches"] for e in ig_idx_results], - axis=1)[:, dt_idx] - dt_ig = np.concatenate([e["dt_ignore"] for e in ig_idx_results], - axis=1)[:, dt_idx] - elif self.maxDet > 0: - dt_scores = np.concatenate([res["dt_scores"][0:self.maxDet] for res in ig_idx_results], axis=0) - - dt_idx = np.argsort(-dt_scores, kind="mergesort") - - dt_m = np.concatenate([e["dt_matches"][:, 0:self.maxDet] for e in ig_idx_results], - axis=1)[:, dt_idx] - dt_ig = np.concatenate([e["dt_ignore"][:, 0:self.maxDet] for e in ig_idx_results], - axis=1)[:, dt_idx] - else: - raise Exception("Number of maximum detections must be >= 0, but is set to %i" % self.maxDet) - - gt_ig = np.concatenate([res["gt_ignore"] for res in ig_idx_results]) - # num gt anns to consider - num_gt = np.count_nonzero(gt_ig == 0) - - if num_gt == 0: - continue - - tps = np.logical_and(dt_m != -1, np.logical_not(dt_ig)) - fps = np.logical_and(dt_m == -1, np.logical_not(dt_ig)) - - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) - - for iou_thr_idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): - tp = np.array(tp) - fp = np.array(fp) - num_tp = len(tp) - rc = tp / num_gt - if num_tp: - recall[iou_thr_idx, ig_idx] = rc[-1] - else: - recall[iou_thr_idx, ig_idx] = 0 - - # np.spacing(1) ~= eps - pr = tp / (fp + tp + np.spacing(1)) - pr = pr.tolist() - - # Ensure precision values are monotonically decreasing - for i in range(num_tp - 1, 0, -1): - if pr[i] > pr[i - 1]: - pr[i - 1] = pr[i] - - # find indices at the predefined recall values - rec_thrs_insert_idx = np.searchsorted(rc, self.rec_thrs, side="left") - - pr_at_recall = [0.0] * num_recalls - - try: - for _idx, pr_idx in enumerate(rec_thrs_insert_idx): - pr_at_recall[_idx] = pr[pr_idx] - except IndexError: - pass - - precision[iou_thr_idx, :, ig_idx] = (np.array(pr_at_recall)) - - res = {'precision': 
precision, 'recall': recall} - - # compute the precision and recall averages for the respective alpha thresholds and ignore masks - for lbl in self.lbls: - res['AP_' + lbl] = np.zeros((len(self.array_labels)), dtype=np.float) - res['AR_' + lbl] = np.zeros((len(self.array_labels)), dtype=np.float) - - for a_id, alpha in enumerate(self.array_labels): - for lbl_idx, lbl in enumerate(self.lbls): - p = precision[a_id, :, lbl_idx] - if len(p[p > -1]) == 0: - mean_p = -1 - else: - mean_p = np.mean(p[p > -1]) - res['AP_' + lbl][a_id] = mean_p - res['AR_' + lbl][a_id] = recall[a_id, lbl_idx] - - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=True): - """Combines metrics across all classes by averaging over the class values - Note mAP is not well defined for 'empty classes' so 'ignore empty classes' is always true here. - """ - res = {} - for field in self.fields: - res[field] = np.zeros((len(self.array_labels)), dtype=np.float) - field_stacked = np.array([res[field] for res in all_res.values()]) - - for a_id, alpha in enumerate(self.array_labels): - values = field_stacked[:, a_id] - if len(values[values > -1]) == 0: - mean = -1 - else: - mean = np.mean(values[values > -1]) - res[field][a_id] = mean - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - - res = {} - for field in self.fields: - res[field] = np.zeros((len(self.array_labels)), dtype=np.float) - field_stacked = np.array([res[field] for res in all_res.values()]) - - for a_id, alpha in enumerate(self.array_labels): - values = field_stacked[:, a_id] - if len(values[values > -1]) == 0: - mean = -1 - else: - mean = np.mean(values[values > -1]) - res[field][a_id] = mean - return res - - def _compute_track_ig_masks(self, num_ids, track_lengths=None, track_areas=None, iscrowd=None, - is_not_exhaustively_labeled=False, is_gt=True): - """ - Computes ignore masks for different track sets to evaluate - :param num_ids: the number of track IDs - :param track_lengths: the lengths of the tracks (number of timesteps) - :param track_areas: the average area of a track - :param iscrowd: whether a track is marked as crowd - :param is_not_exhaustively_labeled: whether the track category is not exhaustively labeled - :param is_gt: whether it is gt - :return: the track ignore masks - """ - # for TAO tracks for classes which are not exhaustively labeled are not evaluated - if not is_gt and is_not_exhaustively_labeled: - track_ig_masks = [[1 for _ in range(num_ids)] for i in range(self.num_ig_masks)] - else: - # consider all tracks - track_ig_masks = [[0 for _ in range(num_ids)]] - - # consider tracks with certain area - if self.use_area_rngs: - for rng in self.area_rngs: - track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= area <= rng[1] + np.finfo('float').eps - else 1 for area in track_areas]) - - # consider tracks with certain duration - if self.use_time_rngs: - for rng in self.time_rngs: - track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= length - <= rng[1] + np.finfo('float').eps else 1 for length in track_lengths]) - - # for YouTubeVIS evaluation tracks with crowd tag are not evaluated - if is_gt and iscrowd: - track_ig_masks = [np.logical_or(mask, iscrowd) for mask in track_ig_masks] - - return track_ig_masks - - @staticmethod - def _compute_bb_track_iou(dt_track, gt_track, boxformat='xywh'): - """ - Calculates the track IoU for one detected track and one ground truth track for bounding boxes - 
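Before the parameter details, a toy check of what this per-track IoU computes: intersections and unions are accumulated over the union of frames in which either track appears, so frames where only one of the two tracks exists enlarge the union and lower the IoU. The boxes below are hypothetical, in xywh format:

# Two hypothetical tracks over frames 0-2; frame 2 only has a detection.
gt_track = {0: [10, 10, 20, 20], 1: [12, 10, 20, 20]}
dt_track = {0: [10, 10, 20, 20], 1: [16, 10, 20, 20], 2: [50, 50, 10, 10]}

intersect = union = 0.0
for frame in set(gt_track) | set(dt_track):
    g, d = gt_track.get(frame), dt_track.get(frame)
    if g is not None and d is not None:
        gx, gy, gw, gh = g
        dx, dy, dw, dh = d
        w = max(min(dx + dw, gx + gw) - max(dx, gx), 0)
        h = max(min(dy + dh, gy + gh) - max(dy, gy), 0)
        i = w * h
        intersect += i
        union += dw * dh + gw * gh - i
    elif g is not None:          # gt-only frame: adds to the union, not the intersection
        union += g[2] * g[3]
    elif d is not None:          # detection-only frame: same
        union += d[2] * d[3]

track_iou = intersect / union if union > 0 else 0   # ~0.73 for these boxes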
:param dt_track: the detected track (format: dictionary with frame index as keys and - numpy arrays as values) - :param gt_track: the ground truth track (format: dictionary with frame index as keys and - numpy array as values) - :param boxformat: the format of the boxes - :return: the track IoU - """ - intersect = 0 - union = 0 - image_ids = set(gt_track.keys()) | set(dt_track.keys()) - for image in image_ids: - g = gt_track.get(image, None) - d = dt_track.get(image, None) - if boxformat == 'xywh': - if d is not None and g is not None: - dx, dy, dw, dh = d - gx, gy, gw, gh = g - w = max(min(dx + dw, gx + gw) - max(dx, gx), 0) - h = max(min(dy + dh, gy + gh) - max(dy, gy), 0) - i = w * h - u = dw * dh + gw * gh - i - intersect += i - union += u - elif d is None and g is not None: - union += g[2] * g[3] - elif d is not None and g is None: - union += d[2] * d[3] - elif boxformat == 'x0y0x1y1': - if d is not None and g is not None: - dx0, dy0, dx1, dy1 = d - gx0, gy0, gx1, gy1 = g - w = max(min(dx1, gx1) - max(dx0, gx0), 0) - h = max(min(dy1, gy1) - max(dy0, gy0), 0) - i = w * h - u = (dx1 - dx0) * (dy1 - dy0) + (gx1 - gx0) * (gy1 - gy0) - i - intersect += i - union += u - elif d is None and g is not None: - union += (g[2] - g[0]) * (g[3] - g[1]) - elif d is not None and g is None: - union += (d[2] - d[0]) * (d[3] - d[1]) - else: - raise TrackEvalException('BoxFormat not implemented') - if intersect > union: - raise TrackEvalException("Intersection value > union value. Are the box values corrupted?") - return intersect / union if union > 0 else 0 - - @staticmethod - def _compute_mask_track_iou(dt_track, gt_track): - """ - Calculates the track IoU for one detected track and one ground truth track for segmentation masks - :param dt_track: the detected track (format: dictionary with frame index as keys and - pycocotools rle encoded masks as values) - :param gt_track: the ground truth track (format: dictionary with frame index as keys and - pycocotools rle encoded masks as values) - :return: the track IoU - """ - # only loaded when needed to reduce minimum requirements - from pycocotools import mask as mask_utils - - intersect = .0 - union = .0 - image_ids = set(gt_track.keys()) | set(dt_track.keys()) - for image in image_ids: - g = gt_track.get(image, None) - d = dt_track.get(image, None) - if d and g: - intersect += mask_utils.area(mask_utils.merge([d, g], True)) - union += mask_utils.area(mask_utils.merge([d, g], False)) - elif not d and g: - union += mask_utils.area(g) - elif d and not g: - union += mask_utils.area(d) - if union < 0.0 - np.finfo('float').eps: - raise TrackEvalException("Union value < 0. Are the segmentaions corrupted?") - if intersect > union: - raise TrackEvalException("Intersection value > union value. 
Are the segmentations corrupted?") - iou = intersect / union if union > 0.0 + np.finfo('float').eps else 0.0 - return iou - - @staticmethod - def _compute_track_ious(dt, gt, iou_function='bbox', boxformat='xywh'): - """ - Calculate track IoUs for a set of ground truth tracks and a set of detected tracks - """ - - if len(gt) == 0 and len(dt) == 0: - return [] - - if iou_function == 'bbox': - track_iou_function = partial(TrackMAP._compute_bb_track_iou, boxformat=boxformat) - elif iou_function == 'mask': - track_iou_function = partial(TrackMAP._compute_mask_track_iou) - else: - raise Exception('IoU function not implemented') - - ious = np.zeros([len(dt), len(gt)]) - for i, j in np.ndindex(ious.shape): - ious[i, j] = track_iou_function(dt[i], gt[j]) - return ious - - @staticmethod - def _row_print(*argv): - """Prints results in an evenly spaced rows, with more space in first row""" - if len(argv) == 1: - argv = argv[0] - to_print = '%-40s' % argv[0] - for v in argv[1:]: - to_print += '%-12s' % str(v) - print(to_print) diff --git a/trackeval/metrics/vace.py b/trackeval/metrics/vace.py deleted file mode 100644 index 81858d4..0000000 --- a/trackeval/metrics/vace.py +++ /dev/null @@ -1,131 +0,0 @@ -import numpy as np -from scipy.optimize import linear_sum_assignment -from ._base_metric import _BaseMetric -from .. import _timing - - -class VACE(_BaseMetric): - """Class which implements the VACE metrics. - - The metrics are described in: - Manohar et al. (2006) "Performance Evaluation of Object Detection and Tracking in Video" - https://link.springer.com/chapter/10.1007/11612704_16 - - This implementation uses the "relaxed" variant of the metrics, - where an overlap threshold is applied in each frame. - """ - - def __init__(self, config=None): - super().__init__() - self.integer_fields = ['VACE_IDs', 'VACE_GT_IDs', 'num_non_empty_timesteps'] - self.float_fields = ['STDA', 'ATA', 'FDA', 'SFDA'] - self.fields = self.integer_fields + self.float_fields - self.summary_fields = ['SFDA', 'ATA'] - - # Fields that are accumulated over multiple videos. - self._additive_fields = self.integer_fields + ['STDA', 'FDA'] - - self.threshold = 0.5 - - @_timing.time - def eval_sequence(self, data): - """Calculates VACE metrics for one sequence. - - Depends on the fields: - data['num_gt_ids'] - data['num_tracker_ids'] - data['gt_ids'] - data['tracker_ids'] - data['similarity_scores'] - """ - res = {} - - # Obtain Average Tracking Accuracy (ATA) using track correspondence. - # Obtain counts necessary to compute temporal IOU. - # Assume that integer counts can be represented exactly as floats. - potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids'])) - gt_id_count = np.zeros(data['num_gt_ids']) - tracker_id_count = np.zeros(data['num_tracker_ids']) - both_present_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids'])) - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - # Count the number of frames in which two tracks satisfy the overlap criterion. - matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold) - match_idx_gt, match_idx_tracker = np.nonzero(matches_mask) - potential_matches_count[gt_ids_t[match_idx_gt], tracker_ids_t[match_idx_tracker]] += 1 - # Count the number of frames in which the tracks are present. 
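The counting in this loop yields, per gt/tracker pair, the number of frames in which the overlap criterion holds and the number of frames in which either track exists; their ratio is the temporal IoU that the Hungarian assignment below maximises to obtain STDA, which ATA then normalises by the mean number of tracks. A toy sketch with hypothetical counts for two gt and two tracker tracks:

import numpy as np
from scipy.optimize import linear_sum_assignment

potential_matches_count = np.array([[8., 1.], [0., 5.]])   # frames with overlap >= threshold
union_count = np.array([[10., 12.], [9., 10.]])            # frames where either track is present

temporal_iou = potential_matches_count / union_count        # [[0.8, 0.083], [0.0, 0.5]]
rows, cols = linear_sum_assignment(-temporal_iou)            # maximise total temporal IoU
stda = temporal_iou[rows, cols].sum()                        # 0.8 + 0.5 = 1.3
ata = stda / (0.5 * (2 + 2))                                 # 0.65, normalised by mean track count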
- gt_id_count[gt_ids_t] += 1 - tracker_id_count[tracker_ids_t] += 1 - both_present_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += 1 - # Number of frames in which either track is present (union of the two sets of frames). - union_count = (gt_id_count[:, np.newaxis] - + tracker_id_count[np.newaxis, :] - - both_present_count) - # The denominator should always be non-zero if all tracks are non-empty. - with np.errstate(divide='raise', invalid='raise'): - temporal_iou = potential_matches_count / union_count - # Find assignment that maximizes temporal IOU. - match_rows, match_cols = linear_sum_assignment(-temporal_iou) - res['STDA'] = temporal_iou[match_rows, match_cols].sum() - res['VACE_IDs'] = data['num_tracker_ids'] - res['VACE_GT_IDs'] = data['num_gt_ids'] - - # Obtain Frame Detection Accuracy (FDA) using per-frame correspondence. - non_empty_count = 0 - fda = 0 - for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])): - n_g = len(gt_ids_t) - n_d = len(tracker_ids_t) - if not (n_g or n_d): - continue - # n_g > 0 or n_d > 0 - non_empty_count += 1 - if not (n_g and n_d): - continue - # n_g > 0 and n_d > 0 - spatial_overlap = data['similarity_scores'][t] - match_rows, match_cols = linear_sum_assignment(-spatial_overlap) - overlap_ratio = spatial_overlap[match_rows, match_cols].sum() - fda += overlap_ratio / (0.5 * (n_g + n_d)) - res['FDA'] = fda - res['num_non_empty_timesteps'] = non_empty_count - - res.update(self._compute_final_fields(res)) - return res - - def combine_classes_class_averaged(self, all_res, ignore_empty_classes=True): - """Combines metrics across all classes by averaging over the class values. - If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection. - """ - res = {} - for field in self.fields: - if ignore_empty_classes: - res[field] = np.mean([v[field] for v in all_res.values() - if v['VACE_GT_IDs'] > 0 or v['VACE_IDs'] > 0], axis=0) - else: - res[field] = np.mean([v[field] for v in all_res.values()], axis=0) - return res - - def combine_classes_det_averaged(self, all_res): - """Combines metrics across all classes by averaging over the detection values""" - res = {} - for field in self._additive_fields: - res[field] = _BaseMetric._combine_sum(all_res, field) - res = self._compute_final_fields(res) - return res - - def combine_sequences(self, all_res): - """Combines metrics across all sequences""" - res = {} - for header in self._additive_fields: - res[header] = _BaseMetric._combine_sum(all_res, header) - res.update(self._compute_final_fields(res)) - return res - - @staticmethod - def _compute_final_fields(additive): - final = {} - with np.errstate(invalid='ignore'): # Permit nan results. 
- final['ATA'] = (additive['STDA'] / - (0.5 * (additive['VACE_IDs'] + additive['VACE_GT_IDs']))) - final['SFDA'] = additive['FDA'] / additive['num_non_empty_timesteps'] - return final diff --git a/trackeval/plotting.py b/trackeval/plotting.py deleted file mode 100644 index e76fd08..0000000 --- a/trackeval/plotting.py +++ /dev/null @@ -1,230 +0,0 @@ - -import os -import numpy as np -from .utils import TrackEvalException - - -def plot_compare_trackers(tracker_folder, tracker_list, cls, output_folder, plots_list=None): - """Create plots which compare metrics across different trackers.""" - # Define what to plot - if plots_list is None: - plots_list = get_default_plots_list() - - # Load data - data = load_multiple_tracker_summaries(tracker_folder, tracker_list, cls) - out_loc = os.path.join(output_folder, cls) - - # Plot - for args in plots_list: - create_comparison_plot(data, out_loc, *args) - - -def get_default_plots_list(): - # y_label, x_label, sort_label, bg_label, bg_function - plots_list = [ - ['AssA', 'DetA', 'HOTA', 'HOTA', 'geometric_mean'], - ['AssPr', 'AssRe', 'HOTA', 'AssA', 'jaccard'], - ['DetPr', 'DetRe', 'HOTA', 'DetA', 'jaccard'], - ['HOTA(0)', 'LocA(0)', 'HOTA', 'HOTALocA(0)', 'multiplication'], - ['HOTA', 'LocA', 'HOTA', None, None], - - ['HOTA', 'MOTA', 'HOTA', None, None], - ['HOTA', 'IDF1', 'HOTA', None, None], - ['IDF1', 'MOTA', 'HOTA', None, None], - ] - return plots_list - - -def load_multiple_tracker_summaries(tracker_folder, tracker_list, cls): - """Loads summary data for multiple trackers.""" - data = {} - for tracker in tracker_list: - with open(os.path.join(tracker_folder, tracker, cls + '_summary.txt')) as f: - keys = next(f).split(' ') - done = False - while not done: - values = next(f).split(' ') - if len(values) == len(keys): - done = True - data[tracker] = dict(zip(keys, map(float, values))) - return data - - -def create_comparison_plot(data, out_loc, y_label, x_label, sort_label, bg_label=None, bg_function=None, settings=None): - """ Creates a scatter plot comparing multiple trackers between two metric fields, with one on the x-axis and the - other on the y axis. Adds pareto optical lines and (optionally) a background contour. - - Inputs: - data: dict of dicts such that data[tracker_name][metric_field_name] = float - y_label: the metric_field_name to be plotted on the y-axis - x_label: the metric_field_name to be plotted on the x-axis - sort_label: the metric_field_name by which trackers are ordered and ranked - bg_label: the metric_field_name by which (optional) background contours are plotted - bg_function: the (optional) function bg_function(x,y) which converts the x_label / y_label values into bg_label. - settings: dict of plot settings with keys: - 'gap_val': gap between axis ticks and bg curves. 
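For reference, the optional background contours are drawn from simple combination functions of the two plotted metrics (defined further below in this file); for example the 'jaccard' option reproduces how DetA relates to DetPr and DetRe. A quick numeric check with hypothetical percentage values:

import numpy as np

def jaccard(x, y):
    x, y = x / 100, y / 100
    return 100 * (x * y) / (x + y - x * y)

def geometric_mean(x, y):
    return np.sqrt(x * y)

det_a = jaccard(70, 60)             # ~47.7, the value the background contour would show
hota_like = geometric_mean(64, 55)  # ~59.3, for the AssA/DetA style plot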
- 'num_to_plot': maximum number of trackers to plot - """ - - # Only loaded when run to reduce minimum requirements - from matplotlib import pyplot as plt - - # Get plot settings - if settings is None: - gap_val = 2 - num_to_plot = 20 - else: - gap_val = settings['gap_val'] - num_to_plot = settings['num_to_plot'] - - if (bg_label is None) != (bg_function is None): - raise TrackEvalException('bg_function and bg_label must either be both given or neither given.') - - # Extract data - tracker_names = np.array(list(data.keys())) - sort_index = np.array([data[t][sort_label] for t in tracker_names]).argsort()[::-1] - x_values = np.array([data[t][x_label] for t in tracker_names])[sort_index][:num_to_plot] - y_values = np.array([data[t][y_label] for t in tracker_names])[sort_index][:num_to_plot] - - # Print info on what is being plotted - tracker_names = tracker_names[sort_index][:num_to_plot] - print('\nPlotting %s vs %s, for the following (ordered) trackers:' % (y_label, x_label)) - for i, name in enumerate(tracker_names): - print('%i: %s' % (i+1, name)) - - # Find best fitting boundaries for data - boundaries = _get_boundaries(x_values, y_values, round_val=gap_val/2) - - fig = plt.figure() - - # Plot background contour - if bg_function is not None: - _plot_bg_contour(bg_function, boundaries, gap_val) - - # Plot pareto optimal lines - _plot_pareto_optimal_lines(x_values, y_values) - - # Plot data points with number labels - labels = np.arange(len(y_values)) + 1 - plt.plot(x_values, y_values, 'b.', markersize=15) - for xx, yy, l in zip(x_values, y_values, labels): - plt.text(xx, yy, str(l), color="red", fontsize=15) - - # Add extra explanatory text to plots - plt.text(0, -0.11, 'label order:\nHOTA', horizontalalignment='left', verticalalignment='center', - transform=fig.axes[0].transAxes, color="red", fontsize=12) - if bg_label is not None: - plt.text(1, -0.11, 'curve values:\n' + bg_label, horizontalalignment='right', verticalalignment='center', - transform=fig.axes[0].transAxes, color="grey", fontsize=12) - - plt.xlabel(x_label, fontsize=15) - plt.ylabel(y_label, fontsize=15) - title = y_label + ' vs ' + x_label - if bg_label is not None: - title += ' (' + bg_label + ')' - plt.title(title, fontsize=17) - plt.xticks(np.arange(0, 100, gap_val)) - plt.yticks(np.arange(0, 100, gap_val)) - min_x, max_x, min_y, max_y = boundaries - plt.xlim(min_x, max_x) - plt.ylim(min_y, max_y) - plt.gca().set_aspect('equal', adjustable='box') - plt.tight_layout() - - os.makedirs(out_loc, exist_ok=True) - filename = os.path.join(out_loc, title.replace(' ', '_')) - plt.savefig(filename + '.pdf', bbox_inches='tight', pad_inches=0.05) - plt.savefig(filename + '.png', bbox_inches='tight', pad_inches=0.05) - - -def _get_boundaries(x_values, y_values, round_val): - x1 = np.min(np.floor((x_values - 0.5) / round_val) * round_val) - x2 = np.max(np.ceil((x_values + 0.5) / round_val) * round_val) - y1 = np.min(np.floor((y_values - 0.5) / round_val) * round_val) - y2 = np.max(np.ceil((y_values + 0.5) / round_val) * round_val) - x_range = x2 - x1 - y_range = y2 - y1 - max_range = max(x_range, y_range) - x_center = (x1 + x2) / 2 - y_center = (y1 + y2) / 2 - min_x = max(x_center - max_range / 2, 0) - max_x = min(x_center + max_range / 2, 100) - min_y = max(y_center - max_range / 2, 0) - max_y = min(y_center + max_range / 2, 100) - return min_x, max_x, min_y, max_y - - -def geometric_mean(x, y): - return np.sqrt(x * y) - - -def jaccard(x, y): - x = x / 100 - y = y / 100 - return 100 * (x * y) / (x + y - x * y) - - -def 
multiplication(x, y): - return x * y / 100 - - -bg_function_dict = { - "geometric_mean": geometric_mean, - "jaccard": jaccard, - "multiplication": multiplication, - } - - -def _plot_bg_contour(bg_function, plot_boundaries, gap_val): - """ Plot background contour. """ - - # Only loaded when run to reduce minimum requirements - from matplotlib import pyplot as plt - - # Plot background contour - min_x, max_x, min_y, max_y = plot_boundaries - x = np.arange(min_x, max_x, 0.1) - y = np.arange(min_y, max_y, 0.1) - x_grid, y_grid = np.meshgrid(x, y) - if bg_function in bg_function_dict.keys(): - z_grid = bg_function_dict[bg_function](x_grid, y_grid) - else: - raise TrackEvalException("background plotting function '%s' is not defined." % bg_function) - levels = np.arange(0, 100, gap_val) - con = plt.contour(x_grid, y_grid, z_grid, levels, colors='grey') - - def bg_format(val): - s = '{:1f}'.format(val) - return '{:.0f}'.format(val) if s[-1] == '0' else s - - con.levels = [bg_format(val) for val in con.levels] - plt.clabel(con, con.levels, inline=True, fmt='%r', fontsize=8) - - -def _plot_pareto_optimal_lines(x_values, y_values): - """ Plot pareto optimal lines """ - - # Only loaded when run to reduce minimum requirements - from matplotlib import pyplot as plt - - # Plot pareto optimal lines - cxs = x_values - cys = y_values - best_y = np.argmax(cys) - x_pareto = [0, cxs[best_y]] - y_pareto = [cys[best_y], cys[best_y]] - t = 2 - remaining = cxs > x_pareto[t - 1] - cys = cys[remaining] - cxs = cxs[remaining] - while len(cxs) > 0 and len(cys) > 0: - best_y = np.argmax(cys) - x_pareto += [x_pareto[t - 1], cxs[best_y]] - y_pareto += [cys[best_y], cys[best_y]] - t += 2 - remaining = cxs > x_pareto[t - 1] - cys = cys[remaining] - cxs = cxs[remaining] - x_pareto.append(x_pareto[t - 1]) - y_pareto.append(0) - plt.plot(np.array(x_pareto), np.array(y_pareto), '--r') diff --git a/trackeval/run_stmap.py b/trackeval/run_stmap.py deleted file mode 100644 index 98fbd28..0000000 --- a/trackeval/run_stmap.py +++ /dev/null @@ -1,150 +0,0 @@ - -""" -Author: Wang Pengfei - -run_mot_event.py - -Run example: -run_mot_event.py - -Command Line Arguments: Defaults, # Comments - Eval arguments: - 'USE_PARALLEL': False, - 'NUM_PARALLEL_CORES': 8, - 'BREAK_ON_ERROR': True, - 'PRINT_RESULTS': True, - 'PRINT_ONLY_COMBINED': False, - 'PRINT_CONFIG': True, - 'TIME_PROGRESS': True, - 'OUTPUT_SUMMARY': True, - 'OUTPUT_DETAILED': True, - 'PLOT_CURVES': True, - Dataset arguments: - 'GT_FOLDER': os.path.join(code_path, 'data/gt/event/'), # Location of GT data - 'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/event/'), # Trackers location - 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER) - 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder) - 'CLASSES_TO_EVAL': ['adult', 'aircraft', 'antelope', 'baby', 'baby_seat', 'baby_walker', 'backpack', 'ball/sports_ball', 'bat', 'bear', 'bench', 'bicycle', 'bird', 'bottle', 'bread', 'bus/truck', 'cake', 'camel', 'camera', 'car', 'cat', 'cattle/cow', 'cellphone', 'chair', 'chicken', 'child', 'crab', 'crocodile', 'cup', 'dish', 'dog', 'duck', 'electric_fan', 'elephant', 'faucet', 'fish', 'fox', 'frisbee', 'fruits', 'giant_panda', 'guitar', 'hamster/rat', 'handbag', 'horse', 'kangaroo', 'laptop', 'leopard', 'lion', 'lizard', 'microwave', 'monkey', 'motorcycle', 'oven', 'penguin', 'piano', 'pig', 'rabbit', 'racket', 'red_panda', 'refrigerator', 'scooter', 'screen/monitor', 'sheep/goat', 'sink', 'skateboard', 'ski', 
'snake', 'snowboard', 'sofa', 'squirrel', 'stool', 'stop_sign', 'suitcase', 'surfboard', 'table', 'tiger', 'toilet', 'toy', 'traffic_light', 'train', 'turtle', 'vegetables', 'watercraft', 'whale', 'zebra'], - 'SPLIT_TO_EVAL': 'val', # Valid: 'train', 'val', 'test', 'all' - 'INPUT_AS_ZIP': False, # Whether tracker input files are zipped - 'PRINT_CONFIG': True, # Whether to print current config - 'DO_PREPROC': True, # Whether to perform preprocessing (never done for 2D_MOT_2015) - 'TRACKER_SUB_FOLDER': 'track_results', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - Metric arguments: - 'METRICS': ['HOTA', 'CLEARTR', 'Identity', 'VACE'] -""" - -import sys -import os -import json -import argparse -from multiprocessing import freeze_support -import datetime - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import trackeval # noqa: E402 - -parser = argparse.ArgumentParser("EVENT EVAL") -parser.add_argument("--category_info", type=str, default="data/event/category_info_new_split", help="path to category info folder") -parser.add_argument("--GT_FOLDER", type=str, default="data/event/gt_tao_format/01", help="path to gt data") -parser.add_argument("--TRACKERS_FOLDER", type=str, default="data/event/1BASELINE_TAO_FILTER/yolox_x") -parser.add_argument("--OUTPUT_FOLDER", type=str, default="output/debug/yolox_x", help="path to save eval results") -parser.add_argument("--TRACKERS_TO_EVAL", nargs='+', default=['bytetrack'], help="Filenames of trackers to eval (if None, all in folder) ['bytetrack', 'deepsort', 'sort']") -parser.add_argument("--SPLIT_TO_EVAL", type=str, default="test", choices=['train', 'val', 'test', 'all'], help="Valid: 'train', 'val', 'test', 'all'") -parser.add_argument("--INPUT_AS_ZIP", type=bool, default=False, help="Whether tracker input files are zipped") -parser.add_argument("--PRINT_CONFIG", type=bool, default=True, help="Whether to print current config") -parser.add_argument("--DO_PREPROC", type=bool, default=True, help="Whether to perform preprocessing (never done for MOT15)") -parser.add_argument("--TRACKER_SUB_FOLDER", type=str, default="", help="Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER") -parser.add_argument("--OUTPUT_SUB_FOLDER", type=str, default="", help="") -parser.add_argument("--TRACKER_DISPLAY_NAMES", nargs='+', default=['bytetrack'], help="") -parser.add_argument("--SEQMAP_FOLDER", type=str, default=None, help="") -parser.add_argument("--SEQMAP_FILE", type=str, default=None, help="") -parser.add_argument("--SEQ_INFO", type=str, default=None, help="") -parser.add_argument("--GT_LOC_FORMAT", type=str, default=None, help="") -parser.add_argument("--SKIP_SPLIT_FOL", type=bool, default=False, help="") -parser.add_argument("--USE_PARALLEL", default=None) -parser.add_argument("--NUM_PARALLEL_CORES", default=None) -parser.add_argument("--BREAK_ON_ERROR", default=None) -parser.add_argument("--RETURN_ON_ERROR", default=None) -parser.add_argument("--LOG_ON_ERROR", default=None) -parser.add_argument("--PRINT_RESULTS", default=None) -parser.add_argument("--PRINT_ONLY_COMBINED", default=None) -# parser.add_argument("--PRINT_CONFIG", default=None) -parser.add_argument("--TIME_PROGRESS", default=None) -parser.add_argument("--DISPLAY_LESS_PROGRESS", default=None) -parser.add_argument("--OUTPUT_SUMMARY", default=True) -parser.add_argument("--OUTPUT_EMPTY_CLASSES", default=None) 
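One caveat with the flags above: argparse's type=bool does not parse strings the way it looks, because bool('False') is True, so a command line like --INPUT_AS_ZIP False still arrives as a truthy value before the later type-coercion step. A common workaround (a sketch, not part of the original script) is an explicit string-to-bool converter:

import argparse

def str2bool(value):
    # Accept the usual spellings instead of relying on bool(), which treats
    # any non-empty string (including 'False') as True.
    if isinstance(value, bool):
        return value
    if value.lower() in ('true', '1', 'yes'):
        return True
    if value.lower() in ('false', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % value)

# e.g. parser.add_argument("--INPUT_AS_ZIP", type=str2bool, default=False)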
-parser.add_argument("--OUTPUT_DETAILED", default=None) -parser.add_argument("--PLOT_CURVES", default=None) -parser.add_argument("--CLASSES_TO_EVAL", default=['adult', 'baby', 'child', 'toy', 'dog', 'guitar']) -parser.add_argument("--VALID_CLASSES", default=None) - - - -if __name__ == '__main__': - freeze_support() - - # args = parser.parse_args() - # Command line interface: - default_eval_config = trackeval.Evaluator.get_default_eval_config() - default_eval_config['DISPLAY_LESS_PROGRESS'] = False - default_dataset_config = trackeval.datasets.VLOGBox.get_default_dataset_config() - # default_metrics_config = {'METRICS': ['HOTA', 'CLEARTR', 'Identity', 'STMAP', 'TrackMAP']} # , 'THRESHOLD': 0.5 - default_metrics_config = {'METRICS': ['HOTA', 'CLEARTR', 'Identity','STMAP']} # , 'THRESHOLD': 0.5 - config = {**default_eval_config, **default_dataset_config, **default_metrics_config} # Merge default configs - # parser = argparse.ArgumentParser() - # for setting in config.keys(): - # if type(config[setting]) == list or type(config[setting]) == type(None): - # parser.add_argument("--" + setting, nargs='+') - # else: - # parser.add_argument("--" + setting) - args = parser.parse_args().__dict__ - # nowTime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - args['OUTPUT_FOLDER'] = os.path.join(args['OUTPUT_FOLDER']) - if os.path.exists(args['category_info']): - with open(os.path.join(args['category_info'], 'category_info.json')) as f: - category_json = json.load(f) - f.close() - object_classes = [] - with open(os.path.join(args['category_info'], 'object_classes.txt')) as f: - for line in f.readlines(): - object_classes.append(line.strip()) - if args['CLASSES_TO_EVAL'] is None: - args['CLASSES_TO_EVAL'] = object_classes - args['VALID_CLASSES'] = object_classes - args['CLASS_NAME_TO_CLASS_ID'] = category_json['object_category_to_index'] - del args['category_info'] - for setting in args.keys(): - if args[setting] is not None: - if type(config[setting]) == type(True): - if args[setting] == True: - x = True - elif args[setting] == False: - x = False - else: - raise Exception('Command line parameter ' + setting + 'must be True or False') - elif type(config[setting]) == type(1): - x = int(args[setting]) - elif type(args[setting]) == type(None): - x = None - elif setting == 'SEQ_INFO': - x = dict(zip(args[setting], [None]*len(args[setting]))) - else: - x = args[setting] - config[setting] = x - eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()} - dataset_config = {k: v for k, v in config.items() if k in default_dataset_config.keys()} - metrics_config = {k: v for k, v in config.items() if k in default_metrics_config.keys()} - - # Run code - evaluator = trackeval.Evaluator(eval_config) - dataset_list = [trackeval.datasets.SemTrack(dataset_config)] - metrics_list = [] - for metric in [trackeval.metrics.STMAP, trackeval.metrics.CLEARTR, trackeval.metrics.Identity, - trackeval.metrics.HOTA]: - if metric.get_name() in metrics_config['METRICS']: - metrics_list.append(metric(metrics_config.copy())) - if len(metrics_list) == 0: - raise Exception('No metrics selected for evaluation') - evaluator.evaluate(dataset_list, metrics_list) diff --git a/trackeval/run_stmap.sh b/trackeval/run_stmap.sh deleted file mode 100644 index c6bd4c7..0000000 --- a/trackeval/run_stmap.sh +++ /dev/null @@ -1,6 +0,0 @@ -python run_stmap.py \ - --GT_FOLDER data \ - --TRACKERS_FOLDER result \ - --OUTPUT_FOLDER output \ - --TRACKERS_TO_EVAL bytetrack deepsort sort \ - --TRACKER_DISPLAY_NAMES bytetrack 
deepsort sort diff --git a/trackeval/run_tao.py b/trackeval/run_tao.py deleted file mode 100644 index c70f08d..0000000 --- a/trackeval/run_tao.py +++ /dev/null @@ -1,90 +0,0 @@ -""" run_tao.py - -Run example: -run_tao.py --USE_PARALLEL False --METRICS HOTA --TRACKERS_TO_EVAL Tracktor++ - -Command Line Arguments: Defaults, # Comments - Eval arguments: - 'USE_PARALLEL': False, - 'NUM_PARALLEL_CORES': 8, - 'BREAK_ON_ERROR': True, - 'PRINT_RESULTS': True, - 'PRINT_ONLY_COMBINED': False, - 'PRINT_CONFIG': True, - 'TIME_PROGRESS': True, - 'OUTPUT_SUMMARY': True, - 'OUTPUT_DETAILED': True, - 'PLOT_CURVES': True, - Dataset arguments: - 'GT_FOLDER': os.path.join(code_path, 'data/gt/tao/tao_training'), # Location of GT data - 'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/tao/tao_training'), # Trackers location - 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER) - 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder) - 'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes) - 'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val' - 'PRINT_CONFIG': True, # Whether to print current config - 'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER - 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER - 'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL - 'MAX_DETECTIONS': 300, # Number of maximal allowed detections per image (0 for unlimited) - Metric arguments: - 'METRICS': ['HOTA', 'CLEAR', 'Identity', 'TrackMAP'] -""" - -import sys -import os -import argparse -from multiprocessing import freeze_support - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import trackeval # noqa: E402 - -if __name__ == '__main__': - freeze_support() - - # Command line interface: - default_eval_config = trackeval.Evaluator.get_default_eval_config() - # print only combined since TrackMAP is undefined for per sequence breakdowns - default_eval_config['PRINT_ONLY_COMBINED'] = True - default_eval_config['DISPLAY_LESS_PROGRESS'] = True - default_dataset_config = trackeval.datasets.TAO.get_default_dataset_config() - default_metrics_config = {'METRICS': ['HOTA', 'CLEAR', 'Identity', 'TrackMAP']} - config = {**default_eval_config, **default_dataset_config, **default_metrics_config} # Merge default configs - parser = argparse.ArgumentParser() - for setting in config.keys(): - if type(config[setting]) == list or type(config[setting]) == type(None): - parser.add_argument("--" + setting, nargs='+') - else: - parser.add_argument("--" + setting) - args = parser.parse_args().__dict__ - for setting in args.keys(): - if args[setting] is not None: - if type(config[setting]) == type(True): - if args[setting] == 'True': - x = True - elif args[setting] == 'False': - x = False - else: - raise Exception('Command line parameter ' + setting + 'must be True or False') - elif type(config[setting]) == type(1): - x = int(args[setting]) - elif type(args[setting]) == type(None): - x = None - else: - x = args[setting] - config[setting] = x - eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()} - dataset_config = {k: v for k, v in config.items() if k in default_dataset_config.keys()} - metrics_config = {k: v for k, v in config.items() if k in default_metrics_config.keys()} - - # Run code - evaluator = trackeval.Evaluator(eval_config) - dataset_list = 
[trackeval.datasets.TAO(dataset_config)] - metrics_list = [] - for metric in [trackeval.metrics.TrackMAP, trackeval.metrics.CLEAR, trackeval.metrics.Identity, - trackeval.metrics.HOTA]: - if metric.get_name() in metrics_config['METRICS']: - metrics_list.append(metric()) - if len(metrics_list) == 0: - raise Exception('No metrics selected for evaluation') - evaluator.evaluate(dataset_list, metrics_list) \ No newline at end of file diff --git a/trackeval/utils.py b/trackeval/utils.py deleted file mode 100644 index 8c7c916..0000000 --- a/trackeval/utils.py +++ /dev/null @@ -1,146 +0,0 @@ - -import os -import csv -import argparse -from collections import OrderedDict - - -def init_config(config, default_config, name=None): - """Initialise non-given config values with defaults""" - if config is None: - config = default_config - else: - for k in default_config.keys(): - if k not in config.keys(): - config[k] = default_config[k] - if name and config['PRINT_CONFIG']: - print('\n%s Config:' % name) - for c in config.keys(): - print('%-20s : %-30s' % (c, config[c])) - return config - - -def update_config(config): - """ - Parse the arguments of a script and update the config values for any settings specified in the arguments. - :param config: the config to update - :return: the updated config - """ - parser = argparse.ArgumentParser() - for setting in config.keys(): - if type(config[setting]) == list or type(config[setting]) == type(None): - parser.add_argument("--" + setting, nargs='+') - else: - parser.add_argument("--" + setting) - args = parser.parse_args().__dict__ - for setting in args.keys(): - if args[setting] is not None: - if type(config[setting]) == type(True): - if args[setting] == 'True': - x = True - elif args[setting] == 'False': - x = False - else: - raise Exception('Command line parameter ' + setting + ' must be True or False') - elif type(config[setting]) == type(1): - x = int(args[setting]) - elif type(args[setting]) == type(None): - x = None - else: - x = args[setting] - config[setting] = x - return config - - -def get_code_path(): - """Get base path where code is""" - return os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) - - -def validate_metrics_list(metrics_list): - """Get the names of the metric classes and ensure they are unique; further check that the fields within each metric class - do not have overlapping names. - """ - metric_names = [metric.get_name() for metric in metrics_list] - # check metric names are unique - if len(metric_names) != len(set(metric_names)): - raise TrackEvalException('Code being run with multiple metrics of the same name') - fields = [] - for m in metrics_list: - fields += m.fields - # check metric fields are unique - if len(fields) != len(set(fields)): - raise TrackEvalException('Code being run with multiple metrics with fields of the same name') - return metric_names - - -def write_summary_results(summaries, cls, output_folder): - """Write summary results to file""" - - fields = sum([list(s.keys()) for s in summaries], []) - values = sum([list(s.values()) for s in summaries], []) - - # In order to remain consistent when new fields are added, each of the following fields, if present, - # will be output in the summary first, in the order below. Any further fields will be output in the order each - # metric family is called, and within each family either in the order they were added to the dict (python >= 3.6) or - # in arbitrary order (python < 3.6).
- default_order = ['HOTA', 'DetA', 'AssA', 'DetRe', 'DetPr', 'AssRe', 'AssPr', 'LocA', 'OWTA', 'HOTA(0)', 'LocA(0)', - 'HOTALocA(0)', 'MOTA', 'MOTP', 'MODA', 'CLR_Re', 'CLR_Pr', 'MTR', 'PTR', 'MLR', 'CLR_TP', 'CLR_FN', - 'CLR_FP', 'IDSW', 'MT', 'PT', 'ML', 'Frag', 'sMOTA', 'IDF1', 'IDR', 'IDP', 'IDTP', 'IDFN', 'IDFP', - 'Dets', 'GT_Dets', 'IDs', 'GT_IDs'] - default_ordered_dict = OrderedDict(zip(default_order, [None for _ in default_order])) - for f, v in zip(fields, values): - default_ordered_dict[f] = v - for df in default_order: - if default_ordered_dict[df] is None: - del default_ordered_dict[df] - fields = list(default_ordered_dict.keys()) - values = list(default_ordered_dict.values()) - - out_file = os.path.join(output_folder, cls + '_summary.txt') - os.makedirs(os.path.dirname(out_file), exist_ok=True) - with open(out_file, 'w', newline='') as f: - writer = csv.writer(f, delimiter=' ') - writer.writerow(fields) - writer.writerow(values) - - -def write_detailed_results(details, cls, output_folder): - """Write detailed results to file""" - sequences = details[0].keys() - fields = ['seq'] + sum([list(s['COMBINED_SEQ'].keys()) for s in details], []) - out_file = os.path.join(output_folder, cls + '_detailed.csv') - os.makedirs(os.path.dirname(out_file), exist_ok=True) - with open(out_file, 'w', newline='') as f: - writer = csv.writer(f) - writer.writerow(fields) - for seq in sorted(sequences): - if seq == 'COMBINED_SEQ': - continue - writer.writerow([seq] + sum([list(s[seq].values()) for s in details], [])) - writer.writerow(['COMBINED'] + sum([list(s['COMBINED_SEQ'].values()) for s in details], [])) - - -def load_detail(file): - """Loads detailed data for a tracker.""" - data = {} - with open(file) as f: - for i, row_text in enumerate(f): - row = row_text.replace('\r', '').replace('\n', '').split(',') - if i == 0: - keys = row[1:] - continue - current_values = row[1:] - seq = row[0] - if seq == 'COMBINED': - seq = 'COMBINED_SEQ' - if (len(current_values) == len(keys)) and seq != '': - data[seq] = {} - for key, value in zip(keys, current_values): - data[seq][key] = float(value) - return data - - -class TrackEvalException(Exception): - """Custom exception for catching expected errors.""" - ...
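The deleted run scripts above (run_stmap.py, run_tao.py) all follow the same pattern: merge the default eval, dataset and metric configs, let command-line flags override them (each script inlines essentially the logic of utils.update_config), split the merged dict back into the three config groups, and hand them to the Evaluator. A minimal sketch of that pattern, assuming the trackeval package is importable and TAO-format ground truth and tracker data exist at the default locations; the metric selection here is illustrative only, not part of the removed code:

import trackeval
from trackeval import utils

# Merge the default configs, then apply any --SETTING overrides from the command line.
default_eval_config = trackeval.Evaluator.get_default_eval_config()
default_dataset_config = trackeval.datasets.TAO.get_default_dataset_config()
default_metrics_config = {'METRICS': ['HOTA', 'CLEAR', 'Identity']}
config = utils.update_config(
    {**default_eval_config, **default_dataset_config, **default_metrics_config})

# Split the merged config back into the groups expected by the evaluator, dataset and metrics.
eval_config = {k: v for k, v in config.items() if k in default_eval_config}
dataset_config = {k: v for k, v in config.items() if k in default_dataset_config}
metrics_config = {k: v for k, v in config.items() if k in default_metrics_config}

# Build the dataset and metric objects, then run the evaluation.
evaluator = trackeval.Evaluator(eval_config)
dataset_list = [trackeval.datasets.TAO(dataset_config)]
metrics_list = [m() for m in (trackeval.metrics.HOTA, trackeval.metrics.CLEAR, trackeval.metrics.Identity)
                if m.get_name() in metrics_config['METRICS']]
evaluator.evaluate(dataset_list, metrics_list)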