main.py

"""
Code for generating an ensemble submission for the SeeClickFix contest hosted on Kaggle.  It loads base
submission files generated by our team's (Bryan Gregory and Miroslaw Horbal) individual models then combines them
using segment based averaging.

Ensemble weights for each segment of data are stored in SETTINGS.json along with the filepaths for the base input
submission files.  Note that base submission 0 corresponds to Bryan's model and base submission 1 is Miroslaw's model.

This relies on already generated submission files from the base models, so base models must be run prior to performing
the ensemble.

Requires: PANDAS > 0.13
          NUMPY
"""
__author__ = ['Bryan Gregory','Miroslaw Horbal']
__email__ = ['bryan.gregory1@gmail.com','miroslaw@gmail.com']
__date__ = '01-04-2013'

#Internal modules
import utils
#Start logger to record all info, warnings, and errors to Logs/logfile.log
log = utils.start_logging(__name__)

#External modules
import sys
import pandas as pd
import numpy as np
from datetime import datetime

def main():
    """Build the ensemble submission from the base submissions and save it as a CSV.

    Workflow:
      1. Load settings from SETTINGS.json through ``utils.load_settings``.
      2. Read the two base submission CSVs and the CSV that maps each ``id`` to a ``Segment``.
      3. Optionally (``avg_log_space == 'y'``) transform the target columns with log(x + 1).
      4. Multiply every target of every base submission by the weight configured for the
         row's segment, then add the weighted submissions together.
      5. Undo the log transform if it was applied, apply the final per-target scalars and
         write the result to ``dir_ensemble_submissions`` with a timestamped filename.

    Settings keys read: ``file_bryan_submission``, ``file_miroslaw_submission``,
    ``ensemble_segment_weights`` (segment -> target -> {'0': weight, '1': weight}),
    ``file_segment_ids``, ``avg_log_space``, ``target_scalars`` and ``dir_ensemble_submissions``.

    Returns:
        None.

    Raises:
        SystemExit: if a base submission file or the segment IDs file cannot be read.
    """
    #---Load environment settings from SETTINGS.json in root directory and build filepaths for all base submissions---#
    settings = utils.load_settings('SETTINGS.json')
    base_filepaths = (settings['file_bryan_submission'],
                      settings['file_miroslaw_submission'])
    segment_weights = settings['ensemble_segment_weights']
    #Materialise the keys as lists: dict views cannot be indexed on Python 3
    segments = list(segment_weights)
    #The first segment's targets are used for every segment
    targets = list(segment_weights[segments[0]])
    use_log_space = settings['avg_log_space'] == 'y'

    #---Output the segment weights to be used for ensemble averaging of base submissions---#
    log.info('==========ENSEMBLE WEIGHTS (B,M)============')
    for segment in segment_weights:
        log.info(segment.upper()+':')
        for target in segment_weights[segment]:
            weights = segment_weights[segment][target]
            #%-formatting accepts weights stored either as strings or as numbers
            log.info('    %s -- [%s,%s]' % (target.upper(), weights['0'], weights['1']))

    #---Load each base submission to a list of dataframes---#
    base_subs = []
    for filepath in base_filepaths:
        try:
            base_sub = pd.read_csv(filepath)
        except IOError:
            log.info('Base submission file does not exist: %s. Run base model to generate, or update filepath.' %filepath)
            sys.exit('---Exiting---')
        #Order the rows by id so that all base submissions line up row for row.  The temporary
        #index is dropped again because a label that is both an index level and a column
        #makes the later merge on 'id' ambiguous in newer pandas releases.
        base_sub = base_sub.set_index(['id'], drop=False).sort_index().reset_index(drop=True)
        base_subs.append(base_sub)
        log.info('Base submission successfully loaded: %s.' % filepath)

    utils.line_break()

    #---Load id's labeled with segments to a dataframe used for segment based averaging---#
    filepath = settings['file_segment_ids']
    try:
        segment_ids = pd.read_csv(filepath)
    except IOError:
        log.info('Segment IDs file does not exist: %s. Update filepath in SETTINGS.json.' % filepath)
        #Without the segment labels nothing below can run, so stop here exactly as is done
        #for a missing base submission instead of failing later on an undefined variable
        sys.exit('---Exiting---')
    log.info('Segment IDs successfully loaded from: %s.' % filepath)
    utils.line_break()

    #---Transform base predictions to log space prior to averaging, if selected in settings---#
    if use_log_space:
        log.info('Transforming base predictions to log space prior to averaging.')
        for base_sub in base_subs:
            for target in targets:
                base_sub[target] = np.log(base_sub[target]+1)
        utils.line_break()

    #---Apply segment based weights to each base submission then combine them to create ensemble submission---#
    log.info('Applying segment weights to base submissions then combining to create ensemble.')
    for i, base_sub in enumerate(base_subs):
        #Merge the segment labels from the segment id's file with the base submission dataframe
        weighted = base_sub.merge(segment_ids, on='id', how='inner')
        for target in targets:
            #Weights are floats; make the column float up front so the masked assignment
            #below never has to write floats into an integer column
            weighted[target] = weighted[target].astype(float)
        #NOTE: rows whose Segment matches none of the configured segments keep their value unscaled
        for segment in segments:
            in_segment = weighted['Segment'] == segment
            for target in targets:
                #A single .loc assignment writes to the dataframe itself; the chained form
                #df[target][mask] *= w may modify a temporary copy and silently lose the weight
                weighted.loc[in_segment, target] *= float(segment_weights[segment][target][str(i)])
        del weighted['Segment']
        base_subs[i] = weighted
    #Start from an explicit copy so that accumulating never alters the first base submission
    ensemble_sub = base_subs[0].copy()
    for other_sub in base_subs[1:]:
        for target in targets:
            ensemble_sub[target] += other_sub[target]
    utils.line_break()

    #---Transform ensemble predictions back to normal, if use log space averaging was selected in settings---#
    if use_log_space:
        log.info('Transforming ensemble predictions back to normal from log space.')
        for target in targets:
            ensemble_sub[target] = np.exp(ensemble_sub[target])-1
        utils.line_break()

    #---Apply any final target scalars to ensemble predictions---#
    for target in targets:
        ensemble_sub[target] *= float(settings['target_scalars'][target])

    #---Output ensemble submission to directory set in SETTINGS.json, appending creation date and time---#
    timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
    filename = settings['dir_ensemble_submissions']+'ensemble_predictions_'+timestamp+'.csv'
    ensemble_sub.to_csv(filename, index=False)
    log.info('Ensemble submission saved: %s' % filename)
    utils.line_break()

    #End main
    log.info('Program executed successfully without error! Exiting.')

#Run the ensemble only when this file is executed as a script (not when it is imported);
#whatever main() returns is handed to sys.exit as the process exit status
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)