# mcnemar.py -- McNemar's test comparing SVM and baseline misclassifications per representative.

import json
import math 
import sys


# NOTE: SVM accuracies are read from Ben's results file (the "*_ben.json" experiment results).

def mcnemar(rep, svmdata, baselinedata, null_comparison, baseline_results, svm_results):
    """Run McNemar's test (with continuity correction) for one representative.

    The statistic is ``(|n_svm - n_base| - 1)**2 / (n_svm + n_base)`` where
    ``n_svm`` counts the bills misclassified by the SVM only and ``n_base``
    counts the bills misclassified by the baseline only.

    Args:
        rep: key identifying the representative in all four mappings.
        svmdata: mapping rep -> collection of bills misclassified by the SVM.
        baselinedata: mapping rep -> collection of bills misclassified by the
            baseline.
        null_comparison: critical value the statistic must exceed for the
            null hypothesis to be rejected.
        baseline_results: mapping rep -> mapping containing an "Accuracy" entry.
        svm_results: mapping rep -> mapping containing a "Test Accuracy" entry.

    Returns:
        A dict with the keys "n_svm_misclassified", "n_baseline_misclassified",
        "value", "SVM accuracy", "baseline accuracy", "null_hypothesis" and
        "better_algorithm".  "better_algorithm" is encoded as:
            1 -- the SVM is better
            0 -- the baseline is better
            2 -- the null hypothesis (both algorithms perform the same) was
                 not rejected by McNemar's test

    Raises:
        KeyError: if ``rep`` is missing from ``svmdata`` or ``baselinedata``
            (or from either results mapping).
    """
    # Fail with a message that names the offending dataset instead of
    # printing a warning and then crashing on an unexplained KeyError.
    missing = [label
               for label, data in (("svmdata", svmdata),
                                   ("baseline data", baselinedata))
               if rep not in data]
    if missing:
        raise KeyError("rep %s not in %s" % (rep, " and ".join(missing)))

    svm = svmdata[rep]
    baseline = baselinedata[rep]
    svm_acc = svm_results[rep]["Test Accuracy"]
    baseline_acc = baseline_results[rep]["Accuracy"]

    # Discordant counts: bills that exactly one of the two classifiers got wrong.
    n_svm = sum(1 for bill in svm if bill not in baseline)
    n_base = sum(1 for bill in baseline if bill not in svm)

    discordant = n_svm + n_base
    if discordant == 0:
        print("n_svm_misclassified and n_baseline_misclassified are 0 for rep#: " + str(rep))
        # The statistic is undefined without discordant bills; a huge sentinel
        # is reported and the decision falls back to the raw accuracies.
        # NOTE(review): strictly, McNemar's test cannot reject the null
        # hypothesis here -- confirm that the accuracy fallback is intended.
        # sys.maxint only exists on Python 2; sys.maxsize is the portable stand-in.
        value = getattr(sys, "maxint", sys.maxsize)
        decide_by_accuracy = True
    else:
        value = float((abs(n_svm - n_base) - 1) ** 2) / discordant
        decide_by_accuracy = value > null_comparison

    if decide_by_accuracy:
        better_algorithm = 1 if float(svm_acc) > float(baseline_acc) else 0
    else:
        better_algorithm = 2

    results = {}
    results["n_svm_misclassified"] = n_svm
    results["n_baseline_misclassified"] = n_base
    results["value"] = value
    results["SVM accuracy"] = svm_acc
    results["baseline accuracy"] = baseline_acc
    results["null_hypothesis"] = null_comparison
    results["better_algorithm"] = better_algorithm
    return results

    
def _load_json(path):
    """Parse the JSON document stored at ``path``, closing the file afterwards."""
    with open(path, "r") as handle:
        return json.load(handle)


def mcnemarAll(null_comparison = 3.841459):
    """Run McNemar's test for every representative and write the results.

    Loads the per-representative misclassification data from ``mcnemar_data/``,
    the SVM and baseline experiment results from ``experiment_results/`` and
    the list of representatives from the ``representatives`` file, calls
    ``mcnemar`` once per representative and passes the collected statistics
    to ``writeAll``.

    Args:
        null_comparison: critical value handed to ``mcnemar``.  The default,
            3.841459, is the chi-square critical value for one degree of
            freedom at the 0.05 significance level.
    """
    name = "all_no_summary_validation"
    name2 = "all__baseline"

    # json generated by svm and baseline experiments
    svmdata = _load_json("mcnemar_data/" + name)
    baselinedata = _load_json("mcnemar_data/" + name2)
    svm_results = _load_json('experiment_results/' + name + '_ben.json')
    baseline_results = _load_json('experiment_results/' + name2 + '.json')
    personlist = _load_json('representatives')

    stats = {}
    for rep in personlist:
        stats[rep] = mcnemar(rep=rep,
                             svmdata=svmdata,
                             baselinedata=baselinedata,
                             null_comparison=null_comparison,
                             svm_results=svm_results,
                             baseline_results=baseline_results)
    writeAll(stats)


def writeAll(stats):
    """Write the per-representative statistics to a CSV file.

    Waits for the user to press Enter, then writes
    ``experiment_results/all_mcnemar.csv``: a header row naming the statistics
    (with an empty first cell above the representative ids) followed by one
    row per representative.

    Args:
        stats: mapping rep id -> mapping of statistic name -> value.  Every
            entry is expected to carry the same statistic names as the first
            one; an empty mapping produces a file with only a blank header.
    """
    print("Done with all reps")

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        prompt = raw_input
    except NameError:
        prompt = input
    prompt("Press Enter to continue... \nAbout to write .csv. Make sure to close the mcnemar results file if you have it open.")

    # Fix the column order once so the header and every row stay aligned,
    # and tolerate an empty stats mapping.
    first_row = next(iter(stats.values()), {})
    columns = list(first_row)

    # Format stats for excel:
    with open('experiment_results/all_mcnemar'+'.csv', 'w') as f:
        # Write headers:
        f.write(",".join([""] + columns) + '\n')
        # Write stats
        for rep_id in stats:
            row = stats[rep_id]
            cells = [str(rep_id)] + [str(row[column]) for column in columns]
            f.write(",".join(cells) + '\n')


# Script entry point: runs the whole comparison with the default
# null_comparison threshold whenever this module is executed or imported.
mcnemarAll()