higgsml/HiggsBosonCompetition_AMSMetric_rev1.py at master · bourcierj/higgsml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
"""
Evaluation metric for the Higgs Boson Kaggle Competition,
as described on:
https://www.kaggle.com/c/higgs-boson/details/evaluation

@author: Joyce Noah-Vanhoukce
Created: Thu Apr 24 2014
"""
import os
import csv
import math


def create_solution_dictionary(solution):
    """ Load the solution CSV into a lookup table keyed by EventId.

    The first row of the file is treated as a header (EventId, Label, Weight)
    and skipped. Every following row is stored as
    first column -> (second column, third column), i.e. EventId -> (label, weight),
    with all values kept as the raw strings read from the file.
    When an EventId occurs more than once, only its first row is kept. """

    table = {}
    with open(solution, 'r') as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the header line
        for record in rows:
            event_id = record[0]
            if event_id in table:
                continue  # first occurrence wins
            table[event_id] = (record[1], record[2])
    return table


def check_submission(submission, Nelements):
    """ Check that submission RankOrder column is correct:
        1. All numbers are in [1,Nelements]
        2. All numbers are unique

    submission: path to the submission CSV; its first row is a header and the
                RankOrder value is read from the second column of each row.
    Nelements:  number of events the submission is expected to rank.

    Returns True when the column is a permutation of 1..Nelements.
    Otherwise prints the reason and raises SystemExit with status 1.
    """
    rankOrderSet = set()
    with open(submission, 'r') as f:
        sub = csv.reader(f)
        next(sub)  # header
        for row in sub:
            rankOrderSet.add(row[1])

    # A distinct-value count different from Nelements means duplicated ranks
    # (or a submission with the wrong number of rows).
    if len(rankOrderSet) != Nelements:
        print('RankOrder column must contain unique values')
        raise SystemExit(1)

    # The csv module yields strings, so the ranks must be converted before they
    # can be compared with integers; anything that is not an integer is invalid.
    try:
        ranks = {int(value) for value in rankOrderSet}
    except ValueError:
        ranks = None

    # Nelements distinct integers that all lie in [1, Nelements] are necessarily
    # exactly the numbers 1..Nelements, so no reference set has to be built.
    valid = (
        ranks is not None
        and len(ranks) == Nelements
        and all(1 <= rank <= Nelements for rank in ranks)
    )
    if not valid:
        print('RankOrder column must contain all numbers from [1..NTestSset]')
        raise SystemExit(1)
    return True


def AMS(s, b):
    """ Approximate Median Significance defined as:
        AMS = sqrt(
                2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s}
              )
    where b_r = 10, b = background, s = signal, log is natural logarithm

    Returns the AMS value as a float.
    If the radicand is negative because b + b_r is negative, prints a message
    and raises SystemExit with status 1.
    """

    br = 10.0
    # log1p(x) evaluates log(1 + x) without the precision loss that forming
    # 1 + x causes when the signal is tiny compared with b + br.
    radicand = 2 * ((s + b + br) * math.log1p(s / (b + br)) - s)
    if radicand < 0:
        if b + br > 0:
            # With a positive b + br the exact expression is never negative,
            # so a negative value here is floating-point rounding around zero.
            return 0.0
        print('radicand is negative. Exiting')
        raise SystemExit(1)
    return math.sqrt(radicand)


def AMS_metric(solution, submission, numEvents=550000):
    """  Prints the AMS metric value to screen.
    Solution File header: EventId, Class, Weight
    Submission File header: EventId, RankOrder, Class

    solution:   path to the solution CSV.
    submission: path to the submission CSV.
    numEvents:  number of events the submission must rank; defaults to
                550000, the size of the test set.

    Only rows predicted as signal ('s' in the third submission column) are
    scored: the solution weight of each such event is added to the signal
    total when its true label is 's' and to the background total when it is 'b'.
    Raises KeyError if the submission contains an EventId missing from the
    solution file. Returns None.
    """

    # solutionDict: key=EventId, value=(label, weight)
    solutionDict = create_solution_dictionary(solution)

    if not check_submission(submission, numEvents):
        return

    signal = 0.0
    background = 0.0
    with open(submission, 'r') as f:
        sub = csv.reader(f)
        next(sub)  # header row
        for row in sub:
            if row[2] != 's':  # only events predicted to be signal are scored
                continue
            label, weight = solutionDict[row[0]]
            if label == 's':
                signal += float(weight)
            elif label == 'b':
                background += float(weight)

    print('signal = {0}, background = {1}'.format(signal, background))
    print('AMS = ' + str(AMS(signal, background)))


import pandas as pd

def build_solution():
    """Write the solution CSV (columns EventId, Class, Weight).

    Reads "data/atlas-higgs-challenge-2014-v2.csv", keeps the rows whose
    KaggleSet value is 'b' or 'v' (presumably the Kaggle public and private
    leaderboard subsets, which together form the test set -- confirm against
    the dataset documentation), and asserts that exactly 550000 rows remain.
    The original Weight column is dropped, KaggleWeight is renamed to Weight
    and Label to Class, and the three output columns are written without the
    index to 'data/solution_from_cern.csv'.
    """
    full = pd.read_csv("data/atlas-higgs-challenge-2014-v2.csv", sep=',')

    in_test_set = full['KaggleSet'].isin(('b', 'v'))
    test_rows = full[in_test_set]
    assert(len(test_rows) == 550_000)

    # Replace the original weights by the Kaggle ones and use the column
    # names expected in the solution file.
    solution = (
        test_rows
        .drop('Weight', axis='columns')
        .rename(columns={'KaggleWeight': 'Weight', 'Label': 'Class'})
    )
    solution.to_csv(
        'data/solution_from_cern.csv',
        columns=['EventId', 'Class', 'Weight'],
        index=False,
    )


if __name__ == "__main__":

    # Check if test set from CERN and from Kaggle match: => Looks like they match

    # df = pd.read_csv("atlas-higgs-challenge-2014-v2.csv", sep=',')
    # dft1 = df[df['KaggleSet'].isin(('b', 'v'))]
    # assert(len(dft1) == 550_000)
    # dft1.drop('Weight', axis='columns')
    # dft1.rename(columns={'KaggleWeight': 'Weight'})
    # print(dft1.head(5))

    # dft2 = pd.read_csv("test.csv", sep=',')
    # print(dft2.head(5))

    # builds the solution file
    # build_solution()

    # enter path and file names here
    # solution_path = ''
    # submission_path = ''

    # AMS_metric(solution_path, submission_path)

    # Every step above is commented out. A suite made only of comments is a
    # syntax error, so `pass` keeps this block valid and the module importable.
    pass