-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
63 lines (48 loc) · 1.69 KB
/
preprocess.py
File metadata and controls
63 lines (48 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""preprocess.py"""
from __future__ import division
import csv
import collections
import itertools
import numpy as np
def load_data(fname):
    """Read a csv file of votes and group the votes by (item, label).

    The csv file must start with a header row naming these fields:
        - item (str): item name
        - label (str): label name
        - selected (int): vote, converted with ``bool(int(...))``

    Args:
        fname: path of the csv file to read.

    Returns:
        A tuple ``(items, labels, votes)``:
        - items ([str]): distinct item names, in order of first appearance
        - labels ([str]): distinct label names, in order of first appearance
        - votes (defaultdict): (item name, label name) -> votes (list: bool),
          each list in file order

    Raises:
        KeyError: if the header lacks one of the required fields.
        ValueError: if a ``selected`` value cannot be parsed by ``int``.
    """
    # OrderedDict keys serve as an insertion-ordered set: the returned lists
    # no longer depend on string hashing, so they are identical on every run
    # (a plain set gave an arbitrary, run-dependent order).
    items = collections.OrderedDict()
    labels = collections.OrderedDict()
    votes = collections.defaultdict(list)
    with open(fname, 'r') as f:
        for row in csv.DictReader(f):
            item, label = row['item'], row['label']
            items[item] = None
            labels[label] = None
            votes[item, label].append(bool(int(row['selected'])))
    return list(items), list(labels), votes
class Data:
    """Votes loaded from a csv file, with helpers to reshape them.

    Attributes are filled from the three values returned by ``load_data``:
        items: item names.
        labels: label names.
        votes: mapping (item name, label name) -> list of bool votes.
    """

    def __init__(self, fin):
        """Load items, labels and votes from the csv file ``fin``."""
        items, labels, votes = load_data(fin)
        self.items = items
        self.labels = labels
        self.votes = votes

    def make_posneg(self):
        """Convert votes to matrix format.

        Returns: item name -> matrix (2 x |labels|) with the number of
        positive votes (first row) and negative votes (second row).
        Columns follow the order of ``self.labels``; an (item, label) pair
        with no recorded votes contributes 0 to both rows.
        """
        posneg = dict()
        for item in self.items:
            # .get() rather than indexing: self.votes may be a defaultdict,
            # and indexing a missing pair would insert an empty list into it,
            # so a read-only conversion would silently grow the vote table.
            cells = [self.votes.get((item, label), ())
                     for label in self.labels]
            pos = np.array([sum(cell) for cell in cells])
            neg = np.array([len(cell) for cell in cells]) - pos
            posneg[item] = np.vstack((pos, neg))
        return posneg

    def __repr__(self):
        """Show the items, the labels and one sample entry of the votes."""
        # dict views cannot be indexed on Python 3, so take the first entry
        # through an iterator; None stands in when there are no votes at all.
        first = next(iter(self.votes.items()), None)
        return '%s, %s\nitem1: %s' % (self.items, self.labels, first)