-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlog_compiler.py
More file actions
124 lines (110 loc) · 4.86 KB
/
log_compiler.py
File metadata and controls
124 lines (110 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import time
import anyjson
import re
import csv
import operator
import os
# Directory holding the sanitized copies of the raw logs.
log_dir = "sanitized_log"
auth_log = log_dir + "/auth.log"
web_access_log = log_dir + "/apache2/www-access.log"
# NOTE(review): identical to web_access_log, so the same file is parsed twice
# by the driver below -- presumably this should point at a separate media log;
# confirm the intended path.
media_access_log = log_dir + "/apache2/www-access.log"
# connections within this many seconds are grouped together
bundle_window = 60*60
# Flat list of bundled connection records; appended to by the driver in
# __main__ and scanned backwards by bundle_if_possible().
log_data = []
# NOTE(review): unused in the code shown here -- possibly leftover state.
conn_tracker = {}
def parse_auth_log(auth_log, year=2010):
    """Yield connection records parsed from a syslog-style auth log.

    Each sshd password-auth line ("Failed/Accepted password for USER from
    IP port N ssh2") becomes a dict with src/dst/dport/valid/time/... keys,
    unless bundle_if_possible() merges it into an earlier record in the
    global log_data list, in which case nothing is yielded for that line.

    auth_log -- path to the auth.log file.
    year     -- year to assume for the year-less syslog timestamps
                (default 2010, matching the sanitized data set).
    """
    # Compile once, outside the per-line loop.  Raw strings: the original
    # non-raw patterns relied on Python passing unknown escapes through.
    # Syslog prefix: "Apr 19 08:30:12 host sshd[1234]: message"
    syslog_re = re.compile(r"(\w+ ?\d+ \d+:\d+:\d+) (\S+) (\w+)\[(\d+)\]: (.*)")
    sshd_re = re.compile(r"(\w+) password for (\w+) from ([\d.]+) port \d+ ssh2")
    with open(auth_log, 'r') as f:
        for line in f:
            m = syslog_re.search(line)
            if not m:
                continue
            conn = {}
            (conn["time"], conn["origtime"]) = ssh_log_time_to_epoch(m.group(1), year)
            conn["dst"] = m.group(2)
            conn["pname"] = m.group(3)
            conn["pid"] = m.group(4)
            conn["data"] = m.group(5)
            if conn["pname"] != "sshd":
                continue
            sshm = sshd_re.search(conn["data"])
            if not sshm:
                continue
            conn["dport"] = 22  # sshd; port is not in the message prefix
            conn["result"] = sshm.group(1)
            conn["user"] = sshm.group(2)
            conn["src"] = sshm.group(3)
            # "Failed password" -> invalid; anything else (Accepted) -> valid.
            conn["valid"] = conn["result"] != "Failed"
            if not bundle_if_possible(conn):
                yield {'src': conn["src"],
                       'dst': conn["dst"],
                       'dport': conn["dport"],
                       'valid': conn["valid"],
                       'time': conn["time"],
                       'origtime': conn["origtime"],
                       'last_time': conn["time"],
                       'num_conns': 1,
                       'user': conn['user']}
def parse_web_access_log(web_log):
    """Yield connection records parsed from an Apache combined access log.

    Each row becomes a dict with src/dst/dport/valid/time/httpreq keys,
    unless bundle_if_possible() merges it into an earlier record in the
    global log_data list, in which case nothing is yielded for that row.

    web_log -- path to the access log file.
    """
    # Example row:
    # 10.0.1.2 - - [19/Apr/2010:08:30:12 -0700] "GET /feed/ HTTP/1.1" 200 16605 "-" "Apple-PubSub/65.12.1" oxOvcAoAAQ4AAEY@W5kAAAAB 4159446
    fields = {'src': 0, 'ident': 1, 'uname': 2, 'date': 3, 'tzone': 4,
              'req': 5, 'status': 6, 'bytes': 7, 'referer': 8,
              'agent': 9, 'recv': 10}
    # Context manager: the original leaked the file handle by passing an
    # anonymous open() straight to csv.reader.  'rb' is the Python 2 csv
    # convention this file uses throughout.
    with open(web_log, 'rb') as logfile:
        for row in csv.reader(logfile, delimiter=' ', quotechar='"'):
            conn = {}
            # [1:] chops off the leading '[' of "[19/Apr/2010:08:30:12".
            conn["time"] = apache_log_time_to_epoch(row[fields['date']][1:])
            conn["dst"] = "app-1"  # not in the log file, assuming same box
            conn["dport"] = 80
            conn["valid"] = get_http_code_validity(row[fields['status']])
            conn["src"] = row[fields['src']]
            conn["req"] = row[fields['req']]
            if not bundle_if_possible(conn):
                yield {'src': conn["src"],
                       'dst': conn["dst"],
                       'dport': conn["dport"],
                       'valid': conn["valid"],
                       'time': conn["time"],
                       'last_time': conn["time"],
                       'httpreq': conn['req'],
                       'num_conns': 1}
def bundle_if_possible(conn):
    """Try to merge *conn* into a recent matching record in global log_data.

    Scans log_data newest-to-oldest.  If a record with the same
    src/dst/dport/valid is found whose time is within bundle_window seconds
    of conn's, that record's last_time is updated and its num_conns is
    incremented, and True is returned.  Returns False (falsy, matching the
    original None) when nothing could be merged.

    Assumes records already in log_data are in non-decreasing time order,
    which is why the scan can stop at the first record outside the window.
    """
    # Walk backwards including index 0: the original
    # xrange(len - 1, 0, -1) stopped at 1, so the very first record could
    # never be bundled into (off-by-one fix).
    for i in range(len(log_data) - 1, -1, -1):
        entry = log_data[i]
        if conn["time"] >= entry["time"] + bundle_window:
            # This record -- and every older one -- is outside the window.
            return False
        if (conn["src"] == entry["src"] and
                conn["dst"] == entry["dst"] and
                conn["dport"] == entry["dport"] and
                conn["valid"] == entry["valid"]):
            entry["last_time"] = conn["time"]
            entry["num_conns"] += 1
            return True
    return False
def get_http_code_validity(code):
    """Return True when *code* (str or int) is a 2xx/3xx HTTP status."""
    status = int(code)
    return 200 <= status < 400
def apache_log_time_to_epoch(dtime):
    """Convert an Apache CLF timestamp (e.g. "19/Apr/2010:08:30:12",
    timezone offset already stripped) to integer epoch seconds, interpreted
    in the local timezone via time.mktime."""
    parsed = time.strptime(dtime, "%d/%b/%Y:%H:%M:%S")
    return int(time.mktime(parsed))
def ssh_log_time_to_epoch(dtime, year):
    """Convert a year-less syslog timestamp (e.g. "Apr 19 08:30:12") plus an
    assumed *year* to epoch seconds (local timezone via time.mktime).

    Returns a (epoch_seconds, original_timestamp_string) tuple.
    """
    stamped = "%s %s" % (year, dtime)
    parsed = time.strptime(stamped, "%Y %b %d %H:%M:%S")
    return (int(time.mktime(parsed)), dtime)
if __name__ == "__main__":
    # Drive each parser; records that bundle_if_possible() could not merge
    # into an earlier entry are yielded and appended here.  Appending happens
    # between yields, so each parser's later lines can bundle into its own
    # earlier records.
    for res in parse_auth_log(auth_log):
        log_data.append(res)
    # NOTE(review): media_access_log and web_access_log are defined above as
    # the same path, so this parses the same file twice -- confirm intended.
    for res in parse_web_access_log(media_access_log):
        log_data.append(res)
    for res in parse_web_access_log(web_access_log):
        log_data.append(res)
    # Emit all records in chronological order as a JSON assignment
    # ("traffic = [...]"); Python 2 print statement.
    log_data.sort(key=lambda row: row["time"])
    print "traffic = " + anyjson.serialize(log_data)