from time import time
from hashlib import md5
from pathlib import Path
- from datetime import datetime
from os import system as shell
from operator import itemgetter
from sys import path as sys_path
from ipaddress import ip_network
+ from datetime import datetime, timedelta

sys_path.append(str(Path(__file__).parent.parent.parent))

from riskdb.config import NET_SIZE
- from riskdb.archiver.config import REPO_ARCHIVE, HEADERS_ARCHIVE_CSV, ARCHIVE_DEDUPE_FIELDS
from riskdb.builder.util import log
from riskdb.builder.load_reports import FileLoader
from riskdb.archiver.util import git_commit_and_push, git_clone, git_check_token
+ from riskdb.archiver.config import REPO_ARCHIVE, HEADERS_ARCHIVE_CSV, ARCHIVE_DEDUPE_FIELDS, ARCHIVE_START_DATE


- # NOTE: de-duplicating raw-report values to make the archive more compact
- def _reports_by_day(tmp_dir: str) -> dict[list[dict]]:
-     reports = {}
-     tmp_dir_dedupe = f'{tmp_dir}/dedupe'
-     dedupe_map = {k: [] for k in ARCHIVE_DEDUPE_FIELDS}
-     shell(f'mkdir -p {tmp_dir_dedupe}')
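+ # Generates the archive CSV for one day and returns the updated dedupe map,
+ # which is threaded through successive calls so indices stay globally consistent.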
+ def _generate_archive_for_day(date: datetime, dedupe_map: dict, tmp_dir: Path) -> dict:
+     reports = []
+     for r in FileLoader(sliding_window=False, match_date=date):
+         rdate = datetime.fromtimestamp(r['time'])
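+         # FileLoader may also yield reports from outside the requested day,
+         # so keep only reports whose timestamp matches `date` exactly.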
+         if rdate.year != date.year or rdate.month != date.month or rdate.day != date.day:
+             continue

-     for r in FileLoader():
        for k, v in r.items():
            if v is None:
                r[k] = ''
@@ -46,25 +45,19 @@ def _reports_by_day(tmp_dir: str) -> dict[list[dict]]:
        if 'v' in r:
            r.pop('v')

-         day = datetime.fromtimestamp(r['time']).strftime('%Y_%m_%d')
-         if day not in reports:
-             reports[day] = []
-
        if 'an' not in r:
            r['an'] = ''

-         if 'fp' not in r:
-             r['fp'] = ''
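+         # Anonymize the reporter IP: truncate it to the configured NET_SIZE
+         # network address and blank loopback/unspecified addresses.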
+         if r['by'] != '':
+             if r['by'].find(':') != -1:
+                 cidr = NET_SIZE['6']

-         if r['by'].find(':') != -1:
-             cidr = NET_SIZE['6']
+             else:
+                 cidr = NET_SIZE['4']

-         else:
-             cidr = NET_SIZE['4']
-
-         r['by'] = str(ip_network(f"{r['by']}/{cidr}", strict=False)).split('/', 1)[0]
-         if r['by'] in ['::', '::1', '127.0.0.0']:
-             r['by'] = ''
+             r['by'] = str(ip_network(f"{r['by']}/{cidr}", strict=False)).split('/', 1)[0]
+             if r['by'] in ['::', '::1', '127.0.0.0']:
+                 r['by'] = ''

        for k in ARCHIVE_DEDUPE_FIELDS:
            if r[k] == '':
@@ -77,47 +70,60 @@ def _reports_by_day(tmp_dir: str) -> dict[list[dict]]:

            r[k] = dedupe_map[k].index(r[k])

-         reports[day].append(r)
+         reports.append(r)

-     for k in ARCHIVE_DEDUPE_FIELDS:
-         with open(f'{tmp_dir_dedupe}/field_{k}.csv', 'w', encoding='utf-8') as f:
-             f.write('Key,Value\n')
-             f.write('\n'.join([f'{i},{v}' for i, v in enumerate(dedupe_map[k])]))
+     reports = sorted(reports, key=itemgetter('time'))
+
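+     # Nothing reported on this day: return early so no empty CSV is created.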
+     if len(reports) == 0:
+         return dedupe_map

-     for day in reports:
-         reports[day] = sorted(reports[day], key=itemgetter('time'))
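+     # Write the day's reports to <tmp_dir>/YYYY/MM/YYYY_MM_DD.csv.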
+     y = date.strftime('%Y')
+     m = date.strftime('%m')
+     d = date.strftime('%d')
+     tmp_dir_mon = tmp_dir / y / m
+     shell(f'mkdir -p {tmp_dir_mon}')
+     with open(f'{tmp_dir_mon}/{y}_{m}_{d}.csv', 'w', encoding='utf-8') as f:
+         f.write(f"{','.join(HEADERS_ARCHIVE_CSV)}\n")
+         for r in reports:
+             f.write(
+                 f"{r['time']},"
+                 f"{r['ip']},{r['an']},{r['cat']},{r['cmt']},"
+                 f"{r['by']},{r['user']}\n"
+             )

-     return reports
+     return dedupe_map


- def _write_reports(reports: dict[list[dict]], tmp_dir: str):
-     for y_m_d in reports:
-         y, m, _ = y_m_d.split('_')
-         tmp_dir_mon = f'{tmp_dir}/{y}/{m}'
-         shell(f'mkdir -p {tmp_dir_mon}')
-         with open(f'{tmp_dir_mon}/{y_m_d}.csv', 'w', encoding='utf-8') as f:
-             f.write(f"{','.join(HEADERS_ARCHIVE_CSV)}\n")
-             for r in reports[y_m_d]:
-                 f.write(
-                     f"{r['time']},"
-                     f"{r['ip']},{r['an']},{r['cat']},{r['cmt']},"
-                     f"{r['by']},{r['user']},{r['fp']}\n"
-                 )
+ # TODO: multi-threading
+ def _generate_archive(tmp_dir: Path):
+     today = datetime.now()
+     date = ARCHIVE_START_DATE
+     dedupe_map = {k: [] for k in ARCHIVE_DEDUPE_FIELDS}
+
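+     # Walk one day at a time from ARCHIVE_START_DATE (assumed to be a datetime,
+     # per the annotation on _generate_archive_for_day) through today.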
+     while date.date() <= today.date():
+         log(f"Generating archive for day: {date.strftime('%Y-%m-%d')}")
+         dedupe_map = _generate_archive_for_day(date=date, dedupe_map=dedupe_map, tmp_dir=tmp_dir)
+         date += timedelta(days=1)
+
+     log('Writing dedupe-maps')
+     tmp_dir_dedupe = tmp_dir / 'dedupe'
+     shell(f'mkdir -p {tmp_dir_dedupe}')
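+     # Persist each dedupe map as Key,Value rows: the archive CSVs store the
+     # integer keys, and these files map them back to the original values.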
+     for k in ARCHIVE_DEDUPE_FIELDS:
+         with open(f'{tmp_dir_dedupe}/field_{k}.csv', 'w', encoding='utf-8') as f:
+             f.write('Key,Value\n')
+             f.write('\n'.join([f'{i},{v}' for i, v in enumerate(dedupe_map[k])]))

    git_commit_and_push(user='Report Updater', cmt='Report updates', repo=REPO_ARCHIVE, tmp_dir=tmp_dir)


def main():
    log('Prepare Repository')
    git_check_token()
-     tmp_dir = f'/tmp/risk_db_archive_{int(time())}'
+     tmp_dir = Path(f'/tmp/risk_db_archive_{int(time())}')
    git_clone(repo=REPO_ARCHIVE, tmp_dir=tmp_dir)

-     log('Loading & Sorting Reports by Day')
-     reports_by_day = _reports_by_day(tmp_dir)
-
-     log('Write Reports')
-     _write_reports(reports_by_day, tmp_dir)
+     _generate_archive(tmp_dir)


if __name__ == '__main__':