-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathcsv_file_splitter.py
More file actions
58 lines (47 loc) · 1.98 KB
/
csv_file_splitter.py
File metadata and controls
58 lines (47 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import io
import os
class CsvFileSplitter(io.StringIO):
    """Helper for writing big data into multiple files split by size.

    Expects data written in CSV format (first line is header).
    Could be called from function decorated by @register (see Collector).

    Data is written to files named ``<filespec>_split<N>``. Once the amount
    written to the current file reaches ``max_file_size`` a new file is
    started and the remembered CSV header is repeated at its top.

    NOTE: the size counter accumulates the values returned by the text
    file's ``write()`` (characters), so files containing multi-byte UTF-8
    characters can end up larger on disk than ``max_file_size``.

    :param filespec: path prefix used to build the names of the output files
    :param max_file_size: only passed by the library, could come from package max_data_size
    :param args: accepted for call compatibility, ignored
    :param kwargs: accepted for call compatibility, ignored
    """

    # Suffix carried by the first output file; stripped again by file_list()
    # when that file turns out to be the only one.
    _FIRST_SUFFIX = '_split0'

    def __init__(self, filespec=None, max_file_size=200 * 1048576, *args, **kwargs):
        # Initialise the StringIO base so inherited methods operate on a
        # properly set-up object. The extra args are deliberately not forwarded.
        super().__init__()
        self.max_file_size = max_file_size
        self.filespec = filespec
        self.files = []          # names of all files created so far, in order
        self.currentfile = None  # file object currently being written to
        self.header = None       # first CSV line (without '\n'), set on first write
        self.counter = 0         # characters written to the current file
        self._finalized = False  # set once file_list() has done its clean-up
        self.cycle_file()

    def cycle_file(self):
        """Closes current file, opens new one and writes CSV header"""
        if self.currentfile:
            self.currentfile.close()
        self.counter = 0
        fname = '{}_split{}'.format(self.filespec, len(self.files))
        self.currentfile = open(fname, 'w', encoding='utf-8')
        self.files.append(fname)
        if self.header:
            self.counter += self.currentfile.write('{}\n'.format(self.header))

    def file_list(self, keep_empty=False):
        """Closes the current file and returns list of written files

        The clean-up (dropping a trailing file that holds no data rows and
        removing the ``_split0`` suffix from a lone file) is performed only
        on the first call; later calls return the same list without touching
        the filesystem again.

        :param keep_empty: when true, keep the last file even if it contains
            nothing but the header (or nothing at all)
        :return: list of file names that remain on disk
        """
        self.currentfile.close()
        if self._finalized:
            # Re-running the clean-up would delete or rename real data files.
            return self.files
        self._finalized = True

        # Check for an empty dump: either nothing was ever written (no header
        # seen yet) or the current file contains just the header line.
        only_header = self.header is None or len(self.header) + 1 == self.counter
        if not keep_empty and only_header:
            os.remove(self.files.pop())

        # If we only have one file, remove the suffix. Only the trailing
        # suffix is stripped so a '_split0' elsewhere in the path is untouched.
        if len(self.files) == 1 and self.files[0].endswith(self._FIRST_SUFFIX):
            filename = self.files[0]
            new_filename = filename[:-len(self._FIRST_SUFFIX)]
            os.rename(filename, new_filename)
            self.files[0] = new_filename
        return self.files

    def write(self, s):
        """Writes to file and creates new one if file exceeds threshold

        The first string written must contain a newline; everything before
        it is remembered as the CSV header for subsequent files.

        :param s: text to write
        :return: number of characters written, as reported by the file
        :raises ValueError: if no header is known yet and ``s`` has no newline
        """
        if not self.header:
            self.header = s[: s.index('\n')]
        written = self.currentfile.write(s)
        self.counter += written
        if self.counter >= self.max_file_size:
            self.cycle_file()
        return written