-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsvsplit.py
More file actions
71 lines (56 loc) · 1.84 KB
/
Copy pathcsvsplit.py
File metadata and controls
71 lines (56 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import sys
import time
import logging
MB = 1024 * 1024
MB100 = 100 * MB


def iterchunk(f, block_size=MB100):
    """Group the lines of ``f`` into lists of roughly ``block_size`` length.

    Lines are accumulated while summing ``len(line)``; as soon as the running
    total exceeds ``block_size`` the accumulated lines are yielded and a new
    group is started.  Lines are never split, so a full chunk is always a
    little larger than ``block_size``.

    Args:
        f: Any iterable of strings, e.g. an open text file.
        block_size: Threshold (in ``len()`` units of the lines) after which
            the current chunk is emitted.  Defaults to ``MB100``.

    Yields:
        Non-empty lists of consecutive lines, in input order.
    """
    lines = []
    size = 0
    for line in f:
        size += len(line)
        lines.append(line)
        if size > block_size:
            yield lines
            lines = []
            size = 0
    # Emit the leftover lines only when there are some: empty input, or input
    # whose last line completed a chunk, must not produce an empty chunk.
    if lines:
        yield lines
def iterprefix(cap):
    """Yield zero-padded decimal counters for the numbers ``0`` .. ``cap - 1``.

    Every prefix is padded with leading zeros to the number of digits in
    ``cap`` itself, so all prefixes share one width (e.g. ``cap=100`` gives
    "000", "001", ..., "099").

    Args:
        cap: Exclusive upper bound of the counter; ``cap`` prefixes are
            produced in total.

    Yields:
        str: The next counter value, left-padded with zeros.
    """
    width = len(str(cap))
    # ``range`` rather than the Python 2-only ``xrange`` so the generator runs
    # on both Python 2 and Python 3.
    for num in range(cap):
        yield str(num).zfill(width)
if __name__ == "__main__":
    # Command-line entry point:
    #   <prog> <csvfile> [<outdir>] [<block-size>]
    # Splits <csvfile> into files named "<prefix>-split.csv" inside <outdir>,
    # each starting with the source file's first line (the header row).
    prog = os.path.basename(__file__)
    usage = '{} <csvfile> [<outdir>] [<block-size>]'.format(prog)

    # usage check
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    # set up logging
    logging.basicConfig(level=logging.INFO)

    # parse CL args
    csvfile = sys.argv[1]
    dirname = os.path.dirname(__file__) if len(sys.argv) < 3 else sys.argv[2]
    outdir = os.path.abspath(dirname)
    if len(sys.argv) < 4:
        block_size = MB100
    else:
        # argv values are strings; the chunker compares the block size against
        # an integer running total, so it has to be converted here.
        try:
            block_size = int(sys.argv[3])
        except ValueError:
            block_size = 0
        if block_size <= 0:
            print(usage)
            sys.exit(1)

    # set up file prefix calculation
    fsize = os.path.getsize(csvfile)
    # Upper bound on the number of output files: every chunk except the last
    # is larger than block_size, so there can be at most fsize // block_size
    # full chunks plus one remainder.  Rounding the ratio instead can come out
    # too low (even 0 for small files) and exhaust the prefix generator.
    num_files = fsize // block_size + 1
    prefixgen = iterprefix(num_files)

    # write the csv file splits
    with open(csvfile, 'r') as source:
        headers = source.readline()
        logging.info('output directory: {}'.format(outdir))
        for chunk in iterchunk(source, block_size):
            if not chunk:
                # nothing but the header would be written; skip it
                continue
            prefix = next(prefixgen)
            fname = '{}-split.csv'.format(prefix)
            path = os.path.join(outdir, fname)
            logging.info('writing ~{} bytes to: {}'.format(block_size, fname))
            with open(path, 'w') as dest:
                dest.write(headers)
                dest.writelines(chunk)
    sys.exit(0)