@@ -17,14 +17,20 @@ import (
1717 "github.com/m-lab/etl/storage"
1818)
1919
20+ // Impose a max size for a single file (20MiB as coded; NOTE(review): this comment
21+ // previously said 200MiB - confirm which is intended). Larger files risk an OOM if
22+ // multiple tasks hold large files at once. Override with SetMaxFileSize().
23+ const MAX_FILE_SIZE = 20 * 1024 * 1024
24+
2025// TODO(dev) Add unit tests for meta data.
// Task pairs a source of tests with the parser that consumes them, along with
// per-task metadata and the per-file size limit handed to NextTest.
2126type Task struct {
2227 // ETLSource and Parser are both embedded, so their interfaces are delegated
2328 // to the component structs.
2429 * storage.ETLSource // Source from which to read tests.
2530 etl.Parser // Parser to parse the tests.
2631
27- meta map [string ]bigquery.Value // Metadata about this task.
32+ meta map [string ]bigquery.Value // Metadata about this task; NewTask sets "filename", "parse_time" and "attempt".
33+ maxFileSize int64 // Max file size to avoid OOM; passed to NextTest by ProcessAllTests.
2834}
2935
3036// NewTask constructs a task, injecting the source and the parser.
@@ -34,10 +40,14 @@ func NewTask(filename string, src *storage.ETLSource, prsr etl.Parser) *Task {
3440 meta ["filename" ] = filename
3541 meta ["parse_time" ] = time .Now ()
3642 meta ["attempt" ] = 1
37- t := Task {src , prsr , meta }
43+ t := Task {src , prsr , meta , MAX_FILE_SIZE }
3844 return & t
3945}
4046
// SetMaxFileSize overrides the per-file size limit for this task. NewTask
// initializes the limit to MAX_FILE_SIZE; ProcessAllTests passes the current
// value to NextTest on every call. The value is stored as given, with no
// validation.
47+ func (tt * Task ) SetMaxFileSize (max int64 ) {
48+ tt .maxFileSize = max
49+ }
50+
4151// ProcessAllTests loops through all the tests in a tar file, calls the
4252// injected parser to parse them, and inserts them into bigquery. Returns the
4353// number of files processed.
@@ -46,28 +56,45 @@ func (tt *Task) ProcessAllTests() (int, error) {
4656 defer metrics .WorkerState .WithLabelValues ("task" ).Dec ()
4757 files := 0
4858 nilData := 0
59+ var testname string
60+ var data []byte
61+ var err error
4962 // Read each file from the tar
50- for testname , data , err := tt .NextTest (); err != io .EOF ; testname , data , err = tt .NextTest () {
63+
64+ for testname , data , err = tt .NextTest (tt .maxFileSize ); err != io .EOF ; testname , data , err = tt .NextTest (tt .maxFileSize ) {
5165 files ++
5266 if err != nil {
53- if err == io .EOF {
67+ switch {
68+ case err == io .EOF :
5469 break
55- }
56- // We are seeing several of these per hour, a little more than
57- // one in one thousand files. duration varies from 10 seconds up to several
58- // minutes.
59- // Example:
60- // filename:gs://m-lab-sandbox/ndt/2016/04/10/20160410T000000Z-mlab1-ord02-ndt-0002.tgz
61- // files:666 duration:1m47.571825351s
62- // err:stream error: stream ID 801; INTERNAL_ERROR
63- // Because of the break, this error is passed up, and counted at the Task level.
64- log .Printf ("filename:%s testname:%s files:%d, duration:%v err:%v" ,
65- tt .meta ["filename" ], testname , files ,
66- time .Since (tt .meta ["parse_time" ].(time.Time )), err )
70+ case err == storage .OVERSIZE_FILE :
71+ log .Printf ("filename:%s testname:%s files:%d, duration:%v err:%v" ,
72+ tt .meta ["filename" ], testname , files ,
73+ time .Since (tt .meta ["parse_time" ].(time.Time )), err )
74+ metrics .TestCount .WithLabelValues (
75+ tt .Parser .TableName (), "unknown" , "oversize file" ).Inc ()
76+ continue
77+ default :
78+ // We are seeing several of these per hour, a little more than
79+ // one in one thousand files. duration varies from 10 seconds
80+ // up to several minutes.
81+ // Example:
82+ // filename:
83+ // gs://m-lab-sandbox/ndt/2016/04/10/20160410T000000Z-mlab1-ord02-ndt-0002.tgz
84+ // files:666 duration:1m47.571825351s
85+ // err:stream error: stream ID 801; INTERNAL_ERROR
86+ // Because of the break, this error is passed up, and counted at
87+ // the Task level.
88+ log .Printf ("filename:%s testname:%s files:%d, duration:%v err:%v" ,
89+ tt .meta ["filename" ], testname , files ,
90+ time .Since (tt .meta ["parse_time" ].(time.Time )), err )
6791
68- metrics .TestCount .WithLabelValues (
69- tt .Parser .TableName (), "unknown" , "unrecovered" ).Inc ()
70- break
92+ metrics .TestCount .WithLabelValues (
93+ tt .Parser .TableName (), "unknown" , "unrecovered" ).Inc ()
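94+ // Since we don't understand these errors, the safest thing is to stop processing the tar file (and task).
95+ // NOTE(review): a bare break inside a switch exits only the switch, not the enclosing for loop; a labeled break is needed to stop.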
96+ break
97+ }
7198 }
7299 if data == nil {
73100 // TODO(dev) Handle directories (expected) and other
@@ -89,14 +116,18 @@ func (tt *Task) ProcessAllTests() (int, error) {
89116 }
90117
91118 // Flush any rows cached in the inserter.
92- err := tt .Flush ()
119+ flushErr := tt .Flush ()
93120
94- if err != nil {
95- log .Printf ("%v" , err )
121+ if flushErr != nil {
122+ log .Printf ("%v" , flushErr )
96123 }
97124 // TODO - make this debug or remove
98125 log .Printf ("Processed %d files, %d nil data, %d rows committed, %d failed, from %s into %s" ,
99126 files , nilData , tt .Parser .Committed (), tt .Parser .Failed (),
100127 tt .meta ["filename" ], tt .Parser .FullTableName ())
101- return files , err
128+ // Return the file count, and the terminal error, if other than EOF.
129+ if err != io .EOF {
130+ return files , err
131+ }
132+ return files , nil
102133}
0 commit comments