Skip to content

Commit 97975e0

Browse files
authored
Merge pull request #195 from m-lab/max-size
Add max file size, and clean up error handling
2 parents 60f655d + 435495c commit 97975e0

File tree

3 files changed

+111
-48
lines changed

3 files changed

+111
-48
lines changed

storage/storage.go

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
storage "google.golang.org/api/storage/v1"
2727
)
2828

29+
var OVERSIZE_FILE = errors.New("Oversize file")
30+
2931
type TarReader interface {
3032
Next() (*tar.Header, error)
3133
Read(b []byte) (int, error)
@@ -106,8 +108,10 @@ func (rr *ETLSource) nextData(h *tar.Header, trial int) ([]byte, bool, error) {
106108
}
107109

108110
// Next reads the next test object from the tar file.
111+
// Skips reading contents of any file larger than maxSize, returning empty data
112+
// and storage.OVERSIZE_FILE error.
109113
// Returns io.EOF when there are no more tests.
110-
func (rr *ETLSource) NextTest() (string, []byte, error) {
114+
func (rr *ETLSource) NextTest(maxSize int64) (string, []byte, error) {
111115
metrics.WorkerState.WithLabelValues("read").Inc()
112116
defer metrics.WorkerState.WithLabelValues("read").Dec()
113117

@@ -136,28 +140,33 @@ func (rr *ETLSource) NextTest() (string, []byte, error) {
136140
time.Sleep(delay)
137141
}
138142

143+
if h.Size > maxSize {
144+
return h.Name, data, OVERSIZE_FILE
145+
}
146+
139147
// Only process regular files.
140-
if h.Typeflag == tar.TypeReg {
141-
trial = 0
142-
delay = 16 * time.Millisecond
143-
for {
144-
trial++
145-
var retry bool
146-
data, retry, err = rr.nextData(h, trial)
147-
if err == nil {
148-
break
149-
}
150-
if !retry || trial >= 10 {
151-
// FYI, it appears that stream errors start in the
152-
// nextData phase of reading, but then persist on
153-
// the next call to nextHeader.
154-
break
155-
}
156-
// For each trial, increase backoff delay by 2x.
157-
delay *= 2
158-
time.Sleep(delay)
148+
if h.Typeflag != tar.TypeReg {
149+
return h.Name, data, nil
150+
}
159151

152+
trial = 0
153+
delay = 16 * time.Millisecond
154+
for {
155+
trial++
156+
var retry bool
157+
data, retry, err = rr.nextData(h, trial)
158+
if err == nil {
159+
break
160+
}
161+
if !retry || trial >= 10 {
162+
// FYI, it appears that stream errors start in the
163+
// nextData phase of reading, but then persist on
164+
// the next call to nextHeader.
165+
break
160166
}
167+
// For each trial, increase backoff delay by 2x.
168+
delay *= 2
169+
time.Sleep(delay)
161170
}
162171

163172
return h.Name, data, nil

task/task.go

Lines changed: 54 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,20 @@ import (
1717
"github.com/m-lab/etl/storage"
1818
)
1919

20+
// Impose 20MiB max size for a single file. Larger than this risks an OOM if there are
21+
// multiple large files on multiple concurrent tasks.
22+
// This can be overridden with SetMaxFileSize()
23+
const MAX_FILE_SIZE = 20 * 1024 * 1024
24+
2025
// TODO(dev) Add unit tests for meta data.
2126
type Task struct {
2227
// ETLSource and Parser are both embedded, so their interfaces are delegated
2328
// to the component structs.
2429
*storage.ETLSource // Source from which to read tests.
2530
etl.Parser // Parser to parse the tests.
2631

27-
meta map[string]bigquery.Value // Metadata about this task.
32+
meta map[string]bigquery.Value // Metadata about this task.
33+
maxFileSize int64 // Max file size to avoid OOM.
2834
}
2935

3036
// NewTask constructs a task, injecting the source and the parser.
@@ -34,10 +40,14 @@ func NewTask(filename string, src *storage.ETLSource, prsr etl.Parser) *Task {
3440
meta["filename"] = filename
3541
meta["parse_time"] = time.Now()
3642
meta["attempt"] = 1
37-
t := Task{src, prsr, meta}
43+
t := Task{src, prsr, meta, MAX_FILE_SIZE}
3844
return &t
3945
}
4046

47+
func (tt *Task) SetMaxFileSize(max int64) {
48+
tt.maxFileSize = max
49+
}
50+
4151
// ProcessAllTests loops through all the tests in a tar file, calls the
4252
// injected parser to parse them, and inserts them into bigquery. Returns the
4353
// number of files processed.
@@ -46,28 +56,45 @@ func (tt *Task) ProcessAllTests() (int, error) {
4656
defer metrics.WorkerState.WithLabelValues("task").Dec()
4757
files := 0
4858
nilData := 0
59+
var testname string
60+
var data []byte
61+
var err error
4962
// Read each file from the tar
50-
for testname, data, err := tt.NextTest(); err != io.EOF; testname, data, err = tt.NextTest() {
63+
64+
for testname, data, err = tt.NextTest(tt.maxFileSize); err != io.EOF; testname, data, err = tt.NextTest(tt.maxFileSize) {
5165
files++
5266
if err != nil {
53-
if err == io.EOF {
67+
switch {
68+
case err == io.EOF:
5469
break
55-
}
56-
// We are seeing several of these per hour, a little more than
57-
// one in one thousand files. duration varies from 10 seconds up to several
58-
// minutes.
59-
// Example:
60-
// filename:gs://m-lab-sandbox/ndt/2016/04/10/20160410T000000Z-mlab1-ord02-ndt-0002.tgz
61-
// files:666 duration:1m47.571825351s
62-
// err:stream error: stream ID 801; INTERNAL_ERROR
63-
// Because of the break, this error is passed up, and counted at the Task level.
64-
log.Printf("filename:%s testname:%s files:%d, duration:%v err:%v",
65-
tt.meta["filename"], testname, files,
66-
time.Since(tt.meta["parse_time"].(time.Time)), err)
70+
case err == storage.OVERSIZE_FILE:
71+
log.Printf("filename:%s testname:%s files:%d, duration:%v err:%v",
72+
tt.meta["filename"], testname, files,
73+
time.Since(tt.meta["parse_time"].(time.Time)), err)
74+
metrics.TestCount.WithLabelValues(
75+
tt.Parser.TableName(), "unknown", "oversize file").Inc()
76+
continue
77+
default:
78+
// We are seeing several of these per hour, a little more than
79+
// one in one thousand files. duration varies from 10 seconds
80+
// up to several minutes.
81+
// Example:
82+
// filename:
83+
// gs://m-lab-sandbox/ndt/2016/04/10/20160410T000000Z-mlab1-ord02-ndt-0002.tgz
84+
// files:666 duration:1m47.571825351s
85+
// err:stream error: stream ID 801; INTERNAL_ERROR
86+
// Because of the break, this error is passed up, and counted at
87+
// the Task level.
88+
log.Printf("filename:%s testname:%s files:%d, duration:%v err:%v",
89+
tt.meta["filename"], testname, files,
90+
time.Since(tt.meta["parse_time"].(time.Time)), err)
6791

68-
metrics.TestCount.WithLabelValues(
69-
tt.Parser.TableName(), "unknown", "unrecovered").Inc()
70-
break
92+
metrics.TestCount.WithLabelValues(
93+
tt.Parser.TableName(), "unknown", "unrecovered").Inc()
94+
// Since we don't understand these errors, safest thing to do is
95+
// stop processing the tar file (and task).
96+
break
97+
}
7198
}
7299
if data == nil {
73100
// TODO(dev) Handle directories (expected) and other
@@ -89,14 +116,18 @@ func (tt *Task) ProcessAllTests() (int, error) {
89116
}
90117

91118
// Flush any rows cached in the inserter.
92-
err := tt.Flush()
119+
flushErr := tt.Flush()
93120

94-
if err != nil {
95-
log.Printf("%v", err)
121+
if flushErr != nil {
122+
log.Printf("%v", flushErr)
96123
}
97124
// TODO - make this debug or remove
98125
log.Printf("Processed %d files, %d nil data, %d rows committed, %d failed, from %s into %s",
99126
files, nilData, tt.Parser.Committed(), tt.Parser.Failed(),
100127
tt.meta["filename"], tt.Parser.FullTableName())
101-
return files, err
128+
// Return the file count, and the terminal error, if other than EOF.
129+
if err != io.EOF {
130+
return files, err
131+
}
132+
return files, nil
102133
}

task/task_test.go

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,14 @@ func MakeTestSource(t *testing.T) *storage.ETLSource {
4444
t.Fatal(err)
4545
}
4646

47+
// Put a large file in the middle to test skipping.
48+
hdr = tar.Header{Name: "big_file", Mode: 0666, Typeflag: tar.TypeReg, Size: int64(101)}
49+
tw.WriteHeader(&hdr)
50+
_, err = tw.Write(make([]byte, 101))
51+
if err != nil {
52+
t.Fatal(err)
53+
}
54+
4755
hdr = tar.Header{Name: "bar", Mode: 0666, Typeflag: tar.TypeReg, Size: int64(11)}
4856
tw.WriteHeader(&hdr)
4957
_, err = tw.Write([]byte("butter milk"))
@@ -84,7 +92,7 @@ func TestTarFileInput(t *testing.T) {
8492

8593
// Among other things, this requires that tp implements etl.Parser.
8694
tt := task.NewTask("filename", rdr, tp)
87-
fn, bb, err := tt.NextTest()
95+
fn, bb, err := tt.NextTest(100)
8896
if err != nil {
8997
t.Error(err)
9098
}
@@ -95,7 +103,19 @@ func TestTarFileInput(t *testing.T) {
95103
t.Error("Expected biscuits but got ", string(bb))
96104
}
97105

98-
fn, bb, err = tt.NextTest()
106+
// Here we expect an oversize file error, with filename = big_file.
107+
fn, bb, err = tt.NextTest(100)
108+
if fn != "big_file" {
109+
t.Error("Expected big_file: " + fn)
110+
}
111+
if err == nil {
112+
t.Error("Expected oversize file")
113+
} else if err != storage.OVERSIZE_FILE {
114+
t.Error("Expected oversize file but got: " + err.Error())
115+
}
116+
117+
// This is the last file, so we expect EOF.
118+
fn, bb, err = tt.NextTest(100)
99119
if err != nil {
100120
t.Error(err)
101121
}
@@ -110,15 +130,18 @@ func TestTarFileInput(t *testing.T) {
110130
rdr = MakeTestSource(t)
111131

112132
tt = task.NewTask("filename", rdr, tp)
133+
tt.SetMaxFileSize(100)
113134
fc, err := tt.ProcessAllTests()
114135
if err != nil {
115136
t.Error("Expected nil error, but got %v", err)
116137
}
117-
if fc != len(tp.files) {
118-
t.Error("Number of files counted (%s) does not match files parsed", fc, len(tp.files))
138+
// Should see 3 files.
139+
if fc != 3 {
140+
t.Error("Expected 3 files: ", fc)
119141
}
142+
// ... but process only two.
120143
if len(tp.files) != 2 {
121-
t.Error("Too few files ", len(tp.files))
144+
t.Error("Should have processed two files: ", len(tp.files))
122145
}
123146
if !reflect.DeepEqual(tp.files, []string{"foo", "bar"}) {
124147
t.Error("Not expected files: ", tp.files)

0 commit comments

Comments
 (0)