Commit 70258bb

HADOOP-18521. csv writing enhancements
1 parent 95facb2

4 files changed: 114 additions & 40 deletions

File tree

- src/main/java/org/apache/hadoop/fs/store/StoreUtils.java
- src/main/java/org/apache/hadoop/fs/store/commands/Bandwidth.java
- src/main/java/org/apache/hadoop/fs/tools/csv/MkCSV.java
- src/main/site/mkcsv.md

src/main/java/org/apache/hadoop/fs/store/StoreUtils.java

Lines changed: 31 additions & 0 deletions
```diff
@@ -20,10 +20,14 @@
 
 import java.io.IOException;
 import java.lang.reflect.Array;
+import java.util.Locale;
 import java.util.Map;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.StorageUnit;
+
 public class StoreUtils {
 
   /** life without Guava. */
@@ -119,6 +123,33 @@ public static <T> T[] cat(T[] left, T[] right) {
     return dest;
   }
 
+  /**
+   * get the storage size from a string, uses M, G, T etc
+   * @param size data size
+   * @return size as a double.
+   */
+  public static double getDataSize(final String size) {
+    double uploadSize;
+
+    String s = size.trim().toUpperCase(Locale.ROOT);
+    try {
+      // look for a long value,
+      uploadSize = Long.parseLong(s);
+    } catch (NumberFormatException e) {
+      // parse the size values via Configuration
+      // this is only possible on hadoop 3.1+.
+      if (!s.endsWith("B")) {
+        s = s + "B";
+      }
+      final Configuration sizeConf = new Configuration(false);
+
+
+      // upload in MB.
+      uploadSize = sizeConf.getStorageSize("size", s, StorageUnit.MB);
+    }
+    return uploadSize;
+  }
+
   public static class StringPair implements Map.Entry<String, String>{
     private String key, value;
 
```
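The helper hoists the old Bandwidth parsing logic into a shared utility: a bare number is returned as-is, while suffixed values are normalized to megabytes through `Configuration.getStorageSize()`. A minimal sketch of the expected behaviour, assuming a Hadoop 3.1+ runtime; the demo class and the sample values are illustrative, not part of the commit:

```java
import org.apache.hadoop.fs.store.StoreUtils;

public class GetDataSizeDemo {
  public static void main(String[] args) {
    // a bare number parses as a long and is returned unchanged
    double plain = StoreUtils.getDataSize("256"); // 256.0

    // suffixed values are uppercased, get a trailing "B" appended
    // if missing ("2g" -> "2GB"), and are converted to megabytes
    // by Configuration.getStorageSize()
    double sized = StoreUtils.getDataSize("2g"); // 2048.0

    System.out.printf("plain=%.1f sized=%.1f%n", plain, sized);
  }
}
```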
src/main/java/org/apache/hadoop/fs/store/commands/Bandwidth.java

Lines changed: 2 additions & 14 deletions
```diff
@@ -27,13 +27,13 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.StorageUnit;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.store.StoreDurationInfo;
 import org.apache.hadoop.fs.store.StoreEntryPoint;
+import org.apache.hadoop.fs.store.StoreUtils;
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.ToolRunner;
 
@@ -91,19 +91,7 @@ public int run(String[] args) throws Exception {
     FileSystem fs = path.getFileSystem(conf);
     println("Using filesystem %s", fs.getUri());
 
-    double uploadSize;
-
-    try {
-      // look for a long value,
-      uploadSize = Long.parseLong(size);
-    } catch (NumberFormatException e) {
-      // parse the size values via Configuration
-      // this is only possible on hadoop 3.1+.
-      final Configuration sizeConf = new Configuration(false);
-
-      // upload in MB.
-      uploadSize = sizeConf.getStorageSize("size", size, StorageUnit.MB);
-    }
+    double uploadSize = StoreUtils.getDataSize(size);
 
     long sizeMB = Math.round(uploadSize);
     if (sizeMB <= 0) {
```

src/main/java/org/apache/hadoop/fs/tools/csv/MkCSV.java

Lines changed: 51 additions & 13 deletions
```diff
@@ -19,6 +19,7 @@
 package org.apache.hadoop.fs.tools.csv;
 
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Random;
@@ -38,18 +39,22 @@
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.ToolRunner;
 
-import static java.lang.Math.max;
 import static org.apache.hadoop.fs.store.CommonParameters.DEFINE;
 import static org.apache.hadoop.fs.store.CommonParameters.TOKENFILE;
 import static org.apache.hadoop.fs.store.CommonParameters.VERBOSE;
 import static org.apache.hadoop.fs.store.CommonParameters.XMLFILE;
 import static org.apache.hadoop.fs.store.StoreExitCodes.E_USAGE;
+import static org.apache.hadoop.fs.store.StoreUtils.getDataSize;
 
+/**
+ * Create a large CSV file for validation.
+ */
 public class MkCSV extends StoreEntryPoint {
 
   private static final Logger LOG = LoggerFactory.getLogger(MkCSV.class);
 
   public static final String HEADER = "header";
+
   public static final String QUOTE = "quote";
 
   public static final String USAGE
@@ -80,6 +85,10 @@ public class MkCSV extends StoreEntryPoint {
 
   private static final String SEPARATOR = ",";
 
+  public static final String START = "start";
+
+  public static final String END = "end";
+
 
   public MkCSV() {
     createCommandFormat(2, 2, VERBOSE, HEADER, QUOTE);
@@ -100,7 +109,7 @@ public int run(String[] args) throws Exception {
     String size = argList.get(0).toLowerCase(Locale.ENGLISH);
     String pathString = argList.get(1);
     Path path = new Path(pathString);
-    long rows = Long.parseLong(size);
+    long rows = (long) getDataSize(size);
     if (rows < 0) {
       errorln("Invalid row count %s", size);
       errorln(USAGE);
@@ -122,6 +131,13 @@ public int run(String[] args) throws Exception {
 
     String block = sb.toString();
 
+    final List<String> blockData = new ArrayList<>();
+    blockRows(blockData, 'a', 'z', elements);
+    blockRows(blockData, 'A', 'Z', elements);
+    blockRows(blockData, '0', '9', elements);
+    final int blockCount = blockData.size();
+
+
     // progress callback counts #of invocations, and optionally prints a .
     AtomicLong progressCount = new AtomicLong();
     Progressable progress = () -> {
@@ -140,7 +156,7 @@ public int run(String[] args) throws Exception {
     // open the file. track duration
     FSDataOutputStream upload;
     try (StoreDurationInfo d = new StoreDurationInfo(LOG,
-        "Opening %s for upload", path)) {
+        "Opening %s for writing", path)) {
       upload = fs.createFile(path)
           .progress(progress)
           .recursive()
@@ -152,31 +168,33 @@ public int run(String[] args) throws Exception {
       StoreCsvWriter writer = new StoreCsvWriter(upload, SEPARATOR, EOL, quote);
       if (header) {
         writer
-            .columns("rowId", "dataCrc", "data", "rowId2", "rowCrc")
+            .columns(START, "rowId", "length", "dataCrc", "data", "rowId2", "rowCrc", END)
             .newline();
       }
 
       Random rand = new Random();
       for (int r = 1; r <= rows; r++) {
 
+        writer.column(START);
         String rowId = Long.toString(r);
         writer.column(rowId);
         // now collect a subset of the value
-        int firstElt = rand.nextInt(elements-1);
-        int lastElt = firstElt + 1 + rand.nextInt(elements - firstElt - 1);
-        int first = firstElt * 5;
-        // always 1 elt higher than first
-        int last = lastElt * 5 - 1;
-        String data = block.substring(first, last);
+        int lastElt = 2 + rand.nextInt(elements);
+        String dataRow = blockData.get(r % blockCount);
+        int length = Math.min(lastElt, elements);
+        String data = dataRow.substring(length);
+        writer.column(data.length());
         // data CRC
         CRC32 crc = new CRC32();
         crc.update(data.getBytes(StandardCharsets.UTF_8));
         writer.column(crc.getValue());
         writer.column(data);
         // repeat the row ID
-        writer.column(r);
+        writer.column(rowId);
         // full row checksum
         writer.column(writer.getRowCrc());
+        // end of row
+        writer.column(END);
         writer.newline();
       }
       // now close the file
@@ -187,7 +205,7 @@ public int run(String[] args) throws Exception {
       }
 
     } finally {
-      printIfVerbose("Upload Stream: %s", upload);
+      printIfVerbose("Write Stream: %s", upload);
     }
 
     println();
@@ -201,12 +219,32 @@ public int run(String[] args) throws Exception {
     printFSInfoInVerbose(fs);
 
     long sizeBytes = status.getLen();
-    summarize("Upload", uploadDurationTracker, sizeBytes);
+    summarize("CSV Generation", uploadDurationTracker, sizeBytes);
 
     return 0;
 
   }
 
+  /**
+   * Generate a row from a string
+   * @param s string to use
+   * @param elements number of elements
+   * @return string of s repeated elements times.
+   */
+  private String blockRow(String s, int elements) {
+    StringBuilder sb = new StringBuilder(elements);
+    for (int i = 1; i <= elements; i++) {
+      sb.append(s);
+    }
+    return sb.toString();
+  }
+
+  private void blockRows(List<String> rows, char start, char end, int elements) {
+    for (char i = start; i <= end; i++) {
+      rows.add(blockRow(Character.toString(i), elements));
+    }
+  }
+
   /**
    * Execute the command, return the result or throw an exception,
    * as appropriate.
```
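The start/end markers, the new length column, and the repeated row id give downstream readers cheap per-row invariants to assert. As a sketch of how a consumer might exploit them, here is a minimal, hypothetical checker in plain Java; `RowCheck` and its naive field handling are illustrative only (a real reader would use a proper CSV parser, and recomputing `rowCrc` is skipped here because it depends on the writer's quoting rules):

```java
import java.nio.charset.StandardCharsets;
import java.util.zip.CRC32;

public class RowCheck {

  /**
   * Validate one already-unquoted row of
   * start,rowId,length,dataCrc,data,rowId2,rowCrc,end.
   * @return true if the per-row invariants hold.
   */
  static boolean valid(String[] f) {
    if (f.length != 8
        || !"start".equals(f[0])
        || !"end".equals(f[7])) {
      return false; // marker columns must bracket every row
    }
    if (!f[1].equals(f[5])) {
      return false; // rowId must be repeated as rowId2
    }
    String data = f[4];
    if (data.length() != Long.parseLong(f[2])) {
      return false; // length column describes the data field
    }
    CRC32 crc = new CRC32();
    crc.update(data.getBytes(StandardCharsets.UTF_8));
    return crc.getValue() == Long.parseLong(f[3]); // dataCrc matches
  }

  public static void main(String[] args) {
    // a hand-built example row; real input would come from a CSV parser
    String[] row = {"start", "1", "3", crcOf("bbb"), "bbb", "1", "0", "end"};
    System.out.println(valid(row)); // true
  }

  private static String crcOf(String s) {
    CRC32 c = new CRC32();
    c.update(s.getBytes(StandardCharsets.UTF_8));
    return Long.toString(c.getValue());
  }
}
```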

src/main/site/mkcsv.md

Lines changed: 30 additions & 13 deletions
````diff
@@ -14,29 +14,38 @@
 
 # Command `mkcsv`
 
-Creates a CSV file with a given path; useful
-for scale testing CSV processing .
+Creates a CSV file with a given path; useful for scale testing CSV processing.
 
-```
+```bash
 hadoop jar cloudstore-1.0.jar mkcsv -header -quote -verbose 10000 s3a://bucket/file.csv
 ```
 
-The format is a variable width sequence, with entries cross referencing each other for ease of validation.
+The format is a variable width sequence, with entries cross referencing each other for validation.
 ```csv
-"rowId","dataCrc","data","rowId2","rowCrc"
-"1","4098016739","0008-0009-000a-000b-000c-000d-000e-000f-0010-0011-0012-0013-0014-0015-0016-0017-0018-0019-001a-001b-001c-001d-001e-001f-0020-0021-0022-0023-0024-0025-0026-0027-0028-0029-002a-002b-002c-002d-002e-002f-0030-0031-0032-0033-0034-0035-0036-0037-0038-0039-003a-003b-003c-003d-003e-003f-0040-0041-0042-0043","1","2526808319"
-"2","4102619375","005b","2","3614304611"
-"3","2808119570","005e-005f-0060","3","3847878359"
+"start","rowId","length","dataCrc","data","rowId2","rowCrc","end"
+"start","1","87","691051183","bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb","1","2707924207","end"
+"start","2","40","2886466480","cccccccccccccccccccccccccccccccccccccccc","2","2141198053","end"
+"start","3","98","3320970725","dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd","3","4203069111","end"
+"start","4","8","1257926895","eeeeeeee","4","189792478","end"
+"start","5","25","1630497970","fffffffffffffffffffffffff","5","1034603103","end"
+"start","6","38","557554018","gggggggggggggggggggggggggggggggggggggg","6","1412646710","end"
+"start","7","86","951894681","hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh","7","2062289315","end"
+"start","8","45","3065088391","iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii","8","3774714774","end"
+"start","9","70","2839984696","jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj","9","303056462","end"
 ```
 
 ## Invariants
 
 For each row
 ```java
+start == "start"
 rowId == rowId2
-
+length == a random int >= 0
+data = string where data.length() == length
+elements of data == char c where c in "[a-z][A-Z][0-9]"
 dataCrc == new CRC32().update(data.getBytes(StandardCharsets.UTF_8))
 rowCrC == crc32 of all previous fields, including quotes, *excluding separators*
+end == "end"
 // and ignoring headers
 forall n: row[n].rowID == n
 ```
@@ -46,25 +55,33 @@ forall n: row[n].rowID == n
 ```scala
 
 /**
- * Dataset case class.
+ * Dataset class.
+ * Latest build is "start","rowId","length","dataCrc","data","rowId2","rowCrc","end"
  */
 case class CsvRecord(
+  start: String,
   rowId: Long,
+  length: Long,
   dataCrc: Long,
   data: String,
   rowId2: Long,
-  rowCrc: Long)
+  rowCrc: Long,
+  end: String)
 
 /**
  * The StructType of the CSV data.
+ * "start","rowId","length","dataCrc","data","rowId2","rowCrc","end"
  */
 val csvSchema: StructType = {
-    new StructType().
+  new StructType().
+    add("start", StringType).
     add("rowId", LongType).
+    add("length", LongType).
     add("dataCrc", LongType).
     add("data", StringType).
     add("rowId2", LongType).
-    add("rowCrc", LongType)
+    add("rowCrc", LongType).
+    add("end", StringType)
 }
 
 ```
````
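For completeness, a hedged sketch of consuming the generated file with the widened schema, written in Java rather than the doc's Scala; the Spark session setup, the `s3a://` path, and the final cross-reference filter are illustrative assumptions, not part of the commit:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class ReadMkcsv {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("mkcsv-validation")
        .getOrCreate();

    // Java equivalent of the Scala csvSchema shown above
    StructType csvSchema = new StructType()
        .add("start", DataTypes.StringType)
        .add("rowId", DataTypes.LongType)
        .add("length", DataTypes.LongType)
        .add("dataCrc", DataTypes.LongType)
        .add("data", DataTypes.StringType)
        .add("rowId2", DataTypes.LongType)
        .add("rowCrc", DataTypes.LongType)
        .add("end", DataTypes.StringType);

    Dataset<Row> rows = spark.read()
        .option("header", "true")
        .schema(csvSchema)
        .csv("s3a://bucket/file.csv"); // path is illustrative

    // cheap cross-reference check: rowId must always equal rowId2
    long bad = rows.filter("rowId != rowId2").count();
    System.out.println("rows failing rowId == rowId2: " + bad);
  }
}
```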
