1919package org .apache .hadoop .fs .tools .csv ;
2020
2121import java .nio .charset .StandardCharsets ;
22+ import java .util .ArrayList ;
2223import java .util .List ;
2324import java .util .Locale ;
2425import java .util .Random ;
3839import org .apache .hadoop .util .Progressable ;
3940import org .apache .hadoop .util .ToolRunner ;
4041
41- import static java .lang .Math .max ;
4242import static org .apache .hadoop .fs .store .CommonParameters .DEFINE ;
4343import static org .apache .hadoop .fs .store .CommonParameters .TOKENFILE ;
4444import static org .apache .hadoop .fs .store .CommonParameters .VERBOSE ;
4545import static org .apache .hadoop .fs .store .CommonParameters .XMLFILE ;
4646import static org .apache .hadoop .fs .store .StoreExitCodes .E_USAGE ;
47+ import static org .apache .hadoop .fs .store .StoreUtils .getDataSize ;
4748
49+ /**
50+ * Create a large CSV file for validation.
51+ */
4852public class MkCSV extends StoreEntryPoint {
4953
5054 private static final Logger LOG = LoggerFactory .getLogger (MkCSV .class );
5155
5256 public static final String HEADER = "header" ;
57+
5358 public static final String QUOTE = "quote" ;
5459
5560 public static final String USAGE
@@ -80,6 +85,10 @@ public class MkCSV extends StoreEntryPoint {
8085
8186 private static final String SEPARATOR = "," ;
8287
88+ public static final String START = "start" ;
89+
90+ public static final String END = "end" ;
91+
8392
8493 public MkCSV () {
8594 createCommandFormat (2 , 2 , VERBOSE , HEADER , QUOTE );
@@ -100,7 +109,7 @@ public int run(String[] args) throws Exception {
100109 String size = argList .get (0 ).toLowerCase (Locale .ENGLISH );
101110 String pathString = argList .get (1 );
102111 Path path = new Path (pathString );
103- long rows = Long . parseLong (size );
112+ long rows = ( long ) getDataSize (size );
104113 if (rows < 0 ) {
105114 errorln ("Invalid row count %s" , size );
106115 errorln (USAGE );
@@ -122,6 +131,13 @@ public int run(String[] args) throws Exception {
122131
123132 String block = sb .toString ();
124133
134+ final List <String > blockData = new ArrayList <>();
135+ blockRows (blockData , 'a' , 'z' , elements );
136+ blockRows (blockData , 'A' , 'Z' , elements );
137+ blockRows (blockData , '0' , '9' , elements );
138+ final int blockCount = blockData .size ();
139+
140+
125141 // progress callback counts #of invocations, and optionally prints a .
126142 AtomicLong progressCount = new AtomicLong ();
127143 Progressable progress = () -> {
@@ -140,7 +156,7 @@ public int run(String[] args) throws Exception {
140156 // open the file. track duration
141157 FSDataOutputStream upload ;
142158 try (StoreDurationInfo d = new StoreDurationInfo (LOG ,
143- "Opening %s for upload " , path )) {
159+ "Opening %s for writing " , path )) {
144160 upload = fs .createFile (path )
145161 .progress (progress )
146162 .recursive ()
@@ -152,31 +168,33 @@ public int run(String[] args) throws Exception {
152168 StoreCsvWriter writer = new StoreCsvWriter (upload , SEPARATOR , EOL , quote );
153169 if (header ) {
154170 writer
155- .columns ("rowId" , "dataCrc" , "data" , "rowId2" , "rowCrc" )
171+ .columns (START , "rowId" , "length" , " dataCrc" , "data" , "rowId2" , "rowCrc" , END )
156172 .newline ();
157173 }
158174
159175 Random rand = new Random ();
160176 for (int r = 1 ; r <= rows ; r ++) {
161177
178+ writer .column (START );
162179 String rowId = Long .toString (r );
163180 writer .column (rowId );
164181 // now collect a subset of the value
165- int firstElt = rand .nextInt (elements -1 );
166- int lastElt = firstElt + 1 + rand .nextInt (elements - firstElt - 1 );
167- int first = firstElt * 5 ;
168- // always 1 elt higher than first
169- int last = lastElt * 5 - 1 ;
170- String data = block .substring (first , last );
182+ int lastElt = 2 + rand .nextInt (elements );
183+ String dataRow = blockData .get (r % blockCount );
184+ int length = Math .min (lastElt , elements );
185+ String data = dataRow .substring (length );
186+ writer .column (data .length ());
171187 // data CRC
172188 CRC32 crc = new CRC32 ();
173189 crc .update (data .getBytes (StandardCharsets .UTF_8 ));
174190 writer .column (crc .getValue ());
175191 writer .column (data );
176192 // repeat the row ID
177- writer .column (r );
193+ writer .column (rowId );
178194 // full row checksum
179195 writer .column (writer .getRowCrc ());
196+ // end of row
197+ writer .column (END );
180198 writer .newline ();
181199 }
182200 // now close the file
@@ -187,7 +205,7 @@ public int run(String[] args) throws Exception {
187205 }
188206
189207 } finally {
190- printIfVerbose ("Upload Stream: %s" , upload );
208+ printIfVerbose ("Write Stream: %s" , upload );
191209 }
192210
193211 println ();
@@ -201,12 +219,32 @@ public int run(String[] args) throws Exception {
201219 printFSInfoInVerbose (fs );
202220
203221 long sizeBytes = status .getLen ();
204- summarize ("Upload " , uploadDurationTracker , sizeBytes );
222+ summarize ("CSV Generation " , uploadDurationTracker , sizeBytes );
205223
206224 return 0 ;
207225
208226 }
209227
228+ /**
229+ * Generate a row from a string
230+ * @param s string to use
231+ * @param elements number of elements
232+ * @return string of s repeated elements times.
233+ */
234+ private String blockRow (String s , int elements ) {
235+ StringBuilder sb = new StringBuilder (elements );
236+ for (int i = 1 ; i <= elements ; i ++) {
237+ sb .append (s );
238+ }
239+ return sb .toString ();
240+ }
241+
242+ private void blockRows (List <String > rows , char start , char end , int elements ) {
243+ for (char i = start ; i <= end ; i ++) {
244+ rows .add (blockRow (Character .toString (i ), elements ));
245+ }
246+ }
247+
210248 /**
211249 * Execute the command, return the result or throw an exception,
212250 * as appropriate.
0 commit comments