@@ -60,8 +60,9 @@ impl ParseRule for CsvParse {
6060
6161 fn parse ( & self , item : & ItemRef , data : & dyn DataAccess ) -> BinocResult < ParseOutput > {
6262 let bytes = data. read_bytes ( item) ?;
63- let tabular = parse_csv_bytes ( & bytes, delimiter_for ( item) ) ?;
64- let sections = detect_stacked_sections ( & tabular) ;
63+ let records = parse_csv_records ( & bytes, delimiter_for ( item) ) ?;
64+ let tabular = table_from_csv_records ( records. clone ( ) ) ;
65+ let sections = detect_stacked_sections_from_rows ( & records) ;
6566
6667 // Fewer than two qualifying regions: a plain CSV is a single table,
6768 // emitted as a LEAF `tabular_v1` artifact with no children.
@@ -697,31 +698,62 @@ fn delimiter_for(item: &ItemRef) -> u8 {
697698 }
698699}
699700
/// Parses delimited bytes directly into a [`TabularData`] table.
///
/// Test-only convenience wrapper: the raw records are read with
/// `parse_csv_records` and, on success, handed to `table_from_csv_records`.
///
/// # Errors
/// Propagates whatever error `parse_csv_records` returns.
#[cfg(test)]
fn parse_csv_bytes(bytes: &[u8], delimiter: u8) -> BinocResult<TabularData> {
    let records = parse_csv_records(bytes, delimiter)?;
    Ok(table_from_csv_records(records))
}
705+
706+ fn parse_csv_records ( bytes : & [ u8 ] , delimiter : u8 ) -> BinocResult < Vec < Vec < String > > > {
701707 let mut reader = csv:: ReaderBuilder :: new ( )
702708 . delimiter ( delimiter)
709+ . has_headers ( false )
703710 . flexible ( true )
704711 . from_reader ( bytes) ;
705- let headers = reader
706- . byte_headers ( )
707- . map_err ( |err| BinocError :: Csv ( err. to_string ( ) ) ) ?
708- . iter ( )
709- . map ( |field| String :: from_utf8_lossy ( field) . into_owned ( ) )
710- . collect ( ) ;
711- let mut rows = Vec :: new ( ) ;
712+ let mut records = Vec :: new ( ) ;
712713 let mut record = csv:: ByteRecord :: new ( ) ;
713714 while reader
714715 . read_byte_record ( & mut record)
715716 . map_err ( |err| BinocError :: Csv ( err. to_string ( ) ) ) ?
716717 {
717- rows . push (
718+ records . push (
718719 record
719720 . iter ( )
720721 . map ( |field| String :: from_utf8_lossy ( field) . into_owned ( ) )
721722 . collect ( ) ,
722723 ) ;
723724 }
724- Ok ( TabularData :: from_string_rows ( headers, rows) )
725+ Ok ( records)
726+ }
727+
728+ fn table_from_csv_records ( records : Vec < Vec < String > > ) -> TabularData {
729+ let Some ( first) = records. first ( ) else {
730+ return TabularData :: from_string_rows ( Vec :: new ( ) , Vec :: new ( ) ) ;
731+ } ;
732+ let width = records. iter ( ) . map ( Vec :: len) . max ( ) . unwrap_or ( first. len ( ) ) ;
733+ let headers = complete_csv_headers ( first, width) ;
734+ let rows = records. into_iter ( ) . skip ( 1 ) . collect ( ) ;
735+ TabularData :: from_string_rows ( headers, rows)
736+ }
737+
/// Produces exactly `width` unique header names from the first CSV record.
///
/// * `first` - the raw cells of the header record; each cell is trimmed.
/// * `width` - the number of headers to return (may exceed `first.len()`).
///
/// The first occurrence of every non-blank header keeps its trimmed text
/// verbatim. A blank or missing cell is named `column_N` (1-based position).
/// A repeated name, or a `column_N` placeholder that collides with a real
/// header, receives the first free `_2`, `_3`, ... suffix.
///
/// Real names are reserved before any name is generated, so a generated name
/// can never take the text of a genuine header that appears further right
/// (e.g. `a, a, a_2` yields `a, a_3, a_2` rather than renaming the real `a_2`).
fn complete_csv_headers(first: &[String], width: usize) -> Vec<String> {
    let mut seen: BTreeSet<String> = BTreeSet::new();

    // Pass 1: trim each cell and reserve the first occurrence of each real name.
    let slots: Vec<(Option<&str>, bool)> = (0..width)
        .map(|index| {
            let name = first
                .get(index)
                .map(|value| value.trim())
                .filter(|value| !value.is_empty());
            let keeps_name = match name {
                Some(text) => seen.insert(text.to_string()),
                None => false,
            };
            (name, keeps_name)
        })
        .collect();

    // Pass 2: emit reserved names unchanged and generate the rest around them.
    let mut headers = Vec::with_capacity(width);
    for (index, (name, keeps_name)) in slots.into_iter().enumerate() {
        let raw = name.map_or_else(|| format!("column_{}", index + 1), str::to_string);
        if keeps_name {
            headers.push(raw);
            continue;
        }
        // Blank cells try the bare placeholder first; repeated names are
        // already reserved, so they fall through to the suffix search.
        let mut candidate = raw.clone();
        let mut suffix = 2usize;
        while !seen.insert(candidate.clone()) {
            candidate = format!("{raw}_{suffix}");
            suffix += 1;
        }
        headers.push(candidate);
    }
    headers
}
726758
727759#[ derive( Debug , Clone ) ]
@@ -747,13 +779,11 @@ struct StackedSection {
747779/// has more than 10 rows (≥ 11, counting its header). When it qualifies, each
748780/// region's first row is the header and the rest are data rows, trimmed to the
749781/// region width. Otherwise an empty `Vec` is returned (a single flat table).
750- fn detect_stacked_sections ( table : & TabularData ) -> Vec < StackedSection > {
751- let rows = raw_rows ( table) ;
752-
782+ fn detect_stacked_sections_from_rows ( rows : & [ Vec < String > ] ) -> Vec < StackedSection > {
753783 // Partition into regions of consecutive same-width rows, skipping blanks.
754784 let mut regions: Vec < Vec < Vec < String > > > = Vec :: new ( ) ;
755785 let mut current_width: Option < usize > = None ;
756- for row in & rows {
786+ for row in rows {
757787 let width = normalized_width ( row) ;
758788 if width == 0 {
759789 // Blank rows are transparent.
@@ -816,17 +846,6 @@ fn children_from_sections(parent_path: &str, sections: &[StackedSection]) -> Vec
816846 children
817847}
818848
819- fn raw_rows ( table : & TabularData ) -> Vec < Vec < String > > {
820- std:: iter:: once ( table. headers . clone ( ) )
821- . chain (
822- table
823- . rows
824- . iter ( )
825- . map ( |row| row. iter ( ) . map ( |cell| cell. as_text ( ) . into_owned ( ) ) . collect ( ) ) ,
826- )
827- . collect ( )
828- }
829-
830849fn normalized_width ( row : & [ String ] ) -> usize {
831850 row. iter ( )
832851 . rposition ( |cell| !cell. trim ( ) . is_empty ( ) )
@@ -849,8 +868,8 @@ mod tests {
849868 use super :: * ;
850869
851870 fn detect ( csv : & str ) -> Vec < StackedSection > {
852- let table = parse_csv_bytes ( csv. as_bytes ( ) , b',' ) . expect ( "parse csv" ) ;
853- detect_stacked_sections ( & table )
871+ let records = parse_csv_records ( csv. as_bytes ( ) , b',' ) . expect ( "parse csv" ) ;
872+ detect_stacked_sections_from_rows ( & records )
854873 }
855874
856875 /// Build a CSV body of `count` rows, each `width` comma-separated cells,
@@ -868,6 +887,27 @@ mod tests {
868887 out
869888 }
870889
/// A one-cell banner line above the real column row becomes the first header;
/// the remaining headers are generated placeholders, and the wider rows that
/// follow keep all of their fields.
#[test]
fn csv_parse_preserves_fields_after_single_cell_banner() {
    let csv = "Land-Ocean: Global Means\nYear,Jan,Feb\n1880,-.18,-.24\n";
    let table = parse_csv_bytes(csv.as_bytes(), b',').expect("parse csv");

    let expected_headers = vec![
        "Land-Ocean: Global Means".to_string(),
        "column_2".to_string(),
        "column_3".to_string(),
    ];
    assert_eq!(table.headers, expected_headers);

    assert_eq!(table.rows.len(), 2);
    // The original column-name line is now the first data row.
    let column_row = &table.rows[0];
    assert_eq!(column_row[0].as_text(), "Year");
    assert_eq!(column_row[1].as_text(), "Jan");
    assert_eq!(column_row[2].as_text(), "Feb");
    let value_row = &table.rows[1];
    assert_eq!(value_row[2].as_text(), "-.24");
}
910+
871911 #[ test]
872912 fn flat_ragged_csv_is_not_stacked ( ) {
873913 // A plain flat table with a few ragged rows (brfss / fda shape). Width
0 commit comments