11# collect_yaml_lists <file.yaml> [--print] [--prefix PREFIX]
2- # Pure Bash + awk YAML reader (no yq) .
3- # Supported YAML shape at the top level :
2+ # Pure Bash + awk YAML reader for simple top-level lists (requires GNU awk: the parser uses the 3-argument match(), which BSD awk and mawk do not provide).
3+ # Supports :
44# key:
55# - item
6- # - item
6+ # - " item with spaces"
77# other: [a, b, "c"]
8- #
9- # Creates:
10- # categories[] : sanitized array names (with optional PREFIX)
11- # orig_keys[] : original YAML keys (aligned with categories[])
12- # one array per key, e.g. key "pip3" -> $pip3 (or $PREFIXpip3)
13- #
14- # Notes:
15- # - This is a pragmatic parser for common dotfiles/package lists.
16- # - It ignores nested structures under items and normalizes simple quotes.
17-
188collect_yaml_lists () {
199 # ---- args ----
2010 if [ $# -lt 1 ]; then
@@ -80,118 +70,101 @@ collect_yaml_lists() {
8070 eval " $assign "
8171 }
8272
83- # ---- parse YAML with awk -> token stream (__KEY__/__ITEM__/__END__) ----
84- # Pragmatic parser: handles top-level "key:" + block-style list and "key: [a, b, 'c']"
85- # Strips comments that are not inside quotes.
73+ # ---- parse YAML with awk (script via single-quoted heredoc) ----
74+ # Emits token lines: __KEY__name / __ITEM__value / __END__
8675 local awk_out
8776 awk_out=" $(
88- awk '
89- function trim(s){ gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
90- function dequote(s){
91- if (s ~ /^".*"$/) { s=substr(s,2,length(s)-2) }
92- else if (s ~ /^' \' ' .*' \' ' $/) { s=substr(s,2,length(s)-2) }
93- return s
94- }
95- function strip_comments(line, i,c,dq,sq,esc,out,len) {
96- dq=0; sq=0; esc=0; out=""; len=length(line)
97- for (i=1; i<=len; i++) {
98- c=substr(line,i,1)
99- if (esc) { out=out c; esc=0; continue }
100- if (c=="\\") { out=out c; esc=1; continue }
101- if (c=="\"" && !sq) { dq=!dq; out=out c; continue }
102- if (c=="' \' ' " && !dq) { sq=!sq; out=out c; continue }
103- if (c=="#" && !dq && !sq) { break } # start of comment
104- out=out c
105- }
106- return out
107- }
108- BEGIN { in_list=0; base_indent=-1; key="" }
109- {
110- line = strip_comments($0)
111- if (line ~ /^[ \t]*$/) next
112- # Flow style: key: [a, b, "c"]
113- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
114- key = trim(m[2])
115- print "__KEY__" key
116- s = m[3]
117- n = split(s, a, /,/)
118- for (i=1;i<=n;i++){
119- val=trim(a[i]); val=dequote(val)
120- if (val!="") print "__ITEM__" val
121- }
122- print "__END__"
123- in_list=0; key=""; next
124- }
77+ awk -f - " $file " << 'AWK '
78+ function trim(s) { sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
79+ function dequote(s, first,last,len) {
80+ len = length(s); if (len < 2) return s
81+ first = substr(s,1,1); last = substr(s,len,1)
82+ if ((first == "\"" && last == "\"") || (first == "'" && last == "'")) {
83+ return substr(s,2,len-2)
84+ }
85+ return s
86+ }
87+ function strip_comments(line, i,ch,dq,sq,esc,out,len) {
88+ # remove # outside of quotes; handle \" and \' escapes
89+ dq=0; sq=0; esc=0; out=""; len=length(line)
90+ for (i=1; i<=len; i++) {
91+ ch=substr(line,i,1)
92+ if (esc) { out=out ch; esc=0; continue }
93+ if (ch=="\\") { out=out ch; esc=1; continue }
94+ if (ch=="\"" && !sq) { dq=!dq; out=out ch; continue }
95+ if (ch=="'" && !dq) { sq=!sq; out=out ch; continue }
96+ if (ch=="#" && !dq && !sq) break
97+ out=out ch
98+ }
99+ return out
100+ }
101+ BEGIN { in_list=0; base_indent=-1; key="" }
102+ {
103+ line = strip_comments($0)
104+ if (line ~ /^[ \t]*$/) next
125105
126- # Block style start: key:
127- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
128- # if we were in a previous list, close it
129- if (in_list==1) { print "__END__"; in_list=0; key="" }
130- key = trim(m[2])
131- base_indent = length(m[1])
132- in_list = 1
133- print "__KEY__" key
134- next
135- }
106+ # Flow: key: [a, b, "c"]
107+ if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
108+ key = trim(m[2]); print "__KEY__" key
109+ s = m[3]
110+ n = split(s, a, /,/) # pragmatic split; commas inside quotes not supported
111+ for (i=1;i<=n;i++){
112+ val=trim(a[i]); val=dequote(val)
113+ if (val!="") print "__ITEM__" val
114+ }
115+ print "__END__"
116+ in_list=0; key=""; next
117+ }
136118
137- # If inside a list, look for items or dedent/new key
138- if (in_list==1) {
139- # New key at dedent/same indent ends current list
140- if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk)) {
141- if (length(mk[1]) <= base_indent) {
142- print "__END__"
143- in_list=0; key=""
144- # Reprocess same line as a new key; emulate tail recursion:
145- # Flow style?
146- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
147- key = trim(mm[2]); print "__KEY__" key
148- s = mm[3]; n = split(s, a, /,/)
149- for (i=1;i<=n;i++){ val=trim(a[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
150- print "__END__"
151- next
152- }
153- # Block style start
154- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
155- key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
156- next
157- }
158- }
159- }
160- # List item: - value
161- if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
162- item = trim(mi[1]); item = dequote(item)
163- if (item!="") print "__ITEM__" item
164- next
165- }
166- # Otherwise ignore until dedent/new key
167- next
168- }
119+ # Block list start: key:
120+ if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
121+ if (in_list==1) { print "__END__"; in_list=0; key="" }
122+ key = trim(m[2]); base_indent = length(m[1]); in_list = 1
123+ print "__KEY__" key
124+ next
125+ }
169126
170- # Lines outside list we do not handle (scalars, maps)
127+ if (in_list==1) {
128+ # New key at same-or-less indent ends current list
129+ if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk) && length(mk[1]) <= base_indent) {
130+ print "__END__"; in_list=0; key=""
131+ # Reprocess same line as a new start
132+ if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
133+ key = trim(mm[2]); print "__KEY__" key
134+ s = mm[3]; n = split(s, a2, /,/)
135+ for (i=1;i<=n;i++){ val=trim(a2[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
136+ print "__END__"; next
137+ }
138+ if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
139+ key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
171140 next
172141 }
173- END { if (in_list==1) print "__END__" }
174- ' " $file "
142+ }
143+
144+ # List item: - value
145+ if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
146+ item = trim(mi[1]); item = dequote(item)
147+ if (item!="") print "__ITEM__" item
148+ next
149+ }
150+ }
151+ }
152+ END { if (in_list==1) print "__END__" }
153+ AWK
175154 ) "
176155
177- # ---- consume token stream and create arrays ----
178- # We gather items per key, then assign a real shell array with a unique sanitized name.
156+ # ---- consume tokens and create arrays ----
179157 if [ -z " $awk_out " ]; then
180158 echo " No top-level list categories found in: $file " >&2
181159 return 0
182160 fi
183161
184- # read line-by-line portably (bash & zsh)
185162 local cur_key=" " cur_safe=" " base=" " n=1
186- # temp items store (as a plain list we will quote on assignment)
187- if [ " $is_zsh " -eq 1 ]; then
188- typeset -ga __items
189- fi
163+ if [ " $is_zsh " -eq 1 ]; then typeset -ga __items; fi
190164 __items=()
191165
192166 _finalize_current () {
193167 [ -z " $cur_key " ] && return 0
194- # unique, sanitized var name (with prefix)
195168 cur_safe=" $( _sanitize " $cur_key " ) "
196169 cur_safe=" ${prefix}${cur_safe} "
197170 base=" $cur_safe "
@@ -207,20 +180,14 @@ collect_yaml_lists() {
207180 cur_key=" "
208181 }
209182
210- # iterate tokens
211183 while IFS= read -r line; do
212184 case " $line " in
213185 __KEY__* )
214- # finalize previous (if any) then start new
215186 _finalize_current
216187 cur_key=" ${line# __KEY__} "
217188 ;;
218- __ITEM__* )
219- __items+=(" ${line# __ITEM__} " )
220- ;;
221- __END__)
222- _finalize_current
223- ;;
189+ __ITEM__* ) __items+=(" ${line# __ITEM__} " ) ;;
190+ __END__) _finalize_current ;;
224191 esac
225192 done << EOF
226193$awk_out
0 commit comments