11# collect_yaml_lists <file.yaml> [--print] [--prefix PREFIX]
2- # Pure Bash + awk YAML reader (no yq).
3- # Supported YAML shape at the top level:
2+ # No yq. Works on mawk/BusyBox/BSD/gawk. Supports top-level:
43# key:
54# - item
6- # - item
5+ # - " item with spaces"
76# other: [a, b, "c"]
8- #
9- # Creates:
10- # categories[] : sanitized array names (with optional PREFIX)
11- # orig_keys[] : original YAML keys (aligned with categories[])
12- # one array per key, e.g. key "pip3" -> $pip3 (or $PREFIXpip3)
13- #
14- # Notes:
15- # - This is a pragmatic parser for common dotfiles/package lists.
16- # - It ignores nested structures under items and normalizes simple quotes.
17-
187collect_yaml_lists () {
198 # ---- args ----
209 if [ $# -lt 1 ]; then
@@ -49,7 +38,7 @@ collect_yaml_lists() {
4938 return 1
5039 }
5140
52- # ---- shell detection (zsh arrays are 1-based & need typeset) ----
41+ # ---- shell detection ----
5342 local is_zsh=0
5443 [ -n " ${ZSH_VERSION:- } " ] && is_zsh=1
5544 if [ " $is_zsh " -eq 1 ]; then
@@ -71,7 +60,7 @@ collect_yaml_lists() {
7160 for n in " ${categories[@]} " ; do [ " $n " = " $1 " ] && return 0; done
7261 return 1
7362 }
74- _assign_array () { # _assign_array <varname> <elements...> (portable via printf %q + eval)
63+ _assign_array () {
7564 local v=" $1 "
7665 shift
7766 local assign=" $v =(" x
@@ -80,118 +69,106 @@ collect_yaml_lists() {
8069 eval " $assign "
8170 }
8271
83- # ---- parse YAML with awk -> token stream (__KEY__/__ITEM__/__END__) ----
84- # Pragmatic parser: handles top-level "key:" + block-style list and "key: [a, b, 'c']"
85- # Strips comments that are not inside quotes.
72+ # ---- awk tokenizer (mawk-safe: only 2-arg match, no functions) ----
73+ # Emits lines: __KEY__name / __ITEM__value / __END__
8674 local awk_out
8775 awk_out=" $(
8876 awk '
89- function trim(s){ gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
90- function dequote(s){
91- if (s ~ /^".*"$/) { s=substr(s,2,length(s)-2) }
92- else if (s ~ /^' \' ' .*' \' ' $/) { s=substr(s,2,length(s)-2) }
93- return s
94- }
95- function strip_comments(line, i,c,dq,sq,esc,out,len) {
96- dq=0; sq=0; esc=0; out=""; len=length(line)
97- for (i=1; i<=len; i++) {
98- c=substr(line,i,1)
99- if (esc) { out=out c; esc=0; continue }
100- if (c=="\\") { out=out c; esc=1; continue }
101- if (c=="\"" && !sq) { dq=!dq; out=out c; continue }
102- if (c=="' \' ' " && !dq) { sq=!sq; out=out c; continue }
103- if (c=="#" && !dq && !sq) { break } # start of comment
104- out=out c
105- }
106- return out
107- }
108- BEGIN { in_list=0; base_indent=-1; key="" }
77+ BEGIN { in_list=0; base_indent=0 }
10978 {
110- line = strip_comments($0)
111- if (line ~ /^[ \t]*$/) next
112- # Flow style: key: [a, b, "c"]
113- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
114- key = trim(m[2])
115- print "__KEY__" key
116- s = m[3]
117- n = split(s, a, /,/)
118- for (i=1;i<=n;i++){
119- val=trim(a[i]); val=dequote(val)
120- if (val!="") print "__ITEM__" val
79+ line=$0
80+
81+ # remove trailing comments only if a space/tab precedes #
82+ sub(/[ \t]#[^\n]*$/, "", line)
83+
84+ # trim both ends (no functions, do it inline)
85+ t=line; sub(/^[ \t]+/, "", t); sub(/[ \t]+$/, "", t)
86+ if (t=="") next
87+
88+ # compute indent = number of leading spaces/tabs
89+ tmp=line; match(tmp, /^[ \t]*/); indent=RLENGTH
90+
91+ # detect flow: key: [ ... ]
92+ if (line ~ /^[ \t]*[^:# \t][^:]*:[ \t]*\[/) {
93+ # key = text before first colon
94+ l2=line; sub(/^[ \t]+/, "", l2)
95+ cpos=index(l2, ":")
96+ if (cpos>0) {
97+ key=substr(l2,1,cpos-1); sub(/^[ \t]+/, "", key); sub(/[ \t]+$/, "", key)
98+ print "__KEY__" key
99+ # extract inside brackets from first "[" to last "]"
100+ bpos=index(l2, "[")
101+ s=substr(l2, bpos+1)
102+ sub(/\][ \t]*$/, "", s)
103+ # split on commas (commas inside quotes not supported)
104+ n=split(s, A, ",")
105+ for (i=1; i<=n; i++) {
106+ val=A[i]; sub(/^[ \t]+/, "", val); sub(/[ \t]+$/, "", val)
107+ # dequote simple "..." or ' ...'
108+ L=length(val)
109+ if (L>=2) {
110+ f=substr(val,1,1); e=substr(val,L,1)
111+ if ((f=="\"" && e=="\"") || (f=="' " && e==" ' ")) val=substr(val,2,L-2)
112+ }
113+ if (val!="") print "__ITEM__" val
114+ }
115+ print "__END__"
116+ in_list=0
117+ next
121118 }
122- print "__END__"
123- in_list=0; key=""; next
124119 }
125120
126- # Block style start: key:
127- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
128- # if we were in a previous list, close it
129- if (in_list==1) { print "__END__"; in_list=0; key="" }
130- key = trim(m[2])
131- base_indent = length(m[1])
132- in_list = 1
121+ # detect block start: key:
122+ if (line ~ /^[ \t]*[^:# \t][^:]*:[ \t]*$/) {
123+ if (in_list==1) { print "__END__"; in_list=0 }
124+ l2=line; sub(/^[ \t]+/, "", l2)
125+ cpos=index(l2, ":")
126+ key=substr(l2,1,cpos-1); sub(/^[ \t]+/, "", key); sub(/[ \t]+$/, "", key)
127+ base_indent=indent
128+ in_list=1
133129 print "__KEY__" key
134130 next
135131 }
136132
137- # If inside a list, look for items or dedent/new key
138133 if (in_list==1) {
139- # New key at dedent/same indent ends current list
140- if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk)) {
141- if (length(mk[1]) <= base_indent) {
142- print "__END__"
143- in_list=0; key=""
144- # Reprocess same line as a new key; emulate tail recursion:
145- # Flow style?
146- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
147- key = trim(mm[2]); print "__KEY__" key
148- s = mm[3]; n = split(s, a, /,/)
149- for (i=1;i<=n;i++){ val=trim(a[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
150- print "__END__"
151- next
152- }
153- # Block style start
154- if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
155- key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
156- next
134+ # if a new key at same-or-less indent, end the list and re-handle line
135+ if (line ~ /^[ \t]*[^:# \t][^:]*:/ && indent <= base_indent) {
136+ print "__END__"
137+ in_list=0
138+ # fall through to let the next iterations process this line again
139+ } else {
140+ # list item: - value
141+ if (line ~ /^[ \t]*-[ \t]*/) {
142+ item=line
143+ sub(/^[ \t]*-[ \t]*/, "", item)
144+ sub(/[ \t]+$/, "", item)
145+ L=length(item)
146+ if (L>=2) {
147+ f=substr(item,1,1); e=substr(item,L,1)
148+ if ((f=="\"" && e=="\"") || (f=="' " && e==" ' ")) item=substr(item,2,L-2)
157149 }
150+ if (item!="") print "__ITEM__" item
151+ next
158152 }
159153 }
160- # List item: - value
161- if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
162- item = trim(mi[1]); item = dequote(item)
163- if (item!="") print "__ITEM__" item
164- next
165- }
166- # Otherwise ignore until dedent/new key
167- next
168154 }
169-
170- # Lines outside list we do not handle (scalars, maps)
171- next
172155 }
173156 END { if (in_list==1) print "__END__" }
174157 ' " $file "
175158 ) "
176159
177- # ---- consume token stream and create arrays ----
178- # We gather items per key, then assign a real shell array with a unique sanitized name.
160+ # ---- consume tokens and create arrays ----
179161 if [ -z " $awk_out " ]; then
180162 echo " No top-level list categories found in: $file " >&2
181163 return 0
182164 fi
183165
184- # read line-by-line portably (bash & zsh)
185166 local cur_key=" " cur_safe=" " base=" " n=1
186- # temp items store (as a plain list we will quote on assignment)
187- if [ " $is_zsh " -eq 1 ]; then
188- typeset -ga __items
189- fi
167+ if [ " $is_zsh " -eq 1 ]; then typeset -ga __items; fi
190168 __items=()
191169
192170 _finalize_current () {
193171 [ -z " $cur_key " ] && return 0
194- # unique, sanitized var name (with prefix)
195172 cur_safe=" $( _sanitize " $cur_key " ) "
196173 cur_safe=" ${prefix}${cur_safe} "
197174 base=" $cur_safe "
@@ -207,20 +184,14 @@ collect_yaml_lists() {
207184 cur_key=" "
208185 }
209186
210- # iterate tokens
211187 while IFS= read -r line; do
212188 case " $line " in
213189 __KEY__* )
214- # finalize previous (if any) then start new
215190 _finalize_current
216191 cur_key=" ${line# __KEY__} "
217192 ;;
218- __ITEM__* )
219- __items+=(" ${line# __ITEM__} " )
220- ;;
221- __END__)
222- _finalize_current
223- ;;
193+ __ITEM__* ) __items+=(" ${line# __ITEM__} " ) ;;
194+ __END__) _finalize_current ;;
224195 esac
225196 done << EOF
226197$awk_out
0 commit comments