Skip to content

Commit e59da24

Browse files
committed
Update awk
1 parent ea45d57 commit e59da24

File tree

1 file changed

+87
-114
lines changed

1 file changed

+87
-114
lines changed

scripts/read_yaml.sh

Lines changed: 87 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,10 @@
11
# collect_yaml_lists <file.yaml> [--print] [--prefix PREFIX]
2-
# Pure Bash + awk YAML reader (no yq).
3-
# Supported YAML shape at the top level:
2+
# Pure Bash + awk YAML reader for simple dotfiles lists (requires gawk: the parser uses the 3-argument match(), which BSD awk and mawk do not support).
3+
# Supports top-level:
44
# key:
55
# - item
6-
# - item
6+
# - "item with spaces"
77
# other: [a, b, "c"]
8-
#
9-
# Creates:
10-
# categories[] : sanitized array names (with optional PREFIX)
11-
# orig_keys[] : original YAML keys (aligned with categories[])
12-
# one array per key, e.g. key "pip3" -> $pip3 (or $PREFIXpip3)
13-
#
14-
# Notes:
15-
# - This is a pragmatic parser for common dotfiles/package lists.
16-
# - It ignores nested structures under items and normalizes simple quotes.
17-
188
collect_yaml_lists() {
199
# ---- args ----
2010
if [ $# -lt 1 ]; then
@@ -80,118 +70,107 @@ collect_yaml_lists() {
8070
eval "$assign"
8171
}
8272

83-
# ---- parse YAML with awk -> token stream (__KEY__/__ITEM__/__END__) ----
84-
# Pragmatic parser: handles top-level "key:" + block-style list and "key: [a, b, 'c']"
85-
# Strips comments that are not inside quotes.
73+
# ---- parse YAML with awk (script delivered via a single-quoted heredoc) ----
74+
# Emits a token stream on stdout: __KEY__name / __ITEM__value / __END__
8675
local awk_out
8776
awk_out="$(
88-
awk '
89-
function trim(s){ gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
90-
function dequote(s){
91-
if (s ~ /^".*"$/) { s=substr(s,2,length(s)-2) }
92-
else if (s ~ /^'\''.*'\''$/) { s=substr(s,2,length(s)-2) }
93-
return s
94-
}
95-
function strip_comments(line, i,c,dq,sq,esc,out,len) {
96-
dq=0; sq=0; esc=0; out=""; len=length(line)
97-
for (i=1; i<=len; i++) {
98-
c=substr(line,i,1)
99-
if (esc) { out=out c; esc=0; continue }
100-
if (c=="\\") { out=out c; esc=1; continue }
101-
if (c=="\"" && !sq) { dq=!dq; out=out c; continue }
102-
if (c=="'\''" && !dq) { sq=!sq; out=out c; continue }
103-
if (c=="#" && !dq && !sq) { break } # start of comment
104-
out=out c
105-
}
106-
return out
107-
}
108-
BEGIN { in_list=0; base_indent=-1; key="" }
109-
{
110-
line = strip_comments($0)
111-
if (line ~ /^[ \t]*$/) next
112-
# Flow style: key: [a, b, "c"]
113-
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
114-
key = trim(m[2])
115-
print "__KEY__" key
116-
s = m[3]
117-
n = split(s, a, /,/)
118-
for (i=1;i<=n;i++){
119-
val=trim(a[i]); val=dequote(val)
120-
if (val!="") print "__ITEM__" val
121-
}
122-
print "__END__"
123-
in_list=0; key=""; next
124-
}
77+
awk -f - "$file" <<'AWK'
78+
function trim(s) { sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
79+
function dequote(s) {
80+
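# strip matching single or double quotes (NOTE(review): inside this quoted heredoc the '\'' sequence below reaches awk verbatim instead of as one quote, so plain 'single-quoted' items are not stripped; a bare ' is what is needed here)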
81+
if (s ~ /^".*"$/) return substr(s,2,length(s)-2)
82+
else if (s ~ /^'\''.*'\''$/) return substr(s,2,length(s)-2)
83+
return s
84+
}
85+
function strip_comments(line, i,ch,dq,sq,esc,out,len) {
86+
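# drop everything from the first # that is outside quotes; a backslash copies the next character through unchanged (so \" and \' do not toggle quote state)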
87+
dq=0; sq=0; esc=0; out=""; len=length(line)
88+
for (i=1; i<=len; i++) {
89+
ch=substr(line,i,1)
90+
if (esc) { out=out ch; esc=0; continue }
91+
if (ch=="\\") { out=out ch; esc=1; continue }
92+
if (ch=="\"" && !sq) { dq=!dq; out=out ch; continue }
93+
if (ch=="'" && !dq) { sq=!sq; out=out ch; continue }
94+
if (ch=="#" && !dq && !sq) break
95+
out=out ch
96+
}
97+
return out
98+
}
99+
BEGIN { in_list=0; base_indent=-1; key="" }
100+
{
101+
line = strip_comments($0)
102+
if (line ~ /^[ \t]*$/) next
125103
126-
# Block style start: key:
127-
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
128-
# if we were in a previous list, close it
129-
if (in_list==1) { print "__END__"; in_list=0; key="" }
130-
key = trim(m[2])
131-
base_indent = length(m[1])
132-
in_list = 1
133-
print "__KEY__" key
134-
next
135-
}
104+
# Flow: key: [a, b, "c"]
105+
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
106+
key = trim(m[2])
107+
print "__KEY__" key
108+
s = m[3]
109+
# naive split on commas; good for simple quoted/unquoted items
110+
n = split(s, a, /,/)
111+
for (i=1;i<=n;i++){
112+
val=trim(a[i]); val=dequote(val)
113+
if (val!="") print "__ITEM__" val
114+
}
115+
print "__END__"
116+
in_list=0; key=""
117+
next
118+
}
136119
137-
# If inside a list, look for items or dedent/new key
138-
if (in_list==1) {
139-
# New key at dedent/same indent ends current list
140-
if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk)) {
141-
if (length(mk[1]) <= base_indent) {
142-
print "__END__"
143-
in_list=0; key=""
144-
# Reprocess same line as a new key; emulate tail recursion:
145-
# Flow style?
146-
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
147-
key = trim(mm[2]); print "__KEY__" key
148-
s = mm[3]; n = split(s, a, /,/)
149-
for (i=1;i<=n;i++){ val=trim(a[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
150-
print "__END__"
151-
next
152-
}
153-
# Block style start
154-
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
155-
key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
156-
next
157-
}
158-
}
159-
}
160-
# List item: - value
161-
if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
162-
item = trim(mi[1]); item = dequote(item)
163-
if (item!="") print "__ITEM__" item
164-
next
165-
}
166-
# Otherwise ignore until dedent/new key
167-
next
168-
}
120+
# Block list start: key:
121+
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
122+
if (in_list==1) { print "__END__"; in_list=0; key="" }
123+
key = trim(m[2])
124+
base_indent = length(m[1])
125+
in_list = 1
126+
print "__KEY__" key
127+
next
128+
}
169129
170-
# Lines outside list we do not handle (scalars, maps)
130+
if (in_list==1) {
131+
# New key at same-or-less indent ends current list
132+
if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk) && length(mk[1]) <= base_indent) {
133+
print "__END__"
134+
in_list=0; key=""
135+
# Reprocess this line as a new start
136+
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
137+
key = trim(mm[2])
138+
print "__KEY__" key
139+
s = mm[3]; n = split(s, a2, /,/)
140+
for (i=1;i<=n;i++){ val=trim(a2[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
141+
print "__END__"
171142
next
172143
}
173-
END { if (in_list==1) print "__END__" }
174-
' "$file"
144+
if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
145+
key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
146+
next
147+
}
148+
}
149+
150+
# List item: - value
151+
if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
152+
item = trim(mi[1]); item = dequote(item)
153+
if (item!="") print "__ITEM__" item
154+
next
155+
}
156+
}
157+
}
158+
END { if (in_list==1) print "__END__" }
159+
AWK
175160
)"
176161

177-
# ---- consume token stream and create arrays ----
178-
# We gather items per key, then assign a real shell array with a unique sanitized name.
162+
# ---- consume tokens and create arrays ----
179163
if [ -z "$awk_out" ]; then
180164
echo "No top-level list categories found in: $file" >&2
181165
return 0
182166
fi
183167

184-
# read line-by-line portably (bash & zsh)
185168
local cur_key="" cur_safe="" base="" n=1
186-
# temp items store (as a plain list we will quote on assignment)
187-
if [ "$is_zsh" -eq 1 ]; then
188-
typeset -ga __items
189-
fi
169+
if [ "$is_zsh" -eq 1 ]; then typeset -ga __items; fi
190170
__items=()
191171

192172
_finalize_current() {
193173
[ -z "$cur_key" ] && return 0
194-
# unique, sanitized var name (with prefix)
195174
cur_safe="$(_sanitize "$cur_key")"
196175
cur_safe="${prefix}${cur_safe}"
197176
base="$cur_safe"
@@ -207,20 +186,14 @@ collect_yaml_lists() {
207186
cur_key=""
208187
}
209188

210-
# iterate tokens
211189
while IFS= read -r line; do
212190
case "$line" in
213191
__KEY__*)
214-
# finalize previous (if any) then start new
215192
_finalize_current
216193
cur_key="${line#__KEY__}"
217194
;;
218-
__ITEM__*)
219-
__items+=("${line#__ITEM__}")
220-
;;
221-
__END__)
222-
_finalize_current
223-
;;
195+
__ITEM__*) __items+=("${line#__ITEM__}") ;;
196+
__END__) _finalize_current ;;
224197
esac
225198
done <<EOF
226199
$awk_out

0 commit comments

Comments
 (0)