Update awk

erichlf · erichlf · commit e59da24111d7 · 2025-09-14T07:45:24.000-07:00
diff --git a/scripts/read_yaml.sh b/scripts/read_yaml.sh
@@ -1,20 +1,10 @@
 # collect_yaml_lists <file.yaml> [--print] [--prefix PREFIX]
-# Pure Bash + awk YAML reader (no yq).
-# Supported YAML shape at the top level:
+# Pure Bash + awk YAML reader for simple dotfiles lists (needs gawk: uses 3-arg match()).
+# Supports top-level:
 #   key:
 #     - item
-#     - item
+#     - "item with spaces"
 #   other: [a, b, "c"]
-#
-# Creates:
-#   categories[] : sanitized array names (with optional PREFIX)
-#   orig_keys[]  : original YAML keys (aligned with categories[])
-#   one array per key, e.g. key "pip3" -> $pip3  (or $PREFIXpip3)
-#
-# Notes:
-# - This is a pragmatic parser for common dotfiles/package lists.
-# - It ignores nested structures under items and normalizes simple quotes.
-
 collect_yaml_lists() {
   # ---- args ----
   if [ $# -lt 1 ]; then
@@ -80,118 +70,107 @@ collect_yaml_lists() {
     eval "$assign"
   }
 
-  # ---- parse YAML with awk -> token stream (__KEY__/__ITEM__/__END__) ----
-  # Pragmatic parser: handles top-level "key:" + block-style list and "key: [a, b, 'c']"
-  # Strips comments that are not inside quotes.
+  # ---- parse YAML with awk (script delivered via a single-quoted heredoc) ----
+  # Emits a token stream on stdout: __KEY__name / __ITEM__value / __END__
   local awk_out
   awk_out="$(
-    awk '
-      function trim(s){ gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
-      function dequote(s){
-        if (s ~ /^".*"$/) { s=substr(s,2,length(s)-2) }
-        else if (s ~ /^'\''.*'\''$/) { s=substr(s,2,length(s)-2) }
-        return s
-      }
-      function strip_comments(line,   i,c,dq,sq,esc,out,len) {
-        dq=0; sq=0; esc=0; out=""; len=length(line)
-        for (i=1; i<=len; i++) {
-          c=substr(line,i,1)
-          if (esc) { out=out c; esc=0; continue }
-          if (c=="\\") { out=out c; esc=1; continue }
-          if (c=="\"" && !sq) { dq=!dq; out=out c; continue }
-          if (c=="'\''" && !dq) { sq=!sq; out=out c; continue }
-          if (c=="#" && !dq && !sq) { break }  # start of comment
-          out=out c
-        }
-        return out
-      }
-      BEGIN { in_list=0; base_indent=-1; key="" }
-      {
-        line = strip_comments($0)
-        if (line ~ /^[ \t]*$/) next
-        # Flow style: key: [a, b, "c"]
-        if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
-          key = trim(m[2])
-          print "__KEY__" key
-          s = m[3]
-          n = split(s, a, /,/)
-          for (i=1;i<=n;i++){
-            val=trim(a[i]); val=dequote(val)
-            if (val!="") print "__ITEM__" val
-          }
-          print "__END__"
-          in_list=0; key=""; next
-        }
+    awk -f - "$file" <<'AWK'
+function trim(text) { sub(/[ \t]+$/, "", text); sub(/^[ \t]+/, "", text); return text }  # drop outer blanks/tabs
+function dequote(s) {
+  # Strip one matching pair of outer quotes. The heredoc is quoted, so no shell escaping here.
+  if (s ~ /^".*"$/)      return substr(s,2,length(s)-2)
+  else if (s ~ /^'.*'$/) return substr(s,2,length(s)-2)
+  return s
+}
+function strip_comments(line,   pos,c,in_dq,in_sq,escaped,n) {
+  # Return the text that precedes the first # found outside of quotes.
+  # A backslash shields the next character, so escaped quotes do not toggle quoting.
+  in_dq=0; in_sq=0; escaped=0; n=length(line)
+  for (pos=1; pos<=n; pos++) {
+    c=substr(line,pos,1)
+    if (escaped)                escaped=0
+    else if (c=="\\")           escaped=1
+    else if (c=="\"" && !in_sq) in_dq=!in_dq
+    else if (c=="'" && !in_dq)  in_sq=!in_sq
+    else if (c=="#" && !in_dq && !in_sq) return substr(line,1,pos-1)
+  }
+  return line
+}
+BEGIN { in_list=0; base_indent=-1; key="" }
+{
+  line = strip_comments($0)
+  if (line ~ /^[ \t]*$/) next
 
-        # Block style start: key:
-        if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
-          # if we were in a previous list, close it
-          if (in_list==1) { print "__END__"; in_list=0; key="" }
-          key = trim(m[2])
-          base_indent = length(m[1])
-          in_list = 1
-          print "__KEY__" key
-          next
-        }
+  # Flow: key: [a, b, "c"]  (3-arg match() with a capture array is a gawk extension)
+  if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, m)) {
+    key = trim(m[2])
+    print "__KEY__" key
+    s = m[3]
+    # naive split on commas: a quoted item that contains a comma is broken apart
+    n = split(s, a, /,/)
+    for (i=1;i<=n;i++){
+      val=trim(a[i]); val=dequote(val)
+      if (val!="") print "__ITEM__" val
+    }
+    print "__END__"
+    in_list=0; key=""
+    next
+  }
 
-        # If inside a list, look for items or dedent/new key
-        if (in_list==1) {
-          # New key at dedent/same indent ends current list
-          if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk)) {
-            if (length(mk[1]) <= base_indent) {
-              print "__END__"
-              in_list=0; key=""
-              # Reprocess same line as a new key; emulate tail recursion:
-              # Flow style?
-              if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
-                key = trim(mm[2]); print "__KEY__" key
-                s = mm[3]; n = split(s, a, /,/)
-                for (i=1;i<=n;i++){ val=trim(a[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
-                print "__END__"
-                next
-              }
-              # Block style start
-              if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
-                key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
-                next
-              }
-            }
-          }
-          # List item:  - value
-          if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
-            item = trim(mi[1]); item = dequote(item)
-            if (item!="") print "__ITEM__" item
-            next
-          }
-          # Otherwise ignore until dedent/new key
-          next
-        }
+  # Block list start: "key:" alone on the line; its indent becomes base_indent
+  if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, m)) {
+    if (in_list==1) { print "__END__"; in_list=0; key="" }
+    key = trim(m[2])
+    base_indent = length(m[1])
+    in_list = 1
+    print "__KEY__" key
+    next
+  }
 
-        # Lines outside list we do not handle (scalars, maps)
+  if (in_list==1) {
+    # New key at same-or-less indent ends current list
+    if (match(line, /^([ \t]*)([^:# \t][^:]*):/, mk) && length(mk[1]) <= base_indent) {
+      print "__END__"
+      in_list=0; key=""
+      # Reprocess this line as a new start (same flow/block patterns as the top-level checks)
+      if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*\[(.*)\][ \t]*$/, mm)) {
+        key = trim(mm[2])
+        print "__KEY__" key
+        s = mm[3]; n = split(s, a2, /,/)
+        for (i=1;i<=n;i++){ val=trim(a2[i]); val=dequote(val); if (val!="") print "__ITEM__" val }
+        print "__END__"
         next
       }
-      END { if (in_list==1) print "__END__" }
-    ' "$file"
+      if (match(line, /^([ \t]*)([^:# \t][^:]*):[ \t]*$/, mm2)) {
+        key = trim(mm2[2]); base_indent = length(mm2[1]); in_list=1; print "__KEY__" key
+        next
+      }
+    }
+
+    # List item:  - value  (trimmed, then dequoted; empty values are dropped)
+    if (match(line, /^[ \t]*-[ \t]*(.+)$/, mi)) {
+      item = trim(mi[1]); item = dequote(item)
+      if (item!="") print "__ITEM__" item
+      next
+    }
+  }
+}
+END { if (in_list==1) print "__END__" }
+AWK
   )"
 
-  # ---- consume token stream and create arrays ----
-  # We gather items per key, then assign a real shell array with a unique sanitized name.
+  # ---- consume tokens and create arrays ----
   if [ -z "$awk_out" ]; then
     echo "No top-level list categories found in: $file" >&2
     return 0
   fi
 
-  # read line-by-line portably (bash & zsh)
   local cur_key="" cur_safe="" base="" n=1
-  # temp items store (as a plain list we will quote on assignment)
-  if [ "$is_zsh" -eq 1 ]; then
-    typeset -ga __items
-  fi
+  if [ "$is_zsh" -eq 1 ]; then typeset -ga __items; fi
   __items=()
 
   _finalize_current() {
     [ -z "$cur_key" ] && return 0
-    # unique, sanitized var name (with prefix)
     cur_safe="$(_sanitize "$cur_key")"
     cur_safe="${prefix}${cur_safe}"
     base="$cur_safe"
@@ -207,20 +186,14 @@ collect_yaml_lists() {
     cur_key=""
   }
 
-  # iterate tokens
   while IFS= read -r line; do
     case "$line" in
     __KEY__*)
-      # finalize previous (if any) then start new
       _finalize_current
       cur_key="${line#__KEY__}"
       ;;
-    __ITEM__*)
-      __items+=("${line#__ITEM__}")
-      ;;
-    __END__)
-      _finalize_current
-      ;;
+    __ITEM__*) __items+=("${line#__ITEM__}") ;;
+    __END__) _finalize_current ;;
     esac
   done <<EOF
 $awk_out