From 032b7e7ba77de17d1599b7300ef257cc4b1a803e Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Mon, 14 Oct 2024 09:09:56 -0700 Subject: [PATCH 01/36] Provide private implementations for str.maketrans and str.translate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pair of utilities from Python's core are helpful when encoding or escaping strings. Unlike the common alternative — repeated application of `str.replace` — a `str.translate` implementation performs its work in a single pass. This isn't principally about efficiency — although a single-pass implementation may be more efficient — but rather about correctness. Doing all the translation in a single pass sidesteps the issue of double-encoding errors which are possible under repeated-processing schemes when substitution input/output aliasing is present (i.e. some substitutions produce output that other substitutions recognize as to-be-replaced input). See https://github.com/chainguard-dev/rules_apko/pull/30 for a concrete example of a double-encoding issue resulting from a repeated-processing translation implementation. 
--- lib/private/BUILD.bazel | 3 ++ lib/private/strings.bzl | 103 ++++++++++++++++++++++++++++++++++++ lib/tests/strings_tests.bzl | 26 ++++++++- 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 2e9654044..d7446c801 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -362,6 +362,9 @@ bzl_library( name = "strings", srcs = ["strings.bzl"], visibility = ["//lib:__subpackages__"], + deps = [ + "@bazel_skylib//lib:types", + ], ) bzl_library( diff --git a/lib/private/strings.bzl b/lib/private/strings.bzl index acf5157ff..479b0cd10 100644 --- a/lib/private/strings.bzl +++ b/lib/private/strings.bzl @@ -1,5 +1,7 @@ "String utilities" +load("@bazel_skylib//lib:types.bzl", "types") + CHAR_TO_INT = { "\0": 0, "\1": 1, @@ -653,3 +655,104 @@ def split_args(s): if arg != "": args.append(arg) return args + +def maketrans(x): + """ + Return a translation table usable with translate(). + + Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans) + of the same name. + + Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not + possible. Entries for characters outside this range will trigger a failure. + + Args: + x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings) + to Unicode ordinals, strings, or None. Character keys will be converted to ordinals. + + Returns: + dict. The translation table. 
+ """ + + if not types.is_dict(x): + fail("if you give only one argument to maketrans it must be a dict") + + table = {} + + for (k, v) in x.items(): + if types.is_int(k): + if k > 0xFF: + fail("most Unicode is unsupported") + table[k] = v + elif types.is_string(k): + if len(k) != 1: + fail("string keys in translate table must be of length 1") + codepoint = ord(k) + if codepoint == None: + fail("could not compute ord('{}'), most Unicode is unsupported".format(k)) + table[codepoint] = v + else: + fail("keys in translate table must be strings or integers") + + return table + +def translate(s, table): + """ + Replace characters a string according to a translation table. + + Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate) + of the same name. + + Characters with entries in the table are replaced in the output. + Characters mapped to None are deleted. + Characters absent from the table are mirrored to the output untouched. + + Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not + possible. Characters outside this range will be silently mirrored to the output without consulting + the translation table. + + Args: + s: str. Input string upon which to perform replacements. + table: dict. Translation table. Maps from Unicode ordinals (ints) keys to other Unicode ordinals, strings, or None. + + Returns: + str. Output string derived from input string with substitutions and deletions applied from table. + """ + + if not types.is_string(s): + fail("first argument to translate must be a string") + if not types.is_dict(table): + fail("second argument to translate must be a dict") + + parts = [] + lit_start = None # Index of start of current run of literal (i.e. no-op translation) content, or None. + for (i, c) in enumerate(s.elems()): + codepoint = ord(c) + if codepoint != None and codepoint in table: + # Terminate the current literal run, if any. 
+ if lit_start != None: + parts.append(s[lit_start:i]) + lit_start = None + + replacement = table[codepoint] + if replacement == None: + pass + elif types.is_int(replacement): + parts.append(chr(replacement)) + elif types.is_string(replacement): + parts.append(replacement) + else: + fail("character mapping must return integer, None or str") + + else: # No entry in translation table. + if lit_start == None: + lit_start = i + + # Flush the caudal literal run, if any. + if lit_start != None: + parts.append(s[lit_start:]) + lit_start = None + + if len(parts) == 1: + return parts[0] + return "".join(parts) diff --git a/lib/tests/strings_tests.bzl b/lib/tests/strings_tests.bzl index 177dae286..243f48d48 100644 --- a/lib/tests/strings_tests.bzl +++ b/lib/tests/strings_tests.bzl @@ -2,7 +2,7 @@ load("@bazel_skylib//lib:partial.bzl", "partial") load("@bazel_skylib//lib:unittest.bzl", "asserts", "unittest") -load("//lib/private:strings.bzl", "chr", "hex", "ord", "split_args") +load("//lib/private:strings.bzl", "chr", "hex", "maketrans", "ord", "split_args", "translate") def _ord_test_impl(ctx): env = unittest.begin(ctx) @@ -83,6 +83,29 @@ def _split_args_test_impl(ctx): split_args_test = unittest.make(_split_args_test_impl) +def _translate_test_impl(ctx): + env = unittest.begin(ctx) + + table = maketrans({ + "<": ">", + "!": None, + }) + + asserts.equals(env, "...", translate("...", table)) + asserts.equals(env, ">..", translate("<..", table)) + asserts.equals(env, ".>.", translate(".<.", table)) + asserts.equals(env, "..>", translate("..<", table)) + asserts.equals(env, "..", translate("!..", table)) + asserts.equals(env, "..", translate(".!.", table)) + asserts.equals(env, "..", translate("..!", table)) + asserts.equals(env, ">>>", translate("<<<", table)) + asserts.equals(env, "", translate("!!!", table)) + asserts.equals(env, ".>", translate(". 
Date: Mon, 14 Oct 2024 11:11:49 -0700 Subject: [PATCH 02/36] Correctly vis-encode all 7-bit ASCII for mtree Use a code-generated table to avoid having inscrutable magic constants in the codebase. --- lib/private/BUILD.bazel | 11 +++- lib/private/gen_vis_scripts/BUILD.bazel | 30 +++++++++ .../gen_vis_scripts/gen_vis_scripts.go | 65 +++++++++++++++++++ lib/private/tar.bzl | 5 +- lib/private/vis_escape_ascii.bzl | 42 ++++++++++++ 5 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 lib/private/gen_vis_scripts/BUILD.bazel create mode 100644 lib/private/gen_vis_scripts/gen_vis_scripts.go create mode 100644 lib/private/vis_escape_ascii.bzl diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index d7446c801..3ae7d66de 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -14,7 +14,10 @@ exports_files( exports_files( glob(["*.bzl"]), - visibility = ["//lib/private/docs:__pkg__"], + visibility = [ + "//lib/private/docs:__pkg__", + "//lib/private/gen_vis_scripts:__pkg__", + ], ) bzl_library( @@ -279,9 +282,13 @@ bzl_library( bzl_library( name = "tar", - srcs = ["tar.bzl"], + srcs = [ + "tar.bzl", + "vis_escape_ascii.bzl", + ], visibility = ["//lib:__subpackages__"], deps = [ + ":strings.bzl", "@aspect_bazel_lib//lib:paths", "@bazel_skylib//rules:common_settings", ], diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel new file mode 100644 index 000000000..52596abf4 --- /dev/null +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -0,0 +1,30 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//lib:run_binary.bzl", "run_binary") +load("//lib:write_source_files.bzl", "write_source_files") + +go_binary( + name = "gen_vis_scripts", + srcs = ["gen_vis_scripts.go"], +) + +run_binary( + name = "run_gen_vis_scripts", + outs = [ + "vis_escape_ascii.bzl", + ], + args = [ + "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)", + ], + tool = ":gen_vis_scripts", +) + +write_source_files( + 
name = "write_vis_scripts", + + # Required to support cross-package references. + check_that_out_file_exists = False, + # + files = { + "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", + }, +) diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go new file mode 100644 index 000000000..6180695b5 --- /dev/null +++ b/lib/private/gen_vis_scripts/gen_vis_scripts.go @@ -0,0 +1,65 @@ +// Code generator for vis-encoding support scripts. +package main + +import ( + "fmt" + "io" + "log" + "os" + "strings" + "unicode" +) + +func main() { + for _, arg := range os.Args[1:] { + name, dest, ok := strings.Cut(arg, "=") + if !ok { + log.Fatal("invalid generation spec:", arg) + } + + f, err := os.Create(dest) + if err != nil { + log.Fatal(err) + } + defer mustClose(f) + + switch name { + case "vis_escape_ascii.bzl": + writeEscapeASCIIBzl(f) + default: + log.Fatal("unknown generated content:", name) + } + } +} + +func mustClose(f *os.File) { + if err := f.Close(); err != nil { + log.Fatal(err) + } +} + +const newline rune = '\n' + +// Escape all characters identified by mtree(5) as requiring escaping. Plus whitespace. +func shouldEscape(b byte) bool { + return b == '\\' || b > unicode.MaxASCII || unicode.IsSpace(rune(b)) || !unicode.IsPrint(rune(b)) +} + +func writeEscapeASCIIBzl(w io.Writer) { + fmt.Fprintln(w, strings.TrimSpace(` +# Code generated by gen_vis_scripts. DO NOT EDIT. +"A translation table for vis-encoding the ASCII range for mtree." 
+ +load(":strings.bzl", "maketrans") + +VIS_ESCAPE_ASCII = maketrans({ + `)) + + for i := 0; i <= unicode.MaxASCII; i++ { + b := byte(i) + if shouldEscape(b) { + fmt.Fprintf(w, ` %[1]d: r"\%03[1]o",%[2]c`, b, newline) + } + } + fmt.Fprintln(w, "})") +} diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index dc5cff549..1332aa12f 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -2,6 +2,8 @@ load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo") load("//lib:paths.bzl", "to_repository_relative_path") +load(":strings.bzl", str_translate = "translate") +load(":vis_escape_ascii.bzl", "VIS_ESCAPE_ASCII") TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type" @@ -278,7 +280,6 @@ def _configured_unused_inputs_file(ctx, srcs, keep): return unused_inputs - # TODO(3.0): Access field directly after minimum bazel_compatibility advanced to or beyond v7.0.0. def _repo_mapping_manifest(files_to_run): return getattr(files_to_run, "repo_mapping_manifest", None) @@ -373,7 +374,7 @@ def _to_rlocation_path(file, workspace): def _vis_encode(filename): # TODO(#794): correctly encode all filenames by using vis(3) (or porting it) - return filename.replace(" ", "\\040") + return str_translate(filename, VIS_ESCAPE_ASCII) def _expand(file, expander, transform = to_repository_relative_path): expanded = expander.expand(file) diff --git a/lib/private/vis_escape_ascii.bzl b/lib/private/vis_escape_ascii.bzl new file mode 100644 index 000000000..eac44ff00 --- /dev/null +++ b/lib/private/vis_escape_ascii.bzl @@ -0,0 +1,42 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +"A translation table for vis-encoding the ASCII range for mtree." 
+ +load(":strings.bzl", "maketrans") + +VIS_ESCAPE_ASCII = maketrans({ + 0: r"\000", + 1: r"\001", + 2: r"\002", + 3: r"\003", + 4: r"\004", + 5: r"\005", + 6: r"\006", + 7: r"\007", + 8: r"\010", + 9: r"\011", + 10: r"\012", + 11: r"\013", + 12: r"\014", + 13: r"\015", + 14: r"\016", + 15: r"\017", + 16: r"\020", + 17: r"\021", + 18: r"\022", + 19: r"\023", + 20: r"\024", + 21: r"\025", + 22: r"\026", + 23: r"\027", + 24: r"\030", + 25: r"\031", + 26: r"\032", + 27: r"\033", + 28: r"\034", + 29: r"\035", + 30: r"\036", + 31: r"\037", + 32: r"\040", + 92: r"\134", + 127: r"\177", +}) From b9996f8a8f4c327500df8830b9959723ea19c676 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 20 Nov 2024 22:48:29 -0500 Subject: [PATCH 03/36] Use a sed script to vis-encode non-ASCII bytes for mtree Bazel's Starlark does not provide access to a string's bytes, only its codepoints, so we are unable to do this escaping in Starlark. So a second pass is needed, at least until the spec and implementation work to get a [`bytes` type](https://github.com/bazelbuild/starlark/issues/112) lands. 
Fixes https://github.com/bazel-contrib/bazel-lib/issues/794 --- lib/private/BUILD.bazel | 1 + lib/private/gen_vis_scripts/BUILD.bazel | 3 + .../gen_vis_scripts/gen_vis_scripts.go | 15 ++ lib/private/tar.bzl | 26 +++- lib/private/vis_escape_nonascii.sed | 132 ++++++++++++++++++ 5 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 lib/private/vis_escape_nonascii.sed diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 3ae7d66de..8b1f13c35 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -8,6 +8,7 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", + "vis_escape_nonascii.sed", ], visibility = ["//visibility:public"], ) diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel index 52596abf4..7a9378001 100644 --- a/lib/private/gen_vis_scripts/BUILD.bazel +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -11,9 +11,11 @@ run_binary( name = "run_gen_vis_scripts", outs = [ "vis_escape_ascii.bzl", + "vis_escape_nonascii.sed", ], args = [ "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)", + "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)", ], tool = ":gen_vis_scripts", ) @@ -26,5 +28,6 @@ write_source_files( # files = { "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", + "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed", }, ) diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go index 6180695b5..0b21a16c3 100644 --- a/lib/private/gen_vis_scripts/gen_vis_scripts.go +++ b/lib/private/gen_vis_scripts/gen_vis_scripts.go @@ -26,6 +26,8 @@ func main() { switch name { case "vis_escape_ascii.bzl": writeEscapeASCIIBzl(f) + case "vis_escape_nonascii.sed": + writeEscapeNonASCIISed(f) default: log.Fatal("unknown generated content:", name) } @@ -63,3 +65,16 @@ VIS_ESCAPE_ASCII = maketrans({ } fmt.Fprintln(w, "})") } + +func writeEscapeNonASCIISed(w io.Writer) { 
+ fmt.Fprintln(w, strings.TrimSpace(` +# Code generated by gen_vis_scripts. DO NOT EDIT. +# Replace non-ASCII bytes with their octal escape sequences. +# Escaping of ASCII is done in Starlark prior to writing content out. + `)) + fmt.Fprintln(w, "") + + for i := 0x80; i <= 0xFF; i++ { + fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) + } +} diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 1332aa12f..91af77332 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -121,11 +121,13 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } _mtree_attrs = { "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True), "out": attr.output(doc = "Resulting specification file to write"), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } def _add_compression_args(compress, args): @@ -255,14 +257,14 @@ def _configured_unused_inputs_file(ctx, srcs, keep): # See also: https://github.com/bazel-contrib/bazel-lib/issues/794 ctx.actions.run_shell( outputs = [unused_inputs], - inputs = [prunable_inputs, keep_inputs, ctx.file.mtree], + inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_escape_nonascii], tools = [coreutils], command = ''' "$COREUTILS" join -v 1 \\ - <("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ <("$COREUTILS" sort -u \\ <(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\ - "$KEEP_INPUTS" \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ | "$COREUTILS" cut -d' ' -f 2- \\ > "$UNUSED_INPUTS" @@ -273,6 +275,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": 
ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, mnemonic = "UnusedTarInputs", toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type", @@ -373,7 +376,8 @@ def _to_rlocation_path(file, workspace): return workspace + "/" + file.short_path def _vis_encode(filename): - # TODO(#794): correctly encode all filenames by using vis(3) (or porting it) + # Escaping of non-ASCII bytes cannot be performed within Starlark. + # After writing content out, a second pass is performed with vis_escape_nonascii.sed. return str_translate(filename, VIS_ESCAPE_ASCII) def _expand(file, expander, transform = to_repository_relative_path): @@ -401,6 +405,7 @@ def _expand(file, expander, transform = to_repository_relative_path): def _mtree_impl(ctx): out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec") + unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped") content = ctx.actions.args() content.set_param_file_format("multiline") @@ -445,7 +450,18 @@ def _mtree_impl(ctx): _mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)), ) - ctx.actions.write(out, content = content) + ctx.actions.write(unescaped, content = content) + ctx.actions.run_shell( + outputs = [out], + inputs = [unescaped, ctx.file._vis_escape_nonascii], + command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"', + env = { + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, + "UNESCAPED": unescaped.path, + "OUT": out.path, + }, + mnemonic = "EscapeNonAscii", + ) return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out])) diff --git a/lib/private/vis_escape_nonascii.sed b/lib/private/vis_escape_nonascii.sed new file mode 100644 index 000000000..744713564 --- /dev/null +++ b/lib/private/vis_escape_nonascii.sed @@ -0,0 +1,132 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. 
+# Replace non-ASCII bytes with their octal escape sequences. +# Escaping of ASCII is done in Starlark prior to writing content out. + +s/\x80/\\200/g +s/\x81/\\201/g +s/\x82/\\202/g +s/\x83/\\203/g +s/\x84/\\204/g +s/\x85/\\205/g +s/\x86/\\206/g +s/\x87/\\207/g +s/\x88/\\210/g +s/\x89/\\211/g +s/\x8a/\\212/g +s/\x8b/\\213/g +s/\x8c/\\214/g +s/\x8d/\\215/g +s/\x8e/\\216/g +s/\x8f/\\217/g +s/\x90/\\220/g +s/\x91/\\221/g +s/\x92/\\222/g +s/\x93/\\223/g +s/\x94/\\224/g +s/\x95/\\225/g +s/\x96/\\226/g +s/\x97/\\227/g +s/\x98/\\230/g +s/\x99/\\231/g +s/\x9a/\\232/g +s/\x9b/\\233/g +s/\x9c/\\234/g +s/\x9d/\\235/g +s/\x9e/\\236/g +s/\x9f/\\237/g +s/\xa0/\\240/g +s/\xa1/\\241/g +s/\xa2/\\242/g +s/\xa3/\\243/g +s/\xa4/\\244/g +s/\xa5/\\245/g +s/\xa6/\\246/g +s/\xa7/\\247/g +s/\xa8/\\250/g +s/\xa9/\\251/g +s/\xaa/\\252/g +s/\xab/\\253/g +s/\xac/\\254/g +s/\xad/\\255/g +s/\xae/\\256/g +s/\xaf/\\257/g +s/\xb0/\\260/g +s/\xb1/\\261/g +s/\xb2/\\262/g +s/\xb3/\\263/g +s/\xb4/\\264/g +s/\xb5/\\265/g +s/\xb6/\\266/g +s/\xb7/\\267/g +s/\xb8/\\270/g +s/\xb9/\\271/g +s/\xba/\\272/g +s/\xbb/\\273/g +s/\xbc/\\274/g +s/\xbd/\\275/g +s/\xbe/\\276/g +s/\xbf/\\277/g +s/\xc0/\\300/g +s/\xc1/\\301/g +s/\xc2/\\302/g +s/\xc3/\\303/g +s/\xc4/\\304/g +s/\xc5/\\305/g +s/\xc6/\\306/g +s/\xc7/\\307/g +s/\xc8/\\310/g +s/\xc9/\\311/g +s/\xca/\\312/g +s/\xcb/\\313/g +s/\xcc/\\314/g +s/\xcd/\\315/g +s/\xce/\\316/g +s/\xcf/\\317/g +s/\xd0/\\320/g +s/\xd1/\\321/g +s/\xd2/\\322/g +s/\xd3/\\323/g +s/\xd4/\\324/g +s/\xd5/\\325/g +s/\xd6/\\326/g +s/\xd7/\\327/g +s/\xd8/\\330/g +s/\xd9/\\331/g +s/\xda/\\332/g +s/\xdb/\\333/g +s/\xdc/\\334/g +s/\xdd/\\335/g +s/\xde/\\336/g +s/\xdf/\\337/g +s/\xe0/\\340/g +s/\xe1/\\341/g +s/\xe2/\\342/g +s/\xe3/\\343/g +s/\xe4/\\344/g +s/\xe5/\\345/g +s/\xe6/\\346/g +s/\xe7/\\347/g +s/\xe8/\\350/g +s/\xe9/\\351/g +s/\xea/\\352/g +s/\xeb/\\353/g +s/\xec/\\354/g +s/\xed/\\355/g +s/\xee/\\356/g +s/\xef/\\357/g +s/\xf0/\\360/g +s/\xf1/\\361/g +s/\xf2/\\362/g +s/\xf3/\\363/g 
+s/\xf4/\\364/g +s/\xf5/\\365/g +s/\xf6/\\366/g +s/\xf7/\\367/g +s/\xf8/\\370/g +s/\xf9/\\371/g +s/\xfa/\\372/g +s/\xfb/\\373/g +s/\xfc/\\374/g +s/\xfd/\\375/g +s/\xfe/\\376/g +s/\xff/\\377/g From ee35bb3315c2b460f48e7c08df83a283e5481882 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 20 Nov 2024 23:59:39 -0500 Subject: [PATCH 04/36] Canonicalize mtree paths to ensure path comparison is exact The `mtree` passed to the `tar` rule is user-provided. Unlike the other streams participating in this comparison, we cannot completely trust that the encoding it uses is the same as the form this ruleset would produce. There's a bit of play to how libarchive interprets these path specs. To be correct, the comparison we're making requires that all equivalent forms be massaged to a single representation before performing the checks. --- lib/private/BUILD.bazel | 1 + lib/private/gen_vis_scripts/BUILD.bazel | 3 + .../gen_vis_scripts/gen_vis_scripts.go | 63 ++++ lib/private/tar.bzl | 12 +- lib/private/vis_canonicalize.sed | 287 ++++++++++++++++++ 5 files changed, 361 insertions(+), 5 deletions(-) create mode 100644 lib/private/vis_canonicalize.sed diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 8b1f13c35..8c023ff30 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -8,6 +8,7 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", + "vis_canonicalize.sed", "vis_escape_nonascii.sed", ], visibility = ["//visibility:public"], diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel index 7a9378001..b988b6e49 100644 --- a/lib/private/gen_vis_scripts/BUILD.bazel +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -10,10 +10,12 @@ go_binary( run_binary( name = "run_gen_vis_scripts", outs = [ + "vis_canonicalize.sed", "vis_escape_ascii.bzl", "vis_escape_nonascii.sed", ], args = [ + "vis_canonicalize.sed=$(location vis_canonicalize.sed)", "vis_escape_ascii.bzl=$(location 
vis_escape_ascii.bzl)", "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)", ], @@ -27,6 +29,7 @@ write_source_files( check_that_out_file_exists = False, # files = { + "//lib/private:vis_canonicalize.sed": ":vis_canonicalize.sed", "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed", }, diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go index 0b21a16c3..19c52ccaf 100644 --- a/lib/private/gen_vis_scripts/gen_vis_scripts.go +++ b/lib/private/gen_vis_scripts/gen_vis_scripts.go @@ -28,6 +28,8 @@ func main() { writeEscapeASCIIBzl(f) case "vis_escape_nonascii.sed": writeEscapeNonASCIISed(f) + case "vis_canonicalize.sed": + writeVisCanonicalizeSed(f) default: log.Fatal("unknown generated content:", name) } @@ -78,3 +80,64 @@ func writeEscapeNonASCIISed(w io.Writer) { fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) } } + +func writeVisCanonicalizeSed(w io.Writer) { + fmt.Fprintln(w, strings.TrimSpace(` +# Code generated by gen_vis_scripts. DO NOT EDIT. +# +# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. +# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. +# The remaining characters are not escaped; they represent themselves. 
+# +# Input is interpreted as libarchive would, with a wider set of escape sequences: +# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings +# * \0 means NUL when not the start of a three-digit octal escape sequence +# * \s means SPACE +# * \ is valid as an ordinary backslash when not the start of a valid escape sequence +# +# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 + +# Escaping of backslashes must be applied first to avoid double-interpretation. +s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g +s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g + +s/\\a/\\007/g +s/\\b/\\010/g +s/\\f/\\014/g +s/\\n/\\012/g +s/\\r/\\015/g +s/\\s/\\040/g +s/\\t/\\011/g +s/\\v/\\013/g + +# NUL special form must be disambiguated from ordinary octal escape sequences. +s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g + +# Remove octal escaping from characters that don't need it. + `)) + + for i := 0; i <= 0xFF; i++ { + b := byte(i) + if shouldEscape(b) { + continue + } + if b == '/' { + fmt.Fprintf(w, `s:\\%03[1]o:%[1]c:g%[2]c`, b, newline) + } else { + fmt.Fprintf(w, `s/\\%03[1]o/%[1]c/g%[2]c`, b, newline) + } + } + fmt.Fprintln(w, "") + + fmt.Fprintln(w, "# Add octal escaping for characters that need it.") + for i := 0; i <= 0xFF; i++ { + b := byte(i) + if !shouldEscape(b) { + continue + } + if b == '\\' || b == '\n' { + continue + } + fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, b, newline) + } +} diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 91af77332..b6524074b 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -121,6 +121,7 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), + "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")), "_vis_escape_nonascii": attr.label(allow_single_file = True, default = 
Label("//lib/private:vis_escape_nonascii.sed")), } @@ -252,18 +253,18 @@ def _configured_unused_inputs_file(ctx, srcs, keep): # # Note: bsdtar (libarchive) accepts both content= and contents= to identify source file: # ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640 - # - # TODO: Make comparison exact by converting all inputs to a canonical vis-encoded form before comparing. - # See also: https://github.com/bazel-contrib/bazel-lib/issues/794 ctx.actions.run_shell( outputs = [unused_inputs], - inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_escape_nonascii], + inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_canonicalize, ctx.file._vis_escape_nonascii], tools = [coreutils], command = ''' "$COREUTILS" join -v 1 \\ <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ <("$COREUTILS" sort -u \\ - <(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\ + <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ + | "$COREUTILS" cut -d'=' -f 2- \\ + | sed -Ef "$VIS_CANONICALIZE" \\ + ) \\ <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ | "$COREUTILS" cut -d' ' -f 2- \\ @@ -275,6 +276,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, + "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path, "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, mnemonic = "UnusedTarInputs", diff --git a/lib/private/vis_canonicalize.sed b/lib/private/vis_canonicalize.sed new file mode 100644 index 000000000..4944a8c94 --- /dev/null +++ b/lib/private/vis_canonicalize.sed @@ -0,0 +1,287 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +# +# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. 
+# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. +# The remaining characters are not escaped; they represent themselves. +# +# Input is interpreted as libarchive would, with a wider set of escape sequences: +# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings +# * \0 means NUL when not the start of a three-digit octal escape sequence +# * \s means SPACE +# * \ is valid as an ordinary backslash when not the start of a valid escape sequence +# +# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 + +# Escaping of backslashes must be applied first to avoid double-interpretation. +s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g +s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g + +s/\\a/\\007/g +s/\\b/\\010/g +s/\\f/\\014/g +s/\\n/\\012/g +s/\\r/\\015/g +s/\\s/\\040/g +s/\\t/\\011/g +s/\\v/\\013/g + +# NUL special form must be disambiguated from ordinary octal escape sequences. +s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g + +# Remove octal escaping from characters that don't need it. 
+s/\\041/!/g +s/\\042/"/g +s/\\043/#/g +s/\\044/$/g +s/\\045/%/g +s/\\046/&/g +s/\\047/'/g +s/\\050/(/g +s/\\051/)/g +s/\\052/*/g +s/\\053/+/g +s/\\054/,/g +s/\\055/-/g +s/\\056/./g +s:\\057:/:g +s/\\060/0/g +s/\\061/1/g +s/\\062/2/g +s/\\063/3/g +s/\\064/4/g +s/\\065/5/g +s/\\066/6/g +s/\\067/7/g +s/\\070/8/g +s/\\071/9/g +s/\\072/:/g +s/\\073/;/g +s/\\074/</g +s/\\075/=/g +s/\\076/>/g +s/\\077/?/g +s/\\100/@/g +s/\\101/A/g +s/\\102/B/g +s/\\103/C/g +s/\\104/D/g +s/\\105/E/g +s/\\106/F/g +s/\\107/G/g +s/\\110/H/g +s/\\111/I/g +s/\\112/J/g +s/\\113/K/g +s/\\114/L/g +s/\\115/M/g +s/\\116/N/g +s/\\117/O/g +s/\\120/P/g +s/\\121/Q/g +s/\\122/R/g +s/\\123/S/g +s/\\124/T/g +s/\\125/U/g +s/\\126/V/g +s/\\127/W/g +s/\\130/X/g +s/\\131/Y/g +s/\\132/Z/g +s/\\133/[/g +s/\\135/]/g +s/\\136/^/g +s/\\137/_/g +s/\\140/`/g +s/\\141/a/g +s/\\142/b/g +s/\\143/c/g +s/\\144/d/g +s/\\145/e/g +s/\\146/f/g +s/\\147/g/g +s/\\150/h/g +s/\\151/i/g +s/\\152/j/g +s/\\153/k/g +s/\\154/l/g +s/\\155/m/g +s/\\156/n/g +s/\\157/o/g +s/\\160/p/g +s/\\161/q/g +s/\\162/r/g +s/\\163/s/g +s/\\164/t/g +s/\\165/u/g +s/\\166/v/g +s/\\167/w/g +s/\\170/x/g +s/\\171/y/g +s/\\172/z/g +s/\\173/{/g +s/\\174/|/g +s/\\175/}/g +s/\\176/~/g + +# Add octal escaping for characters that need it. 
+s/\x00/\\000/g +s/\x01/\\001/g +s/\x02/\\002/g +s/\x03/\\003/g +s/\x04/\\004/g +s/\x05/\\005/g +s/\x06/\\006/g +s/\x07/\\007/g +s/\x08/\\010/g +s/\x09/\\011/g +s/\x0b/\\013/g +s/\x0c/\\014/g +s/\x0d/\\015/g +s/\x0e/\\016/g +s/\x0f/\\017/g +s/\x10/\\020/g +s/\x11/\\021/g +s/\x12/\\022/g +s/\x13/\\023/g +s/\x14/\\024/g +s/\x15/\\025/g +s/\x16/\\026/g +s/\x17/\\027/g +s/\x18/\\030/g +s/\x19/\\031/g +s/\x1a/\\032/g +s/\x1b/\\033/g +s/\x1c/\\034/g +s/\x1d/\\035/g +s/\x1e/\\036/g +s/\x1f/\\037/g +s/\x20/\\040/g +s/\x7f/\\177/g +s/\x80/\\200/g +s/\x81/\\201/g +s/\x82/\\202/g +s/\x83/\\203/g +s/\x84/\\204/g +s/\x85/\\205/g +s/\x86/\\206/g +s/\x87/\\207/g +s/\x88/\\210/g +s/\x89/\\211/g +s/\x8a/\\212/g +s/\x8b/\\213/g +s/\x8c/\\214/g +s/\x8d/\\215/g +s/\x8e/\\216/g +s/\x8f/\\217/g +s/\x90/\\220/g +s/\x91/\\221/g +s/\x92/\\222/g +s/\x93/\\223/g +s/\x94/\\224/g +s/\x95/\\225/g +s/\x96/\\226/g +s/\x97/\\227/g +s/\x98/\\230/g +s/\x99/\\231/g +s/\x9a/\\232/g +s/\x9b/\\233/g +s/\x9c/\\234/g +s/\x9d/\\235/g +s/\x9e/\\236/g +s/\x9f/\\237/g +s/\xa0/\\240/g +s/\xa1/\\241/g +s/\xa2/\\242/g +s/\xa3/\\243/g +s/\xa4/\\244/g +s/\xa5/\\245/g +s/\xa6/\\246/g +s/\xa7/\\247/g +s/\xa8/\\250/g +s/\xa9/\\251/g +s/\xaa/\\252/g +s/\xab/\\253/g +s/\xac/\\254/g +s/\xad/\\255/g +s/\xae/\\256/g +s/\xaf/\\257/g +s/\xb0/\\260/g +s/\xb1/\\261/g +s/\xb2/\\262/g +s/\xb3/\\263/g +s/\xb4/\\264/g +s/\xb5/\\265/g +s/\xb6/\\266/g +s/\xb7/\\267/g +s/\xb8/\\270/g +s/\xb9/\\271/g +s/\xba/\\272/g +s/\xbb/\\273/g +s/\xbc/\\274/g +s/\xbd/\\275/g +s/\xbe/\\276/g +s/\xbf/\\277/g +s/\xc0/\\300/g +s/\xc1/\\301/g +s/\xc2/\\302/g +s/\xc3/\\303/g +s/\xc4/\\304/g +s/\xc5/\\305/g +s/\xc6/\\306/g +s/\xc7/\\307/g +s/\xc8/\\310/g +s/\xc9/\\311/g +s/\xca/\\312/g +s/\xcb/\\313/g +s/\xcc/\\314/g +s/\xcd/\\315/g +s/\xce/\\316/g +s/\xcf/\\317/g +s/\xd0/\\320/g +s/\xd1/\\321/g +s/\xd2/\\322/g +s/\xd3/\\323/g +s/\xd4/\\324/g +s/\xd5/\\325/g +s/\xd6/\\326/g +s/\xd7/\\327/g +s/\xd8/\\330/g +s/\xd9/\\331/g +s/\xda/\\332/g +s/\xdb/\\333/g 
+s/\xdc/\\334/g +s/\xdd/\\335/g +s/\xde/\\336/g +s/\xdf/\\337/g +s/\xe0/\\340/g +s/\xe1/\\341/g +s/\xe2/\\342/g +s/\xe3/\\343/g +s/\xe4/\\344/g +s/\xe5/\\345/g +s/\xe6/\\346/g +s/\xe7/\\347/g +s/\xe8/\\350/g +s/\xe9/\\351/g +s/\xea/\\352/g +s/\xeb/\\353/g +s/\xec/\\354/g +s/\xed/\\355/g +s/\xee/\\356/g +s/\xef/\\357/g +s/\xf0/\\360/g +s/\xf1/\\361/g +s/\xf2/\\362/g +s/\xf3/\\363/g +s/\xf4/\\364/g +s/\xf5/\\365/g +s/\xf6/\\366/g +s/\xf7/\\367/g +s/\xf8/\\370/g +s/\xf9/\\371/g +s/\xfa/\\372/g +s/\xfb/\\373/g +s/\xfc/\\374/g +s/\xfd/\\375/g +s/\xfe/\\376/g +s/\xff/\\377/g From ae88acbe301c395c9556619001734d2aa8f6b75b Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 21 Nov 2024 00:19:04 -0500 Subject: [PATCH 05/36] Decode vis-encoded file paths when passing to Bazel Bazel expects file paths to be written verbatim in the unused inputs file; it does not understand any escaping or encoding scheme. Because we're not post-filtering depsets written out from Bazel, we're no longer able to produce a 2-column format as easily: sed performs replacements in all columns. That's fine for the intended-to-be-vis-encoded column 1, but incorrect for the intended-to-be-verbatim-content column 2. Instead, we abandon the 2-column approach and instead work in a single column, that is vis-encoded when performing comparisons, and then decoded to write back out to Bazel. 
--- lib/private/BUILD.bazel | 1 + lib/private/gen_vis_scripts/BUILD.bazel | 3 + .../gen_vis_scripts/gen_vis_scripts.go | 28 +++ lib/private/tar.bzl | 25 +-- lib/private/unvis_canonical.sed | 169 ++++++++++++++++++ 5 files changed, 215 insertions(+), 11 deletions(-) create mode 100644 lib/private/unvis_canonical.sed diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 8c023ff30..e49f11858 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -8,6 +8,7 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", + "unvis_canonical.sed", "vis_canonicalize.sed", "vis_escape_nonascii.sed", ], diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel index b988b6e49..3865f0dff 100644 --- a/lib/private/gen_vis_scripts/BUILD.bazel +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -10,11 +10,13 @@ go_binary( run_binary( name = "run_gen_vis_scripts", outs = [ + "unvis_canonical.sed", "vis_canonicalize.sed", "vis_escape_ascii.bzl", "vis_escape_nonascii.sed", ], args = [ + "unvis_canonical.sed=$(location unvis_canonical.sed)", "vis_canonicalize.sed=$(location vis_canonicalize.sed)", "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)", "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)", @@ -29,6 +31,7 @@ write_source_files( check_that_out_file_exists = False, # files = { + "//lib/private:unvis_canonical.sed": ":unvis_canonical.sed", "//lib/private:vis_canonicalize.sed": ":vis_canonicalize.sed", "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed", diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go index 19c52ccaf..dec86e422 100644 --- a/lib/private/gen_vis_scripts/gen_vis_scripts.go +++ b/lib/private/gen_vis_scripts/gen_vis_scripts.go @@ -30,6 +30,8 @@ func main() { writeEscapeNonASCIISed(f) case "vis_canonicalize.sed": 
writeVisCanonicalizeSed(f) + case "unvis_canonical.sed": + writeUnvisCanonicalSed(f) default: log.Fatal("unknown generated content:", name) } @@ -141,3 +143,29 @@ s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, b, newline) } } + +func writeUnvisCanonicalSed(w io.Writer) { + fmt.Fprintln(w, strings.TrimSpace(` +# Code generated by gen_vis_scripts. DO NOT EDIT. +# Replace octal escape sequences with the bytes they represent. +# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed + `)) + fmt.Fprintln(w, "") + + for i := 0x00; i <= 0xFF; i++ { + b := byte(i) + if b == '\\' { + continue + } + if !shouldEscape(b) { + continue + } + fmt.Fprintf(w, `s/\\%03[1]o/\x%02[1]x/g%[2]c`, b, newline) + } + fmt.Fprintln(w, "") + + fmt.Fprintln(w, strings.TrimSpace(` +# Unvis of backslash must be applied last to avoid double-interpretation. +s/\\134/\\/g + `)) +} diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index b6524074b..17f3e178e 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -121,6 +121,7 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), + "_unvis_canonical": attr.label(allow_single_file = True, default = Label("//lib/private:unvis_canonical.sed")), "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")), "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } @@ -193,15 +194,9 @@ def _is_unprunable(file): def _fmt_pruanble_inputs_line(file): if _is_unprunable(file): return None - - # The tar.prunable_inputs.txt file has a two columns: - # 1. vis-encoded paths of the files, used in comparison - # 2. 
un-vis-encoded paths of the files, used for reporting back to Bazel after filtering - path = file.path - return _vis_encode(path) + " " + path + return _vis_encode(file.path) def _fmt_keep_inputs_line(file): - # The tar.keep_inputs.txt file has a single column of vis-encoded paths of the files to keep. return _vis_encode(file.path) def _configured_unused_inputs_file(ctx, srcs, keep): @@ -248,14 +243,21 @@ def _configured_unused_inputs_file(ctx, srcs, keep): # * are not found in any content= or contents= keyword in the MTREE # * are not in the hardcoded KEEP_INPUTS set # - # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation, stored in field 1, - # before being written out in the un-vis-encoded form Bazel understands, from field 2. + # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation + # before being written out in the un-vis-encoded form Bazel understands. # # Note: bsdtar (libarchive) accepts both content= and contents= to identify source file: # ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640 ctx.actions.run_shell( outputs = [unused_inputs], - inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_canonicalize, ctx.file._vis_escape_nonascii], + inputs = [ + prunable_inputs, + keep_inputs, + ctx.file.mtree, + ctx.file._unvis_canonical, + ctx.file._vis_canonicalize, + ctx.file._vis_escape_nonascii, + ], tools = [coreutils], command = ''' "$COREUTILS" join -v 1 \\ @@ -267,7 +269,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): ) \\ <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ - | "$COREUTILS" cut -d' ' -f 2- \\ + | sed -f "$UNVIS_CANONICAL" \\ > "$UNUSED_INPUTS" ''', env = { @@ -276,6 +278,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, + 
"UNVIS_CANONICAL": ctx.file._unvis_canonical.path, "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path, "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, diff --git a/lib/private/unvis_canonical.sed b/lib/private/unvis_canonical.sed new file mode 100644 index 000000000..9d6ec7e79 --- /dev/null +++ b/lib/private/unvis_canonical.sed @@ -0,0 +1,169 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +# Replace octal escape sequences with the bytes they represent. +# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed + +s/\\000/\x00/g +s/\\001/\x01/g +s/\\002/\x02/g +s/\\003/\x03/g +s/\\004/\x04/g +s/\\005/\x05/g +s/\\006/\x06/g +s/\\007/\x07/g +s/\\010/\x08/g +s/\\011/\x09/g +s/\\012/\x0a/g +s/\\013/\x0b/g +s/\\014/\x0c/g +s/\\015/\x0d/g +s/\\016/\x0e/g +s/\\017/\x0f/g +s/\\020/\x10/g +s/\\021/\x11/g +s/\\022/\x12/g +s/\\023/\x13/g +s/\\024/\x14/g +s/\\025/\x15/g +s/\\026/\x16/g +s/\\027/\x17/g +s/\\030/\x18/g +s/\\031/\x19/g +s/\\032/\x1a/g +s/\\033/\x1b/g +s/\\034/\x1c/g +s/\\035/\x1d/g +s/\\036/\x1e/g +s/\\037/\x1f/g +s/\\040/\x20/g +s/\\177/\x7f/g +s/\\200/\x80/g +s/\\201/\x81/g +s/\\202/\x82/g +s/\\203/\x83/g +s/\\204/\x84/g +s/\\205/\x85/g +s/\\206/\x86/g +s/\\207/\x87/g +s/\\210/\x88/g +s/\\211/\x89/g +s/\\212/\x8a/g +s/\\213/\x8b/g +s/\\214/\x8c/g +s/\\215/\x8d/g +s/\\216/\x8e/g +s/\\217/\x8f/g +s/\\220/\x90/g +s/\\221/\x91/g +s/\\222/\x92/g +s/\\223/\x93/g +s/\\224/\x94/g +s/\\225/\x95/g +s/\\226/\x96/g +s/\\227/\x97/g +s/\\230/\x98/g +s/\\231/\x99/g +s/\\232/\x9a/g +s/\\233/\x9b/g +s/\\234/\x9c/g +s/\\235/\x9d/g +s/\\236/\x9e/g +s/\\237/\x9f/g +s/\\240/\xa0/g +s/\\241/\xa1/g +s/\\242/\xa2/g +s/\\243/\xa3/g +s/\\244/\xa4/g +s/\\245/\xa5/g +s/\\246/\xa6/g +s/\\247/\xa7/g +s/\\250/\xa8/g +s/\\251/\xa9/g +s/\\252/\xaa/g +s/\\253/\xab/g +s/\\254/\xac/g +s/\\255/\xad/g +s/\\256/\xae/g +s/\\257/\xaf/g +s/\\260/\xb0/g +s/\\261/\xb1/g +s/\\262/\xb2/g +s/\\263/\xb3/g +s/\\264/\xb4/g +s/\\265/\xb5/g 
+s/\\266/\xb6/g +s/\\267/\xb7/g +s/\\270/\xb8/g +s/\\271/\xb9/g +s/\\272/\xba/g +s/\\273/\xbb/g +s/\\274/\xbc/g +s/\\275/\xbd/g +s/\\276/\xbe/g +s/\\277/\xbf/g +s/\\300/\xc0/g +s/\\301/\xc1/g +s/\\302/\xc2/g +s/\\303/\xc3/g +s/\\304/\xc4/g +s/\\305/\xc5/g +s/\\306/\xc6/g +s/\\307/\xc7/g +s/\\310/\xc8/g +s/\\311/\xc9/g +s/\\312/\xca/g +s/\\313/\xcb/g +s/\\314/\xcc/g +s/\\315/\xcd/g +s/\\316/\xce/g +s/\\317/\xcf/g +s/\\320/\xd0/g +s/\\321/\xd1/g +s/\\322/\xd2/g +s/\\323/\xd3/g +s/\\324/\xd4/g +s/\\325/\xd5/g +s/\\326/\xd6/g +s/\\327/\xd7/g +s/\\330/\xd8/g +s/\\331/\xd9/g +s/\\332/\xda/g +s/\\333/\xdb/g +s/\\334/\xdc/g +s/\\335/\xdd/g +s/\\336/\xde/g +s/\\337/\xdf/g +s/\\340/\xe0/g +s/\\341/\xe1/g +s/\\342/\xe2/g +s/\\343/\xe3/g +s/\\344/\xe4/g +s/\\345/\xe5/g +s/\\346/\xe6/g +s/\\347/\xe7/g +s/\\350/\xe8/g +s/\\351/\xe9/g +s/\\352/\xea/g +s/\\353/\xeb/g +s/\\354/\xec/g +s/\\355/\xed/g +s/\\356/\xee/g +s/\\357/\xef/g +s/\\360/\xf0/g +s/\\361/\xf1/g +s/\\362/\xf2/g +s/\\363/\xf3/g +s/\\364/\xf4/g +s/\\365/\xf5/g +s/\\366/\xf6/g +s/\\367/\xf7/g +s/\\370/\xf8/g +s/\\371/\xf9/g +s/\\372/\xfa/g +s/\\373/\xfb/g +s/\\374/\xfc/g +s/\\375/\xfd/g +s/\\376/\xfe/g +s/\\377/\xff/g + +# Unvis of backslash must be applied last to avoid double-interpretation. +s/\\134/\\/g From 7bde12c4d34d3365ff06f77313206ae326be1931 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 21 Nov 2024 21:15:25 -0500 Subject: [PATCH 06/36] Add a test file with Unicode characters in its name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This demonstrates support for non-ASCII characters in archive filenames. Generating an archive with non-ASCII characters reports a diagnostic, however there does not appear to be any issue — the file is placed into the archive with the expected path and contents. 
``` INFO: From Tar lib/tests/tar/7.tar: tar: lib/tests/tar/srcdir/Unicode® support?🤞: Can't translate pathname 'lib/tests/tar/srcdir/Unicode® support?🤞' to UTF-8 ``` --- lib/tests/tar/BUILD.bazel | 4 ++++ .../tar/srcdir/Unicode\302\256 support?\360\237\244\236" | 1 + 2 files changed, 5 insertions(+) create mode 100644 "lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" diff --git a/lib/tests/tar/BUILD.bazel b/lib/tests/tar/BUILD.bazel index d2b499668..e66604af4 100644 --- a/lib/tests/tar/BUILD.bazel +++ b/lib/tests/tar/BUILD.bazel @@ -227,10 +227,12 @@ assert_tar_listing( "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/", + r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/srcdir/space in name.txt", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/", + r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode\302\256 support?\360\237\244\236", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/treeartifact/space in name.txt", @@ -450,6 +452,7 @@ assert_tar_listing( "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/", + r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode\302\256 support?\360\237\244\236", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/treeartifact/space in name.txt", @@ -460,6 +463,7 @@ assert_unused_listing( name = "test_unused_inputs_listed", actual = ":tar15", 
expected = [ + "lib/tests/tar/unused/Unicode® support?🤞", "lib/tests/tar/unused/info", "lib/tests/tar/unused/pkg", "lib/tests/tar/unused/space in name.txt", diff --git "a/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" "b/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" new file mode 100644 index 000000000..388e04c99 --- /dev/null +++ "b/lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236" @@ -0,0 +1 @@ +💯 \ No newline at end of file From 4c2d148aa9174dc382209f06ddee24d33cfa24b3 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 21 Nov 2024 21:47:03 -0500 Subject: [PATCH 07/36] Update docstring to reflect greater confidence in mtree filtering logic Now that alternate and redundant escape sequences are canonicalized-away, we no longer need to be concerned about externally-defined mtree files using these and falsely mismatching with our chosen encoding scheme. --- docs/tar.md | 2 +- lib/private/tar.bzl | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/tar.md b/docs/tar.md index f9e82e507..1d3a05a5a 100644 --- a/docs/tar.md +++ b/docs/tar.md @@ -100,7 +100,7 @@ Rule that executes BSD `tar`. Most users should use the [`tar`](#tar) macro, rat | out | Resulting tar file to write. If absent, `[name].tar` is written. | Label | optional | `None` | | args | Additional flags permitted by BSD tar; see the man page. | List of strings | optional | `[]` | | compress | Compress the archive file with a supported algorithm. | String | optional | `""` | -| compute_unused_inputs | Whether to discover and prune input files that will not contribute to the archive.

Unused inputs are discovered by comparing the set of input files in `srcs` to the set of files referenced by `mtree`. Files not used for content by the mtree specification will not be read by the `tar` tool when creating the archive and can be pruned from the input set using the `unused_inputs_list` [mechanism](https://bazel.build/contribute/codebase#input-discovery).

Benefits: pruning unused input files can reduce the amount of work the build system must perform. Pruned files are not included in the action cache key; changes to them do not invalidate the cache entry, which can lead to higher cache hit rates. Actions do not need to block on the availability of pruned inputs, which can increase the available parallelism of builds. Pruned files do not need to be transferred to remote-execution workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The comparison performed between `srcs` and `mtree` is currently inexact and may fail to handle handwritten or externally-derived mtree specifications. However, it is safe to use this feature when the lines found in `mtree` are derived from one or more `mtree_spec` rules, filtered and/or merged on whole-line basis only.

Possible values:

- `compute_unused_inputs = 1`: Always perform unused input discovery and pruning. - `compute_unused_inputs = 0`: Never discover or prune unused inputs. - `compute_unused_inputs = -1`: Discovery and pruning of unused inputs is controlled by the --[no]@aspect_bazel_lib//lib:tar_compute_unused_inputs flag. | Integer | optional | `-1` | +| compute_unused_inputs | Whether to discover and prune input files that will not contribute to the archive.

Unused inputs are discovered by comparing the set of input files in `srcs` to the set of files referenced by `mtree`. Files not used for content by the mtree specification will not be read by the `tar` tool when creating the archive and can be pruned from the input set using the `unused_inputs_list` [mechanism](https://bazel.build/contribute/codebase#input-discovery).

Benefits: pruning unused input files can reduce the amount of work the build system must perform. Pruned files are not included in the action cache key; changes to them do not invalidate the cache entry, which can lead to higher cache hit rates. Actions do not need to block on the availability of pruned inputs, which can increase the available parallelism of builds. Pruned files do not need to be transferred to remote-execution workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The comparison performed between `srcs` and `mtree` is exact. There are no known circumstances where incorrect results are anticipated.

Possible values:

- `compute_unused_inputs = 1`: Always perform unused input discovery and pruning. - `compute_unused_inputs = 0`: Never discover or prune unused inputs. - `compute_unused_inputs = -1`: Discovery and pruning of unused inputs is controlled by the --[no]@aspect_bazel_lib//lib:tar_compute_unused_inputs flag. | Integer | optional | `-1` | | mode | A mode indicator from the following list, copied from the tar manpage:

- create: Create a new archive containing the specified items. - append: Like `create`, but new entries are appended to the archive. Note that this only works on uncompressed archives stored in regular files. The -f option is required. - list: List archive contents to stdout. - update: Like `append`, but new entries are added only if they have a modification date newer than the corresponding entry in the archive. Note that this only works on uncompressed archives stored in regular files. The -f option is required. - extract: Extract to disk from the archive. If a file with the same name appears more than once in the archive, each copy will be extracted, with later copies overwriting (replacing) earlier copies. | String | optional | `"create"` | | mtree | An mtree specification file | Label | required | | diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 17f3e178e..bbfbb1e6a 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -105,10 +105,8 @@ parallelism of builds. Pruned files do not need to be transferred to remote-exec workers, which can reduce network costs. Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The -comparison performed between `srcs` and `mtree` is currently inexact and may fail to -handle handwritten or externally-derived mtree specifications. However, it is safe to use -this feature when the lines found in `mtree` are derived from one or more `mtree_spec` -rules, filtered and/or merged on whole-line basis only. +comparison performed between `srcs` and `mtree` is exact. There are no known +circumstances where incorrect results are anticipated. 
Possible values: From 419cd8a4bd32a4db9c6c10db6407cde59a78b471 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 21 Nov 2024 22:38:19 -0500 Subject: [PATCH 08/36] Add a test covering canonicalization of external mtree specs --- lib/tests/tar/BUILD.bazel | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/tests/tar/BUILD.bazel b/lib/tests/tar/BUILD.bazel index e66604af4..6591bf3a1 100644 --- a/lib/tests/tar/BUILD.bazel +++ b/lib/tests/tar/BUILD.bazel @@ -469,3 +469,40 @@ assert_unused_listing( "lib/tests/tar/unused/space in name.txt", ], ) + +############# +# Example 16: custom mtree with alternate escape sequences +# In explicit or externally-derived mtree specs, +# there is no need to match the path encoding that would be produced by our mtree macro exactly. +# All escape sequences supported by bsdtar / libarchive will be understood. +# This includes \s for SPACE, octal encoding of characters that don't need it, and others. +# These alternate forms are not necessarily recommended, but they will work. 
+ +tar( + name = "tar16", + srcs = [":treeartifact"], + out = "16.tar", + compute_unused_inputs = 1, + mtree = [ + r"info uid=0 gid=0 time=1672560000 mode=0755 type=file content=$(location :treeartifact)/\151\156\146\157", + r"space\sin\sname.txt uid=0 gid=0 time=1672560000 mode=0755 type=file content=$(location :treeartifact)/space\sin\sname.txt", + ], +) + +assert_tar_listing( + name = "test_custom_mtree2", + actual = ":tar16", + expected = [ + "-rwxr-xr-x 0 0 0 0 Jan 1 2023 info", + "-rwxr-xr-x 0 0 0 1 Jan 1 2023 space in name.txt", + ], +) + +assert_unused_listing( + name = "test_unused_inputs_listed2", + actual = ":tar16", + expected = [ + "lib/tests/tar/treeartifact/Unicode® support?🤞", + "lib/tests/tar/treeartifact/pkg", + ], +) From 0a4bba682868eb41d004407b3bcac7cee1fca709 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Fri, 22 Nov 2024 13:18:41 -0500 Subject: [PATCH 09/36] Always generate archive listing in the POSIX locale Otherwise we may observe platform-dependent behaviour. --- lib/tests/tar/asserts.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tests/tar/asserts.bzl b/lib/tests/tar/asserts.bzl index 2884e5fd0..6b76cdbad 100644 --- a/lib/tests/tar/asserts.bzl +++ b/lib/tests/tar/asserts.bzl @@ -13,7 +13,7 @@ def assert_tar_listing(name, actual, expected): srcs = [actual], testonly = True, outs = ["_{}.listing".format(name)], - cmd = "$(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), + cmd = "LC_ALL=POSIX $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), toolchains = ["@bsd_tar_toolchains//:resolved_toolchain"], ) From 41b8d94b15051453e3edbc99dd613bdc14103215 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Fri, 22 Nov 2024 13:39:09 -0500 Subject: [PATCH 10/36] Use C.UTF-8 locale instead Listing creation on Ubuntu is failing with ``` tar: Pathname can't be converted from UTF-8 to current locale. ``` Using a locale with UTF-8 support should help. 
--- lib/tests/tar/asserts.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tests/tar/asserts.bzl b/lib/tests/tar/asserts.bzl index 6b76cdbad..42111ea4e 100644 --- a/lib/tests/tar/asserts.bzl +++ b/lib/tests/tar/asserts.bzl @@ -13,7 +13,7 @@ def assert_tar_listing(name, actual, expected): srcs = [actual], testonly = True, outs = ["_{}.listing".format(name)], - cmd = "LC_ALL=POSIX $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), + cmd = "LC_ALL=C.UTF-8 $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), toolchains = ["@bsd_tar_toolchains//:resolved_toolchain"], ) From e970283280e788692fee6fa76a899646b3ac79c4 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Fri, 22 Nov 2024 15:05:16 -0500 Subject: [PATCH 11/36] Use en_US encoding The POSIX and C.UTF-8 locales seem to work differently between my Mac OS dev machine and the Ubuntu CI machines. Rather than continuing to fight Ubuntu to convince it to emit escapes like Mac OS is doing, attempt the opposite and try to coerce Mac OS to behave like Ubuntu and not perform the escaping. 
--- lib/tests/tar/BUILD.bazel | 6 +++--- lib/tests/tar/asserts.bzl | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/tests/tar/BUILD.bazel b/lib/tests/tar/BUILD.bazel index 6591bf3a1..34a454b1f 100644 --- a/lib/tests/tar/BUILD.bazel +++ b/lib/tests/tar/BUILD.bazel @@ -227,12 +227,12 @@ assert_tar_listing( "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/", - r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/srcdir/Unicode\302\256 support?\360\237\244\236", + "-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/srcdir/Unicode® support?🤞", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/srcdir/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/srcdir/space in name.txt", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/", - r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode\302\256 support?\360\237\244\236", + "-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode® support?🤞", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/treeartifact/space in name.txt", @@ -452,7 +452,7 @@ assert_tar_listing( "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/", "drwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/", - r"-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode\302\256 support?\360\237\244\236", + "-rwxr-xr-x 0 0 0 4 Jan 1 2023 lib/tests/tar/treeartifact/Unicode® support?🤞", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/info", "-rwxr-xr-x 0 0 0 0 Jan 1 2023 lib/tests/tar/treeartifact/pkg", "-rwxr-xr-x 0 0 0 1 Jan 1 2023 lib/tests/tar/treeartifact/space in name.txt", diff --git a/lib/tests/tar/asserts.bzl b/lib/tests/tar/asserts.bzl index 42111ea4e..d5133a085 100644 --- 
a/lib/tests/tar/asserts.bzl +++ b/lib/tests/tar/asserts.bzl @@ -13,7 +13,7 @@ def assert_tar_listing(name, actual, expected): srcs = [actual], testonly = True, outs = ["_{}.listing".format(name)], - cmd = "LC_ALL=C.UTF-8 $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), + cmd = "LC_ALL=en_US $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), toolchains = ["@bsd_tar_toolchains//:resolved_toolchain"], ) From 522d76b7e67d2a8719a32038fbe0d7ccda617145 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Sun, 24 Nov 2024 11:13:02 -0500 Subject: [PATCH 12/36] Provide explanation for hacky use of en_US locale --- lib/tests/tar/asserts.bzl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/tests/tar/asserts.bzl b/lib/tests/tar/asserts.bzl index d5133a085..81a8aea9d 100644 --- a/lib/tests/tar/asserts.bzl +++ b/lib/tests/tar/asserts.bzl @@ -13,7 +13,11 @@ def assert_tar_listing(name, actual, expected): srcs = [actual], testonly = True, outs = ["_{}.listing".format(name)], + # HACK: under default and POSIX locales, MacOS 15.1 and Ubuntu 22.04 disagree on how files with Unicode filenames should be printed. + # LC_ALL=en_US may be inacurate, but by using a dense 8-bit, single-byte encoding, + # we achieve the effect of leaving the bytes alone and producing a consistent output to assert against. cmd = "LC_ALL=en_US $(BSDTAR_BIN) -tvf $(execpath {}) >$@".format(actual), + # toolchains = ["@bsd_tar_toolchains//:resolved_toolchain"], ) From c0c49f01d2a9e002ed16950622cfc2422c943bd3 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Sat, 7 Dec 2024 12:08:20 -0500 Subject: [PATCH 13/36] Move non-devtool rules out of quarantine package Only direct references to rules_go need to be isolated, not rules that merely transitively depend on that ruleset through references to other rules. 
--- lib/private/BUILD.bazel | 29 +++++++++++++++++++++ lib/private/gen_vis_scripts/BUILD.bazel | 34 +------------------------ 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index e49f11858..72a6c8421 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -1,5 +1,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("//lib:run_binary.bzl", "run_binary") load("//lib:utils.bzl", "is_bazel_7_or_greater") +load("//lib:write_source_files.bzl", "write_source_files") exports_files( [ @@ -382,3 +384,30 @@ bzl_library( srcs = ["zstd_toolchain.bzl"], visibility = ["//lib:__subpackages__"], ) + +run_binary( + name = "run_gen_vis_scripts", + outs = [ + "_unvis_canonical.sed", + "_vis_canonicalize.sed", + "_vis_escape_ascii.bzl", + "_vis_escape_nonascii.sed", + ], + args = [ + "unvis_canonical.sed=$(location _unvis_canonical.sed)", + "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", + "vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)", + "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", + ], + tool = "//lib/private/gen_vis_scripts", +) + +write_source_files( + name = "write_vis_scripts", + files = { + "unvis_canonical.sed": ":_unvis_canonical.sed", + "vis_canonicalize.sed": ":_vis_canonicalize.sed", + "vis_escape_ascii.bzl": ":_vis_escape_ascii.bzl", + "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed", + }, +) diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel index 3865f0dff..747e0d6d1 100644 --- a/lib/private/gen_vis_scripts/BUILD.bazel +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -1,39 +1,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("//lib:run_binary.bzl", "run_binary") -load("//lib:write_source_files.bzl", "write_source_files") go_binary( name = "gen_vis_scripts", srcs = ["gen_vis_scripts.go"], -) - -run_binary( - name = "run_gen_vis_scripts", - outs = [ - "unvis_canonical.sed", - 
"vis_canonicalize.sed", - "vis_escape_ascii.bzl", - "vis_escape_nonascii.sed", - ], - args = [ - "unvis_canonical.sed=$(location unvis_canonical.sed)", - "vis_canonicalize.sed=$(location vis_canonicalize.sed)", - "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)", - "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)", - ], - tool = ":gen_vis_scripts", -) - -write_source_files( - name = "write_vis_scripts", - - # Required to support cross-package references. - check_that_out_file_exists = False, - # - files = { - "//lib/private:unvis_canonical.sed": ":unvis_canonical.sed", - "//lib/private:vis_canonicalize.sed": ":vis_canonicalize.sed", - "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", - "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed", - }, + visibility = ["//lib/private:__pkg__"], ) From 7fe54b8e24633c9e2a0e9636c7bbead4b1489146 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Sat, 7 Dec 2024 12:15:54 -0500 Subject: [PATCH 14/36] Move script generator tool to live alongside other tools --- lib/private/BUILD.bazel | 2 +- {lib/private => tools}/gen_vis_scripts/BUILD.bazel | 0 {lib/private => tools}/gen_vis_scripts/gen_vis_scripts.go | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename {lib/private => tools}/gen_vis_scripts/BUILD.bazel (100%) rename {lib/private => tools}/gen_vis_scripts/gen_vis_scripts.go (100%) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 72a6c8421..c66b48c45 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -399,7 +399,7 @@ run_binary( "vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)", "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", ], - tool = "//lib/private/gen_vis_scripts", + tool = "//tools/gen_vis_scripts", ) write_source_files( diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/tools/gen_vis_scripts/BUILD.bazel similarity index 100% rename from lib/private/gen_vis_scripts/BUILD.bazel rename to 
tools/gen_vis_scripts/BUILD.bazel diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go similarity index 100% rename from lib/private/gen_vis_scripts/gen_vis_scripts.go rename to tools/gen_vis_scripts/gen_vis_scripts.go From 54694cebbc32cb2540f884d1009a91094e0bb4a1 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Sat, 7 Dec 2024 12:19:04 -0500 Subject: [PATCH 15/36] Remove unnecessary export to non-existent package --- lib/private/BUILD.bazel | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index c66b48c45..c1d372571 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -19,10 +19,7 @@ exports_files( exports_files( glob(["*.bzl"]), - visibility = [ - "//lib/private/docs:__pkg__", - "//lib/private/gen_vis_scripts:__pkg__", - ], + visibility = ["//lib/private/docs:__pkg__"], ) bzl_library( From 1aded37411c6a4f53e96f84924a6f5ee9ad30c80 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Sat, 7 Dec 2024 12:23:49 -0500 Subject: [PATCH 16/36] Associate gencode comment with the generation block it describes --- tools/gen_vis_scripts/gen_vis_scripts.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index dec86e422..b1c4277fc 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -114,10 +114,10 @@ s/\\v/\\013/g # NUL special form must be disambiguated from ordinary octal escape sequences. s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g - -# Remove octal escaping from characters that don't need it. 
`)) + fmt.Fprintln(w, "") + fmt.Fprintln(w, "# Remove octal escaping from characters that don't need it.") for i := 0; i <= 0xFF; i++ { b := byte(i) if shouldEscape(b) { From 250e4c81fd76cb9b1f5b4278bb03b141376170f8 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 17:27:19 -0500 Subject: [PATCH 17/36] Use correct octal escape sequence for backspace --- lib/private/vis_canonicalize.sed | 2 +- tools/gen_vis_scripts/gen_vis_scripts.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/private/vis_canonicalize.sed b/lib/private/vis_canonicalize.sed index 4944a8c94..ae8bf1be2 100644 --- a/lib/private/vis_canonicalize.sed +++ b/lib/private/vis_canonicalize.sed @@ -17,7 +17,7 @@ s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g s/\\a/\\007/g -s/\\b/\\008/g +s/\\b/\\010/g s/\\f/\\014/g s/\\n/\\012/g s/\\r/\\015/g diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index b1c4277fc..1349308d7 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -104,7 +104,7 @@ s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g s/\\a/\\007/g -s/\\b/\\008/g +s/\\b/\\010/g s/\\f/\\014/g s/\\n/\\012/g s/\\r/\\015/g From c25869390f9c2f97804cf69200a11aa55208e108 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 17:29:38 -0500 Subject: [PATCH 18/36] Escape ampersand in sed replacement text It has special meaning in this context and requires escaping to get a literal: > The replacement may contain the special character & to > refer to that portion of the pattern space which matched, > and the special escapes \1 through \9 to refer to the > corresponding matching sub-expressions in the regexp. 
--- lib/private/vis_canonicalize.sed | 2 +- tools/gen_vis_scripts/gen_vis_scripts.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/private/vis_canonicalize.sed b/lib/private/vis_canonicalize.sed index ae8bf1be2..dea6b2f0b 100644 --- a/lib/private/vis_canonicalize.sed +++ b/lib/private/vis_canonicalize.sed @@ -34,7 +34,7 @@ s/\\042/"/g s/\\043/#/g s/\\044/$/g s/\\045/%/g -s/\\046/&/g +s:\\046:\&:g s/\\047/'/g s/\\050/(/g s/\\051/)/g diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index 1349308d7..5700b52d9 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -125,6 +125,8 @@ s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g } if b == '/' { fmt.Fprintf(w, `s:\\%03[1]o:%[1]c:g%[2]c`, b, newline) + } else if b == '&' { + fmt.Fprintf(w, `s:\\%03[1]o:\%[1]c:g%[2]c`, b, newline) } else { fmt.Fprintf(w, `s/\\%03[1]o/%[1]c/g%[2]c`, b, newline) } From 2500bf42971b3eeefff829d4098f40fe2f812d4f Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 17:31:48 -0500 Subject: [PATCH 19/36] Add test for vis encoding utilities --- lib/tests/BUILD.bazel | 28 +++ lib/tests/vis_encoding.bats | 356 ++++++++++++++++++++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 lib/tests/vis_encoding.bats diff --git a/lib/tests/BUILD.bazel b/lib/tests/BUILD.bazel index 5e98a2a8f..692e8a575 100644 --- a/lib/tests/BUILD.bazel +++ b/lib/tests/BUILD.bazel @@ -12,6 +12,7 @@ load(":lists_test.bzl", "lists_test_suite") load(":paths_test.bzl", "paths_test_suite") load(":strings_tests.bzl", "strings_test_suite") load(":utils_test.bzl", "utils_test_suite") +load("//lib:bats.bzl", "bats_test") exports_files(["a.js"]) @@ -76,3 +77,30 @@ bzl_library( srcs = ["generate_outputs.bzl"], visibility = ["//visibility:public"], ) + +genrule( + name = "coreutils", + toolchains = [ + "@coreutils_toolchains//:resolved_toolchain", + ], + outs = ["coreutils_bin"], + cmd = 
"cp $(COREUTILS_BIN) $@", +) + +bats_test( + name = "vis_encoding", + srcs = ["vis_encoding.bats"], + size = "small", + data = [ + "//lib/private:vis_escape_nonascii.sed", + "//lib/private:unvis_canonical.sed", + "//lib/private:vis_canonicalize.sed", + ":coreutils", + ], + env = { + "VIS_ESCAPE_NONASCII": "$(location //lib/private:vis_escape_nonascii.sed)", + "UNVIS_CANONICAL": "$(location //lib/private:unvis_canonical.sed)", + "VIS_CANONICALIZE": "$(location //lib/private:vis_canonicalize.sed)", + "COREUTILS": "$(rootpath :coreutils)", + } +) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats new file mode 100644 index 000000000..52770065e --- /dev/null +++ b/lib/tests/vis_encoding.bats @@ -0,0 +1,356 @@ +# Tests of the vis encoding support scripts. +# +# Most test cases make use of the fact that newline characters are passed through verbatim by all of these scripts. +# For this reason, paragraph-delimited records of newline-delimited fields is a natural framing structure that will +# be preserved through the encoding/decoding/canonicalizing transformation. + +gawk() { + # TODO: from toolchain + /opt/homebrew/bin/gawk "$@" +} +gsed() { + # TODO: replace with AWK + /opt/homebrew/bin/gsed "$@" +} +cat() { + "$COREUTILS" cat "$@" +} +tr() { + "$COREUTILS" tr "$@" +} +basenc() { + "$COREUTILS" basenc "$@" +} +od() { + "$COREUTILS" od "$@" +} +paste() { + "$COREUTILS" paste "$@" +} + +@test "vis encode passthrough text" { + cat <<'EOF' >"$BATS_TEST_TMPDIR/input" +Newlines (\n), backslahes (\\), and graphical ASCII ([[:graph:]]) characters are passed through unencoded. +Upstream encoders should escape the first two in content they feed to the general encoder. + + Newline => \012 + Backslash => \134 + +These gaps enable our encoder to operate on newline-delimited records of vis-encoded content. +EOF + + gsed -f "$VIS_ESCAPE_NONASCII" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + # Content chosen to pass through encoder unmodified... 
mostly (except spaces, which must be patched up). + cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" + + # Spaces _do_ get escaped; undo that so that text under comparison remains human-friendly. + gawk '{ gsub(/[\\]040/, " "); print }' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis encode each byte" { + gawk -v OFS="0A" -v ORS="0A0A" '{ $1 = $1; print }' <<'EOF' | basenc --decode --base16 >"$BATS_TEST_TMPDIR/input" +00 01 02 03 04 05 06 07 08 09 0B 0C 0D 0E 0F +10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F +20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F +30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F +40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F +50 51 52 53 54 55 56 57 58 59 5A 5B 5D 5E 5F +60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F +70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F +80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F +90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F +A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF +B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF +C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF +D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF +E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF +F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF +EOF + + gsed -f "$VIS_ESCAPE_NONASCII" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + gawk -v FS='\n' -v RS='\n\n' ' + NR == rshift(0x00, 4) + 1 { for (i = NF; i > 0x0A; i--) $(i+1) = $(i); $(0x0A+1) = "" } # Newline gap + NR == rshift(0x50, 4) + 1 { for (i = NF; i > 0x0C; i--) $(i+1) = $(i); $(0x0C+1) = "" } # Backslash gap + { for (i = 1; i <= NF; i++) printf "%4s%s", $(i), i == NF ? ORS : OFS } # Emit table with fixed-width columns. 
+ ' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cat <<'EOF' >"$BATS_TEST_TMPDIR/want" +\000 \001 \002 \003 \004 \005 \006 \007 \010 \011 \013 \014 \015 \016 \017 +\020 \021 \022 \023 \024 \025 \026 \027 \030 \031 \032 \033 \034 \035 \036 \037 +\040 ! " # $ % & ' ( ) * + , - . / + 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + @ A B C D E F G H I J K L M N O + P Q R S T U V W X Y Z [ ] ^ _ + ` a b c d e f g h i j k l m n o + p q r s t u v w x y z { | } ~ \177 +\200 \201 \202 \203 \204 \205 \206 \207 \210 \211 \212 \213 \214 \215 \216 \217 +\220 \221 \222 \223 \224 \225 \226 \227 \230 \231 \232 \233 \234 \235 \236 \237 +\240 \241 \242 \243 \244 \245 \246 \247 \250 \251 \252 \253 \254 \255 \256 \257 +\260 \261 \262 \263 \264 \265 \266 \267 \270 \271 \272 \273 \274 \275 \276 \277 +\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 \315 \316 \317 +\320 \321 \322 \323 \324 \325 \326 \327 \330 \331 \332 \333 \334 \335 \336 \337 +\340 \341 \342 \343 \344 \345 \346 \347 \350 \351 \352 \353 \354 \355 \356 \357 +\360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 +EOF + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis decode passthrough text" { + cat <<'EOF' >"$BATS_TEST_TMPDIR/input" +All text that is not an 3-digit octal escape sequence is passed through the decoder. +This includes backslashes (\), even those part of special forms sometimes recognized elsewhere (e.g. \n, \r, \v, \0, etc.). +EOF + + gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + + # Content chosen to pass through encoder unmodified. 
+ cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis decode passthrough all non-escape-sequence bytes" { + tr -d ' \n' <<'EOF' | basenc --decode --base16 >"$BATS_TEST_TMPDIR/input" +00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F +10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F +20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F +30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F +40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F +50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F +60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F +70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F +80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F +90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F +A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF +B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF +C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF +D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF +E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF +F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF +EOF + + gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + # Decoded content contains unprintable control characters. Diff the hexdump instead. 
+ od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cat <<'EOF' >"$BATS_TEST_TMPDIR/want" +000000 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +000010 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f +000020 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f +000030 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f +000040 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f +000050 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f +000060 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f +000070 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f +000080 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f +000090 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f +0000A0 a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af +0000B0 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf +0000C0 c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf +0000D0 d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df +0000E0 e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef +0000F0 f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff +000100 +EOF + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis decode all octal escape-sequences" { + tr -d ' \n' <<'EOF' >"$BATS_TEST_TMPDIR/input" +\000 \001 \002 \003 \004 \005 \006 \007 \010 \011 \012 \013 \014 \015 \016 \017 +\020 \021 \022 \023 \024 \025 \026 \027 \030 \031 \032 \033 \034 \035 \036 \037 +\040 \041 \042 \043 \044 \045 \046 \047 \050 \051 \052 \053 \054 \055 \056 \057 +\060 \061 \062 \063 \064 \065 \066 \067 \070 \071 \072 \073 \074 \075 \076 \077 +\100 \101 \102 \103 \104 \105 \106 \107 \110 \111 \112 \113 \114 \115 \116 \117 +\120 \121 \122 \123 \124 \125 \126 \127 \130 \131 \132 \133 \134 \135 \136 \137 +\140 \141 \142 \143 \144 \145 \146 \147 \150 \151 \152 \153 \154 \155 \156 \157 +\160 \161 \162 \163 \164 \165 \166 \167 \170 \171 \172 \173 \174 \175 \176 \177 +\200 \201 \202 \203 \204 \205 \206 \207 \210 \211 \212 \213 \214 \215 \216 \217 +\220 \221 \222 \223 \224 \225 \226 \227 \230 \231 \232 \233 \234 \235 \236 \237 +\240 \241 
\242 \243 \244 \245 \246 \247 \250 \251 \252 \253 \254 \255 \256 \257 +\260 \261 \262 \263 \264 \265 \266 \267 \270 \271 \272 \273 \274 \275 \276 \277 +\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 \315 \316 \317 +\320 \321 \322 \323 \324 \325 \326 \327 \330 \331 \332 \333 \334 \335 \336 \337 +\340 \341 \342 \343 \344 \345 \346 \347 \350 \351 \352 \353 \354 \355 \356 \357 +\360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 +EOF + + gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + # Decoded content contains unprintable control characters. Diff the hexdump instead. + od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cat <<'EOF' >"$BATS_TEST_TMPDIR/want" +000000 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +000010 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f +000020 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f +000030 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f +000040 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f +000050 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f +000060 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f +000070 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f +000080 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f +000090 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f +0000A0 a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af +0000B0 b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf +0000C0 c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf +0000D0 d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df +0000E0 e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef +0000F0 f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff +000100 +EOF + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis canonicalize passthrough already-canonical" { + cat <<'EOF' >"$BATS_TEST_TMPDIR/input.table" +\000 \001 \002 \003 \004 \005 \006 \007 \010 \011 \012 \013 \014 \015 \016 \017 +\020 \021 \022 \023 \024 \025 \026 \027 \030 \031 \032 \033 \034 \035 \036 
\037 +\040 ! " # $ % & ' ( ) * + , - . / + 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + @ A B C D E F G H I J K L M N O + P Q R S T U V W X Y Z [ \134 ] ^ _ + ` a b c d e f g h i j k l m n o + p q r s t u v w x y z { | } ~ \177 +\200 \201 \202 \203 \204 \205 \206 \207 \210 \211 \212 \213 \214 \215 \216 \217 +\220 \221 \222 \223 \224 \225 \226 \227 \230 \231 \232 \233 \234 \235 \236 \237 +\240 \241 \242 \243 \244 \245 \246 \247 \250 \251 \252 \253 \254 \255 \256 \257 +\260 \261 \262 \263 \264 \265 \266 \267 \270 \271 \272 \273 \274 \275 \276 \277 +\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 \315 \316 \317 +\320 \321 \322 \323 \324 \325 \326 \327 \330 \331 \332 \333 \334 \335 \336 \337 +\340 \341 \342 \343 \344 \345 \346 \347 \350 \351 \352 \353 \354 \355 \356 \357 +\360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 +EOF + gawk -v OFS='\n' -v ORS='\n\n' '{ $1 = $1; print }' <"$BATS_TEST_TMPDIR/input.table" >"$BATS_TEST_TMPDIR/input" + + gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + gawk -v FS='\n' -v RS='\n\n' ' + { for (i = 1; i <= NF; i++) printf "%4s%s", $(i), i == NF ? ORS : OFS } # Emit table with fixed-width columns. + ' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + # Content chosen to pass through encoder unmodified. 
+ cp "$BATS_TEST_TMPDIR/input.table" "$BATS_TEST_TMPDIR/want" + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis canonicalize unnecessarily escaped" { + gawk -v OFS='\n' -v ORS='\n\n' '{ $1 = $1; print }' <<'EOF' >"$BATS_TEST_TMPDIR/input" + \041 \042 \043 \044 \045 \046 \047 \050 \051 \052 \053 \054 \055 \056 \057 +\060 \061 \062 \063 \064 \065 \066 \067 \070 \071 \072 \073 \074 \075 \076 \077 +\100 \101 \102 \103 \104 \105 \106 \107 \110 \111 \112 \113 \114 \115 \116 \117 +\120 \121 \122 \123 \124 \125 \126 \127 \130 \131 \132 \133 \135 \136 \137 +\140 \141 \142 \143 \144 \145 \146 \147 \150 \151 \152 \153 \154 \155 \156 \157 +\160 \161 \162 \163 \164 \165 \166 \167 \170 \171 \172 \173 \174 \175 \176 +EOF + + gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + gawk -v FS='\n' -v RS='\n\n' ' + NR == rshift(0x20 - 0x20, 4) + 1 { for (i = NF; i > 0x00; i--) $(i+1) = $(i); $(0x00+1) = "" } # Space gap + NR == rshift(0x50 - 0x20, 4) + 1 { for (i = NF; i > 0x0C; i--) $(i+1) = $(i); $(0x0C+1) = "" } # Backslash gap + NR == rshift(0x70 - 0x20, 4) + 1 { for (i = NF; i > 0x0F; i--) $(i+1) = $(i); $(0x0F+1) = "" } # Delete gap + { for (i = 1; i <= NF; i++) printf "%1s%s", $(i), i == NF ? ORS : OFS } # Emit table with fixed-width columns. + ' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cat <<'EOF' >"$BATS_TEST_TMPDIR/want" + ! " # $ % & ' ( ) * + , - . / +0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +EOF + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis canonicalize unescaped" { + gawk -v OFS='0A' -v ORS='0A0A' '{ $1 = $1; print }' <<'EOF' | basenc --decode --base16 >"$BATS_TEST_TMPDIR/input" +00 01 02 03 04 05 06 07 08 09 0B 0C 0D 0E 0F +10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F +20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F +30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F +40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F +50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F +60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F +70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F +80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F +90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F +A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF +B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF +C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF +D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF +E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF +F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF +EOF + + gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + + gawk -v FS='\n' -v RS='\n\n' ' + NR == rshift(0x00, 4) + 1 { for (i = NF; i > 0x0A; i--) $(i+1) = $(i); $(0x0A+1) = "" } # Newline gap + { for (i = 1; i <= NF; i++) printf "%4s%s", $(i), i == NF ? ORS : OFS } # Emit table with fixed-width columns. + ' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" + + cat <<'EOF' >"$BATS_TEST_TMPDIR/want" +\000 \001 \002 \003 \004 \005 \006 \007 \010 \011 \013 \014 \015 \016 \017 +\020 \021 \022 \023 \024 \025 \026 \027 \030 \031 \032 \033 \034 \035 \036 \037 +\040 ! " # $ % & ' ( ) * + , - . / + 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+ @ A B C D E F G H I J K L M N O + P Q R S T U V W X Y Z [ \134 ] ^ _ + ` a b c d e f g h i j k l m n o + p q r s t u v w x y z { | } ~ \177 +\200 \201 \202 \203 \204 \205 \206 \207 \210 \211 \212 \213 \214 \215 \216 \217 +\220 \221 \222 \223 \224 \225 \226 \227 \230 \231 \232 \233 \234 \235 \236 \237 +\240 \241 \242 \243 \244 \245 \246 \247 \250 \251 \252 \253 \254 \255 \256 \257 +\260 \261 \262 \263 \264 \265 \266 \267 \270 \271 \272 \273 \274 \275 \276 \277 +\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 \315 \316 \317 +\320 \321 \322 \323 \324 \325 \326 \327 \330 \331 \332 \333 \334 \335 \336 \337 +\340 \341 \342 \343 \344 \345 \346 \347 \350 \351 \352 \353 \354 \355 \356 \357 +\360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 +EOF + + cd "$BATS_TEST_TMPDIR" + diff -u want output +} + +@test "vis canonicalize special forms" { + cat <<'EOF' >"$BATS_TEST_TMPDIR/input_want" +\0 \000 +\ \134 +\\ \134 +\a \007 +\b \010 +\f \014 +\n \012 +\r \015 +\s \040 +\t \011 +\v \013 +EOF + cut -f1 <"$BATS_TEST_TMPDIR/input_want" >"$BATS_TEST_TMPDIR/input" + + gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + + paste "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/output" >"$BATS_TEST_TMPDIR/input_output" + + cd "$BATS_TEST_TMPDIR" + diff -u input_want input_output +} From fa4f75d717e070f04a15fe84b9efecaf04045de0 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 19:09:52 -0500 Subject: [PATCH 20/36] Drop most in-process vis encoding If we need an out-of-process encoder anyways, might as well lean in to that. Also, it makes that out-of-process encoder a more sensible utility in a vacuum. 
--- lib/private/BUILD.bazel | 4 - lib/private/strings.bzl | 103 ----------------------- lib/private/tar.bzl | 4 +- lib/private/vis_escape_ascii.bzl | 42 --------- lib/tests/strings_tests.bzl | 26 +----- tools/gen_vis_scripts/gen_vis_scripts.go | 21 ----- 6 files changed, 2 insertions(+), 198 deletions(-) delete mode 100644 lib/private/vis_escape_ascii.bzl diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index c1d372571..a8aa3de16 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -286,7 +286,6 @@ bzl_library( name = "tar", srcs = [ "tar.bzl", - "vis_escape_ascii.bzl", ], visibility = ["//lib:__subpackages__"], deps = [ @@ -387,13 +386,11 @@ run_binary( outs = [ "_unvis_canonical.sed", "_vis_canonicalize.sed", - "_vis_escape_ascii.bzl", "_vis_escape_nonascii.sed", ], args = [ "unvis_canonical.sed=$(location _unvis_canonical.sed)", "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", - "vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)", "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", ], tool = "//tools/gen_vis_scripts", @@ -404,7 +401,6 @@ write_source_files( files = { "unvis_canonical.sed": ":_unvis_canonical.sed", "vis_canonicalize.sed": ":_vis_canonicalize.sed", - "vis_escape_ascii.bzl": ":_vis_escape_ascii.bzl", "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed", }, ) diff --git a/lib/private/strings.bzl b/lib/private/strings.bzl index 479b0cd10..acf5157ff 100644 --- a/lib/private/strings.bzl +++ b/lib/private/strings.bzl @@ -1,7 +1,5 @@ "String utilities" -load("@bazel_skylib//lib:types.bzl", "types") - CHAR_TO_INT = { "\0": 0, "\1": 1, @@ -655,104 +653,3 @@ def split_args(s): if arg != "": args.append(arg) return args - -def maketrans(x): - """ - Return a translation table usable with translate(). - - Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans) - of the same name. 
- - Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not - possible. Entries for characters outside this range will trigger a failure. - - Args: - x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings) - to Unicode ordinals, strings, or None. Character keys will be converted to ordinals. - - Returns: - dict. The translation table. - """ - - if not types.is_dict(x): - fail("if you give only one argument to maketrans it must be a dict") - - table = {} - - for (k, v) in x.items(): - if types.is_int(k): - if k > 0xFF: - fail("most Unicode is unsupported") - table[k] = v - elif types.is_string(k): - if len(k) != 1: - fail("string keys in translate table must be of length 1") - codepoint = ord(k) - if codepoint == None: - fail("could not compute ord('{}'), most Unicode is unsupported".format(k)) - table[codepoint] = v - else: - fail("keys in translate table must be strings or integers") - - return table - -def translate(s, table): - """ - Replace characters a string according to a translation table. - - Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate) - of the same name. - - Characters with entries in the table are replaced in the output. - Characters mapped to None are deleted. - Characters absent from the table are mirrored to the output untouched. - - Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not - possible. Characters outside this range will be silently mirrored to the output without consulting - the translation table. - - Args: - s: str. Input string upon which to perform replacements. - table: dict. Translation table. Maps from Unicode ordinals (ints) keys to other Unicode ordinals, strings, or None. - - Returns: - str. Output string derived from input string with substitutions and deletions applied from table. 
- """ - - if not types.is_string(s): - fail("first argument to translate must be a string") - if not types.is_dict(table): - fail("second argument to translate must be a dict") - - parts = [] - lit_start = None # Index of start of current run of literal (i.e. no-op translation) content, or None. - for (i, c) in enumerate(s.elems()): - codepoint = ord(c) - if codepoint != None and codepoint in table: - # Terminate the current literal run, if any. - if lit_start != None: - parts.append(s[lit_start:i]) - lit_start = None - - replacement = table[codepoint] - if replacement == None: - pass - elif types.is_int(replacement): - parts.append(chr(replacement)) - elif types.is_string(replacement): - parts.append(replacement) - else: - fail("character mapping must return integer, None or str") - - else: # No entry in translation table. - if lit_start == None: - lit_start = i - - # Flush the caudal literal run, if any. - if lit_start != None: - parts.append(s[lit_start:]) - lit_start = None - - if len(parts) == 1: - return parts[0] - return "".join(parts) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index bbfbb1e6a..b5e178baf 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -2,8 +2,6 @@ load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo") load("//lib:paths.bzl", "to_repository_relative_path") -load(":strings.bzl", str_translate = "translate") -load(":vis_escape_ascii.bzl", "VIS_ESCAPE_ASCII") TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type" @@ -381,7 +379,7 @@ def _to_rlocation_path(file, workspace): def _vis_encode(filename): # Escaping of non-ASCII bytes cannot be performed within Starlark. # After writing content out, a second pass is performed with vis_escape_nonascii.sed. 
- return str_translate(filename, VIS_ESCAPE_ASCII) + return filename.replace("\\", "\\134").replace("\n", "\\012") def _expand(file, expander, transform = to_repository_relative_path): expanded = expander.expand(file) diff --git a/lib/private/vis_escape_ascii.bzl b/lib/private/vis_escape_ascii.bzl deleted file mode 100644 index eac44ff00..000000000 --- a/lib/private/vis_escape_ascii.bzl +++ /dev/null @@ -1,42 +0,0 @@ -# Code generated by gen_vis_scripts. DO NOT EDIT. -"A translation table for vis-encoding the ASCII range for mtree." - -load(":strings.bzl", "maketrans") - -VIS_ESCAPE_ASCII = maketrans({ - 0: r"\000", - 1: r"\001", - 2: r"\002", - 3: r"\003", - 4: r"\004", - 5: r"\005", - 6: r"\006", - 7: r"\007", - 8: r"\010", - 9: r"\011", - 10: r"\012", - 11: r"\013", - 12: r"\014", - 13: r"\015", - 14: r"\016", - 15: r"\017", - 16: r"\020", - 17: r"\021", - 18: r"\022", - 19: r"\023", - 20: r"\024", - 21: r"\025", - 22: r"\026", - 23: r"\027", - 24: r"\030", - 25: r"\031", - 26: r"\032", - 27: r"\033", - 28: r"\034", - 29: r"\035", - 30: r"\036", - 31: r"\037", - 32: r"\040", - 92: r"\134", - 127: r"\177", -}) diff --git a/lib/tests/strings_tests.bzl b/lib/tests/strings_tests.bzl index 243f48d48..177dae286 100644 --- a/lib/tests/strings_tests.bzl +++ b/lib/tests/strings_tests.bzl @@ -2,7 +2,7 @@ load("@bazel_skylib//lib:partial.bzl", "partial") load("@bazel_skylib//lib:unittest.bzl", "asserts", "unittest") -load("//lib/private:strings.bzl", "chr", "hex", "maketrans", "ord", "split_args", "translate") +load("//lib/private:strings.bzl", "chr", "hex", "ord", "split_args") def _ord_test_impl(ctx): env = unittest.begin(ctx) @@ -83,29 +83,6 @@ def _split_args_test_impl(ctx): split_args_test = unittest.make(_split_args_test_impl) -def _translate_test_impl(ctx): - env = unittest.begin(ctx) - - table = maketrans({ - "<": ">", - "!": None, - }) - - asserts.equals(env, "...", translate("...", table)) - asserts.equals(env, ">..", translate("<..", table)) - 
asserts.equals(env, ".>.", translate(".<.", table)) - asserts.equals(env, "..>", translate("..<", table)) - asserts.equals(env, "..", translate("!..", table)) - asserts.equals(env, "..", translate(".!.", table)) - asserts.equals(env, "..", translate("..!", table)) - asserts.equals(env, ">>>", translate("<<<", table)) - asserts.equals(env, "", translate("!!!", table)) - asserts.equals(env, ".>", translate(". unicode.MaxASCII || unicode.IsSpace(rune(b)) || !unicode.IsPrint(rune(b)) } -func writeEscapeASCIIBzl(w io.Writer) { - fmt.Fprintln(w, strings.TrimSpace(` -# Code generated by gen_vis_scripts. DO NOT EDIT. -"A translation table for vis-encoding the ASCII range for mtree." - -load(":strings.bzl", "maketrans") - -VIS_ESCAPE_ASCII = maketrans({ - `)) - - for i := 0; i <= unicode.MaxASCII; i++ { - b := byte(i) - if shouldEscape(b) { - fmt.Fprintf(w, ` %[1]d: r"\%03[1]o",%[2]c`, b, newline) - } - } - fmt.Fprintln(w, "})") -} - func writeEscapeNonASCIISed(w io.Writer) { fmt.Fprintln(w, strings.TrimSpace(` # Code generated by gen_vis_scripts. DO NOT EDIT. From 074635c4cbe2fae59850786a9fc9beb8ee3b4bcb Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 20:24:05 -0500 Subject: [PATCH 21/36] Generalize unvis script to decode all octal escape sequences Limiting to only canonically-valid escape sequences is perhaps a minor efficiency, but makes the whole thing more awkward to think about. Better to be more universal and simpler. 
--- lib/private/BUILD.bazel | 8 +- lib/private/tar.bzl | 8 +- .../{unvis_canonical.sed => unvis.sed} | 95 ++++++++++++++++++- lib/tests/BUILD.bazel | 4 +- lib/tests/vis_encoding.bats | 6 +- tools/gen_vis_scripts/gen_vis_scripts.go | 11 +-- 6 files changed, 111 insertions(+), 21 deletions(-) rename lib/private/{unvis_canonical.sed => unvis.sed} (64%) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index a8aa3de16..bf9704df8 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -10,7 +10,7 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", - "unvis_canonical.sed", + "unvis.sed", "vis_canonicalize.sed", "vis_escape_nonascii.sed", ], @@ -384,12 +384,12 @@ bzl_library( run_binary( name = "run_gen_vis_scripts", outs = [ - "_unvis_canonical.sed", + "_unvis.sed", "_vis_canonicalize.sed", "_vis_escape_nonascii.sed", ], args = [ - "unvis_canonical.sed=$(location _unvis_canonical.sed)", + "unvis.sed=$(location _unvis.sed)", "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", ], @@ -399,7 +399,7 @@ run_binary( write_source_files( name = "write_vis_scripts", files = { - "unvis_canonical.sed": ":_unvis_canonical.sed", + "unvis.sed": ":_unvis.sed", "vis_canonicalize.sed": ":_vis_canonicalize.sed", "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed", }, diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index b5e178baf..fede85de1 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -117,7 +117,7 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), - "_unvis_canonical": attr.label(allow_single_file = True, default = Label("//lib/private:unvis_canonical.sed")), + "_unvis": attr.label(allow_single_file = True, default = Label("//lib/private:unvis.sed")), "_vis_canonicalize": attr.label(allow_single_file = True, default = 
Label("//lib/private:vis_canonicalize.sed")), "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } @@ -250,7 +250,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): prunable_inputs, keep_inputs, ctx.file.mtree, - ctx.file._unvis_canonical, + ctx.file._unvis, ctx.file._vis_canonicalize, ctx.file._vis_escape_nonascii, ], @@ -265,7 +265,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): ) \\ <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ - | sed -f "$UNVIS_CANONICAL" \\ + | sed -f "$UNVIS" \\ > "$UNUSED_INPUTS" ''', env = { @@ -274,7 +274,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, - "UNVIS_CANONICAL": ctx.file._unvis_canonical.path, + "UNVIS": ctx.file._unvis.path, "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path, "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, diff --git a/lib/private/unvis_canonical.sed b/lib/private/unvis.sed similarity index 64% rename from lib/private/unvis_canonical.sed rename to lib/private/unvis.sed index 9d6ec7e79..3f0488f85 100644 --- a/lib/private/unvis_canonical.sed +++ b/lib/private/unvis.sed @@ -1,6 +1,6 @@ # Code generated by gen_vis_scripts. DO NOT EDIT. # Replace octal escape sequences with the bytes they represent. -# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed +# NOTE: not a fully general unvis program. 
s/\\000/\x00/g s/\\001/\x01/g @@ -35,6 +35,99 @@ s/\\035/\x1d/g s/\\036/\x1e/g s/\\037/\x1f/g s/\\040/\x20/g +s/\\041/\x21/g +s/\\042/\x22/g +s/\\043/\x23/g +s/\\044/\x24/g +s/\\045/\x25/g +s/\\046/\x26/g +s/\\047/\x27/g +s/\\050/\x28/g +s/\\051/\x29/g +s/\\052/\x2a/g +s/\\053/\x2b/g +s/\\054/\x2c/g +s/\\055/\x2d/g +s/\\056/\x2e/g +s/\\057/\x2f/g +s/\\060/\x30/g +s/\\061/\x31/g +s/\\062/\x32/g +s/\\063/\x33/g +s/\\064/\x34/g +s/\\065/\x35/g +s/\\066/\x36/g +s/\\067/\x37/g +s/\\070/\x38/g +s/\\071/\x39/g +s/\\072/\x3a/g +s/\\073/\x3b/g +s/\\074/\x3c/g +s/\\075/\x3d/g +s/\\076/\x3e/g +s/\\077/\x3f/g +s/\\100/\x40/g +s/\\101/\x41/g +s/\\102/\x42/g +s/\\103/\x43/g +s/\\104/\x44/g +s/\\105/\x45/g +s/\\106/\x46/g +s/\\107/\x47/g +s/\\110/\x48/g +s/\\111/\x49/g +s/\\112/\x4a/g +s/\\113/\x4b/g +s/\\114/\x4c/g +s/\\115/\x4d/g +s/\\116/\x4e/g +s/\\117/\x4f/g +s/\\120/\x50/g +s/\\121/\x51/g +s/\\122/\x52/g +s/\\123/\x53/g +s/\\124/\x54/g +s/\\125/\x55/g +s/\\126/\x56/g +s/\\127/\x57/g +s/\\130/\x58/g +s/\\131/\x59/g +s/\\132/\x5a/g +s/\\133/\x5b/g +s/\\135/\x5d/g +s/\\136/\x5e/g +s/\\137/\x5f/g +s/\\140/\x60/g +s/\\141/\x61/g +s/\\142/\x62/g +s/\\143/\x63/g +s/\\144/\x64/g +s/\\145/\x65/g +s/\\146/\x66/g +s/\\147/\x67/g +s/\\150/\x68/g +s/\\151/\x69/g +s/\\152/\x6a/g +s/\\153/\x6b/g +s/\\154/\x6c/g +s/\\155/\x6d/g +s/\\156/\x6e/g +s/\\157/\x6f/g +s/\\160/\x70/g +s/\\161/\x71/g +s/\\162/\x72/g +s/\\163/\x73/g +s/\\164/\x74/g +s/\\165/\x75/g +s/\\166/\x76/g +s/\\167/\x77/g +s/\\170/\x78/g +s/\\171/\x79/g +s/\\172/\x7a/g +s/\\173/\x7b/g +s/\\174/\x7c/g +s/\\175/\x7d/g +s/\\176/\x7e/g s/\\177/\x7f/g s/\\200/\x80/g s/\\201/\x81/g diff --git a/lib/tests/BUILD.bazel b/lib/tests/BUILD.bazel index 692e8a575..a6dd2228d 100644 --- a/lib/tests/BUILD.bazel +++ b/lib/tests/BUILD.bazel @@ -93,13 +93,13 @@ bats_test( size = "small", data = [ "//lib/private:vis_escape_nonascii.sed", - "//lib/private:unvis_canonical.sed", + "//lib/private:unvis.sed", "//lib/private:vis_canonicalize.sed", 
":coreutils", ], env = { "VIS_ESCAPE_NONASCII": "$(location //lib/private:vis_escape_nonascii.sed)", - "UNVIS_CANONICAL": "$(location //lib/private:unvis_canonical.sed)", + "UNVIS": "$(location //lib/private:unvis.sed)", "VIS_CANONICALIZE": "$(location //lib/private:vis_canonicalize.sed)", "COREUTILS": "$(rootpath :coreutils)", } diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 52770065e..ade1d1fc2 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -108,7 +108,7 @@ All text that is not an 3-digit octal escape sequence is passed through the deco This includes backslashes (\), even those part of special forms sometimes recognized elsewhere (e.g. \n, \r, \v, \0, etc.). EOF - gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" # Content chosen to pass through encoder unmodified. cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" @@ -137,7 +137,7 @@ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF EOF - gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" # Decoded content contains unprintable control characters. Diff the hexdump instead. od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" @@ -186,7 +186,7 @@ EOF \360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 EOF - gsed -f "$UNVIS_CANONICAL" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" # Decoded content contains unprintable control characters. Diff the hexdump instead. 
od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index 0cb959373..7a85a2510 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -28,8 +28,8 @@ func main() { writeEscapeNonASCIISed(f) case "vis_canonicalize.sed": writeVisCanonicalizeSed(f) - case "unvis_canonical.sed": - writeUnvisCanonicalSed(f) + case "unvis.sed": + writeUnvisSed(f) default: log.Fatal("unknown generated content:", name) } @@ -125,11 +125,11 @@ s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g } } -func writeUnvisCanonicalSed(w io.Writer) { +func writeUnvisSed(w io.Writer) { fmt.Fprintln(w, strings.TrimSpace(` # Code generated by gen_vis_scripts. DO NOT EDIT. # Replace octal escape sequences with the bytes they represent. -# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed +# NOTE: not a fully general unvis program. `)) fmt.Fprintln(w, "") @@ -138,9 +138,6 @@ func writeUnvisCanonicalSed(w io.Writer) { if b == '\\' { continue } - if !shouldEscape(b) { - continue - } fmt.Fprintf(w, `s/\\%03[1]o/\x%02[1]x/g%[2]c`, b, newline) } fmt.Fprintln(w, "") From c599e4ce1e5a9f93c5678dfd34598e28ba817772 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 20:42:25 -0500 Subject: [PATCH 22/36] Generalize vis_escape script to encode nearly all octal escape sequences Limiting to only the non-ASCII set creates a lot of complexity in Starlark. If we need an out-of-process escaper anyways, we might as well do all, or nearly all, of our escaping there and drive down complexity elsewhere. 
--- lib/private/BUILD.bazel | 8 ++-- lib/private/tar.bzl | 36 ++++++++--------- ...vis_escape_nonascii.sed => vis_escape.sed} | 39 ++++++++++++++++++- lib/tests/BUILD.bazel | 4 +- lib/tests/vis_encoding.bats | 4 +- tools/gen_vis_scripts/gen_vis_scripts.go | 22 +++++++---- 6 files changed, 78 insertions(+), 35 deletions(-) rename lib/private/{vis_escape_nonascii.sed => vis_escape.sed} (73%) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index bf9704df8..2e264a13d 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -12,7 +12,7 @@ exports_files( "parse_status_file.yq", "unvis.sed", "vis_canonicalize.sed", - "vis_escape_nonascii.sed", + "vis_escape.sed", ], visibility = ["//visibility:public"], ) @@ -386,12 +386,12 @@ run_binary( outs = [ "_unvis.sed", "_vis_canonicalize.sed", - "_vis_escape_nonascii.sed", + "_vis_escape.sed", ], args = [ "unvis.sed=$(location _unvis.sed)", "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", - "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)", + "vis_escape.sed=$(location _vis_escape.sed)", ], tool = "//tools/gen_vis_scripts", ) @@ -401,6 +401,6 @@ write_source_files( files = { "unvis.sed": ":_unvis.sed", "vis_canonicalize.sed": ":_vis_canonicalize.sed", - "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed", + "vis_escape.sed": ":_vis_escape.sed", }, ) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index fede85de1..7afb9c503 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -119,13 +119,13 @@ Possible values: "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), "_unvis": attr.label(allow_single_file = True, default = Label("//lib/private:unvis.sed")), "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")), - "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), + "_vis_escape": 
attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.sed")), } _mtree_attrs = { "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True), "out": attr.output(doc = "Resulting specification file to write"), - "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), + "_vis_escape": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.sed")), } def _add_compression_args(compress, args): @@ -252,20 +252,20 @@ def _configured_unused_inputs_file(ctx, srcs, keep): ctx.file.mtree, ctx.file._unvis, ctx.file._vis_canonicalize, - ctx.file._vis_escape_nonascii, + ctx.file._vis_escape, ], tools = [coreutils], command = ''' - "$COREUTILS" join -v 1 \\ - <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ - <("$COREUTILS" sort -u \\ - <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ - | "$COREUTILS" cut -d'=' -f 2- \\ - | sed -Ef "$VIS_CANONICALIZE" \\ - ) \\ - <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ - ) \\ - | sed -f "$UNVIS" \\ + "$COREUTILS" join -v 1 \\ + <(sed -f "$VIS_ESCAPE" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ + <("$COREUTILS" sort -u \\ + <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ + | "$COREUTILS" cut -d'=' -f 2- \\ + | sed -Ef "$VIS_CANONICALIZE" \\ + ) \\ + <(sed -f "$VIS_ESCAPE" "$KEEP_INPUTS") \\ + ) \\ + | sed -f "$UNVIS" \\ > "$UNUSED_INPUTS" ''', env = { @@ -276,7 +276,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "UNUSED_INPUTS": unused_inputs.path, "UNVIS": ctx.file._unvis.path, "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path, - "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, + "VIS_ESCAPE": ctx.file._vis_escape.path, }, mnemonic = "UnusedTarInputs", toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type", @@ -378,7 +378,7 @@ def _to_rlocation_path(file, workspace): def _vis_encode(filename): # Escaping of non-ASCII bytes cannot be performed 
within Starlark. - # After writing content out, a second pass is performed with vis_escape_nonascii.sed. + # After writing content out, a second pass is performed with vis_escape.sed. return filename.replace("\\", "\\134").replace("\n", "\\012") def _expand(file, expander, transform = to_repository_relative_path): @@ -454,10 +454,10 @@ def _mtree_impl(ctx): ctx.actions.write(unescaped, content = content) ctx.actions.run_shell( outputs = [out], - inputs = [unescaped, ctx.file._vis_escape_nonascii], - command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"', + inputs = [unescaped, ctx.file._vis_escape], + command = 'sed -f "$VIS_ESCAPE" "$UNESCAPED" > "$OUT"', env = { - "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, + "VIS_ESCAPE": ctx.file._vis_escape.path, "UNESCAPED": unescaped.path, "OUT": out.path, }, diff --git a/lib/private/vis_escape_nonascii.sed b/lib/private/vis_escape.sed similarity index 73% rename from lib/private/vis_escape_nonascii.sed rename to lib/private/vis_escape.sed index 744713564..881620b5b 100644 --- a/lib/private/vis_escape_nonascii.sed +++ b/lib/private/vis_escape.sed @@ -1,7 +1,42 @@ # Code generated by gen_vis_scripts. DO NOT EDIT. -# Replace non-ASCII bytes with their octal escape sequences. -# Escaping of ASCII is done in Starlark prior to writing content out. +# +# Replace most bytes with their octal escape sequences. +# Backslashes and newlines remain in place to preserve newline-delimited records +# while allowing upstream producers to include newlines in vis-encoded content. 
+s/\x00/\\000/g +s/\x01/\\001/g +s/\x02/\\002/g +s/\x03/\\003/g +s/\x04/\\004/g +s/\x05/\\005/g +s/\x06/\\006/g +s/\x07/\\007/g +s/\x08/\\010/g +s/\x09/\\011/g +s/\x0b/\\013/g +s/\x0c/\\014/g +s/\x0d/\\015/g +s/\x0e/\\016/g +s/\x0f/\\017/g +s/\x10/\\020/g +s/\x11/\\021/g +s/\x12/\\022/g +s/\x13/\\023/g +s/\x14/\\024/g +s/\x15/\\025/g +s/\x16/\\026/g +s/\x17/\\027/g +s/\x18/\\030/g +s/\x19/\\031/g +s/\x1a/\\032/g +s/\x1b/\\033/g +s/\x1c/\\034/g +s/\x1d/\\035/g +s/\x1e/\\036/g +s/\x1f/\\037/g +s/\x20/\\040/g +s/\x7f/\\177/g s/\x80/\\200/g s/\x81/\\201/g s/\x82/\\202/g diff --git a/lib/tests/BUILD.bazel b/lib/tests/BUILD.bazel index a6dd2228d..38d276734 100644 --- a/lib/tests/BUILD.bazel +++ b/lib/tests/BUILD.bazel @@ -92,13 +92,13 @@ bats_test( srcs = ["vis_encoding.bats"], size = "small", data = [ - "//lib/private:vis_escape_nonascii.sed", + "//lib/private:vis_escape.sed", "//lib/private:unvis.sed", "//lib/private:vis_canonicalize.sed", ":coreutils", ], env = { - "VIS_ESCAPE_NONASCII": "$(location //lib/private:vis_escape_nonascii.sed)", + "VIS_ESCAPE": "$(location //lib/private:vis_escape.sed)", "UNVIS": "$(location //lib/private:unvis.sed)", "VIS_CANONICALIZE": "$(location //lib/private:vis_canonicalize.sed)", "COREUTILS": "$(rootpath :coreutils)", diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index ade1d1fc2..d1aecef9a 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -39,7 +39,7 @@ Upstream encoders should escape the first two in content they feed to the genera These gaps enable our encoder to operate on newline-delimited records of vis-encoded content. EOF - gsed -f "$VIS_ESCAPE_NONASCII" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" # Content chosen to pass through encoder unmodified... mostly (except spaces, which must be patched up). 
cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" @@ -71,7 +71,7 @@ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF EOF - gsed -f "$VIS_ESCAPE_NONASCII" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" gawk -v FS='\n' -v RS='\n\n' ' NR == rshift(0x00, 4) + 1 { for (i = NF; i > 0x0A; i--) $(i+1) = $(i); $(0x0A+1) = "" } # Newline gap diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index 7a85a2510..11686edee 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -24,8 +24,8 @@ func main() { defer mustClose(f) switch name { - case "vis_escape_nonascii.sed": - writeEscapeNonASCIISed(f) + case "vis_escape.sed": + writeVisEscapeSed(f) case "vis_canonicalize.sed": writeVisCanonicalizeSed(f) case "unvis.sed": @@ -49,16 +49,24 @@ func shouldEscape(b byte) bool { return b == '\\' || b > unicode.MaxASCII || unicode.IsSpace(rune(b)) || !unicode.IsPrint(rune(b)) } -func writeEscapeNonASCIISed(w io.Writer) { +func writeVisEscapeSed(w io.Writer) { fmt.Fprintln(w, strings.TrimSpace(` # Code generated by gen_vis_scripts. DO NOT EDIT. -# Replace non-ASCII bytes with their octal escape sequences. -# Escaping of ASCII is done in Starlark prior to writing content out. +# +# Replace most bytes with their octal escape sequences. +# Backslashes and newlines remain in place to preserve newline-delimited records +# while allowing upstream producers to include newlines in vis-encoded content. 
`)) fmt.Fprintln(w, "") - for i := 0x80; i <= 0xFF; i++ { - fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) + for i := 0; i <= 0xFF; i++ { + b := byte(i) + if b == '\\' || b == '\n' { + continue + } + if shouldEscape(b) { + fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) + } } } From 00941f5df57bddebab7f1a9d77fc2cec65dc861f Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Tue, 18 Feb 2025 21:12:15 -0500 Subject: [PATCH 23/36] Fix typo --- lib/private/tar.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 7afb9c503..0ea501cce 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -187,7 +187,7 @@ def _is_unprunable(file): p = file.path return p[0].isspace() or p[-1].isspace() or "\n" in p or "\r" in p -def _fmt_pruanble_inputs_line(file): +def _fmt_prunable_inputs_line(file): if _is_unprunable(file): return None return _vis_encode(file.path) @@ -221,7 +221,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): .set_param_file_format("multiline") .add_all( srcs, - map_each = _fmt_pruanble_inputs_line, + map_each = _fmt_prunable_inputs_line, ), ) ctx.actions.write( From d6d46360edef6f8771d324c008f84c30cd3bd0d3 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:09:57 -0500 Subject: [PATCH 24/36] Also passthrough SPACE characters in vis_escape script These are present as field-separators in the mtree content we emit; escaping them interferes with the file format. 
--- lib/private/tar.bzl | 4 ++-- lib/private/vis_escape.sed | 5 ++--- lib/tests/vis_encoding.bats | 9 +++------ tools/gen_vis_scripts/gen_vis_scripts.go | 6 +++--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 0ea501cce..08bcd8914 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -378,8 +378,8 @@ def _to_rlocation_path(file, workspace): def _vis_encode(filename): # Escaping of non-ASCII bytes cannot be performed within Starlark. - # After writing content out, a second pass is performed with vis_escape.sed. - return filename.replace("\\", "\\134").replace("\n", "\\012") + # After writing content out, a second pass is performed with vis_escape.gawk. + return filename.replace("\\", "\\134").replace("\n", "\\012").replace(" ", "\\040") def _expand(file, expander, transform = to_repository_relative_path): expanded = expander.expand(file) diff --git a/lib/private/vis_escape.sed b/lib/private/vis_escape.sed index 881620b5b..a9d8f0eef 100644 --- a/lib/private/vis_escape.sed +++ b/lib/private/vis_escape.sed @@ -1,8 +1,8 @@ # Code generated by gen_vis_scripts. DO NOT EDIT. # # Replace most bytes with their octal escape sequences. -# Backslashes and newlines remain in place to preserve newline-delimited records -# while allowing upstream producers to include newlines in vis-encoded content. +# Backslashes, newlines, and spaces remain in place to preserve newline-delimited records of space-delimited fields +# while allowing upstream producers to include these delimiters in vis-encoded content. 
s/\x00/\\000/g s/\x01/\\001/g @@ -35,7 +35,6 @@ s/\x1c/\\034/g s/\x1d/\\035/g s/\x1e/\\036/g s/\x1f/\\037/g -s/\x20/\\040/g s/\x7f/\\177/g s/\x80/\\200/g s/\x81/\\201/g diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index d1aecef9a..9a1e9c0fe 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -39,14 +39,11 @@ Upstream encoders should escape the first two in content they feed to the genera These gaps enable our encoder to operate on newline-delimited records of vis-encoded content. EOF - gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" - # Content chosen to pass through encoder unmodified... mostly (except spaces, which must be patched up). + # Content chosen to pass through encoder unmodified. cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" - # Spaces _do_ get escaped; undo that so that text under comparison remains human-friendly. - gawk '{ gsub(/[\\]040/, " "); print }' <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" - cd "$BATS_TEST_TMPDIR" diff -u want output } @@ -82,7 +79,7 @@ EOF cat <<'EOF' >"$BATS_TEST_TMPDIR/want" \000 \001 \002 \003 \004 \005 \006 \007 \010 \011 \013 \014 \015 \016 \017 \020 \021 \022 \023 \024 \025 \026 \027 \030 \031 \032 \033 \034 \035 \036 \037 -\040 ! " # $ % & ' ( ) * + , - . / + ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] ^ _ diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go index 11686edee..b7c7692fa 100644 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ b/tools/gen_vis_scripts/gen_vis_scripts.go @@ -54,14 +54,14 @@ func writeVisEscapeSed(w io.Writer) { # Code generated by gen_vis_scripts. DO NOT EDIT. # # Replace most bytes with their octal escape sequences. 
-# Backslashes and newlines remain in place to preserve newline-delimited records -# while allowing upstream producers to include newlines in vis-encoded content. +# Backslashes, newlines, and spaces remain in place to preserve newline-delimited records of space-delimited fields +# while allowing upstream producers to include these delimiters in vis-encoded content. `)) fmt.Fprintln(w, "") for i := 0; i <= 0xFF; i++ { b := byte(i) - if b == '\\' || b == '\n' { + if b == '\\' || b == '\n' || b == ' ' { continue } if shouldEscape(b) { From 89cb16eea3a6211ee1e7d2e35ad20b84b438c90e Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:22:23 -0500 Subject: [PATCH 25/36] Create gawk versions of the vis encoding tools Using AWK opens up new opportunities not available when using Sed. Notably, we are able to implement these translations using a one-pass algorithm using a hashtable to look up replacements. We rely on the GNU AWK implementation in particular for a few of its extensions: * `--characters-as-bytes` to be able to operate byte-by-byte and not be interfered with by locales and multibyte encoding support. * 4-arg `split` to gain easy access to both the content parts and the separators that divide them. * NUL byte being treated as just any other character. * `RT` to more easily implement non-newline-terminated files Some of these are not essential, just nice to have. Others have more verbose analogues in POSIX Sed. It may be possible to implement these scripts without the GNU extensions, but that would likely come at the cost of expressivity and succinctness. 
--- lib/private/unvis.gawk | 19 +++++++++++++ lib/private/vis_canonicalize.gawk | 45 +++++++++++++++++++++++++++++++ lib/private/vis_escape.gawk | 21 +++++++++++++++ 3 files changed, 85 insertions(+) create mode 100755 lib/private/unvis.gawk create mode 100755 lib/private/vis_canonicalize.gawk create mode 100755 lib/private/vis_escape.gawk diff --git a/lib/private/unvis.gawk b/lib/private/unvis.gawk new file mode 100755 index 000000000..a0802e335 --- /dev/null +++ b/lib/private/unvis.gawk @@ -0,0 +1,19 @@ +#!/usr/bin/env gawk --characters-as-bytes --file +# +# Replace octal escape sequences with the bytes they represent. +# NOTE: not a fully general unvis program. + +BEGIN { + for (i = 0x00; i <= 0xFF; i++) { + b = sprintf("%c", i) + esc = sprintf("\\%03o", i) + REPLACE[esc] = b + } +} + +{ + n = split($0, verbatim_parts, /[\\][0-3][0-7][0-7]/, replace_parts) + for (i = 1; i < n; i++) + printf "%s%s", verbatim_parts[i], REPLACE[replace_parts[i]] + printf "%s%s", verbatim_parts[n], RT +} diff --git a/lib/private/vis_canonicalize.gawk b/lib/private/vis_canonicalize.gawk new file mode 100755 index 000000000..5c19bb7e4 --- /dev/null +++ b/lib/private/vis_canonicalize.gawk @@ -0,0 +1,45 @@ +#!/usr/bin/env gawk --characters-as-bytes --file +# +# Convert lines of vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. +# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. +# The remaining characters are not escaped; they represent themselves. +# Newlines are the record separator and are exempt from replacement, although the escaped special form \n does canonicalized to octal. 
+# +# Input is interpreted as libarchive would, with a wider set of escape sequences: +# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings +# * \0 means NUL when not the start of an three-digit octal escape sequence +# * \s means SPACE +# * \ is valid as an ordinary backslash when not the start of a valid escape sequence +# +# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 + +BEGIN { + REPLACE["\\\\"] = "\\134" + REPLACE["\\0"] = "\\000" + REPLACE["\\a"] = "\\007" + REPLACE["\\b"] = "\\010" + REPLACE["\\f"] = "\\014" + REPLACE["\\n"] = "\\012" + REPLACE["\\r"] = "\\015" + REPLACE["\\s"] = "\\040" + REPLACE["\\t"] = "\\011" + REPLACE["\\v"] = "\\013" + + for (i = 0x00; i <= 0xFF; i++) { + b = sprintf("%c", i) + esc = sprintf("\\%03o", i) + if (match(b, /[^[:graph:]]|[\\]/)) { + REPLACE[b] = esc + REPLACE[esc] = esc + } else { + REPLACE[esc] = b + } + } +} + +{ + n = split($0, verbatim_parts, /[\\][\\0abfnrstv]|[\\][0-3][0-7][0-7]|[^[:graph:]]|[\\]/, replace_parts) + for (i = 1; i < n; i++) + printf "%s%s", verbatim_parts[i], REPLACE[replace_parts[i]] + printf "%s%s", verbatim_parts[n], RT +} diff --git a/lib/private/vis_escape.gawk b/lib/private/vis_escape.gawk new file mode 100755 index 000000000..e09db2b71 --- /dev/null +++ b/lib/private/vis_escape.gawk @@ -0,0 +1,21 @@ +#!/usr/bin/env gawk --characters-as-bytes --file +# +# Replace most bytes with their octal escape sequences. +# Backslashes, newlines, and spaces remain in place to preserve newline-delimited records of space-delimited fields +# while allowing upstream producers to include these delimiters in vis-encoded content. + +BEGIN { + # Not all entries in REPLACE will be used but over-inclusion is simpler. 
+ for (i = 0x00; i <= 0xFF; i++) { + b = sprintf("%c", i) + esc = sprintf("\\%03o", i) + REPLACE[b] = esc + } +} + +{ + n = split($0, verbatim_parts, /[^[:graph:] \\]/, replace_parts) + for (i = 1; i < n; i++) + printf "%s%s", verbatim_parts[i], REPLACE[replace_parts[i]] + printf "%s%s", verbatim_parts[n], RT +} From 9acb42d7ab5c559f14082113582ae9253bfd1dba Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:40:41 -0500 Subject: [PATCH 26/36] Use GAWK variants of vis encoding scripts Path to GAWK tool hardcoded to MacOS homebrew path pending one being available via toolchain. --- lib/private/BUILD.bazel | 6 +++--- lib/private/tar.bzl | 30 +++++++++++++++--------------- lib/tests/BUILD.bazel | 12 ++++++------ lib/tests/vis_encoding.bats | 22 +++++++++------------- 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 2e264a13d..a72ef2f69 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -10,9 +10,9 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", - "unvis.sed", - "vis_canonicalize.sed", - "vis_escape.sed", + "unvis.gawk", + "vis_canonicalize.gawk", + "vis_escape.gawk", ], visibility = ["//visibility:public"], ) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 08bcd8914..0a536abbd 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -117,15 +117,15 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), - "_unvis": attr.label(allow_single_file = True, default = Label("//lib/private:unvis.sed")), - "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")), - "_vis_escape": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.sed")), + "_unvis": attr.label(allow_single_file = True, default = Label("//lib/private:unvis.gawk")), + 
"_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.gawk")), + "_vis_escape": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.gawk")), } _mtree_attrs = { "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True), "out": attr.output(doc = "Resulting specification file to write"), - "_vis_escape": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.sed")), + "_vis_escape": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape.gawk")), } def _add_compression_args(compress, args): @@ -256,16 +256,16 @@ def _configured_unused_inputs_file(ctx, srcs, keep): ], tools = [coreutils], command = ''' - "$COREUTILS" join -v 1 \\ - <(sed -f "$VIS_ESCAPE" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ - <("$COREUTILS" sort -u \\ - <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ - | "$COREUTILS" cut -d'=' -f 2- \\ - | sed -Ef "$VIS_CANONICALIZE" \\ - ) \\ - <(sed -f "$VIS_ESCAPE" "$KEEP_INPUTS") \\ - ) \\ - | sed -f "$UNVIS" \\ + "$COREUTILS" join -v 1 \\ + <(/opt/homebrew/bin/gawk -bf "$VIS_ESCAPE" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ + <("$COREUTILS" sort -u \\ + <(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\ + | "$COREUTILS" cut -d'=' -f 2- \\ + | /opt/homebrew/bin/gawk -bf "$VIS_CANONICALIZE" \\ + ) \\ + <(/opt/homebrew/bin/gawk -bf "$VIS_ESCAPE" "$KEEP_INPUTS") \\ + ) \\ + | /opt/homebrew/bin/gawk -bf "$UNVIS" \\ > "$UNUSED_INPUTS" ''', env = { @@ -455,7 +455,7 @@ def _mtree_impl(ctx): ctx.actions.run_shell( outputs = [out], inputs = [unescaped, ctx.file._vis_escape], - command = 'sed -f "$VIS_ESCAPE" "$UNESCAPED" > "$OUT"', + command = '/opt/homebrew/bin/gawk -bf "$VIS_ESCAPE" "$UNESCAPED" > "$OUT"', env = { "VIS_ESCAPE": ctx.file._vis_escape.path, "UNESCAPED": unescaped.path, diff --git a/lib/tests/BUILD.bazel b/lib/tests/BUILD.bazel index 38d276734..05c2c4fbe 100644 --- a/lib/tests/BUILD.bazel +++ 
b/lib/tests/BUILD.bazel @@ -92,15 +92,15 @@ bats_test( srcs = ["vis_encoding.bats"], size = "small", data = [ - "//lib/private:vis_escape.sed", - "//lib/private:unvis.sed", - "//lib/private:vis_canonicalize.sed", + "//lib/private:vis_escape.gawk", + "//lib/private:unvis.gawk", + "//lib/private:vis_canonicalize.gawk", ":coreutils", ], env = { - "VIS_ESCAPE": "$(location //lib/private:vis_escape.sed)", - "UNVIS": "$(location //lib/private:unvis.sed)", - "VIS_CANONICALIZE": "$(location //lib/private:vis_canonicalize.sed)", + "VIS_ESCAPE": "$(location //lib/private:vis_escape.gawk)", + "UNVIS": "$(location //lib/private:unvis.gawk)", + "VIS_CANONICALIZE": "$(location //lib/private:vis_canonicalize.gawk)", "COREUTILS": "$(rootpath :coreutils)", } ) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 9a1e9c0fe..6807b1513 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -8,10 +8,6 @@ gawk() { # TODO: from toolchain /opt/homebrew/bin/gawk "$@" } -gsed() { - # TODO: replace with AWK - /opt/homebrew/bin/gsed "$@" -} cat() { "$COREUTILS" cat "$@" } @@ -39,7 +35,7 @@ Upstream encoders should escape the first two in content they feed to the genera These gaps enable our encoder to operate on newline-delimited records of vis-encoded content. EOF - gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + gawk -bf "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" # Content chosen to pass through encoder unmodified. 
cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" @@ -68,7 +64,7 @@ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF EOF - gsed -f "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" gawk -v FS='\n' -v RS='\n\n' ' NR == rshift(0x00, 4) + 1 { for (i = NF; i > 0x0A; i--) $(i+1) = $(i); $(0x0A+1) = "" } # Newline gap @@ -105,7 +101,7 @@ All text that is not an 3-digit octal escape sequence is passed through the deco This includes backslashes (\), even those part of special forms sometimes recognized elsewhere (e.g. \n, \r, \v, \0, etc.). EOF - gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + gawk -bf "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" # Content chosen to pass through encoder unmodified. cp "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/want" @@ -134,7 +130,7 @@ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF EOF - gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" # Decoded content contains unprintable control characters. Diff the hexdump instead. od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" @@ -183,7 +179,7 @@ EOF \360 \361 \362 \363 \364 \365 \366 \367 \370 \371 \372 \373 \374 \375 \376 \377 EOF - gsed -f "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$UNVIS" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" # Decoded content contains unprintable control characters. Diff the hexdump instead. 
od -Ax -tx1 <"$BATS_TEST_TMPDIR/output.raw" >"$BATS_TEST_TMPDIR/output" @@ -233,7 +229,7 @@ EOF EOF gawk -v OFS='\n' -v ORS='\n\n' '{ $1 = $1; print }' <"$BATS_TEST_TMPDIR/input.table" >"$BATS_TEST_TMPDIR/input" - gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" gawk -v FS='\n' -v RS='\n\n' ' { for (i = 1; i <= NF; i++) printf "%4s%s", $(i), i == NF ? ORS : OFS } # Emit table with fixed-width columns. @@ -256,7 +252,7 @@ EOF \160 \161 \162 \163 \164 \165 \166 \167 \170 \171 \172 \173 \174 \175 \176 EOF - gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" gawk -v FS='\n' -v RS='\n\n' ' NR == rshift(0x20 - 0x20, 4) + 1 { for (i = NF; i > 0x00; i--) $(i+1) = $(i); $(0x00+1) = "" } # Space gap @@ -298,7 +294,7 @@ E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF EOF - gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" + gawk -bf "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output.raw" gawk -v FS='\n' -v RS='\n\n' ' NR == rshift(0x00, 4) + 1 { for (i = NF; i > 0x0A; i--) $(i+1) = $(i); $(0x0A+1) = "" } # Newline gap @@ -344,7 +340,7 @@ EOF EOF cut -f1 <"$BATS_TEST_TMPDIR/input_want" >"$BATS_TEST_TMPDIR/input" - gsed -Ef "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" + gawk -bf "$VIS_CANONICALIZE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" paste "$BATS_TEST_TMPDIR/input" "$BATS_TEST_TMPDIR/output" >"$BATS_TEST_TMPDIR/input_output" From c104d719bd071e680779f96f0a98385e4fd50f0f Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:44:39 -0500 Subject: [PATCH 27/36] Remove sed-based variants of vis encoding support scripts These have been replaced by their 
AWK-based counterparts. --- lib/private/BUILD.bazel | 26 -- lib/private/unvis.sed | 262 --------------------- lib/private/vis_canonicalize.sed | 287 ----------------------- lib/private/vis_escape.sed | 166 ------------- tools/gen_vis_scripts/BUILD.bazel | 7 - tools/gen_vis_scripts/gen_vis_scripts.go | 157 ------------- 6 files changed, 905 deletions(-) delete mode 100644 lib/private/unvis.sed delete mode 100644 lib/private/vis_canonicalize.sed delete mode 100644 lib/private/vis_escape.sed delete mode 100644 tools/gen_vis_scripts/BUILD.bazel delete mode 100644 tools/gen_vis_scripts/gen_vis_scripts.go diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index a72ef2f69..a788efba6 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -1,7 +1,5 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") -load("//lib:run_binary.bzl", "run_binary") load("//lib:utils.bzl", "is_bazel_7_or_greater") -load("//lib:write_source_files.bzl", "write_source_files") exports_files( [ @@ -380,27 +378,3 @@ bzl_library( srcs = ["zstd_toolchain.bzl"], visibility = ["//lib:__subpackages__"], ) - -run_binary( - name = "run_gen_vis_scripts", - outs = [ - "_unvis.sed", - "_vis_canonicalize.sed", - "_vis_escape.sed", - ], - args = [ - "unvis.sed=$(location _unvis.sed)", - "vis_canonicalize.sed=$(location _vis_canonicalize.sed)", - "vis_escape.sed=$(location _vis_escape.sed)", - ], - tool = "//tools/gen_vis_scripts", -) - -write_source_files( - name = "write_vis_scripts", - files = { - "unvis.sed": ":_unvis.sed", - "vis_canonicalize.sed": ":_vis_canonicalize.sed", - "vis_escape.sed": ":_vis_escape.sed", - }, -) diff --git a/lib/private/unvis.sed b/lib/private/unvis.sed deleted file mode 100644 index 3f0488f85..000000000 --- a/lib/private/unvis.sed +++ /dev/null @@ -1,262 +0,0 @@ -# Code generated by gen_vis_scripts. DO NOT EDIT. -# Replace octal escape sequences with the bytes they represent. -# NOTE: not a fully general unvis program. 
- -s/\\000/\x00/g -s/\\001/\x01/g -s/\\002/\x02/g -s/\\003/\x03/g -s/\\004/\x04/g -s/\\005/\x05/g -s/\\006/\x06/g -s/\\007/\x07/g -s/\\010/\x08/g -s/\\011/\x09/g -s/\\012/\x0a/g -s/\\013/\x0b/g -s/\\014/\x0c/g -s/\\015/\x0d/g -s/\\016/\x0e/g -s/\\017/\x0f/g -s/\\020/\x10/g -s/\\021/\x11/g -s/\\022/\x12/g -s/\\023/\x13/g -s/\\024/\x14/g -s/\\025/\x15/g -s/\\026/\x16/g -s/\\027/\x17/g -s/\\030/\x18/g -s/\\031/\x19/g -s/\\032/\x1a/g -s/\\033/\x1b/g -s/\\034/\x1c/g -s/\\035/\x1d/g -s/\\036/\x1e/g -s/\\037/\x1f/g -s/\\040/\x20/g -s/\\041/\x21/g -s/\\042/\x22/g -s/\\043/\x23/g -s/\\044/\x24/g -s/\\045/\x25/g -s/\\046/\x26/g -s/\\047/\x27/g -s/\\050/\x28/g -s/\\051/\x29/g -s/\\052/\x2a/g -s/\\053/\x2b/g -s/\\054/\x2c/g -s/\\055/\x2d/g -s/\\056/\x2e/g -s/\\057/\x2f/g -s/\\060/\x30/g -s/\\061/\x31/g -s/\\062/\x32/g -s/\\063/\x33/g -s/\\064/\x34/g -s/\\065/\x35/g -s/\\066/\x36/g -s/\\067/\x37/g -s/\\070/\x38/g -s/\\071/\x39/g -s/\\072/\x3a/g -s/\\073/\x3b/g -s/\\074/\x3c/g -s/\\075/\x3d/g -s/\\076/\x3e/g -s/\\077/\x3f/g -s/\\100/\x40/g -s/\\101/\x41/g -s/\\102/\x42/g -s/\\103/\x43/g -s/\\104/\x44/g -s/\\105/\x45/g -s/\\106/\x46/g -s/\\107/\x47/g -s/\\110/\x48/g -s/\\111/\x49/g -s/\\112/\x4a/g -s/\\113/\x4b/g -s/\\114/\x4c/g -s/\\115/\x4d/g -s/\\116/\x4e/g -s/\\117/\x4f/g -s/\\120/\x50/g -s/\\121/\x51/g -s/\\122/\x52/g -s/\\123/\x53/g -s/\\124/\x54/g -s/\\125/\x55/g -s/\\126/\x56/g -s/\\127/\x57/g -s/\\130/\x58/g -s/\\131/\x59/g -s/\\132/\x5a/g -s/\\133/\x5b/g -s/\\135/\x5d/g -s/\\136/\x5e/g -s/\\137/\x5f/g -s/\\140/\x60/g -s/\\141/\x61/g -s/\\142/\x62/g -s/\\143/\x63/g -s/\\144/\x64/g -s/\\145/\x65/g -s/\\146/\x66/g -s/\\147/\x67/g -s/\\150/\x68/g -s/\\151/\x69/g -s/\\152/\x6a/g -s/\\153/\x6b/g -s/\\154/\x6c/g -s/\\155/\x6d/g -s/\\156/\x6e/g -s/\\157/\x6f/g -s/\\160/\x70/g -s/\\161/\x71/g -s/\\162/\x72/g -s/\\163/\x73/g -s/\\164/\x74/g -s/\\165/\x75/g -s/\\166/\x76/g -s/\\167/\x77/g -s/\\170/\x78/g -s/\\171/\x79/g -s/\\172/\x7a/g -s/\\173/\x7b/g -s/\\174/\x7c/g 
-s/\\175/\x7d/g -s/\\176/\x7e/g -s/\\177/\x7f/g -s/\\200/\x80/g -s/\\201/\x81/g -s/\\202/\x82/g -s/\\203/\x83/g -s/\\204/\x84/g -s/\\205/\x85/g -s/\\206/\x86/g -s/\\207/\x87/g -s/\\210/\x88/g -s/\\211/\x89/g -s/\\212/\x8a/g -s/\\213/\x8b/g -s/\\214/\x8c/g -s/\\215/\x8d/g -s/\\216/\x8e/g -s/\\217/\x8f/g -s/\\220/\x90/g -s/\\221/\x91/g -s/\\222/\x92/g -s/\\223/\x93/g -s/\\224/\x94/g -s/\\225/\x95/g -s/\\226/\x96/g -s/\\227/\x97/g -s/\\230/\x98/g -s/\\231/\x99/g -s/\\232/\x9a/g -s/\\233/\x9b/g -s/\\234/\x9c/g -s/\\235/\x9d/g -s/\\236/\x9e/g -s/\\237/\x9f/g -s/\\240/\xa0/g -s/\\241/\xa1/g -s/\\242/\xa2/g -s/\\243/\xa3/g -s/\\244/\xa4/g -s/\\245/\xa5/g -s/\\246/\xa6/g -s/\\247/\xa7/g -s/\\250/\xa8/g -s/\\251/\xa9/g -s/\\252/\xaa/g -s/\\253/\xab/g -s/\\254/\xac/g -s/\\255/\xad/g -s/\\256/\xae/g -s/\\257/\xaf/g -s/\\260/\xb0/g -s/\\261/\xb1/g -s/\\262/\xb2/g -s/\\263/\xb3/g -s/\\264/\xb4/g -s/\\265/\xb5/g -s/\\266/\xb6/g -s/\\267/\xb7/g -s/\\270/\xb8/g -s/\\271/\xb9/g -s/\\272/\xba/g -s/\\273/\xbb/g -s/\\274/\xbc/g -s/\\275/\xbd/g -s/\\276/\xbe/g -s/\\277/\xbf/g -s/\\300/\xc0/g -s/\\301/\xc1/g -s/\\302/\xc2/g -s/\\303/\xc3/g -s/\\304/\xc4/g -s/\\305/\xc5/g -s/\\306/\xc6/g -s/\\307/\xc7/g -s/\\310/\xc8/g -s/\\311/\xc9/g -s/\\312/\xca/g -s/\\313/\xcb/g -s/\\314/\xcc/g -s/\\315/\xcd/g -s/\\316/\xce/g -s/\\317/\xcf/g -s/\\320/\xd0/g -s/\\321/\xd1/g -s/\\322/\xd2/g -s/\\323/\xd3/g -s/\\324/\xd4/g -s/\\325/\xd5/g -s/\\326/\xd6/g -s/\\327/\xd7/g -s/\\330/\xd8/g -s/\\331/\xd9/g -s/\\332/\xda/g -s/\\333/\xdb/g -s/\\334/\xdc/g -s/\\335/\xdd/g -s/\\336/\xde/g -s/\\337/\xdf/g -s/\\340/\xe0/g -s/\\341/\xe1/g -s/\\342/\xe2/g -s/\\343/\xe3/g -s/\\344/\xe4/g -s/\\345/\xe5/g -s/\\346/\xe6/g -s/\\347/\xe7/g -s/\\350/\xe8/g -s/\\351/\xe9/g -s/\\352/\xea/g -s/\\353/\xeb/g -s/\\354/\xec/g -s/\\355/\xed/g -s/\\356/\xee/g -s/\\357/\xef/g -s/\\360/\xf0/g -s/\\361/\xf1/g -s/\\362/\xf2/g -s/\\363/\xf3/g -s/\\364/\xf4/g -s/\\365/\xf5/g -s/\\366/\xf6/g -s/\\367/\xf7/g -s/\\370/\xf8/g -s/\\371/\xf9/g 
-s/\\372/\xfa/g -s/\\373/\xfb/g -s/\\374/\xfc/g -s/\\375/\xfd/g -s/\\376/\xfe/g -s/\\377/\xff/g - -# Unvis of backslash must be applied last to avoid double-interpretation. -s/\\134/\\/g diff --git a/lib/private/vis_canonicalize.sed b/lib/private/vis_canonicalize.sed deleted file mode 100644 index dea6b2f0b..000000000 --- a/lib/private/vis_canonicalize.sed +++ /dev/null @@ -1,287 +0,0 @@ -# Code generated by gen_vis_scripts. DO NOT EDIT. -# -# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. -# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. -# The remaining characters are not escaped; they represent themselves. -# -# Input is interpreted as libarchive would, with a wider set of escape sequences: -# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings -# * \0 means NUL when not the start of an three-digit octal escape sequence -# * \s means SPACE -# * \ is valid as an ordinary backslash when not the start of a valid escape sequence -# -# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 - -# Escaping of backslashes must be applied first to avoid double-interpretation. -s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g -s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g - -s/\\a/\\007/g -s/\\b/\\010/g -s/\\f/\\014/g -s/\\n/\\012/g -s/\\r/\\015/g -s/\\s/\\040/g -s/\\t/\\011/g -s/\\v/\\013/g - -# NUL special form must be disambiguated from ordinary octal escape sequences. -s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g - -# Remove octal escaping from characters that don't need it. 
-s/\\041/!/g -s/\\042/"/g -s/\\043/#/g -s/\\044/$/g -s/\\045/%/g -s:\\046:\&:g -s/\\047/'/g -s/\\050/(/g -s/\\051/)/g -s/\\052/*/g -s/\\053/+/g -s/\\054/,/g -s/\\055/-/g -s/\\056/./g -s:\\057:/:g -s/\\060/0/g -s/\\061/1/g -s/\\062/2/g -s/\\063/3/g -s/\\064/4/g -s/\\065/5/g -s/\\066/6/g -s/\\067/7/g -s/\\070/8/g -s/\\071/9/g -s/\\072/:/g -s/\\073/;/g -s/\\074//g -s/\\077/?/g -s/\\100/@/g -s/\\101/A/g -s/\\102/B/g -s/\\103/C/g -s/\\104/D/g -s/\\105/E/g -s/\\106/F/g -s/\\107/G/g -s/\\110/H/g -s/\\111/I/g -s/\\112/J/g -s/\\113/K/g -s/\\114/L/g -s/\\115/M/g -s/\\116/N/g -s/\\117/O/g -s/\\120/P/g -s/\\121/Q/g -s/\\122/R/g -s/\\123/S/g -s/\\124/T/g -s/\\125/U/g -s/\\126/V/g -s/\\127/W/g -s/\\130/X/g -s/\\131/Y/g -s/\\132/Z/g -s/\\133/[/g -s/\\135/]/g -s/\\136/^/g -s/\\137/_/g -s/\\140/`/g -s/\\141/a/g -s/\\142/b/g -s/\\143/c/g -s/\\144/d/g -s/\\145/e/g -s/\\146/f/g -s/\\147/g/g -s/\\150/h/g -s/\\151/i/g -s/\\152/j/g -s/\\153/k/g -s/\\154/l/g -s/\\155/m/g -s/\\156/n/g -s/\\157/o/g -s/\\160/p/g -s/\\161/q/g -s/\\162/r/g -s/\\163/s/g -s/\\164/t/g -s/\\165/u/g -s/\\166/v/g -s/\\167/w/g -s/\\170/x/g -s/\\171/y/g -s/\\172/z/g -s/\\173/{/g -s/\\174/|/g -s/\\175/}/g -s/\\176/~/g - -# Add octal escaping for characters that need it. 
-s/\x00/\\000/g -s/\x01/\\001/g -s/\x02/\\002/g -s/\x03/\\003/g -s/\x04/\\004/g -s/\x05/\\005/g -s/\x06/\\006/g -s/\x07/\\007/g -s/\x08/\\010/g -s/\x09/\\011/g -s/\x0b/\\013/g -s/\x0c/\\014/g -s/\x0d/\\015/g -s/\x0e/\\016/g -s/\x0f/\\017/g -s/\x10/\\020/g -s/\x11/\\021/g -s/\x12/\\022/g -s/\x13/\\023/g -s/\x14/\\024/g -s/\x15/\\025/g -s/\x16/\\026/g -s/\x17/\\027/g -s/\x18/\\030/g -s/\x19/\\031/g -s/\x1a/\\032/g -s/\x1b/\\033/g -s/\x1c/\\034/g -s/\x1d/\\035/g -s/\x1e/\\036/g -s/\x1f/\\037/g -s/\x20/\\040/g -s/\x7f/\\177/g -s/\x80/\\200/g -s/\x81/\\201/g -s/\x82/\\202/g -s/\x83/\\203/g -s/\x84/\\204/g -s/\x85/\\205/g -s/\x86/\\206/g -s/\x87/\\207/g -s/\x88/\\210/g -s/\x89/\\211/g -s/\x8a/\\212/g -s/\x8b/\\213/g -s/\x8c/\\214/g -s/\x8d/\\215/g -s/\x8e/\\216/g -s/\x8f/\\217/g -s/\x90/\\220/g -s/\x91/\\221/g -s/\x92/\\222/g -s/\x93/\\223/g -s/\x94/\\224/g -s/\x95/\\225/g -s/\x96/\\226/g -s/\x97/\\227/g -s/\x98/\\230/g -s/\x99/\\231/g -s/\x9a/\\232/g -s/\x9b/\\233/g -s/\x9c/\\234/g -s/\x9d/\\235/g -s/\x9e/\\236/g -s/\x9f/\\237/g -s/\xa0/\\240/g -s/\xa1/\\241/g -s/\xa2/\\242/g -s/\xa3/\\243/g -s/\xa4/\\244/g -s/\xa5/\\245/g -s/\xa6/\\246/g -s/\xa7/\\247/g -s/\xa8/\\250/g -s/\xa9/\\251/g -s/\xaa/\\252/g -s/\xab/\\253/g -s/\xac/\\254/g -s/\xad/\\255/g -s/\xae/\\256/g -s/\xaf/\\257/g -s/\xb0/\\260/g -s/\xb1/\\261/g -s/\xb2/\\262/g -s/\xb3/\\263/g -s/\xb4/\\264/g -s/\xb5/\\265/g -s/\xb6/\\266/g -s/\xb7/\\267/g -s/\xb8/\\270/g -s/\xb9/\\271/g -s/\xba/\\272/g -s/\xbb/\\273/g -s/\xbc/\\274/g -s/\xbd/\\275/g -s/\xbe/\\276/g -s/\xbf/\\277/g -s/\xc0/\\300/g -s/\xc1/\\301/g -s/\xc2/\\302/g -s/\xc3/\\303/g -s/\xc4/\\304/g -s/\xc5/\\305/g -s/\xc6/\\306/g -s/\xc7/\\307/g -s/\xc8/\\310/g -s/\xc9/\\311/g -s/\xca/\\312/g -s/\xcb/\\313/g -s/\xcc/\\314/g -s/\xcd/\\315/g -s/\xce/\\316/g -s/\xcf/\\317/g -s/\xd0/\\320/g -s/\xd1/\\321/g -s/\xd2/\\322/g -s/\xd3/\\323/g -s/\xd4/\\324/g -s/\xd5/\\325/g -s/\xd6/\\326/g -s/\xd7/\\327/g -s/\xd8/\\330/g -s/\xd9/\\331/g -s/\xda/\\332/g -s/\xdb/\\333/g 
-s/\xdc/\\334/g -s/\xdd/\\335/g -s/\xde/\\336/g -s/\xdf/\\337/g -s/\xe0/\\340/g -s/\xe1/\\341/g -s/\xe2/\\342/g -s/\xe3/\\343/g -s/\xe4/\\344/g -s/\xe5/\\345/g -s/\xe6/\\346/g -s/\xe7/\\347/g -s/\xe8/\\350/g -s/\xe9/\\351/g -s/\xea/\\352/g -s/\xeb/\\353/g -s/\xec/\\354/g -s/\xed/\\355/g -s/\xee/\\356/g -s/\xef/\\357/g -s/\xf0/\\360/g -s/\xf1/\\361/g -s/\xf2/\\362/g -s/\xf3/\\363/g -s/\xf4/\\364/g -s/\xf5/\\365/g -s/\xf6/\\366/g -s/\xf7/\\367/g -s/\xf8/\\370/g -s/\xf9/\\371/g -s/\xfa/\\372/g -s/\xfb/\\373/g -s/\xfc/\\374/g -s/\xfd/\\375/g -s/\xfe/\\376/g -s/\xff/\\377/g diff --git a/lib/private/vis_escape.sed b/lib/private/vis_escape.sed deleted file mode 100644 index a9d8f0eef..000000000 --- a/lib/private/vis_escape.sed +++ /dev/null @@ -1,166 +0,0 @@ -# Code generated by gen_vis_scripts. DO NOT EDIT. -# -# Replace most bytes with their octal escape sequences. -# Backslashes, newlines, and spaces remain in place to preserve newline-delimited records of space-delimited fields -# while allowing upstream producers to include these delimiters in vis-encoded content. 
- -s/\x00/\\000/g -s/\x01/\\001/g -s/\x02/\\002/g -s/\x03/\\003/g -s/\x04/\\004/g -s/\x05/\\005/g -s/\x06/\\006/g -s/\x07/\\007/g -s/\x08/\\010/g -s/\x09/\\011/g -s/\x0b/\\013/g -s/\x0c/\\014/g -s/\x0d/\\015/g -s/\x0e/\\016/g -s/\x0f/\\017/g -s/\x10/\\020/g -s/\x11/\\021/g -s/\x12/\\022/g -s/\x13/\\023/g -s/\x14/\\024/g -s/\x15/\\025/g -s/\x16/\\026/g -s/\x17/\\027/g -s/\x18/\\030/g -s/\x19/\\031/g -s/\x1a/\\032/g -s/\x1b/\\033/g -s/\x1c/\\034/g -s/\x1d/\\035/g -s/\x1e/\\036/g -s/\x1f/\\037/g -s/\x7f/\\177/g -s/\x80/\\200/g -s/\x81/\\201/g -s/\x82/\\202/g -s/\x83/\\203/g -s/\x84/\\204/g -s/\x85/\\205/g -s/\x86/\\206/g -s/\x87/\\207/g -s/\x88/\\210/g -s/\x89/\\211/g -s/\x8a/\\212/g -s/\x8b/\\213/g -s/\x8c/\\214/g -s/\x8d/\\215/g -s/\x8e/\\216/g -s/\x8f/\\217/g -s/\x90/\\220/g -s/\x91/\\221/g -s/\x92/\\222/g -s/\x93/\\223/g -s/\x94/\\224/g -s/\x95/\\225/g -s/\x96/\\226/g -s/\x97/\\227/g -s/\x98/\\230/g -s/\x99/\\231/g -s/\x9a/\\232/g -s/\x9b/\\233/g -s/\x9c/\\234/g -s/\x9d/\\235/g -s/\x9e/\\236/g -s/\x9f/\\237/g -s/\xa0/\\240/g -s/\xa1/\\241/g -s/\xa2/\\242/g -s/\xa3/\\243/g -s/\xa4/\\244/g -s/\xa5/\\245/g -s/\xa6/\\246/g -s/\xa7/\\247/g -s/\xa8/\\250/g -s/\xa9/\\251/g -s/\xaa/\\252/g -s/\xab/\\253/g -s/\xac/\\254/g -s/\xad/\\255/g -s/\xae/\\256/g -s/\xaf/\\257/g -s/\xb0/\\260/g -s/\xb1/\\261/g -s/\xb2/\\262/g -s/\xb3/\\263/g -s/\xb4/\\264/g -s/\xb5/\\265/g -s/\xb6/\\266/g -s/\xb7/\\267/g -s/\xb8/\\270/g -s/\xb9/\\271/g -s/\xba/\\272/g -s/\xbb/\\273/g -s/\xbc/\\274/g -s/\xbd/\\275/g -s/\xbe/\\276/g -s/\xbf/\\277/g -s/\xc0/\\300/g -s/\xc1/\\301/g -s/\xc2/\\302/g -s/\xc3/\\303/g -s/\xc4/\\304/g -s/\xc5/\\305/g -s/\xc6/\\306/g -s/\xc7/\\307/g -s/\xc8/\\310/g -s/\xc9/\\311/g -s/\xca/\\312/g -s/\xcb/\\313/g -s/\xcc/\\314/g -s/\xcd/\\315/g -s/\xce/\\316/g -s/\xcf/\\317/g -s/\xd0/\\320/g -s/\xd1/\\321/g -s/\xd2/\\322/g -s/\xd3/\\323/g -s/\xd4/\\324/g -s/\xd5/\\325/g -s/\xd6/\\326/g -s/\xd7/\\327/g -s/\xd8/\\330/g -s/\xd9/\\331/g -s/\xda/\\332/g -s/\xdb/\\333/g 
-s/\xdc/\\334/g -s/\xdd/\\335/g -s/\xde/\\336/g -s/\xdf/\\337/g -s/\xe0/\\340/g -s/\xe1/\\341/g -s/\xe2/\\342/g -s/\xe3/\\343/g -s/\xe4/\\344/g -s/\xe5/\\345/g -s/\xe6/\\346/g -s/\xe7/\\347/g -s/\xe8/\\350/g -s/\xe9/\\351/g -s/\xea/\\352/g -s/\xeb/\\353/g -s/\xec/\\354/g -s/\xed/\\355/g -s/\xee/\\356/g -s/\xef/\\357/g -s/\xf0/\\360/g -s/\xf1/\\361/g -s/\xf2/\\362/g -s/\xf3/\\363/g -s/\xf4/\\364/g -s/\xf5/\\365/g -s/\xf6/\\366/g -s/\xf7/\\367/g -s/\xf8/\\370/g -s/\xf9/\\371/g -s/\xfa/\\372/g -s/\xfb/\\373/g -s/\xfc/\\374/g -s/\xfd/\\375/g -s/\xfe/\\376/g -s/\xff/\\377/g diff --git a/tools/gen_vis_scripts/BUILD.bazel b/tools/gen_vis_scripts/BUILD.bazel deleted file mode 100644 index 747e0d6d1..000000000 --- a/tools/gen_vis_scripts/BUILD.bazel +++ /dev/null @@ -1,7 +0,0 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") - -go_binary( - name = "gen_vis_scripts", - srcs = ["gen_vis_scripts.go"], - visibility = ["//lib/private:__pkg__"], -) diff --git a/tools/gen_vis_scripts/gen_vis_scripts.go b/tools/gen_vis_scripts/gen_vis_scripts.go deleted file mode 100644 index b7c7692fa..000000000 --- a/tools/gen_vis_scripts/gen_vis_scripts.go +++ /dev/null @@ -1,157 +0,0 @@ -// Code generator for vis-encoding support scripts. -package main - -import ( - "fmt" - "io" - "log" - "os" - "strings" - "unicode" -) - -func main() { - for _, arg := range os.Args[1:] { - name, dest, ok := strings.Cut(arg, "=") - if !ok { - log.Fatal("invalid generation spec:", arg) - } - - f, err := os.Create(dest) - if err != nil { - log.Fatal(err) - } - defer mustClose(f) - - switch name { - case "vis_escape.sed": - writeVisEscapeSed(f) - case "vis_canonicalize.sed": - writeVisCanonicalizeSed(f) - case "unvis.sed": - writeUnvisSed(f) - default: - log.Fatal("unknown generated content:", name) - } - } -} - -func mustClose(f *os.File) { - if err := f.Close(); err != nil { - log.Fatal(err) - } -} - -const newline rune = '\n' - -// Escape all characters identified by mtree(5) as requiring escaping. 
Plus whitespace. -func shouldEscape(b byte) bool { - return b == '\\' || b > unicode.MaxASCII || unicode.IsSpace(rune(b)) || !unicode.IsPrint(rune(b)) -} - -func writeVisEscapeSed(w io.Writer) { - fmt.Fprintln(w, strings.TrimSpace(` -# Code generated by gen_vis_scripts. DO NOT EDIT. -# -# Replace most bytes with their octal escape sequences. -# Backslashes, newlines, and spaces remain in place to preserve newline-delimited records of space-delimited fields -# while allowing upstream producers to include these delimiters in vis-encoded content. - `)) - fmt.Fprintln(w, "") - - for i := 0; i <= 0xFF; i++ { - b := byte(i) - if b == '\\' || b == '\n' || b == ' ' { - continue - } - if shouldEscape(b) { - fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) - } - } -} - -func writeVisCanonicalizeSed(w io.Writer) { - fmt.Fprintln(w, strings.TrimSpace(` -# Code generated by gen_vis_scripts. DO NOT EDIT. -# -# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial. -# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal. -# The remaining characters are not escaped; they represent themselves. -# -# Input is interpreted as libarchive would, with a wider set of escape sequences: -# * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings -# * \0 means NUL when not the start of an three-digit octal escape sequence -# * \s means SPACE -# * \ is valid as an ordinary backslash when not the start of a valid escape sequence -# -# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942 - -# Escaping of backslashes must be applied first to avoid double-interpretation. 
-s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g -s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g - -s/\\a/\\007/g -s/\\b/\\010/g -s/\\f/\\014/g -s/\\n/\\012/g -s/\\r/\\015/g -s/\\s/\\040/g -s/\\t/\\011/g -s/\\v/\\013/g - -# NUL special form must be disambiguated from ordinary octal escape sequences. -s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g - `)) - fmt.Fprintln(w, "") - - fmt.Fprintln(w, "# Remove octal escaping from characters that don't need it.") - for i := 0; i <= 0xFF; i++ { - b := byte(i) - if shouldEscape(b) { - continue - } - if b == '/' { - fmt.Fprintf(w, `s:\\%03[1]o:%[1]c:g%[2]c`, b, newline) - } else if b == '&' { - fmt.Fprintf(w, `s:\\%03[1]o:\%[1]c:g%[2]c`, b, newline) - } else { - fmt.Fprintf(w, `s/\\%03[1]o/%[1]c/g%[2]c`, b, newline) - } - } - fmt.Fprintln(w, "") - - fmt.Fprintln(w, "# Add octal escaping for characters that need it.") - for i := 0; i <= 0xFF; i++ { - b := byte(i) - if !shouldEscape(b) { - continue - } - if b == '\\' || b == '\n' { - continue - } - fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, b, newline) - } -} - -func writeUnvisSed(w io.Writer) { - fmt.Fprintln(w, strings.TrimSpace(` -# Code generated by gen_vis_scripts. DO NOT EDIT. -# Replace octal escape sequences with the bytes they represent. -# NOTE: not a fully general unvis program. - `)) - fmt.Fprintln(w, "") - - for i := 0x00; i <= 0xFF; i++ { - b := byte(i) - if b == '\\' { - continue - } - fmt.Fprintf(w, `s/\\%03[1]o/\x%02[1]x/g%[2]c`, b, newline) - } - fmt.Fprintln(w, "") - - fmt.Fprintln(w, strings.TrimSpace(` -# Unvis of backslash must be applied last to avoid double-interpretation. -s/\\134/\\/g - `)) -} From 9730733d909b3ff139e450f409835b51b9a310c8 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:52:35 -0500 Subject: [PATCH 28/36] Update test content to be accurate We now pass-through SPACE as well. 
--- lib/tests/vis_encoding.bats | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 6807b1513..5ad9a9ed4 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -26,13 +26,14 @@ paste() { @test "vis encode passthrough text" { cat <<'EOF' >"$BATS_TEST_TMPDIR/input" -Newlines (\n), backslahes (\\), and graphical ASCII ([[:graph:]]) characters are passed through unencoded. -Upstream encoders should escape the first two in content they feed to the general encoder. +Newlines (\n), backslahes (\\), spaces (\s), and graphical ASCII ([[:graph:]]) characters are passed through unencoded. +Upstream encoders should escape the first three in content they feed to the general encoder. Newline => \012 Backslash => \134 + Space => \040 -These gaps enable our encoder to operate on newline-delimited records of vis-encoded content. +These gaps enable our encoder to operate on newline-delimited records of space-delimited fields of vis-encoded content. EOF gawk -bf "$VIS_ESCAPE" <"$BATS_TEST_TMPDIR/input" >"$BATS_TEST_TMPDIR/output" From 7e2819392a855999832ea0da19f2df4bfd182ea0 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 19 Feb 2025 23:55:37 -0500 Subject: [PATCH 29/36] Acknowledge that some in-process escaping is occurring and explain why. --- lib/private/tar.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 0a536abbd..606be6bb5 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -379,6 +379,7 @@ def _to_rlocation_path(file, workspace): def _vis_encode(filename): # Escaping of non-ASCII bytes cannot be performed within Starlark. # After writing content out, a second pass is performed with vis_escape.gawk. + # Backslash, newline, and space are not handled by vis_escape.gawk; we encode only these in-process. 
return filename.replace("\\", "\\134").replace("\n", "\\012").replace(" ", "\\040") def _expand(file, expander, transform = to_repository_relative_path): From 9a9c486dbadae0a38d1d7322b530317bd6a4c646 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:51:39 -0500 Subject: [PATCH 30/36] Use cp and cut from uutils as well --- lib/tests/vis_encoding.bats | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 5ad9a9ed4..b0c40987b 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -11,6 +11,12 @@ gawk() { cat() { "$COREUTILS" cat "$@" } +cp() { + "$COREUTILS" cp "$@" +} +cut() { + "$COREUTILS" cut "$@" +} tr() { "$COREUTILS" tr "$@" } From 0b5792bea523cdbd8fe80126d4cd24222f19cd63 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:52:30 -0500 Subject: [PATCH 31/36] Acknowledge that diff tool is not hermetic currently --- lib/tests/vis_encoding.bats | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index b0c40987b..8acd6ac04 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -17,6 +17,10 @@ cp() { cut() { "$COREUTILS" cut "$@" } +diff() { + # No toolchain diff tool available; rely on system version. `diff` is part of POSIX; it should be available. 
+ diff "$@" +} tr() { "$COREUTILS" tr "$@" } From 71d7ac3c19904289336f85bc7cb231e59620c012 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:53:01 -0500 Subject: [PATCH 32/36] Explain our goals with these utility function wrappers --- lib/tests/vis_encoding.bats | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 8acd6ac04..2448f4994 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -4,6 +4,8 @@ # For this reason, paragraph-delimited records of newline-delimited fields is a natural framing structure that will # be preserved through the encoding/decoding/canonicalizing transformation. +# Try to use utilities from toolchains and avoid dependencies on system utilities as much as possible. +# This gives us the greatest chance at consistency across platforms. gawk() { # TODO: from toolchain /opt/homebrew/bin/gawk "$@" From 43053bc2608630feacfe6ecf9f5ff2ae5dd3f6df Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:53:52 -0500 Subject: [PATCH 33/36] Avoid infinite recursion --- lib/tests/vis_encoding.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 2448f4994..6f7af83d7 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -21,7 +21,7 @@ cut() { } diff() { # No toolchain diff tool available; rely on system version. `diff` is part of POSIX; it should be available. 
- diff "$@" + $(which diff) "$@" } tr() { "$COREUTILS" tr "$@" From 50666959688aaf7b0dd077facfaeda9d35036dc5 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:55:14 -0500 Subject: [PATCH 34/36] Sort utilities wrappers --- lib/tests/vis_encoding.bats | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index 6f7af83d7..a81b7ccd7 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -6,9 +6,8 @@ # Try to use utilities from toolchains and avoid dependencies on system utilities as much as possible. # This gives us the greatest chance at consistency across platforms. -gawk() { - # TODO: from toolchain - /opt/homebrew/bin/gawk "$@" +basenc() { + "$COREUTILS" basenc "$@" } cat() { "$COREUTILS" cat "$@" @@ -23,11 +22,9 @@ diff() { # No toolchain diff tool available; rely on system version. `diff` is part of POSIX; it should be available. $(which diff) "$@" } -tr() { - "$COREUTILS" tr "$@" -} -basenc() { - "$COREUTILS" basenc "$@" +gawk() { + # TODO: from toolchain + /opt/homebrew/bin/gawk "$@" } od() { "$COREUTILS" od "$@" @@ -35,6 +32,9 @@ od() { paste() { "$COREUTILS" paste "$@" } +tr() { + "$COREUTILS" tr "$@" +} @test "vis encode passthrough text" { cat <<'EOF' >"$BATS_TEST_TMPDIR/input" From b4c835c5abe70faa3fada4d20278879d6ae2a563 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 11:57:11 -0500 Subject: [PATCH 35/36] Avoid word splitting --- lib/tests/vis_encoding.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tests/vis_encoding.bats b/lib/tests/vis_encoding.bats index a81b7ccd7..5ce6b6123 100644 --- a/lib/tests/vis_encoding.bats +++ b/lib/tests/vis_encoding.bats @@ -20,7 +20,7 @@ cut() { } diff() { # No toolchain diff tool available; rely on system version. `diff` is part of POSIX; it should be available. 
- $(which diff) "$@" + "$(which diff)" "$@" } gawk() { # TODO: from toolchain From 1fc404a1b89124834538ba9182e5e398267e8840 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Thu, 20 Feb 2025 12:45:21 -0500 Subject: [PATCH 36/36] Update mnemonic for accuracy --- lib/private/tar.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 606be6bb5..d28e8eb6e 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -462,7 +462,7 @@ def _mtree_impl(ctx): "UNESCAPED": unescaped.path, "OUT": out.path, }, - mnemonic = "EscapeNonAscii", + mnemonic = "VisEscape", ) return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))