Pax parsing should consistently use the FIRST pathname/linkname (libarchive#2264)

kientzle · web-flow · commit 586a9645102c · 2024-07-09T13:55:23.000+02:00
Pax introduced new headers that appear _before_ the legacy headers. So pax archives require earlier properties to override later ones. Originally, libarchive handled this by storing the early headers in memory so that it could do the actual parsing from back to front. With this scheme, properties from early headers were parsed last and simply overwrote properties from later headers. PR libarchive#2127 reduced memory usage by parsing headers in the order they appear in the file, which requires later headers to avoid overwriting already-set properties. Apparently, when I made this change, I did not fully consider how charset translations get handled on Windows, so failed to consistently recognize when the path or linkname properties were in fact actually set. As a result, the legacy path/link values (which have no charset information) overwrote the pax path/link values (which are known to be UTF-8), leading to the behavior observed in libarchive#2248. This PR corrects this bug by adding additional tests to see if the wide character path or linkname properties are set. Related: This bug was exposed by a new test added in libarchive#2228 which does a write/read validation to ensure round-trip filename handling. This was modified in libarchive#2248 to avoid tickling the bug above. I've reverted the change from libarchive#2248 since it's no longer necessary. I have also added some additional validation to this test to help ensure that the intermediate archive actually is a pax format that includes the expected path and linkname properties in the expected places.
diff --git a/libarchive/archive_read_support_format_tar.c b/libarchive/archive_read_support_format_tar.c
@@ -1294,6 +1294,7 @@ header_common(struct archive_read *a, struct tar *tar,
 {
 	const struct archive_entry_header_ustar	*header;
 	const char *existing_linkpath;
+	const wchar_t *existing_wcs_linkpath;
 	int     err = ARCHIVE_OK;
 
 	header = (const struct archive_entry_header_ustar *)h;
@@ -1346,8 +1347,10 @@ header_common(struct archive_read *a, struct tar *tar,
 	switch (tar->filetype) {
 	case '1': /* Hard link */
 		archive_entry_set_link_to_hardlink(entry);
+		existing_wcs_linkpath = archive_entry_hardlink_w(entry);
 		existing_linkpath = archive_entry_hardlink(entry);
-		if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
+		if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
+		    && (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
 			struct archive_string linkpath;
 			archive_string_init(&linkpath);
 			archive_strncpy(&linkpath,
@@ -1422,8 +1425,10 @@ header_common(struct archive_read *a, struct tar *tar,
 		break;
 	case '2': /* Symlink */
 		archive_entry_set_link_to_symlink(entry);
+		existing_wcs_linkpath = archive_entry_symlink_w(entry);
 		existing_linkpath = archive_entry_symlink(entry);
-		if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
+		if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
+		    && (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
 			struct archive_string linkpath;
 			archive_string_init(&linkpath);
 			archive_strncpy(&linkpath,
@@ -1677,7 +1682,9 @@ header_ustar(struct archive_read *a, struct tar *tar,
 
 	/* Copy name into an internal buffer to ensure null-termination. */
 	const char *existing_pathname = archive_entry_pathname(entry);
-	if (existing_pathname == NULL || existing_pathname[0] == '\0') {
+	const wchar_t *existing_wcs_pathname = archive_entry_pathname_w(entry);
+	if ((existing_pathname == NULL || existing_pathname[0] == '\0')
+	    && (existing_wcs_pathname == NULL || existing_wcs_pathname[0] == '\0')) {
 		archive_string_init(&as);
 		if (header->prefix[0]) {
 			archive_strncpy(&as, header->prefix, sizeof(header->prefix));
diff --git a/libarchive/test/test_pax_filename_encoding.c b/libarchive/test/test_pax_filename_encoding.c
@@ -592,6 +592,7 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
 	struct archive *a;
 	struct archive_entry *entry;
 	char buff[0x2000];
+	char *p;
 	size_t used;
 
 	/*
@@ -608,11 +609,11 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
 		archive_write_free(a);
 		return;
 	}
-
-	/* Re-create a write archive object since filenames should be written
-	 * in UTF-8 by default. */
 	archive_write_free(a);
 
+	/*
+	 * Create a new archive handle with default charset handling
+	 */
 	a = archive_write_new();
 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
 	assertEqualInt(ARCHIVE_OK,
@@ -650,11 +651,63 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
 	archive_entry_free(entry);
 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
 
-	/* Ensure that the names round trip properly */
+	/*
+	 * Examine the bytes to ensure the filenames ended up UTF-8
+	 * encoded as we expect.
+	 */
+
+	/* Part 1: file */
+	p = buff + 0;
+	assertEqualString(p + 0, "PaxHeader/\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
+	assertEqualInt(p[156], 'x'); /* Pax extension header */
+	p += 512; /* Pax extension body */
+	assertEqualString(p + 0, "19 path=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n");
+	p += 512; /* Ustar header */
+	assertEqualString(p + 0, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
+	assertEqualInt(p[156], '0');
+
+	/* Part 2: directory */
+	p += 512; /* Pax extension header */
+	assertEqualString(p + 0, "PaxHeader/\xD0\xBF\xD1\x80\xD0\xB8"); /* File name */
+	assertEqualInt(p[156], 'x');
+	p += 512; /* Pax extension body */
+	assertEqualString(p + 0, "16 path=\xD0\xBF\xD1\x80\xD0\xB8/\n");
+	p += 512; /* Ustar header */
+	assertEqualString(p + 0, "\xD0\xBF\xD1\x80\xD0\xB8/"); /* File name */
+	assertEqualInt(p[156], '5'); /* directory */
+
+	/* Part 3: symlink */
+	p += 512; /* Pax Extension Header */
+	assertEqualString(p + 0, "PaxHeader/\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
+	p += 512; /* Pax extension body */
+	assertEqualString(p + 0,
+			  "19 path=\xE5\x86\x8D\xE8\xA7\x81.txt\n"
+			  "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
+			  "31 LIBARCHIVE.symlinktype=file\n");
+	p += 512; /* Ustar header */
+	assertEqualString(p + 0, "\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
+	assertEqualInt(p[156], '2'); /* symlink */
+	assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
+
+	/* Part 4: hardlink */
+	p += 512; /* Pax extension header */
+	assertEqualString(p + 0, "PaxHeader/\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
+	p += 512; /* Pax extension body */
+	assertEqualString(p + 0,
+			  "19 path=\xE6\x99\x9A\xE5\xAE\x89.txt\n"
+			  "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
+			  "31 LIBARCHIVE.symlinktype=file\n");
+	p += 512; /* Ustar header */
+	assertEqualString(p + 0, "\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
+	assertEqualInt(p[156], '1'); /* hard link */
+	assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
+
+	/*
+	 * Read back the archive to see if we get the original names
+	 */
 	a = archive_read_new();
 	archive_read_support_format_all(a);
 	archive_read_support_filter_all(a);
-	assertEqualInt(ARCHIVE_OK, archive_read_set_options(a, "hdrcharset=UTF-8"));
 	assertEqualInt(0, archive_read_open_memory(a, buff, used));
 
 	/* Read part 1: file */
@@ -674,6 +727,8 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
 	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
 	assertEqualWString(L"\u665a\u5b89.txt", archive_entry_pathname_w(entry));
 	assertEqualWString(L"\u4f60\u597d.txt", archive_entry_hardlink_w(entry));
+
+	archive_free(a);
 #endif
 }