Skip to content

Commit 586a964

Browse files
authored
Pax parsing should consistently use the FIRST pathname/linkname (libarchive#2264)
Pax introduced new headers that appear _before_ the legacy headers, so pax archives require earlier properties to override later ones. Originally, libarchive handled this by storing the early headers in memory so that it could do the actual parsing from back to front. With this scheme, properties from early headers were parsed last and simply overwrote properties from later headers. PR libarchive#2127 reduced memory usage by parsing headers in the order they appear in the file, which requires later headers to avoid overwriting already-set properties. Apparently, when I made this change, I did not fully consider how charset translations get handled on Windows, so I failed to consistently recognize when the path or linkname properties were actually set. As a result, the legacy path/link values (which have no charset information) overwrote the pax path/link values (which are known to be UTF-8), leading to the behavior observed in libarchive#2248. This PR corrects this bug by adding additional checks to see whether the wide-character path or linkname properties are set. Related: This bug was exposed by a new test added in libarchive#2228 which does a write/read validation to ensure round-trip filename handling. That test was modified in libarchive#2248 to avoid tickling the bug above. I've reverted the change from libarchive#2248 since it's no longer necessary. I have also added some additional validation to this test to help ensure that the intermediate archive actually is a pax-format archive that includes the expected path and linkname properties in the expected places.
1 parent 9fa8449 commit 586a964

File tree

2 files changed

+70
-8
lines changed

2 files changed

+70
-8
lines changed

Diff for: libarchive/archive_read_support_format_tar.c

+10-3
Original file line numberDiff line numberDiff line change
@@ -1294,6 +1294,7 @@ header_common(struct archive_read *a, struct tar *tar,
12941294
{
12951295
const struct archive_entry_header_ustar *header;
12961296
const char *existing_linkpath;
1297+
const wchar_t *existing_wcs_linkpath;
12971298
int err = ARCHIVE_OK;
12981299

12991300
header = (const struct archive_entry_header_ustar *)h;
@@ -1346,8 +1347,10 @@ header_common(struct archive_read *a, struct tar *tar,
13461347
switch (tar->filetype) {
13471348
case '1': /* Hard link */
13481349
archive_entry_set_link_to_hardlink(entry);
1350+
existing_wcs_linkpath = archive_entry_hardlink_w(entry);
13491351
existing_linkpath = archive_entry_hardlink(entry);
1350-
if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
1352+
if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
1353+
&& (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
13511354
struct archive_string linkpath;
13521355
archive_string_init(&linkpath);
13531356
archive_strncpy(&linkpath,
@@ -1422,8 +1425,10 @@ header_common(struct archive_read *a, struct tar *tar,
14221425
break;
14231426
case '2': /* Symlink */
14241427
archive_entry_set_link_to_symlink(entry);
1428+
existing_wcs_linkpath = archive_entry_symlink_w(entry);
14251429
existing_linkpath = archive_entry_symlink(entry);
1426-
if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
1430+
if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
1431+
&& (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
14271432
struct archive_string linkpath;
14281433
archive_string_init(&linkpath);
14291434
archive_strncpy(&linkpath,
@@ -1677,7 +1682,9 @@ header_ustar(struct archive_read *a, struct tar *tar,
16771682

16781683
/* Copy name into an internal buffer to ensure null-termination. */
16791684
const char *existing_pathname = archive_entry_pathname(entry);
1680-
if (existing_pathname == NULL || existing_pathname[0] == '\0') {
1685+
const wchar_t *existing_wcs_pathname = archive_entry_pathname_w(entry);
1686+
if ((existing_pathname == NULL || existing_pathname[0] == '\0')
1687+
&& (existing_wcs_pathname == NULL || existing_wcs_pathname[0] == '\0')) {
16811688
archive_string_init(&as);
16821689
if (header->prefix[0]) {
16831690
archive_strncpy(&as, header->prefix, sizeof(header->prefix));

Diff for: libarchive/test/test_pax_filename_encoding.c

+60-5
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
592592
struct archive *a;
593593
struct archive_entry *entry;
594594
char buff[0x2000];
595+
char *p;
595596
size_t used;
596597

597598
/*
@@ -608,11 +609,11 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
608609
archive_write_free(a);
609610
return;
610611
}
611-
612-
/* Re-create a write archive object since filenames should be written
613-
* in UTF-8 by default. */
614612
archive_write_free(a);
615613

614+
/*
615+
* Create a new archive handle with default charset handling
616+
*/
616617
a = archive_write_new();
617618
assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
618619
assertEqualInt(ARCHIVE_OK,
@@ -650,11 +651,63 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
650651
archive_entry_free(entry);
651652
assertEqualInt(ARCHIVE_OK, archive_write_free(a));
652653

653-
/* Ensure that the names round trip properly */
654+
/*
655+
* Examine the bytes to ensure the filenames ended up UTF-8
656+
* encoded as we expect.
657+
*/
658+
659+
/* Part 1: file */
660+
p = buff + 0;
661+
assertEqualString(p + 0, "PaxHeader/\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
662+
assertEqualInt(p[156], 'x'); /* Pax extension header */
663+
p += 512; /* Pax extension body */
664+
assertEqualString(p + 0, "19 path=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n");
665+
p += 512; /* Ustar header */
666+
assertEqualString(p + 0, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
667+
assertEqualInt(p[156], '0');
668+
669+
/* Part 2: directory */
670+
p += 512; /* Pax extension header */
671+
assertEqualString(p + 0, "PaxHeader/\xD0\xBF\xD1\x80\xD0\xB8"); /* File name */
672+
assertEqualInt(p[156], 'x');
673+
p += 512; /* Pax extension body */
674+
assertEqualString(p + 0, "16 path=\xD0\xBF\xD1\x80\xD0\xB8/\n");
675+
p += 512; /* Ustar header */
676+
assertEqualString(p + 0, "\xD0\xBF\xD1\x80\xD0\xB8/"); /* File name */
677+
assertEqualInt(p[156], '5'); /* directory */
678+
679+
/* Part 3: symlink */
680+
p += 512; /* Pax Extension Header */
681+
assertEqualString(p + 0, "PaxHeader/\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
682+
p += 512; /* Pax extension body */
683+
assertEqualString(p + 0,
684+
"19 path=\xE5\x86\x8D\xE8\xA7\x81.txt\n"
685+
"23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
686+
"31 LIBARCHIVE.symlinktype=file\n");
687+
p += 512; /* Ustar header */
688+
assertEqualString(p + 0, "\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
689+
assertEqualInt(p[156], '2'); /* symlink */
690+
assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
691+
692+
/* Part 4: hardlink */
693+
p += 512; /* Pax extension header */
694+
assertEqualString(p + 0, "PaxHeader/\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
695+
p += 512; /* Pax extension body */
696+
assertEqualString(p + 0,
697+
"19 path=\xE6\x99\x9A\xE5\xAE\x89.txt\n"
698+
"23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
699+
"31 LIBARCHIVE.symlinktype=file\n");
700+
p += 512; /* Ustar header */
701+
assertEqualString(p + 0, "\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
702+
assertEqualInt(p[156], '1'); /* hard link */
703+
assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
704+
705+
/*
706+
* Read back the archive to see if we get the original names
707+
*/
654708
a = archive_read_new();
655709
archive_read_support_format_all(a);
656710
archive_read_support_filter_all(a);
657-
assertEqualInt(ARCHIVE_OK, archive_read_set_options(a, "hdrcharset=UTF-8"));
658711
assertEqualInt(0, archive_read_open_memory(a, buff, used));
659712

660713
/* Read part 1: file */
@@ -674,6 +727,8 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
674727
assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
675728
assertEqualWString(L"\u665a\u5b89.txt", archive_entry_pathname_w(entry));
676729
assertEqualWString(L"\u4f60\u597d.txt", archive_entry_hardlink_w(entry));
730+
731+
archive_free(a);
677732
#endif
678733
}
679734

0 commit comments

Comments
 (0)