Skip to content

gh-130819: Update tarfile.py#_create_gnu_long_header to align with GNU Tar #130820

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1708,6 +1708,15 @@ sysconfig
(Contributed by Xuehai Pan in :gh:`131799`.)


tarfile
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will need to be moved to whatsnew/3.15.rst now.

-------

* Emit ``mode``, ``uname`` and ``gname`` fields for long paths in
:mod:`tarfile` archives, providing better bit-for-bit compatibility with GNU
:manpage:`tar(1)`.
(Contributed by Dahan Gong in :gh:`130820`.)


threading
---------

Expand Down
55 changes: 39 additions & 16 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,9 @@ class TarInfo(object):
_link_target = None,
)

_name_uid0 = None # Cached uname of uid=0
_name_gid0 = None # Cached gname of gid=0
Comment on lines +898 to +899
Copy link
Member

@picnixz picnixz May 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant before is: why use a class variable? The issue is that once we deduce uid=0, we're stuck with it for the entire Python process.

EDIT: I didn't see your comment, my bad. Then we need to think of another solution, because storing them in TarFile feels wrong. What we can do is add a private attribute to TarInfo and populate it from TarFile. When writing, if the attribute is not set, we populate it eagerly (and thus subclasses of TarInfo will be slower, but they won't be broken). Or instead, we can even just dump them the legacy way (namely, without aligning with GNU Tar). Only default TarFile and TarInfo objects will have this new feature.

More generally, we should be able to set cached contextual information on TarInfo objects coming from a TarFile.


def __init__(self, name=""):
"""Construct a TarInfo object. name is the optional name
of the member.
Expand Down Expand Up @@ -1202,6 +1205,13 @@ def _create_gnu_long_header(cls, name, type, encoding, errors):
info["type"] = type
info["size"] = len(name)
info["magic"] = GNU_MAGIC
info["mode"] = 0o100644
if cls._name_uid0 is None or cls._name_gid0 is None:
user_group_names = TarFile._get_user_group_names(0, 0, {}, {})
cls._name_uid0 = user_group_names[0] or ""
cls._name_gid0 = user_group_names[1] or ""
info["uname"] = cls._name_uid0
info["gname"] = cls._name_gid0

# create extended header + name blocks.
return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
Expand Down Expand Up @@ -2202,22 +2212,12 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None):
tarinfo.type = type
tarinfo.linkname = linkname

# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To
# speed things up, cache the resolved usernames and group names.
if pwd:
if tarinfo.uid not in self._unames:
try:
self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0]
except KeyError:
self._unames[tarinfo.uid] = ''
tarinfo.uname = self._unames[tarinfo.uid]
if grp:
if tarinfo.gid not in self._gnames:
try:
self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0]
except KeyError:
self._gnames[tarinfo.gid] = ''
tarinfo.gname = self._gnames[tarinfo.gid]
uname, gname = TarFile._get_user_group_names(tarinfo.uid, tarinfo.gid,
self._unames, self._gnames)
if uname is not None:
tarinfo.uname = uname
if gname is not None:
tarinfo.gname = gname

if type in (CHRTYPE, BLKTYPE):
if hasattr(os, "major") and hasattr(os, "minor"):
Expand Down Expand Up @@ -2560,6 +2560,29 @@ def _extract_member(self, tarinfo, targetpath, set_attrs=True,
self.chmod(tarinfo, targetpath)
self.utime(tarinfo, targetpath)

def _get_user_group_names(uid, gid, unames_cache, gnames_cache):
# Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive.
# To speed things up, cache the resolved usernames and group names.
if pwd:
if uid not in unames_cache:
try:
unames_cache[uid] = pwd.getpwuid(uid)[0]
except KeyError:
unames_cache[uid] = ''
uname = unames_cache[uid]
else:
uname = None
if grp:
if gid not in gnames_cache:
try:
gnames_cache[gid] = grp.getgrgid(gid)[0]
except KeyError:
gnames_cache[gid] = ''
gname = gnames_cache[gid]
else:
gname = None
return uname, gname

#--------------------------------------------------------------------------
# Below are the different file methods. They are called via
# _extract_member() when extract() is called. They can be replaced in a
Expand Down
24 changes: 24 additions & 0 deletions Lib/test/test_tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1908,6 +1908,30 @@ def test_longnamelink_1025(self):
self._test(("longnam/" * 127) + "longname_",
("longlnk/" * 127) + "longlink_")

def test_hidden_header_for_gnulong(self):
    """Check mode/uname/gname of the hidden GNU long-name header."""
    # Regression test for gh-130819.
    memory_file = io.BytesIO()
    # The name is longer than the 100-byte ustar name field, so writing
    # it in GNU format requires an extra long-name header.
    with tarfile.open(mode="w", fileobj=memory_file,
                      format=tarfile.GNU_FORMAT) as tar:
        tar_info = tarfile.TarInfo("abcdef" * 20)
        tar_info.type = tarfile.DIRTYPE
        tar.addfile(tar_info, None)
        # No explicit close(): the with statement closes the archive.

    # Inside the hook below "self" is the TarInfo being processed, so keep
    # a separate handle on the test case for the assertions.
    tester = self
    # Types of the long-name/long-link headers the hook has inspected.
    checked_types = []

    class RawTarInfo(tarfile.TarInfo):

        def _proc_member(self, tar_file):
            # Inspect the raw header before the base class processes it.
            if self.type in (tarfile.GNUTYPE_LONGNAME,
                             tarfile.GNUTYPE_LONGLINK):
                checked_types.append(self.type)
                tester.assertEqual(self.mode, 0o644)
                tester.assertEqual(self.uname, RawTarInfo._name_uid0)
                tester.assertEqual(self.gname, RawTarInfo._name_gid0)
            return super()._proc_member(tar_file)

    memory_file.seek(0)
    with tarfile.open(fileobj=memory_file, mode="r",
                      tarinfo=RawTarInfo) as tar:
        members = tar.getmembers()
    self.assertEqual(len(members), 1)
    # Guard against a vacuous pass: the hook must have seen the header.
    self.assertEqual(checked_types, [tarfile.GNUTYPE_LONGNAME])


class DeviceHeaderTest(WriteTestBase, unittest.TestCase):

Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,7 @@ Mikhail Golubev
Marta Gómez Macías
Guilherme Gonçalves
Tiago Gonçalves
Dahan Gong
Chris Gonnerman
Shelley Gooch
David Goodger
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Emit ``mode``, ``uname`` and ``gname`` fields for long paths in
:mod:`tarfile` archives, providing better bit-for-bit compatibility with GNU
``tar(1)``.
Loading