Skip to content

ls parser speed improvements, reworked #4408

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
68478db
Several speed improvements for parsing ls directory output
PaulWay Feb 3, 2025
7bdb5e2
Minor flake8 fix
PaulWay Feb 3, 2025
4feb195
Ignore any line that doesn't have at least five parts (perms, links, …
PaulWay Feb 4, 2025
29a6e62
Abstract size/major,minor parsing out, other parsing consolidation
PaulWay Feb 4, 2025
39ec3b6
Minor improvement - speed up finding mode if we have a links count
PaulWay Feb 19, 2025
cc5d2cb
Store parser function directly rather than looking it up
PaulWay Feb 19, 2025
af8600e
Don't need return from parser function; slightly quicker if/elif/else…
PaulWay Feb 19, 2025
f0681fe
Simplify to set_link_name and set_selinux, use partition for rhel8_se…
PaulWay Feb 26, 2025
8d6197f
Parse date as well as major,minor/size, don't store raw_entry, do typ…
PaulWay Mar 6, 2025
3c641b3
Links are 'files' too
PaulWay Mar 31, 2025
084d40e
Test for correct handling of file name with -> in it
PaulWay Mar 31, 2025
1f0d14e
Handle file names (and links) that are quoted by ls
PaulWay Mar 31, 2025
5020ea4
Improving documentation (no large dict compares)
PaulWay Mar 31, 2025
027b101
Removing direct raw_entry key comparisons
PaulWay Mar 31, 2025
37dea65
Faster to handle normal line first - no partition, just isdigit
PaulWay Apr 1, 2025
3024a44
Merge branch 'master' into paulway_ls_parser_speed_improvements
PaulWay Apr 2, 2025
ddf347d
Merge branch 'master' into paulway_ls_parser_speed_improvements
xiangce Apr 10, 2025
c942700
Merge branch 'master' into paulway_ls_parser_speed_improvements
xiangce Apr 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 163 additions & 137 deletions insights/core/ls_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,47 @@
"""


def parse_path(path):
def trim_fname(fname):
    """
    Remove enclosing quotes or double quotes in file name.

    Args:
        fname (str): a file or link name, possibly enclosed in a pair of
            single or double quotes.

    Returns:
        str: ``fname`` without its enclosing pair of matching quotes, or
        ``fname`` unchanged when it is not enclosed in such a pair.
    """
    # An empty string (for example, the link part of a softlink line that
    # has no " -> " in it) or a single character cannot be enclosed in a
    # pair of quotes; indexing an empty string would raise IndexError.
    if len(fname) < 2:
        return fname
    # Fastest way seems to be compare to set first, then compare first and last
    if fname[0] in {'"', "'"} and fname[0] == fname[-1]:
        return fname[1:-1]
    return fname


def set_name_link(entry, is_softlink, path):
"""
Get the name, and possibly the link, from the rest of the path.

Args:
path (str): The path portion of an ls output line.
entry (dict): the dict to put this data into
is_softlink (bool): is this actually a softlink?
path (str): the rest of the line, optionally including the link

Returns:
A (path, link) tuple where path is always populated and link is a
non-empty string if the original path is a symbolic link.
No return; the name key (and, for a softlink, the link key) is added to the entry dict.
"""
path, _, link = path.partition(" -> ")
return path, link
if is_softlink:
name, _, link = path.partition(" -> ")
entry['name'] = trim_fname(name)
entry['link'] = trim_fname(link)
else:
entry['name'] = trim_fname(path)


def parse_non_selinux(parts):
def parse_major_minor_date(last, result):
"""
Parse part of an ls output line that isn't selinux.
Parse the size / major, minor and date section of the line.

Args:
parts (list): A four element list of strings representing the initial
parts of an ls line after the permission bits. The parts are link
count, owner, group, and everything else.
last (str): the rest of the line after the owner and group.
result (dict): the dirent dict to put details in.

Returns:
A dict containing links, owner, group, date, and name. If the line
represented a device, major and minor numbers are included. Otherwise,
size is included. If the raw name was a symbolic link, link is
included.
The file name portion of the line.
"""
links, owner, group, last = parts
result = {
"links": int(links),
"owner": owner,
"group": group,
}

# device numbers only go to 256.
# If a comma is in the first four characters, the next two elements are
# major and minor device numbers. Otherwise, the next element is the size.
Expand All @@ -50,153 +54,175 @@ def parse_non_selinux(parts):
result["minor"] = int(minor)
else:
size, rest = last.split(None, 1)
result["size"] = int(size)

result["size"] = size if size == '?' else int(size)
# The date part is always 12 characters regardless of content.
result["date"] = rest[:12]
result['date'] = rest[:12]
return rest[13:]

# Jump over the date and the following space to get the path part.
path, link = parse_path(rest[13:])
result["name"] = path
if link:
result["link"] = link

return result
def parse_non_selinux(entry, is_softlink, links, owner, group, last):
"""
Parse part of an ls output line that isn't selinux.

Args:
entry (dict): the dict to put this data into
is_softlink (bool): is this actually a softlink?
links (str): the number of links on this dirent
owner (str): the owner (name or id) of this dirent
group (str): the group (name or id) of this dirent
last (str): the rest of the line

def parse_selinux(parts):
Returns:
No return; links, owner, group, date, and name are put directly into
the entry dict. If the line represented a device, major and minor
numbers are included. Otherwise, size is included. If the line is a
symbolic link, link is included.
"""
# prw-------. 1 0 0 0 Jun 28 09:44 5.ref
# l?????????? ? ? ? ? ? invocation:auditd.service

entry["links"] = links if links == '?' else int(links)
entry["owner"] = owner
entry["group"] = group

rest = parse_major_minor_date(last, entry)

set_name_link(entry, is_softlink, rest)


def set_selinux(entry, selinux_str):
    """
    Set the SELinux part of this entry.

    Args:
        entry (dict): the dict to put this data into
        selinux_str (str): the SELinux context of this dirent, as
            colon-separated ``user:role:type:level`` fields; a context
            with fewer fields is accepted.

    Returns:
        No return; se_user, se_role, se_type and se_mls are put directly
        into the entry dict, with None for any field that is missing.
    """
    # Split on the first three colons only: the level field may itself
    # contain a colon between the sensitivity and the categories (for
    # example "s0:c123,c456"), and an unlimited split would silently drop
    # everything after that colon.
    fields = selinux_str.split(":", 3)
    # Pad so that absent trailing fields come out as None.
    fields += [None] * (4 - len(fields))
    entry["se_user"] = fields[0]
    entry["se_role"] = fields[1]
    entry["se_type"] = fields[2]
    entry["se_mls"] = fields[3]


def parse_old_selinux(entry, is_softlink, owner, group, selinux_str, name_part):
"""
Parse part of an ls output line that is selinux.

Args:
parts (list): A four element list of strings representing the initial
parts of an ls line after the permission bits. The parts are owner
group, selinux info, and the path.
entry (dict): the dict to put this data into
is_softlink (bool): is this actually a softlink?
owner (str): the owner (name or id) of this dirent
group (str): the group (name or id) of this dirent
selinux_str (str): the SELinux context of this dirent
name_part (str): the name (and possibly the link)

Returns:
A dict containing owner, group, se_user, se_role, se_type, se_mls, and
name. If the raw name was a symbolic link, link is also included.
No return; the ownership, SELinux context information and name part
are put directly into the entry dict.

"""

owner, group = parts[:2]
selinux = parts[2].split(":")
lsel = len(selinux)
path, link = parse_path(parts[-1])
result = {
"owner": owner,
"group": group,
"se_user": selinux[0],
"se_role": selinux[1] if lsel > 1 else None,
"se_type": selinux[2] if lsel > 2 else None,
"se_mls": selinux[3] if lsel > 3 else None,
"name": path,
}
if link:
result["link"] = link
return result


def parse_rhel8_selinux(parts):
entry["owner"] = owner
entry["group"] = group
set_selinux(entry, selinux_str)
set_name_link(entry, is_softlink, name_part)


def parse_rhel8_selinux(entry, is_softlink, links, owner, group, last):
"""
Parse part of an ls output line that is selinux on RHEL8.

Args:
parts (list): A four element list of strings representing the initial
parts of an ls line after the permission bits. The parts are link
count, owner, group, and everything else
entry (dict): the dict to put this data into
is_softlink (bool): is this actually a softlink?
links (str): the number of links on this dirent
owner (str): the owner (name or id) of this dirent
group (str): the group (name or id) of this dirent
last (str): the rest of the line

Returns:
A dict containing links, owner, group, se_user, se_role, se_type,
se_mls, size, date, and name. If the raw name was a symbolic link,
link is also included.
No return; the ownership, SELinux context information and name part
are put directly into the entry dict.

"""
entry["links"] = int(links) if links.isdigit() else links
entry["owner"] = owner
entry["group"] = group
selinux_str, last = last.split(None, 1)
set_selinux(entry, selinux_str)
rest = parse_major_minor_date(last, entry)
set_name_link(entry, is_softlink, rest)

links, owner, group, last = parts
result = {
"links": int(links),
"owner": owner,
"group": group,
}
selinux, last = parts[-1].split(None, 1)
selinux = selinux.split(":")
lsel = len(selinux)
if "," in last:
major, minor, last = last.split(None, 2)
result['major'] = int(major.rstrip(","))
result['minor'] = int(minor)
else:
size, last = last.split(None, 1)
result['size'] = int(size)
date = last[:12]
path, link = parse_path(last[13:])
result.update(
{
"se_user": selinux[0],
"se_role": selinux[1] if lsel > 1 else None,
"se_type": selinux[2] if lsel > 2 else None,
"se_mls": selinux[3] if lsel > 3 else None,
"name": path,
"date": date,
}
)
if link:
result["link"] = link
return result

parse_mode = {
'normal': parse_non_selinux,
'selinux': parse_old_selinux,
'rhel8_selinux': parse_rhel8_selinux
}


class Directory(dict):
def __init__(self, name, total, body):
def __init__(self, dirname, total, body):
dirs = []
ents = {}
files = []
specials = []
parser = None
for line in body:
# we can't split(None, 5) here b/c rhel 6/7 selinux lines only have
# 4 parts before the path, and the path itself could contain
# spaces. Unfortunately, this means we have to split the line again
# below
parts = line.split(None, 4)
perms = parts[0]
try:
perms, links, owner, group, rest = line.split(None, 4)
except ValueError:
# Ignore malformed lines completely
continue
typ = perms[0]
entry = {"type": typ, "perms": perms[1:]}
if parts[1][0].isdigit():
# We have to split the line again to see if this is a RHEL8
# selinux stanza. This assumes that the context section will
# always have at least two pieces separated by ':'.
# '?' as the whole RHEL8 security context is also acceptable.
rhel8_selinux_ctx = line.split()[4].strip()
if ":" in rhel8_selinux_ctx or '?' == rhel8_selinux_ctx:
rest = parse_rhel8_selinux(parts[1:])
entry = {
"type": typ,
"perms": perms[1:],
"dir": dirname,
}
# determine mode once per directory
if parser is None:
if links[0].isdigit():
# We have to split the line again to see if this is a RHEL8
# selinux stanza. This assumes that the context section will
# always have at least two pieces separated by ':'.
# '?' as the whole RHEL8 security context is also acceptable.
# Handle normal case first...
if rest[0].isdigit():
# crw-------. 1 0 0 10, 236 Jul 25 10:00 control
# lrwxrwxrwx. 1 0 0 11 Aug 4 2014 menu.lst -> ./grub.conf
parser = parse_mode['normal']
else:
# -rwxrwxr-x. 1 user group unconfined_u:object_r:var_lib_t:s0 54 Apr 8 16:41 abcd-efgh-ijkl-mnop
parser = parse_mode['rhel8_selinux']
else:
rest = parse_non_selinux(parts[1:])
else:
rest = parse_selinux(parts[1:])

# Update our entry and put it into the correct buckets
# based on its type.
entry.update(rest)
# TODO
# - The `raw_entry` key is deprecated and will be removed from 3.6.0.
# Please use the `insights.parsers.ls.FileListingParser.raw_entry_of` instead.
entry["raw_entry"] = line
entry["dir"] = name
# -rw-r--r--. root root system_u:object_r:boot_t:s0 config-3.10.0-267
parser = parse_mode['selinux']
# Now parse based on mode
parser(entry, typ == 'l', links, owner, group, rest)

# final details
# entry["raw_entry"] = line

nm = entry["name"]
ents[nm] = entry
if typ not in "bcd":
# Files are most common, dirs next, so we handle this in frequency order
if typ in "-l":
files.append(nm)
elif typ == "d":
dirs.append(nm)
elif typ in "bc":
else:
specials.append(nm)

super(Directory, self).__init__(
{
"dirs": dirs,
"entries": ents,
"files": files,
"name": name,
"name": dirname,
"specials": specials,
"total": total,
}
Expand Down Expand Up @@ -224,25 +250,25 @@ def parse(lines, root=None):
for line in lines:
line = line.strip()
# Skip empty line and non-exist dir line
if not line or ': No such file or directory' in line:
if not line or ': No such file or directory' in line or 'cannot open directory' in line:
continue
if line and line[0] == "/" and line[-1] == ":":
if name is None:
name = line[:-1]
if entries:
d = Directory(name, total or len(entries), entries)
doc[root] = d
total = None
entries = []
else:
d = Directory(name, total or len(entries), entries)
doc[name or root] = d
total = None
entries = []
name = line[:-1]
if line[0] == "/" and line[-1] == ":":
# Directory name - like '/tmp:'
if total is None:
total = len(entries)
# Some old directory listings don't have an initial name line,
# so we put any entries we collected before a named directory in
# our 'root' directory - if we got a 'root' directory at all...
old_name = root if name is None else name
if old_name is not None:
doc[old_name] = Directory(old_name, total, entries)
name = line[:-1]
total = None
entries = []
continue
if line.startswith("total"):
total = int(line.split(None, 1)[1])
# Should be first line after directory name
total = int(line[6:])
continue
entries.append(line)
name = name or root
Expand Down
Loading
Loading