RedHatInsights · PaulWay · Feb 3, 2025 · Feb 3, 2025 · Feb 4, 2025 · Feb 4, 2025
diff --git a/insights/core/ls_parser.py b/insights/core/ls_parser.py
@@ -4,43 +4,47 @@
 """
 
 
-def parse_path(path):
+def trim_fname(fname):
     """
-    Convert possible symbolic link into a source -> target pair.
+    Return fname (str) without one matching pair of enclosing single or double quotes.
+    """
+    # Shorter than two characters (e.g. an empty link target) cannot be a quoted name
+    if len(fname) > 1 and fname[0] in {'"', "'"} and fname[0] == fname[-1]:
+        return fname[1:-1]
+    return fname
+
+
+def set_name_link(entry, is_softlink, path):
+    """
+    Store the file name, and for a softlink its target, from the tail of an ls line.
 
     Args:
-        path (str): The path portion of an ls output line.
+        entry (dict): the dirent dict that receives the 'name' and 'link' keys
+        is_softlink (bool): when true, path is split on ' -> ' into name and link
+        path (str): the tail of the line: the name, optionally with ' -> target'
 
     Returns:
-        A (path, link) tuple where path is always populated and link is a non
-        empty string if the original path is a symoblic link.
+        None; 'name' is always set on entry, 'link' only when is_softlink is true
     """
-    path, _, link = path.partition(" -> ")
-    return path, link
+    if is_softlink:
+        name, _, link = path.partition(" -> ")
+        entry['name'] = trim_fname(name)
+        entry['link'] = trim_fname(link)
+    else:
+        entry['name'] = trim_fname(path)
 
 
-def parse_non_selinux(parts):
+def parse_major_minor_date(last, result):
     """
-    Parse part of an ls output line that isn't selinux.
+    Parse the size (or device major, minor) and the date section of an ls line.
 
     Args:
-        parts (list): A four element list of strings representing the initial
-            parts of an ls line after the permission bits. The parts are link
-            count, owner, group, and everything else.
+        last (str): the rest of the line, starting at the size or device numbers.
+        result (dict): the dirent dict that receives size (or device numbers) and date.
 
     Returns:
-        A dict containing links, owner, group, date, and name. If the line
-        represented a device, major and minor numbers are included.  Otherwise,
-        size is included. If the raw name was a symbolic link, link is
-        included.
+        str: the text after the date and one separator: the name and any ' -> target'.
     """
-    links, owner, group, last = parts
-    result = {
-        "links": int(links),
-        "owner": owner,
-        "group": group,
-    }
-
     # device numbers only go to 256.
     # If a comma is in the first four characters, the next two elements are
     # major and minor device numbers. Otherwise, the next element is the size.
@@ -50,153 +54,175 @@ def parse_non_selinux(parts):
         result["minor"] = int(minor)
     else:
         size, rest = last.split(None, 1)
-        result["size"] = int(size)
-
+        result["size"] = size if size == '?' else int(size)
     # The date part is always 12 characters regardless of content.
-    result["date"] = rest[:12]
+    result['date'] = rest[:12]
+    return rest[13:]
 
-    # Jump over the date and the following space to get the path part.
-    path, link = parse_path(rest[13:])
-    result["name"] = path
-    if link:
-        result["link"] = link
 
-    return result
+def parse_non_selinux(entry, is_softlink, links, owner, group, last):
+    """
+    Parse part of an ls output line that isn't selinux.
 
+    Args:
+        entry (dict): the dict to put this data into
+        is_softlink (bool): is this actually a softlink?
+        links (str): the number of links on this dirent, or '?' when unknown
+        owner (str): the owner (name or id) of this dirent
+        group (str): the group (name or id) of this dirent
+        last (str): the rest of the line: size or device numbers, date and name
 
-def parse_selinux(parts):
+    Returns:
+        None. The links, owner and group are stored in the entry dict, followed
+        by the device numbers (for a device) or the size, then the date and the
+        name. If the dirent is a softlink, link is stored as well.
+        Nothing is returned to the caller.
+    """
+    # prw-------.  1 0 0   0 Jun 28 09:44 5.ref
+    # l??????????  ? ? ?    ?            ? invocation:auditd.service
+
+    entry["links"] = links if links == '?' else int(links)
+    entry["owner"] = owner
+    entry["group"] = group
+
+    rest = parse_major_minor_date(last, entry)
+
+    set_name_link(entry, is_softlink, rest)
+
+
+def set_selinux(entry, selinux_str):
+    """
+    Store user, role, type and mls of a ':'-separated SELinux context (None where absent).
+    """
+    selinux = selinux_str.split(":")
+    lsel = len(selinux)
+    entry["se_user"] = selinux[0]  # split() always yields at least one field
+    entry["se_role"] = selinux[1] if lsel > 1 else None
+    entry["se_type"] = selinux[2] if lsel > 2 else None
+    entry["se_mls"] = selinux[3] if lsel > 3 else None
+
+
+def parse_old_selinux(entry, is_softlink, owner, group, selinux_str, name_part):
     """
     Parse part of an ls output line that is selinux.
 
     Args:
-        parts (list): A four element list of strings representing the initial
-            parts of an ls line after the permission bits. The parts are owner
-            group, selinux info, and the path.
+        entry (dict): the dict to put this data into
+        is_softlink (bool): is this actually a softlink?
+        owner (str): the owner (name or id) of this dirent
+        group (str): the group (name or id) of this dirent
+        selinux_str (str): the SELinux context of this dirent
+        name_part (str): the name, followed by ' -> ' and the link target
+            when the dirent is a softlink
 
     Returns:
-        A dict containing owner, group, se_user, se_role, se_type, se_mls, and
-        name. If the raw name was a symbolic link, link is also included.
+        No return; the ownership, SELinux context information and name part
+        are put directly into the entry dict.
 
     """
 
-    owner, group = parts[:2]
-    selinux = parts[2].split(":")
-    lsel = len(selinux)
-    path, link = parse_path(parts[-1])
-    result = {
-        "owner": owner,
-        "group": group,
-        "se_user": selinux[0],
-        "se_role": selinux[1] if lsel > 1 else None,
-        "se_type": selinux[2] if lsel > 2 else None,
-        "se_mls": selinux[3] if lsel > 3 else None,
-        "name": path,
-    }
-    if link:
-        result["link"] = link
-    return result
-
-
-def parse_rhel8_selinux(parts):
+    entry["owner"] = owner
+    entry["group"] = group
+    set_selinux(entry, selinux_str)
+    set_name_link(entry, is_softlink, name_part)
+
+
+def parse_rhel8_selinux(entry, is_softlink, links, owner, group, last):
     """
     Parse part of an ls output line that is selinux on RHEL8.
 
     Args:
-        parts (list): A four element list of strings representing the initial
-            parts of an ls line after the permission bits. The parts are link
-            count, owner, group, and everything else
+        entry (dict): the dict to put this data into
+        is_softlink (bool): is this actually a softlink?
+        links (str): the link count; kept as a string when it is not all digits
+        owner (str): the owner (name or id) of this dirent
+        group (str): the group (name or id) of this dirent
+        last (str): the rest of the line, starting with the SELinux context
 
     Returns:
-        A dict containing links, owner, group, se_user, se_role, se_type,
-        se_mls, size, date, and name. If the raw name was a symbolic link,
-        link is also included.
+        No return; links, ownership, SELinux context, size or device numbers,
+        date and name (plus link for a softlink) are put into the entry dict.
 
     """
+    entry["links"] = int(links) if links.isdigit() else links
+    entry["owner"] = owner
+    entry["group"] = group
+    selinux_str, last = last.split(None, 1)
+    set_selinux(entry, selinux_str)
+    rest = parse_major_minor_date(last, entry)
+    set_name_link(entry, is_softlink, rest)
 
-    links, owner, group, last = parts
-    result = {
-        "links": int(links),
-        "owner": owner,
-        "group": group,
-    }
-    selinux, last = parts[-1].split(None, 1)
-    selinux = selinux.split(":")
-    lsel = len(selinux)
-    if "," in last:
-        major, minor, last = last.split(None, 2)
-        result['major'] = int(major.rstrip(","))
-        result['minor'] = int(minor)
-    else:
-        size, last = last.split(None, 1)
-        result['size'] = int(size)
-    date = last[:12]
-    path, link = parse_path(last[13:])
-    result.update(
-        {
-            "se_user": selinux[0],
-            "se_role": selinux[1] if lsel > 1 else None,
-            "se_type": selinux[2] if lsel > 2 else None,
-            "se_mls": selinux[3] if lsel > 3 else None,
-            "name": path,
-            "date": date,
-        }
-    )
-    if link:
-        result["link"] = link
-    return result
+
+parse_mode = {  # listing flavour -> parser; all are called with the same six positional args
+    'normal': parse_non_selinux,  # link count, owner, group, then size/date/name
+    'selinux': parse_old_selinux,  # RHEL 6/7: no link count, so the fields shift left by one
+    'rhel8_selinux': parse_rhel8_selinux  # as 'normal', with the context after the group
+}
 
 
 class Directory(dict):
-    def __init__(self, name, total, body):
+    def __init__(self, dirname, total, body):
         dirs = []
         ents = {}
         files = []
         specials = []
+        parser = None
         for line in body:
             # we can't split(None, 5) here b/c rhel 6/7 selinux lines only have
             # 4 parts before the path, and the path itself could contain
             # spaces. Unfortunately, this means we have to split the line again
             # below
-            parts = line.split(None, 4)
-            perms = parts[0]
+            try:
+                perms, links, owner, group, rest = line.split(None, 4)
+            except ValueError:
+                # Fewer than five whitespace-separated fields: skip the line
+                continue
             typ = perms[0]
-            entry = {"type": typ, "perms": perms[1:]}
-            if parts[1][0].isdigit():
-                # We have to split the line again to see if this is a RHEL8
-                # selinux stanza. This assumes that the context section will
-                # always have at least two pieces separated by ':'.
-                # '?' as the whole RHEL8 security context is also acceptable.
-                rhel8_selinux_ctx = line.split()[4].strip()
-                if ":" in rhel8_selinux_ctx or '?' == rhel8_selinux_ctx:
-                    rest = parse_rhel8_selinux(parts[1:])
+            entry = {
+                "type": typ,
+                "perms": perms[1:],
+                "dir": dirname,
+            }
+            # determine mode once per directory, from its first well-formed line
+            if parser is None:
+                if links[0].isdigit():
+                    # A numeric link count means either a plain listing or a
+                    # RHEL8 selinux one. In a plain listing the text after the
+                    # group starts with a digit (size or device major number);
+                    # anything else there is taken to be the SELinux context,
+                    # which also covers '?' as the whole context.
+                    if rest[0].isdigit():
+                        # crw-------.  1 0 0 10,  236 Jul 25 10:00 control
+                        # lrwxrwxrwx.  1 0 0       11 Aug  4  2014 menu.lst -> ./grub.conf
+                        parser = parse_mode['normal']
+                    else:
+                        # -rwxrwxr-x. 1 user group unconfined_u:object_r:var_lib_t:s0 54 Apr  8 16:41 abcd-efgh-ijkl-mnop
+                        parser = parse_mode['rhel8_selinux']
                 else:
-                    rest = parse_non_selinux(parts[1:])
-            else:
-                rest = parse_selinux(parts[1:])
-
-            # Update our entry and put it into the correct buckets
-            # based on its type.
-            entry.update(rest)
-            # TODO
-            # - The `raw_entry` key is deprecated and will be removed from 3.6.0.
-            #   Please use the `insights.parsers.ls.FileListingParser.raw_entry_of` instead.
-            entry["raw_entry"] = line
-            entry["dir"] = name
+                    # -rw-r--r--. root root system_u:object_r:boot_t:s0      config-3.10.0-267
+                    parser = parse_mode['selinux']
+            # Now parse based on mode; the parser fills in entry in place
+            parser(entry, typ == 'l', links, owner, group, rest)
+
+            # The deprecated "raw_entry" key is not stored here any more; see
+            # insights.parsers.ls.FileListingParser.raw_entry_of for the replacement.
+
             nm = entry["name"]
             ents[nm] = entry
-            if typ not in "bcd":
+            # Regular files and softlinks first (most common), then dirs; the rest are specials
+            if typ in "-l":
                 files.append(nm)
             elif typ == "d":
                 dirs.append(nm)
-            elif typ in "bc":
+            else:
                 specials.append(nm)
 
         super(Directory, self).__init__(
             {
                 "dirs": dirs,
                 "entries": ents,
                 "files": files,
-                "name": name,
+                "name": dirname,
                 "specials": specials,
                 "total": total,
             }
@@ -224,25 +250,25 @@ def parse(lines, root=None):
     for line in lines:
         line = line.strip()
         # Skip empty line and non-exist dir line
-        if not line or ': No such file or directory' in line:
+        if not line or ': No such file or directory' in line or 'cannot open directory' in line:
             continue
-        if line and line[0] == "/" and line[-1] == ":":
-            if name is None:
-                name = line[:-1]
-                if entries:
-                    d = Directory(name, total or len(entries), entries)
-                    doc[root] = d
-                    total = None
-                    entries = []
-            else:
-                d = Directory(name, total or len(entries), entries)
-                doc[name or root] = d
-                total = None
-                entries = []
-                name = line[:-1]
+        if line[0] == "/" and line[-1] == ":":
+            # Directory name - like '/tmp:'
+            if total is None:
+                total = len(entries)
+            # Some old directory listings don't have an initial name line,
+            # so we put any entries we collected before a named directory in
+            # our 'root' directory - if we got a 'root' directory at all...
+            old_name = root if name is None else name
+            if old_name is not None:
+                doc[old_name] = Directory(old_name, total, entries)
+            name = line[:-1]
+            total = None
+            entries = []
             continue
         if line.startswith("total"):
-            total = int(line.split(None, 1)[1])
+            # Should be first line after directory name
+            total = int(line[6:])
             continue
         entries.append(line)
     name = name or root