diff --git a/src/sysdiagnose/analysers/plist.py b/src/sysdiagnose/analysers/plist.py index e67a3343..bb2fcbe9 100644 --- a/src/sysdiagnose/analysers/plist.py +++ b/src/sysdiagnose/analysers/plist.py @@ -1,5 +1,7 @@ from datetime import datetime +import glob import json +import os from typing import Generator from sysdiagnose.utils.base import BaseAnalyserInterface, SysdiagnoseConfig, logger, Event @@ -35,12 +37,53 @@ def execute(self) -> Generator[dict, None, None]: if func.startswith(f'_{self.__class__.__name__}__extract_plist'): yield from getattr(self, func)() + def _build_profile_hash_map(self) -> dict: + """ + Builds a mapping from profile identifiers to their SHA-256 stub file hashes. + + Scans all profile-*.stub.json files in the parser output folder, extracts the + PayloadIdentifier and PayloadUUID from each, and maps them to the SHA-256 hash + embedded in the stub filename. + + Returns a dict with two key formats per profile: + - PayloadIdentifier -> hash (e.g. 'com.apple.basebandlogging' -> '4c14f9f4...') + - PayloadIdentifier-PayloadUUID -> hash (e.g. 'com.apple.basebandlogging-39C9D78D-...' -> '4c14f9f4...') + """ + hash_map = {} + stub_pattern = os.path.join(self.parser.output_folder, 'logs_MCState_Shared_profile-*.stub.json') + + for stub_path in glob.glob(stub_pattern): + filename = os.path.basename(stub_path) + # Extract hash from filename: logs_MCState_Shared_profile-{hash}.stub.json + prefix = 'logs_MCState_Shared_profile-' + suffix = '.stub.json' + if not filename.startswith(prefix) or not filename.endswith(suffix): + continue + stub_hash = filename[len(prefix):-len(suffix)] + + try: + with open(stub_path, 'r') as f: + stub_data = json.loads(f.read()) + + payload_id = stub_data.get('PayloadIdentifier') + payload_uuid = stub_data.get('PayloadUUID') + + if payload_id: + hash_map[payload_id] = stub_hash + if payload_id and payload_uuid: + hash_map[f'{payload_id}-{payload_uuid}'] = stub_hash + except Exception as e: + logger.warning(f'Could not parse profile stub {filename}: {e}') + + return hash_map + def __extract_plist_mdm_data(self) -> Generator[dict, None, None]: """ Extracts MDM profile information from a dedicated plist JSON file. This method specifically targets the 'logs_MCState_Shared_MDM.plist.json' file, parsing each entry to - extract key attributes related to device profiles. + extract key attributes related to device profiles including server URLs, capabilities, + and authentication details. Each extracted entry is yielded as a dictionary, along with the original source filename for traceability. @@ -49,22 +92,263 @@ def __extract_plist_mdm_data(self) -> Generator[dict, None, None]: entity_type: str = 'logs_MCState_Shared_MDM.plist.json' file_path: str = f'{self.parser.output_folder}/{entity_type}' + # Fields to extract into the data dict for forensic enrichment + mdm_fields = [ + 'AccessRights', + 'ManagingProfileIdentifier', + 'ServerURL', + 'CheckInURL', + 'CheckOutWhenRemoved', + 'Topic', + 'IdentityCertificateUUID', + 'SignMessage', + 'UDID', + 'ServerCapabilities', + 'UseDevelopmentAPNS', + ] + + profile_hash_map = self._build_profile_hash_map() + try: with open(file_path, 'r') as f: for line in f: entry = json.loads(line) + data = {'source': entity_type} + for field in mdm_fields: + if field in entry: + data[field] = entry[field] + + # Look up the profile stub SHA-256 hash + managing_id = entry.get('ManagingProfileIdentifier') + if managing_id and managing_id in profile_hash_map: + data['ProfileStubHash'] = profile_hash_map[managing_id] + mdm_entry = Event( datetime=datetime.fromisoformat(entry.get('LastPollingAttempt')), - message= f"MDM Profile: {entry.get('ManagingProfileIdentifier')} with access rights {entry.get('AccessRights')}", + message=f"MDM Profile: {managing_id} with access rights {entry.get('AccessRights')}", timestamp_desc='Last Polling Attempt', module=self.module_name, - data={'source': entity_type} + data=data ) yield mdm_entry.to_dict() - except FileNotFoundError as e: - logger.warning(f'{entity_type} not found for {self.case_id}. {e}') + except FileNotFoundError: + logger.debug(f'{entity_type} not found for {self.case_id} (device not MDM-enrolled)') + except Exception as e: + logger.exception(f'ERROR while extracting {entity_type} file. {e}') + + def __extract_plist_profile_events(self) -> Generator[dict, None, None]: + """ + Extracts profile install/remove events from MCProfileEvents.plist. + + This file records a timeline of configuration profile operations (install, remove) + along with the process that performed the action. Each profile event is yielded + as a separate Event with a proper timestamp. + + :yield: A dictionary containing profile event details. + """ + entity_type: str = 'logs_MCState_Shared_MCProfileEvents.plist.json' + file_path: str = f'{self.parser.output_folder}/{entity_type}' + + profile_hash_map = self._build_profile_hash_map() + + try: + with open(file_path, 'r') as f: + data = json.loads(f.read()) + + profile_events = data.get('ProfileEvents', []) + for event_dict in profile_events: + for profile_identifier, event_info in event_dict.items(): + try: + timestamp = datetime.fromisoformat(event_info.get('Timestamp')) + except (ValueError, TypeError): + logger.warning(f'Invalid timestamp for profile event {profile_identifier}') + continue + + operation = event_info.get('Operation', 'unknown') + process = event_info.get('Process', 'unknown') + + event_data = { + 'source': entity_type, + 'ProfileIdentifier': profile_identifier, + 'Operation': operation, + 'Process': process, + } + + # Look up the profile stub SHA-256 hash (try composite key first, then identifier only) + if profile_identifier in profile_hash_map: + event_data['ProfileStubHash'] = profile_hash_map[profile_identifier] + else: + # Try matching on just the PayloadIdentifier (strip the UUID suffix) + base_id = profile_identifier.rsplit('-', 5)[0] if '-' in profile_identifier else profile_identifier + if base_id in profile_hash_map: + event_data['ProfileStubHash'] = profile_hash_map[base_id] + + profile_event = Event( + datetime=timestamp, + message=f"Profile {operation}: {profile_identifier} by {process}", + timestamp_desc=f'Profile {operation}', + module=self.module_name, + data=event_data + ) + + yield profile_event.to_dict() + + except FileNotFoundError: + logger.debug(f'{entity_type} not found for {self.case_id}') + except Exception as e: + logger.exception(f'ERROR while extracting {entity_type} file. {e}') + + def __extract_plist_vpn_profiles(self) -> Generator[dict, None, None]: + """ + Extracts VPN and network extension profile configurations from the + NetworkExtension plist. + + Parses each NEConfiguration entry from com.apple.networkextension.plist, + extracting VPN tunnel details such as provider bundle identifiers, server + addresses, on-demand rules, and tunnel settings. Non-VPN network extension + entries (content filters, DNS settings) are also captured. + + Uses the sysdiagnose creation timestamp since VPN configs are a point-in-time + snapshot with no individual timestamps. + + :yield: A dictionary containing VPN profile details. + """ + entity_type: str = 'logs_Networking_com.apple.networkextension.plist.json' + file_path: str = f'{self.parser.output_folder}/{entity_type}' + + # Map numeric tunnel types to human-readable names + tunnel_type_map = { + 1: 'PacketTunnel', + 2: 'AppProxy', + 3: 'IPSec', + 4: 'IKEv2', + } + + # Map numeric on-demand action values to names + on_demand_action_map = { + 1: 'Connect', + 2: 'Disconnect', + 3: 'EvaluateConnection', + 4: 'Ignore', + } + + # Map numeric interface type match values + interface_type_map = { + 0: 'Any', + 1: 'Ethernet', + 2: 'WiFi', + 3: 'Cellular', + } + + snapshot_time = self.sysdiagnose_creation_datetime + + try: + with open(file_path, 'r') as f: + data = json.loads(f.read()) + + for config_uuid, config_entry in data.items(): + if not isinstance(config_entry, dict): + continue + + # Skip metadata keys (Version, Generation, Index, etc.) + if 'VPN' not in config_entry and 'ContentFilter' not in config_entry and 'DNSSettings' not in config_entry: + continue + + name = config_entry.get('Name', '') + application = config_entry.get('Application', '') + application_name = config_entry.get('ApplicationName', '') + + vpn_config = config_entry.get('VPN') + content_filter = config_entry.get('ContentFilter') + dns_settings = config_entry.get('DNSSettings') + + # Determine the extension type + if isinstance(vpn_config, dict) and vpn_config: + extension_type = 'VPN' + elif isinstance(content_filter, dict) and content_filter: + extension_type = 'ContentFilter' + elif isinstance(dns_settings, dict) and dns_settings: + extension_type = 'DNSSettings' + else: + extension_type = 'Other' + + event_data = { + 'source': entity_type, + 'ConfigUUID': config_uuid, + 'Name': name, + 'Application': application, + 'ApplicationName': application_name, + 'ExtensionType': extension_type, + 'Grade': config_entry.get('Grade'), + 'AlwaysOnVPN': config_entry.get('AlwaysOnVPN'), + } + + # Extract VPN-specific fields + if extension_type == 'VPN' and isinstance(vpn_config, dict): + raw_tunnel_type = vpn_config.get('TunnelType') + event_data['TunnelType'] = tunnel_type_map.get(raw_tunnel_type, raw_tunnel_type) + event_data['Enabled'] = vpn_config.get('Enabled', False) + event_data['OnDemandEnabled'] = vpn_config.get('OnDemandEnabled', False) + event_data['DisconnectOnDemandEnabled'] = vpn_config.get('DisconnectOnDemandEnabled', False) + + # Resolve on-demand rules with human-readable values + raw_rules = vpn_config.get('OnDemandRules', []) + if isinstance(raw_rules, list) and raw_rules: + resolved_rules = [] + for rule in raw_rules: + if isinstance(rule, dict): + resolved_rule = {} + raw_action = rule.get('Action') + resolved_rule['Action'] = on_demand_action_map.get(raw_action, raw_action) + raw_iface = rule.get('InterfaceTypeMatch') + resolved_rule['InterfaceTypeMatch'] = interface_type_map.get(raw_iface, raw_iface) + # Include non-empty match criteria + for match_key in ('SSIDMatch', 'DNSSearchDomainMatch', 'DNSServerAddressMatch', 'ProbeURL'): + val = rule.get(match_key) + if val: + resolved_rule[match_key] = val + resolved_rules.append(resolved_rule) + event_data['OnDemandRules'] = resolved_rules + + # Extract protocol details + protocol = vpn_config.get('Protocol') + if isinstance(protocol, dict): + event_data['ServerAddress'] = protocol.get('ServerAddress', '') + event_data['NEProviderBundleIdentifier'] = protocol.get('NEProviderBundleIdentifier', '') + event_data['IncludeAllNetworks'] = protocol.get('IncludeAllNetworks', False) + event_data['ExcludeLocalNetworks'] = protocol.get('ExcludeLocalNetworks', False) + event_data['EnforceRoutes'] = protocol.get('EnforceRoutes', False) + event_data['DisconnectOnSleep'] = protocol.get('DisconnectOnSleep', False) + + vendor_config = protocol.get('VendorConfiguration') + if isinstance(vendor_config, dict) and vendor_config: + event_data['VendorConfiguration'] = vendor_config + + # Build message + status_parts = [] + if event_data.get('Enabled'): + status_parts.append('enabled') + if event_data.get('OnDemandEnabled'): + status_parts.append('on-demand') + status_str = f" ({', '.join(status_parts)})" if status_parts else '' + + display_name = application_name or name or application or config_uuid + message = f"NetworkExtension {extension_type}: {display_name}{status_str}" + + vpn_event = Event( + datetime=snapshot_time, + message=message, + timestamp_desc='Sysdiagnose Creation', + module=self.module_name, + data=event_data + ) + + yield vpn_event.to_dict() + + except FileNotFoundError: + logger.debug(f'{entity_type} not found for {self.case_id}') except Exception as e: logger.exception(f'ERROR while extracting {entity_type} file. {e}') diff --git a/src/sysdiagnose/analysers/ps_everywhere.py b/src/sysdiagnose/analysers/ps_everywhere.py index edaca35b..73fce8ff 100644 --- a/src/sysdiagnose/analysers/ps_everywhere.py +++ b/src/sysdiagnose/analysers/ps_everywhere.py @@ -1,7 +1,7 @@ #! /usr/bin/env python3 -from datetime import datetime -from typing import Generator, Set, Optional +from datetime import datetime, timedelta +from typing import Generator, Set, Optional, Dict from sysdiagnose.parsers import ps from sysdiagnose.utils.base import BaseAnalyserInterface, SysdiagnoseConfig, logger, Event from sysdiagnose.parsers.ps import PsParser @@ -14,6 +14,7 @@ from sysdiagnose.parsers.uuid2path import UUID2PathParser from sysdiagnose.parsers.taskinfo import TaskinfoParser from sysdiagnose.parsers.remotectl_dumpstate import RemotectlDumpstateParser +from sysdiagnose.parsers.crashlogs import CrashLogsParser class PsEverywhereAnalyser(BaseAnalyserInterface): @@ -22,6 +23,9 @@ class PsEverywhereAnalyser(BaseAnalyserInterface): to build a comprehensive list of running processes across different system logs. The timestamp is 'a' time the process was seen in the log, without being specifically the first or last seen. + + Deduplication strategy: Processes are deduplicated within a 1-hour window. If the same process + appears more than 1 hour apart, both occurrences are kept to track temporal patterns. """ description = "List all processes we can find a bit everywhere." @@ -30,6 +34,10 @@ class PsEverywhereAnalyser(BaseAnalyserInterface): def __init__(self, config: SysdiagnoseConfig, case_id: str): super().__init__(__file__, config, case_id) self.all_ps: Set[str] = set() + # Track last seen timestamp for each process (for time-based deduplication) + self.process_last_seen: Dict[str, datetime] = {} + # PID to process name mapping for parent name resolution + self.pid_to_name: Dict[int, str] = {} @staticmethod def _strip_flags(process: str) -> str: @@ -42,6 +50,93 @@ def _strip_flags(process: str) -> str: process, *_ = process.partition(' ') return process + @staticmethod + def _sanitize_uid(uid: Optional[int]) -> Optional[int]: + """ + Sanitizes UID values by filtering out invalid/placeholder values. + + :param uid: The UID value to sanitize + :return: The UID if valid, None if invalid/placeholder + """ + # 0xAAAAAAAA (2863311530) is a common placeholder/uninitialized value + # 0xFFFFFFFF (4294967295) is -1 as unsigned, also invalid + if uid in (2863311530, 4294967295): + return None + return uid + + def _resolve_ppname(self, ppid: Optional[int]) -> Optional[str]: + """ + Resolves parent process ID to parent process name using the PID mapping. + + :param ppid: Parent process ID + :return: Parent process name if found, None otherwise + """ + if ppid is None: + return None + return self.pid_to_name.get(ppid) + + def _build_pid_mapping(self): + """ + Builds a PID to process name mapping from available parsers. + This mapping is used to resolve parent process names from PPIDs. + """ + # Build from ps.txt + try: + for p in PsParser(self.config, self.case_id).get_result(): + pid = p['data'].get('pid') + command = p['data'].get('command') + if pid and command: + self.pid_to_name[pid] = self._strip_flags(command) + except Exception as e: + logger.debug(f"Could not build PID mapping from ps.txt: {e}") + + # Build from psthread.txt + try: + for p in PsThreadParser(self.config, self.case_id).get_result(): + pid = p['data'].get('pid') + command = p['data'].get('command') + if pid and command: + self.pid_to_name[pid] = self._strip_flags(command) + except Exception as e: + logger.debug(f"Could not build PID mapping from psthread.txt: {e}") + + # Build from spindump + try: + for event in SpindumpNoSymbolsParser(self.config, self.case_id).get_result(): + p = event['data'] + if 'process' in p: + pid = p.get('pid') + process_name = p.get('path', p['process']) + if pid and process_name: + self.pid_to_name[pid] = self._strip_flags(process_name) + except Exception as e: + logger.debug(f"Could not build PID mapping from spindump: {e}") + + # Build from taskinfo + try: + for p in TaskinfoParser(self.config, self.case_id).get_result(): + if 'name' in p['data']: + pid = p['data'].get('pid') + name = p['data'].get('name') + if pid and name: + self.pid_to_name[pid] = self._strip_flags(name) + except Exception as e: + logger.debug(f"Could not build PID mapping from taskinfo: {e}") + + # Build from crashlogs (JetsamEvent process lists) + try: + for event in CrashLogsParser(self.config, self.case_id).get_result(): + report = event.get('data', {}).get('report', {}) + for proc in report.get('processes', []): + pid = proc.get('pid') + name = proc.get('name') + if pid and name: + self.pid_to_name[pid] = name + except Exception as e: + logger.debug(f"Could not build PID mapping from crashlogs: {e}") + + logger.info(f"Built PID mapping with {len(self.pid_to_name)} entries") + @staticmethod def message_extract_binary(process: str, message: str) -> Optional[str | list[str]]: """ @@ -163,6 +258,9 @@ def execute(self) -> Generator[dict, None, None]: :yield: A dictionary containing process details from various sources. """ + # Build PID to name mapping first for parent name resolution + self._build_pid_mapping() + for func in dir(self): if func.startswith(f"_{self.__class__.__name__}__extract_ps_"): yield from getattr(self, func)() # Dynamically call extract methods @@ -176,14 +274,18 @@ def __extract_ps_base_file(self) -> Generator[dict, None, None]: entity_type = 'ps.txt' try: for p in PsParser(self.config, self.case_id).get_result(): + uid = self._sanitize_uid(p['data'].get('uid')) + pid = p['data'].get('pid') + ppid = p['data'].get('ppid') + ppname = self._resolve_ppname(ppid) ps_event = Event( datetime=datetime.fromisoformat(p['datetime']), - message= self._strip_flags(p['data']['command']), + message=self._strip_flags(p['data']['command']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': uid, 'pid': pid, 'ppid': ppid, 'ppname': ppname} ) - if self.add_if_full_command_is_not_in_set(ps_event.message): + if self.add_if_full_command_is_not_in_set(ps_event.message, ps_event.datetime, uid): yield ps_event.to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type} file. {e}") @@ -197,14 +299,18 @@ def __extract_ps_thread_file(self) -> Generator[dict, None, None]: entity_type = 'psthread.txt' try: for p in PsThreadParser(self.config, self.case_id).get_result(): + uid = self._sanitize_uid(p['data'].get('uid')) + pid = p['data'].get('pid') + ppid = p['data'].get('ppid') + ppname = self._resolve_ppname(ppid) ps_event = Event( datetime=datetime.fromisoformat(p['datetime']), message=self._strip_flags(p['data']['command']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': uid, 'pid': pid, 'ppid': ppid, 'ppname': ppname} ) - if self.add_if_full_command_is_not_in_set(ps_event.message): + if self.add_if_full_command_is_not_in_set(ps_event.message, ps_event.datetime, uid): yield ps_event.to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type} file. {e}") @@ -222,26 +328,32 @@ def __extract_ps_spindump_nosymbols_file(self) -> Generator[dict, None, None]: if 'process' not in p: continue process_name = p.get('path', '/kernel' if p['process'] == 'kernel_task [0]' else p['process']) - - if self.add_if_full_command_is_not_in_set(self._strip_flags(process_name)): + event_datetime = datetime.fromisoformat(event['datetime']) + uid = self._sanitize_uid(p.get('uid')) + pid = p.get('pid') + ppid = p.get('ppid') + # Spindump has a direct 'parent' field with the parent process name + ppname = p.get('parent') + + if self.add_if_full_command_is_not_in_set(self._strip_flags(process_name), event_datetime, uid): yield Event( - datetime=datetime.fromisoformat(event['datetime']), + datetime=event_datetime, message=self._strip_flags(process_name), timestamp_desc=event['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': uid, 'pid': pid, 'ppid': ppid, 'ppname': ppname} ).to_dict() for t in p['threads']: try: thread_name = f"{self._strip_flags(process_name)}::{t['thread_name']}" - if self.add_if_full_command_is_not_in_set(thread_name): + if self.add_if_full_command_is_not_in_set(thread_name, event_datetime, uid): yield Event( - datetime=datetime.fromisoformat(event['datetime']), + datetime=event_datetime, message=self._strip_flags(thread_name), timestamp_desc=event['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': uid, 'pid': pid, 'ppid': ppid, 'ppname': ppname} ).to_dict() except KeyError: pass @@ -252,19 +364,24 @@ def __extract_ps_shutdownlogs(self) -> Generator[dict, None, None]: """ Extracts process data from shutdown logs. + Note: Unlike other sources, shutdown logs always keep all entries even if duplicate, + as each entry represents a different shutdown event where the process was blocking. + :return: A generator yielding dictionaries containing process details from shutdown logs. """ entity_type = 'shutdown.logs' try: for p in ShutdownLogsParser(self.config, self.case_id).get_result(): - if self.add_if_full_command_is_not_in_set(self._strip_flags(p['data']['command'])): - yield Event( - datetime=datetime.fromisoformat(p['datetime']), - message=self._strip_flags(p['data']['command']), - timestamp_desc=p['timestamp_desc'], - module=self.module_name, - data={'source': entity_type} - ).to_dict() + # Always yield shutdown log entries, even if duplicate + # Each occurrence represents a different shutdown event + pid = p['data'].get('pid') + yield Event( + datetime=datetime.fromisoformat(p['datetime']), + message=self._strip_flags(p['data']['command']), + timestamp_desc=p['timestamp_desc'], + module=self.module_name, + data={'source': entity_type, 'uid': None, 'pid': pid, 'ppid': None, 'ppname': None} + ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") @@ -277,39 +394,43 @@ def __extract_ps_logarchive(self) -> Generator[dict, None, None]: entity_type = 'log archive' try: for p in LogarchiveParser(self.config, self.case_id).get_result(): + p_datetime = datetime.fromisoformat(p['datetime']) + euid = self._sanitize_uid(p['data'].get('euid')) + pid = p['data'].get('pid') + # First check if we can extract a binary from the message extracted_process = self.message_extract_binary(p['data']['process'], p['message']) if extracted_process: # Handle the case where extracted_process is a list of paths if isinstance(extracted_process, list): for proc_path in extracted_process: - if self.add_if_full_command_is_not_in_set(self._strip_flags(proc_path)): + if self.add_if_full_command_is_not_in_set(self._strip_flags(proc_path), p_datetime, None): yield Event( - datetime.fromisoformat(p['datetime']), + p_datetime, message=self._strip_flags(proc_path), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} ).to_dict() else: # Handle the case where it's a single string - if self.add_if_full_command_is_not_in_set(self._strip_flags(extracted_process)): + if self.add_if_full_command_is_not_in_set(self._strip_flags(extracted_process), p_datetime, None): yield Event( - datetime=datetime.fromisoformat(p['datetime']), + datetime=p_datetime, message=self._strip_flags(extracted_process), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} ).to_dict() # Process the original process name - if self.add_if_full_command_is_not_in_set(self._strip_flags(p['data']['process'])): + if self.add_if_full_command_is_not_in_set(self._strip_flags(p['data']['process']), p_datetime, euid): yield Event( - datetime=datetime.fromisoformat(p['datetime']), + datetime=p_datetime, message=self._strip_flags(p['data']['process']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': euid, 'pid': pid, 'ppid': None, 'ppname': None} ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") @@ -323,14 +444,13 @@ def __extract_ps_uuid2path(self) -> Generator[dict, None, None]: entity_type = 'uuid2path' try: for p in UUID2PathParser(self.config, self.case_id).get_result().values(): - if self.add_if_full_command_is_not_in_set(self._strip_flags(p)): - # FIXME: what timestamp to use here? + if self.add_if_full_command_is_not_in_set(self._strip_flags(p), self.sysdiagnose_creation_datetime, None): yield Event( datetime=self.sysdiagnose_creation_datetime, message=self._strip_flags(p), timestamp_desc="Process path from UUID existing at sysdiagnose creation time", module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") @@ -347,25 +467,27 @@ def __extract_ps_taskinfo(self) -> Generator[dict, None, None]: if 'name' not in p['data']: continue - if self.add_if_full_path_is_not_in_set(self._strip_flags(p['data']['name'])): + p_datetime = datetime.fromisoformat(p['datetime']) + pid = p['data'].get('pid') + if self.add_if_full_path_is_not_in_set(self._strip_flags(p['data']['name']), p_datetime, None): yield Event( - datetime=datetime.fromisoformat(p['datetime']), + datetime=p_datetime, message=self._strip_flags(p['data']['name']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': pid, 'ppid': None, 'ppname': None} ).to_dict() for t in p['data']['threads']: try: thread_name = f"{self._strip_flags(p['data']['name'])}::{t['thread name']}" - if self.add_if_full_path_is_not_in_set(thread_name): + if self.add_if_full_path_is_not_in_set(thread_name, p_datetime, None): yield Event( - datetime.fromisoformat(p['datetime']), + p_datetime, message=thread_name, timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': pid, 'ppid': None, 'ppname': None} ).to_dict() except KeyError: pass @@ -383,14 +505,72 @@ def __extract_ps_remotectl_dumpstate(self) -> Generator[dict, None, None]: remotectl_dumpstate_json = RemotectlDumpstateParser(self.config, self.case_id).get_result() if remotectl_dumpstate_json: for p in remotectl_dumpstate_json['Local device']['Services']: - if self.add_if_full_path_is_not_in_set(self._strip_flags(p)): - # FIXME: what timestamp to use here? + if self.add_if_full_path_is_not_in_set(self._strip_flags(p), self.sysdiagnose_creation_datetime, None): yield Event( datetime=self.sysdiagnose_creation_datetime, message=self._strip_flags(p), timestamp_desc="Existing service at sysdiagnose creation time", module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} + ).to_dict() + except Exception as e: + logger.exception(f"ERROR while extracting {entity_type}. {e}") + + def __extract_ps_crashlogs(self) -> Generator[dict, None, None]: + """ + Extracts process data from crash logs. + + JetsamEvent crash logs contain a full list of running processes at the time of the event. + Other crash types contain the crashing process name and path. + + :return: A generator yielding dictionaries containing process details from crash logs. + """ + entity_type = 'crashlogs' + try: + for event in CrashLogsParser(self.config, self.case_id).get_result(): + report = event.get('data', {}).get('report', {}) + event_datetime = datetime.fromisoformat(event['datetime']) + + # JetsamEvent: extract all processes from the process list + processes = report.get('processes', []) + for proc in processes: + name = proc.get('name') + if not name: + continue + pid = proc.get('pid') + if self.add_if_full_path_is_not_in_set(name, event_datetime, None): + yield Event( + datetime=event_datetime, + message=name, + timestamp_desc='Process seen in JetsamEvent', + module=self.module_name, + data={'source': entity_type, 'uid': None, 'pid': pid, 'ppid': None, 'ppname': None} + ).to_dict() + + # Non-JetsamEvent: extract crashing process path/name with uid/ppid + proc_path = report.get('procPath') + proc_name = report.get('procName') + uid = self._sanitize_uid(report.get('userID')) + crash_pid = report.get('pid') + ppid = report.get('parentPid') + ppname = report.get('parentProc') + if proc_path: + if self.add_if_full_command_is_not_in_set(self._strip_flags(proc_path), event_datetime, uid): + yield Event( + datetime=event_datetime, + message=self._strip_flags(proc_path), + timestamp_desc='Crashing process from crashlog', + module=self.module_name, + data={'source': entity_type, 'uid': uid, 'pid': crash_pid, 'ppid': ppid, 'ppname': ppname} + ).to_dict() + elif proc_name: + if self.add_if_full_path_is_not_in_set(proc_name, event_datetime, uid): + yield Event( + datetime=event_datetime, + message=proc_name, + timestamp_desc='Crashing process from crashlog', + module=self.module_name, + data={'source': entity_type, 'uid': uid, 'pid': crash_pid, 'ppid': ppid, 'ppname': ppname} ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") @@ -404,13 +584,14 @@ def __extract_ps_logdata_statistics(self) -> Generator[dict, None, None]: entity_type = 'logdata.statistics.jsonl' try: for p in LogDataStatisticsParser(self.config, self.case_id).get_result(): - if self.add_if_full_command_is_not_in_set(self._strip_flags(p['data']['process'])): + p_datetime = datetime.fromisoformat(p['datetime']) + if self.add_if_full_command_is_not_in_set(self._strip_flags(p['data']['process']), p_datetime, None): yield Event( - datetime=datetime.fromisoformat(p['datetime']), + datetime=p_datetime, message=self._strip_flags(p['data']['process']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") @@ -425,44 +606,87 @@ def __extract_ps_logdata_statistics_txt(self) -> Generator[dict, None, None]: try: for p in LogDataStatisticsTxtParser(self.config, self.case_id).get_result(): - if self.add_if_full_path_is_not_in_set(self._strip_flags(p['data']['process'])): + p_datetime = datetime.fromisoformat(p['datetime']) + if self.add_if_full_path_is_not_in_set(self._strip_flags(p['data']['process']), p_datetime, None): yield Event( - datetime=datetime.fromisoformat(p['datetime']), + datetime=p_datetime, message=self._strip_flags(p['data']['process']), timestamp_desc=p['timestamp_desc'], module=self.module_name, - data={'source': entity_type} + data={'source': entity_type, 'uid': None, 'pid': None, 'ppid': None, 'ppname': None} ).to_dict() except Exception as e: logger.exception(f"ERROR while extracting {entity_type}. {e}") - def add_if_full_path_is_not_in_set(self, name: str) -> bool: + def add_if_full_path_is_not_in_set(self, name: str, timestamp: Optional[datetime] = None, uid: Optional[int] = None) -> bool: """ - Ensures that a process path is unique before adding it to the shared set. + Ensures that a process path is unique before adding it to the shared set, + with time-based deduplication: only keep duplicates if they occur more than 1 hour apart. + UID is considered part of the uniqueness - same process with different UID is treated as separate. :param name: Process path name - :return: True if the process was not in the set and was added, False otherwise. + :param timestamp: Timestamp of the process occurrence (optional, for time-based deduplication) + :param uid: User ID of the process (optional, considered in uniqueness check) + :return: True if the process was not in the set or last seen > 1 hour ago, False otherwise. """ - for item in self.all_ps: - if item.endswith(name): - return False - if item.split('::')[0].endswith(name): + # Create a unique key that includes both name and UID + unique_key = f"{name}|uid:{uid}" + + # If no timestamp provided, use old behavior (always check for duplicates) + if timestamp is None: + for item in self.all_ps: + if item.endswith(name): + return False + if item.split('::')[0].endswith(name): + return False + if '::' not in item and item.split(' ')[0].endswith(name): + return False # This covers cases with space-separated commands + self.all_ps.add(unique_key) + return True + + # Time-based deduplication: check if we've seen this process+uid combination recently + if unique_key in self.process_last_seen: + time_diff = timestamp - self.process_last_seen[unique_key] + # Only add if more than 1 hour has passed + if time_diff < timedelta(hours=1): return False - if '::' not in item and item.split(' ')[0].endswith(name): - return False # This covers cases with space-separated commands - self.all_ps.add(name) + + # Add or update the process + self.all_ps.add(unique_key) + self.process_last_seen[unique_key] = timestamp return True - def add_if_full_command_is_not_in_set(self, name: str) -> bool: + def add_if_full_command_is_not_in_set(self, name: str, timestamp: Optional[datetime] = None, uid: Optional[int] = None) -> bool: """ - Ensures that a process command is unique before adding it to the shared set. + Ensures that a process command is unique before adding it to the shared set, + with time-based deduplication: only keep duplicates if they occur more than 1 hour apart. + UID is considered part of the uniqueness - same process with different UID is treated as separate. :param name: Process command name - :return: True if the process was not in the set and was added, False otherwise. + :param timestamp: Timestamp of the process occurrence (optional, for time-based deduplication) + :param uid: User ID of the process (optional, considered in uniqueness check) + :return: True if the process was not in the set or last seen > 1 hour ago, False otherwise. """ - for item in self.all_ps: - if item.startswith(name): + # Create a unique key that includes both name and UID + unique_key = f"{name}|uid:{uid}" + + # If no timestamp provided, use old behavior (always check for duplicates) + if timestamp is None: + for item in self.all_ps: + if item.startswith(name): + return False + self.all_ps.add(unique_key) + return True + + # Time-based deduplication: check if we've seen this process+uid combination recently + if unique_key in self.process_last_seen: + time_diff = timestamp - self.process_last_seen[unique_key] + # Only add if more than 1 hour has passed + if time_diff < timedelta(hours=1): return False - self.all_ps.add(name) + + # Add or update the process + self.all_ps.add(unique_key) + self.process_last_seen[unique_key] = timestamp return True diff --git a/src/sysdiagnose/analysers/yarascan.py b/src/sysdiagnose/analysers/yarascan.py index 23f4570d..b64a322b 100644 --- a/src/sysdiagnose/analysers/yarascan.py +++ b/src/sysdiagnose/analysers/yarascan.py @@ -44,7 +44,11 @@ def execute(self): self.output_file, # don't match on ourselves ], ignore_folders=[ - glob.glob(os.path.join(self.case_data_subfolder, 'system_logs.logarchive')).pop(), # irrelevant for YARA rules + p for pattern in [ + 'system_logs.logarchive', + 'collect_system_logs.logarchive', + ] + for p in glob.glob(os.path.join(self.case_data_subfolder, pattern)) ] ) diff --git a/src/sysdiagnose/parsers/crashlogs.py b/src/sysdiagnose/parsers/crashlogs.py index 2afdc50c..b415e530 100644 --- a/src/sysdiagnose/parsers/crashlogs.py +++ b/src/sysdiagnose/parsers/crashlogs.py @@ -40,7 +40,7 @@ def get_log_files(self) -> list: log_files.extend(glob.glob(os.path.join(self.case_data_folder, log_files_glob), recursive=True)) # exclude some files - exclusion_strings = ['WiFiLQMMetrics', 'OTAUpdate'] + exclusion_strings = ['WiFiLQMMetrics', 'OTAUpdate', 'ProxiedDevice'] for exclusion_string in exclusion_strings: log_files = [x for x in log_files if exclusion_string not in x] @@ -104,6 +104,7 @@ def parse_ips_file(path: str) -> Event: timestamp = datetime.strptime(result['timestamp'], '%Y-%m-%d %H:%M:%S.%f %z') result['name'] = CrashLogsParser.metadata_from_filename(path)[0] + result['filename'] = os.path.basename(path) try: message = f"Crashlog: {result['report']['reason']}" except KeyError: diff --git a/src/sysdiagnose/parsers/logarchive.py b/src/sysdiagnose/parsers/logarchive.py index 995cc7c1..8562c45f 100644 --- a/src/sysdiagnose/parsers/logarchive.py +++ b/src/sysdiagnose/parsers/logarchive.py @@ -9,7 +9,7 @@ from datetime import datetime, timezone from sysdiagnose.utils.base import BaseParserInterface, SysdiagnoseConfig, logger, Event import glob -import json +import orjson import os import platform import subprocess @@ -46,8 +46,8 @@ def log_stderr(process, logger): """ Reads the stderr of a subprocess and logs it line by line. """ - for line in iter(process.stderr.readline, ''): - logger.debug(line.strip()) + for line in iter(process.stderr.readline, b''): + logger.debug(line.decode('utf-8', errors='replace').strip()) class LogarchiveParser(BaseParserInterface): @@ -58,8 +58,14 @@ def __init__(self, config: SysdiagnoseConfig, case_id: str): super().__init__(__file__, config, case_id) def get_log_files(self) -> list: - log_folder_glob = '**/system_logs.logarchive/' - return glob.glob(os.path.join(self.case_data_folder, log_folder_glob), recursive=True) + results = [] + for pattern in ['**/system_logs.logarchive/', '**/collect_system_logs.logarchive/']: + matches = glob.glob(os.path.join(self.case_data_folder, pattern), recursive=True) + results.extend(matches) + if matches: + logger.info(f'get_log_files: pattern {pattern} matched {len(matches)} folder(s)') + logger.info(f'get_log_files: total {len(results)} logarchive folder(s) found') + return results @DeprecationWarning def execute(self) -> list | dict: @@ -69,8 +75,8 @@ def execute(self) -> list | dict: with tempfile.TemporaryDirectory() as tmp_outpath: tmp_output_file = os.path.join(tmp_outpath.name, 'logarchive.tmp') LogarchiveParser.parse_all_to_file(self.get_log_files(), tmp_output_file) - with open(tmp_output_file, 'r') as f: - return [json.loads(line) for line in f] + with open(tmp_output_file, 'rb') as f: + return [orjson.loads(line) for line in f] except IndexError: return {'error': 'No system_logs.logarchive/ folder found in logs/ directory'} @@ -85,11 +91,11 @@ def get_result(self, force: bool = False): if self.output_exists(): # load existing output - with open(self.output_file, 'r') as f: + with open(self.output_file, 'rb') as f: for line in f: try: - yield json.loads(line) - except json.decoder.JSONDecodeError: # last lines of the native logarchive.jsonl file + yield orjson.loads(line) + except orjson.JSONDecodeError: # last lines of the native logarchive.jsonl file continue else: # should never happen, as we never keep it in memory @@ -125,7 +131,7 @@ def merge_files(temp_files: list, output_file: str): prev_temp_file = temp_files[0] # first copy over first file to self.output_file shutil.copyfile(prev_temp_file['file'].name, output_file) - with open(output_file, 'a') as f_out: + with open(output_file, 'ab') as f_out: i = 1 while i < len(temp_files): current_temp_file = temp_files[i] @@ -135,16 +141,16 @@ def merge_files(temp_files: list, output_file: str): pass elif current_temp_file['first_timestamp'] > prev_temp_file['last_timestamp']: # copy over the full file - with open(current_temp_file['file'].name, 'r') as f_in: + with open(current_temp_file['file'].name, 'rb') as f_in: for line in f_in: f_out.write(line) prev_temp_file = current_temp_file else: # need to seek to prev_last and copy over new data - with open(current_temp_file['file'].name, 'r') as f_in: - copy_over = False # store if we need to copy over, spares us of json.loads() every line when we know we should be continuing + with open(current_temp_file['file'].name, 'rb') as f_in: + copy_over = False # store if we need to copy over, spares us of orjson.loads() every line when we know we should be continuing for line in f_in: - if not copy_over and json.loads(line)['time'] > prev_temp_file['last_timestamp']: + if not copy_over and orjson.loads(line)['time'] > prev_temp_file['last_timestamp']: copy_over = True if copy_over: f_out.write(line) @@ -152,45 +158,96 @@ def merge_files(temp_files: list, output_file: str): i += 1 def get_first_and_last_entries(output_file: str) -> tuple: + first_entry = None + last_entry = None + with open(output_file, 'rb') as f: - first_entry = json.loads(f.readline().decode()) - # discover last line efficiently - f.seek(-2, os.SEEK_END) # Move the pointer to the second-to-last byte in the file - # Move backwards until a newline character is found, or we hit the start of the file + # Find first entry that has a 'time' field. + # The first line is usually a valid entry, but guard defensively. + for raw_line in f: + try: + entry = orjson.loads(raw_line) + if 'time' in entry: + first_entry = entry + break + except orjson.JSONDecodeError: + continue + + if first_entry is None: + raise ValueError(f'No entry with time field found in {output_file}') + + # Find last entry with a 'time' field. + # The Rust unifiedlog_iterator appends a summary footer like + # {'count': N, 'finished': 1} which has no 'time' key. + # Use efficient backward seek for the last line, then fall back + # to a linear scan if that line turns out to be the footer. + f.seek(-2, os.SEEK_END) while f.tell() > 0: char = f.read(1) if char == b'\n': break - f.seek(-2, os.SEEK_CUR) # Move backwards + f.seek(-2, os.SEEK_CUR) + + try: + entry = orjson.loads(f.readline()) + if 'time' in entry: + last_entry = entry + except orjson.JSONDecodeError: + pass + + if last_entry is None: + # Footer detected — do a full linear scan to find the last valid entry + f.seek(0) + for raw_line in f: + try: + entry = orjson.loads(raw_line) + if 'time' in entry: + last_entry = entry + except orjson.JSONDecodeError: + continue - # Read the last line - last_entry = json.loads(f.readline().decode()) + if last_entry is None: + raise ValueError(f'No entry with time field found in {output_file}') - return (first_entry, last_entry) + return (first_entry, last_entry) def parse_all_to_file(folders: list, output_file: str): - # no caching + logger.info(f'parse_all_to_file called with {len(folders)} folder(s): {folders}') + # simple mode: only one folder if len(folders) == 1: LogarchiveParser.parse_folder_to_file(folders[0], output_file) return # complex mode: multiple folders, need to merge multiple files - # for each of the log folders - # - parse it to a temporary file, keep track of the file reference or name - # - keep track of the first and last timestamp of each file - # - order the files, and if a file contains a subset of another one, skip it. - # this is a though one, as we may have partially overlapping timeframes, so we may need to re-assemble in a smart way. - # - once we know the order, bring the files together to the final single output file - temp_files = [] try: for folder in folders: temp_file = tempfile.NamedTemporaryFile(delete=False) - LogarchiveParser.parse_folder_to_file(folder, temp_file.name) - temp_files.append({ - 'file': temp_file, - }) + try: + LogarchiveParser.parse_folder_to_file(folder, temp_file.name) + except Exception as e: + logger.error(f'parse_folder_to_file failed for {folder}: {e}') + os.remove(temp_file.name) + continue + file_size = os.path.getsize(temp_file.name) + logger.info(f'Parsed {folder} -> {file_size} bytes') + if file_size > 0: + temp_files.append({ + 'file': temp_file, + }) + else: + logger.warning(f'Skipping empty logarchive parse result for: {folder}') + os.remove(temp_file.name) + + if not temp_files: + logger.warning('No logarchive folders produced output, skipping merge') + return + + # If only one file survived, skip merge and copy directly + if len(temp_files) == 1: + shutil.copyfile(temp_files[0]['file'].name, output_file) + return # merge files to the output file LogarchiveParser.merge_files(temp_files, output_file) @@ -202,10 +259,7 @@ def parse_all_to_file(folders: list, output_file: str): def parse_folder_to_file(input_folder: str, output_file: str) -> bool: try: - if (platform.system() == 'Darwin'): - LogarchiveParser.__convert_using_native_logparser(input_folder, output_file) - else: - LogarchiveParser.__convert_using_unifiedlogparser(input_folder, output_file) + LogarchiveParser.__convert_using_unifiedlogparser(input_folder, output_file) return True except IndexError: logger.exception('Error: No system_logs.logarchive/ folder found in logs/ directory') @@ -213,18 +267,22 @@ def parse_folder_to_file(input_folder: str, output_file: str) -> bool: except FileNotFoundError: logger.exception('Error: unifiedlogs command not found, please refer to the README for further instructions') return False + except RuntimeError: + logger.exception(f'unifiedlog_iterator failed for folder: {input_folder}') + return False def __convert_using_native_logparser(input_folder: str, output_file: str) -> list: - with open(output_file, 'w') as f_out: + with open(output_file, 'wb') as f_out: # output to stdout and not to a file as we need to convert the output to a unified format cmd_array = ['/usr/bin/log', 'show', input_folder, '--style', 'ndjson', '--info', '--debug', '--signpost'] # read each line, convert line by line and write the output directly to the new file # this approach limits memory consumption for line in LogarchiveParser.__execute_cmd_and_yield_result(cmd_array): try: - entry_json = LogarchiveParser.convert_entry_to_unifiedlog_format(json.loads(line)) - f_out.write(json.dumps(entry_json) + '\n') - except json.JSONDecodeError as e: + entry_json = LogarchiveParser.convert_entry_to_unifiedlog_format(orjson.loads(line)) + f_out.write(orjson.dumps(entry_json)) + f_out.write(b'\n') + except orjson.JSONDecodeError as e: logger.warning(f"WARNING: error parsing JSON {line} - {e}", exc_info=True) except KeyError: # last line of log does not contain 'time' field, nor the rest of the data. @@ -234,48 +292,91 @@ def __convert_using_native_logparser(input_folder: str, output_file: str) -> lis break def __convert_using_unifiedlogparser(input_folder: str, output_file: str) -> list: - with open(output_file, 'w') as f: - for entry in LogarchiveParser.__convert_using_unifiedlogparser_generator(input_folder): - json.dump(entry, f) - f.write('\n') - - @DeprecationWarning - def __convert_using_unifiedlogparser_save_file(input_folder: str, output_file: str): - logger.warning('WARNING: using Mandiant UnifiedLogReader to parse logs, results will be less reliable than on OS X') - # output to stdout and not to a file as we need to convert the output to a unified format - cmd_array = ['unifiedlog_iterator', '--mode', 'log-archive', '--input', input_folder, '--output', output_file, '--format', 'jsonl'] - # read each line, convert line by line and write the output directly to the new file - # this approach limits memory consumption - result = LogarchiveParser.__execute_cmd_and_get_result(cmd_array) - return result + """ + Let Rust write the file directly - ZERO Python overhead! + Rust binary outputs Event format and writes to file in one pass. + Maximum performance: 320K+ lines/sec + """ + # Let Rust do EVERYTHING: parse, convert, and write + # Python just waits for it to finish + cmd_array = [ + 'unifiedlog_iterator', + '--mode', 'log-archive', + '--input', input_folder, + '--output', output_file, + '--format', 'jsonl', + '--output-format', 'event', + '--threads', '10' + ] + + logger.info(f'Running: {" ".join(cmd_array)}') + + result = subprocess.run(cmd_array, capture_output=True, text=True) + + if result.returncode != 0: + logger.error(f'unifiedlog_iterator failed with code {result.returncode}') + logger.error(f'stderr: {result.stderr}') + raise RuntimeError(f'unifiedlog_iterator failed: {result.stderr}') + + logger.info(f'Successfully wrote {output_file}') + return [] def __convert_using_unifiedlogparser_generator(input_folder: str): - logger.warning('WARNING: using Mandiant UnifiedLogReader to parse logs, results will be less reliable than on OS X') - # output to stdout and not to a file as we need to convert the output to a unified format - cmd_array = ['unifiedlog_iterator', '--mode', 'log-archive', '--input', input_folder, '--format', 'jsonl'] - # read each line, convert line by line and write the output directly to the new file - # this approach limits memory consumption + """ + Generator that streams Event format directly from Rust binary. + Used for cases where we need to process entries one by one. + """ + cmd_array = [ + 'unifiedlog_iterator', + '--mode', 'log-archive', + '--input', input_folder, + '--format', 'jsonl', + '--output-format', 'event', + '--threads', '10' + ] + for line in LogarchiveParser.__execute_cmd_and_yield_result(cmd_array): try: - entry_json = LogarchiveParser.convert_entry_to_unifiedlog_format(json.loads(line)) + entry_json = orjson.loads(line) yield entry_json - except json.JSONDecodeError: - pass - except KeyError: + except orjson.JSONDecodeError: pass - def __execute_cmd_and_yield_result(cmd_array: list) -> Generator[dict, None, None]: + def __execute_cmd_and_yield_result(cmd_array: list) -> Generator[str, None, None]: ''' Return None if it failed or the result otherwise. + Uses buffered reading for better performance. + Chunk size of 1MB provides optimal balance between: + - Reducing system call overhead (fewer read() calls) + - Memory efficiency (not too large) + - Pipe buffer utilization (macOS pipes are typically 64KB-512KB) ''' - with subprocess.Popen(cmd_array, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) as process: + CHUNK_SIZE = 1024 * 1024 # 1MB chunks for optimal throughput + BUFFER_SIZE = 2 * 1024 * 1024 # 2MB subprocess buffer + + with subprocess.Popen(cmd_array, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=False, bufsize=BUFFER_SIZE) as process: # start a thread to log stderr stderr_thread = threading.Thread(target=log_stderr, args=(process, logger), daemon=True) stderr_thread.start() - for line in iter(process.stdout.readline, ''): - yield line + # Use buffered reading for better performance + buffer = b'' + while True: + chunk = process.stdout.read(CHUNK_SIZE) + if not chunk: + break + buffer += chunk + + # Process complete lines + while b'\n' in buffer: + line, buffer = buffer.split(b'\n', 1) + if line: # Skip empty lines + yield line.decode('utf-8') + + # Process any remaining data in buffer + if buffer: + yield buffer.decode('utf-8') def __execute_cmd_and_get_result(cmd_array: list, outputfile=None): ''' @@ -292,7 +393,7 @@ def __execute_cmd_and_get_result(cmd_array: list, outputfile=None): if outputfile is None: for line in iter(process.stdout.readline, ''): try: - result.append(json.loads(line)) + result.append(orjson.loads(line)) except Exception: result.append(line) elif outputfile == sys.stdout: @@ -319,9 +420,11 @@ def convert_entry_to_unifiedlog_format(entry: dict) -> dict: timestamp = LogarchiveParser.convert_unifiedlog_time_to_datetime(entry['time']) entry['datetime'] = timestamp.isoformat(timespec='microseconds') entry['timestamp'] = timestamp.timestamp() + # Extract message before passing entry to data to avoid duplication + message = entry.pop('message', '') event = Event( datetime=timestamp, - message=entry.get('message', ''), + message=message, module=module, timestamp_desc=timestamp_desc, data=entry diff --git a/src/sysdiagnose/parsers/mcstate_shared_profile.py b/src/sysdiagnose/parsers/mcstate_shared_profile.py index df55fcba..5aaa4830 100644 --- a/src/sysdiagnose/parsers/mcstate_shared_profile.py +++ b/src/sysdiagnose/parsers/mcstate_shared_profile.py @@ -35,6 +35,12 @@ def execute(self) -> list | dict: entry = misc.load_plist_file_as_json(log_file) timestamp = datetime.strptime(entry['InstallDate'], '%Y-%m-%dT%H:%M:%S.%f') timestamp = timestamp.replace(tzinfo=timezone.utc) # ensure timezone is UTC + + # Extract the SHA-256 hash from the stub filename (profile-{hash}.stub) + stub_filename = os.path.basename(log_file) + if stub_filename.startswith('profile-') and stub_filename.endswith('.stub'): + entry['ProfileStubHash'] = stub_filename[len('profile-'):-len('.stub')] + event = Event( datetime=timestamp, message='# '.join([entry.get('PayloadDescription', ''), entry.get('PayloadDisplayName', ''), entry.get('PayloadOrganization', '')]), diff --git a/src/sysdiagnose/parsers/plists.py b/src/sysdiagnose/parsers/plists.py index 4aad1726..5c317a13 100644 --- a/src/sysdiagnose/parsers/plists.py +++ b/src/sysdiagnose/parsers/plists.py @@ -16,7 +16,8 @@ def __init__(self, config: SysdiagnoseConfig, case_id: str): def get_log_files(self) -> list: log_files_globs = [ - '**/*.plist' + '**/*.plist', + '**/*.stub', ] log_files = [] for log_files_glob in log_files_globs: diff --git a/src/sysdiagnose/utils/apollo.py b/src/sysdiagnose/utils/apollo.py index ce0c999c..120c5f42 100644 --- a/src/sysdiagnose/utils/apollo.py +++ b/src/sysdiagnose/utils/apollo.py @@ -172,7 +172,7 @@ def parse_db(self, db_fname: str, db_type: Optional[str] = None) -> list: item['apollo_module'] = module_query['name'] event = Event( datetime=timestamp, - message=module_query['activity'] + ': ' + ', '.join([f"{k}={v}" for k, v in list(zip(headers, row)) if k != key_timestamp and 'time' not in k and 'id' not in k]), + message=module_query['activity'] + ': ' + ', '.join([f"{k}={v}" for k, v in list(zip(headers, row)) if k != key_timestamp and 'time' not in k and 'table id' not in k]), module=self.saf_module, timestamp_desc=module_query['activity'], data=item diff --git a/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_activity.txt b/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_activity.txt index 0d4b0763..9d0ecc51 100644 --- a/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_activity.txt +++ b/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_activity.txt @@ -63,7 +63,7 @@ MODULE_NOTES=Telephony Activity [Database Metadata] DATABASE=CurrentPowerlog.PLSQL PLATFORM=IOS -VERSIONS=9,10,11,12,13,14 +VERSIONS=9,10,11,12,13,14,15,16,17,18 [Query Metadata] QUERY_NAME=powerlog_device_telephony_activity @@ -86,4 +86,25 @@ QUERY= SIMSTATUS AS "SIM STATUS", ID AS "PLBBAGENT_EVENTPOINT_TELEPHONYACTIVITY TABLE ID" FROM - PLBBAGENT_EVENTPOINT_TELEPHONYACTIVITY \ No newline at end of file + PLBBAGENT_EVENTPOINT_TELEPHONYACTIVITY + +[SQL Query 15,16,17,18] +QUERY= + SELECT + DATETIME(timestamp, 'UNIXEPOCH') AS TIMESTAMP, + airplaneMode AS "AIRPLANE MODE", + callStatus AS "CALL STATUS", + signalBars AS "SIGNAL BARS", + signalStrength AS "SIGNAL STRENGTH", + campedRat AS "CAMPED RAT", + currentRat AS "CURRENT RAT", + preferredRat AS "PREFERRED RAT", + activeBand AS "ACTIVE BAND", + dataStatus AS "DATA STATUS", + simStatus AS "SIM STATUS", + subsId AS "SUBSCRIPTION ID", + ID AS "BASEBANDMETRICS_TELEPHONYACTIVITY TABLE ID" + FROM + BasebandMetrics_TelephonyActivity_1_2 + ORDER BY + timestamp DESC \ No newline at end of file diff --git a/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_registration.txt b/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_registration.txt index 10bacc2c..c67f9ce8 100644 --- a/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_registration.txt +++ b/src/sysdiagnose/utils/apollo_modules/powerlog_device_telephony_registration.txt @@ -63,7 +63,7 @@ MODULE_NOTES=Records telephony registration details such AS carrier and service. [Database Metadata] DATABASE=CurrentPowerlog.PLSQL PLATFORM=IOS -VERSIONS=9,10,11,12,13,14 +VERSIONS=9,10,11,12,13,14,15,16,17,18 [Query Metadata] QUERY_NAME=powerlog_device_telephony_registration @@ -126,4 +126,25 @@ QUERY= AS TELEPHONYREG_STATE GROUP BY TELEPHONYREG_ID - ) \ No newline at end of file + ) + +[SQL Query 15,16,17,18] +QUERY= + SELECT + DATETIME(timestamp, 'UNIXEPOCH') AS ADJUSTED_TIMESTAMP, + cellId AS "CELL ID", + dataInd AS "SERVICE", + operator AS "OPERATOR", + status AS "STATUS", + dataActive AS "DATA ACTIVE", + dataAttached AS "DATA ATTACHED", + home AS "HOME", + lac AS "LAC", + subsId AS "SUBSCRIPTION ID", + ID AS "BASEBANDMETRICS_TELEPHONYREGISTRATION TABLE ID" + FROM + BasebandMetrics_TelephonyRegistration_1_2 + WHERE + cellId IS NOT NULL AND cellId > 0 + ORDER BY + timestamp DESC \ No newline at end of file diff --git a/tests/test_parsers_logarchive.py b/tests/test_parsers_logarchive.py index 1f045a04..99f7868c 100644 --- a/tests/test_parsers_logarchive.py +++ b/tests/test_parsers_logarchive.py @@ -2,7 +2,7 @@ from tests import SysdiagnoseTestCase import os import unittest -import json +import orjson import tempfile @@ -20,9 +20,9 @@ def test_parse_logarchive(self): self.assertTrue(os.path.isfile(p.output_file)) # we don't test getting result in memory, but check one line in the output. - with open(p.output_file, 'r') as f: + with open(p.output_file, 'rb') as f: line = f.readline() - item = json.loads(line) + item = orjson.loads(line) self.assertTrue('subsystem' in item['data']) self.assert_has_required_fields_jsonl(item) @@ -158,7 +158,7 @@ def test_merge_files(self): 'file': temp_file, }) for entry in file: - temp_file.write(json.dumps(entry).encode()) + temp_file.write(orjson.dumps(entry)) temp_file.write(b'\n') temp_file.close() # merge the files @@ -168,9 +168,9 @@ def test_merge_files(self): # read the output file result = [] - with open(output_file.name, 'r') as f: + with open(output_file.name, 'rb') as f: for line in f: - result.append(json.loads(line)) + result.append(orjson.loads(line)) finally: for temp_file in temp_files: diff --git a/tests/testdata b/tests/testdata index 7e64d5ca..1b669a8d 160000 --- a/tests/testdata +++ b/tests/testdata @@ -1 +1 @@ -Subproject commit 7e64d5ca133837393c0c7cd3193d0d4dcabc5447 +Subproject commit 1b669a8d8ec53af4388b5dcd084030cb1510654d