|
| 1 | +from typing import Any, Dict, Set |
| 2 | + |
1 | 3 | from lxml import etree |
2 | 4 |
|
3 | 5 | from strelka import strelka |
| 6 | +from strelka.auxiliary.iocs import extract_iocs_from_string |
4 | 7 |
|
5 | 8 |
|
class ScanXml(strelka.Scanner):
    """
    Collects metadata and extracts embedded files from XML files.

    This scanner parses XML files to collect metadata and extract embedded files based on specified tags.
    It is used in forensic and malware analysis to extract and analyze structured data within XML documents.

    Scanner Type: Collection

    Attributes:
        None

    Options:
        extract_tags (list[str]): Tag names (matched case-insensitively) whose
            stripped text content is emitted as a child file.
        metadata_tags (list[str]): Attribute names (matched case-insensitively);
            when a node carries such an attribute, the node's attributes are
            logged under `tag_data`.

    ## Detection Use Cases
    !!! info "Detection Use Cases"
        - **Embedded File Extraction**:
            - Extracts files embedded within specific XML tags.
        - **Metadata Extraction**:
            - Collects metadata from specific XML tags.

    ## Known Limitations
    !!! warning "Known Limitations"
        - Complex or malformed XML structures might lead to incomplete parsing or errors.
        - Excessive files may be scanned / collected if XML mimetypes are set in the `backend.yml`

    ## To Do
    !!! question "To Do"
        - Improve error handling for malformed XML structures.
        - Better extraction of tags / metadata tags

    ## References
    !!! quote "References"
        - XML File Format Specification (https://www.w3.org/XML/)

    ## Contributors
    !!! example "Contributors"
        - [Josh Liburdi](https://github.com/jshlbrd)
        - [Paul Hutelmyer](https://github.com/phutelmyer)
    """

    def scan(
        self, data: bytes, file: strelka.File, options: dict, expire_at: int
    ) -> None:
        """
        Parses XML data to extract metadata and files.

        Populates `self.event` with `tags`, `namespaces`, `tag_data`,
        `doc_type`, `version`, `total` and `emitted_content`, emits the text of
        configured tags as child files, and passes the decoded document to
        `extract_iocs_from_string` for IOC collection.

        Args:
            data: XML data as bytes.
            file: File object containing metadata about the scan.
            options: Dictionary of scanner options (`extract_tags`,
                `metadata_tags`).
            expire_at: Time when the scan should be considered expired.
        """
        # Lower-case the configured names once so matching is case-insensitive.
        # `or []` tolerates an option that is present but explicitly null.
        xml_options = {
            "extract_tags": {
                tag.lower() for tag in options.get("extract_tags") or []
            },
            "metadata_tags": {
                tag.lower() for tag in options.get("metadata_tags") or []
            },
        }

        # Accumulate tags / namespaces in sets while walking the tree. Any
        # value already present in the event is coerced to a set so `.add()`
        # is always valid.
        self.event["tags"] = set(self.event.get("tags", ()))
        self.event.setdefault("tag_data", [])
        self.event["namespaces"] = set(self.event.get("namespaces", ()))
        self.event["total"] = {"tags": 0, "extracted": 0}
        # Distinct text payloads that were emitted as child files; reported as
        # `emitted_content`. This does not suppress repeated emission.
        self.emitted_files: Set[str] = set()

        try:
            xml_buffer = data
            # Normalise an upper-case XML declaration prefix before parsing.
            if xml_buffer.startswith(b"<?XML"):
                xml_buffer = b"<?xml" + xml_buffer[5:]
            # NOTE(review): parsed with lxml's default parser settings; confirm
            # the entity-resolution defaults are acceptable for untrusted input.
            xml = etree.fromstring(xml_buffer)
            docinfo = xml.getroottree().docinfo
            self.event["doc_type"] = docinfo.doctype or ""
            self.event["version"] = docinfo.xml_version or ""

            # Walk the whole tree starting at the root element.
            self._recurse_node(xml, xml_options)

        except etree.XMLSyntaxError as e:
            self.flags.append(f"syntax_error: {e}")

        # Convert the working sets into sorted lists so the reported event has
        # a deterministic order.
        self.event["tags"] = sorted(self.event["tags"])
        self.event["tag_data"] = list(self.event["tag_data"])
        self.event["total"]["tags"] = len(self.event["tags"])
        self.event["namespaces"] = sorted(self.event["namespaces"])
        self.event["emitted_content"] = sorted(self.emitted_files)

        # Decode leniently: the document may not be valid UTF-8, and a strict
        # decode here would raise outside the try block above.
        self.add_iocs(
            extract_iocs_from_string(data.decode("utf-8", errors="replace"))
        )

    def _recurse_node(self, node: etree._Element, xml_options: Dict[str, Any]) -> None:
        """
        Recursively processes each XML node to extract data and metadata.

        Records the node's lower-cased tag and its namespace, emits the node's
        stripped text as a child file when the tag is listed in
        `extract_tags`, processes the node's attributes, and then recurses
        into every child node.

        Args:
            node: The current XML node to process.
            xml_options: Options for data extraction and metadata logging.
        """
        if node is None:
            return

        # Only nodes whose tag is indexable are treated as elements;
        # presumably this filters comments / processing instructions — confirm.
        if hasattr(node.tag, "__getitem__"):
            raw_tag = node.tag
            # Namespaced tags look like "{namespace}local"; plain tags have
            # no braces and must keep their full name as the tag.
            if raw_tag.startswith("{"):
                namespace, _, tag = raw_tag[1:].partition("}")
            else:
                namespace, tag = "", raw_tag
            tag = tag.lower()

            if tag:
                self.event["tags"].add(tag)
            if namespace:
                self.event["namespaces"].add(namespace)

            # Emit the node's text as a child file when its tag is configured.
            if tag in xml_options["extract_tags"]:
                content = node.text.strip() if node.text else ""
                if content:
                    self.emit_file(content, name=tag)
                    self.emitted_files.add(content)
                    self.event["total"]["extracted"] += 1

            # Attributes are inspected on every element node.
            self._process_attributes(node, xml_options, tag)

        # Iterating an element yields its direct children.
        for child in node:
            self._recurse_node(child, xml_options)

    def _process_attributes(
        self, node: etree._Element, xml_options: Dict[str, Any], tag: str
    ) -> None:
        """
        Logs a node's attributes when one of them is a configured metadata name.

        For every attribute whose lower-cased name appears in
        `xml_options["metadata_tags"]`, appends an entry to
        `self.event["tag_data"]` holding the attribute name and the string
        form of the node's full attribute mapping.

        Args:
            node: XML node whose attributes are being processed.
            xml_options: Configuration options for the scan.
            tag: The tag of the current XML node (currently unused; kept for
                interface compatibility).
        """
        for attr_name in node.attrib.keys():
            if attr_name.lower() in xml_options["metadata_tags"]:
                self.event["tag_data"].append(
                    {"tag": attr_name, "content": str(node.attrib)}
                )
0 commit comments