1+ from typing import Any , Dict , Set
2+
13from lxml import etree
24
35from strelka import strelka
6+ from strelka .auxiliary .iocs import extract_iocs_from_string
47
58
class ScanXml(strelka.Scanner):
    """
    Collects metadata and extracts embedded files from XML files.

    This scanner parses XML files to collect metadata and extract embedded
    files based on specified tags. It is used in forensic and malware analysis
    to extract and analyze structured data within XML documents.

    Scanner Type: Collection

    Attributes:
        None

    Options:
        extract_tags (list[str]): Element tag names (matched
            case-insensitively, namespace stripped) whose text content is
            emitted as child files.
        metadata_tags (list[str]): Names matched case-insensitively against
            XML attribute names; a match logs the owning element's attributes
            under `tag_data`.

    ## Detection Use Cases
    !!! info "Detection Use Cases"
        - **Embedded File Extraction**:
            - Extracts files embedded within specific XML tags.
        - **Metadata Extraction**:
            - Collects metadata from specific XML attributes.

    ## Known Limitations
    !!! warning "Known Limitations"
        - Complex or malformed XML structures might lead to incomplete parsing or errors.
        - Excessive files may be scanned / collected if XML mimetypes are set in the `backend.yml`

    ## To Do
    !!! question "To Do"
        - Improve error handling for malformed XML structures.
        - Better extraction of tags / metadata tags

    ## References
    !!! quote "References"
        - XML File Format Specification (https://www.w3.org/XML/)

    ## Contributors
    !!! example "Contributors"
        - [Josh Liburdi](https://github.com/jshlbrd)
        - [Paul Hutelmyer](https://github.com/phutelmyer)
    """

    def scan(
        self, data: bytes, file: strelka.File, options: dict, expire_at: int
    ) -> None:
        """
        Parses XML data to extract metadata and files.

        Args:
            data: XML data as bytes.
            file: File object containing metadata about the scan.
            options: Dictionary of scanner options.
            expire_at: Time when the scan should be considered expired.

        Populates `self.event` with the distinct tags and namespaces seen,
        the document type and XML version, any logged attribute metadata and
        the content that was emitted as child files. A parse failure is
        recorded as a `syntax_error: ...` flag instead of being raised.
        """
        # Lower-case the configured names once so matching is case-insensitive;
        # sets give constant-time membership tests during the tree walk.
        xml_options = {
            "extract_tags": {tag.lower() for tag in options.get("extract_tags", [])},
            "metadata_tags": {
                tag.lower() for tag in options.get("metadata_tags", [])
            },
        }

        # Initialize scan event data; sets de-duplicate while walking the tree
        # and are converted to lists below.
        self.event.setdefault("tags", set())
        self.event.setdefault("tag_data", [])
        self.event.setdefault("namespaces", set())
        self.event["total"] = {"tags": 0, "extracted": 0}
        # Content already emitted as a child file, used to prevent duplicates.
        self.emitted_files: Set[str] = set()

        try:
            xml_buffer = data
            # Normalize an upper-case XML declaration so the parser accepts it.
            if xml_buffer.startswith(b"<?XML"):
                xml_buffer = b"<?xml" + xml_buffer[5:]

            # NOTE(review): this uses lxml's default parser on untrusted input;
            # consider an explicit XMLParser(resolve_entities=False) if entity
            # expansion is a concern for the deployed lxml version.
            xml = etree.fromstring(xml_buffer)
            docinfo = xml.getroottree().docinfo
            self.event["doc_type"] = docinfo.doctype or ""
            self.event["version"] = docinfo.xml_version or ""

            # Recursively process each node in the XML
            self._recurse_node(xml, xml_options)

        except etree.XMLSyntaxError as e:
            self.flags.append(f"syntax_error: {str(e)}")

        # Finalize the event data for reporting. Sorting makes the output
        # order reproducible instead of depending on set iteration order.
        self.event["tags"] = sorted(self.event["tags"])
        self.event["total"]["tags"] = len(self.event["tags"])
        self.event["namespaces"] = sorted(self.event["namespaces"])
        self.event["emitted_content"] = sorted(self.emitted_files)

        # Extract and add Indicators of Compromise (IOCs). XML is not always
        # UTF-8 (e.g. UTF-16 documents), so undecodable bytes are replaced
        # rather than letting UnicodeDecodeError abort the scan.
        self.add_iocs(
            extract_iocs_from_string(data.decode("utf-8", errors="replace"))
        )

    def _recurse_node(self, node: etree._Element, xml_options: Dict[str, Any]) -> None:
        """
        Recursively processes each XML node to extract data and metadata.

        Args:
            node: The current XML node to process.
            xml_options: Options for data extraction and metadata logging;
                expects lower-cased `extract_tags` and `metadata_tags`
                collections.

        Records the node's tag and namespace, emits the node's text as a child
        file when its tag is listed in `extract_tags`, hands the node's
        attributes to `_process_attributes`, then visits every child node.
        """
        if node is None:
            return

        # Comments and processing instructions do not carry a string tag, so
        # only real elements are inspected.
        if isinstance(node.tag, str):
            # lxml spells namespaced tags as "{namespace}local"; tags without a
            # namespace are just "local" and must not be mistaken for one.
            if node.tag.startswith("{"):
                namespace, _, tag = node.tag[1:].partition("}")
            else:
                namespace, tag = "", node.tag
            tag = tag.lower()

            if tag:
                self.event["tags"].add(tag)
            if namespace:
                self.event["namespaces"].add(namespace)

            # Handle specific content extraction and emission
            if tag in xml_options["extract_tags"]:
                content = node.text.strip() if node.text else ""
                # Skip content that was already emitted during this scan.
                if content and content not in self.emitted_files:
                    self.emit_file(content, name=tag)
                    self.emitted_files.add(content)
                    self.event["total"]["extracted"] += 1

            # Always process attributes to capture any relevant metadata
            self._process_attributes(node, xml_options, tag)

        # Continue to recurse through child nodes; iterating an element yields
        # its direct children (getchildren() is deprecated in lxml).
        for child in node:
            self._recurse_node(child, xml_options)

    def _process_attributes(
        self, node: etree._Element, xml_options: Dict[str, Any], tag: str
    ) -> None:
        """
        Logs a node's attributes when an attribute name is a metadata tag.

        Args:
            node: XML node whose attributes are being processed.
            xml_options: Configuration options for the scan.
            tag: The tag of the current XML node being processed (currently
                unused; kept for interface compatibility).

        For every attribute whose lower-cased name appears in
        `xml_options["metadata_tags"]`, appends an entry to
        `self.event["tag_data"]` holding the attribute name and the string
        form of the node's full attribute mapping.
        """
        metadata_tags = xml_options["metadata_tags"]
        for attr_name in node.attrib:
            if attr_name.lower() in metadata_tags:
                # NOTE(review): "content" is the whole attribute mapping, not
                # just this attribute's value -- confirm that is intended
                # before changing it, since consumers may rely on this shape.
                self.event["tag_data"].append(
                    {"tag": attr_name, "content": str(node.attrib)}
                )
0 commit comments