Skip to content

Commit a88ede1

Browse files
authored
Merge pull request #456 from target/ScanXML_Refactor+AdditionalFunctionality
Porting refactor of ScanXML
2 parents c621fe8 + 39a16c0 commit a88ede1

File tree

6 files changed

+409
-99
lines changed

6 files changed

+409
-99
lines changed

configs/python/backend/backend.yaml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
version: 2024.04.02.01
1+
2+
version: 2024.04.22.01
23
logging_cfg: '/etc/strelka/logging.yaml'
34
limits:
45
max_files: 5000
@@ -686,6 +687,19 @@ scanners:
686687
- 'mso_file'
687688
- 'soap_file'
688689
priority: 5
690+
options:
691+
extract_tags:
692+
- "target"
693+
- "script"
694+
- "embeddedfile"
695+
- "cipherdata"
696+
- "data"
697+
- "signedinfo"
698+
- "encrypteddata"
699+
metadata_tags:
700+
- "type"
701+
- "description"
702+
- "maintainer"
689703
'ScanYara':
690704
- positive:
691705
flavors:
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import ipaddress
import re
from urllib.parse import urlparse

import tldextract
5+
6+
7+
def extract_iocs_from_string(input_string):
    """
    Extracts various types of Indicators of Compromise (IOCs) from a string.

    This function looks for domain names and IP addresses within the given string.

    Args:
        input_string (str): The input string to search for IOCs.

    Returns:
        list: A list with iocs of unique extracted values.
    """
    # Seed the de-duplicating set with domains, then fold in IP addresses.
    collected = set(extract_domains_from_string(input_string))
    collected.update(extract_ip_addresses(input_string))
    return list(collected)
20+
21+
22+
def extract_domains_from_string(input_string):
    """
    Extracts domain names from a string containing URLs.

    Args:
        input_string (str): The input string to search for URLs.

    Returns:
        list: A list of unique domain names extracted from the URLs.
    """
    domains = set()

    # Find candidate URLs for the schemes of interest.
    urls = re.findall(r"(?:https?|ftp|ftps|file|smb)://[^\s/$.?#].[^\s]*", input_string)

    for url in urls:
        # urlparse confirms the candidate has both a scheme and a network location.
        parsed_url = urlparse(url)
        if parsed_url.scheme and parsed_url.netloc:
            # tldextract splits the URL host into subdomain/domain/suffix;
            # strip('.') drops empty components (e.g. no subdomain).
            extracted = tldextract.extract(url)
            domain = (
                f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}".strip(
                    "."
                )
            )
            domains.add(domain)

    return list(domains)
49+
50+
51+
def extract_ip_addresses(input_string):
    """
    Extracts IP addresses from a string.

    Regex candidates are validated with the stdlib ``ipaddress`` module so
    that lookalikes such as ``999.1.1.1`` (which the dotted-quad pattern
    alone would accept) are discarded.

    Args:
        input_string (str): The input string to search for IP addresses.

    Returns:
        list: A list of unique, valid IP addresses extracted from the input string.
    """
    # Regular expressions for matching IPv4 and (full-form) IPv6 addresses.
    # NOTE(review): compressed IPv6 ("::") forms are not matched — unchanged
    # from the original pattern; confirm whether they should be.
    ipv4_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"
    ipv6_pattern = r"\b(?:[A-F0-9]{1,4}:){7}[A-F0-9]{1,4}\b"

    candidates = re.findall(ipv4_pattern, input_string) + re.findall(
        ipv6_pattern, input_string, re.IGNORECASE
    )

    ip_addresses = set()
    for candidate in candidates:
        try:
            ipaddress.ip_address(candidate)
        except ValueError:
            # Regex false positive (e.g. an octet outside 0-255).
            continue
        ip_addresses.add(candidate)

    return list(ip_addresses)
Lines changed: 120 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,148 @@
1+
from typing import Any, Dict, Set
2+
13
from lxml import etree
24

35
from strelka import strelka
6+
from strelka.auxiliary.iocs import extract_iocs_from_string
47

58

69
class ScanXml(strelka.Scanner):
    """
    Collects metadata and extracts embedded files from XML files.

    This scanner parses XML files to collect metadata and extract embedded
    files based on specified tags. It is used in forensic and malware analysis
    to extract and analyze structured data within XML documents.

    Scanner Type: Collection

    Attributes:
        None

    Options:
        extract_tags (list[str]): Tags whose content is extracted as child files.
        metadata_tags (list[str]): Tags whose content is logged as metadata.

    ## Detection Use Cases
    !!! info "Detection Use Cases"
        - **Embedded File Extraction**
            - Extracts files embedded within specific XML tags.
        - **Metadata Extraction**:
            - Collects metadata from specific XML tags.

    ## Known Limitations
    !!! warning "Known Limitations"
        - Complex or malformed XML structures might lead to incomplete parsing or errors.
        - Excessive files may be scanned / collected if XML mimetypes are set in the `backend.yml`

    ## To Do
    !!! question "To Do"
        - Improve error handling for malformed XML structures.
        - Better extraction of tags / metadata tags

    ## References
    !!! quote "References"
        - XML File Format Specification (https://www.w3.org/XML/)

    ## Contributors
    !!! example "Contributors"
        - [Josh Liburdi](https://github.com/jshlbrd)
        - [Paul Hutelmyer](https://github.com/phutelmyer)
    """

    def scan(
        self, data: bytes, file: strelka.File, options: dict, expire_at: int
    ) -> None:
        """
        Parses XML data to extract metadata and files.

        Args:
            data: XML data as bytes.
            file: File object containing metadata about the scan.
            options: Dictionary of scanner options.
            expire_at: Time when the scan should be considered expired.

        Scans the XML file, extracting data and metadata based on the
        specified tags, and emits files as necessary.
        """
        # Prepare options with case-insensitive tag matching.
        xml_options = {
            "extract_tags": [tag.lower() for tag in options.get("extract_tags", [])],
            "metadata_tags": [tag.lower() for tag in options.get("metadata_tags", [])],
        }

        # Initialize scan event data. Sets de-duplicate during traversal and
        # are converted to lists before reporting.
        self.event.setdefault("tags", set())
        self.event.setdefault("tag_data", [])
        self.event.setdefault("namespaces", set())
        self.event["total"] = {"tags": 0, "extracted": 0}
        self.emitted_files: Set[str] = (
            set()
        )  # Tracks emitted content to prevent duplicates

        # Parse the XML content.
        try:
            xml_buffer = data
            # Normalize an upper-cased XML declaration so the parser accepts it.
            if xml_buffer.startswith(b"<?XML"):
                xml_buffer = b"<?xml" + xml_buffer[5:]
            xml = etree.fromstring(xml_buffer)
            docinfo = xml.getroottree().docinfo
            self.event["doc_type"] = docinfo.doctype if docinfo.doctype else ""
            self.event["version"] = docinfo.xml_version if docinfo.xml_version else ""

            # Recursively process each node in the XML.
            self._recurse_node(xml, xml_options)

        except etree.XMLSyntaxError as e:
            self.flags.append(f"syntax_error: {str(e)}")

        # Finalize the event data for reporting.
        self.event["tags"] = list(self.event["tags"])
        self.event["tag_data"] = list(self.event["tag_data"])
        self.event["total"]["tags"] = len(self.event["tags"])
        self.event["namespaces"] = list(self.event["namespaces"])
        self.event["emitted_content"] = list(self.emitted_files)

        # Extract and add Indicators of Compromise (IOCs). errors="replace"
        # keeps the scan alive for XML in non-UTF-8 encodings (e.g. UTF-16)
        # instead of raising UnicodeDecodeError.
        self.add_iocs(extract_iocs_from_string(data.decode("utf-8", errors="replace")))

    def _recurse_node(self, node: etree._Element, xml_options: Dict[str, Any]) -> None:
        """
        Recursively processes each XML node to extract data and metadata.

        Args:
            node: The current XML node to process.
            xml_options: Options for data extraction and metadata logging.

        Iterates through XML nodes, extracting data and collecting metadata
        as specified by the scanner options.
        """
        if node is not None and hasattr(node.tag, "__getitem__"):
            # Split '{namespace}tag' into its parts. A tag without a namespace
            # must keep its full name: a bare node.tag.partition("}") would put
            # the whole tag into `namespace` and leave `tag` empty, dropping
            # every non-namespaced tag from the event.
            if node.tag.startswith("{"):
                namespace, _, tag = node.tag[1:].partition("}")
            else:
                namespace, tag = "", node.tag
            tag = tag.lower()

            if tag:
                self.event["tags"].add(tag)
            if namespace:
                self.event["namespaces"].add(namespace)

            # Handle specific content extraction and emission.
            if tag in xml_options["extract_tags"]:
                content = node.text.strip() if node.text else ""
                if content:
                    self.emit_file(content, name=tag)
                    self.emitted_files.add(content)
                    self.event["total"]["extracted"] += 1

            # Always process attributes to capture any relevant metadata or
            # data for emission.
            self._process_attributes(node, xml_options, tag)

        # Continue to recurse through child nodes to extract data. Direct
        # iteration replaces the deprecated getchildren().
        for child in node:
            self._recurse_node(child, xml_options)

    def _process_attributes(
        self, node: etree._Element, xml_options: Dict[str, Any], tag: str
    ) -> None:
        """
        Processes XML node attributes to extract or log data.

        Args:
            node: XML node whose attributes are being processed.
            xml_options: Configuration options for the scan.
            tag: The tag of the current XML node being processed.

        Logs data from attributes whose names appear in the metadata_tags list.
        """
        for attr_name in node.attrib:
            if attr_name.lower() in xml_options["metadata_tags"]:
                # NOTE(review): content is the full attribute mapping rendered
                # as a string, not just this attribute's value — preserved
                # as-is; confirm this is the intended payload.
                self.event["tag_data"].append(
                    {"tag": attr_name, "content": str(node.attrib)}
                )
Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,27 @@
1-
<?xml version="1.0"?>
2-
<package format="2">
3-
<name>flea3</name>
4-
<version>0.1.0</version>
5-
<description>The flea3 package</description>
6-
7-
<maintainer email="quchao@seas.upenn.edu">Chao Qu</maintainer>
8-
9-
<license>WTFPL</license>
10-
<buildtool_depend>catkin</buildtool_depend>
11-
12-
<depend>roscpp</depend>
13-
<depend>nodelet</depend>
14-
<depend>camera_base</depend>
15-
<!--<depend>std_msgs</depend>-->
16-
<depend>dynamic_reconfigure</depend>
17-
<build_depend>message_generation</build_depend>
18-
<exec_depend>message_runtime</exec_depend>
19-
20-
<export>
21-
<nodelet plugin="${prefix}/nodelet_plugins.xml"/>
22-
</export>
23-
</package>
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE bookstore SYSTEM "bookstore.dtd">
3+
<bookstore xmlns:bk="http://example.com/books" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://example.com/books bookstore.xsd">
4+
<metadata>
5+
<description>This is a sample bookstore XML file containing nonfiction science books.</description>
6+
</metadata>
7+
<bk:book category="science">
8+
<bk:title lang="en">A Brief History of Time</bk:title>
9+
<bk:author>Stephen Hawking</bk:author>
10+
<bk:year>1988</bk:year>
11+
<bk:price>25.00</bk:price>
12+
<signedinfo>
13+
<signature>XYZ123456789</signature>
14+
<timestamp>2024-04-05T14:00:00</timestamp>
15+
</signedinfo>
16+
</bk:book>
17+
<bk:book category="science">
18+
<bk:title lang="en">Cosmos</bk:title>
19+
<bk:author>Carl Sagan</bk:author>
20+
<bk:year>1980</bk:year>
21+
<bk:price>20.00</bk:price>
22+
<signedinfo>
23+
<signature>987ABCDEF321</signature>
24+
<timestamp>2024-04-05T15:00:00</timestamp>
25+
</signedinfo>
26+
</bk:book>
27+
</bookstore>

0 commit comments

Comments
 (0)