Skip to content

Commit 67a71b2

Browse files
committed
feat: added new coderec plugin
1 parent 0c8e723 commit 67a71b2

File tree

13 files changed

+432
-1
lines changed

13 files changed

+432
-1
lines changed

src/plugins/analysis/coderec/__init__.py

Whitespace-only changes.

src/plugins/analysis/coderec/code/__init__.py

Whitespace-only changes.
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import lzma
5+
from base64 import b64encode
6+
from pathlib import Path
7+
from shlex import split
8+
from subprocess import CalledProcessError, run
9+
from tempfile import TemporaryDirectory
10+
from typing import TYPE_CHECKING, Iterable
11+
12+
from bs4 import BeautifulSoup
13+
from docker.types import Mount
14+
from pydantic import BaseModel, Field
15+
from semver import Version
16+
17+
import config
18+
from analysis.plugin import AnalysisPluginV0
19+
from helperFunctions.docker import run_docker_container
20+
21+
if TYPE_CHECKING:
22+
from io import FileIO
23+
24+
# A region is only accepted as architecture guess if its total size exceeds this value (see _find_arch).
MIN_SIZE = 2048
# Tag of the docker image that is started for each analyzed file.
DOCKER_IMAGE = 'fact/coderec'

# https://github.com/vobst/coderec
# The path of the host-side binary is resolved once at import time; importing this module fails
# with a RuntimeError if `which` does not find it.
# NOTE(review): TOOL is only used to query the version, the analysis itself runs in docker --
# confirm that the installation actually provides a host-side coderec binary.
try:
    _which = run(split('which coderec'), capture_output=True, text=True, check=True)
except CalledProcessError as error:
    raise RuntimeError('coderec not found. Please rerun the installation of the coderec plugin.') from error
else:
    TOOL = _which.stdout.strip()
32+
33+
34+
class AddressRange(BaseModel):
    """A contiguous span of the analyzed file that was assigned to a single region type."""

    # offset at which the span begins
    start: int
    # offset at which the span stops; a span starting exactly here is treated as adjacent when merging
    end: int
    # size attributed to the span (summed up when adjacent spans are merged); presumably bytes -- TODO confirm
    size: int
38+
39+
40+
class Region(BaseModel):
    """All address ranges of the analyzed file that share the same label."""

    # label of the region; labels starting with '_' are never reported as architecture
    type: str
    # sum of the sizes of all address ranges of this region
    total_size: int
    # the individual spans, sorted by start offset when built by _find_regions
    address_ranges: list[AddressRange]
    # filled in later from the legend of the SVG plot; stays None if no matching legend entry is found
    plot_color: str | None = Field(None, description='The color of this region in the plot.')
45+
46+
47+
def _find_arch(regions: list[Region], blacklist: Iterable[str]) -> str | None:
48+
for region in sorted(regions, key=lambda r: r.total_size, reverse=True):
49+
if region.type.startswith('_') or region.type in blacklist:
50+
continue
51+
if region.total_size > MIN_SIZE: # at least 3 blocks must match to avoid false positives
52+
return region.type
53+
return None
54+
55+
56+
def _find_regions(output: Iterable[tuple[dict[str, int], int, str]]) -> list[Region]:
    """
    Convert the range results of coderec into one ``Region`` per label.

    :param output: A sequence of ``(address_range, size, label)`` entries, where ``address_range``
        is a mapping with the keys ``start`` and ``end``.
    :return: One region per label. Its address ranges are sorted by start offset and its total size
        is the sum of the sizes of these ranges. ``plot_color`` is not set here.
    """
    return [
        Region(
            type=label,
            total_size=sum(ar.size for ar in address_ranges),
            address_ranges=sorted(address_ranges, key=lambda ar: ar.start),
        )
        for label, address_ranges in _group_regions_by_type(output).items()
    ]
67+
68+
69+
def _group_regions_by_type(output: Iterable[tuple[dict[str, int], int, str]]) -> dict[str, list[AddressRange]]:
    """
    Group the range results of coderec by their label and merge directly adjacent ranges.

    :param output: A sequence of ``(address_range, size, label)`` entries, where ``address_range``
        is a mapping with the keys ``start`` and ``end``.
    :return: A mapping from label to the (merged) address ranges carrying this label.
    """
    region_dict: dict[str, list[AddressRange]] = {}
    for address_range, size, label in output:
        region_dict.setdefault(label, []).append(
            AddressRange(
                start=address_range['start'],
                end=address_range['end'],
                size=size,
            )
        )
    # merging happens in place on the lists stored in region_dict
    _merge_overlapping_regions(region_dict)
    return region_dict
81+
82+
83+
def _merge_overlapping_regions(region_dict: dict[str, list[AddressRange]]):
84+
for label, range_list in region_dict.items():
85+
range_by_offset = {r.start: r for r in range_list}
86+
merged = []
87+
for start, range_ in sorted(range_by_offset.items()):
88+
if start not in range_by_offset:
89+
continue
90+
while overlap := range_by_offset.get(range_.end):
91+
range_ = AddressRange( # noqa: PLW2901
92+
start=range_.start,
93+
end=overlap.end,
94+
size=range_.size + overlap.size,
95+
)
96+
range_by_offset.pop(overlap.start)
97+
merged.append(range_)
98+
region_dict[label] = merged
99+
100+
101+
def _compress(string: bytes) -> str:
102+
return b64encode(lzma.compress(string)).decode()
103+
104+
105+
class AnalysisPlugin(AnalysisPluginV0):
    """Analysis plugin that runs coderec on a file and reports the detected regions and architecture."""

    class Schema(BaseModel):
        # all detected regions, largest first
        regions: list[Region]
        # type of the largest eligible region (see _find_arch) or None
        architecture: str | None
        plot: str = Field(description='Byte plot (base64 encoded and lzma compressed)')

    def __init__(self):
        metadata = AnalysisPluginV0.MetaData(
            name='coderec',
            description='Find machine code in binary files or memory dumps.',
            version=Version(0, 1, 0),
            system_version=self._get_system_version(),
            mime_whitelist=['application/octet-stream'],
            Schema=AnalysisPlugin.Schema,
        )
        super().__init__(metadata=metadata)
        raw_blacklist = getattr(config.backend.plugin.get(metadata.name, {}), 'region-blacklist', '')
        # Strip whitespace around the comma separated entries and drop empty ones: a plain split(',')
        # turns a missing option into [''] and 'a, b' into ['a', ' b'], which never matches type 'b'.
        self.blacklist = [entry.strip() for entry in raw_blacklist.split(',') if entry.strip()]

    @staticmethod
    def _get_system_version() -> str | None:
        """
        Ask the coderec binary for its version.

        :return: The second whitespace separated token of the ``--version`` output or ``None`` if
            the output has fewer than two tokens.
        """
        try:
            return run(split(f'{TOOL} --version'), capture_output=True, text=True, check=True).stdout.strip().split()[1]
        except IndexError:
            return None

    def summarize(self, result: Schema) -> list[str]:
        """Return the detected architecture as the only summary entry (empty if none was found)."""
        return [result.architecture] if result.architecture else []

    def analyze(self, file_handle: FileIO, virtual_file_path: str, analyses: dict) -> Schema:
        """
        Run coderec on the file and build the analysis result.

        :param file_handle: Handle of the file to analyze; its ``name`` is mounted into the container.
        :param virtual_file_path: Unused.
        :param analyses: Unused.
        :return: The regions (largest first), the architecture guess and the compressed SVG plot.
        """
        del virtual_file_path, analyses
        raw_output, output_svg = _run_coderec_in_docker(file_handle)
        output = json.loads(raw_output)
        regions = _find_regions(output['range_results'])
        # sets plot_color on the regions in place
        _add_region_colors(regions, output_svg)

        return AnalysisPlugin.Schema(
            regions=sorted(regions, key=lambda r: r.total_size, reverse=True),
            architecture=_find_arch(regions, self.blacklist),
            plot=_compress(output_svg),
        )
145+
146+
147+
def _add_region_colors(regions: list[Region], output_svg: bytes):
    """
    Set ``plot_color`` of each region to the color used for its type in the legend of the SVG plot.

    The legend is located by searching for the first ``<text>`` element whose text is a known
    region type (or ``unknown``). From there on, the legend is expected to be a run of consecutive
    ``<text>`` siblings (the labels) directly followed by a run of consecutive ``<rect>`` siblings
    (the colors, read from their ``fill`` attribute) in the same order.

    If no legend is found, the regions are left untouched. Otherwise, regions without a matching
    legend entry get ``plot_color = None``.

    :param regions: The regions to update (modified in place).
    :param output_svg: The raw contents of the SVG plot.
    """
    types = {r.type for r in regions}.union({'unknown'})
    svg = BeautifulSoup(output_svg.decode(), 'html.parser')

    # find the start of the legend in the SVG's contents
    for node in svg.find_all('text'):
        if node.get_text().strip() in types:
            break
    else:
        return

    type_list, color_list = [], []
    # find_next_sibling() returns None after the last sibling, so `node` must be checked before
    # accessing `.name` (the legend may well be the last thing in its parent element)
    while node is not None and node.name == 'text':
        type_list.append(node.get_text().strip())
        node = node.find_next_sibling()
    while node is not None and node.name == 'rect':
        color_list.append(node.get('fill'))
        node = node.find_next_sibling()

    type_to_color = {type_: color for type_, color in zip(type_list, color_list) if type_ in types}
    for region in regions:
        region.plot_color = type_to_color.get(region.type)
169+
170+
171+
def _run_coderec_in_docker(file: FileIO) -> tuple[str, bytes]:
    """
    Run coderec on a file inside the docker container.

    A temporary directory is mounted at ``/io`` and the file itself at ``/io/input``. After the
    container has finished, ``regions_plot.svg`` is read from the temporary directory.

    :param file: Handle of the file to analyze; only its ``name`` is used.
    :return: The stdout of the container and the raw contents of the SVG plot.
    """
    with TemporaryDirectory() as tmp_dir:
        io_mounts = [
            Mount('/io', tmp_dir, type='bind'),
            Mount('/io/input', str(file.name), type='bind'),
        ]
        result = run_docker_container(DOCKER_IMAGE, command='--big-file /io/input', mounts=io_mounts)
        # must be read before the temporary directory is removed
        plot_file = Path(tmp_dir) / 'regions_plot.svg'
        output_svg = plot_file.read_bytes()
    return result.stdout, output_svg
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# ---- build stage: download, patch and compile coderec ----
FROM rust:1.85-slim-bookworm AS builder

WORKDIR /root

RUN apt-get update && \
    apt-get install -y --no-install-recommends libfontconfig1-dev pkg-config curl wget unzip patch && \
    rm -rf /var/lib/apt/lists/*

# ARG is scoped to this build stage and is not visible in the runtime stage below
ARG CODEREC_VER="0.1.2"

RUN wget "https://github.com/vobst/coderec/archive/refs/tags/${CODEREC_VER}.zip" && \
    unzip "${CODEREC_VER}.zip"

WORKDIR "/root/coderec-${CODEREC_VER}"

# the patch switches the region plot to SVG and writes it to /io/regions_plot.svg
COPY docker.patch .

RUN patch -p1 <docker.patch

# The binary is copied to a version independent path so that the runtime stage does not have to
# know CODEREC_VER (the path was hard-coded to 0.1.2 before and broke when overriding the ARG).
RUN curl --proto '=https' --tlsv1.2 -sSf https://valentinobst.de/a13f15d91f0f8846d748e42e7a881f783eb8f922861a63d9dfb74824d21337039dd8216f0373c3e5820c5e32de8f0a1880ec55456ff0da39f17d32f567d62b84/cpu_rec_corpus.tar.gz -o cpu_rec_corpus.tar.gz && \
    tar xf cpu_rec_corpus.tar.gz && \
    rm cpu_rec_corpus.tar.gz && \
    cargo build --release && \
    cp target/release/coderec /root/coderec

# ---- runtime stage: only the binary and its runtime dependency ----
FROM debian:bookworm-slim AS runtime

ENV USER=coderec
ENV GROUPNAME=coderec
ENV UID=1000
ENV GID=1000

RUN addgroup --gid "$GID" "$GROUPNAME" \
    && adduser \
    --disabled-password \
    --gecos "" \
    --home "/home/coderec" \
    --ingroup "$GROUPNAME" \
    --no-create-home \
    --uid "$UID" \
    $USER

RUN mkdir -p /home/coderec

RUN apt-get update && \
    apt-get install -y --no-install-recommends libfontconfig1 && \
    rm -rf /var/lib/apt/lists/*

USER coderec

WORKDIR /home/coderec

COPY --chown=${USER} --from=builder /root/coderec /home/coderec/coderec

ENTRYPOINT ["/home/coderec/coderec"]
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
diff --git a/src/plotting.rs b/src/plotting.rs
2+
index 9d0c5e6..75e29fe 100644
3+
--- a/src/plotting.rs
4+
+++ b/src/plotting.rs
5+
@@ -29,10 +29,8 @@ const CAPTION_STYLE_3D: (&str, u32, FontStyle, &RGBColor) =
6+
("Calibri", 80, FontStyle::Normal, &BLACK);
7+
const LABEL_STYLE_3D: (&str, u32, FontStyle, &RGBColor) =
8+
("Calibri", 30, FontStyle::Normal, &BLACK);
9+
-const CAPTION_STYLE_2D: (&str, u32, FontStyle, &RGBColor) =
10+
- ("sans-serif", 80, FontStyle::Normal, &BLACK);
11+
const LABEL_STYLE_2D: (&str, u32, FontStyle, &RGBColor) =
12+
- ("Calibri", 12, FontStyle::Normal, &BLACK);
13+
+ ("Calibri", 20, FontStyle::Normal, &BLACK);
14+
15+
impl CorpusStats {
16+
pub fn plot_tg(&self) {
17+
@@ -165,13 +163,12 @@ pub fn plot_regions(
18+
let arch_to_best_map = &det_res.arch_to_final_ranges;
19+
20+
let file_name = file_name.split("/").last().unwrap();
21+
- let plot_name = format!("{}_w{}_regions.bmp", file_name, win_sz);
22+
+ let plot_name = "/io/regions_plot.svg";
23+
24+
- let root = BitMapBackend::new(&plot_name, (5000, 500)).into_drawing_area();
25+
+ let root = SVGBackend::new(&plot_name, (5000, 500)).into_drawing_area();
26+
root.fill(&WHITE).unwrap();
27+
28+
let mut chart = ChartBuilder::on(&root)
29+
- .caption(format!("{}, regions", file_name), CAPTION_STYLE_2D)
30+
.margin(5)
31+
.top_x_label_area_size(40)
32+
.x_label_area_size(40)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import logging
2+
from pathlib import Path
3+
4+
try:
5+
from helperFunctions.install import run_cmd_with_logging
6+
from plugins.installer import AbstractPluginInstaller
7+
except ImportError:
8+
import sys
9+
10+
SRC_PATH = Path(__file__).absolute().parent.parent.parent.parent
11+
sys.path.append(str(SRC_PATH))
12+
13+
from helperFunctions.install import run_cmd_with_logging
14+
from plugins.installer import AbstractPluginInstaller
15+
16+
17+
class CodeRecPluginInstaller(AbstractPluginInstaller):
    """Installer of the coderec plugin."""

    # directory that contains this installer
    base_path = Path(__file__).resolve().parent

    def install_docker_images(self):
        """Build the ``fact/coderec`` docker image from the ``docker`` directory next to this file."""
        build_context = self.base_path / 'docker'
        run_cmd_with_logging(f'docker build -t fact/coderec {build_context}')


# Alias for generic use
Installer = CodeRecPluginInstaller

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    Installer().install()

src/plugins/analysis/coderec/test/__init__.py

Whitespace-only changes.
67.7 KB
Binary file not shown.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from io import FileIO
2+
from pathlib import Path
3+
4+
import pytest
5+
6+
from plugins.analysis.coderec.code.coderec import AddressRange, AnalysisPlugin, _merge_overlapping_regions
7+
8+
TEST_DATA_DIR = Path(__file__).parent / 'data'
9+
10+
11+
@pytest.mark.AnalysisPluginTestConfig(plugin_class=AnalysisPlugin)
def test_basic_scan_feature(analysis_plugin):
    """Analyze a MIPS sample and check the detected regions and the architecture guess."""
    # the context manager closes the file handle, which was leaked before
    with FileIO(TEST_DATA_DIR / 'fib.mips.bin') as test_file:
        result = analysis_plugin.analyze(test_file, {}, {})
    assert len(result.regions) == 2
    region_by_type = {r.type: r for r in result.regions}
    assert 'MIPSeb' in region_by_type
    assert '_zero' in region_by_type
    assert region_by_type['MIPSeb'].total_size == 3072
    assert result.architecture == 'MIPSeb'
21+
22+
23+
def test_merge_overlapping_regions():
    """Adjacent ranges must be joined into one range, no matter in which order they are given."""
    # (start, end) pairs in shuffled order: 0-4000 and 5000-8000 are chains, 9000-10000 stands alone
    spans = [(7000, 8000), (0, 1000), (9000, 10000), (1000, 3000), (5000, 7000), (3000, 4000)]
    regions = {'foo': [AddressRange(start=start, end=end, size=end - start) for start, end in spans]}

    _merge_overlapping_regions(regions)

    merged = sorted(regions['foo'], key=lambda r: r.start)
    assert len(merged) == 3
    assert [(r.start, r.end, r.size) for r in merged] == [
        (0, 4000, 4000),
        (5000, 8000, 3000),
        (9000, 10000, 1000),
    ]

0 commit comments

Comments
 (0)