fkie-cad
diff --git a/‎src/plugins/analysis/coderec/__init__.py‎ b/‎src/plugins/analysis/coderec/__init__.py‎
diff --git a/‎src/plugins/analysis/coderec/code/__init__.py‎ b/‎src/plugins/analysis/coderec/code/__init__.py‎
diff --git a/‎src/plugins/analysis/coderec/code/coderec.py‎
Lines changed: 182 additions & 0 deletions b/‎src/plugins/analysis/coderec/code/coderec.py‎
Lines changed: 182 additions & 0 deletions
diff --git a/‎src/plugins/analysis/coderec/docker/Dockerfile‎
Lines changed: 54 additions & 0 deletions b/‎src/plugins/analysis/coderec/docker/Dockerfile‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎src/plugins/analysis/coderec/docker/docker.patch‎
Lines changed: 32 additions & 0 deletions b/‎src/plugins/analysis/coderec/docker/docker.patch‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎src/plugins/analysis/coderec/install.py‎
Lines changed: 29 additions & 0 deletions b/‎src/plugins/analysis/coderec/install.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎src/plugins/analysis/coderec/test/__init__.py‎ b/‎src/plugins/analysis/coderec/test/__init__.py‎
diff --git a/‎src/plugins/analysis/coderec/test/data/fib.mips.bin‎
67.7 KB b/‎src/plugins/analysis/coderec/test/data/fib.mips.bin‎
67.7 KB
diff --git a/‎src/plugins/analysis/coderec/test/test_coderec.py‎
Lines changed: 49 additions & 0 deletions b/‎src/plugins/analysis/coderec/test/test_coderec.py‎
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import json
+import lzma
+from base64 import b64encode
+from pathlib import Path
+from shlex import split
+from subprocess import CalledProcessError, run
+from tempfile import TemporaryDirectory
+from typing import TYPE_CHECKING, Iterable
+
+from bs4 import BeautifulSoup
+from docker.types import Mount
+from pydantic import BaseModel, Field
+from semver import Version
+
+import config
+from analysis.plugin import AnalysisPluginV0
+from helperFunctions.docker import run_docker_container
+
+if TYPE_CHECKING:
+    from io import FileIO
+
+# A region type is only reported as architecture if its total size exceeds this value (see _find_arch).
+MIN_SIZE = 2048
+# Name of the docker image in which coderec is executed (see _run_coderec_in_docker).
+DOCKER_IMAGE = 'fact/coderec'
+
+# The path of the coderec executable is resolved once at import time. If `which coderec` exits with an
+# error, importing this module fails with a RuntimeError.
+try:
+    # https://github.com/vobst/coderec
+    TOOL = run(split('which coderec'), capture_output=True, text=True, check=True).stdout.strip()
+except CalledProcessError as error:
+    raise RuntimeError('coderec not found. Please rerun the installation of the coderec plugin.') from error
+
+
+class AddressRange(BaseModel):
+    # One contiguous address range that coderec assigned to a single label.
+    # Value of the `start` entry of the address range reported by coderec.
+    start: int
+    # Value of the `end` entry; during merging, a range whose start equals this value is treated as adjacent
+    # (so `end` is presumably exclusive -- TODO confirm against the coderec output format).
+    end: int
+    # Size reported by coderec for this range; sizes are added up when ranges are merged.
+    size: int
+
+
+class Region(BaseModel):
+    # All address ranges of a file that share the same label.
+    # Label assigned by coderec; labels starting with '_' are never reported as architecture (see _find_arch).
+    type: str
+    # Sum of the sizes of all address ranges of this region (computed in _find_regions).
+    total_size: int
+    # The address ranges of this region; _find_regions stores them sorted by start.
+    address_ranges: list[AddressRange]
+    # Filled in by _add_region_colors from the legend of the SVG plot; stays None if no color was found.
+    plot_color: str | None = Field(None, description='The color of this region in the plot.')
+
+
+def _find_arch(regions: list[Region], blacklist: Iterable[str]) -> str | None:
+    for region in sorted(regions, key=lambda r: r.total_size, reverse=True):
+        if region.type.startswith('_') or region.type in blacklist:
+            continue
+        if region.total_size > MIN_SIZE:  # at least 3 blocks must match to avoid false positives
+            return region.type
+    return None
+
+
+def _find_regions(output: dict[str, tuple[dict[str, int], int, str]]) -> list[Region]:
+    regions = []
+    for label, address_ranges in _group_regions_by_type(output).items():
+        regions.append(
+            Region(
+                type=label,
+                total_size=sum(ar.size for ar in address_ranges),
+                address_ranges=sorted(address_ranges, key=lambda ar: ar.start),
+            )
+        )
+    return regions
+
+
+def _group_regions_by_type(output: dict[str, tuple[dict[str, int], int, str]]) -> dict[str, list[AddressRange]]:
+    region_dict = {}
+    for address_range, size, label in output:
+        region_dict.setdefault(label, []).append(
+            AddressRange(
+                start=address_range['start'],
+                end=address_range['end'],
+                size=size,
+            )
+        )
+    _merge_overlapping_regions(region_dict)
+    return region_dict
+
+
+def _merge_overlapping_regions(region_dict: dict[str, list[AddressRange]]):
+    for label, range_list in region_dict.items():
+        range_by_offset = {r.start: r for r in range_list}
+        merged = []
+        for start, range_ in sorted(range_by_offset.items()):
+            if start not in range_by_offset:
+                continue
+            while overlap := range_by_offset.get(range_.end):
+                range_ = AddressRange(  # noqa: PLW2901
+                    start=range_.start,
+                    end=overlap.end,
+                    size=range_.size + overlap.size,
+                )
+                range_by_offset.pop(overlap.start)
+            merged.append(range_)
+        region_dict[label] = merged
+
+
+def _compress(string: bytes) -> str:
+    return b64encode(lzma.compress(string)).decode()
+
+
+class AnalysisPlugin(AnalysisPluginV0):
+    class Schema(BaseModel):
+        regions: list[Region]
+        architecture: str | None
+        plot: str = Field(description='Byte plot (base64 encoded and lzma compressed)')
+
+    def __init__(self):
+        metadata = AnalysisPluginV0.MetaData(
+            name='coderec',
+            description='Find machine code in binary files or memory dumps.',
+            version=Version(0, 1, 0),
+            system_version=self._get_system_version(),
+            mime_whitelist=['application/octet-stream'],
+            Schema=AnalysisPlugin.Schema,
+        )
+        super().__init__(metadata=metadata)
+        self.blacklist = getattr(config.backend.plugin.get(metadata.name, {}), 'region-blacklist', '').split(',')
+
+    @staticmethod
+    def _get_system_version() -> str | None:
+        try:
+            return run(split(f'{TOOL} --version'), capture_output=True, text=True, check=True).stdout.strip().split()[1]
+        except IndexError:
+            return None
+
+    def summarize(self, result: Schema) -> list[str]:
+        return [result.architecture] if result.architecture else []
+
+    def analyze(self, file_handle: FileIO, virtual_file_path: str, analyses: dict) -> Schema:
+        del virtual_file_path, analyses
+        raw_output, output_svg = _run_coderec_in_docker(file_handle)
+        output = json.loads(raw_output)
+        regions = _find_regions(output['range_results'])
+        _add_region_colors(regions, output_svg)
+
+        return AnalysisPlugin.Schema(
+            regions=sorted(regions, key=lambda r: r.total_size, reverse=True),
+            architecture=_find_arch(regions, self.blacklist),
+            plot=_compress(output_svg),
+        )
+
+
+def _add_region_colors(regions: list[Region], output_svg: bytes):
+    types = {r.type for r in regions}.union({'unknown'})
+    svg = BeautifulSoup(output_svg.decode(), 'html.parser')
+
+    # find the start of the legend in the SVG's contents
+    for node in svg.find_all('text'):
+        if node.text.strip() in types:
+            break
+    else:
+        return
+
+    type_list, color_list = [], []
+    while node.name == 'text':
+        type_list.append(node.getText().strip())
+        node = node.find_next_sibling()
+    while node.name == 'rect':
+        color_list.append(node.get('fill'))
+        node = node.find_next_sibling()
+
+    type_to_color = {type_: color for type_, color in zip(type_list, color_list) if type_ in types}
+    for region in regions:
+        region.plot_color = type_to_color.get(region.type)
+
+
+def _run_coderec_in_docker(file: FileIO) -> tuple[str, bytes]:
+    with TemporaryDirectory() as tmp_dir:
+        result = run_docker_container(
+            DOCKER_IMAGE,
+            command='--big-file /io/input',
+            mounts=[
+                Mount('/io', tmp_dir, type='bind'),
+                Mount('/io/input', str(file.name), type='bind'),
+            ],
+        )
+        output_svg = Path(tmp_dir, 'regions_plot.svg').read_bytes()
+        return result.stdout, output_svg
@@ -0,0 +1,54 @@
+# Version of coderec to build. It is declared before the first FROM so that every stage can import it and the
+# COPY in the runtime stage cannot get out of sync with the version that was actually built.
+ARG CODEREC_VER="0.1.2"
+
+# --- build stage: download, patch and compile coderec ---
+FROM rust:1.85-slim-bookworm AS builder
+
+WORKDIR /root
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libfontconfig1-dev pkg-config curl wget unzip patch && \
+    rm -rf /var/lib/apt/lists/*
+
+# import the global build argument into this stage
+ARG CODEREC_VER
+
+RUN wget "https://github.com/vobst/coderec/archive/refs/tags/${CODEREC_VER}.zip" && \
+    unzip "${CODEREC_VER}.zip"
+
+WORKDIR "/root/coderec-${CODEREC_VER}"
+
+COPY docker.patch .
+
+# the patch makes coderec write its plot as SVG to /io/regions_plot.svg
+RUN patch -p1 <docker.patch
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://valentinobst.de/a13f15d91f0f8846d748e42e7a881f783eb8f922861a63d9dfb74824d21337039dd8216f0373c3e5820c5e32de8f0a1880ec55456ff0da39f17d32f567d62b84/cpu_rec_corpus.tar.gz -o cpu_rec_corpus.tar.gz && \
+    tar xf cpu_rec_corpus.tar.gz && \
+    rm cpu_rec_corpus.tar.gz && \
+    cargo build --release
+
+# --- runtime stage: only contains the compiled binary and its runtime dependencies ---
+FROM debian:bookworm-slim AS runtime
+
+# import the global build argument into this stage (needed for the COPY below)
+ARG CODEREC_VER
+
+ENV USER=coderec
+ENV GROUPNAME=coderec
+ENV UID=1000
+ENV GID=1000
+
+RUN addgroup --gid "$GID" "$GROUPNAME" \
+    && adduser \
+        --disabled-password \
+        --gecos "" \
+        --home "/home/coderec" \
+        --ingroup "$GROUPNAME" \
+        --no-create-home \
+        --uid "$UID" \
+        $USER
+
+RUN mkdir -p /home/coderec
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libfontconfig1 && \
+    rm -rf /var/lib/apt/lists/*
+
+USER coderec
+
+WORKDIR /home/coderec
+
+COPY --chown=${USER} --from=builder /root/coderec-${CODEREC_VER}/target/release/coderec /home/coderec/coderec
+
+ENTRYPOINT ["/home/coderec/coderec"]
@@ -0,0 +1,32 @@
+diff --git a/src/plotting.rs b/src/plotting.rs
+index 9d0c5e6..75e29fe 100644
+--- a/src/plotting.rs
++++ b/src/plotting.rs
+@@ -29,10 +29,8 @@ const CAPTION_STYLE_3D: (&str, u32, FontStyle, &RGBColor) =
+     ("Calibri", 80, FontStyle::Normal, &BLACK);
+ const LABEL_STYLE_3D: (&str, u32, FontStyle, &RGBColor) =
+     ("Calibri", 30, FontStyle::Normal, &BLACK);
+-const CAPTION_STYLE_2D: (&str, u32, FontStyle, &RGBColor) =
+-    ("sans-serif", 80, FontStyle::Normal, &BLACK);
+ const LABEL_STYLE_2D: (&str, u32, FontStyle, &RGBColor) =
+-    ("Calibri", 12, FontStyle::Normal, &BLACK);
++    ("Calibri", 20, FontStyle::Normal, &BLACK);
+ 
+ impl CorpusStats {
+     pub fn plot_tg(&self) {
+@@ -165,13 +163,12 @@ pub fn plot_regions(
+     let arch_to_best_map = &det_res.arch_to_final_ranges;
+ 
+     let file_name = file_name.split("/").last().unwrap();
+-    let plot_name = format!("{}_w{}_regions.bmp", file_name, win_sz);
++    let plot_name = "/io/regions_plot.svg";
+ 
+-    let root = BitMapBackend::new(&plot_name, (5000, 500)).into_drawing_area();
++    let root = SVGBackend::new(&plot_name, (5000, 500)).into_drawing_area();
+     root.fill(&WHITE).unwrap();
+ 
+     let mut chart = ChartBuilder::on(&root)
+-        .caption(format!("{}, regions", file_name), CAPTION_STYLE_2D)
+         .margin(5)
+         .top_x_label_area_size(40)
+         .x_label_area_size(40)
@@ -0,0 +1,29 @@
+import logging
+from pathlib import Path
+
+try:
+    from helperFunctions.install import run_cmd_with_logging
+    from plugins.installer import AbstractPluginInstaller
+except ImportError:
+    import sys
+
+    SRC_PATH = Path(__file__).absolute().parent.parent.parent.parent
+    sys.path.append(str(SRC_PATH))
+
+    from helperFunctions.install import run_cmd_with_logging
+    from plugins.installer import AbstractPluginInstaller
+
+
+class CodeRecPluginInstaller(AbstractPluginInstaller):
+    base_path = Path(__file__).resolve().parent
+
+    def install_docker_images(self):
+        run_cmd_with_logging(f'docker build -t fact/coderec {self.base_path}/docker')
+
+
+# Alias for generic use
+Installer = CodeRecPluginInstaller
+
+# When executed as a script: enable info level logging and run the installation of this plugin.
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    Installer().install()
@@ -0,0 +1,49 @@
+from io import FileIO
+from pathlib import Path
+
+import pytest
+
+from plugins.analysis.coderec.code.coderec import AddressRange, AnalysisPlugin, _merge_overlapping_regions
+
+TEST_DATA_DIR = Path(__file__).parent / 'data'
+
+
+@pytest.mark.AnalysisPluginTestConfig(plugin_class=AnalysisPlugin)
+def test_basic_scan_feature(analysis_plugin):
+    test_file = FileIO(TEST_DATA_DIR / 'fib.mips.bin')
+    result = analysis_plugin.analyze(test_file, {}, {})
+    assert len(result.regions) == 2
+    region_by_type = {r.type: r for r in result.regions}
+    assert 'MIPSeb' in region_by_type
+    assert '_zero' in region_by_type
+    assert region_by_type['MIPSeb'].total_size == 3072
+    assert result.architecture == 'MIPSeb'
+
+
+def test_merge_overlapping_regions():
+    regions = {
+        'foo': [
+            AddressRange(start=7000, end=8000, size=1000),
+            AddressRange(start=0000, end=1000, size=1000),
+            AddressRange(start=9000, end=10000, size=1000),
+            AddressRange(start=1000, end=3000, size=2000),
+            AddressRange(start=5000, end=7000, size=2000),
+            AddressRange(start=3000, end=4000, size=1000),
+        ]
+    }
+    _merge_overlapping_regions(regions)
+
+    result = sorted(regions['foo'], key=lambda r: r.start)
+    assert len(result) == 3
+
+    assert result[0].start == 0
+    assert result[0].end == 4000
+    assert result[0].size == 4000
+
+    assert result[1].start == 5000
+    assert result[1].end == 8000
+    assert result[1].size == 3000
+
+    assert result[2].start == 9000
+    assert result[2].end == 10000
+    assert result[2].size == 1000