From 24f02c7dffe51fd8f3ad47cd84375534fc95db0b Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 29 Mar 2023 09:59:57 +0200 Subject: [PATCH 01/31] Development: add pre-commit configuration --- .pre-commit-config.yaml | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..fd07386 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,42 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-added-large-files + - id: check-ast + - id: check-builtin-literals + - id: check-case-conflict + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-json + - id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml + - id: check-vcs-permalinks + - id: check-xml + - id: check-yaml + args: [--allow-multiple-documents] + - id: debug-statements + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: fix-encoding-pragma + args: [--remove] + - id: mixed-line-ending + - id: pretty-format-json + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + - repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + args: [-l 99] + - repo: https://github.com/doublify/pre-commit-clang-format + rev: 62302476d0da01515660132d76902359bed0f782 + hooks: + - id: clang-format + types: [file] + files: \.(cpp|cc|cxx|c|h|hxx)$ + args: [--style=file] From f270859159884f470c4f635286e8a7f29558074a Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Fri, 31 Mar 2023 09:49:10 +0200 Subject: [PATCH 02/31] Test: move examples to test folder --- {examples => test/examples}/arm.asm | 1 - {examples => test/examples}/arm.pdf | Bin {examples => test/examples}/att_syntax.asm | 10 +++++----- {examples => 
test/examples}/att_syntax.pdf | Bin {examples => test/examples}/huge.asm | 2 +- {examples => test/examples}/huge.pdf | Bin {examples => test/examples}/objdump.asm | 12 ++++++------ {examples => test/examples}/objdump.pdf | Bin {examples => test/examples}/stripped_function.asm | 0 {examples => test/examples}/stripped_function.pdf | Bin {examples => test/examples}/stripped_objdump.asm | 4 ++-- {examples => test/examples}/stripped_objdump.pdf | Bin {examples => test/examples}/test_function.asm | 4 ++-- {examples => test/examples}/test_function.pdf | Bin 14 files changed, 16 insertions(+), 17 deletions(-) rename {examples => test/examples}/arm.asm (99%) rename {examples => test/examples}/arm.pdf (100%) rename {examples => test/examples}/att_syntax.asm (99%) rename {examples => test/examples}/att_syntax.pdf (100%) rename {examples => test/examples}/huge.asm (99%) rename {examples => test/examples}/huge.pdf (100%) rename {examples => test/examples}/objdump.asm (82%) rename {examples => test/examples}/objdump.pdf (100%) rename {examples => test/examples}/stripped_function.asm (100%) rename {examples => test/examples}/stripped_function.pdf (100%) rename {examples => test/examples}/stripped_objdump.asm (87%) rename {examples => test/examples}/stripped_objdump.pdf (100%) rename {examples => test/examples}/test_function.asm (99%) rename {examples => test/examples}/test_function.pdf (100%) diff --git a/examples/arm.asm b/test/examples/arm.asm similarity index 99% rename from examples/arm.asm rename to test/examples/arm.asm index a3e0416..e165038 100644 --- a/examples/arm.asm +++ b/test/examples/arm.asm @@ -48,4 +48,3 @@ b8: 00000000 .word 0x00000000 bc: 00000103 .word 0x00000103 c0: 00000107 .word 0x00000107 - diff --git a/examples/arm.pdf b/test/examples/arm.pdf similarity index 100% rename from examples/arm.pdf rename to test/examples/arm.pdf diff --git a/examples/att_syntax.asm b/test/examples/att_syntax.asm similarity index 99% rename from examples/att_syntax.asm rename to 
test/examples/att_syntax.asm index 64c9f1c..d2950d4 100644 --- a/examples/att_syntax.asm +++ b/test/examples/att_syntax.asm @@ -1,5 +1,5 @@ Dump of assembler code for function main: - 0x000000000002ebd0 <+0>: endbr64 + 0x000000000002ebd0 <+0>: endbr64 0x000000000002ebd4 <+4>: push %r15 0x000000000002ebd6 <+6>: push %r14 0x000000000002ebd8 <+8>: push %r13 @@ -16,7 +16,7 @@ Dump of assembler code for function main: 0x000000000002ec05 <+53>: mov %rax,0x128(%rsp) 0x000000000002ec0d <+61>: xor %eax,%eax 0x000000000002ec0f <+63>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ec14 <+68>: endbr64 + 0x000000000002ec14 <+68>: endbr64 0x000000000002ec18 <+72>: test %eax,%eax 0x000000000002ec1a <+74>: jne 0x2ec7c 0x000000000002ec1c <+76>: callq 0x42360 @@ -86,7 +86,7 @@ Dump of assembler code for function main: 0x000000000002ed58 <+392>: mov $0x1,%esi 0x000000000002ed5d <+397>: lea 0xf735c(%rip),%rdi # 0x1260c0 0x000000000002ed64 <+404>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ed69 <+409>: endbr64 + 0x000000000002ed69 <+409>: endbr64 0x000000000002ed6d <+413>: test %eax,%eax 0x000000000002ed6f <+415>: je 0x2ed9d 0x000000000002ed71 <+417>: movl $0x0,0xf29bd(%rip) # 0x121738 @@ -157,7 +157,7 @@ Dump of assembler code for function main: 0x000000000002ef45 <+885>: lea 0xfbd14(%rip),%rdi # 0x12ac60 0x000000000002ef4c <+892>: movl $0x1,0xf27c2(%rip) # 0x121718 0x000000000002ef56 <+902>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ef5b <+907>: endbr64 + 0x000000000002ef5b <+907>: endbr64 0x000000000002ef5f <+911>: test %eax,%eax 0x000000000002ef61 <+913>: jne 0x2ec7c 0x000000000002ef67 <+919>: mov 0x18(%rsp),%rax @@ -495,7 +495,7 @@ Dump of assembler code for function main: 0x000000000002f570 <+2464>: mov 0xf3982(%rip),%eax # 0x122ef8 0x000000000002f576 <+2470>: mov %eax,0x2c(%rsp) 0x000000000002f57a <+2474>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002f57f <+2479>: endbr64 + 0x000000000002f57f <+2479>: endbr64 0x000000000002f583 <+2483>: test %eax,%eax 
0x000000000002f585 <+2485>: je 0x2f5b3 0x000000000002f587 <+2487>: sub $0x3,%eax diff --git a/examples/att_syntax.pdf b/test/examples/att_syntax.pdf similarity index 100% rename from examples/att_syntax.pdf rename to test/examples/att_syntax.pdf diff --git a/examples/huge.asm b/test/examples/huge.asm similarity index 99% rename from examples/huge.asm rename to test/examples/huge.asm index 409f5fe..ab9b1fa 100644 --- a/examples/huge.asm +++ b/test/examples/huge.asm @@ -274,7 +274,7 @@ Dump of assembler code for function main: 0x000055555556ff22 <+1490>: pop %r13 0x000055555556ff24 <+1492>: pop %r14 0x000055555556ff26 <+1494>: pop %r15 - 0x000055555556ff28 <+1496>: ret + 0x000055555556ff28 <+1496>: ret 0x000055555556ff29 <+1497>: mov 0x180(%rsp),%rax 0x000055555556ff31 <+1505>: cmpb $0x0,0x2a5a8(%rip) # 0x55555559a4e0 0x000055555556ff38 <+1512>: movq $0x0,0x180(%rsp) diff --git a/examples/huge.pdf b/test/examples/huge.pdf similarity index 100% rename from examples/huge.pdf rename to test/examples/huge.pdf diff --git a/examples/objdump.asm b/test/examples/objdump.asm similarity index 82% rename from examples/objdump.asm rename to test/examples/objdump.asm index b41a5d6..eced5b5 100644 --- a/examples/objdump.asm +++ b/test/examples/objdump.asm @@ -1,5 +1,5 @@ 0000000000016bb0 <_obstack_allocated_p@@Base>: - 16bb0: f3 0f 1e fa endbr64 + 16bb0: f3 0f 1e fa endbr64 16bb4: 48 8b 47 08 mov 0x8(%rdi),%rax 16bb8: 48 85 c0 test %rax,%rax 16bbb: 74 29 je 16be6 <_obstack_allocated_p@@Base+0x36> @@ -12,11 +12,11 @@ 16bce: 48 85 c0 test %rax,%rax 16bd1: 75 ed jne 16bc0 <_obstack_allocated_p@@Base+0x10> 16bd3: 31 c0 xor %eax,%eax - 16bd5: c3 retq + 16bd5: c3 retq 16bd6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) - 16bdd: 00 00 00 + 16bdd: 00 00 00 16be0: b8 01 00 00 00 mov $0x1,%eax - 16be5: c3 retq - 16be6: c3 retq + 16be5: c3 retq + 16be6: c3 retq 16be7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) - 16bee: 00 00 + 16bee: 00 00 diff --git a/examples/objdump.pdf 
b/test/examples/objdump.pdf similarity index 100% rename from examples/objdump.pdf rename to test/examples/objdump.pdf diff --git a/examples/stripped_function.asm b/test/examples/stripped_function.asm similarity index 100% rename from examples/stripped_function.asm rename to test/examples/stripped_function.asm diff --git a/examples/stripped_function.pdf b/test/examples/stripped_function.pdf similarity index 100% rename from examples/stripped_function.pdf rename to test/examples/stripped_function.pdf diff --git a/examples/stripped_objdump.asm b/test/examples/stripped_objdump.asm similarity index 87% rename from examples/stripped_objdump.asm rename to test/examples/stripped_objdump.asm index fd13a6c..c3f6c9a 100644 --- a/examples/stripped_objdump.asm +++ b/test/examples/stripped_objdump.asm @@ -1,5 +1,5 @@ 0000000000001000 <.text>: - 1000: f3 0f 1e fa endbr64 + 1000: f3 0f 1e fa endbr64 1004: 55 push %rbp 1005: 48 89 e5 mov %rsp,%rbp 1008: 89 7d fc mov %edi,-0x4(%rbp) @@ -11,4 +11,4 @@ 1019: 8b 45 fc mov -0x4(%rbp),%eax 101c: 0f af c0 imul %eax,%eax 101f: 5d pop %rbp - 1020: c3 retq + 1020: c3 retq diff --git a/examples/stripped_objdump.pdf b/test/examples/stripped_objdump.pdf similarity index 100% rename from examples/stripped_objdump.pdf rename to test/examples/stripped_objdump.pdf diff --git a/examples/test_function.asm b/test/examples/test_function.asm similarity index 99% rename from examples/test_function.asm rename to test/examples/test_function.asm index fbc06e9..75f817c 100644 --- a/examples/test_function.asm +++ b/test/examples/test_function.asm @@ -349,6 +349,6 @@ Dump of assembler code for function test_function: 0x00007ffff7fbf7cb <+1771>: pop %r14 0x00007ffff7fbf7cd <+1773>: pop %r15 0x00007ffff7fbf7cf <+1775>: pop %rbp - 0x00007ffff7fbf7d0 <+1776>: vzeroupper - 0x00007ffff7fbf7d3 <+1779>: ret + 0x00007ffff7fbf7d0 <+1776>: vzeroupper + 0x00007ffff7fbf7d3 <+1779>: ret End of assembler dump. 
diff --git a/examples/test_function.pdf b/test/examples/test_function.pdf similarity index 100% rename from examples/test_function.pdf rename to test/examples/test_function.pdf From 6f7d98b9f3cdeed635fc129ede4e431a473d8e42 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Fri, 31 Mar 2023 09:49:49 +0200 Subject: [PATCH 03/31] Doc: move images to doc folder --- {images => doc/images}/example.png | Bin {images => doc/images}/example.svg | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {images => doc/images}/example.png (100%) rename {images => doc/images}/example.svg (100%) diff --git a/images/example.png b/doc/images/example.png similarity index 100% rename from images/example.png rename to doc/images/example.png diff --git a/images/example.svg b/doc/images/example.svg similarity index 100% rename from images/example.svg rename to doc/images/example.svg From 6479af352228c3dc483d25278437aab76025e801 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 28 Mar 2023 16:42:56 +0200 Subject: [PATCH 04/31] ocrecord: Move gdb_asm2cfg to separate package --- ocrecord/__init__.py | 0 {src => ocrecord}/gdb_asm2cfg.py | 59 ++++++++++++++++---------------- 2 files changed, 30 insertions(+), 29 deletions(-) create mode 100644 ocrecord/__init__.py rename {src => ocrecord}/gdb_asm2cfg.py (57%) diff --git a/ocrecord/__init__.py b/ocrecord/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gdb_asm2cfg.py b/ocrecord/gdb_asm2cfg.py similarity index 57% rename from src/gdb_asm2cfg.py rename to ocrecord/gdb_asm2cfg.py index a40a712..b2a039a 100644 --- a/src/gdb_asm2cfg.py +++ b/ocrecord/gdb_asm2cfg.py @@ -4,13 +4,12 @@ For further information see https://sourceware.org/gdb/current/onlinedocs/gdb/Python.html#Python. 
""" - - import traceback import gdb -from asm2cfg import asm2cfg +from ..ocgraph.interface.drawer import Drawer +from ..ocgraph.interface.analyzer import Analyzer class SkipCalls(gdb.Parameter): @@ -20,25 +19,25 @@ class SkipCalls(gdb.Parameter): set skipcalls off """ - set_doc = 'Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks' - show_doc = 'Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks' + set_doc = "Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks" + show_doc = "Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks" def __init__(self): - super().__init__('skipcalls', gdb.COMMAND_DATA, gdb.PARAM_BOOLEAN) + super().__init__("skipcalls", gdb.COMMAND_DATA, gdb.PARAM_BOOLEAN) self.value = False def get_set_string(self): - return f'Commands savecfg and viewcfg will skip function calls \ - from splitting CFG blocks: {self.value_to_string()}' + return f"Commands savecfg and viewcfg will skip function calls \ + from splitting CFG blocks: {self.value_to_string()}" def get_show_string(self, _): - return f'Commands savecfg and viewcfg will skip function calls \ - from splitting CFG blocks: {self.value_to_string()}' + return f"Commands savecfg and viewcfg will skip function calls \ + from splitting CFG blocks: {self.value_to_string()}" def value_to_string(self): if self.value: - return 'on' - return 'off' + return "on" + return "off" class ViewCfg(gdb.Command): # pylint: disable=too-few-public-methods @@ -50,23 +49,25 @@ class ViewCfg(gdb.Command): # pylint: disable=too-few-public-methods """ def __init__(self): - super().__init__('viewcfg', gdb.COMMAND_USER) + super().__init__("viewcfg", gdb.COMMAND_USER) def invoke(self, _arg, _from_tty): # pylint: disable=bad-option-value,no-self-use - """ Called by GDB when viewcfg command is invoked """ + """Called by GDB when viewcfg command is invoked""" try: frame = 
gdb.selected_frame() arch = frame.architecture().name() - if arch.startswith('i386'): - target_name = 'x86' - elif arch.startswith('arm'): - target_name = 'arm' + if arch.startswith("i386"): + target_name = "x86" + elif arch.startswith("arm"): + target_name = "arm" + elif arch.startswith("sparc"): + target_name = "sparc" else: - raise RuntimeError(f'unknown platform: {arch}') - assembly_lines = gdb.execute('disassemble', from_tty=False, to_string=True).split('\n') - function_name, basic_blocks = asm2cfg.parse_lines(assembly_lines, gdb.parameter('skipcalls'), - target_name) - asm2cfg.draw_cfg(function_name, basic_blocks, view=True) + raise RuntimeError(f"unknown platform: {arch}") + assembly_lines = gdb.execute("disassemble", from_tty=False, to_string=True).split("\n") + analyzer = Analyzer(config=target_name + " GDB") + analyzer.parse_lines(assembly_lines) + Drawer(analyzer.configuration).view_cfg(analyzer.function_name, analyzer.basic_blocks) # Catch error coming from GDB side before other errors. 
except gdb.error as ex: raise gdb.GdbError(ex) @@ -83,15 +84,15 @@ class SaveCfg(gdb.Command): # pylint: disable=too-few-public-methods """ def __init__(self): - super().__init__('savecfg', gdb.COMMAND_USER) + super().__init__("savecfg", gdb.COMMAND_USER) def invoke(self, _arg, _from_tty): # pylint: disable=no-self-use - """ Called by GDB when savecfg command is invoked """ + """Called by GDB when savecfg command is invoked""" try: - assembly_lines = gdb.execute('disassemble', from_tty=False, to_string=True).split('\n') - function_name, basic_blocks = asm2cfg.parse_lines(assembly_lines, gdb.parameter('skipcalls'), - 'x86') - asm2cfg.draw_cfg(function_name, basic_blocks, view=False) + assembly_lines = gdb.execute("disassemble", from_tty=False, to_string=True).split("\n") + analyzer = Analyzer(config="x86 GDB") + analyzer.parse_lines(assembly_lines) + Drawer(analyzer.configuration).view_cfg(analyzer.function_name, analyzer.basic_blocks) # Catch error coming from GDB side before other errors. 
except gdb.error as ex: raise gdb.GdbError(ex) From f6777be0914581b6e6981e02c3d214b3fa5bd454 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 3 May 2023 12:16:05 +0200 Subject: [PATCH 05/31] Development: exclude log files in .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index e469d1c..2ee9845 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ coverage.xml # Test files a.out main.pdf + +# Log files +*.log From 5a226bf313cb2087b1a5b94a67a8bc4dd98801ce Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 3 May 2023 12:10:23 +0200 Subject: [PATCH 06/31] ocgraph: Complete refactoring (architecture/design/code) * Apply appropriate file structure * Apply approprate folder structure * Use internal configuration * Add sparc architecture target * Set up proper api interfaces (intern/extern) * Prepare for multi-packages (ocgraph/ocrecord) * Apply pylint coding advices * Apply pre-commit hooks advices --- {src/asm2cfg => ocgraph}/__init__.py | 0 ocgraph/__main__.py | 86 +++ ocgraph/configuration/__init__.py | 0 .../configuration/architecture/__init__.py | 0 .../architecture/architecture.py | 47 ++ ocgraph/configuration/architecture/arm.py | 52 ++ ocgraph/configuration/architecture/sparc.py | 96 +++ ocgraph/configuration/architecture/x86.py | 37 + ocgraph/configuration/configuration.py | 109 +++ .../configuration/disassembler/__init__.py | 0 .../disassembler/disassembler.py | 44 ++ .../configuration/disassembler/gdb_default.py | 214 ++++++ .../disassembler/objdump_default.py | 221 ++++++ ocgraph/data/__init__.py | 0 ocgraph/data/address.py | 26 + ocgraph/data/basic_block.py | 40 ++ ocgraph/data/encoding.py | 20 + ocgraph/data/instruction.py | 92 +++ ocgraph/data/jump_table.py | 51 ++ ocgraph/interface/__init__.py | 0 ocgraph/interface/analyzer.py | 177 +++++ ocgraph/interface/drawer.py | 118 ++++ setup.cfg | 3 +- src/asm2cfg/__main__.py | 9 - src/asm2cfg/asm2cfg.py | 630 ------------------ 
src/asm2cfg/command_line.py | 27 - 26 files changed, 1431 insertions(+), 668 deletions(-) rename {src/asm2cfg => ocgraph}/__init__.py (100%) create mode 100755 ocgraph/__main__.py create mode 100644 ocgraph/configuration/__init__.py create mode 100644 ocgraph/configuration/architecture/__init__.py create mode 100755 ocgraph/configuration/architecture/architecture.py create mode 100755 ocgraph/configuration/architecture/arm.py create mode 100755 ocgraph/configuration/architecture/sparc.py create mode 100755 ocgraph/configuration/architecture/x86.py create mode 100755 ocgraph/configuration/configuration.py create mode 100644 ocgraph/configuration/disassembler/__init__.py create mode 100755 ocgraph/configuration/disassembler/disassembler.py create mode 100755 ocgraph/configuration/disassembler/gdb_default.py create mode 100755 ocgraph/configuration/disassembler/objdump_default.py create mode 100644 ocgraph/data/__init__.py create mode 100755 ocgraph/data/address.py create mode 100755 ocgraph/data/basic_block.py create mode 100755 ocgraph/data/encoding.py create mode 100755 ocgraph/data/instruction.py create mode 100755 ocgraph/data/jump_table.py create mode 100644 ocgraph/interface/__init__.py create mode 100755 ocgraph/interface/analyzer.py create mode 100755 ocgraph/interface/drawer.py delete mode 100644 src/asm2cfg/__main__.py delete mode 100644 src/asm2cfg/asm2cfg.py delete mode 100644 src/asm2cfg/command_line.py diff --git a/src/asm2cfg/__init__.py b/ocgraph/__init__.py similarity index 100% rename from src/asm2cfg/__init__.py rename to ocgraph/__init__.py diff --git a/ocgraph/__main__.py b/ocgraph/__main__.py new file mode 100755 index 0000000..7d8f78c --- /dev/null +++ b/ocgraph/__main__.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Let this module be executed from the command line with python -m ocgraph +from root of the project +""" +import argparse + +from .interface.analyzer import Analyzer +from .interface.drawer import Drawer +from 
.interface.coverage_reader import CoverageReader + +from .configuration.configuration import OcGraphConfiguration + + +def print_assembly(basic_blocks): + """Debug function to print the assembly.""" + for basic_block in basic_blocks.values(): + print(basic_block) + + +def read_lines(file_path) -> list[str]: + """Read lines from the file and return then as a list.""" + with open(file_path, "r", encoding="utf8") as asm_file: + lines = asm_file.readlines() + return lines + + +def main(): + """Command-line entry point to the program.""" + parser = argparse.ArgumentParser(description="Assembly to Control-Flow-Graph rendering.") + + parser.add_argument( + "-f", + "--file", + help="Disassembled object file", + required=True, + ) + parser.add_argument( + "-d", + "--diss", + help="Disassembler option", + required=True, + choices=OcGraphConfiguration.disassemblers(), + ) + parser.add_argument( + "-a", + "--arch", + help="Architecture option", + required=True, + choices=OcGraphConfiguration.architectures(), + ) + + parser.add_argument("-c", "--coverage", help="Coverage file for printing coverage") + parser.add_argument("-v", "--view", action="store_true", help="View as a dot graph") + parser.add_argument("-o", "--output", help="Target output filename") + parser.add_argument( + "-l", + "--logger", + choices=OcGraphConfiguration.loggers(), + default="default", + help="Logging mechanism preset", + ) + args = parser.parse_args() + + # Create configuration + config = OcGraphConfiguration( + disassembler=args.diss, arch=args.arch, logging_preset=args.logger + ) + + lines = read_lines(args.assembly_file) + + analyser = Analyzer(config=config) + analyser.parse_lines(lines=lines) + + if args.coverage: + cov_reader = CoverageReader(instructions=analyser.instructions, config=config) + cov_reader.update_by_csv(args.coverage) + + drawer = Drawer(analyser.configuration) + drawer.draw_cfg( + name=analyser.function_name, basic_blocks=analyser.basic_blocks, output=args.output + ) + + +if 
__name__ == "__main__": + main() diff --git a/ocgraph/configuration/__init__.py b/ocgraph/configuration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/configuration/architecture/__init__.py b/ocgraph/configuration/architecture/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/configuration/architecture/architecture.py b/ocgraph/configuration/architecture/architecture.py new file mode 100755 index 0000000..a7e2206 --- /dev/null +++ b/ocgraph/configuration/architecture/architecture.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +"""Contains all necessary functions for a TargetInfo class.""" + +from abc import ABC, abstractmethod + +from ...data.instruction import Instruction + + +class Architecture(ABC): + """TargetInfo Class""" + + def __init__(self): + pass + + @abstractmethod + def comment(self) -> str: + """Return how comments starts in the disassembly""" + raise NotImplementedError() + + @abstractmethod + def is_call(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is a call""" + raise NotImplementedError() + + @abstractmethod + def is_jump(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is a jump""" + raise NotImplementedError() + + def get_jump_delay(self, instruction: Instruction) -> int | None: + """Return the jump delay of an instruction or None if not a jump""" + return 1 if self.is_jump(instruction) else None + + @abstractmethod + def is_direct_jump(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is a direct jump""" + raise NotImplementedError() + + @abstractmethod + def is_branch(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is a conditional jump""" + raise NotImplementedError() + + @abstractmethod + def is_sink(self, instruction: Instruction) -> bool: + """Return if disassembled instruction serves as sink (e.g. 
ret)""" + raise NotImplementedError() diff --git a/ocgraph/configuration/architecture/arm.py b/ocgraph/configuration/architecture/arm.py new file mode 100755 index 0000000..8feca00 --- /dev/null +++ b/ocgraph/configuration/architecture/arm.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" Contains instruction info for ARM-compatible targets. """ + +import re + +from .architecture import Architecture +from ...data.instruction import Instruction + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" + HEX_PATTERN + + +class ArmArchitecture(Architecture): + """ArmArchitecture Class""" + + def comment(self): + return ";" + + def is_call(self, instruction: Instruction): + # Various flavors of call: + # bl 0x19d90 <_IO_vtable_check> + # Note that we should be careful to not mix it with conditional + # branches like 'ble'. + return instruction.opcode.startswith("bl") and instruction.opcode not in ( + "blt", + "ble", + "bls", + ) + + def is_jump(self, instruction: Instruction): + return instruction.opcode[0] == "b" and not self.is_call(instruction) + + def is_direct_jump(self, instruction: Instruction): + return self.is_jump(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) + + def is_branch(self, instruction: Instruction): + return instruction.opcode == "b" + + def is_sink(self, instruction: Instruction): + """ + Is this an instruction which terminates function execution e.g. return? + Detect various flavors of return like + bx lr + pop {r2-r6,pc} + Note that we do not consider conditional branches (e.g. 'bxle') to sink. 
+ """ + return ( + re.search(r"\bpop\b.*\bpc\b", instruction.body) + or (instruction.opcode == "bx" and instruction.ops[0] == "lr") + or instruction.opcode == "udf" + ) diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py new file mode 100755 index 0000000..8494b0c --- /dev/null +++ b/ocgraph/configuration/architecture/sparc.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +"""Contains instruction info for Sparc-compatible targets.""" + +from .architecture import Architecture +from ...data.instruction import Instruction + + +# fmt: off +sparc_v8_Bicc_opcodes = [ + # conditional icc branch opcodes + "ba", "bn", "bne", "be", "bg", "ble", "bge", "bl", "bgu", "bleu", "bcc", + "bcs", "bpos", "bneg", "bvc", "bvs", +] + +sparc_v8_FBfcc_opcodes = [ + # conditional fcc branch opcodes + "fba", "fbn", "fbu", "fbg", "fbug", "fbl", "fbul", "fblg", "fbne", "fbe", + "fbue", "fbge", "fbuge", "fble", "fbule", "fbo", +] + +sparc_v8_CBfcc_opcodes = [ + # conditional coprocessor opcodes + "cba", "cbn", "cb3", "cb2", "cb23", "cb1", "cb13", "cb12", "cb123", "cb0", + "cb03", "cb02", "cb023", "cb01", "cb013", "cb012", +] + +sparc_v8_Ticc_opcodes = [ + # condictional traps on icc + "ta", "tn", "tne", "te", "tg", "tle", "tge", "tl", "tgu", "tleu", "tcc", + "tcs", "tpos", "tneg", "tvc", "tvs", +] + +sparc_v8_branch_cond_delay_opcodes = [ + f"{x},a" for x in + sparc_v8_Bicc_opcodes + + sparc_v8_FBfcc_opcodes + + sparc_v8_CBfcc_opcodes +] + +sparc_v8_remaining_jump_opcodes = [ + "jmpl", "jmp", "b", # "call", "ret", retl not regarded currently +] + +sparc_v8_delayed_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes + \ + sparc_v8_remaining_jump_opcodes + +sparc_v8_jump_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_Ticc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes + \ + sparc_v8_remaining_jump_opcodes + 
+sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes +# fmt: on + + +class SparcArchitecture(Architecture): + """SparcArchitecture Class""" + + def comment(self): + return "!" + + def is_call(self, instruction: Instruction): + return instruction.opcode == "call" + + def is_jump(self, instruction: Instruction): + return instruction.opcode in sparc_v8_jump_opcodes + + def get_jump_delay(self, instruction: Instruction) -> int | None: + delay = None + if instruction.opcode in sparc_v8_delayed_opcodes: + delay = 2 + elif self.is_sink(instruction): + delay = 2 + else: + delay = 1 + return delay + + def is_direct_jump(self, instruction: Instruction): + # every jump is disassembled with the complete offset + return self.is_jump(instruction) + + def is_branch(self, instruction: Instruction): + return instruction.opcode in sparc_v8_branch_opcodes + + def is_sink(self, instruction: Instruction): + # ret: return from subroutine + # retl: return from leaf subroutine + return instruction.opcode in ["ret", "retl"] diff --git a/ocgraph/configuration/architecture/x86.py b/ocgraph/configuration/architecture/x86.py new file mode 100755 index 0000000..9970985 --- /dev/null +++ b/ocgraph/configuration/architecture/x86.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +"""Contains instruction info for X86-compatible targets.""" + +import re + +from .architecture import Architecture +from ...data.instruction import Instruction + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN + + +class X86Architecture(Architecture): + """X86Architecture Class""" + + def comment(self): + return "#" + + def is_call(self, instruction: Instruction): + # Various flavors of call: + # call *0x26a16(%rip) + # call 0x555555555542 + # addr32 call 0x55555558add0 + return "call" in instruction.opcode + + def is_jump(self, instruction: Instruction): + return instruction.opcode[0] == "j" + + def is_direct_jump(self, instruction: Instruction): + return self.is_jump(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) + + def is_branch(self, instruction: Instruction): + return instruction.opcode.startswith("jmp") + + def is_sink(self, instruction: Instruction): + return instruction.opcode.startswith("ret") diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py new file mode 100755 index 0000000..fe25c6b --- /dev/null +++ b/ocgraph/configuration/configuration.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +"""Module for configuration of the ocgraph package.""" +import logging + +from .architecture.architecture import Architecture +from .architecture.x86 import X86Architecture +from .architecture.arm import ArmArchitecture +from .architecture.sparc import SparcArchitecture + +from .disassembler.disassembler import Disassembler +from .disassembler.objdump_default import ObjDumpDisassembler +from .disassembler.gdb_default import GdbDisassembler + +# fmt: off +disassembler_option: dict[str, Disassembler] = { + "OBJDUMP": ObjDumpDisassembler(), + "GDB": GdbDisassembler(), +} + +architecture_option: dict[str, dict] = { + "x86": { + "platform": "X86", + "architecture": X86Architecture(), + }, + "arm": { + "platform": "ARM", + "architecture": ArmArchitecture(), + }, + "sparc": { + "platform": "SPARC", + "architecture": SparcArchitecture(), + }, +} + +preset_logging: dict[str, dict] = { + "development": { + "file_log": "debug.log", + "file_level": logging.DEBUG, + "console_log": True, + "console_level": 
logging.DEBUG, + }, + "module": { + "file_log": None, + "file_level": logging.ERROR, + "console_log": False, + "console_level": logging.ERROR, + }, + "default": { + "file_log": "asm2fg.log", + "file_level": logging.INFO, + "console_log": True, + "console_level": logging.INFO, + }, +} +# fmt: on + + +class OcGraphConfiguration: + """Implement configuration presets for the ASM2CFG tool.""" + + def __init__( + self, arch: str = "sparc", disassembler: str = "objdump", logging_preset="default" + ): + if architecture_option.get(arch) is None: + raise NotImplementedError("Architecture option not supported!") + if disassembler_option.get(disassembler) is None: + raise NotImplementedError("Disassembler option not supported!") + if preset_logging.get(logging_preset) is None: + raise NotImplementedError("Logging preset not supported!") + + # load module preset + _preset = architecture_option[arch] + _preset["disassembler"] = disassembler_option.get(disassembler) + self.__dict__ = _preset + + # configure logging + log_config = preset_logging.get(logging_preset) + if log_config["file_log"]: + file_stream: logging.FileHandler = logging.FileHandler(log_config["file_log"]) + file_stream.setLevel(log_config["file_level"]) + self.logger.addHandler(file_stream) + if log_config["console_log"]: + console_stream: logging.StreamHandler = logging.StreamHandler() + console_stream.setLevel(log_config["console_level"]) + self.logger.addHandler(console_stream) + + @staticmethod + def architectures(): + """Return all available architectures options""" + return architecture_option.keys() + + @staticmethod + def disassemblers(): + """Return all available disassemblers options""" + return disassembler_option.keys() + + @staticmethod + def loggers(): + """Return all available disassemblers options""" + return preset_logging.keys() + + logger: logging.Logger = logging.Logger("OcGraph") + """Logging mechanism for module""" + + architecture: Architecture + """Target architecture instance""" + + 
disassembler: Disassembler + """Target disassembler tool like OBJDump, GDB, ...""" diff --git a/ocgraph/configuration/disassembler/__init__.py b/ocgraph/configuration/disassembler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/configuration/disassembler/disassembler.py b/ocgraph/configuration/disassembler/disassembler.py new file mode 100755 index 0000000..b6d444b --- /dev/null +++ b/ocgraph/configuration/disassembler/disassembler.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GTDGmbH +# Copyright 2023 by GTD GmbH. +"""Class configuring the used disassembler tool.""" + +from abc import ABC, abstractmethod + +from ..architecture.sparc import SparcArchitecture +from ...data.instruction import Instruction + + +class DisassemblerError(Exception): + """Raised when the extract_information method was not successful.""" + + +class Disassembler(ABC): + """Disassembler Class""" + + def __init__(self): + self.architecture = SparcArchitecture() + + name: str = "" + """ Disassembler tool identification like SparcV8Objdump, GDB, ...""" + + @abstractmethod + def extract_information(self, str_input: str) -> dict[str, str]: + """Specification of the extracted information. 
Required attributes are: + * address = instruction location in the binary + * location: instruction address location + * instr_d: instruction in disassembled format + * instr_h: instruction in hex-notation + * opcode: instruction opcode + * printable: a printable line of the collected information + """ + raise NotImplementedError() + + @abstractmethod + def parse_function_header(self, line: str) -> str | None: + """Return function name of memory range from the given string line.""" + raise NotImplementedError() + + @abstractmethod + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """Parses a single line of assembly to create Instruction instance""" diff --git a/ocgraph/configuration/disassembler/gdb_default.py b/ocgraph/configuration/disassembler/gdb_default.py new file mode 100755 index 0000000..ef4ac80 --- /dev/null +++ b/ocgraph/configuration/disassembler/gdb_default.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Class for parsing the input""" + +import re +from typing import List + +from ...data.address import Address +from ...data.encoding import Encoding +from ...data.instruction import Instruction + +from .disassembler import Disassembler, DisassemblerError + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN + + +class GdbDisassembler(Disassembler): + """x86 GDB disassembler""" + + name: str = "Default GDB Disassembler (x86 Binutils)" + + # Expected format: <>: + regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" + regex_information: dict[str, int] = { + "address": 1, + "location": 3, + "instruction_hex": 4, + "instruction_str": 5, + } + + def extract_information(self, str_input: str) -> dict[str, str]: + result = {} + + if "=> " in str_input: + extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0] + + information = re.search(self.regex, extracted_line) + if not information: + raise DisassemblerError("Line not processable: \n" + str(extracted_line)) + + address = str(information.group(self.regex_information.get("address"))) + location = str(information.group(self.regex_information.get("location"))) + instr_d = str(information.group(self.regex_information.get("instruction_str"))) + instr_h = str(information.group(self.regex_information.get("instruction_hex"))) + opcode = (instr_d.split()[0],) + + result = { + "address": address, + "location": location, + "instr_h": instr_h, + "instr_d": instr_d, + "opcode": opcode, + "printable": extracted_line, + } + else: + raise DisassemblerError("Line not processable: \n" + str(str_input)) + return result + + def parse_function_header(self, line: str) -> str | None: + """ + Return function name of memory range from the given string line. 
+ + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for objdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + objdump_name_pattern = re.compile(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:") + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + function_name_pattern = re.compile(r"function (\w+):$") + function_name = function_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + memory_range_pattern = re.compile( + rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$" + ) + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return f"{memory_range[1]}-{memory_range[2]}" + + return None + + def parse_address(self, line: str) -> (Address, str): + """ + Parses leading address of instruction + """ + address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line) + if address_match is None: + return None, line + address = Address( + int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None + ) + return address, address_match[3] + + def split_nth(self, string: str, count: int) -> List[str]: + """ + Splits string to equally-sized chunks + """ + return [string[i : i + count] for i in range(0, len(string), count)] + + def parse_encoding(self, line): + """ + Parses byte encoding of instruction for objdump disassemblies + e.g. the '31 c0' in + '16bd3: 31 c0 xor %eax,%eax' + In addition to X86 supports ARM encoding styles: + '4: e1a01000 mov r1, r0' + '50: f7ff fffe bl 0 <__aeabi_dadd>' + '54: 0002 movs r2, r0' + """ + # Encoding is separated from assembly mnemonic via tab + # so we allow whitespace separators between bytes + # to avoid accidentally matching the mnemonic. 
+ enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + if enc_match is None: + return None, line + bites = [] + for chunk in enc_match[1].strip().split(" "): + bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2)) + return Encoding(bites), enc_match[2] + + def parse_body(self, line: str) -> (str, str, List[str], str): + """Parses instruction body (opcode and operands)""" + comment_symbol = self.architecture.comment() + body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line) + if body_match is None: + return None, None, None, line + body = body_match[1].strip() + line = body_match[2] + opcode_match = re.match(r"^(\S*)\s*(.*)", body) + if opcode_match is None: + return None, None, None, line + opcode = opcode_match[1] + ops = opcode_match[2].split(",") if opcode_match[2] else [] + return body, opcode, ops, line + + def parse_target(self, line: str) -> (Address, str): + """ + Parses optional instruction branch target hint + """ + target_match = re.match(r"\s*<([a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) + if target_match is None: + return None, line + offset = target_match[2] or "+0" + address = Address(None, target_match[1], int(offset, 0)) + return address, target_match[3] + + def parse_comment(self, line: str) -> (Address, str): + """ + Parses optional instruction comment + """ + comment_symbol = self.architecture.comment() + comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line) + if comment_match is None: + return None, line + comment = comment_match[1] + imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment) + if imm_match is None: + # If no imm was found, ignore the comment. 
+ # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """Parses a single line of assembly to create Instruction instance""" + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) diff --git a/ocgraph/configuration/disassembler/objdump_default.py b/ocgraph/configuration/disassembler/objdump_default.py new file mode 100755 index 0000000..dba05dd --- /dev/null +++ b/ocgraph/configuration/disassembler/objdump_default.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +"""Class for parsing the input""" + +import re +from typing import List + +from ...data.address import Address +from ...data.encoding import Encoding +from ...data.instruction import Instruction + +from .disassembler import Disassembler, DisassemblerError + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN + + +class ObjDumpDisassembler(Disassembler): + """Objdump disassembler""" + + name: str = "Default Objdump Disassembler (SparcV8 Binutils)" + + # Expected format: <>: + regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" + regex_information: dict[str, int] = { + "address": 1, + "location": 3, + "instruction_hex": 4, + "instruction_str": 5, + } + + def extract_information(self, str_input: str) -> dict[str, str]: + result = {} + + if "=> " in str_input: + extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0] + + information = re.search(self.regex, extracted_line) + if not information: + raise DisassemblerError("Line not processable: \n" + str(extracted_line)) + + address = str(information.group(self.regex_information.get("address"))) + location = str(information.group(self.regex_information.get("location"))) + instr_d = str(information.group(self.regex_information.get("instruction_str"))) + instr_h = str(information.group(self.regex_information.get("instruction_hex"))) + opcode = (instr_d.split()[0],) + + result = { + "address": address, + "location": location, + "instr_h": instr_h, + "instr_d": instr_d, + "opcode": opcode, + "printable": extracted_line, + } + else: + raise DisassemblerError("Line not processable: \n" + str(str_input)) + + return result + + def parse_function_header(self, line: str) -> str | None: + """ + Return function name of memory range from the given string line. 
+ + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for objdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + objdump_name_pattern = re.compile(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:") + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + function_name_pattern = re.compile(r"function (\w+):$") + function_name = function_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + memory_range_pattern = re.compile( + rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$" + ) + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return f"{memory_range[1]}-{memory_range[2]}" + + return None + + def parse_address(self, line: str) -> (Address, str): + """ + Parses leading address of instruction + """ + address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line) + if address_match is None: + return None, line + address = Address( + int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None + ) + return address, address_match[3] + + def split_nth(self, string: str, count: int) -> List[str]: + """ + Splits string to equally-sized chunks + """ + return [string[i : i + count] for i in range(0, len(string), count)] + + def parse_encoding(self, line): + """ + Parses byte encoding of instruction for objdump disassemblies + e.g. the '31 c0' in + '16bd3: 31 c0 xor %eax,%eax' + In addition to X86 supports ARM encoding styles: + '4: e1a01000 mov r1, r0' + '50: f7ff fffe bl 0 <__aeabi_dadd>' + '54: 0002 movs r2, r0' + """ + # Encoding is separated from assembly mnemonic via tab + # so we allow whitespace separators between bytes + # to avoid accidentally matching the mnemonic. 
+ enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + if enc_match is None: + return None, line + bites = [] + for chunk in enc_match[1].strip().split(" "): + bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2)) + return Encoding(bites), enc_match[2] + + def parse_body(self, line: str) -> (str, str, List[str], str): + """Parses instruction body (opcode and operands)""" + comment_symbol = self.architecture.comment() + body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line) + if body_match is None: + return None, None, None, line + body = body_match[1].strip() + line = body_match[2] + opcode_match = re.match(r"^(\S*)\s*(.*)", body) + if opcode_match is None: + return None, None, None, line + opcode = opcode_match[1] + ops = opcode_match[2].split(",") if opcode_match[2] else [] + return body, opcode, ops, line + + def parse_target(self, line: str) -> (Address, str): + """ + Parses optional instruction branch target hint + """ + target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) + if target_match is None: + return None, line + offset = target_match[2] or "+0" + address = Address(None, target_match[1], int(offset, 0)) + return address, target_match[3] + + def parse_comment(self, line: str) -> (Address, str): + """ + Parses optional instruction comment + """ + comment_symbol = self.architecture.comment() + comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line) + if comment_match is None: + return None, line + comment = comment_match[1] + imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment) + if imm_match is None: + # If no imm was found, ignore the comment. 
+ # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """ + Parses a single line of assembly to create Instruction instance + """ + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + encoding, line = self.parse_encoding(line) + if not line: + return encoding + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) diff --git a/ocgraph/data/__init__.py b/ocgraph/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/data/address.py b/ocgraph/data/address.py new file mode 100755 index 0000000..cd60b03 --- /dev/null +++ b/ocgraph/data/address.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +"""Represents location in program which may be absolute or relative""" + + +class Address: + """Address Class""" + + def __init__(self, abs_addr: int, base: int = None, offset: int = None): + self.abs = abs_addr + self.base = base + self.offset = offset + + def is_absolute(self): + """Return if address is absolute""" + return self.base is None + + def is_relative(self): + """Return if address is relative""" + return not 
self.is_absolute() + + def __str__(self): + if self.offset is not None: + return f"0x{self.abs:x} ({self.base}+0x{self.offset:x})" + if isinstance(self.abs, int): + return f"0x{self.abs:x}" + return str(self.abs) diff --git a/ocgraph/data/basic_block.py b/ocgraph/data/basic_block.py new file mode 100755 index 0000000..71bd814 --- /dev/null +++ b/ocgraph/data/basic_block.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +"""Class to represent a node in CFG with lines of code without jump or calls instructions.""" + +from typing import List + +from .instruction import Instruction + + +class BasicBlock: + """Basic Block Class""" + + def __init__(self, key): + self.key = key + self.instructions: List[Instruction] = [] + self.jump_edge = None + self.no_jump_edge = None + + def add_instruction(self, instruction): + """Add instruction to this block.""" + self.instructions.append(instruction) + + def add_jump_edge(self, basic_block_key) -> None: + """Add jump target block to this block.""" + if isinstance(basic_block_key, BasicBlock): + self.jump_edge = basic_block_key.key + else: + self.jump_edge = basic_block_key + + def add_no_jump_edge(self, basic_block_key) -> None: + """Add no jump target block to this block.""" + if isinstance(basic_block_key, BasicBlock): + self.no_jump_edge = basic_block_key.key + else: + self.no_jump_edge = basic_block_key + + def __str__(self) -> str: + return "\n".join([i.text for i in self.instructions]) + + def __repr__(self) -> str: + return "\n".join([i.text for i in self.instructions]) diff --git a/ocgraph/data/encoding.py b/ocgraph/data/encoding.py new file mode 100755 index 0000000..e6603b6 --- /dev/null +++ b/ocgraph/data/encoding.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +""" +Represents a sequence of bytes used for instruction encoding +e.g. 
the '31 c0' in +'16bd3: 31 c0 xor %eax,%eax' +""" + + +class Encoding: + """Encoding Class""" + + def __init__(self, bites): + self.bites = bites + + def size(self): + """Return size of the bytes""" + return len(self.bites) + + def __str__(self): + return " ".join(map(lambda b: f"{b:#x}", self.bites)) diff --git a/ocgraph/data/instruction.py b/ocgraph/data/instruction.py new file mode 100755 index 0000000..4c5abb2 --- /dev/null +++ b/ocgraph/data/instruction.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""Represents a single assembly instruction with it operands, location and optional branch target""" +from enum import Enum +from typing import List + +from .address import Address + + +class Coverage(Enum): + """Enumeration for coverage records""" + + MISS = "missed" + """Indicates instruction is not executed""" + LINE_TAKEN = "taken" + """Indicates instruction is executed""" + JUMP_PASS = "skipped" + """Indicates, if branch is just passed without jump""" + JUMP_TAKEN = "jumped" + """Indicates, if branch is just jumped without passed""" + JUMP_BOTH = "both taken" + """Indicates, if branch is just jumped without passed""" + + +class Instruction: + """Instruction Class""" + + body: str = None + """Disassembled instruction code (without extra information)""" + + text: str = None + """Complete Disassembled instruction code""" + + lineno: int = None + """Line number in the file""" + + address: Address = None + """Computed address of the instruction""" + + opcode: str = None + """Disassembled opcode""" + + ops: List[str] = [] + """Disassembled operands of the instruction""" + + target: Address = None + """Optional target of the instruction (branch)""" + + returns: set[int] = set() + """Stores addresses of sink instructions like return""" + + coverage: Coverage = Coverage.MISS + """If line is executed on test""" + + branch_taken: bool = None + + def __init__(self, body, text, lineno, address, opcode, ops, target): + self.body = body + self.text = text + self.lineno = 
lineno + self.address = address + self.opcode = opcode + self.ops = ops + self.target = target + self.returns = set() + self.coverage = Coverage.MISS + + def update_coverage(self, addresses: set[int], is_branch=False) -> None: + """Update the coverage information of the instruction.""" + if not is_branch: + # exception for ret (target is None) or call (target.abs is None) + self.coverage = Coverage.LINE_TAKEN + self.returns = addresses + elif len(addresses) == 2 and self.target.abs in addresses: + self.coverage = Coverage.JUMP_BOTH + elif len(addresses) == 1 and self.target.abs in addresses: + self.coverage = Coverage.JUMP_TAKEN + elif len(addresses) == 1: + self.coverage = Coverage.JUMP_PASS + else: + raise AssertionError(f"Invalid Coverage Information at {self.address}: {addresses}") + + def __str__(self): + result = f"{self.address}: {self.opcode}" + if self.ops: + result += f" {self.ops}" + return result + + def __repr__(self) -> str: + result = f"{self.address}: {self.opcode}" + if self.ops: + result += f" {self.ops}" + return result diff --git a/ocgraph/data/jump_table.py b/ocgraph/data/jump_table.py new file mode 100755 index 0000000..6c95b52 --- /dev/null +++ b/ocgraph/data/jump_table.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +"""Holds info about branch sources and destinations in asm function.""" + +from typing import List, Dict, Set + +from .address import Address +from .instruction import Instruction +from ..configuration.configuration import OcGraphConfiguration + + +class JumpTable: + """JumpTable Class""" + + def __init__(self, instructions: List[Instruction], configuration: OcGraphConfiguration): + self.config: OcGraphConfiguration = configuration + + # Address where the jump begins and value which address + # to jump to. This also includes calls. + self.abs_sources: Dict[int, Address] = {} + self.rel_sources: Dict[int, Address] = {} + + # Addresses where jumps end inside the current function. 
+ self.abs_destinations: Set[int] = set() + self.rel_destinations: Set[int] = set() + + # Iterate over the lines and collect jump targets and branching points. + for instr in instructions: + if instr is None or not self.config.architecture.is_direct_jump(instr): + continue + + self.abs_sources[instr.address.abs] = instr.target + self.abs_destinations.add(instr.target.abs) + + self.rel_sources[instr.address.offset] = instr.target + self.rel_destinations.add(instr.target.offset) + + def is_jump_target(self, addr: Address) -> bool: + """Return if address is a destination""" + if addr.abs is not None: + return addr.abs in self.abs_destinations + if addr.offset is not None: + return addr.offset in self.rel_destinations + return False + + def get_target(self, addr: Address): + """Return the target of a branch""" + if addr.abs is not None: + return self.abs_sources.get(addr.abs) + if addr.offset is not None: + return self.rel_sources.get(addr.offset) + return None diff --git a/ocgraph/interface/__init__.py b/ocgraph/interface/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py new file mode 100755 index 0000000..9a2cdb5 --- /dev/null +++ b/ocgraph/interface/analyzer.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Class for read and analyze the input string.""" + +import sys + +from ..data.address import Address +from ..data.basic_block import BasicBlock +from ..data.encoding import Encoding +from ..data.instruction import Instruction +from ..data.jump_table import JumpTable + +from ..configuration.configuration import OcGraphConfiguration, Disassembler + + +class Analyzer: + """Analyzer Class""" + + def __init__(self, config: OcGraphConfiguration): + self.configuration = config + self.logger = self.configuration.logger + self.parser: Disassembler = self.configuration.disassembler + + self.lines: list[str] = [] + self.function_name = None + self.instructions: list[Instruction] = [] + 
self.jump_table = None + self.basic_blocks: dict[int, BasicBlock] = {} + + def parse_file(self, file_path: str): + """Parse a assembler file""" + with open(file_path, "r", encoding="utf8") as asm_file: + lines = asm_file.readlines() + self.parse_lines(lines) + + def parse_lines(self, lines): + """Parse a list of assembly lines""" + self.lines = lines + self._parse_instructions() + self._compute_jump_targets() + self._create_jump_table() + self._create_basic_blocks() + + def _parse_instructions(self): + self.instructions = [] + for num, line in enumerate(self.lines, 1): + current_function_name = self.parser.parse_function_header(line) + if current_function_name is not None: + if self.function_name is not None: + raise RuntimeError("we handle only one function for now") + self.logger.info(f"New function {current_function_name}") + self.function_name = current_function_name + continue + + instruction_or_encoding = self.parser.parse_line(line, num, self.function_name) + if isinstance(instruction_or_encoding, Encoding): + # Partial encoding for previous instruction, skip it + continue + if instruction_or_encoding is not None: + self.instructions.append(instruction_or_encoding) + continue + + if line.startswith("End of assembler dump") or not line: + continue + + if line.strip() == "": + continue + + self.logger.error(f"Unexpected assembly at line {num}:\n {line}") + sys.exit(1) + + def _compute_jump_targets(self): + # Infer target address for jump instructions + for instr in self.instructions: + if ( + instr.target is None or instr.target.abs is None + ) and self.configuration.architecture.is_direct_jump(instr): + if instr.target is None: + instr.target = Address(0) + instr.target.abs = int(instr.ops[0], 16) + + # Infer relative addresses (for objdump or stripped gdb) + start_address = self.instructions[0].address.abs + end_address = self.instructions[-1].address.abs + for instr in self.instructions: + for addr in (instr.address, instr.target): + if ( + addr is not None 
+ and addr.offset is None + and start_address <= addr.abs <= end_address + ): + addr.offset = addr.abs - start_address + + self.logger.debug("Instructions:") + for instruction in self.instructions: + if instruction is not None: + self.logger.debug(f" {instruction}") + + def _create_jump_table(self): + self.jump_table = JumpTable(self.instructions, self.configuration) + + self.logger.debug("Absolute destinations:") + for dst in self.jump_table.abs_destinations: + self.logger.debug(f" {dst:#x}") + self.logger.debug("Relative destinations:") + for dst in self.jump_table.rel_destinations: + self.logger.debug(f" {dst:#x}") + self.logger.debug("Absolute branches:") + for key, addr in self.jump_table.abs_sources.items(): + self.logger.debug(f" {key:#x} -> {addr}") + self.logger.debug("Relative branches:") + for key, addr in self.jump_table.rel_sources.items(): + self.logger.debug(f" {key:#x} -> {addr}") + + def _create_basic_blocks(self) -> None: + """ + Now iterate over the assembly again and split it to basic blocks using the branching + information from earlier. 
+ """ + self.basic_blocks = {} + + curr_basic_block: BasicBlock | None = None + # Store last block if ending with branch opcode + prev_branch_block: BasicBlock | None = None + + # block completion flag (introduced for SPARC pipeline) + block_completion: int = 0 + + for instruction in self.instructions: + # if block completion is in progress + if block_completion > 0: + block_completion -= 1 + if block_completion > 0: + self.basic_blocks[curr_basic_block.key].add_instruction(instruction) + continue + curr_basic_block = None + + # Current program counter + pc_addr = instruction.address + # Optional jump target + jump_target = self.jump_table.get_target(pc_addr) + is_branch = self.configuration.architecture.is_branch(instruction) + + # Start new blocks if last ended + if curr_basic_block is None: + # Create new basic block + self.basic_blocks[pc_addr.abs] = curr_basic_block = BasicBlock(key=pc_addr.abs) + + # if previous basic block ended in branch instruction. Add the basic + # block what follows if the jump was not taken. 
+ if prev_branch_block is not None: + prev_branch_block.add_no_jump_edge(curr_basic_block) + prev_branch_block = None + # or if current address is a jump target + elif self.jump_table.is_jump_target(pc_addr): + closing_block = curr_basic_block + self.basic_blocks[pc_addr.abs] = curr_basic_block = BasicBlock(key=pc_addr.abs) + closing_block.add_no_jump_edge(pc_addr.abs) + + curr_basic_block.add_instruction(instruction) + + # End current block if current opcode is a jump/branch/sink + if jump_target: + curr_basic_block.add_jump_edge(jump_target.abs) + prev_branch_block = curr_basic_block if is_branch else None + block_completion = self.configuration.architecture.get_jump_delay(instruction) + elif self.configuration.architecture.is_sink(instruction): + block_completion = self.configuration.architecture.get_jump_delay(instruction) + prev_branch_block = None + + if prev_branch_block is not None: + # If last instruction of the function is jump/call, then add dummy + # block to designate end of the function. 
+ end_instruction = Instruction("", "end of function", 0, None, None, [], None) + end_block = BasicBlock("end_of_function") + end_block.add_instruction(end_instruction) + prev_branch_block.add_no_jump_edge(end_block.key) + self.basic_blocks[end_block.key] = end_block diff --git a/ocgraph/interface/drawer.py b/ocgraph/interface/drawer.py new file mode 100755 index 0000000..55c6654 --- /dev/null +++ b/ocgraph/interface/drawer.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Class for drawing the output.""" + +import tempfile +from typing import Dict + +from graphviz import Digraph + +from ..configuration.configuration import OcGraphConfiguration +from ..data.basic_block import BasicBlock +from ..data.instruction import Coverage + +coverage_color = { + Coverage.MISS: "#f08080", # light coral + Coverage.LINE_TAKEN: "#90ee90", # light green + Coverage.JUMP_BOTH: "#90ee90", # light green + Coverage.JUMP_PASS: "#fdfd96", # pastel yellow + Coverage.JUMP_TAKEN: "#fdfd96", # pastel yellow +} + + +class Drawer: + """Drawer Class""" + + def __init__(self, config: OcGraphConfiguration, graph_options: dict = None) -> None: + self.config = config + self.graph_option = graph_options if graph_options else {} + + def set_graph_option(self, graph_options: dict = None) -> None: + """Set new graph options""" + self.graph_option = graph_options + + @staticmethod + def _escape(text: str) -> str: + """ + Escape used dot graph characters in given instruction so they will be + displayed correctly. 
+ """ + text = text.replace("<", r"[") + text = text.replace(">", r"]") + text = text.replace("\t", " ") + return text + + def _create_label(self, basic_block: BasicBlock, line_coverage=False): + """Create annotated graph label""" + label = "" + returns = set() + + # start label + label += '< \n' + # for each instruction in block + for instr in basic_block.instructions: + bg_color = coverage_color[instr.coverage] if line_coverage else "white" + label += ( + "" + f'\n" + ) + if self.config.architecture.is_sink(instr): + returns = instr.returns + for return_addr in returns: + if isinstance(return_addr, int): + returns.remove(return_addr) + returns.add(f"0x{return_addr:x}") + + # add JUMP/NO JUMP cells with dot PORT navigation + cells = [basic_block.jump_edge, basic_block.no_jump_edge] + span = 3 - len([x for x in cells if x is not None]) # 3 - count of trues in cells + + label += "" + if basic_block.no_jump_edge: + label += f'' + if basic_block.jump_edge: + label += f'' + if not basic_block.jump_edge and not basic_block.no_jump_edge: + label += f'' + label += " \n
' + f"0x{instr.address.abs:x}: {Drawer._escape(text=instr.text)}" + "
NO JUMPJUMPRETURN targets: {str(returns)}
>" + return label + + def _create_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock], coverage=False): + """Create a cgf""" + dot = Digraph(name=name, comment=name, engine="dot") + dot.attr("graph", label=name) + + # Create nodes in graph + for address, basic_block in basic_blocks.items(): + key = str(address) + label = self._create_label(basic_block, coverage) + dot.node(name=key, label=label, shape="plaintext", **self.graph_option) + + # Create edges in graph + for basic_block in basic_blocks.values(): + if basic_block.jump_edge: + dot.edge(f"{basic_block.key}:jump", str(basic_block.jump_edge)) + if basic_block.no_jump_edge: + dot.edge(f"{basic_block.key}:pass", str(basic_block.no_jump_edge)) + return dot + + def view_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock]): + """view a function graph""" + dot = self._create_cfg(name, basic_blocks) + dot.format = "gv" + with tempfile.NamedTemporaryFile(mode="w+b", prefix=name) as filename: + dot.view(filename.name) + print(f"Opening a file {filename.name}.{dot.format} with default viewer.") + + def draw_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock], output: str = None): + """Draw a function graph""" + dot = self._create_cfg(name, basic_blocks, coverage=True) + + filename = output if output else name + dot.format = "pdf" + dot.render(filename=filename, cleanup=True) + self.config.logger.info(f"Saved CFG to a file {name}.{dot.format}") + + dot.format = "gv" + dot.render(filename=filename, cleanup=True) diff --git a/setup.cfg b/setup.cfg index c25a957..8917b85 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,5 +41,4 @@ logging-format-style = new expected-line-ending-format = LF include-naming-hint = yes ignored-modules = gdb # Ignore because of the GDB integration -notes= # disable warnings for TODO, FIXME etc. 
-disable=bad-option-value,missing-function-docstring,no-self-use,too-many-instance-attributes,too-many-arguments,too-many-locals,too-many-branches,too-many-statements +disable=duplicate-code,too-many-instance-attributes,too-many-arguments diff --git a/src/asm2cfg/__main__.py b/src/asm2cfg/__main__.py deleted file mode 100644 index e715118..0000000 --- a/src/asm2cfg/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Let this module to be executed from the command line with python -m src.asm2cfg -from root of the project -""" - -from . import command_line - -if __name__ == '__main__': - command_line.main() diff --git a/src/asm2cfg/asm2cfg.py b/src/asm2cfg/asm2cfg.py deleted file mode 100644 index 7a93bb4..0000000 --- a/src/asm2cfg/asm2cfg.py +++ /dev/null @@ -1,630 +0,0 @@ -""" -Module containing main building blocks to parse assembly and draw CFGs. -""" - -import re -import sys -import tempfile - -from graphviz import Digraph - - -# TODO: make this a command-line flag -VERBOSE = 0 - - -def escape(instruction): - """ - Escape used dot graph characters in given instruction so they will be - displayed correctly. - """ - instruction = instruction.replace('<', r'\<') - instruction = instruction.replace('>', r'\>') - instruction = instruction.replace('|', r'\|') - instruction = instruction.replace('{', r'\{') - instruction = instruction.replace('}', r'\}') - instruction = instruction.replace(' ', ' ') - return instruction - - -class BasicBlock: - """ - Class to represent a node in CFG with straight lines of code without jump - or calls instructions. - """ - - def __init__(self, key): - self.key = key - self.instructions = [] - self.jump_edge = None - self.no_jump_edge = None - - def add_instruction(self, instruction): - """ - Add instruction to this block. - """ - self.instructions.append(instruction) - - def add_jump_edge(self, basic_block_key): - """ - Add jump target block to this block. 
- """ - if isinstance(basic_block_key, BasicBlock): - self.jump_edge = basic_block_key.key - else: - self.jump_edge = basic_block_key - - def add_no_jump_edge(self, basic_block_key): - """ - Add no jump target block to this block. - """ - if isinstance(basic_block_key, BasicBlock): - self.no_jump_edge = basic_block_key.key - else: - self.no_jump_edge = basic_block_key - - def get_label(self): - """ - Return content of the block for dot graph. - """ - # Left align in dot. - label = r'\l'.join([escape(i.text) for i in self.instructions]) - # Left justify the last line too. - label += r'\l' - if self.jump_edge: - if self.no_jump_edge: - label += '|{No Jump|Jump}' - else: - label += '|{Jump}' - return '{' + label + '}' - - def __str__(self): - return '\n'.join([i.text for i in self.instructions]) - - def __repr__(self): - return '\n'.join([i.text for i in self.instructions]) - - -def print_assembly(basic_blocks): - """ - Debug function to print the assembly. - """ - for basic_block in basic_blocks.values(): - print(basic_block) - - -def read_lines(file_path): - """ Read lines from the file and return then as a list. """ - lines = [] - with open(file_path, 'r', encoding='utf8') as asm_file: - lines = asm_file.readlines() - return lines - - -# Common regexes -HEX_PATTERN = r'[0-9a-fA-F]+' -HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN - - -class InputFormat: # pylint: disable=too-few-public-methods - """ - An enum which represents various supported input formats - """ - GDB = 'GDB' - OBJDUMP = 'OBJDUMP' - - -def parse_function_header(line): - """ - Return function name of memory range from the given string line. 
- - Match lines for non-stripped binaries: - 'Dump of assembler code for function test_function:' - lines for stripped binaries: - 'Dump of assembler code from 0x555555555faf to 0x555555557008:' - and lines for obdjdump disassembly: - '0000000000016bb0 <_obstack_allocated_p@@Base>:' - """ - - objdump_name_pattern = re.compile(fr'{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:') - function_name = objdump_name_pattern.search(line) - if function_name is not None: - return InputFormat.OBJDUMP, function_name[1] - - function_name_pattern = re.compile(r'function (\w+):$') - function_name = function_name_pattern.search(line) - if function_name is not None: - return InputFormat.GDB, function_name[1] - - memory_range_pattern = re.compile(fr'(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$') - memory_range = memory_range_pattern.search(line) - if memory_range is not None: - return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}' - - return None, None - - -class Address: - """ - Represents location in program which may be absolute or relative - """ - def __init__(self, abs_addr, base=None, offset=None): - self.abs = abs_addr - self.base = base - self.offset = offset - - def is_absolute(self): - return self.base is None - - def is_relative(self): - return not self.is_absolute() - - def __str__(self): - if self.offset is not None: - return f'0x{self.abs:x} ({self.base}+{self.offset})' - return f'0x{self.abs}' - - def merge(self, other): - if self.abs is not None: - assert self.abs is None or self.abs == other.abs - self.abs = other.abs - if self.base is not None: - assert self.base is None or self.base == other.base - self.base = other.base - if self.offset is not None: - assert self.offset is None or self.offset == other.offset - self.offset = other.offset - - -class Encoding: - """ - Represents a sequence of bytes used for instruction encoding - e.g. 
the '31 c0' in - '16bd3: 31 c0 xor %eax,%eax' - """ - def __init__(self, bites): - self.bites = bites - - def size(self): - return len(self.bites) - - def __str__(self): - return ' '.join(map(lambda b: f'{b:#x}', self.bites)) - - -class X86TargetInfo: - """ - Contains instruction info for X86-compatible targets. - """ - - def __init__(self): - pass - - def comment(self): - return '#' - - def is_call(self, instruction): - # Various flavors of call: - # call *0x26a16(%rip) - # call 0x555555555542 - # addr32 call 0x55555558add0 - return 'call' in instruction.opcode - - def is_jump(self, instruction): - return instruction.opcode[0] == 'j' - - def is_unconditional_jump(self, instruction): - return instruction.opcode.startswith('jmp') - - def is_sink(self, instruction): - """ - Is this an instruction which terminates function execution e.g. return? - """ - return instruction.opcode.startswith('ret') - - -class ARMTargetInfo: - """ - Contains instruction info for ARM-compatible targets. - """ - - def __init__(self): - pass - - def comment(self): - return ';' - - def is_call(self, instruction): - # Various flavors of call: - # bl 0x19d90 <_IO_vtable_check> - # Note that we should be careful to not mix it with conditional - # branches like 'ble'. - return instruction.opcode.startswith('bl') \ - and instruction.opcode not in ('blt', 'ble', 'bls') - - def is_jump(self, instruction): - return instruction.opcode[0] == 'b' and not self.is_call(instruction) - - def is_unconditional_jump(self, instruction): - return instruction.opcode == 'b' - - def is_sink(self, instruction): - """ - Is this an instruction which terminates function execution e.g. return? - Detect various flavors of return like - bx lr - pop {r2-r6,pc} - Note that we do not consider conditional branches (e.g. 'bxle') to sink. 
- """ - return re.search(r'\bpop\b.*\bpc\b', instruction.body) \ - or (instruction.opcode == 'bx' and instruction.ops[0] == 'lr') \ - or instruction.opcode == 'udf' - - -class Instruction: - """ - Represents a single assembly instruction with it operands, location and - optional branch target - """ - def __init__(self, body, text, lineno, address, opcode, ops, target, imm, target_info): # noqa - self.body = body - self.text = text - self.lineno = lineno - self.address = address - self.opcode = opcode - self.ops = ops - self.target = target - self.info = target_info - if imm is not None and (self.is_jump() or self.is_call()): - if self.target is None: - self.target = imm - else: - self.target.merge(imm) - - def is_call(self): - return self.info.is_call(self) - - def is_jump(self): - return self.info.is_jump(self) - - def is_direct_jump(self): - return self.is_jump() and re.match(fr'{HEX_LONG_PATTERN}', self.ops[0]) - - def is_sink(self): - return self.info.is_sink(self) - - def is_unconditional_jump(self): - return self.info.is_unconditional_jump(self) - - def __str__(self): - result = f'{self.address}: {self.opcode}' - if self.ops: - result += f' {self.ops}' - return result - - -def parse_address(line): - """ - Parses leading address of instruction - """ - address_match = re.match(fr'^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)', line) - if address_match is None: - return None, line - address = Address(int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None) - return address, address_match[3] - - -def split_nth(string, count): - """ - Splits string to equally-sized chunks - """ - return [string[i:i+count] for i in range(0, len(string), count)] - - -def parse_encoding(line): - """ - Parses byte encoding of instruction for objdump disassemblies - e.g. 
the '31 c0' in - '16bd3: 31 c0 xor %eax,%eax' - In addition to X86 supports ARM encoding styles: - '4: e1a01000 mov r1, r0' - '50: f7ff fffe bl 0 <__aeabi_dadd>' - '54: 0002 movs r2, r0' - """ - # Encoding is separated from assembly mnemonic via tab - # so we allow whitespace separators between bytes - # to avoid accidentally matching the mnemonic. - enc_match = re.match(r'^\s*((?:[0-9a-f]{2,8} +)+)(.*)', line) - if enc_match is None: - return None, line - bites = [] - for chunk in enc_match[1].strip().split(' '): - bites.extend(int(byte, 16) for byte in split_nth(chunk, 2)) - return Encoding(bites), enc_match[2] - - -def parse_body(line, target_info): - """ - Parses instruction body (opcode and operands) - """ - comment_symbol = target_info.comment() - body_match = re.match(fr'^\s*([^{comment_symbol}<]+)(.*)', line) - if body_match is None: - return None, None, None, line - body = body_match[1].strip() - line = body_match[2] - opcode_match = re.match(r'^(\S*)\s*(.*)', body) - if opcode_match is None: - return None, None, None, line - opcode = opcode_match[1] - ops = opcode_match[2].split(',') if opcode_match[2] else [] - return body, opcode, ops, line - - -def parse_target(line): - """ - Parses optional instruction branch target hint - """ - target_match = re.match(r'\s*<([a-zA-Z_@.0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)', line) - if target_match is None: - return None, line - offset = target_match[2] or '+0' - address = Address(None, target_match[1], int(offset, 0)) - return address, target_match[3] - - -def parse_comment(line, target_info): - """ - Parses optional instruction comment - """ - comment_symbol = target_info.comment() - comment_match = re.match(fr'^\s*{comment_symbol}\s*(.*)', line) - if comment_match is None: - return None, line - comment = comment_match[1] - imm_match = re.match(fr'^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)', comment) - if imm_match is None: - # If no imm was found, ignore the comment. 
- # In particular this takes care of useless ARM comments like - # '82: 46c0 nop ; (mov r8, r8)' - return None, '' - abs_addr = int(imm_match[1], 16) - if imm_match[2]: - target, _ = parse_target(imm_match[2]) - target.abs = abs_addr - else: - target = Address(abs_addr) - return target, imm_match[3] - - -def parse_line(line, lineno, function_name, fmt, target_info): - """ - Parses a single line of assembly to create Instruction instance - """ - - # Strip GDB prefix and leading whites - if line.startswith('=> '): - # Strip GDB marker - line = line[3:] - line = line.lstrip() - - address, line = parse_address(line) - if address is None: - return None - - if fmt == InputFormat.OBJDUMP: - encoding, line = parse_encoding(line) - if not line: - return encoding - - original_line = line - body, opcode, ops, line = parse_body(line, target_info) - if opcode is None: - return None - - target, line = parse_target(line) - - imm, line = parse_comment(line, target_info) - if line: - # Expecting complete parse - return None - - # Set base symbol for relative addresses - if address.base is None: - address.base = function_name - if target is not None and target.base is None: - target.base = function_name - - return Instruction(body, original_line.strip(), lineno, address, opcode, ops, target, imm, target_info) - - -class JumpTable: - """ - Holds info about branch sources and destinations in asm function. - """ - - def __init__(self, instructions): - # Address where the jump begins and value which address - # to jump to. This also includes calls. - self.abs_sources = {} - self.rel_sources = {} - - # Addresses where jumps end inside the current function. - self.abs_destinations = set() - self.rel_destinations = set() - - # Iterate over the lines and collect jump targets and branching points. 
- for inst in instructions: - if inst is None or not inst.is_direct_jump(): - continue - - self.abs_sources[inst.address.abs] = inst.target - self.abs_destinations.add(inst.target.abs) - - self.rel_sources[inst.address.offset] = inst.target - self.rel_destinations.add(inst.target.offset) - - def is_destination(self, address): - if address.abs is not None: - return address.abs in self.abs_destinations - if address.offset is not None: - return address.offset in self.rel_destinations - return False - - def get_target(self, address): - if address.abs is not None: - return self.abs_sources.get(address.abs) - if address.offset is not None: - return self.rel_sources.get(address.offset) - return None - - -def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused-argument - if target_name == 'x86': - target_info = X86TargetInfo() - elif target_name == 'arm': - target_info = ARMTargetInfo() - else: - print(f'Unsupported platform {target_name}') - sys.exit(1) - - instructions = [] - current_function_name = current_format = None - for num, line in enumerate(lines, 1): - fmt, function_name = parse_function_header(line) - if function_name is not None: - assert current_function_name is None, 'we handle only one function for now' - if VERBOSE: - print(f'New function {function_name} (format {fmt})') - current_function_name = function_name - current_format = fmt - continue - - instruction_or_encoding = parse_line(line, num, current_function_name, current_format, target_info) - if isinstance(instruction_or_encoding, Encoding): - # Partial encoding for previous instruction, skip it - continue - if instruction_or_encoding is not None: - instructions.append(instruction_or_encoding) - continue - - if line.startswith('End of assembler dump') or not line: - continue - - if line.strip() == '': - continue - - print(f'Unexpected assembly at line {num}:\n {line}') - sys.exit(1) - - # Infer target address for jump instructions - for instruction in instructions: - if 
(instruction.target is None or instruction.target.abs is None) \ - and instruction.is_direct_jump(): - if instruction.target is None: - instruction.target = Address(0) - instruction.target.abs = int(instruction.ops[0], 16) - - # Infer relative addresses (for objdump or stripped gdb) - start_address = instructions[0].address.abs - end_address = instructions[-1].address.abs - for instruction in instructions: - for address in (instruction.address, instruction.target): - if address is not None \ - and address.offset is None \ - and start_address <= address.abs <= end_address: - address.offset = address.abs - start_address - - if VERBOSE: - print('Instructions:') - for instruction in instructions: - if instruction is not None: - print(f' {instruction}') - - jump_table = JumpTable(instructions) - - if VERBOSE: - print('Absolute destinations:') - for dst in jump_table.abs_destinations: - print(f' {dst:#x}') - print('Relative destinations:') - for dst in jump_table.rel_destinations: - print(f' {dst}') - print('Absolute branches:') - for src, dst in jump_table.abs_sources.items(): - print(f' {src:#x} -> {dst}') - print('Relative branches:') - for src, dst in jump_table.rel_sources.items(): - print(f' {src} -> {dst}') - - # Now iterate over the assembly again and split it to basic blocks using - # the branching information from earlier. - basic_blocks = {} - current_basic_block = None - previous_jump_block = None - for line, instruction in zip(lines, instructions): - if instruction is None: - continue - - # Current offset/address inside the function. - program_point = instruction.address - jump_point = jump_table.get_target(program_point) - is_unconditional = instruction.is_unconditional_jump() - - if current_basic_block is None: - current_basic_block = BasicBlock(program_point.abs) - basic_blocks[current_basic_block.key] = current_basic_block - # Previous basic block ended in jump instruction. Add the basic - # block what follows if the jump was not taken. 
- if previous_jump_block is not None: - previous_jump_block.add_no_jump_edge(current_basic_block) - previous_jump_block = None - elif jump_table.is_destination(program_point): - temp_block = current_basic_block - current_basic_block = BasicBlock(program_point.abs) - basic_blocks[current_basic_block.key] = current_basic_block - temp_block.add_no_jump_edge(current_basic_block) - - current_basic_block.add_instruction(instruction) - - if jump_point is not None: - current_basic_block.add_jump_edge(jump_point.abs) - previous_jump_block = None if is_unconditional else current_basic_block - current_basic_block = None - elif instruction.is_sink(): - previous_jump_block = current_basic_block = None - - if previous_jump_block is not None: - # If last instruction of the function is jump/call, then add dummy - # block to designate end of the function. - end_block = BasicBlock('end_of_function') - dummy_instruction = Instruction('', 'end of function', 0, None, None, [], None, None, target_info) - end_block.add_instruction(dummy_instruction) - previous_jump_block.add_no_jump_edge(end_block.key) - basic_blocks[end_block.key] = end_block - - return current_function_name, basic_blocks - - -def draw_cfg(function_name, basic_blocks, view): - dot = Digraph(name=function_name, comment=function_name, engine='dot') - dot.attr('graph', label=function_name) - for address, basic_block in basic_blocks.items(): - key = str(address) - dot.node(key, shape='record', label=basic_block.get_label()) - for basic_block in basic_blocks.values(): - if basic_block.jump_edge: - if basic_block.no_jump_edge is not None: - dot.edge(f'{basic_block.key}:s0', str(basic_block.no_jump_edge)) - dot.edge(f'{basic_block.key}:s1', str(basic_block.jump_edge)) - elif basic_block.no_jump_edge: - dot.edge(str(basic_block.key), str(basic_block.no_jump_edge)) - if view: - dot.format = 'gv' - with tempfile.NamedTemporaryFile(mode='w+b', prefix=function_name) as filename: - dot.view(filename.name) - print(f'Opening a file 
{filename.name}.{dot.format} with default viewer. Don\'t forget to delete it later.') - else: - dot.format = 'pdf' - dot.render(filename=function_name, cleanup=True) - print(f'Saved CFG to a file {function_name}.{dot.format}') diff --git a/src/asm2cfg/command_line.py b/src/asm2cfg/command_line.py deleted file mode 100644 index 1a5c8eb..0000000 --- a/src/asm2cfg/command_line.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Command-line usage support. -""" - -import argparse -from . import asm2cfg - - -def main(): - """ Command-line entry point to the program. """ - parser = argparse.ArgumentParser( - description='Program to draw dot control-flow graph from GDB disassembly for a function.', - epilog='If function CFG rendering takes too long, try to skip function calls with -c flag.' - ) - parser.add_argument('assembly_file', - help='File to contain one function assembly dump') - parser.add_argument('-c', '--skip-calls', action='store_true', - help='Skip function calls from dividing code to blocks') - parser.add_argument('--target', choices=['x86', 'arm'], default='x86', - help='Specify target platform for assembly') - parser.add_argument('-v', '--view', action='store_true', - help='View as a dot graph instead of saving to a file') - args = parser.parse_args() - print('If function CFG rendering takes too long, try to skip function calls with -c flag') - lines = asm2cfg.read_lines(args.assembly_file) - function_name, basic_blocks = asm2cfg.parse_lines(lines, args.skip_calls, args.target) - asm2cfg.draw_cfg(function_name, basic_blocks, args.view) From 60a2d7b04a862706a0c715859552b78be91ddd98 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 5 Apr 2023 14:40:32 +0200 Subject: [PATCH 07/31] ocgraph: Read/update instruction by coverage files --- ocgraph/interface/coverage_reader.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100755 ocgraph/interface/coverage_reader.py diff --git a/ocgraph/interface/coverage_reader.py 
b/ocgraph/interface/coverage_reader.py new file mode 100755 index 0000000..561a02b --- /dev/null +++ b/ocgraph/interface/coverage_reader.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +"""Class for read coverage input and update the instruction.""" +import ast +import csv + +from ..data.instruction import Instruction +from ..configuration.configuration import OcGraphConfiguration + + +class CoverageReader: # pylint: disable=too-few-public-methods + """CoverageReader Class""" + + def __init__(self, instructions: [Instruction], config: OcGraphConfiguration): + self.instructions = instructions + self.config = config + + def update_by_csv(self, file_path: str): + """Read coverage csv file and update""" + # Store for coverage information + coverage_info: dict[int, set[int]] = {} + + # read the csv file. expected values in address and branch_jumps + with open(file_path, "r", newline="", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + _temp = ast.literal_eval(row["branch_jumps"]) + coverage_info[int(row["address"], 0)] = {int(x, 0) for x in _temp} + # update instructions + for instr in self.instructions: + if coverage_info.get(instr.address.abs, None) is not None: + is_branch = self.config.architecture.is_branch(instr) + instr.update_coverage(coverage_info[instr.address.abs], is_branch=is_branch) From 7ea43e54831f3dee1c1a92f52b4621b85ac5c688 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Fri, 21 Apr 2023 11:28:33 +0200 Subject: [PATCH 08/31] Doc: rework README --- README.md | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/README.md b/README.md index 360b555..eab295b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,100 @@ +## Object Code Graph (ocgraph) + +## How to Run + +Custom python script: + +```python +from ocgraph.interface.analyzer import Analyzer +from ocgraph.interface.drawer import Drawer +from ocgraph.interface.coverage_reader import CoverageReader + +from 
ocgraph.coverage_tracer import CoverageTracer +from ocgraph.configuration.configuration import OcGraphConfiguration + +# Create configuration +config = OcGraphConfiguration(disassembler="objdump", arch="sparc") + +# Read input text +lines = read_lines("a.out") + +# Analyze input text +analyser = Analyzer(config=config) +analyser.parse_lines(lines=lines) + +# Update analyzed input with coverage data +cov_reader = CoverageReader(instructions=analyser.instructions, config=config) +cov_reader.update_by_csv("cov.csv") + +drawer = Drawer(analyser.configuration) +drawer.draw_cfg(name=analyser.function_name, basic_blocks=analyser.basic_blocks, output="a.pdf") +``` + +As python module: + +```cmd +python3 -m ocgraph -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + +As command line script: + +```cmd +./asm2cfg -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + + +## Design + + +```mermaid +--- +title: OcGraph design +--- +classDiagram + + class Configuration { + __init__(arch, disassembler, logging): + +dict disassembler_option + +dict architecture_option + +dict preset_logging + } + class Disassembler { + Name + parse_line() + ...() + } + class Architecture { + is_branch() + ...() + } + class Logger { Name } + + Configuration --* Disassembler + Configuration --* Architecture + Configuration --* Logger + + class Analyzer { + __init__(config) + parse_file(file_path): basic_blocks + } + class CoverageReader { + __init__(instructions, config) + update_by_csv(file_path) + } + class Drawer { + __init__(config) + draw_cfg(name, basic_blocks, output) + } + class __main__ { + main() + } + + __main__ --> Configuration + __main__ --> Analyzer + __main__ --> CoverageReader + __main__ --> Drawer + +``` # asm2cfg ![CI status](https://github.com/Kazhuu/asm2cfg/actions/workflows/ci.yml/badge.svg) From 51d5a0c6b887dc9c9aebc7350dc7f110a7a3b36c Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Fri, 31 Mar 2023 09:54:18 +0200 Subject: [PATCH 09/31] Scripts: Add/Adapt batch scripts
acc. refactoring --- scripts/batch_objdump.sh | 134 +++++++++++++++++++++++++++++++++++++ scripts/update_examples.sh | 2 +- 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100755 scripts/batch_objdump.sh diff --git a/scripts/batch_objdump.sh b/scripts/batch_objdump.sh new file mode 100755 index 0000000..bde3752 --- /dev/null +++ b/scripts/batch_objdump.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +function_array=( + acos + asin + atan + atan2 + cos + sin + tan + acosh + atanh + cosh + sinh + tanh + exp + exp2 + expm1 + frexp + ilogb + ldexp + log + log10 + log1p + log2 + logb + modf + scalbn + scalbln + cbrt + fabs + hypot + pow + sqrt + erf + erfc + lgamma + tgamma + ceil + floor + nearbyint + rint + lrint + llrint + round + lround + llround + trunc + fmod + remainder + remquo + copysign + nan + nextafter + fdim + fmax + fmin + fma + + acosf + asinf + atanf + atan2f + cosf + sinf + tanf + acoshf + asinhf + atanhf + coshf + sinhf + tanhf + expf + exp2f + expm1f + frexpf + ilogbf + ldexpf + logf + log10f + log1pf + log2f + logbf + modff + scalbnf + scalblnf + cbrtf + fabsf + hypotf + powf + sqrtf + erff + erfcf + lgammaf + tgammaf + ceilf + floorf + nearbyintf + rintf + lrintf + llrintf + roundf + lroundf + llroundf + truncf + fmodf + remainderf + remquof + copysignf + nanf + nextafterf + fdimf + fmaxf + fminf + fmaf +) + +objdump=../rtems-6-sparc-gr740-smp-4/bin/sparc-rtems6-objdump +application=../examples/gr740/smp/libmcs/b-gr740-qual-only/app.exe +asm_folder=../qualification/asm +pdf_folder=../qualification/pdf +coverage_file=../coverage-tracer/ExecuteTestRun.csv + +mkdir -p $asm_folder +mkdir -p $pdf_folder + +for i in ${function_array[@]} +do + $objdump -d $application | sed -ne '/<'$i'>:/,/^$/p' > $asm_folder/$i.asm + ./asm2cfg -c $coverage_file --preset 'sparc OBJDUMP' $asm_folder/$i.asm +done + +mv *.pdf $pdf_folder/ +rm -f *.gv diff --git a/scripts/update_examples.sh b/scripts/update_examples.sh index d60a970..92fb3dd 100755 --- 
a/scripts/update_examples.sh +++ b/scripts/update_examples.sh @@ -9,6 +9,6 @@ for asm in examples/*.asm; do if echo $asm | grep -q 'arm.asm'; then flags="$flags --target arm" fi - pdf=$(python3 -m src.asm2cfg $flags -c $asm | awk '/Saved CFG/{print $NF}') + pdf=$(python3 -m ocgraph $flags -c $asm | awk '/Saved CFG/{print $NF}') mv $pdf $(echo $asm | sed 's/\.asm/\.pdf/') done From e5cb5c437675e2f94b55e1a6fc870101e0bb0629 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 20 Jun 2023 09:24:32 +0200 Subject: [PATCH 10/31] Asm2Cfg: Add shell script for easy executing --- asm2cfg | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 asm2cfg diff --git a/asm2cfg b/asm2cfg new file mode 100755 index 0000000..4f1b04a --- /dev/null +++ b/asm2cfg @@ -0,0 +1,12 @@ +#!/bin/sh + +# SPDX-License-Identifier: GTDGmbH +# Copyright 2023 by GTD GmbH. + +dir="$(dirname "$(readlink -f "$0")")" +cd $dir +export PATH=$dir/bin:$PATH +if [ -d "venv" ]; then + . venv/bin/activate +fi +python3 -m ocgraph.__main__ "$@" From ba48e5511d1a40bb5f28139c6a0f1951c3a18f45 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 20 Jun 2023 17:20:43 +0200 Subject: [PATCH 11/31] Asm2Cfg: Adjust to SPDX license system --- LICENSES/GTDGmbH.md | 373 +++++++++++++++++++++++++++ LICENSE => LICENSES/MauriMustonen.md | 3 + 2 files changed, 376 insertions(+) create mode 100644 LICENSES/GTDGmbH.md rename LICENSE => LICENSES/MauriMustonen.md (95%) diff --git a/LICENSES/GTDGmbH.md b/LICENSES/GTDGmbH.md new file mode 100644 index 0000000..a612ad9 --- /dev/null +++ b/LICENSES/GTDGmbH.md @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. 
"Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. 
"Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/LICENSE b/LICENSES/MauriMustonen.md similarity index 95% rename from LICENSE rename to LICENSES/MauriMustonen.md index 4895137..e8969fd 100644 --- a/LICENSE +++ b/LICENSES/MauriMustonen.md @@ -1,3 +1,6 @@ +Valid-License-Identifier: MauriMustonen +License-Text: + MIT License Copyright (c) 2021 Mauri Mustonen From 16228a3434389dd9f6df65001c7abec69a7d556b Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 20 Jun 2023 17:25:29 +0200 Subject: [PATCH 12/31] Development: Add requirements specification for python venv --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a31f820 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +graphviz==0.9 From c122d1ba18568bb079a2fa42b96f4e369094cc6c Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 4 Jul 2023 11:05:10 +0200 Subject: [PATCH 13/31] Ocgraph: fix command line argument --- ocgraph/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocgraph/__main__.py b/ocgraph/__main__.py index 7d8f78c..9a26f63 100755 --- a/ocgraph/__main__.py +++ b/ocgraph/__main__.py @@ -67,7 +67,7 @@ def main(): disassembler=args.diss, arch=args.arch, logging_preset=args.logger ) - lines = read_lines(args.assembly_file) + lines = read_lines(args.file) analyser = Analyzer(config=config) analyser.parse_lines(lines=lines) From 1bb078f2e2e525525a16daa876b9b46d6127803c Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Tue, 4 Jul 2023 11:05:53 +0200 Subject: [PATCH 14/31] Scripts: update batch_objdump to new interface --- scripts/batch_objdump.sh | 14 ++++++++------ 1 file
changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/batch_objdump.sh b/scripts/batch_objdump.sh index bde3752..630b2c9 100755 --- a/scripts/batch_objdump.sh +++ b/scripts/batch_objdump.sh @@ -115,11 +115,13 @@ function_array=( fmaf ) -objdump=../rtems-6-sparc-gr740-smp-4/bin/sparc-rtems6-objdump -application=../examples/gr740/smp/libmcs/b-gr740-qual-only/app.exe -asm_folder=../qualification/asm -pdf_folder=../qualification/pdf -coverage_file=../coverage-tracer/ExecuteTestRun.csv +objdump=objdump + +application=./qualification/code/app.exe +coverage_file=./qualification/code/app.exe.csv + +asm_folder=./qualification/asm +pdf_folder=./qualification/pdf mkdir -p $asm_folder mkdir -p $pdf_folder @@ -127,7 +129,7 @@ mkdir -p $pdf_folder for i in ${function_array[@]} do $objdump -d $application | sed -ne '/<'$i'>:/,/^$/p' > $asm_folder/$i.asm - ./asm2cfg -c $coverage_file --preset 'sparc OBJDUMP' $asm_folder/$i.asm + python -m ocgraph -c $coverage_file -d 'OBJDUMP' -a sparc -f $asm_folder/$i.asm done mv *.pdf $pdf_folder/ From 24355ba6036093b97cc469f20c8fa7f1d508675b Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Thu, 19 Oct 2023 11:08:32 +0200 Subject: [PATCH 15/31] Add ppc support structure --- ocgraph/configuration/architecture/ppc.py | 96 ++++++++ ocgraph/configuration/configuration.py | 24 +- .../disassembler/disassembler.py | 1 + .../{objdump_default.py => objdump_ppc.py} | 8 +- .../disassembler/objdump_sparc.py | 221 ++++++++++++++++++ 5 files changed, 340 insertions(+), 10 deletions(-) create mode 100755 ocgraph/configuration/architecture/ppc.py rename ocgraph/configuration/disassembler/{objdump_default.py => objdump_ppc.py} (97%) create mode 100755 ocgraph/configuration/disassembler/objdump_sparc.py diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py new file mode 100755 index 0000000..9eac1d2 --- /dev/null +++ b/ocgraph/configuration/architecture/ppc.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 
+"""Contains instruction info for PPC-compatible targets.""" + +from .architecture import Architecture +from ...data.instruction import Instruction + + +# fmt: off +sparc_v8_Bicc_opcodes = [ + # conditional icc branch opcodes + "ba", "bn", "bne", "be", "bg", "ble", "bge", "bl", "bgu", "bleu", "bcc", + "bcs", "bpos", "bneg", "bvc", "bvs", +] + +sparc_v8_FBfcc_opcodes = [ + # conditional fcc branch opcodes + "fba", "fbn", "fbu", "fbg", "fbug", "fbl", "fbul", "fblg", "fbne", "fbe", + "fbue", "fbge", "fbuge", "fble", "fbule", "fbo", +] + +sparc_v8_CBfcc_opcodes = [ + # conditional coprocessor opcodes + "cba", "cbn", "cb3", "cb2", "cb23", "cb1", "cb13", "cb12", "cb123", "cb0", + "cb03", "cb02", "cb023", "cb01", "cb013", "cb012", +] + +sparc_v8_Ticc_opcodes = [ + # condictional traps on icc + "ta", "tn", "tne", "te", "tg", "tle", "tge", "tl", "tgu", "tleu", "tcc", + "tcs", "tpos", "tneg", "tvc", "tvs", +] + +sparc_v8_branch_cond_delay_opcodes = [ + f"{x},a" for x in + sparc_v8_Bicc_opcodes + + sparc_v8_FBfcc_opcodes + + sparc_v8_CBfcc_opcodes +] + +sparc_v8_remaining_jump_opcodes = [ + "jmpl", "jmp", "b", # "call", "ret", retl not regarded currently +] + +sparc_v8_delayed_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes + \ + sparc_v8_remaining_jump_opcodes + +sparc_v8_jump_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_Ticc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes + \ + sparc_v8_remaining_jump_opcodes + +sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes +# fmt: on + + +class PpcArchitecture(Architecture): + """PpccArchitecture Class""" + + def comment(self): + return "!" 
+ + def is_call(self, instruction: Instruction): + return instruction.opcode == "call" + + def is_jump(self, instruction: Instruction): + return instruction.opcode in sparc_v8_jump_opcodes + + def get_jump_delay(self, instruction: Instruction) -> int | None: + delay = None + if instruction.opcode in sparc_v8_delayed_opcodes: + delay = 2 + elif self.is_sink(instruction): + delay = 2 + else: + delay = 1 + return delay + + def is_direct_jump(self, instruction: Instruction): + # every jump is disassembled with the complete offset + return self.is_jump(instruction) + + def is_branch(self, instruction: Instruction): + return instruction.opcode in sparc_v8_branch_opcodes + + def is_sink(self, instruction: Instruction): + # ret: return from subroutine + # retl: return from leaf subroutine + return instruction.opcode in ["ret", "retl"] diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index fe25c6b..6ba06ff 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -6,15 +6,23 @@ from .architecture.x86 import X86Architecture from .architecture.arm import ArmArchitecture from .architecture.sparc import SparcArchitecture +from .architecture.ppc import PpcArchitecture from .disassembler.disassembler import Disassembler -from .disassembler.objdump_default import ObjDumpDisassembler +from .disassembler.objdump_sparc import ObjDumpSparcDisassembler +from .disassembler.objdump_ppc import ObjDumpPpcDisassembler from .disassembler.gdb_default import GdbDisassembler # fmt: off -disassembler_option: dict[str, Disassembler] = { - "OBJDUMP": ObjDumpDisassembler(), - "GDB": GdbDisassembler(), +disassembler_option: dict[str, dict] = { + "OBJDUMP": { + "sparc": ObjDumpSparcDisassembler(), + "ppc": ObjDumpPpcDisassembler(), + }, + "GDB": { + "sparc": GdbDisassembler(), + "ppc": GdbDisassembler(), + }, } architecture_option: dict[str, dict] = { @@ -30,6 +38,10 @@ "platform": "SPARC", "architecture": 
SparcArchitecture(), }, + "ppc": { + "platform": "PPC", + "architecture": PpcArchitecture(), + }, } preset_logging: dict[str, dict] = { @@ -59,7 +71,7 @@ class OcGraphConfiguration: """Implement configuration presets for the ASM2CFG tool.""" def __init__( - self, arch: str = "sparc", disassembler: str = "objdump", logging_preset="default" + self, arch: str = "sparc", disassembler: str = "OBJDUMP", logging_preset="default" ): if architecture_option.get(arch) is None: raise NotImplementedError("Architecture option not supported!") @@ -70,7 +82,7 @@ def __init__( # load module preset _preset = architecture_option[arch] - _preset["disassembler"] = disassembler_option.get(disassembler) + _preset["disassembler"] = disassembler_option[disassembler][arch] self.__dict__ = _preset # configure logging diff --git a/ocgraph/configuration/disassembler/disassembler.py b/ocgraph/configuration/disassembler/disassembler.py index b6d444b..7d2d560 100755 --- a/ocgraph/configuration/disassembler/disassembler.py +++ b/ocgraph/configuration/disassembler/disassembler.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from ..architecture.sparc import SparcArchitecture +from ..architecture.ppc import PpcArchitecture from ...data.instruction import Instruction diff --git a/ocgraph/configuration/disassembler/objdump_default.py b/ocgraph/configuration/disassembler/objdump_ppc.py similarity index 97% rename from ocgraph/configuration/disassembler/objdump_default.py rename to ocgraph/configuration/disassembler/objdump_ppc.py index dba05dd..8ca8828 100755 --- a/ocgraph/configuration/disassembler/objdump_default.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Class for parsing the input""" +"""Class for parsing the objdump PPC input""" import re from typing import List @@ -15,10 +15,10 @@ HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN -class ObjDumpDisassembler(Disassembler): - """Objdump disassembler""" +class ObjDumpPpcDisassembler(Disassembler): + """Objdump PPC disassembler""" - name: str = "Default Objdump Disassembler (SparcV8 Binutils)" + name: str = "PPC Objdump Disassembler (PPC Binutils)" # Expected format: <>: regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" diff --git a/ocgraph/configuration/disassembler/objdump_sparc.py b/ocgraph/configuration/disassembler/objdump_sparc.py new file mode 100755 index 0000000..47ca645 --- /dev/null +++ b/ocgraph/configuration/disassembler/objdump_sparc.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +"""Class for parsing the objdump SPARC input""" + +import re +from typing import List + +from ...data.address import Address +from ...data.encoding import Encoding +from ...data.instruction import Instruction + +from .disassembler import Disassembler, DisassemblerError + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" + HEX_PATTERN + + +class ObjDumpSparcDisassembler(Disassembler): + """Objdump SPARC disassembler""" + + name: str = "SPARC Objdump Disassembler (SparcV8 Binutils)" + + # Expected format: <>: + regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" + regex_information: dict[str, int] = { + "address": 1, + "location": 3, + "instruction_hex": 4, + "instruction_str": 5, + } + + def extract_information(self, str_input: str) -> dict[str, str]: + result = {} + + if "=> " in str_input: + extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0] + + information = re.search(self.regex, extracted_line) + if not information: + raise DisassemblerError("Line not processable: \n" + str(extracted_line)) + + address = str(information.group(self.regex_information.get("address"))) + location = str(information.group(self.regex_information.get("location"))) + instr_d = str(information.group(self.regex_information.get("instruction_str"))) + instr_h = 
str(information.group(self.regex_information.get("instruction_hex"))) + opcode = (instr_d.split()[0],) + + result = { + "address": address, + "location": location, + "instr_h": instr_h, + "instr_d": instr_d, + "opcode": opcode, + "printable": extracted_line, + } + else: + raise DisassemblerError("Line not processable: \n" + str(str_input)) + + return result + + def parse_function_header(self, line: str) -> str | None: + """ + Return function name of memory range from the given string line. + + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for objdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + objdump_name_pattern = re.compile(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:") + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + function_name_pattern = re.compile(r"function (\w+):$") + function_name = function_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + memory_range_pattern = re.compile( + rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$" + ) + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return f"{memory_range[1]}-{memory_range[2]}" + + return None + + def parse_address(self, line: str) -> (Address, str): + """ + Parses leading address of instruction + """ + address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line) + if address_match is None: + return None, line + address = Address( + int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None + ) + return address, address_match[3] + + def split_nth(self, string: str, count: int) -> List[str]: + """ + Splits string to equally-sized chunks + """ + return [string[i : i + count] for i in range(0, len(string), count)] + + def 
parse_encoding(self, line): + """ + Parses byte encoding of instruction for objdump disassemblies + e.g. the '31 c0' in + '16bd3: 31 c0 xor %eax,%eax' + In addition to X86 supports ARM encoding styles: + '4: e1a01000 mov r1, r0' + '50: f7ff fffe bl 0 <__aeabi_dadd>' + '54: 0002 movs r2, r0' + """ + # Encoding is separated from assembly mnemonic via tab + # so we allow whitespace separators between bytes + # to avoid accidentally matching the mnemonic. + enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + if enc_match is None: + return None, line + bites = [] + for chunk in enc_match[1].strip().split(" "): + bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2)) + return Encoding(bites), enc_match[2] + + def parse_body(self, line: str) -> (str, str, List[str], str): + """Parses instruction body (opcode and operands)""" + comment_symbol = self.architecture.comment() + body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line) + if body_match is None: + return None, None, None, line + body = body_match[1].strip() + line = body_match[2] + opcode_match = re.match(r"^(\S*)\s*(.*)", body) + if opcode_match is None: + return None, None, None, line + opcode = opcode_match[1] + ops = opcode_match[2].split(",") if opcode_match[2] else [] + return body, opcode, ops, line + + def parse_target(self, line: str) -> (Address, str): + """ + Parses optional instruction branch target hint + """ + target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) + if target_match is None: + return None, line + offset = target_match[2] or "+0" + address = Address(None, target_match[1], int(offset, 0)) + return address, target_match[3] + + def parse_comment(self, line: str) -> (Address, str): + """ + Parses optional instruction comment + """ + comment_symbol = self.architecture.comment() + comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line) + if comment_match is None: + return None, line + comment = comment_match[1] + 
imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment) + if imm_match is None: + # If no imm was found, ignore the comment. + # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """ + Parses a single line of assembly to create Instruction instance + """ + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + encoding, line = self.parse_encoding(line) + if not line: + return encoding + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) From 81c22f26267dd54b1cd0c7670e439cc7f836bec5 Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Mon, 23 Oct 2023 23:16:06 +0200 Subject: [PATCH 16/31] Add ppc instructions --- ocgraph/configuration/architecture/ppc.py | 98 +++++-------------- .../configuration/disassembler/objdump_ppc.py | 8 +- ocgraph/interface/analyzer.py | 1 + 3 files changed, 32 insertions(+), 75 deletions(-) diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index 9eac1d2..16cbea5 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ 
b/ocgraph/configuration/architecture/ppc.py @@ -1,96 +1,52 @@ #!/usr/bin/env python3 """Contains instruction info for PPC-compatible targets.""" +import re + from .architecture import Architecture from ...data.instruction import Instruction -# fmt: off -sparc_v8_Bicc_opcodes = [ - # conditional icc branch opcodes - "ba", "bn", "bne", "be", "bg", "ble", "bge", "bl", "bgu", "bleu", "bcc", - "bcs", "bpos", "bneg", "bvc", "bvs", -] - -sparc_v8_FBfcc_opcodes = [ - # conditional fcc branch opcodes - "fba", "fbn", "fbu", "fbg", "fbug", "fbl", "fbul", "fblg", "fbne", "fbe", - "fbue", "fbge", "fbuge", "fble", "fbule", "fbo", -] - -sparc_v8_CBfcc_opcodes = [ - # conditional coprocessor opcodes - "cba", "cbn", "cb3", "cb2", "cb23", "cb1", "cb13", "cb12", "cb123", "cb0", - "cb03", "cb02", "cb023", "cb01", "cb013", "cb012", -] - -sparc_v8_Ticc_opcodes = [ - # condictional traps on icc - "ta", "tn", "tne", "te", "tg", "tle", "tge", "tl", "tgu", "tleu", "tcc", - "tcs", "tpos", "tneg", "tvc", "tvs", -] +# Common regexes +HEX_PATTERN = r'[0-9a-fA-F]+' +HEX_LONG_PATTERN = r'(?:0x0*)' + HEX_PATTERN -sparc_v8_branch_cond_delay_opcodes = [ - f"{x},a" for x in - sparc_v8_Bicc_opcodes + - sparc_v8_FBfcc_opcodes + - sparc_v8_CBfcc_opcodes +# fmt: off +ppc_unconditional_branch_opcodes = [ + "b", "ba", "bla", + "bctr", "bctrl", "blrl", ] -sparc_v8_remaining_jump_opcodes = [ - "jmpl", "jmp", "b", # "call", "ret", retl not regarded currently +ppc_conditional_branch_opcodes = [ + "bc", "bt", "bf", "bdnz", "bdnzt", "bdnzf", "bdz", "bdzt", "bdzf", + "bca", "bta", "bfa", "bdnza", "bdnzta", "bdnzfa", "bdza", "bdzta", "bdzfa", + "bcl", "btl", "bfl", "bdnzl", "bdnztl", "bdnzfl", "bdzl", "bdztl", "bdzfl", + "bcla", "bdtla", "bdfla", "bdnzla", "bdnztla", "bdnzfla", "bdzla", "bdztla", "bdzfla", + "bclr", "btlr", "bflr", "bdnzlr", "bdnztlr", "bdnzflr", "bdzlr", "bdztlr", "bdzflr", + "bclrl", "btlrl", "bflrl", "bdnzlrl", "bdnztlrl", "bdnzflrl", "bdzlrl", "bdztlrl", "bdzflrl", + "bcctr", "btctr", "bfctr", 
+ "bcctrl", "btctrl", "bfctrl", ] - -sparc_v8_delayed_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes + \ - sparc_v8_remaining_jump_opcodes - -sparc_v8_jump_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_Ticc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes + \ - sparc_v8_remaining_jump_opcodes - -sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes # fmt: on class PpcArchitecture(Architecture): - """PpccArchitecture Class""" + """PpcArchitecture Class""" def comment(self): - return "!" + return "#" def is_call(self, instruction: Instruction): - return instruction.opcode == "call" + return instruction.opcode == "bl" def is_jump(self, instruction: Instruction): - return instruction.opcode in sparc_v8_jump_opcodes - - def get_jump_delay(self, instruction: Instruction) -> int | None: - delay = None - if instruction.opcode in sparc_v8_delayed_opcodes: - delay = 2 - elif self.is_sink(instruction): - delay = 2 - else: - delay = 1 - return delay - - def is_direct_jump(self, instruction: Instruction): - # every jump is disassembled with the complete offset - return self.is_jump(instruction) + return instruction.opcode in ppc_conditional_branch_opcodes def is_branch(self, instruction: Instruction): - return instruction.opcode in sparc_v8_branch_opcodes + return instruction.opcode in ppc_unconditional_branch_opcodes def is_sink(self, instruction: Instruction): - # ret: return from subroutine - # retl: return from leaf subroutine - return instruction.opcode in ["ret", "retl"] + return instruction.opcode == "blr" + + def is_direct_jump(self, instruction: Instruction): + return self.is_jump(instruction) and (re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[1])) diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py 
b/ocgraph/configuration/disassembler/objdump_ppc.py index 8ca8828..64a1551 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -12,7 +12,7 @@ # Common regexes HEX_PATTERN = r"[0-9a-fA-F]+" -HEX_LONG_PATTERN = r"(?:0x0*)?" + HEX_PATTERN +HEX_LONG_PATTERN = r"(?:0x0*)" + HEX_PATTERN class ObjDumpPpcDisassembler(Disassembler): @@ -116,10 +116,10 @@ def parse_encoding(self, line): '50: f7ff fffe bl 0 <__aeabi_dadd>' '54: 0002 movs r2, r0' """ - # Encoding is separated from assembly mnemonic via tab - # so we allow whitespace separators between bytes + # Encoding is separated from assembly mnemonic via tab (only in objdump not for llvm-objdump) + # so we allow only 1 white space separator between bytes for compatibility with llvm-objdump # to avoid accidentally matching the mnemonic. - enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + enc_match = re.match(r"^\s*((?:(?:[0-9a-f]{2,8} )+)+)(.*)", line) if enc_match is None: return None, line bites = [] diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index 9a2cdb5..0cc8c51 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -160,6 +160,7 @@ def _create_basic_blocks(self) -> None: # End current block if current opcode is a jump/branch/sink if jump_target: + print(instruction) curr_basic_block.add_jump_edge(jump_target.abs) prev_branch_block = curr_basic_block if is_branch else None block_completion = self.configuration.architecture.get_jump_delay(instruction) From f0411eb7ab9ae850518b2b7dcb32355ab108f595 Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Wed, 25 Oct 2023 14:42:08 +0200 Subject: [PATCH 17/31] Add ppc basic implementation probably breaking SPARC --- .../architecture/architecture.py | 18 ++++++------- ocgraph/configuration/architecture/arm.py | 8 +++--- ocgraph/configuration/architecture/ppc.py | 12 ++++----- ocgraph/configuration/architecture/sparc.py | 26 
+++++++++---------- ocgraph/configuration/architecture/x86.py | 8 +++--- .../configuration/disassembler/objdump_ppc.py | 7 +++-- ocgraph/data/address.py | 2 +- ocgraph/data/instruction.py | 4 +++ ocgraph/data/jump_table.py | 2 +- ocgraph/interface/analyzer.py | 25 ++++++++++++------ 10 files changed, 64 insertions(+), 48 deletions(-) diff --git a/ocgraph/configuration/architecture/architecture.py b/ocgraph/configuration/architecture/architecture.py index a7e2206..ec0bdfd 100755 --- a/ocgraph/configuration/architecture/architecture.py +++ b/ocgraph/configuration/architecture/architecture.py @@ -19,26 +19,26 @@ def comment(self) -> str: @abstractmethod def is_call(self, instruction: Instruction) -> bool: - """Return if disassembled instruction is a call""" + """Return if disassembled instruction is a subroutine call""" raise NotImplementedError() @abstractmethod - def is_jump(self, instruction: Instruction) -> bool: - """Return if disassembled instruction is a jump""" + def is_unconditional_branch(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is an unconditional branch""" raise NotImplementedError() - def get_jump_delay(self, instruction: Instruction) -> int | None: - """Return the jump delay of an instruction or None if not a jump""" - return 1 if self.is_jump(instruction) else None + def get_branch_delay(self, instruction: Instruction) -> int | None: + """Return the branch delay of an instruction or None if not a branch""" + return 1 if self.is_branch(instruction) else None @abstractmethod - def is_direct_jump(self, instruction: Instruction) -> bool: - """Return if disassembled instruction is a direct jump""" + def is_direct_branch(self, instruction: Instruction) -> bool: + """Return if disassembled instruction is a direct branch""" raise NotImplementedError() @abstractmethod def is_branch(self, instruction: Instruction) -> bool: - """Return if disassembled instruction is a conditional jump""" + """Return if disassembled 
instruction is a conditional branch""" raise NotImplementedError() @abstractmethod diff --git a/ocgraph/configuration/architecture/arm.py b/ocgraph/configuration/architecture/arm.py index 8feca00..df6ba6b 100755 --- a/ocgraph/configuration/architecture/arm.py +++ b/ocgraph/configuration/architecture/arm.py @@ -28,13 +28,13 @@ def is_call(self, instruction: Instruction): "bls", ) - def is_jump(self, instruction: Instruction): + def is_branch(self, instruction: Instruction): return instruction.opcode[0] == "b" and not self.is_call(instruction) - def is_direct_jump(self, instruction: Instruction): - return self.is_jump(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) + def is_direct_branch(self, instruction: Instruction): + return self.is_branch(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) - def is_branch(self, instruction: Instruction): + def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode == "b" def is_sink(self, instruction: Instruction): diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index 16cbea5..fc1985e 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ b/ocgraph/configuration/architecture/ppc.py @@ -14,7 +14,7 @@ # fmt: off ppc_unconditional_branch_opcodes = [ "b", "ba", "bla", - "bctr", "bctrl", "blrl", + "bctr", "bctrl", "blrl", ] ppc_conditional_branch_opcodes = [ @@ -39,14 +39,14 @@ def comment(self): def is_call(self, instruction: Instruction): return instruction.opcode == "bl" - def is_jump(self, instruction: Instruction): - return instruction.opcode in ppc_conditional_branch_opcodes - def is_branch(self, instruction: Instruction): + return instruction.opcode in ppc_conditional_branch_opcodes or ppc_unconditional_branch_opcodes and not self.is_call(instruction) + + def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode in ppc_unconditional_branch_opcodes def is_sink(self, instruction: 
Instruction): return instruction.opcode == "blr" - def is_direct_jump(self, instruction: Instruction): - return self.is_jump(instruction) and (re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[1])) + def is_direct_branch(self, instruction: Instruction): + return self.is_branch(instruction) and (re.search(rf"{HEX_LONG_PATTERN}", '|'.join(instruction.ops))) diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py index 8494b0c..7ff25eb 100755 --- a/ocgraph/configuration/architecture/sparc.py +++ b/ocgraph/configuration/architecture/sparc.py @@ -47,17 +47,17 @@ sparc_v8_branch_cond_delay_opcodes + \ sparc_v8_remaining_jump_opcodes -sparc_v8_jump_opcodes = sparc_v8_Bicc_opcodes + \ +sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ sparc_v8_FBfcc_opcodes + \ sparc_v8_CBfcc_opcodes + \ sparc_v8_Ticc_opcodes + \ sparc_v8_branch_cond_delay_opcodes + \ sparc_v8_remaining_jump_opcodes -sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes +sparc_v8_unconditional_branch_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes # fmt: on @@ -70,10 +70,10 @@ def comment(self): def is_call(self, instruction: Instruction): return instruction.opcode == "call" - def is_jump(self, instruction: Instruction): - return instruction.opcode in sparc_v8_jump_opcodes + def is_branch(self, instruction: Instruction): + return instruction.opcode in sparc_v8_branch_opcodes - def get_jump_delay(self, instruction: Instruction) -> int | None: + def get_branch_delay(self, instruction: Instruction) -> int | None: delay = None if instruction.opcode in sparc_v8_delayed_opcodes: delay = 2 @@ -83,12 +83,12 @@ def get_jump_delay(self, instruction: Instruction) -> int | None: delay = 1 return delay - def is_direct_jump(self, instruction: Instruction): - # every jump is disassembled with the 
complete offset - return self.is_jump(instruction) + def is_direct_branch(self, instruction: Instruction): + # every branch is disassembled with the complete offset + return self.is_branch(instruction) or self.is_unconditional_brach(instruction) - def is_branch(self, instruction: Instruction): - return instruction.opcode in sparc_v8_branch_opcodes + def is_unconditional_branch(self, instruction: Instruction): + return instruction.opcode in sparc_v8_unconditional_branch_opcodes def is_sink(self, instruction: Instruction): # ret: return from subroutine diff --git a/ocgraph/configuration/architecture/x86.py b/ocgraph/configuration/architecture/x86.py index 9970985..ac90e46 100755 --- a/ocgraph/configuration/architecture/x86.py +++ b/ocgraph/configuration/architecture/x86.py @@ -24,13 +24,13 @@ def is_call(self, instruction: Instruction): # addr32 call 0x55555558add0 return "call" in instruction.opcode - def is_jump(self, instruction: Instruction): + def is_branch(self, instruction: Instruction): return instruction.opcode[0] == "j" - def is_direct_jump(self, instruction: Instruction): - return self.is_jump(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) + def is_direct_branch(self, instruction: Instruction): + return self.is_branch(instruction) and re.match(rf"{HEX_LONG_PATTERN}", instruction.ops[0]) - def is_branch(self, instruction: Instruction): + def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode.startswith("jmp") def is_sink(self, instruction: Instruction): diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py b/ocgraph/configuration/disassembler/objdump_ppc.py index 64a1551..0762d58 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -139,7 +139,7 @@ def parse_body(self, line: str) -> (str, str, List[str], str): if opcode_match is None: return None, None, None, line opcode = opcode_match[1] - ops = opcode_match[2].split(",") if 
opcode_match[2] else [] + ops = [op.strip() for op in opcode_match[2].split(",")] if opcode_match[2] else [] return body, opcode, ops, line def parse_target(self, line: str) -> (Address, str): @@ -149,6 +149,7 @@ def parse_target(self, line: str) -> (Address, str): target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) if target_match is None: return None, line + print("target parse" + str(target_match)) offset = target_match[2] or "+0" address = Address(None, target_match[1], int(offset, 0)) return address, target_match[3] @@ -210,7 +211,7 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non if target is not None and target.base is None: target.base = function_name - return Instruction( + instruction = Instruction( body, original_line.strip(), lineno, @@ -219,3 +220,5 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non ops, target, ) + + return instruction diff --git a/ocgraph/data/address.py b/ocgraph/data/address.py index cd60b03..61b184c 100755 --- a/ocgraph/data/address.py +++ b/ocgraph/data/address.py @@ -19,7 +19,7 @@ def is_relative(self): return not self.is_absolute() def __str__(self): - if self.offset is not None: + if self.offset is not None and self.abs is not None: return f"0x{self.abs:x} ({self.base}+0x{self.offset:x})" if isinstance(self.abs, int): return f"0x{self.abs:x}" diff --git a/ocgraph/data/instruction.py b/ocgraph/data/instruction.py index 4c5abb2..6f8af80 100755 --- a/ocgraph/data/instruction.py +++ b/ocgraph/data/instruction.py @@ -83,10 +83,14 @@ def __str__(self): result = f"{self.address}: {self.opcode}" if self.ops: result += f" {self.ops}" + if self.target is not None: + result += " -> " + str(self.target) return result def __repr__(self) -> str: result = f"{self.address}: {self.opcode}" if self.ops: result += f" {self.ops}" + if self.target is not None: + result += " -> " + str(self.target) return result diff --git 
a/ocgraph/data/jump_table.py b/ocgraph/data/jump_table.py index 6c95b52..b5f77f9 100755 --- a/ocgraph/data/jump_table.py +++ b/ocgraph/data/jump_table.py @@ -25,7 +25,7 @@ def __init__(self, instructions: List[Instruction], configuration: OcGraphConfig # Iterate over the lines and collect jump targets and branching points. for instr in instructions: - if instr is None or not self.config.architecture.is_direct_jump(instr): + if instr is None or not self.config.architecture.is_direct_branch(instr): continue self.abs_sources[instr.address.abs] = instr.target diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index 0cc8c51..617a577 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -2,6 +2,7 @@ """Class for read and analyze the input string.""" import sys +import re from ..data.address import Address from ..data.basic_block import BasicBlock @@ -11,6 +12,10 @@ from ..configuration.configuration import OcGraphConfiguration, Disassembler +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)" + HEX_PATTERN + class Analyzer: """Analyzer Class""" @@ -73,10 +78,13 @@ def _compute_jump_targets(self): for instr in self.instructions: if ( instr.target is None or instr.target.abs is None - ) and self.configuration.architecture.is_direct_jump(instr): + ) and self.configuration.architecture.is_direct_branch(instr): if instr.target is None: instr.target = Address(0) - instr.target.abs = int(instr.ops[0], 16) + # parse the absolute target out of the operands + # (first hex address is assumed to be the target address) + print("direct branch without target: " + str(instr)) + instr.target.abs = int(re.search(rf"{HEX_LONG_PATTERN}", '|'.join(instr.ops))[0], 16) # Infer relative addresses (for objdump or stripped gdb) start_address = self.instructions[0].address.abs @@ -136,9 +144,10 @@ def _create_basic_blocks(self) -> None: # Current program counter pc_addr = instruction.address - # Optional jump target + 
# Get optional jump target jump_target = self.jump_table.get_target(pc_addr) - is_branch = self.configuration.architecture.is_branch(instruction) + print("jump target: " + str(jump_target)) + is_unconditional = self.configuration.architecture.is_unconditional_branch(instruction) # Start new blocks if last ended if curr_basic_block is None: @@ -160,12 +169,12 @@ def _create_basic_blocks(self) -> None: # End current block if current opcode is a jump/branch/sink if jump_target: - print(instruction) + print("has jump target: " + str(instruction)) curr_basic_block.add_jump_edge(jump_target.abs) - prev_branch_block = curr_basic_block if is_branch else None - block_completion = self.configuration.architecture.get_jump_delay(instruction) + prev_branch_block = None if is_unconditional else curr_basic_block + block_completion = self.configuration.architecture.get_branch_delay(instruction) elif self.configuration.architecture.is_sink(instruction): - block_completion = self.configuration.architecture.get_jump_delay(instruction) + block_completion = self.configuration.architecture.get_branch_delay(instruction) prev_branch_block = None if prev_branch_block is not None: From a1099a28e0aee3493b9c7644bcc9842d7f574c7a Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Wed, 25 Oct 2023 23:26:21 +0200 Subject: [PATCH 18/31] Make minor dotfile format changes --- ocgraph/interface/drawer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ocgraph/interface/drawer.py b/ocgraph/interface/drawer.py index 55c6654..d8a885a 100755 --- a/ocgraph/interface/drawer.py +++ b/ocgraph/interface/drawer.py @@ -2,6 +2,7 @@ """Class for drawing the output.""" import tempfile + from typing import Dict from graphviz import Digraph @@ -36,9 +37,9 @@ def _escape(text: str) -> str: Escape used dot graph characters in given instruction so they will be displayed correctly. 
""" - text = text.replace("<", r"[") - text = text.replace(">", r"]") - text = text.replace("\t", " ") + text = text.replace("<", r"<") + text = text.replace(">", r">") + text = text.replace("\t", " ") return text def _create_label(self, basic_block: BasicBlock, line_coverage=False): @@ -54,7 +55,7 @@ def _create_label(self, basic_block: BasicBlock, line_coverage=False): label += ( "" f'' - f"0x{instr.address.abs:x}: {Drawer._escape(text=instr.text)}" + f"0x{instr.address.abs:0>8x}: {instr.opcode:<10} {Drawer._escape(text=instr.text.removeprefix(instr.opcode).strip())}" "\n" ) if self.config.architecture.is_sink(instr): @@ -82,6 +83,9 @@ def _create_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock], coverage=F """Create a cgf""" dot = Digraph(name=name, comment=name, engine="dot") dot.attr("graph", label=name) + dot.attr("graph", fontname="Mono") + dot.attr("node", fontname="Mono") + dot.attr("edge", fontname="Mono") # Create nodes in graph for address, basic_block in basic_blocks.items(): From a73bd2d506a27de8af83772bfa7ec0b5152d5e03 Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Sun, 29 Oct 2023 22:07:53 +0100 Subject: [PATCH 19/31] Fix sparc functionality --- .../architecture/architecture.py | 4 +- ocgraph/configuration/architecture/ppc.py | 14 ++++-- ocgraph/configuration/architecture/sparc.py | 46 ++++++++++--------- .../disassembler/disassembler.py | 6 +++ .../configuration/disassembler/gdb_default.py | 3 ++ .../configuration/disassembler/objdump_ppc.py | 5 +- .../disassembler/objdump_sparc.py | 3 ++ ocgraph/interface/analyzer.py | 7 +-- 8 files changed, 54 insertions(+), 34 deletions(-) diff --git a/ocgraph/configuration/architecture/architecture.py b/ocgraph/configuration/architecture/architecture.py index ec0bdfd..cbf36ff 100755 --- a/ocgraph/configuration/architecture/architecture.py +++ b/ocgraph/configuration/architecture/architecture.py @@ -7,7 +7,7 @@ class Architecture(ABC): - """TargetInfo Class""" + """TargetInfo Class defining the 
target specific instruction set characteristics""" def __init__(self): pass @@ -38,7 +38,7 @@ def is_direct_branch(self, instruction: Instruction) -> bool: @abstractmethod def is_branch(self, instruction: Instruction) -> bool: - """Return if disassembled instruction is a conditional branch""" + """Return if disassembled instruction is a branch instruction (conditional or unconditional)""" raise NotImplementedError() @abstractmethod diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index fc1985e..58d9f21 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ b/ocgraph/configuration/architecture/ppc.py @@ -12,6 +12,14 @@ HEX_LONG_PATTERN = r'(?:0x0*)' + HEX_PATTERN # fmt: off +ppc_call_opcodes = [ + "bl", +] + +ppc_sink_opcodes = [ + "blr", +] + ppc_unconditional_branch_opcodes = [ "b", "ba", "bla", "bctr", "bctrl", "blrl", @@ -37,16 +45,16 @@ def comment(self): return "#" def is_call(self, instruction: Instruction): - return instruction.opcode == "bl" + return instruction.opcode in ppc_call_opcodes def is_branch(self, instruction: Instruction): - return instruction.opcode in ppc_conditional_branch_opcodes or ppc_unconditional_branch_opcodes and not self.is_call(instruction) + return instruction.opcode in (ppc_conditional_branch_opcodes + ppc_unconditional_branch_opcodes) and not self.is_call(instruction) def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode in ppc_unconditional_branch_opcodes def is_sink(self, instruction: Instruction): - return instruction.opcode == "blr" + return instruction.opcode in ppc_sink_opcodes def is_direct_branch(self, instruction: Instruction): return self.is_branch(instruction) and (re.search(rf"{HEX_LONG_PATTERN}", '|'.join(instruction.ops))) diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py index 7ff25eb..f52765d 100755 --- a/ocgraph/configuration/architecture/sparc.py +++ 
b/ocgraph/configuration/architecture/sparc.py @@ -6,6 +6,16 @@ # fmt: off +sparc_v8_call_opcodes = [ + "call", +] + +sparc_v8_sink_opcodes = [ + # ret: return from subroutine + # retl: return from leaf subroutine + "ret", "retl", +] + sparc_v8_Bicc_opcodes = [ # conditional icc branch opcodes "ba", "bn", "bne", "be", "bg", "ble", "bge", "bl", "bgu", "bleu", "bcc", @@ -25,7 +35,7 @@ ] sparc_v8_Ticc_opcodes = [ - # condictional traps on icc + # conditional traps on icc "ta", "tn", "tne", "te", "tg", "tle", "tge", "tl", "tgu", "tleu", "tcc", "tcs", "tpos", "tneg", "tvc", "tvs", ] @@ -37,27 +47,21 @@ sparc_v8_CBfcc_opcodes ] -sparc_v8_remaining_jump_opcodes = [ - "jmpl", "jmp", "b", # "call", "ret", retl not regarded currently +sparc_v8_unconditional_branch_opcodes = [ + "jmpl", "jmp", "b", ] sparc_v8_delayed_opcodes = sparc_v8_Bicc_opcodes + \ sparc_v8_FBfcc_opcodes + \ sparc_v8_CBfcc_opcodes + \ sparc_v8_branch_cond_delay_opcodes + \ - sparc_v8_remaining_jump_opcodes - -sparc_v8_branch_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_Ticc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes + \ - sparc_v8_remaining_jump_opcodes - -sparc_v8_unconditional_branch_opcodes = sparc_v8_Bicc_opcodes + \ - sparc_v8_FBfcc_opcodes + \ - sparc_v8_CBfcc_opcodes + \ - sparc_v8_branch_cond_delay_opcodes + sparc_v8_unconditional_branch_opcodes + +sparc_v8_conditional_branch_opcodes = sparc_v8_Bicc_opcodes + \ + sparc_v8_FBfcc_opcodes + \ + sparc_v8_CBfcc_opcodes + \ + sparc_v8_Ticc_opcodes + \ + sparc_v8_branch_cond_delay_opcodes # fmt: on @@ -68,10 +72,10 @@ def comment(self): return "!" 
def is_call(self, instruction: Instruction): - return instruction.opcode == "call" + return instruction.opcode in sparc_v8_call_opcodes def is_branch(self, instruction: Instruction): - return instruction.opcode in sparc_v8_branch_opcodes + return instruction.opcode in (sparc_v8_conditional_branch_opcodes + sparc_v8_unconditional_branch_opcodes) def get_branch_delay(self, instruction: Instruction) -> int | None: delay = None @@ -85,12 +89,10 @@ def get_branch_delay(self, instruction: Instruction) -> int | None: def is_direct_branch(self, instruction: Instruction): # every branch is disassembled with the complete offset - return self.is_branch(instruction) or self.is_unconditional_brach(instruction) + return self.is_branch(instruction) def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode in sparc_v8_unconditional_branch_opcodes def is_sink(self, instruction: Instruction): - # ret: return from subroutine - # retl: return from leaf subroutine - return instruction.opcode in ["ret", "retl"] + return instruction.opcode in sparc_v8_sink_opcodes diff --git a/ocgraph/configuration/disassembler/disassembler.py b/ocgraph/configuration/disassembler/disassembler.py index 7d2d560..73822e4 100755 --- a/ocgraph/configuration/disassembler/disassembler.py +++ b/ocgraph/configuration/disassembler/disassembler.py @@ -43,3 +43,9 @@ def parse_function_header(self, line: str) -> str | None: @abstractmethod def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: """Parses a single line of assembly to create Instruction instance""" + + @abstractmethod + def parse_jump_target(self, str_input: str) -> int | None: + """Parses a string (e.g., coma separated operands) and returns + the jump target value + """ diff --git a/ocgraph/configuration/disassembler/gdb_default.py b/ocgraph/configuration/disassembler/gdb_default.py index ef4ac80..7d54096 100755 --- a/ocgraph/configuration/disassembler/gdb_default.py +++ 
b/ocgraph/configuration/disassembler/gdb_default.py @@ -212,3 +212,6 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non ops, target, ) + + def parse_jump_target(self, str_input: str) -> int | None: + return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py b/ocgraph/configuration/disassembler/objdump_ppc.py index 0762d58..cba7b4d 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -149,7 +149,6 @@ def parse_target(self, line: str) -> (Address, str): target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) if target_match is None: return None, line - print("target parse" + str(target_match)) offset = target_match[2] or "+0" address = Address(None, target_match[1], int(offset, 0)) return address, target_match[3] @@ -222,3 +221,7 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non ) return instruction + + def parse_jump_target(self, str_input: str) -> int | None: + return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) + diff --git a/ocgraph/configuration/disassembler/objdump_sparc.py b/ocgraph/configuration/disassembler/objdump_sparc.py index 47ca645..c282b2c 100755 --- a/ocgraph/configuration/disassembler/objdump_sparc.py +++ b/ocgraph/configuration/disassembler/objdump_sparc.py @@ -219,3 +219,6 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non ops, target, ) + + def parse_jump_target(self, str_input: str) -> int | None: + return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index 617a577..273701c 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -2,7 +2,6 @@ """Class for read and analyze the input string.""" import sys -import re from ..data.address import Address from 
..data.basic_block import BasicBlock @@ -12,10 +11,6 @@ from ..configuration.configuration import OcGraphConfiguration, Disassembler -# Common regexes -HEX_PATTERN = r"[0-9a-fA-F]+" -HEX_LONG_PATTERN = r"(?:0x0*)" + HEX_PATTERN - class Analyzer: """Analyzer Class""" @@ -84,7 +79,7 @@ def _compute_jump_targets(self): # parse the absolute target out of the operands # (first hex address is assumed to be the target address) print("direct branch without target: " + str(instr)) - instr.target.abs = int(re.search(rf"{HEX_LONG_PATTERN}", '|'.join(instr.ops))[0], 16) + instr.target.abs = self.parser.parse_jump_target('|'.join(instr.ops)) # Infer relative addresses (for objdump or stripped gdb) start_address = self.instructions[0].address.abs From e9264ce1a45e45c6767b2e558e5bde2ea056e17a Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Sun, 29 Oct 2023 22:22:49 +0100 Subject: [PATCH 20/31] Remove debug prints --- ocgraph/configuration/configuration.py | 2 +- ocgraph/interface/analyzer.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index 6ba06ff..cf03065 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -58,7 +58,7 @@ "console_level": logging.ERROR, }, "default": { - "file_log": "asm2fg.log", + "file_log": "asm2cfg.log", "file_level": logging.INFO, "console_log": True, "console_level": logging.INFO, diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index 273701c..f98e05a 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -78,7 +78,6 @@ def _compute_jump_targets(self): instr.target = Address(0) # parse the absolute target out of the operands # (first hex address is assumed to be the target address) - print("direct branch without target: " + str(instr)) instr.target.abs = self.parser.parse_jump_target('|'.join(instr.ops)) # Infer relative addresses (for objdump or 
stripped gdb) @@ -141,7 +140,6 @@ def _create_basic_blocks(self) -> None: pc_addr = instruction.address # Get optional jump target jump_target = self.jump_table.get_target(pc_addr) - print("jump target: " + str(jump_target)) is_unconditional = self.configuration.architecture.is_unconditional_branch(instruction) # Start new blocks if last ended @@ -164,7 +162,6 @@ def _create_basic_blocks(self) -> None: # End current block if current opcode is a jump/branch/sink if jump_target: - print("has jump target: " + str(instruction)) curr_basic_block.add_jump_edge(jump_target.abs) prev_branch_block = None if is_unconditional else curr_basic_block block_completion = self.configuration.architecture.get_branch_delay(instruction) From f3be79fa4ad1b789023132010a983b327e35dcd4 Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Sun, 29 Oct 2023 22:30:21 +0100 Subject: [PATCH 21/31] Fix output when no return target available --- ocgraph/interface/drawer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocgraph/interface/drawer.py b/ocgraph/interface/drawer.py index d8a885a..1ae02ff 100755 --- a/ocgraph/interface/drawer.py +++ b/ocgraph/interface/drawer.py @@ -75,7 +75,7 @@ def _create_label(self, basic_block: BasicBlock, line_coverage=False): if basic_block.jump_edge: label += f'JUMP' if not basic_block.jump_edge and not basic_block.no_jump_edge: - label += f'RETURN targets: {str(returns)}' + label += f'RETURN targets: {str(returns) if returns else "--"}' label += " \n >" return label From d69d72678a3795d0374946376bc905b7d147b2ad Mon Sep 17 00:00:00 2001 From: Andoni Arregi Date: Fri, 3 Nov 2023 15:23:53 +0100 Subject: [PATCH 22/31] Fix for PPC using binutils objdump --- ocgraph/configuration/architecture/ppc.py | 666 +++++++++++++++++- .../configuration/disassembler/objdump_ppc.py | 8 +- .../disassembler/objdump_sparc.py | 5 +- ocgraph/interface/analyzer.py | 2 +- 4 files changed, 666 insertions(+), 15 deletions(-) diff --git
a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index 58d9f21..6ec15c3 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ b/ocgraph/configuration/architecture/ppc.py @@ -9,7 +9,7 @@ # Common regexes HEX_PATTERN = r'[0-9a-fA-F]+' -HEX_LONG_PATTERN = r'(?:0x0*)' + HEX_PATTERN +HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN # fmt: off ppc_call_opcodes = [ @@ -26,14 +26,662 @@ ] ppc_conditional_branch_opcodes = [ - "bc", "bt", "bf", "bdnz", "bdnzt", "bdnzf", "bdz", "bdzt", "bdzf", - "bca", "bta", "bfa", "bdnza", "bdnzta", "bdnzfa", "bdza", "bdzta", "bdzfa", - "bcl", "btl", "bfl", "bdnzl", "bdnztl", "bdnzfl", "bdzl", "bdztl", "bdzfl", - "bcla", "bdtla", "bdfla", "bdnzla", "bdnztla", "bdnzfla", "bdzla", "bdztla", "bdzfla", - "bclr", "btlr", "bflr", "bdnzlr", "bdnztlr", "bdnzflr", "bdzlr", "bdztlr", "bdzflr", - "bclrl", "btlrl", "bflrl", "bdnzlrl", "bdnztlrl", "bdnzflrl", "bdzlrl", "bdztlrl", "bdzflrl", - "bcctr", "btctr", "bfctr", - "bcctrl", "btctrl", "bfctrl", + "bdnz-", + "bdnz+", + "bdnz", + "bdn", + "bdnzl-", + "bdnzl+", + "bdnzl", + "bdnl", + "bdnza-", + "bdnza+", + "bdnza", + "bdna", + "bdnzla-", + "bdnzla+", + "bdnzla", + "bdnla", + "bdz-", + "bdz+", + "bdz", + "bdzl-", + "bdzl+", + "bdzl", + "bdza-", + "bdza+", + "bdza", + "bdzla-", + "bdzla+", + "bdzla", + + "bge-", + "bge+", + "bge", + "bnl-", + "bnl+", + "bnl", + "bgel-", + "bgel+", + "bgel", + "bnll-", + "bnll+", + "bnll", + "bgea-", + "bgea+", + "bgea", + "bnla-", + "bnla+", + "bnla", + "bgela-", + "bgela+", + "bgela", + "bnlla-", + "bnlla+", + "bnlla", + "ble-", + "ble+", + "ble", + "bng-", + "bng+", + "bng", + "blel-", + "blel+", + "blel", + "bngl-", + "bngl+", + "bngl", + "blea-", + "blea+", + "blea", + "bnga-", + "bnga+", + "bnga", + "blela-", + "blela+", + "blela", + "bngla-", + "bngla+", + "bngla", + "bne-", + "bne+", + "bne", + "bnel-", + "bnel+", + "bnel", + "bnea-", + "bnea+", + "bnea", + "bnela-", + "bnela+", + "bnela", + "bns-", + "bns+", + "bns", + 
"bnu-", + "bnu+", + "bnu", + "bnsl-", + "bnsl+", + "bnsl", + "bnul-", + "bnul+", + "bnul", + "bnsa-", + "bnsa+", + "bnsa", + "bnua-", + "bnua+", + "bnua", + "bnsla-", + "bnsla+", + "bnsla", + "bnula-", + "bnula+", + "bnula", + + "blt-", + "blt+", + "blt", + "bltl-", + "bltl+", + "bltl", + "blta-", + "blta+", + "blta", + "bltla-", + "bltla+", + "bltla", + "bgt-", + "bgt+", + "bgt", + "bgtl-", + "bgtl+", + "bgtl", + "bgta-", + "bgta+", + "bgta", + "bgtla-", + "bgtla+", + "bgtla", + "beq-", + "beq+", + "beq", + "beql-", + "beql+", + "beql", + "beqa-", + "beqa+", + "beqa", + "beqla-", + "beqla+", + "beqla", + "bso-", + "bso+", + "bso", + "bun-", + "bun+", + "bun", + "bsol-", + "bsol+", + "bsol", + "bunl-", + "bunl+", + "bunl", + "bsoa-", + "bsoa+", + "bsoa", + "buna-", + "buna+", + "buna", + "bsola-", + "bsola+", + "bsola", + "bunla-", + "bunla+", + "bunla", + + "bdnzf-", + "bdnzf+", + "bdnzf", + "bdnzfl-", + "bdnzfl+", + "bdnzfl", + "bdnzfa-", + "bdnzfa+", + "bdnzfa", + "bdnzfla-", + "bdnzfla+", + "bdnzfla", + "bdzf-", + "bdzf+", + "bdzf", + "bdzfl-", + "bdzfl+", + "bdzfl", + "bdzfa-", + "bdzfa+", + "bdzfa", + "bdzfla-", + "bdzfla+", + "bdzfla", + + "bf-", + "bf+", + "bf", + "bbf", + "bfl-", + "bfl+", + "bfl", + "bbfl", + "bfa-", + "bfa+", + "bfa", + "bbfa", + "bfla-", + "bfla+", + "bfla", + "bbfla", + + "bdnzt-", + "bdnzt+", + "bdnzt", + "bdnztl-", + "bdnztl+", + "bdnztl", + "bdnzta-", + "bdnzta+", + "bdnzta", + "bdnztla-", + "bdnztla+", + "bdnztla", + "bdzt-", + "bdzt+", + "bdzt", + "bdztl-", + "bdztl+", + "bdztl", + "bdzta-", + "bdzta+", + "bdzta", + "bdztla-", + "bdztla+", + "bdztla", + + "bt-", + "bt+", + "bt", + "bbt", + "btl-", + "btl+", + "btl", + "bbtl", + "bta-", + "bta+", + "bta", + "bbta", + "btla-", + "btla+", + "btla", + "bbtla", + + "bc-", + "bc+", + "bc", + "bcl-", + "bcl+", + "bcl", + "bca-", + "bca+", + "bca", + "bcla-", + "bcla+", + "bcla", + + "bdnzlr", + "bdnzlr-", + "bdnzlrl", + "bdnzlrl-", + "bdnzlr+", + "bdnzlrl+", + "bdzlr", + "bdzlr-", + 
"bdzlrl", + "bdzlrl-", + "bdzlr+", + "bdzlrl+", + #"blr", + "br", + #"blrl", + "brl", + "bdnzlr-", + "bdnzlrl-", + "bdnzlr+", + "bdnzlrl+", + "bdzlr-", + "bdzlrl-", + "bdzlr+", + "bdzlrl+", + + "bgelr", + "bgelr-", + "bger", + "bnllr", + "bnllr-", + "bnlr", + "bgelrl", + "bgelrl-", + "bgerl", + "bnllrl", + "bnllrl-", + "bnlrl", + "blelr", + "blelr-", + "bler", + "bnglr", + "bnglr-", + "bngr", + "blelrl", + "blelrl-", + "blerl", + "bnglrl", + "bnglrl-", + "bngrl", + "bnelr", + "bnelr-", + "bner", + "bnelrl", + "bnelrl-", + "bnerl", + "bnslr", + "bnslr-", + "bnsr", + "bnulr", + "bnulr-", + "bnslrl", + "bnslrl-", + "bnsrl", + "bnulrl", + "bnulrl-", + "bgelr+", + "bnllr+", + "bgelrl+", + "bnllrl+", + "blelr+", + "bnglr+", + "blelrl+", + "bnglrl+", + "bnelr+", + "bnelrl+", + "bnslr+", + "bnulr+", + "bnslrl+", + "bnulrl+", + "bgelr-", + "bnllr-", + "bgelrl-", + "bnllrl-", + "blelr-", + "bnglr-", + "blelrl-", + "bnglrl-", + "bnelr-", + "bnelrl-", + "bnslr-", + "bnulr-", + "bnslrl-", + "bnulrl-", + "bgelr+", + "bnllr+", + "bgelrl+", + "bnllrl+", + "blelr+", + "bnglr+", + "blelrl+", + "bnglrl+", + "bnelr+", + "bnelrl+", + "bnslr+", + "bnulr+", + "bnslrl+", + "bnulrl+", + "bltlr", + "bltlr-", + "bltr", + "bltlrl", + "bltlrl-", + "bltrl", + "bgtlr", + "bgtlr-", + "bgtr", + "bgtlrl", + "bgtlrl-", + "bgtrl", + "beqlr", + "beqlr-", + "beqr", + "beqlrl", + "beqlrl-", + "beqrl", + "bsolr", + "bsolr-", + "bsor", + "bunlr", + "bunlr-", + "bsolrl", + "bsolrl-", + "bsorl", + "bunlrl", + "bunlrl-", + "bltlr+", + "bltlrl+", + "bgtlr+", + "bgtlrl+", + "beqlr+", + "beqlrl+", + "bsolr+", + "bunlr+", + "bsolrl+", + "bunlrl+", + "bltlr-", + "bltlrl-", + "bgtlr-", + "bgtlrl-", + "beqlr-", + "beqlrl-", + "bsolr-", + "bunlr-", + "bsolrl-", + "bunlrl-", + "bltlr+", + "bltlrl+", + "bgtlr+", + "bgtlrl+", + "beqlr+", + "beqlrl+", + "bsolr+", + "bunlr+", + "bsolrl+", + "bunlrl+", + + "bdnzflr", + "bdnzflr-", + "bdnzflrl", + "bdnzflrl-", + "bdnzflr+", + "bdnzflrl+", + "bdzflr", + "bdzflr-", + 
"bdzflrl", + "bdzflrl-", + "bdzflr+", + "bdzflrl+", + "bflr", + "bflr-", + "bbfr", + "bflrl", + "bflrl-", + "bbfrl", + "bflr+", + "bflrl+", + "bflr-", + "bflrl-", + "bflr+", + "bflrl+", + "bdnztlr", + "bdnztlr-", + "bdnztlrl", + "bdnztlrl-", + "bdnztlr+", + "bdnztlrl+", + "bdztlr", + "bdztlr-", + "bdztlrl", + "bdztlrl-", + "bdztlr+", + "bdztlrl+", + "btlr", + "btlr-", + "bbtr", + "btlrl", + "btlrl-", + "bbtrl", + "btlr+", + "btlrl+", + "btlr-", + "btlrl-", + "btlr+", + "btlrl+", + + "bclr-", + "bclrl-", + "bclr+", + "bclrl+", + "bclr", + "bcr", + "bclrl", + "bcrl", + + #"bctr", + #"bctrl", + + "bgectr", + "bgectr-", + "bnlctr", + "bnlctr-", + "bgectrl", + "bgectrl-", + "bnlctrl", + "bnlctrl-", + "blectr", + "blectr-", + "bngctr", + "bngctr-", + "blectrl", + "blectrl-", + "bngctrl", + "bngctrl-", + "bnectr", + "bnectr-", + "bnectrl", + "bnectrl-", + "bnsctr", + "bnsctr-", + "bnuctr", + "bnuctr-", + "bnsctrl", + "bnsctrl-", + "bnuctrl", + "bnuctrl-", + "bgectr+", + "bnlctr+", + "bgectrl+", + "bnlctrl+", + "blectr+", + "bngctr+", + "blectrl+", + "bngctrl+", + "bnectr+", + "bnectrl+", + "bnsctr+", + "bnuctr+", + "bnsctrl+", + "bnuctrl+", + "bgectr-", + "bnlctr-", + "bgectrl-", + "bnlctrl-", + "blectr-", + "bngctr-", + "blectrl-", + "bngctrl-", + "bnectr-", + "bnectrl-", + "bnsctr-", + "bnuctr-", + "bnsctrl-", + "bnuctrl-", + "bgectr+", + "bnlctr+", + "bgectrl+", + "bnlctrl+", + "blectr+", + "bngctr+", + "blectrl+", + "bngctrl+", + "bnectr+", + "bnectrl+", + "bnsctr+", + "bnuctr+", + "bnsctrl+", + "bnuctrl+", + "bltctr", + "bltctr-", + "bltctrl", + "bltctrl-", + "bgtctr", + "bgtctr-", + "bgtctrl", + "bgtctrl-", + "beqctr", + "beqctr-", + "beqctrl", + "beqctrl-", + "bsoctr", + "bsoctr-", + "bunctr", + "bunctr-", + "bsoctrl", + "bsoctrl-", + "bunctrl", + "bunctrl-", + "bltctr+", + "bltctrl+", + "bgtctr+", + "bgtctrl+", + "beqctr+", + "beqctrl+", + "bsoctr+", + "bunctr+", + "bsoctrl+", + "bunctrl+", + "bltctr-", + "bltctrl-", + "bgtctr-", + "bgtctrl-", + "beqctr-", + 
"beqctrl-", + "bsoctr-", + "bunctr-", + "bsoctrl-", + "bunctrl-", + "bltctr+", + "bltctrl+", + "bgtctr+", + "bgtctrl+", + "beqctr+", + "beqctrl+", + "bsoctr+", + "bunctr+", + "bsoctrl+", + "bunctrl+", + + "bfctr", + "bfctr-", + "bfctrl", + "bfctrl-", + "bfctr+", + "bfctrl+", + "bfctr-", + "bfctrl-", + "bfctr+", + "bfctrl+", + "btctr", + "btctr-", + "btctrl", + "btctrl-", + "btctr+", + "btctrl+", + "btctr-", + "btctrl-", + "btctr+", + "btctrl+", + + "bcctr-", + "bcctrl-", + "bcctr+", + "bcctrl+", + "bcctr", + "bcc", + "bcctrl", + "bccl", + + "bctar-", + "bctarl-", + "bctar+", + "bctarl+", + "bctar", + "bctarl", ] # fmt: on diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py b/ocgraph/configuration/disassembler/objdump_ppc.py index cba7b4d..27417ca 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -12,7 +12,7 @@ # Common regexes HEX_PATTERN = r"[0-9a-fA-F]+" -HEX_LONG_PATTERN = r"(?:0x0*)" + HEX_PATTERN +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN class ObjDumpPpcDisassembler(Disassembler): @@ -222,6 +222,8 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non return instruction - def parse_jump_target(self, str_input: str) -> int | None: - return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) + def parse_jump_target(self, ops: List[str]) -> int | None: + # it assumes the last operand of the branch to be the target address + return int(ops.pop(), 16) + diff --git a/ocgraph/configuration/disassembler/objdump_sparc.py b/ocgraph/configuration/disassembler/objdump_sparc.py index c282b2c..d0847a7 100755 --- a/ocgraph/configuration/disassembler/objdump_sparc.py +++ b/ocgraph/configuration/disassembler/objdump_sparc.py @@ -220,5 +220,6 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non target, ) - def parse_jump_target(self, str_input: str) -> int | None: - return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) + def parse_jump_target(self, ops: List[str]) -> int | None: + # it assumes the first operand to contain the target address + return int(ops.pop(), 16) diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index f98e05a..7d436b7 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -78,7 +78,7 @@ def _compute_jump_targets(self): instr.target = Address(0) # parse the absolute target out of the operands # (first hex address is assumed to be the target address) - instr.target.abs = self.parser.parse_jump_target('|'.join(instr.ops)) + instr.target.abs = self.parser.parse_jump_target(instr.ops) # Infer relative addresses (for objdump or stripped gdb) start_address = self.instructions[0].address.abs From cdfbd386b7a50d4df200d0c83cd688b762fcc8a3 Mon Sep 17 00:00:00 2001 From: Jakov Zauzolkov Date: Sat, 4 Nov 2023 17:13:44 +0100 Subject: [PATCH 23/31] Adjust ppc/sparc objdump --- ocgraph/configuration/disassembler/objdump_ppc.py | 2 +- 
ocgraph/configuration/disassembler/objdump_sparc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py b/ocgraph/configuration/disassembler/objdump_ppc.py index 27417ca..eaae1df 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -224,6 +224,6 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non def parse_jump_target(self, ops: List[str]) -> int | None: # it assumes the last operand of the branch to be the target address - return int(ops.pop(), 16) + return int(ops[-1], 16) diff --git a/ocgraph/configuration/disassembler/objdump_sparc.py b/ocgraph/configuration/disassembler/objdump_sparc.py index d0847a7..a9d7437 100755 --- a/ocgraph/configuration/disassembler/objdump_sparc.py +++ b/ocgraph/configuration/disassembler/objdump_sparc.py @@ -222,4 +222,4 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non def parse_jump_target(self, ops: List[str]) -> int | None: # it assumes the first operand to contain the target address - return int(ops.pop(), 16) + return int(ops[-1], 16) From 96475178bcc1f384c766a46bbd9ed3f305d61ed0 Mon Sep 17 00:00:00 2001 From: Jakov Zauzolkov Date: Thu, 7 Dec 2023 18:33:24 +0100 Subject: [PATCH 24/31] Adjust configuration.py for x86, Bugfix sparc --- ocgraph/configuration/architecture/sparc.py | 2 +- ocgraph/configuration/configuration.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py index f52765d..f99c6ec 100755 --- a/ocgraph/configuration/architecture/sparc.py +++ b/ocgraph/configuration/architecture/sparc.py @@ -48,7 +48,7 @@ ] sparc_v8_unconditional_branch_opcodes = [ - "jmpl", "jmp", "b", + "jmpl", "jmp", "b", "b,a" ] sparc_v8_delayed_opcodes = sparc_v8_Bicc_opcodes + \ diff --git a/ocgraph/configuration/configuration.py 
b/ocgraph/configuration/configuration.py index cf03065..04a6c20 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -12,16 +12,19 @@ from .disassembler.objdump_sparc import ObjDumpSparcDisassembler from .disassembler.objdump_ppc import ObjDumpPpcDisassembler from .disassembler.gdb_default import GdbDisassembler +from .disassembler.objdump_x86 import ObjDumpx86Disassembler # fmt: off disassembler_option: dict[str, dict] = { "OBJDUMP": { "sparc": ObjDumpSparcDisassembler(), "ppc": ObjDumpPpcDisassembler(), + "x86": ObjDumpx86Disassembler(), }, "GDB": { "sparc": GdbDisassembler(), "ppc": GdbDisassembler(), + "x86": GdbDisassembler(), }, } From a3bc5f53635411b37d7f82bef2c36cadb0ba0908 Mon Sep 17 00:00:00 2001 From: Jakov Zauzolkov Date: Fri, 8 Dec 2023 16:47:44 +0100 Subject: [PATCH 25/31] Add x86_objdump file --- .../configuration/disassembler/objdump_x86.py | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 ocgraph/configuration/disassembler/objdump_x86.py diff --git a/ocgraph/configuration/disassembler/objdump_x86.py b/ocgraph/configuration/disassembler/objdump_x86.py new file mode 100644 index 0000000..9fb3227 --- /dev/null +++ b/ocgraph/configuration/disassembler/objdump_x86.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +"""Class for parsing the input""" + +import re +from typing import List + +from ...data.address import Address +from ...data.encoding import Encoding +from ...data.instruction import Instruction + +from .disassembler import Disassembler, DisassemblerError + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN + + +class ObjDumpx86Disassembler(Disassembler): + """x86 objdump disassembler""" + + name: str = "x86 Disassembler (x86 Binutils)" + + # Expected format: <>: + regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" + regex_information: dict[str, int] = { + "address": 1, + "location": 3, + "instruction_hex": 4, + "instruction_str": 5, + } + + def extract_information(self, str_input: str) -> dict[str, str]: + result = {} + + if "=> " in str_input: + extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0] + + information = re.search(self.regex, extracted_line) + if not information: + raise DisassemblerError("Line not processable: \n" + str(extracted_line)) + + address = str(information.group(self.regex_information.get("address"))) + location = str(information.group(self.regex_information.get("location"))) + instr_d = str(information.group(self.regex_information.get("instruction_str"))) + instr_h = str(information.group(self.regex_information.get("instruction_hex"))) + opcode = (instr_d.split()[0],) + + result = { + "address": address, + "location": location, + "instr_h": instr_h, + "instr_d": instr_d, + "opcode": opcode, + "printable": extracted_line, + } + else: + raise DisassemblerError("Line not processable: \n" + str(str_input)) + + return result + + def parse_function_header(self, line: str) -> str | None: + """ + Return function name of memory range from the given string line. 
+ + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for objdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + objdump_name_pattern = re.compile(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:") + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + function_name_pattern = re.compile(r"function (\w+):$") + function_name = function_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + memory_range_pattern = re.compile( + rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$" + ) + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return f"{memory_range[1]}-{memory_range[2]}" + + return None + + def parse_address(self, line: str) -> (Address, str): + """ + Parses leading address of instruction + """ + address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line) + if address_match is None: + return None, line + address = Address( + int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None + ) + return address, address_match[3] + + def split_nth(self, string: str, count: int) -> List[str]: + """ + Splits string to equally-sized chunks + """ + return [string[i : i + count] for i in range(0, len(string), count)] + + def parse_encoding(self, line): + """ + Parses byte encoding of instruction for objdump disassemblies + e.g. the '31 c0' in + '16bd3: 31 c0 xor %eax,%eax' + In addition to X86 supports ARM encoding styles: + '4: e1a01000 mov r1, r0' + '50: f7ff fffe bl 0 <__aeabi_dadd>' + '54: 0002 movs r2, r0' + """ + # Encoding is separated from assembly mnemonic via tab + # so we allow whitespace separators between bytes + # to avoid accidentally matching the mnemonic. 
+ enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + if enc_match is None: + return None, line + bites = [] + for chunk in enc_match[1].strip().split(" "): + bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2)) + return Encoding(bites), enc_match[2] + + def parse_body(self, line: str) -> (str, str, List[str], str): + """Parses instruction body (opcode and operands)""" + comment_symbol = self.architecture.comment() + body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line) + if body_match is None: + return None, None, None, line + body = body_match[1].strip() + line = body_match[2] + opcode_match = re.match(r"^(\S*)\s*(.*)", body) + if opcode_match is None: + return None, None, None, line + opcode = opcode_match[1] + ops = opcode_match[2].split(",") if opcode_match[2] else [] + return body, opcode, ops, line + + def parse_target(self, line: str) -> (Address, str): + """ + Parses optional instruction branch target hint + """ + target_match = re.match(r"\s*<([a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) + if target_match is None: + return None, line + offset = target_match[2] or "+0" + address = Address(None, target_match[1], int(offset, 0)) + return address, target_match[3] + + def parse_comment(self, line: str) -> (Address, str): + """ + Parses optional instruction comment + """ + comment_symbol = self.architecture.comment() + comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line) + if comment_match is None: + return None, line + comment = comment_match[1] + imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment) + if imm_match is None: + # If no imm was found, ignore the comment. 
+ # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """ + Parses a single line of assembly to create Instruction instance + """ + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + encoding, line = self.parse_encoding(line) + if not line: + return encoding + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) + + def parse_jump_target(self, ops: List[str]) -> int | None: + # it assumes the first operand to contain the target address + return int(ops[-1], 16) From 230bc5559e5ace5a041f68b14c34081c6bf722c9 Mon Sep 17 00:00:00 2001 From: Jakov Zauzolkov Date: Fri, 15 Dec 2023 15:41:30 +0100 Subject: [PATCH 26/31] add objdump_arm.py, adjust configuration.py for arm --- ocgraph/configuration/configuration.py | 3 + .../configuration/disassembler/objdump_arm.py | 225 ++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 ocgraph/configuration/disassembler/objdump_arm.py diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index 04a6c20..0d6f83b 100755 --- 
a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -13,6 +13,7 @@ from .disassembler.objdump_ppc import ObjDumpPpcDisassembler from .disassembler.gdb_default import GdbDisassembler from .disassembler.objdump_x86 import ObjDumpx86Disassembler +from .disassembler.objdump_arm import ObjDumpArmDisassembler # fmt: off disassembler_option: dict[str, dict] = { @@ -20,11 +21,13 @@ "sparc": ObjDumpSparcDisassembler(), "ppc": ObjDumpPpcDisassembler(), "x86": ObjDumpx86Disassembler(), + "arm": ObjDumpArmDisassembler(), }, "GDB": { "sparc": GdbDisassembler(), "ppc": GdbDisassembler(), "x86": GdbDisassembler(), + "arm": GdbDisassembler(), }, } diff --git a/ocgraph/configuration/disassembler/objdump_arm.py b/ocgraph/configuration/disassembler/objdump_arm.py new file mode 100644 index 0000000..a486d11 --- /dev/null +++ b/ocgraph/configuration/disassembler/objdump_arm.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +"""Class for parsing the objdump ARM input""" + +import re +from typing import List + +from ...data.address import Address +from ...data.encoding import Encoding +from ...data.instruction import Instruction + +from .disassembler import Disassembler, DisassemblerError + +# Common regexes +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" 
+ HEX_PATTERN + + +class ObjDumpArmDisassembler(Disassembler): + """Objdump ARM disassembler""" + + name: str = "ARM Objdump Disassembler" + + # Expected format: <>: + regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)" + regex_information: dict[str, int] = { + "address": 1, + "location": 3, + "instruction_hex": 4, + "instruction_str": 5, + } + + def extract_information(self, str_input: str) -> dict[str, str]: + result = {} + + if "=> " in str_input: + extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0] + + information = re.search(self.regex, extracted_line) + if not information: + raise DisassemblerError("Line not processable: \n" + str(extracted_line)) + + address = str(information.group(self.regex_information.get("address"))) + location = str(information.group(self.regex_information.get("location"))) + instr_d = str(information.group(self.regex_information.get("instruction_str"))) + instr_h = str(information.group(self.regex_information.get("instruction_hex"))) + opcode = (instr_d.split()[0],) + + result = { + "address": address, + "location": location, + "instr_h": instr_h, + "instr_d": instr_d, + "opcode": opcode, + "printable": extracted_line, + } + else: + raise DisassemblerError("Line not processable: \n" + str(str_input)) + + return result + + def parse_function_header(self, line: str) -> str | None: + """ + Return function name of memory range from the given string line. 
+ + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for objdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + objdump_name_pattern = re.compile(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:") + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + function_name_pattern = re.compile(r"function (\w+):$") + function_name = function_name_pattern.search(line) + if function_name is not None: + return function_name[1] + + memory_range_pattern = re.compile( + rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$" + ) + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return f"{memory_range[1]}-{memory_range[2]}" + + return None + + def parse_address(self, line: str) -> (Address, str): + """ + Parses leading address of instruction + """ + address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line) + if address_match is None: + return None, line + address = Address( + int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None + ) + return address, address_match[3] + + def split_nth(self, string: str, count: int) -> List[str]: + """ + Splits string to equally-sized chunks + """ + return [string[i : i + count] for i in range(0, len(string), count)] + + def parse_encoding(self, line): + """ + Parses byte encoding of instruction for objdump disassemblies + e.g. the '31 c0' in + '16bd3: 31 c0 xor %eax,%eax' + In addition to X86 supports ARM encoding styles: + '4: e1a01000 mov r1, r0' + '50: f7ff fffe bl 0 <__aeabi_dadd>' + '54: 0002 movs r2, r0' + """ + # Encoding is separated from assembly mnemonic via tab + # so we allow whitespace separators between bytes + # to avoid accidentally matching the mnemonic. 
+ enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line) + if enc_match is None: + return None, line + bites = [] + for chunk in enc_match[1].strip().split(" "): + bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2)) + return Encoding(bites), enc_match[2] + + def parse_body(self, line: str) -> (str, str, List[str], str): + """Parses instruction body (opcode and operands)""" + comment_symbol = self.architecture.comment() + body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line) + if body_match is None: + return None, None, None, line + body = body_match[1].strip() + line = body_match[2] + opcode_match = re.match(r"^(\S*)\s*(.*)", body) + if opcode_match is None: + return None, None, None, line + opcode = opcode_match[1] + ops = opcode_match[2].split(",") if opcode_match[2] else [] + return body, opcode, ops, line + + def parse_target(self, line: str) -> (Address, str): + """ + Parses optional instruction branch target hint + """ + target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line) + if target_match is None: + return None, line + offset = target_match[2] or "+0" + address = Address(None, target_match[1], int(offset, 0)) + return address, target_match[3] + + def parse_comment(self, line: str) -> (Address, str): + """ + Parses optional instruction comment + """ + comment_symbol = self.architecture.comment() + comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line) + if comment_match is None: + return None, line + comment = comment_match[1] + imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment) + if imm_match is None: + # If no imm was found, ignore the comment. 
+ # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """ + Parses a single line of assembly to create Instruction instance + """ + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + encoding, line = self.parse_encoding(line) + if not line: + return encoding + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) + + def parse_jump_target(self, ops: List[str]) -> int | None: + # it assumes the first operand to contain the target address + return int(ops[-1], 16) From fbf94500dcb201e4f948c8ba147b6b98e1642dab Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 21 Feb 2024 12:21:57 +0100 Subject: [PATCH 27/31] Documentation: move into separate folder --- README.md | 464 ++---------------------------------------- doc/1_Installation.md | 7 + doc/2_HowToRun.md | 41 ++++ doc/3_Development.md | 53 +++++ doc/4_Github.md | 354 ++++++++++++++++++++++++++++++++ 5 files changed, 476 insertions(+), 443 deletions(-) create mode 100644 doc/1_Installation.md create mode 100644 doc/2_HowToRun.md create mode 100644 doc/3_Development.md create 
mode 100644 doc/4_Github.md diff --git a/README.md b/README.md index eab295b..2602e52 100644 --- a/README.md +++ b/README.md @@ -1,450 +1,28 @@ -## Object Code Graph (ocgraph) +# Object Code Graph (ocgraph) -## How to Run +This repo contains *asm2cfg*, (**Assembler to control-flow-graph**), a tool to +read disassembler output and optional corresponding coverage data to produce +control flow graphs including coverage analysis on assembler level. -Custom python script: +The *asm2cfg* source code implemented by GTD GmbH is forked from the asm2cfg +repository on [Github](https://github.com/Kazhuu/asm2cfg) and reworked to +function with several architectures and disassembler. -```python -from ocgraph.interface.analyzer import Analyzer -from ocgraph.interface.drawer import Drawer -from ocgraph.interface.coverage_reader import CoverageReader +The tool source code is subject to the MIT License as indicated by the headers +of the corresponding source code files. -from ocgraph.coverage_tracer import CoverageTracer -from ocgraph.configuration.configuration import CovTraceConfiguration +[!WARNING] +> **WARNING**: +> +> **This tool is currently under development and in beta state It's not intended +> to be complete and using is on your own risk.** -# Create configuration -config = OcGraphConfiguration(disassembler="objdump", arch="sparc") +## Documentation -# Read input text -lines = read_lines("a.out") +Due to the lack of a full documentation, all previous information can be found +in separate *Markdown* files in the *doc* folder: -# Analyze input text -analyser = Analyzer(config=config) -analyser.parse_lines(lines=lines) - -# Update analyzed input with coverage data -cov_reader = CoverageReader(instructions=analyser.instructions config=config) -cov_reader.update_by_csv(args.coverage) - -drawer = Drawer(analyser.configuration) -drawer.draw_cfg(name=analyser.function_name, basic_blocks=analyser.basic_blocks, output="a.pdf") -``` - -As python module: - -```cmd -python3 -m 
ocgraph -f a.out -d objdump -a sparc -c cov.csv -o a.pdf -``` - -As command line script: - -```cmd -./asm2cfg -f a.out -d objdump -a sparc -c cov.csv -o a.pdf -``` - - -## Design - - -```mermaid ---- -title: OcGraph design ---- -classDiagram - - class Configuration { - __init__(arch, disassembler, logging): - +dict disassembler_option - +dict architecture_option - +dict preset_logging - } - class Disassembler { - Name - parse_line() - ...() - } - class Architecture { - is_branch() - ...() - } - class Logger { Name } - - Configuration --* Disassembler - Configuration --* Architecture - Configuration --* Logger - - class Analyzer { - __init__(config) - parse_file(file_path): basic_blocks - } - class CoverageReader { - __init__(basic_blocks, config) - update_by_csv(file_path) - } - class Drawer { - __init__(config) - draw_cfg(basic_blocks, output) - } - class __main__ { - main() - } - - __main__ --> Configuration - __main__ --> Analyzer - __main__ --> CoverageReader - __main__ --> Drawer - -``` - -# asm2cfg -![CI status](https://github.com/Kazhuu/asm2cfg/actions/workflows/ci.yml/badge.svg) -[![codecov](https://codecov.io/gh/Kazhuu/asm2cfg/branch/main/graph/badge.svg?token=ZHLOJO8Q3V)](https://codecov.io/gh/Kazhuu/asm2cfg) - -Python command-line tool and GDB extension to view and save x86, ARM and objdump -assembly files as control-flow graph (CFG) pdf files. From GDB debugging session -use `viewcfg` command to view CFG and use `savecfg` command to save it to the -pdf file. - -

- -

- -Program has been developed to support X86, ARM and objdump assembly outputs. -Program is mostly tested with x86 assembly. ARM and objdump formats might not be -fully supported. If you have any suggestions or find bugs, please open an issue -or create a pull request. If you want to contribute, check -[Development](#development) how to get started. - -## Table of Content - - - -* [Install](#install) -* [Usage From GDB](#usage-from-gdb) -* [Usage as Standalone](#usage-as-standalone) - * [Knowing Function Name](#knowing-function-name) - * [Disassemble Function](#disassemble-function) - * [Draw CFG](#draw-cfg) - * [Examples](#examples) -* [Development](#development) - * [Python Environment](#python-environment) - * [Testing](#testing) - * [Code Linting](#code-linting) - * [Command-Line Interface](#command-line-interface) - * [GDB Integration](#gdb-integration) - * [Current Development Goals](#current-development-goals) - - - -## Install - -Project can be installed with pip - -``` -pip install asm2cfg -``` - -To be able to view the dot files from GDB. External dot viewer is required. For -this purpose [xdot](https://pypi.org/project/xdot/) can be used for example. Any -other dot viewer will also do. To install this on Debian based distro run - -``` -sudo apt install xdot -``` - -Or Arch based - -``` -sudo pacman -S xdot -``` - -To add extension to GDB you need to source the pip installed plugin to it. To -find where pip placed GDB extension run `which gdb_asm2cfg` or in case if you -use pyenv use `pyenv which gdb_asm2cfg`. Copy the path to the clipboard. - -Then in you home directory if not already add `.gdbinit` file -and place following line in it and replace path from the earlier step. - -``` -source -``` - -For example in my Linux machine line end up to be - -``` -source ~/.local/bin/gdb_asm2cfg.py -``` - -Now when you start GDB no errors should be displayed and you are ready to go. 
- -## Usage From GDB - -In GDB session this extension provides command `viewcfg` to view CFG with -external dot viewer. Command `savecfg` saves the CFG to pdf file to current -working directory with same name as the function being dumped. Both commands -disassemble the current execution frame/function when the command is issued. To -see help for these commands use `help` command like `help viewcfg`. - -For example let's view main function from you favorite non-stripped executable. -First run GDB until main function - -``` -gdb -ex 'b main' -ex 'run' -``` - -Now run `viewcfg` to view CFG as a dot graph with external editor. Or run `savecfg` -to save CFG to pdf file named `main.pdf` to current working directory. If -function is stripped then memory address of the function will used as a name -instead. For example `0x555555555faf-0x555555557008.pdf`. - -If assembly function is very large with a lot of jumps and calls to other -functions. Then rendering the CFG can take a long time. So be patient or cancel -rendering with Ctrl-C. To make the rendering faster you can skip function calls -instructions from splitting the code to more blocks. To set this run `set -skipcalls on` and then run earlier command again. Note that if function is long -and has a lot of jumps inside itself, then rendering is still gonna take a long -time. To have normal behavior again run `set skipcalls off`. - -## Usage as Standalone - -This method can be used with assembly files saved from ouput of objdump and GDB -disassembly. Pip installation will come with `asm2cfg` command-line tool for -this purpose. - -To use as standalone script you first need to dump assembly from GDB or objdump -to the file which is explained below. 
- -### Knowing Function Name - -If you don't know the name of function you're looking for then you can also list -all function names using GDB: - -``` -gdb -batch -ex 'b main' -ex r -ex 'info functions' ./test_executable -``` - -This will set breakpoint at function `main`, then -run the program and print symbols from all loaded libraries. - -For functions which come from main executable you can avoid running the program -and simply do - -``` -gdb -batch -ex 'info functions' ./test_executable -``` - -If you want to narrow the search down you can also use regexp - -``` -gdb ... -ex 'info functions ' ... -``` - -### Disassemble Function - -Once you have the function name, you can produce its disassembly via - -``` -gdb -batch -ex 'b main' -ex r -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable -``` - -or - -``` -gdb -batch -ex 'set breakpoints pending on' -ex 'b test_function' -ex r -ex 'pipe disassemble | tee test_function.asm' ./test_executable -``` - -(the `set breakpoint pending on` command enables pending breakpoints and -could be added to your `.gdbinit` instead) - -For functions from main executable it's enough to do - -``` -gdb -batch -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable -``` - -You can also extract function's disassembly from `objdump` output: - -``` -objdump -d ./test_executable | sed -ne '/ test_executable.asm -``` - -(this may be useful for specific non-native targets which lack GDB support). - -### Draw CFG - -Now you have the assembly file. Time to turn that to CFG pdf file. Do that by giving it -to `asm2cfg` command-line tool like so - -``` -asm2cfg test_function.asm -``` - -Asm2cfg by default expects x86 assembly files. If you want to use ARM assembly files, -then provide `--target arm` command-line flag. - -Above command should output `test_function.pdf` file in the same directory where -the executable was ran. 
If the assembly file is stripped then the function -memory range is used as a name instead. For example -`0x555555555faf-0x555555557008.pdf`. - -To view CFG instead of saving provide `-v` flag. And to skip function calls from -splitting the code to further blocks provide `-c` flag. To show the help use -`-h`. - -### Examples - -Repository includes examples which can be used to test the standalone -functionality for x86, ARM and objdump. - -File `test_function.asm` is non-stripped assembly file and its -corresponding output `test_function.pdf`. - -File `stripped_function.asm` contains -stripped function and its corresponding output -`stripped_function.pdf`. - -File `att_syntax.asm` is an example of non-stripped AT&T assembly. - -File `huge.asm` is a large stripped -assembly function and its corresponding output `huge.pdf`. This can be used to -test processing time of big functions. - -Files `objdump.asm` and `stripped_objdump.asm` are the regular and stripped -objdump-based disassemblies of short functions. - -File `arm.asm` is ARM based assembly file and its corresponding pdf file is -`arm.pdf`. - -## Development - -You want to contribute? You're very welcome to do so! This section will give you -guidance how to setup development environment and test things locally. - -### Python Environment - -For development this project manages packages with pipenv. Pipenv is a tool to -manage Python virtual environments and packages with much less pain compared to -normal pip and virtualenv usage. - -Install pipenv for your system following the guide -[here](https://pipenv.pypa.io/en/latest/). - -After installing pipenv. Create virtual environment and install all required -packages to it. Run following at project root - -``` -pipenv install -d -``` - -Now you can activate the virtual environment with - -``` -pipenv shell -``` - -Now your `python` and `pip` commands will correspond to created virtual environment -instead of your system's Python installation. 
- -To deactivate the environment, use - -``` -exit -``` - -### Testing - -This project uses [pytest](https://pypi.org/project/pytest/) for testing. Some -test are written using Python's own unittest testing framework, but they work -with pytest out of the box. Pytest style is preferred way to write tests. - -To run tests from project root, use `pytest` or - -``` -pipenv run pytest -``` - -During testing dot viewer might be opened if you have it installed. This is -because GDB integration command `viewcfg` is tested, which will open external -dot viewer. Just close it after it's opened. It should not affect the test run -itself. - -### Code Linting - -Project uses [flake8](https://flake8.pycqa.org/en/latest/) and -[pylint](https://pylint.org/) for code linting. - -To run flake8, use - -``` -flake8 -``` - -And to run pylint use - -``` -pylint src test -``` - -Both commands should not print any errors. - -### Command-Line Interface - -To test command-line interface of asm2cfg wihtout installing the package. You -can execute module directly. For example to print help - -``` -python -m src.asm2cfg -h -``` - -Standalone method can be used to try out the examples under `examples` folder as -well. For example following command should generate `main.pdf` file to current -working directory. - -``` -python -m src.asm2cfg -c examples/huge.asm -``` - -### GDB Integration - -Before testing GDB functionality, make sure asm2cfg is not installed with pip! -This can lead to GDB using code from pip installed asm2cfg package instead of -code from this repository! - -Also pipenv cannot be used with GDB. You need to install required packages to -your system's Python pip. This is because your installed GDB is linked against -system's Python interpreter and will use it, instead of active virtual -environment. If packages are not installed to your system's pip. 
You are likely -to receive following error messages when trying to use asm2cfg with GDB - -``` -ModuleNotFoundError: No module named 'graphviz' -``` - -To fix this, install required packages to your system's pip without active -virtual environment. Currently GDB integration only requires graphviz. - -``` -pip install graphviz -``` - -To use asm2cfg GDB related functionality. Use following line from -project root. - -``` -PYTHONPATH=${PWD}/src gdb -ex 'source src/gdb_asm2cfg.py' -``` - -This will set Python import path so that GDB can import code from this -repository without installing the package. After this you should be able to use -commands `viewcfg` and `savecfg`. - -### Current Development Goals - -There are might be cases asm2cfg will not fully support all x86 or ARM assembly -lines. If you encounter such problems please open an issue. - -Current developed goals are best described in issues section. Please open a new -one if existing one does not exist. - -If you want to talk to me, you can contact me at Discord with name -`Kazhuu#3121`. +1. [Installation](doc/1_Installation.md): Instructions to install the tool +2. [How to Run](doc/2_HowToRun.md): Guidelines for running the tool +3. [Development](doc/3_Development.md): Information for developers +4. [Github asm2cfg](doc/4_Github.md): Original Github documentation diff --git a/doc/1_Installation.md b/doc/1_Installation.md new file mode 100644 index 0000000..1e90e8b --- /dev/null +++ b/doc/1_Installation.md @@ -0,0 +1,7 @@ +# Installation + +## Dependencies for *asm2cfg* + +- Python >=3.6 +- The Python `graphviz` package for printing the graph; please install it, e.g. with + `pip`. Preferably use a `virtualenv`.
diff --git a/doc/2_HowToRun.md b/doc/2_HowToRun.md new file mode 100644 index 0000000..192c301 --- /dev/null +++ b/doc/2_HowToRun.md @@ -0,0 +1,41 @@ +# How to Run + +## As python module + +```cmd +python3 -m ocgraph -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + +## As command line script + +```cmd +./asm2cfg -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + +## Custom python script + +```python +from ocgraph.interface.analyzer import Analyzer +from ocgraph.interface.drawer import Drawer +from ocgraph.interface.coverage_reader import CoverageReader + +from ocgraph.coverage_tracer import CoverageTracer +from ocgraph.configuration.configuration import OcGraphConfiguration + +# Create configuration +config = OcGraphConfiguration(disassembler="objdump", arch="sparc") + +# Read input text +lines = read_lines("a.out") + +# Analyze input text +analyser = Analyzer(config=config) +analyser.parse_lines(lines=lines) + +# Update analyzed input with coverage data +cov_reader = CoverageReader(instructions=analyser.instructions, config=config) +cov_reader.update_by_csv("cov.csv") + +drawer = Drawer(analyser.configuration) +drawer.draw_cfg(name=analyser.function_name, basic_blocks=analyser.basic_blocks, output="a.pdf") +```
__init__(basic_blocks, config) + update_by_csv(file_path) + } + class Drawer { + __init__(config) + draw_cfg(basic_blocks, output) + } + class __main__ { + main() + } + + __main__ --> Configuration + __main__ --> Analyzer + __main__ --> CoverageReader + __main__ --> Drawer + +``` diff --git a/doc/4_Github.md b/doc/4_Github.md new file mode 100644 index 0000000..b3a81ff --- /dev/null +++ b/doc/4_Github.md @@ -0,0 +1,354 @@ + +# asm2cfg + +![CI status](https://github.com/Kazhuu/asm2cfg/actions/workflows/ci.yml/badge.svg) +[![codecov](https://codecov.io/gh/Kazhuu/asm2cfg/branch/main/graph/badge.svg?token=ZHLOJO8Q3V)](https://codecov.io/gh/Kazhuu/asm2cfg) + +Python command-line tool and GDB extension to view and save x86, ARM and objdump +assembly files as control-flow graph (CFG) pdf files. From GDB debugging session +use `viewcfg` command to view CFG and use `savecfg` command to save it to the +pdf file. + +

+ +

+ +Program has been developed to support X86, ARM and objdump assembly outputs. +Program is mostly tested with x86 assembly. ARM and objdump formats might not be +fully supported. If you have any suggestions or find bugs, please open an issue +or create a pull request. If you want to contribute, check +[Development](#development) how to get started. + +## Table of Content + + + +* [Install](#install) +* [Usage From GDB](#usage-from-gdb) +* [Usage as Standalone](#usage-as-standalone) + * [Knowing Function Name](#knowing-function-name) + * [Disassemble Function](#disassemble-function) + * [Draw CFG](#draw-cfg) + * [Examples](#examples) +* [Development](#development) + * [Python Environment](#python-environment) + * [Testing](#testing) + * [Code Linting](#code-linting) + * [Command-Line Interface](#command-line-interface) + * [GDB Integration](#gdb-integration) + * [Current Development Goals](#current-development-goals) + + + +## Install + +Project can be installed with pip + +``` +pip install asm2cfg +``` + +To be able to view the dot files from GDB, an external dot viewer is required. For +this purpose [xdot](https://pypi.org/project/xdot/) can be used for example. Any +other dot viewer will also do. To install this on Debian based distro run + +``` +sudo apt install xdot +``` + +Or Arch based + +``` +sudo pacman -S xdot +``` + +To add extension to GDB you need to source the pip installed plugin to it. To +find where pip placed GDB extension run `which gdb_asm2cfg` or in case if you +use pyenv use `pyenv which gdb_asm2cfg`. Copy the path to the clipboard. + +Then, in your home directory, add a `.gdbinit` file if one does not already exist, +place the following line in it and replace the path with the one from the earlier step. + +``` +source <path-from-earlier> +``` + +For example, on my Linux machine the line ends up being + +``` +source ~/.local/bin/gdb_asm2cfg.py +``` + +Now when you start GDB no errors should be displayed and you are ready to go.
+ +## Usage From GDB + +In GDB session this extension provides command `viewcfg` to view CFG with +external dot viewer. Command `savecfg` saves the CFG to pdf file to current +working directory with same name as the function being dumped. Both commands +disassemble the current execution frame/function when the command is issued. To +see help for these commands use `help` command like `help viewcfg`. + +For example let's view main function from your favorite non-stripped executable. +First run GDB until main function + +``` +gdb -ex 'b main' -ex 'run' <executable> +``` + +Now run `viewcfg` to view CFG as a dot graph with external editor. Or run `savecfg` +to save CFG to pdf file named `main.pdf` to current working directory. If +function is stripped then memory address of the function will be used as a name +instead. For example `0x555555555faf-0x555555557008.pdf`. + +If assembly function is very large with a lot of jumps and calls to other +functions, then rendering the CFG can take a long time. So be patient or cancel +rendering with Ctrl-C. To make the rendering faster you can skip function call +instructions from splitting the code to more blocks. To set this run `set +skipcalls on` and then run earlier command again. Note that if function is long +and has a lot of jumps inside itself, then rendering is still going to take a long +time. To have normal behavior again run `set skipcalls off`. + +## Usage as Standalone + +This method can be used with assembly files saved from output of objdump and GDB +disassembly. Pip installation will come with `asm2cfg` command-line tool for +this purpose. + +To use as standalone script you first need to dump assembly from GDB or objdump +to the file which is explained below.
+ +### Knowing Function Name + +If you don't know the name of function you're looking for then you can also list +all function names using GDB: + +``` +gdb -batch -ex 'b main' -ex r -ex 'info functions' ./test_executable +``` + +This will set breakpoint at function `main`, then +run the program and print symbols from all loaded libraries. + +For functions which come from main executable you can avoid running the program +and simply do + +``` +gdb -batch -ex 'info functions' ./test_executable +``` + +If you want to narrow the search down you can also use regexp + +``` +gdb ... -ex 'info functions <regexp>' ... +``` + +### Disassemble Function + +Once you have the function name, you can produce its disassembly via + +``` +gdb -batch -ex 'b main' -ex r -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable +``` + +or + +``` +gdb -batch -ex 'set breakpoint pending on' -ex 'b test_function' -ex r -ex 'pipe disassemble | tee test_function.asm' ./test_executable +``` + +(the `set breakpoint pending on` command enables pending breakpoints and +could be added to your `.gdbinit` instead) + +For functions from main executable it's enough to do + +``` +gdb -batch -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable +``` + +You can also extract function's disassembly from `objdump` output: + +``` +objdump -d ./test_executable | sed -ne '/<test_function/,/^$/p' > test_executable.asm +``` + +(this may be useful for specific non-native targets which lack GDB support). + +### Draw CFG + +Now you have the assembly file. Time to turn that to CFG pdf file. Do that by giving it +to `asm2cfg` command-line tool like so + +``` +asm2cfg test_function.asm +``` + +Asm2cfg by default expects x86 assembly files. If you want to use ARM assembly files, +then provide `--target arm` command-line flag. + +Above command should output `test_function.pdf` file in the same directory where +the executable was run.
If the assembly file is stripped then the function +memory range is used as a name instead. For example +`0x555555555faf-0x555555557008.pdf`. + +To view CFG instead of saving provide `-v` flag. And to skip function calls from +splitting the code to further blocks provide `-c` flag. To show the help use +`-h`. + +### Examples + +Repository includes examples which can be used to test the standalone +functionality for x86, ARM and objdump. + +File `test_function.asm` is non-stripped assembly file and its +corresponding output `test_function.pdf`. + +File `stripped_function.asm` contains +stripped function and its corresponding output +`stripped_function.pdf`. + +File `att_syntax.asm` is an example of non-stripped AT&T assembly. + +File `huge.asm` is a large stripped +assembly function and its corresponding output `huge.pdf`. This can be used to +test processing time of big functions. + +Files `objdump.asm` and `stripped_objdump.asm` are the regular and stripped +objdump-based disassemblies of short functions. + +File `arm.asm` is ARM based assembly file and its corresponding pdf file is +`arm.pdf`. + +## Development + +You want to contribute? You're very welcome to do so! This section will give you +guidance how to setup development environment and test things locally. + +### Python Environment + +For development this project manages packages with pipenv. Pipenv is a tool to +manage Python virtual environments and packages with much less pain compared to +normal pip and virtualenv usage. + +Install pipenv for your system following the guide +[here](https://pipenv.pypa.io/en/latest/). + +After installing pipenv. Create virtual environment and install all required +packages to it. Run following at project root + +``` +pipenv install -d +``` + +Now you can activate the virtual environment with + +``` +pipenv shell +``` + +Now your `python` and `pip` commands will correspond to created virtual environment +instead of your system's Python installation. 
+ +To deactivate the environment, use + +``` +exit +``` + +### Testing + +This project uses [pytest](https://pypi.org/project/pytest/) for testing. Some +tests are written using Python's own unittest testing framework, but they work +with pytest out of the box. Pytest style is preferred way to write tests. + +To run tests from project root, use `pytest` or + +``` +pipenv run pytest +``` + +During testing dot viewer might be opened if you have it installed. This is +because GDB integration command `viewcfg` is tested, which will open external +dot viewer. Just close it after it's opened. It should not affect the test run +itself. + +### Code Linting + +Project uses [flake8](https://flake8.pycqa.org/en/latest/) and +[pylint](https://pylint.org/) for code linting. + +To run flake8, use + +``` +flake8 +``` + +And to run pylint use + +``` +pylint src test +``` + +Both commands should not print any errors. + +### Command-Line Interface + +To test command-line interface of asm2cfg without installing the package, you +can execute the module directly. For example to print help + +``` +python -m src.asm2cfg -h +``` + +Standalone method can be used to try out the examples under `examples` folder as +well. For example following command should generate `main.pdf` file to current +working directory. + +``` +python -m src.asm2cfg -c examples/huge.asm +``` + +### GDB Integration + +Before testing GDB functionality, make sure asm2cfg is not installed with pip! +This can lead to GDB using code from pip installed asm2cfg package instead of +code from this repository! + +Also pipenv cannot be used with GDB. You need to install required packages to +your system's Python pip. This is because your installed GDB is linked against +system's Python interpreter and will use it, instead of active virtual +environment. If packages are not installed to your system's pip.
You are likely +to receive following error messages when trying to use asm2cfg with GDB + +``` +ModuleNotFoundError: No module named 'graphviz' +``` + +To fix this, install required packages to your system's pip without active +virtual environment. Currently GDB integration only requires graphviz. + +``` +pip install graphviz +``` + +To use asm2cfg GDB related functionality, use the following line from the +project root. + +``` +PYTHONPATH=${PWD}/src gdb -ex 'source src/gdb_asm2cfg.py' +``` + +This will set Python import path so that GDB can import code from this +repository without installing the package. After this you should be able to use +commands `viewcfg` and `savecfg`. + +### Current Development Goals + +There might be cases where asm2cfg does not fully support all x86 or ARM assembly +lines. If you encounter such problems please open an issue. + +Current development goals are best described in the issues section. Please open a new +one if a matching one does not exist. + +If you want to talk to me, you can contact me at Discord with name +`Kazhuu#3121`.
From b8eefa087ba7a1079d9c5fc0edeb3034f605c654 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 21 Feb 2024 12:23:01 +0100 Subject: [PATCH 28/31] Licences: Change to MIT, add SPDX identifier to files --- LICENSES/GTDGmbH.md | 397 ++---------------- ocgraph/__main__.py | 3 +- .../architecture/architecture.py | 3 +- ocgraph/configuration/architecture/arm.py | 3 +- ocgraph/configuration/architecture/ppc.py | 3 +- ocgraph/configuration/architecture/sparc.py | 3 +- ocgraph/configuration/architecture/x86.py | 3 +- ocgraph/configuration/configuration.py | 3 +- .../disassembler/disassembler.py | 2 - ocgraph/interface/coverage_reader.py | 3 +- ocgraph/interface/drawer.py | 3 +- scripts/batch_objdump.sh | 3 +- 12 files changed, 44 insertions(+), 385 deletions(-) diff --git a/LICENSES/GTDGmbH.md b/LICENSES/GTDGmbH.md index a612ad9..120e287 100644 --- a/LICENSES/GTDGmbH.md +++ b/LICENSES/GTDGmbH.md @@ -1,373 +1,24 @@ -Mozilla Public License Version 2.0 -================================== - -1. Definitions --------------- - -1.1. "Contributor" - means each individual or legal entity that creates, contributes to - the creation of, or owns Covered Software. - -1.2. "Contributor Version" - means the combination of the Contributions of others (if any) used - by a Contributor and that particular Contributor's Contribution. - -1.3. "Contribution" - means Covered Software of a particular Contributor. - -1.4. "Covered Software" - means Source Code Form to which the initial Contributor has attached - the notice in Exhibit A, the Executable Form of such Source Code - Form, and Modifications of such Source Code Form, in each case - including portions thereof. - -1.5. 
"Incompatible With Secondary Licenses" - means - - (a) that the initial Contributor has attached the notice described - in Exhibit B to the Covered Software; or - - (b) that the Covered Software was made available under the terms of - version 1.1 or earlier of the License, but not also under the - terms of a Secondary License. - -1.6. "Executable Form" - means any form of the work other than Source Code Form. - -1.7. "Larger Work" - means a work that combines Covered Software with other material, in - a separate file or files, that is not Covered Software. - -1.8. "License" - means this document. - -1.9. "Licensable" - means having the right to grant, to the maximum extent possible, - whether at the time of the initial grant or subsequently, any and - all of the rights conveyed by this License. - -1.10. "Modifications" - means any of the following: - - (a) any file in Source Code Form that results from an addition to, - deletion from, or modification of the contents of Covered - Software; or - - (b) any new file in Source Code Form that contains any Covered - Software. - -1.11. "Patent Claims" of a Contributor - means any patent claim(s), including without limitation, method, - process, and apparatus claims, in any patent Licensable by such - Contributor that would be infringed, but for the grant of the - License, by the making, using, selling, offering for sale, having - made, import, or transfer of either its Contributions or its - Contributor Version. - -1.12. "Secondary License" - means either the GNU General Public License, Version 2.0, the GNU - Lesser General Public License, Version 2.1, the GNU Affero General - Public License, Version 3.0, or any later versions of those - licenses. - -1.13. "Source Code Form" - means the form of the work preferred for making modifications. - -1.14. "You" (or "Your") - means an individual or a legal entity exercising rights under this - License. 
For legal entities, "You" includes any entity that - controls, is controlled by, or is under common control with You. For - purposes of this definition, "control" means (a) the power, direct - or indirect, to cause the direction or management of such entity, - whether by contract or otherwise, or (b) ownership of more than - fifty percent (50%) of the outstanding shares or beneficial - ownership of such entity. - -2. License Grants and Conditions --------------------------------- - -2.1. Grants - -Each Contributor hereby grants You a world-wide, royalty-free, -non-exclusive license: - -(a) under intellectual property rights (other than patent or trademark) - Licensable by such Contributor to use, reproduce, make available, - modify, display, perform, distribute, and otherwise exploit its - Contributions, either on an unmodified basis, with Modifications, or - as part of a Larger Work; and - -(b) under Patent Claims of such Contributor to make, use, sell, offer - for sale, have made, import, and otherwise transfer either its - Contributions or its Contributor Version. - -2.2. Effective Date - -The licenses granted in Section 2.1 with respect to any Contribution -become effective for each Contribution on the date the Contributor first -distributes such Contribution. - -2.3. Limitations on Grant Scope - -The licenses granted in this Section 2 are the only rights granted under -this License. No additional rights or licenses will be implied from the -distribution or licensing of Covered Software under this License. 
-Notwithstanding Section 2.1(b) above, no patent license is granted by a -Contributor: - -(a) for any code that a Contributor has removed from Covered Software; - or - -(b) for infringements caused by: (i) Your and any other third party's - modifications of Covered Software, or (ii) the combination of its - Contributions with other software (except as part of its Contributor - Version); or - -(c) under Patent Claims infringed by Covered Software in the absence of - its Contributions. - -This License does not grant any rights in the trademarks, service marks, -or logos of any Contributor (except as may be necessary to comply with -the notice requirements in Section 3.4). - -2.4. Subsequent Licenses - -No Contributor makes additional grants as a result of Your choice to -distribute the Covered Software under a subsequent version of this -License (see Section 10.2) or under the terms of a Secondary License (if -permitted under the terms of Section 3.3). - -2.5. Representation - -Each Contributor represents that the Contributor believes its -Contributions are its original creation(s) or it has sufficient rights -to grant the rights to its Contributions conveyed by this License. - -2.6. Fair Use - -This License is not intended to limit any rights You have under -applicable copyright doctrines of fair use, fair dealing, or other -equivalents. - -2.7. Conditions - -Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted -in Section 2.1. - -3. Responsibilities -------------------- - -3.1. Distribution of Source Form - -All distribution of Covered Software in Source Code Form, including any -Modifications that You create or to which You contribute, must be under -the terms of this License. You must inform recipients that the Source -Code Form of the Covered Software is governed by the terms of this -License, and how they can obtain a copy of this License. You may not -attempt to alter or restrict the recipients' rights in the Source Code -Form. - -3.2. 
Distribution of Executable Form - -If You distribute Covered Software in Executable Form then: - -(a) such Covered Software must also be made available in Source Code - Form, as described in Section 3.1, and You must inform recipients of - the Executable Form how they can obtain a copy of such Source Code - Form by reasonable means in a timely manner, at a charge no more - than the cost of distribution to the recipient; and - -(b) You may distribute such Executable Form under the terms of this - License, or sublicense it under different terms, provided that the - license for the Executable Form does not attempt to limit or alter - the recipients' rights in the Source Code Form under this License. - -3.3. Distribution of a Larger Work - -You may create and distribute a Larger Work under terms of Your choice, -provided that You also comply with the requirements of this License for -the Covered Software. If the Larger Work is a combination of Covered -Software with a work governed by one or more Secondary Licenses, and the -Covered Software is not Incompatible With Secondary Licenses, this -License permits You to additionally distribute such Covered Software -under the terms of such Secondary License(s), so that the recipient of -the Larger Work may, at their option, further distribute the Covered -Software under the terms of either this License or such Secondary -License(s). - -3.4. Notices - -You may not remove or alter the substance of any license notices -(including copyright notices, patent notices, disclaimers of warranty, -or limitations of liability) contained within the Source Code Form of -the Covered Software, except that You may alter any license notices to -the extent required to remedy known factual inaccuracies. - -3.5. Application of Additional Terms - -You may choose to offer, and to charge a fee for, warranty, support, -indemnity or liability obligations to one or more recipients of Covered -Software. 
However, You may do so only on Your own behalf, and not on -behalf of any Contributor. You must make it absolutely clear that any -such warranty, support, indemnity, or liability obligation is offered by -You alone, and You hereby agree to indemnify every Contributor for any -liability incurred by such Contributor as a result of warranty, support, -indemnity or liability terms You offer. You may include additional -disclaimers of warranty and limitations of liability specific to any -jurisdiction. - -4. Inability to Comply Due to Statute or Regulation ---------------------------------------------------- - -If it is impossible for You to comply with any of the terms of this -License with respect to some or all of the Covered Software due to -statute, judicial order, or regulation then You must: (a) comply with -the terms of this License to the maximum extent possible; and (b) -describe the limitations and the code they affect. Such description must -be placed in a text file included with all distributions of the Covered -Software under this License. Except to the extent prohibited by statute -or regulation, such description must be sufficiently detailed for a -recipient of ordinary skill to be able to understand it. - -5. Termination --------------- - -5.1. The rights granted under this License will terminate automatically -if You fail to comply with any of its terms. However, if You become -compliant, then the rights granted under this License from a particular -Contributor are reinstated (a) provisionally, unless and until such -Contributor explicitly and finally terminates Your grants, and (b) on an -ongoing basis, if such Contributor fails to notify You of the -non-compliance by some reasonable means prior to 60 days after You have -come back into compliance. 
Moreover, Your grants from a particular -Contributor are reinstated on an ongoing basis if such Contributor -notifies You of the non-compliance by some reasonable means, this is the -first time You have received notice of non-compliance with this License -from such Contributor, and You become compliant prior to 30 days after -Your receipt of the notice. - -5.2. If You initiate litigation against any entity by asserting a patent -infringement claim (excluding declaratory judgment actions, -counter-claims, and cross-claims) alleging that a Contributor Version -directly or indirectly infringes any patent, then the rights granted to -You by any and all Contributors for the Covered Software under Section -2.1 of this License shall terminate. - -5.3. In the event of termination under Sections 5.1 or 5.2 above, all -end user license agreements (excluding distributors and resellers) which -have been validly granted by You or Your distributors under this License -prior to termination shall survive termination. - -************************************************************************ -* * -* 6. Disclaimer of Warranty * -* ------------------------- * -* * -* Covered Software is provided under this License on an "as is" * -* basis, without warranty of any kind, either expressed, implied, or * -* statutory, including, without limitation, warranties that the * -* Covered Software is free of defects, merchantable, fit for a * -* particular purpose or non-infringing. The entire risk as to the * -* quality and performance of the Covered Software is with You. * -* Should any Covered Software prove defective in any respect, You * -* (not any Contributor) assume the cost of any necessary servicing, * -* repair, or correction. This disclaimer of warranty constitutes an * -* essential part of this License. No use of any Covered Software is * -* authorized under this License except under this disclaimer. 
* -* * -************************************************************************ - -************************************************************************ -* * -* 7. Limitation of Liability * -* -------------------------- * -* * -* Under no circumstances and under no legal theory, whether tort * -* (including negligence), contract, or otherwise, shall any * -* Contributor, or anyone who distributes Covered Software as * -* permitted above, be liable to You for any direct, indirect, * -* special, incidental, or consequential damages of any character * -* including, without limitation, damages for lost profits, loss of * -* goodwill, work stoppage, computer failure or malfunction, or any * -* and all other commercial damages or losses, even if such party * -* shall have been informed of the possibility of such damages. This * -* limitation of liability shall not apply to liability for death or * -* personal injury resulting from such party's negligence to the * -* extent applicable law prohibits such limitation. Some * -* jurisdictions do not allow the exclusion or limitation of * -* incidental or consequential damages, so this exclusion and * -* limitation may not apply to You. * -* * -************************************************************************ - -8. Litigation -------------- - -Any litigation relating to this License may be brought only in the -courts of a jurisdiction where the defendant maintains its principal -place of business and such litigation shall be governed by laws of that -jurisdiction, without reference to its conflict-of-law provisions. -Nothing in this Section shall prevent a party's ability to bring -cross-claims or counter-claims. - -9. Miscellaneous ----------------- - -This License represents the complete agreement concerning the subject -matter hereof. If any provision of this License is held to be -unenforceable, such provision shall be reformed only to the extent -necessary to make it enforceable. 
Any law or regulation which provides -that the language of a contract shall be construed against the drafter -shall not be used to construe this License against a Contributor. - -10. Versions of the License ---------------------------- - -10.1. New Versions - -Mozilla Foundation is the license steward. Except as provided in Section -10.3, no one other than the license steward has the right to modify or -publish new versions of this License. Each version will be given a -distinguishing version number. - -10.2. Effect of New Versions - -You may distribute the Covered Software under the terms of the version -of the License under which You originally received the Covered Software, -or under the terms of any subsequent version published by the license -steward. - -10.3. Modified Versions - -If you create software not governed by this License, and you want to -create a new license for such software, you may create and use a -modified version of this License if you rename the license and remove -any references to the name of the license steward (except to note that -such modified license differs from this License). - -10.4. Distributing Source Code Form that is Incompatible With Secondary -Licenses - -If You choose to distribute Source Code Form that is Incompatible With -Secondary Licenses under the terms of this version of the License, the -notice described in Exhibit B of this License must be attached. - -Exhibit A - Source Code Form License Notice -------------------------------------------- - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. - -If it is not possible or desirable to put the notice in a particular -file, then You may include the notice in a location (such as a LICENSE -file in a relevant directory) where a recipient would be likely to look -for such a notice. 
- -You may add additional accurate notices of copyright ownership. - -Exhibit B - "Incompatible With Secondary Licenses" Notice ---------------------------------------------------------- - - This Source Code Form is "Incompatible With Secondary Licenses", as - defined by the Mozilla Public License, v. 2.0. +Valid-License-Identifier: GTDGmbH +License-Text: + +Copyright (c) 2023 GTD GmbH. All rights reserved. + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the “Software”), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/ocgraph/__main__.py b/ocgraph/__main__.py index 9a26f63..b53afcb 100755 --- a/ocgraph/__main__.py +++ b/ocgraph/__main__.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """ Let this module be executed from the command line with python -m ocgraph from root of the project diff --git a/ocgraph/configuration/architecture/architecture.py b/ocgraph/configuration/architecture/architecture.py index cbf36ff..511c5a1 100755 --- a/ocgraph/configuration/architecture/architecture.py +++ b/ocgraph/configuration/architecture/architecture.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Contains all necessary functions for a TargetInfo class.""" from abc import ABC, abstractmethod diff --git a/ocgraph/configuration/architecture/arm.py b/ocgraph/configuration/architecture/arm.py index df6ba6b..920b626 100755 --- a/ocgraph/configuration/architecture/arm.py +++ b/ocgraph/configuration/architecture/arm.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """ Contains instruction info for ARM-compatible targets. 
""" import re diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index 6ec15c3..2551fa1 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ b/ocgraph/configuration/architecture/ppc.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Contains instruction info for PPC-compatible targets.""" import re diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py index f99c6ec..807e881 100755 --- a/ocgraph/configuration/architecture/sparc.py +++ b/ocgraph/configuration/architecture/sparc.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Contains instruction info for Sparc-compatible targets.""" from .architecture import Architecture diff --git a/ocgraph/configuration/architecture/x86.py b/ocgraph/configuration/architecture/x86.py index ac90e46..fb12212 100755 --- a/ocgraph/configuration/architecture/x86.py +++ b/ocgraph/configuration/architecture/x86.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Contains instruction info for X86-compatible targets.""" import re diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index 0d6f83b..db2c22e 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Module for configuration of the ocgraph package.""" import logging diff --git a/ocgraph/configuration/disassembler/disassembler.py b/ocgraph/configuration/disassembler/disassembler.py index 73822e4..ff118ea 100755 --- a/ocgraph/configuration/disassembler/disassembler.py +++ b/ocgraph/configuration/disassembler/disassembler.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -# SPDX-License-Identifier: GTDGmbH -# Copyright 2023 by GTD GmbH. 
"""Class configuring the used disassembler tool.""" from abc import ABC, abstractmethod diff --git a/ocgraph/interface/coverage_reader.py b/ocgraph/interface/coverage_reader.py index 561a02b..18dde93 100755 --- a/ocgraph/interface/coverage_reader.py +++ b/ocgraph/interface/coverage_reader.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Class for read coverage input and update the instruction.""" import ast import csv diff --git a/ocgraph/interface/drawer.py b/ocgraph/interface/drawer.py index 1ae02ff..e2ba199 100755 --- a/ocgraph/interface/drawer.py +++ b/ocgraph/interface/drawer.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH """Class for drawing the output.""" import tempfile diff --git a/scripts/batch_objdump.sh b/scripts/batch_objdump.sh index 630b2c9..0c1ff66 100755 --- a/scripts/batch_objdump.sh +++ b/scripts/batch_objdump.sh @@ -1,4 +1,5 @@ -#!/bin/bash +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH function_array=( acos From 5742033c87f362dceac9e336bcb521b4fcd3f494 Mon Sep 17 00:00:00 2001 From: Christoph Weiss Date: Wed, 21 Feb 2024 13:44:27 +0100 Subject: [PATCH 29/31] Configuration: move logging to separate class --- ocgraph/configuration/configuration.py | 33 ++---------------- ocgraph/configuration/logger.py | 47 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 30 deletions(-) create mode 100755 ocgraph/configuration/logger.py diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index db2c22e..2a80fcf 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -1,7 +1,8 @@ #!/usr/bin/env python # SPDX-License-Identifier: GTDGmbH """Module for configuration of the ocgraph package.""" -import logging + +from .logger import OcctreLogger, preset_logging from .architecture.architecture import Architecture from .architecture.x86 import 
X86Architecture @@ -51,26 +52,6 @@ }, } -preset_logging: dict[str, dict] = { - "development": { - "file_log": "debug.log", - "file_level": logging.DEBUG, - "console_log": True, - "console_level": logging.DEBUG, - }, - "module": { - "file_log": None, - "file_level": logging.ERROR, - "console_log": False, - "console_level": logging.ERROR, - }, - "default": { - "file_log": "asm2cfg.log", - "file_level": logging.INFO, - "console_log": True, - "console_level": logging.INFO, - }, -} # fmt: on @@ -93,15 +74,7 @@ def __init__( self.__dict__ = _preset # configure logging - log_config = preset_logging.get(logging_preset) - if log_config["file_log"]: - file_stream: logging.FileHandler = logging.FileHandler(log_config["file_log"]) - file_stream.setLevel(log_config["file_level"]) - self.logger.addHandler(file_stream) - if log_config["console_log"]: - console_stream: logging.StreamHandler = logging.StreamHandler() - console_stream.setLevel(log_config["console_level"]) - self.logger.addHandler(console_stream) + self.logger = OCGraphLogger("OcGraph", logging_preset, "asm2cfg.log") @staticmethod def architectures(): diff --git a/ocgraph/configuration/logger.py b/ocgraph/configuration/logger.py new file mode 100755 index 0000000..b68874b --- /dev/null +++ b/ocgraph/configuration/logger.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH +# Copyright 2024 by GTD GmbH. 
+"""Class configuring the OCGraph logging.""" +import logging + +# fmt: off +logging_preset: dict[str, dict] = { + "development": { + "file_log": True, + "file_level": logging.DEBUG, + "console_log": True, + "console_level": logging.DEBUG + }, + + "module": { + "file_log": False, + "file_level": logging.ERROR, + "console_log": False, + "console_level": logging.ERROR + }, + + "default": { + "file_log": True, + "file_level": logging.INFO, + "console_log": True, + "console_level": logging.INFO + }, +} +# fmt: on + + +class OCGraphLogger(logging.Logger): + """Logging mechanism for module""" + + def __init__(self, name: str, preset="default", file=""): + super().__init__(name) + log_config = logging_preset.get(preset) + if log_config["file_log"]: + logging_file = file + ".log" + file_stream: logging.FileHandler = logging.FileHandler(logging_file) + file_stream.setLevel(log_config["file_level"]) + self.addHandler(file_stream) + if log_config["console_log"]: + console_stream: logging.StreamHandler = logging.StreamHandler() + console_stream.setLevel(log_config["console_level"]) + self.addHandler(console_stream) From c555976d84d871b54b49fb2bacf6534acabfde63 Mon Sep 17 00:00:00 2001 From: Mauri Mustonen Date: Thu, 28 Mar 2024 05:44:42 +0200 Subject: [PATCH 30/31] run pre-commit hooks --- Pipfile | 1 + ocgraph/configuration/architecture/ppc.py | 12 +- ocgraph/configuration/architecture/sparc.py | 4 +- ocgraph/configuration/configuration.py | 2 +- .../configuration/disassembler/objdump_arm.py | 1 - .../configuration/disassembler/objdump_ppc.py | 2 - .../configuration/disassembler/objdump_x86.py | 5 +- test/fixtures/simple_program/hello.c | 4 +- test/templates/call.c | 4 +- test/templates/common.py | 29 ++-- test/templates/gen_calls.py | 53 +++--- test/templates/gen_funtable.py | 47 ++++-- test/templates/gen_jumps.py | 42 ++--- test/templates/gen_jumptable.py | 45 +++-- test/test_gdb.py | 88 ++++++---- test/test_parser.py | 158 ++++++++++-------- test/test_regex.py | 122 
+++++++------- 17 files changed, 339 insertions(+), 280 deletions(-) diff --git a/Pipfile b/Pipfile index 3918b1b..a4ded50 100644 --- a/Pipfile +++ b/Pipfile @@ -12,3 +12,4 @@ flake8 = "*" pytest = "*" pytest-cov = "*" pylint = "*" +pre-commit = "*" diff --git a/ocgraph/configuration/architecture/ppc.py b/ocgraph/configuration/architecture/ppc.py index 2551fa1..b446120 100755 --- a/ocgraph/configuration/architecture/ppc.py +++ b/ocgraph/configuration/architecture/ppc.py @@ -9,8 +9,8 @@ # Common regexes -HEX_PATTERN = r'[0-9a-fA-F]+' -HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN +HEX_PATTERN = r"[0-9a-fA-F]+" +HEX_LONG_PATTERN = r"(?:0x0*)?" + HEX_PATTERN # fmt: off ppc_call_opcodes = [ @@ -697,7 +697,9 @@ def is_call(self, instruction: Instruction): return instruction.opcode in ppc_call_opcodes def is_branch(self, instruction: Instruction): - return instruction.opcode in (ppc_conditional_branch_opcodes + ppc_unconditional_branch_opcodes) and not self.is_call(instruction) + return instruction.opcode in ( + ppc_conditional_branch_opcodes + ppc_unconditional_branch_opcodes + ) and not self.is_call(instruction) def is_unconditional_branch(self, instruction: Instruction): return instruction.opcode in ppc_unconditional_branch_opcodes @@ -706,4 +708,6 @@ def is_sink(self, instruction: Instruction): return instruction.opcode in ppc_sink_opcodes def is_direct_branch(self, instruction: Instruction): - return self.is_branch(instruction) and (re.search(rf"{HEX_LONG_PATTERN}", '|'.join(instruction.ops))) + return self.is_branch(instruction) and ( + re.search(rf"{HEX_LONG_PATTERN}", "|".join(instruction.ops)) + ) diff --git a/ocgraph/configuration/architecture/sparc.py b/ocgraph/configuration/architecture/sparc.py index 807e881..174577c 100755 --- a/ocgraph/configuration/architecture/sparc.py +++ b/ocgraph/configuration/architecture/sparc.py @@ -76,7 +76,9 @@ def is_call(self, instruction: Instruction): return instruction.opcode in sparc_v8_call_opcodes def is_branch(self, 
instruction: Instruction): - return instruction.opcode in (sparc_v8_conditional_branch_opcodes + sparc_v8_unconditional_branch_opcodes) + return instruction.opcode in ( + sparc_v8_conditional_branch_opcodes + sparc_v8_unconditional_branch_opcodes + ) def get_branch_delay(self, instruction: Instruction) -> int | None: delay = None diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index 2a80fcf..b01c9d3 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -74,7 +74,7 @@ def __init__( self.__dict__ = _preset # configure logging - self.logger = OCGraphLogger("OcGraph", logging_preset, "asm2cfg.log") + self.logger = OCGraphLogger("OcGraph", logging_preset, "asm2cfg.log") @staticmethod def architectures(): diff --git a/ocgraph/configuration/disassembler/objdump_arm.py b/ocgraph/configuration/disassembler/objdump_arm.py index a486d11..128330d 100644 --- a/ocgraph/configuration/disassembler/objdump_arm.py +++ b/ocgraph/configuration/disassembler/objdump_arm.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """Class for parsing the objdump ARM input""" import re diff --git a/ocgraph/configuration/disassembler/objdump_ppc.py b/ocgraph/configuration/disassembler/objdump_ppc.py index eaae1df..6b563ef 100755 --- a/ocgraph/configuration/disassembler/objdump_ppc.py +++ b/ocgraph/configuration/disassembler/objdump_ppc.py @@ -225,5 +225,3 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non def parse_jump_target(self, ops: List[str]) -> int | None: # it assumes the last operand of the branch to be the target address return int(ops[-1], 16) - - diff --git a/ocgraph/configuration/disassembler/objdump_x86.py b/ocgraph/configuration/disassembler/objdump_x86.py index 9fb3227..8ed4583 100644 --- a/ocgraph/configuration/disassembler/objdump_x86.py +++ b/ocgraph/configuration/disassembler/objdump_x86.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """Class for parsing the input""" 
import re @@ -55,7 +54,7 @@ def extract_information(self, str_input: str) -> dict[str, str]: } else: raise DisassemblerError("Line not processable: \n" + str(str_input)) - + return result def parse_function_header(self, line: str) -> str | None: @@ -187,7 +186,7 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non address, line = self.parse_address(line) if address is None: return None - + encoding, line = self.parse_encoding(line) if not line: return encoding diff --git a/test/fixtures/simple_program/hello.c b/test/fixtures/simple_program/hello.c index 9869f83..45fd52e 100644 --- a/test/fixtures/simple_program/hello.c +++ b/test/fixtures/simple_program/hello.c @@ -1,5 +1,3 @@ #include -int main() { - printf("Hello World\n"); -} +int main() { printf("Hello World\n"); } diff --git a/test/templates/call.c b/test/templates/call.c index 14d1905..c587b36 100644 --- a/test/templates/call.c +++ b/test/templates/call.c @@ -5,6 +5,4 @@ __attribute__((visibility("hidden"))) void foo() { } -void bar() { - foo(); -} +void bar() { foo(); } diff --git a/test/templates/common.py b/test/templates/common.py index e1dd3cd..6103756 100644 --- a/test/templates/common.py +++ b/test/templates/common.py @@ -19,7 +19,7 @@ def error(msg): """ Print nicely-formatted error message and exit. 
""" - sys.stderr.write(f'{_ME}: error: {msg}\n') + sys.stderr.write(f"{_ME}: error: {msg}\n") sys.exit(1) @@ -29,13 +29,14 @@ def _run(cmd, stdin=None, verbose=0): """ if verbose: print(f"{_ME}: running command: {' '.join(cmd)}") - with subprocess.Popen(cmd, stdin=stdin, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) as process: + with subprocess.Popen( + cmd, stdin=stdin, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) as process: out, err = process.communicate() out = out.decode() err = err.decode() if process.returncode != 0: - cmds = ' '.join(cmd) + cmds = " ".join(cmd) error(f"'{cmds}' failed:\n{out}{err}") sys.stderr.write(err) return out @@ -45,7 +46,7 @@ def gcc(args): """ Run compiler with given arguments. """ - return _run(['gcc'] + args) + return _run(["gcc"] + args) def disasm(file, objdump_or_gdb, symbol, start, finish): @@ -53,11 +54,11 @@ def disasm(file, objdump_or_gdb, symbol, start, finish): Disassemble binary file. """ if objdump_or_gdb: - out = _run(['objdump', '-d', file]) + out = _run(["objdump", "-d", file]) elif symbol is not None: - out = _run(['gdb', '-batch', '-ex', f'disassemble {symbol}', file]) + out = _run(["gdb", "-batch", "-ex", f"disassemble {symbol}", file]) else: - out = _run(['gdb', '-batch', '-ex', f'disassemble {start},{finish}', file]) + out = _run(["gdb", "-batch", "-ex", f"disassemble {start},{finish}", file]) return out @@ -65,23 +66,23 @@ def strip_binary(file): """ Strip symbol info from binary file. 
""" - _run(['strip', '-s', file]) + _run(["strip", "-s", file]) def grep(text, regex): - lines = text.split('\n') + lines = text.split("\n") return list(filter(lambda s: re.search(regex, s), lines)) def find_address(file, name): - out = _run(['readelf', '-sW', file]) - lines = grep(out, fr'{name}$') - assert len(lines) >= 1, f'failed to locate symbol {name} in\n{out}' + out = _run(["readelf", "-sW", file]) + lines = grep(out, rf"{name}$") + assert len(lines) >= 1, f"failed to locate symbol {name} in\n{out}" line = lines[0] # Num: Value Size Type Bind Vis Ndx Name # 27: 0000000000001030 11 FUNC GLOBAL DEFAULT 9 foo line = line.strip() - words = re.split(r'\s+', line) + words = re.split(r"\s+", line) start = int(words[1], 16) size = int(words[2]) return start, start + size diff --git a/test/templates/gen_calls.py b/test/templates/gen_calls.py index 436026c..9fc1880 100755 --- a/test/templates/gen_calls.py +++ b/test/templates/gen_calls.py @@ -14,53 +14,58 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, plt, direct, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True], - [False, True], - [False, True]): +for gdb, pic, plt, direct, strip in itertools.product( + [False, True], + [False, True], # Do we need to test PIE too? 
+ [False, True], + [False, True], + [False, True], +): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - call_type = 'Non-PIC-call' if direct else 'PIC-call' - strip_type = 'stripped' if strip else 'UNstripped' - plt_type = 'PLT' if plt else 'PLT-less' - print(f'Checking {disasm_type} {pic_type} {plt_type} {call_type} {strip_type}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + call_type = "Non-PIC-call" if direct else "PIC-call" + strip_type = "stripped" if strip else "UNstripped" + plt_type = "PLT" if plt else "PLT-less" + print(f"Checking {disasm_type} {pic_type} {plt_type} {call_type} {strip_type}") # Generate object code - flags = ['call.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles'] + flags = ["call.c", "-o", "a.out", "-Wl,--defsym,_start=0", "-nostdlib", "-nostartfiles"] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] # Use PLT? if not plt: - flags += ['-fno-plt'] + flags += ["-fno-plt"] # Force non-PLT call for PIC code? 
if direct and pic: - flags.append('-DHIDDEN') + flags.append("-DHIDDEN") gcc(flags) - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - headers = grep(out, r':|Dump of') - calls = grep(out, r'call') - print('''\ + headers = grep(out, r":|Dump of") + calls = grep(out, r"call") + print( + """\ headers: {0} calls: {1} -'''.format('\n '.join(headers), '\n '.join(calls))) +""".format( + "\n ".join(headers), "\n ".join(calls) + ) + ) diff --git a/test/templates/gen_funtable.py b/test/templates/gen_funtable.py index 05a52b1..f0a5971 100755 --- a/test/templates/gen_funtable.py +++ b/test/templates/gen_funtable.py @@ -11,45 +11,56 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True]): +for gdb, pic, strip in itertools.product( + [False, True], [False, True], [False, True] # Do we need to test PIE too? +): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {stripped}") # Generate object code - flags = ['funtable.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles', '-O2'] + flags = [ + "funtable.c", + "-o", + "a.out", + "-Wl,--defsym,_start=0", + "-nostdlib", + "-nostartfiles", + "-O2", + ] # DLL or executable? 
if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] # Include debuginfo? if not strip: - flags.append('-g') + flags.append("-g") gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bcall') - print('''\ + jumps = grep(out, r"\bcall") + print( + """\ table calls: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/templates/gen_jumps.py b/test/templates/gen_jumps.py index 2bd7f74..8580394 100755 --- a/test/templates/gen_jumps.py +++ b/test/templates/gen_jumps.py @@ -11,46 +11,48 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, opt, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True], - [False, True]): +for gdb, pic, opt, strip in itertools.product( + [False, True], [False, True], [False, True], [False, True] # Do we need to test PIE too? 
+): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - opt_type = 'optimized' if opt else 'UNoptimized' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {opt_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + opt_type = "optimized" if opt else "UNoptimized" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {opt_type} {stripped}") # Generate object code - flags = ['jump.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles'] + flags = ["jump.c", "-o", "a.out", "-Wl,--defsym,_start=0", "-nostdlib", "-nostartfiles"] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] if opt: - flags.append('-O2') + flags.append("-O2") gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bj') - print('''\ + jumps = grep(out, r"\bj") + print( + """\ jumps: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/templates/gen_jumptable.py b/test/templates/gen_jumptable.py index 3bd3710..92d853a 100755 --- a/test/templates/gen_jumptable.py +++ b/test/templates/gen_jumptable.py @@ -11,42 +11,53 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True]): +for gdb, pic, strip in itertools.product( + [False, True], [False, True], [False, True] # Do we need to test PIE too? 
+): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {stripped}") # Generate object code - flags = ['jumptable.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles', '-O2'] + flags = [ + "jumptable.c", + "-o", + "a.out", + "-Wl,--defsym,_start=0", + "-nostdlib", + "-nostartfiles", + "-O2", + ] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bjmp') - print('''\ + jumps = grep(out, r"\bjmp") + print( + """\ table jumps: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/test_gdb.py b/test/test_gdb.py index 13d6ead..06c19a9 100644 --- a/test/test_gdb.py +++ b/test/test_gdb.py @@ -7,67 +7,80 @@ def test_savecfg_help(): - result = execute_gdb_command('help savecfg') - assert 'Save an assembly control-flow graph (CFG)' in result.stdout + result = execute_gdb_command("help savecfg") + assert "Save an assembly control-flow graph (CFG)" in result.stdout def test_viewcfg_help(): - result = execute_gdb_command('help viewcfg') - assert 'Draw an assembly control-flow graph (CFG)' in result.stdout + result = execute_gdb_command("help viewcfg") + assert "Draw an assembly control-flow graph (CFG)" in 
result.stdout def test_help_set(): - result = execute_gdb_command('help set') - assert 'set skipcalls -- Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help set") + assert ( + "set skipcalls -- Set whether savecfg and viewcfg commands will skip function" + in result.stdout + ) def test_help_set_skipcalls(): - result = execute_gdb_command('help set skipcalls') - assert 'Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help set skipcalls") + assert "Set whether savecfg and viewcfg commands will skip function" in result.stdout def test_help_show_skipcalls(): - result = execute_gdb_command('help show skipcalls') - assert 'Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help show skipcalls") + assert "Set whether savecfg and viewcfg commands will skip function" in result.stdout def test_show_skipcalls(): - result = execute_gdb_command('show skipcalls') - assert 'Commands savecfg and viewcfg will' in result.stdout + result = execute_gdb_command("show skipcalls") + assert "Commands savecfg and viewcfg will" in result.stdout def test_skipcalls_inital_value(): - result = execute_gdb_command('show skipcalls') - assert parse_option_value(result.stdout) == 'off' + result = execute_gdb_command("show skipcalls") + assert parse_option_value(result.stdout) == "off" def test_change_skipcalls_value(): - result = execute_gdb_command('set skipcalls') - assert parse_option_value(result.stdout) == 'on' + result = execute_gdb_command("set skipcalls") + assert parse_option_value(result.stdout) == "on" def test_savecfg(): result = execute_gdb_commands( - ['set confirm off', 'set breakpoint pending on', - 'file test/fixtures/simple_program/hello', 'b main', - 'run', 'savecfg'] + [ + "set confirm off", + "set breakpoint pending on", + "file test/fixtures/simple_program/hello", + "b main", + "run", + 
"savecfg", + ] ) - assert os.path.isfile('main.pdf'), result.stdout - assert 'Saved CFG to a file main.pdf' in result.stdout, result.stdout + assert os.path.isfile("main.pdf"), result.stdout + assert "Saved CFG to a file main.pdf" in result.stdout, result.stdout def test_viewcfg(): - stdout = '' + stdout = "" try: result = execute_gdb_commands( - ['set confirm off', 'set breakpoint pending on', - 'file test/fixtures/simple_program/hello', 'b main', - 'run', 'viewcfg'] + [ + "set confirm off", + "set breakpoint pending on", + "file test/fixtures/simple_program/hello", + "b main", + "run", + "viewcfg", + ] ) stdout = result.stdout except subprocess.TimeoutExpired as ex: stdout = str(ex.stdout) - viewcfg_pattern = re.compile(r'Opening a file (.*) with default viewer') + viewcfg_pattern = re.compile(r"Opening a file (.*) with default viewer") result = viewcfg_pattern.search(stdout) assert result is not None, stdout @@ -83,23 +96,28 @@ def execute_gdb_command(command): def execute_gdb_commands(commands): project_root_path = os.getcwd() - os.environ['PYTHONPATH'] = f'{project_root_path}/src/' - gdb_script_path = f'{project_root_path}/src/gdb_asm2cfg.py' - gdb_command = ['gdb', '-ex', f'source {gdb_script_path}'] + os.environ["PYTHONPATH"] = f"{project_root_path}/src/" + gdb_script_path = f"{project_root_path}/src/gdb_asm2cfg.py" + gdb_command = ["gdb", "-ex", f"source {gdb_script_path}"] for command in commands: - gdb_command.append('-ex') + gdb_command.append("-ex") gdb_command.append(command) - gdb_command.append('-ex') - gdb_command.append('q') + gdb_command.append("-ex") + gdb_command.append("q") result = subprocess.run( - gdb_command, stdout=subprocess.PIPE, stdin=None, - stderr=None, timeout=2, check=True, universal_newlines=True, + gdb_command, + stdout=subprocess.PIPE, + stdin=None, + stderr=None, + timeout=2, + check=True, + universal_newlines=True, ) return result def parse_option_value(gdb_output): - output_pattern = re.compile(r'blocks: (on|off)') + 
output_pattern = re.compile(r"blocks: (on|off)") result = output_pattern.search(gdb_output) if result: return result.group(1) diff --git a/test/test_parser.py b/test/test_parser.py index e5edeb1..dc4ca7c 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -22,15 +22,15 @@ def setUp(self): self.arm_target_info = asm2cfg.ARMTargetInfo() def test_simple_inst(self): - line = '0x000055555556f957 <+7>: push %r14' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) + line = "0x000055555556f957 <+7>: push %r14" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'push %r14') + self.assertEqual(i.body, "push %r14") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000055555556f957) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000055555556F957) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 7) self.assertIs(i.target, None) @@ -39,21 +39,21 @@ def test_simple_inst(self): self.assertFalse(i.is_unconditional_jump()) def test_jump(self): - line = '''\ + line = """\ 0x00007ffff7fbf26b <+395>: jmp 0x7ffff7fbf55d \ -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'jmp 0x7ffff7fbf55d') + self.assertEqual(i.body, "jmp 0x7ffff7fbf55d") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x00007ffff7fbf26b) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x00007FFFF7FBF26B) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 395) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - self.assertEqual(i.target.base, 'test_function') + 
self.assertEqual(i.target.base, "test_function") self.assertEqual(i.target.offset, 1149) self.assertFalse(i.is_call()) @@ -61,21 +61,21 @@ def test_jump(self): self.assertTrue(i.is_unconditional_jump()) def test_branch(self): - line = '''\ + line = """\ 0x00007ffff7fbf565 <+1157>: je 0x7ffff7fbf635 \ -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'je 0x7ffff7fbf635') + self.assertEqual(i.body, "je 0x7ffff7fbf635") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x00007ffff7fbf565) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x00007FFFF7FBF565) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 1157) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - self.assertEqual(i.target.base, 'test_function') + self.assertEqual(i.target.base, "test_function") self.assertEqual(i.target.offset, 1365) self.assertFalse(i.is_call()) @@ -83,21 +83,21 @@ def test_branch(self): self.assertFalse(i.is_unconditional_jump()) def test_call(self): - line = '''\ + line = """\ 0x000000000002ec0f <+63>: callq 0x2eab0 <__sigsetjmp@plt> -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'callq 0x2eab0') + self.assertEqual(i.body, "callq 0x2eab0") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000000000002ec0f) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000000000002EC0F) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 63) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - 
self.assertEqual(i.target.base, '__sigsetjmp@plt') + self.assertEqual(i.target.base, "__sigsetjmp@plt") self.assertEqual(i.target.offset, 0) self.assertTrue(i.is_call()) @@ -105,20 +105,20 @@ def test_call(self): self.assertFalse(i.is_unconditional_jump()) def test_call_stripped(self): - line = '''\ + line = """\ 0x000055555556f9b0 <+96>: call *0x2731a(%rip) # 0x555555596cd0 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'call *0x2731a(%rip)') + self.assertEqual(i.body, "call *0x2731a(%rip)") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000055555556f9b0) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000055555556F9B0) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 96) self.assertIsNot(i.target, None) - self.assertEqual(i.target.abs, 0x555555596cd0) + self.assertEqual(i.target.abs, 0x555555596CD0) self.assertIs(i.target.base, None) self.assertIs(i.target.offset, None) @@ -127,60 +127,60 @@ def test_call_stripped(self): self.assertFalse(i.is_unconditional_jump()) def test_objdump(self): - line = '''\ + line = """\ 16bbb: 74 29 je 16be6 <_obstack_allocated_p@@Base+0x36> -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'je 16be6') + self.assertEqual(i.body, "je 16be6") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x16bbb) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x16BBB) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - 
self.assertEqual(i.target.base, '_obstack_allocated_p@@Base') + self.assertEqual(i.target.base, "_obstack_allocated_p@@Base") self.assertIs(i.target.offset, 0x36) self.assertTrue(i.is_jump()) self.assertFalse(i.is_unconditional_jump()) def test_arm_branch(self): - line = '''\ + line = """\ 1c: 0a000001 beq 28 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'beq 28') + self.assertEqual(i.body, "beq 28") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x1c) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x1C) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - self.assertEqual(i.target.base, 'check_one_fd') + self.assertEqual(i.target.base, "check_one_fd") self.assertIs(i.target.offset, 0x28) self.assertTrue(i.is_jump()) self.assertFalse(i.is_unconditional_jump()) def test_arm_jump(self): - line = '''\ + line = """\ 1c: 0a000001 b 28 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'b 28') + self.assertEqual(i.body, "b 28") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x1c) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x1C) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - self.assertEqual(i.target.base, 'check_one_fd') + self.assertEqual(i.target.base, "check_one_fd") self.assertIs(i.target.offset, 0x28) self.assertTrue(i.is_jump()) @@ -193,7 +193,7 @@ class ParseLinesTestCase(unittest.TestCase): """ 
def test_linear_sequence(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556f952 <+2>: mov $0x1,%ecx 0x000055555556f957 <+7>: push %r14 @@ -201,8 +201,10 @@ def test_linear_sequence(self): 0x000055555556f95b <+11>: push %r12 0x000055555556f95d <+13>: push %rbp 0x000055555556f95e <+14>: push %rbx\ -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 1) _, block = blocks.popitem() @@ -211,13 +213,15 @@ def test_linear_sequence(self): self.assertIs(block.no_jump_edge, None) def test_unconditional(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: jmp 0x555555570058 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -232,7 +236,7 @@ def test_unconditional(self): self.assertEqual(len(dst_block.instructions), 2) def test_conditional(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: je 0x555555570058 0x000055555556fffd <+1709>: push %rbx @@ -240,8 +244,10 @@ def test_conditional(self): 0x000055555556fffe <+1710>: mov %r15,%r8 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 3) @@ -261,13 +267,15 @@ def test_conditional(self): self.assertEqual(len(dst_block.instructions), 2) def test_return(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: retq 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 
0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -285,7 +293,7 @@ def test_return(self): @unittest.expectedFailure def test_jumptables(self): - lines = '''\ + lines = """\ Dump of assembler code for function bar: 0x0000000000001070 <+0>: endbr64 0x0000000000001074 <+4>: cmp $0x9,%edi @@ -331,19 +339,23 @@ def test_jumptables(self): 0x0000000000001119 <+169>: retq 0x000000000000111a <+170>: nopw 0x0(%rax,%rax,1) 0x0000000000001120 <+176>: retq -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") # TODO: special block for indirect jumps self.assertEqual(len(blocks), 4) def test_dummy_block(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1709>: push %rbx 0x000055555556fffd <+1707>: je 0x000055555556fffb -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -356,7 +368,7 @@ def test_dummy_block(self): self.assertIs(fall_block.jump_edge, None) self.assertIs(fall_block.no_jump_edge, None) self.assertEqual(len(fall_block.instructions), 1) - self.assertEqual(fall_block.instructions[0].text, 'end of function') + self.assertEqual(fall_block.instructions[0].text, "end of function") # TODO: # - functions (with and w/o calls) diff --git a/test/test_regex.py b/test/test_regex.py index 801d2a8..c896601 100644 --- a/test/test_regex.py +++ b/test/test_regex.py @@ -13,25 +13,25 @@ class FunctionHeaderTestCase(unittest.TestCase): """ def test_gdb_unstripped(self): - line = 'Dump of assembler code for function test_function:' + line = "Dump of assembler code for function test_function:" fmt, fun = asm2cfg.parse_function_header(line) 
self.assertEqual(fmt, asm2cfg.InputFormat.GDB) - self.assertEqual(fun, 'test_function') + self.assertEqual(fun, "test_function") def test_gdb_stripped(self): - line = 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + line = "Dump of assembler code from 0x555555555faf to 0x555555557008:" fmt, fun = asm2cfg.parse_function_header(line) self.assertEqual(fmt, asm2cfg.InputFormat.GDB) - self.assertEqual(fun, '0x555555555faf-0x555555557008') + self.assertEqual(fun, "0x555555555faf-0x555555557008") def test_objdump(self): - line = '000000000000100b :' + line = "000000000000100b :" fmt, fun = asm2cfg.parse_function_header(line) self.assertEqual(fmt, asm2cfg.InputFormat.OBJDUMP) - self.assertEqual(fun, 'bar') + self.assertEqual(fun, "bar") class ParseAddressTestCase(unittest.TestCase): @@ -40,24 +40,24 @@ class ParseAddressTestCase(unittest.TestCase): """ def test_absolute(self): - line = '0x000055555557259c: XYZ' + line = "0x000055555557259c: XYZ" address, rest = asm2cfg.parse_address(line) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x55555557259c) + self.assertEqual(address.abs, 0x55555557259C) self.assertIs(address.base, None) self.assertIs(address.offset, None) - self.assertEqual(rest, ' XYZ') + self.assertEqual(rest, " XYZ") def test_relative(self): - line = '0x000055555557259c <+11340>: XYZ' + line = "0x000055555557259c <+11340>: XYZ" address, rest = asm2cfg.parse_address(line) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x55555557259c) + self.assertEqual(address.abs, 0x55555557259C) self.assertIs(address.base, None) self.assertEqual(address.offset, 11340) - self.assertEqual(rest, ' XYZ') + self.assertEqual(rest, " XYZ") class ParseBodyTestCase(unittest.TestCase): @@ -69,54 +69,54 @@ def setUp(self): self.target_info = asm2cfg.X86TargetInfo() def test_gdb_stripped_known(self): - line = ' call 0x55555558add0 <_Z19exportDebugifyStats>' + line = " call 0x55555558add0 <_Z19exportDebugifyStats>" body, opcode, ops, 
rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call 0x55555558add0') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['0x55555558add0']) - self.assertEqual(rest, '<_Z19exportDebugifyStats>') + self.assertEqual(body, "call 0x55555558add0") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["0x55555558add0"]) + self.assertEqual(rest, "<_Z19exportDebugifyStats>") def test_gdb_stripped_pic(self): - line = ' call *0x26a16(%rip) # 0x5555555967a8' + line = " call *0x26a16(%rip) # 0x5555555967a8" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call *0x26a16(%rip)') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['*0x26a16(%rip)']) - self.assertEqual(rest, '# 0x5555555967a8') + self.assertEqual(body, "call *0x26a16(%rip)") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["*0x26a16(%rip)"]) + self.assertEqual(rest, "# 0x5555555967a8") def test_gdb_plt(self): - line = ' callq 0x1020 ' + line = " callq 0x1020 " body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'callq 0x1020') - self.assertEqual(opcode, 'callq') - self.assertEqual(ops, ['0x1020']) - self.assertEqual(rest, '') + self.assertEqual(body, "callq 0x1020") + self.assertEqual(opcode, "callq") + self.assertEqual(ops, ["0x1020"]) + self.assertEqual(rest, "") def test_gdb_stripped_nonpic(self): - line = ' call 0x555555555542' + line = " call 0x555555555542" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call 0x555555555542') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['0x555555555542']) - self.assertEqual(rest, '') + self.assertEqual(body, "call 0x555555555542") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["0x555555555542"]) + self.assertEqual(rest, "") def 
test_gdb_indirect_call(self): - line = ' callq *(%rsi)' + line = " callq *(%rsi)" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'callq *(%rsi)') - self.assertEqual(opcode, 'callq') - self.assertEqual(ops, ['*(%rsi)']) - self.assertEqual(rest, '') + self.assertEqual(body, "callq *(%rsi)") + self.assertEqual(opcode, "callq") + self.assertEqual(ops, ["*(%rsi)"]) + self.assertEqual(rest, "") class ParseTargetTestCase(unittest.TestCase): @@ -125,44 +125,44 @@ class ParseTargetTestCase(unittest.TestCase): """ def test_with_offset(self): - line = '<_Z19exportDebugifyStats+123>' + line = "<_Z19exportDebugifyStats+123>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, 123) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_with_neg_offset(self): - line = '<_Z19exportDebugifyStats-123>' + line = "<_Z19exportDebugifyStats-123>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, -123) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_without_offset(self): - line = '<_Z19exportDebugifyStats>' + line = "<_Z19exportDebugifyStats>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_with_dot(self): - line = '' + line = "" address, rest = asm2cfg.parse_target(line) 
self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, 'stdin@GLIBC_2.2.5') + self.assertEqual(address.base, "stdin@GLIBC_2.2.5") self.assertEqual(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") class ParseCommentTestCase(unittest.TestCase): @@ -174,31 +174,31 @@ def setUp(self): self.target_info = asm2cfg.X86TargetInfo() def test_absolute(self): - line = '# 0x5555555967a8' + line = "# 0x5555555967a8" address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x5555555967a8) + self.assertEqual(address.abs, 0x5555555967A8) self.assertIs(address.base, None) self.assertIs(address.offset, None) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_symbolic(self): - line = '# 0x5555555967a8 ' + line = "# 0x5555555967a8 " address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x5555555967a8) - self.assertEqual(address.base, 'foo') + self.assertEqual(address.abs, 0x5555555967A8) + self.assertEqual(address.base, "foo") self.assertIs(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_complete(self): - line = '# 3ff8 ' + line = "# 3ff8 " address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x3ff8) # FIXME: support hex offsets - self.assertEqual(address.base, 'foo') - self.assertEqual(address.offset, 0x2ff8) - self.assertEqual(rest, '') + self.assertEqual(address.abs, 0x3FF8) # FIXME: support hex offsets + self.assertEqual(address.base, "foo") + self.assertEqual(address.offset, 0x2FF8) + self.assertEqual(rest, "") From a3e4645b52241e913e01af4b2020bcaafeb662f6 Mon Sep 17 00:00:00 2001 From: Mauri Mustonen Date: Mon, 1 Apr 2024 16:23:35 +0300 Subject: [PATCH 31/31] more fixes --- ocgraph/__main__.py | 4 +-- ocgraph/configuration/configuration.py | 29 
++++++++----------- .../configuration/disassembler/gdb_default.py | 3 +- ocgraph/configuration/logger.py | 1 - ocgraph/interface/analyzer.py | 4 +-- 5 files changed, 17 insertions(+), 24 deletions(-) mode change 100755 => 100644 ocgraph/configuration/logger.py diff --git a/ocgraph/__main__.py b/ocgraph/__main__.py index b53afcb..fee92c2 100755 --- a/ocgraph/__main__.py +++ b/ocgraph/__main__.py @@ -64,9 +64,7 @@ def main(): args = parser.parse_args() # Create configuration - config = OcGraphConfiguration( - disassembler=args.diss, arch=args.arch, logging_preset=args.logger - ) + config = OcGraphConfiguration(disassembler=args.diss, arch=args.arch, preset=args.logger) lines = read_lines(args.file) diff --git a/ocgraph/configuration/configuration.py b/ocgraph/configuration/configuration.py index b01c9d3..34630ef 100755 --- a/ocgraph/configuration/configuration.py +++ b/ocgraph/configuration/configuration.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GTDGmbH """Module for configuration of the ocgraph package.""" -from .logger import OcctreLogger, preset_logging +from .logger import OCGraphLogger, logging_preset from .architecture.architecture import Architecture from .architecture.x86 import X86Architecture @@ -51,21 +51,25 @@ "architecture": PpcArchitecture(), }, } - # fmt: on class OcGraphConfiguration: """Implement configuration presets for the ASM2CFG tool.""" - def __init__( - self, arch: str = "sparc", disassembler: str = "OBJDUMP", logging_preset="default" - ): + logger: OCGraphLogger + """Logging mechanism for module""" + architecture: Architecture + """Target architecture instance""" + disassembler: Disassembler + """Target disassembler tool like OBJDump, GDB, ...""" + + def __init__(self, arch: str = "sparc", disassembler: str = "OBJDUMP", preset="default"): if architecture_option.get(arch) is None: raise NotImplementedError("Architecture option not supported!") if disassembler_option.get(disassembler) is None: raise NotImplementedError("Disassembler option 
not supported!") - if preset_logging.get(logging_preset) is None: + if logging_preset.get(preset) is None: raise NotImplementedError("Logging preset not supported!") # load module preset @@ -74,7 +78,7 @@ def __init__( self.__dict__ = _preset # configure logging - self.logger = OCGraphLogger("OcGraph", logging_preset, "asm2cfg.log") + self.logger = OCGraphLogger("OcGraph", preset, "asm2cfg.log") @staticmethod def architectures(): @@ -89,13 +93,4 @@ def disassemblers(): @staticmethod def loggers(): """Return all available disassemblers options""" - return preset_logging.keys() - - logger: logging.Logger = logging.Logger("OcGraph") - """Logging mechanism for module""" - - architecture: Architecture - """Target architecture instance""" - - disassembler: Disassembler - """Target disassembler tool like OBJDump, GDB, ...""" + return logging_preset.keys() diff --git a/ocgraph/configuration/disassembler/gdb_default.py b/ocgraph/configuration/disassembler/gdb_default.py index 7d54096..beecbbf 100755 --- a/ocgraph/configuration/disassembler/gdb_default.py +++ b/ocgraph/configuration/disassembler/gdb_default.py @@ -214,4 +214,5 @@ def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Non ) def parse_jump_target(self, str_input: str) -> int | None: - return int(re.search(rf"{HEX_LONG_PATTERN}", str_input)[0], 16) + # TODO: Fix str_input indexing because it's a list. + return int(re.search(rf"{HEX_LONG_PATTERN}", str_input[0])[0], 16) diff --git a/ocgraph/configuration/logger.py b/ocgraph/configuration/logger.py old mode 100755 new mode 100644 index b68874b..badab2a --- a/ocgraph/configuration/logger.py +++ b/ocgraph/configuration/logger.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # SPDX-License-Identifier: GTDGmbH # Copyright 2024 by GTD GmbH. 
"""Class configuring the OCGraph logging.""" diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py index 7d436b7..2427109 100755 --- a/ocgraph/interface/analyzer.py +++ b/ocgraph/interface/analyzer.py @@ -125,11 +125,11 @@ def _create_basic_blocks(self) -> None: prev_branch_block: BasicBlock | None = None # block completion flag (introduced for SPARC pipeline) - block_completion: int = 0 + block_completion: int | None = 0 for instruction in self.instructions: # if block completion is in progress - if block_completion > 0: + if block_completion is not None and block_completion > 0: block_completion -= 1 if block_completion > 0: self.basic_blocks[curr_basic_block.key].add_instruction(instruction)