michelcrypt4d4mus · michelcrypt4d4mus · Jun 24, 2025
diff --git a/README.md b/README.md
@@ -74,6 +74,15 @@ If you provide none of the flags in the `ANALYSIS SELECTION` section of the `--h
 
 The `--streams` output is the one used to hunt for patterns in the embedded bytes and can be _extremely_ verbose depending on the `--quote-char` options chosen (or not chosen) and contents of the PDF. [The Yaralyzer](https://github.com/michelcrypt4d4mus/yaralyzer) handles this task; if you want to hunt for patterns in the bytes other than bytes surrounded by backticks/frontslashes/brackets/quotes/etc. you may want to use The Yaralyzer directly. As The Yaralyzer is a prerequisite for The Pdfalyzer you may already have the `yaralyze` command installed and available.
 
+### Exporting to JSON
+Use the `-json` flag to export analysis results to structured JSON files:
+
+```sh
+pdfalyze suspicious.pdf -json --output-dir ./analysis
+```
+
+This will create a JSON file for each analysis type (document info, tree structure, fonts, etc.), preserving the complete PDF structure for programmatic processing, plus a manifest file listing the exported files.
+
 ### Setting Command Line Options Permanently With A `.pdfalyzer` File
 When you run `pdfalyze` on some PDF the tool will check for a file called `.pdfalyzer` first in the current directory and then in the home directory. If it finds a file in either such place it will load configuration options from it. Documentation on the options that can be configured with these files lives in [`.pdfalyzer.example`](.pdfalyzer.example) which doubles as an example file you can copy into place and edit to your needs. Handy if you find yourself typing the same command line options over and over again.
 
@@ -141,7 +150,7 @@ for backtick_quoted_string in font.binary_scanner.extract_backtick_quoted_bytes(
 -------------
 
 # Example Output
-The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich). SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). See `pdfalyze --help` for the specifics.
+The Pdfalyzer can export visualizations to HTML, ANSI colored text, and SVG images using the file export functionality that comes with [Rich](https://github.com/Textualize/rich), and can additionally export analysis results as JSON data files. SVGs can be turned into `png` format images with a tool like Inkscape or `cairosvg` (Inkscape works a lot better in our experience). JSON export preserves the complete PDF structure for programmatic analysis. See `pdfalyze --help` for the specifics.
 
 
 ## Basic Tree View

diff --git a/pdfalyzer/__init__.py b/pdfalyzer/__init__.py
@@ -1,5 +1,6 @@
 import code
 import sys
+from datetime import datetime
 from os import environ, getcwd, path
 from pathlib import Path
 
@@ -27,6 +28,7 @@
 
 from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
 from pdfalyzer.helpers.rich_text_helper import print_highlighted
+from pdfalyzer.output.json_exporter import JsonExporter
 from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
 from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
 from pdfalyzer.pdfalyzer import Pdfalyzer
@@ -41,8 +43,12 @@
 def pdfalyze():
     args = parse_arguments()
     pdfalyzer = Pdfalyzer(args.file_to_scan_path)
-    pdfalyzer = PdfalyzerPresenter(pdfalyzer)
+    presenter = PdfalyzerPresenter(pdfalyzer)
     output_basepath = None
+
+    # Initialize JSON exporter if needed
+    json_exporter = JsonExporter(pdfalyzer) if args.json else None
+    json_export_data = {} if args.json else None
 
     # Binary stream extraction is a special case
     if args.extract_binary_streams:
@@ -53,7 +59,7 @@ def pdfalyze():
 
     # The method that gets called is related to the argument name. See 'possible_output_sections' list in argument_parser.py
     # Analysis exports wrap themselves around the methods that actually generate the analyses
-    for (arg, method) in output_sections(args, pdfalyzer):
+    for (arg, method) in output_sections(args, presenter):
         if args.output_dir:
             output_basepath = PdfalyzerConfig.get_output_basepath(method)
             print(f'Exporting {arg} data to {output_basepath}...')
@@ -69,10 +75,52 @@ def pdfalyze():
 
         if args.export_svg:
             invoke_rich_export(console.save_svg, output_basepath)
+
+        # Handle JSON export
+        if args.json and json_exporter:
+            json_output_path = None
+            if arg == 'docinfo':
+                json_output_path = json_exporter.export_document_info(Path(args.output_dir))
+            elif arg == 'tree':
+                json_output_path = json_exporter.export_tree(Path(args.output_dir))
+            elif arg == 'rich':
+                # For rich tree, export the detailed tree structure
+                json_output_path = json_exporter.export_tree(Path(args.output_dir))
+                # Rename file to indicate it's the rich/detailed version
+                if json_output_path.exists():
+                    rich_path = json_output_path.parent / json_output_path.name.replace('_tree.json', '_rich_tree.json')
+                    json_output_path.rename(rich_path)
+                    json_output_path = rich_path
+            elif arg == 'counts':
+                json_output_path = json_exporter.export_summary(Path(args.output_dir))
+            elif arg == 'fonts':
+                json_output_path = json_exporter.export_fonts(Path(args.output_dir))
+            elif arg == 'streams':
+                json_output_path = json_exporter.export_streams(Path(args.output_dir))
+            elif arg == 'yara':
+                # Get YARA matches from the presenter's yaralyzer
+                if hasattr(presenter, 'yaralyzer') and hasattr(presenter.yaralyzer, 'yara_matches'):
+                    json_output_path = json_exporter.export_yara_results(Path(args.output_dir), presenter.yaralyzer.yara_matches)
+
+            if json_output_path:
+                json_export_data[arg] = str(json_output_path)
+                log_and_print(f"  -> Exported {arg} to JSON: {json_output_path}")
 
         # Clear the buffer if we have one
         if args.output_dir:
             del console._record_buffer[:]
+
+    # If JSON export was requested, create a manifest file
+    if args.json and json_export_data:
+        manifest_path = Path(args.output_dir) / f"{pdfalyzer.pdf_basename}_manifest.json"
+        with open(manifest_path, 'w') as f:
+            import json
+            json.dump({
+                "pdf_file": pdfalyzer.pdf_basename,
+                "exports": json_export_data,
+                "timestamp": datetime.now().isoformat()
+            }, f, indent=2)
+        log_and_print(f"\nJSON export complete. Manifest written to: {manifest_path}")
 
     # Drop into interactive shell if requested
     if args.interact: