Commit bd8c4ac
fix(processing): adapt is_padding to fix potential MemoryError

If an unknown chunk is larger than the available RAM on the system where unblob runs, the previous is_padding implementation could raise a MemoryError, because it loaded the whole chunk into memory at once. Fixed by iterating over the unknown chunk with iterate_file and returning early as soon as two different byte values have been seen.

1 parent c8c903b

File tree

1 file changed: +12 −1

python/unblob/processing.py

@@ -462,7 +462,18 @@ def _iterate_directory(self, extract_dirs, processed_paths):
 
 
 def is_padding(file: File, chunk: UnknownChunk):
-    return len(set(file[chunk.start_offset : chunk.end_offset])) == 1
+    chunk_bytes = set()
+
+    for small_chunk in iterate_file(
+        file, chunk.start_offset, chunk.end_offset - chunk.start_offset
+    ):
+        chunk_bytes.update(small_chunk)
+
+        # early return optimization
+        if len(chunk_bytes) > 1:
+            return False
+
+    return len(chunk_bytes) == 1
 
 
 def process_patterns(
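The same technique can be shown without unblob's internals. `File`, `UnknownChunk`, and `iterate_file` are unblob-specific helpers; the sketch below is a hypothetical standalone equivalent using plain file I/O, reading the chunk in fixed-size steps and bailing out as soon as a second distinct byte value appears, so memory use stays bounded regardless of chunk size:

```python
import io

READ_SIZE = 64 * 1024  # assumption: 64 KiB read steps, not unblob's actual value


def is_padding(f, start_offset, end_offset):
    """Return True if f[start_offset:end_offset] is a single repeated byte."""
    seen = set()
    f.seek(start_offset)
    remaining = end_offset - start_offset
    while remaining > 0:
        data = f.read(min(READ_SIZE, remaining))
        if not data:  # unexpected EOF
            break
        remaining -= len(data)
        seen.update(data)  # iterating bytes yields ints; set stays tiny
        # early return: two distinct byte values means it is not padding
        if len(seen) > 1:
            return False
    return len(seen) == 1


# usage: a 200 KB run of NUL bytes is padding, mixed bytes are not
print(is_padding(io.BytesIO(b"\x00" * 200_000), 0, 200_000))  # → True
print(is_padding(io.BytesIO(b"\x00" * 100 + b"\x01"), 0, 101))  # → False
```

Note that only the set of distinct byte values (at most 256 entries) is kept in memory, which is what removes the MemoryError risk of materializing the whole slice.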

0 commit comments