EPFLiGHT
diff --git a/‎docs/process.md‎
Lines changed: 22 additions & 0 deletions b/‎docs/process.md‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎examples/postprocessor/config.yaml‎
Lines changed: 2 additions & 0 deletions b/‎examples/postprocessor/config.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/process/config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎examples/process/config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/mmore/process/crawler.py‎
Lines changed: 1 addition & 107 deletions b/‎src/mmore/process/crawler.py‎
Lines changed: 1 addition & 107 deletions
diff --git a/‎src/mmore/process/dispatcher.py‎
Lines changed: 20 additions & 0 deletions b/‎src/mmore/process/dispatcher.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/mmore/process/incremental.py‎
Lines changed: 122 additions & 0 deletions b/‎src/mmore/process/incremental.py‎
Lines changed: 122 additions & 0 deletions
@@ -71,6 +71,20 @@ You can configure parameters by providing a custom config file. You can find an
 
 :rotating_light: Not all parameters are configurable yet :wink:
 
+### :recycle: Incremental reprocessing
+
+The optional top-level `previous_results` parameter lets you reuse results from a prior run so that unchanged files are not reprocessed, saving time and compute costs.
+
+```yaml
+previous_results: examples/process/outputs/merged/merged_results.jsonl
+```
+
+Point it to a `merged_results.jsonl` produced by an earlier run. On the next run, each local input file is compared against that JSONL (URL inputs are always reprocessed):
+
+- Unchanged files: their previous samples are reused as-is.
+- New or modified files: they are processed normally.
+- Removed files: their samples are dropped from the output.
+
 ## :scroll: More information on what's under the hood
 
 ### :construction: Pipeline architecture
@@ -127,4 +141,12 @@ python3 -m mmore postprocess --config-file examples/postprocessor/config.yaml --
 
 Specify with `--input-data` the path (absolute or relative to the root of the repository) to the JSONL recording of the output of the initial processing phase.
 
+### :recycle: Incremental post-processing
+
+Like the processing pipeline, the post-processor accepts an optional `previous_results` parameter to reuse results from a prior post-processing run and skip unchanged documents.
+
+```yaml
+previous_results: examples/postprocessor/outputs/merged/results.jsonl
+```
+
 New post-processors can easily be implemented, and pipelines can be configured through lightweight YAML files. The post-processing stage produces a new JSONL file containing cleaned and optionally enhanced document samples.
@@ -1,3 +1,5 @@
+previous_results: null  # Path to a previously post-processed JSONL to reuse when documents are unchanged
+
 pp_modules:
   - type: chunker
     args:
 
@@ -1,5 +1,6 @@
 data_path: examples/sample_data/ #put absolute path! Possible to pass a list of folders as well
 google_drive_ids: [] #put ids of google drive folders
+previous_results: null  # Path to a previously processed JSONL to reuse when documents are unchanged
 dispatcher_config:
   output_path: examples/process/outputs/ #put absolute path or relative to the root of the module
   use_fast_processors: false
 
@@ -1,4 +1,3 @@
-import json
 import logging
 import os
 from typing import Dict, List, Optional
@@ -97,72 +96,6 @@ def from_dict(data: dict):
         return DispatcherReadyResult(urls=urls, file_paths=file_paths)
 
 
-class FindAlreadyComputedFiles:
-    """
-    This class is used to get the list of all files that have already been processed.
-    It will traverse the output_path directory and get all the results.jsonl files
-    where in each line (representing a sample) we have the metadata of the file_path that was used to create that sample.
-    > See create_sample in utils.py (file_path is in metadata of the sample).
-
-    Reminder here is the structure of the output_path directory (see DispatcherConfig):
-    output_path
-    ├── processors
-    | ├── Processor_type_1
-    | | └── results.jsonl
-    | ├── Processor_type_2
-    | | └── results.jsonl
-    | ├── ...
-    |
-    └── merged
-        └── merged_results.jsonl
-    """
-
-    def __init__(self, output_path: str):
-        """
-        output_path: the path where the output of the Process is stored.
-        """
-        if output_path is None:
-            raise ValueError("output_path must be provided.")
-        self.output_path = output_path
-
-    def _get_all_samples_jsonl_paths(self, output_path):
-        # Get all the results.jsonl files in the output_path directory.
-        samples_files = []
-        for root, _, files in os.walk(output_path):
-            for file in files:
-                if file.endswith("results.jsonl"):
-                    samples_files.append(os.path.join(root, file))
-        return samples_files
-
-    def _get_metadata_jsonl_path(self, results_jsonl_path):
-        # read jsonl file and for each item in the file, get the metadata's file_path
-        # return the list of all file_path in this jsonl file
-        file_paths = []
-        with open(results_jsonl_path, "r") as f:
-            for i, line in enumerate(f):
-                data = json.loads(line)
-                if "metadata" in data and "file_path" in data["metadata"]:
-                    file_paths.append(data["metadata"]["file_path"])
-                else:
-                    logger.error(
-                        f"Warning file_path not found in metadate (line{i} of {results_jsonl_path})"
-                    )
-        return file_paths
-
-    def get_all_files_already_processed(self) -> set[str]:
-        """
-        This function returns the set of all files path's that have already been processed.
-        Returns:
-             set of all files path's that have already been processed.
-        """
-        samples = self._get_all_samples_jsonl_paths(self.output_path)
-        files_already_processed = set()
-        for f in samples:
-            line = self._get_metadata_jsonl_path(f)
-            files_already_processed.update(line)
-        return files_already_processed
-
-
 class CrawlerConfig:
     """
     Configuration for the Crawler.
@@ -308,40 +241,9 @@ def _traverse_directories(self) -> None:
                             FileDescriptor.from_filename(filepath)
                         )
 
-    def _filter_out_already_processed_files(
-        self, files: Dict[str, List[FileDescriptor]], output_path: str
-    ) -> Dict[str, List[FileDescriptor]]:
-        """
-        Avoid processing files that have already been processed.
-        Immutable function.
-        Args:
-            files: the crawled files that want to be processed. We want to remove the files that have already been processed.
-            output_path: the path where the outputs of the "process" is stored.
-        Returns:
-            filtered out 'files' to process.
-        """
-        all_files_done: set[str] = FindAlreadyComputedFiles(
-            output_path
-        ).get_all_files_already_processed()
-        logger.info(f"Found {len(all_files_done)} files already processed.")
-
-        for root_dir, files_in_dir in files.items():
-            files[root_dir] = [
-                f for f in files_in_dir if f.file_path not in all_files_done
-            ]
-
-        if len(all_files_done) > 0:
-            logger.info(f"Removed {len(all_files_done)} files already processed.")
-            logger.info(
-                f"New total files to process: {sum(len(files) for files in files.values())}"
-            )
-        return files
-
-    def crawl(self, skip_already_processed: bool = False) -> DispatcherReadyResult:
+    def crawl(self) -> DispatcherReadyResult:
         """
         Crawl the configured directories and URLs.
-        Args:
-            skip_already_processed (bool): if set to True, the crawler will scan the outputs folder and detect files that correspond to them, and skip them.
         Returns:
             DispatcherReadyResult: The result of the crawl operation, ready to be dispatched to the processors.
         """
@@ -368,12 +270,4 @@ def crawl(self, skip_already_processed: bool = False) -> DispatcherReadyResult:
         urls: List[URLDescriptor] = self.files["url"]
         file_paths: Dict[str, List[FileDescriptor]] = self.files["local"]
 
-        if self.config.output_path and skip_already_processed:
-            logger.info(
-                "Checking if some of those files to process have already been processed."
-            )
-            file_paths = self._filter_out_already_processed_files(
-                files=file_paths, output_path=self.config.output_path
-            )
-
         return DispatcherReadyResult(urls=urls, file_paths=file_paths)
@@ -1,6 +1,7 @@
 import logging
 import os
 from dataclasses import dataclass
+from datetime import datetime
 from operator import itemgetter
 from typing import Dict, Iterator, List, Optional, Tuple, Type, Union, cast
 
@@ -321,10 +322,24 @@ def process_files(
 
         return results
 
+    def _clear_per_processor_results(self) -> None:
+        """Clear per-processor result JSONL files.
+        This is needed because :meth:`MultimodalSample.to_jsonl` uses append by default."""
+        if not self.config.output_path:
+            return
+        processors_dir = os.path.join(self.config.output_path, "processors")
+        if not os.path.isdir(processors_dir):
+            return
+        for processor_name in os.listdir(processors_dir):
+            results_path = os.path.join(processors_dir, processor_name, "results.jsonl")
+            if os.path.exists(results_path):
+                os.remove(results_path)
+
     def dispatch(self) -> List[List[MultimodalSample]]:
         """
         Dispatches the result to the appropriate processor.
         """
+        self._clear_per_processor_results()
 
         def batch_list(
             lst: List, obj_batch_size: int, processor: Type[Processor]
@@ -403,6 +418,11 @@ def save_individual_processor_results(
         if not self.config.output_path:
             return
 
+        processed_at = datetime.now().isoformat()
+        for sample in results:
+            sample.metadata["processed_at"] = processed_at
+            sample.metadata["processor_type"] = cls_name
+
         processor_output_path = os.path.join(
             self.config.output_path, "processors", cls_name
         )
 
@@ -0,0 +1,122 @@
+import json
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Set
+
+from ..type import MultimodalSample
+
+logger = logging.getLogger(__name__)
+
+
+def _iter_samples_jsonl(path: str):
+    """Helper function to stream line by line a JSONL to avoid loading it fully in memory."""
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"Previous results file not found: {path}")
+    with open(path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            yield MultimodalSample.from_dict(json.loads(line))
+
+
+def load_previous_process_results(path: str) -> Dict[str, MultimodalSample]:
+    """Index samples by ``metadata.file_path`` for the processing pipeline,
+    keeping the latest ``processed_at`` if there are any duplicates."""
+    samples_by_file_path: Dict[str, List[MultimodalSample]] = {}
+    for sample in _iter_samples_jsonl(path):
+        samples_by_file_path.setdefault(sample.metadata["file_path"], []).append(sample)
+
+    index: Dict[str, MultimodalSample] = {}
+    for file_path, samples in samples_by_file_path.items():
+        if len(samples) > 1:
+            logger.warning(
+                "Duplicate samples for file_path %s: keeping latest processed_at, "
+                "dropping %d samples",
+                file_path,
+                len(samples) - 1,
+            )
+
+        index[file_path] = max(
+            samples,
+            key=lambda s: datetime.fromisoformat(s.metadata["processed_at"])
+            if s.metadata.get("processed_at") is not None
+            else datetime.min,
+        )
+    return index
+
+
+def load_previous_postprocess_results(
+    path: str,
+) -> Dict[str, List[MultimodalSample]]:
+    """Index samples by ``metadata.file_path`` for the post-processing pipeline."""
+    index: Dict[str, List[MultimodalSample]] = {}
+    for sample in _iter_samples_jsonl(path):
+        index.setdefault(sample.metadata["file_path"], []).append(sample)
+    return index
+
+
+def is_reusable_process(file_path: str, previous: Dict[str, MultimodalSample]) -> bool:
+    """Check whether the previous processed sample the given file can be reused.
+
+    Conditions (all required):
+    - ``file_path`` is present in ``previous``
+    - the cached sample has a ``processed_at`` timestamp
+    - the source file has not been modified since (``file_mtime <= processed_at``)
+    """
+    sample = previous.get(file_path)
+    if sample is None:
+        return False
+
+    processed_at_str = sample.metadata.get("processed_at")
+    if processed_at_str is None:
+        return False
+
+    processed_at = datetime.fromisoformat(processed_at_str)
+    if not os.path.exists(file_path):
+        return False
+    file_mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
+    return file_mtime <= processed_at
+
+
+def is_reusable_postprocess(
+    file_path: str,
+    input_processed_at: str,
+    previous: Dict[str, List[MultimodalSample]],
+) -> bool:
+    """Check whether the previous post-processed samples of the given file can be reused.
+
+    Conditions (all required):
+    - ``file_path`` has at least one cached sample in ``previous``
+    - every cached sample has a ``processed_at`` timestamp
+    - ``input_processed_at <= min(cached processed_at)``
+    """
+    samples = previous.get(file_path)
+    if not samples:
+        return False
+
+    timestamps: List[datetime] = []
+    for s in samples:
+        timestamp_str = s.metadata.get("processed_at")
+        if timestamp_str is None:
+            return False
+        timestamps.append(datetime.fromisoformat(timestamp_str))
+
+    return datetime.fromisoformat(input_processed_at) <= min(timestamps)
+
+
+def merge_results(
+    reused: Dict[str, List[MultimodalSample]],
+    new_results: List[MultimodalSample],
+    current_file_paths: Set[str],
+) -> List[MultimodalSample]:
+    """Combine reused and newly processed/post-processed samples."""
+    merged: List[MultimodalSample] = []
+    for file_path, samples in reused.items():
+        if file_path in current_file_paths:
+            merged.extend(samples)
+    for sample in new_results:
+        if sample.metadata["file_path"] in current_file_paths:
+            merged.append(sample)
+    return merged
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+previous_results: null # Path to a previously post-processed JSONL to reuse when documents are unchanged`
	`2`	`+`
`1`	`3`	`pp_modules:`
`2`	`4`	`- type: chunker`
`3`	`5`	`args:`