7
7
from contextlib import asynccontextmanager
8
8
from csv import DictReader
9
9
from glob import glob
10
- from io import BufferedReader , IOBase , TextIOWrapper
10
+ from io import BufferedReader , BytesIO , IOBase , TextIOWrapper
11
11
from logging import getLogger
12
12
from pathlib import Path
13
13
from typing import (
18
18
Dict ,
19
19
Iterable ,
20
20
List ,
21
+ Optional ,
21
22
Tuple ,
22
23
)
23
24
28
29
from ...model import JsonLikeDocument
29
30
from ...pluggable import Pluggable
30
31
from ...subclass_registry import MissingFromRegistryError , SubclassRegistry
32
+ from .credential_utils import AwsClientFactory
31
33
from .extractor import Extractor
32
34
33
35
SUPPORTED_FILE_FORMAT_REGISTRY = SubclassRegistry ()
@@ -496,7 +498,123 @@ def describe(self) -> str:
496
498
return f"{ len (self .urls )} remote files"
497
499
498
500
499
- class UnifiedFileExtractor (Extractor ):
501
class S3File(ReadableFile):
    """A readable file backed by a single object in S3.

    Wraps one S3 object, identified by a bucket and key, and exposes it to
    the pipeline through ``as_reader``. When ``archive_dir`` is provided,
    the object is moved into that directory after it has been read.
    """

    def __init__(
        self,
        key: str,
        s3_client,
        bucket: str,
        archive_dir: str | None,
        object_format: str | None,
    ) -> None:
        self.logger = getLogger(__name__)
        self.key = key
        self.s3_client = s3_client
        self.bucket = bucket
        self.archive_dir = archive_dir
        self.object_format = object_format

    def archive_if_required(self, key: str):
        """Move the object under ``archive_dir``; no-op when archiving is off."""
        if not self.archive_dir:
            return

        self.logger.info("Archiving S3 Object", extra={"key": key})
        # S3 has no rename: copy to the archive location, then delete the original.
        destination = f"{self.archive_dir}/{Path(key).name}"
        self.s3_client.copy(
            Bucket=self.bucket,
            Key=destination,
            CopySource={"Bucket": self.bucket, "Key": key},
        )
        self.s3_client.delete_object(Bucket=self.bucket, Key=key)

    def path_like(self) -> Path:
        """Return a ``Path`` view of the key, with ``object_format`` overriding the suffix."""
        original = Path(self.key)
        suffix = self.object_format if self.object_format else original.suffix
        return original.with_suffix(suffix)

    @asynccontextmanager
    async def as_reader(self, reader: IOBase):
        """Yield ``reader`` wrapped around the object's bytes, then archive if configured."""
        # NOTE(review): the whole body is buffered in memory; presumably objects
        # are small enough for this — confirm for large-file use cases.
        response = self.s3_client.get_object(Bucket=self.bucket, Key=self.key)
        buffered = BytesIO(response["Body"].read())
        yield reader(buffered)
        self.archive_if_required(self.key)
551
+
552
+
553
class S3FileSource(FileSource, alias="s3"):
    """A source of pipeline files stored in an S3 bucket.

    Lists the objects under ``prefix`` in ``bucket`` (skipping anything that
    already sits under ``archive_dir``) and yields ``S3File`` instances the
    pipeline can read. Archiving of read files is delegated to ``S3File``.
    """

    @classmethod
    def from_file_data(
        cls,
        bucket: str,
        prefix: Optional[str] = None,
        archive_dir: Optional[str] = None,
        object_format: Optional[str] = None,
        **aws_client_args,
    ):
        """Build a source from declarative file data, creating the S3 client."""
        client = AwsClientFactory(**aws_client_args).make_client("s3")
        return cls(
            bucket=bucket,
            s3_client=client,
            archive_dir=archive_dir,
            object_format=object_format,
            prefix=prefix,
        )

    def __init__(
        self,
        bucket: str,
        s3_client,
        archive_dir: Optional[str] = None,
        object_format: Optional[str] = None,
        prefix: Optional[str] = None,
    ):
        self.bucket = bucket
        self.s3_client = s3_client
        self.archive_dir = archive_dir
        self.object_format = object_format
        # An absent prefix means "list the whole bucket".
        self.prefix = "" if prefix is None else prefix

    def object_is_in_archive(self, key: str) -> bool:
        """True when ``key`` lives under the archive directory."""
        if not self.archive_dir:
            return False
        return key.startswith(self.archive_dir)

    def find_keys_in_bucket(self) -> Iterable[str]:
        """Yield every key under the prefix that has not been archived."""
        paginator = self.s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket, Prefix=self.prefix):
            for entry in page.get("Contents", []):
                key = entry["Key"]
                if not self.object_is_in_archive(key):
                    yield key

    async def get_files(self):
        """Yield an ``S3File`` for each matching object in the bucket."""
        for key in self.find_keys_in_bucket():
            yield S3File(
                key=key,
                s3_client=self.s3_client,
                bucket=self.bucket,
                archive_dir=self.archive_dir,
                object_format=self.object_format,
            )
615
+
616
+
617
+ class FileExtractor (Extractor ):
500
618
"""A class that extracts records from files.
501
619
502
620
This class is used to extract records from files. The class takes a list
@@ -507,7 +625,31 @@ class UnifiedFileExtractor(Extractor):
507
625
"""
508
626
509
627
@classmethod
510
- def from_file_data (cls , sources : List [Dict [str , Any ]]) -> "UnifiedFileExtractor" :
628
+ def local (cls , globs : Iterable [str ]):
629
+ return FileExtractor .from_file_data ([{"type" : "local" , "globs" : globs }])
630
+
631
+ @classmethod
632
+ def s3 (cls , ** kwargs ):
633
+ return cls ([S3FileSource .from_file_data (** kwargs )])
634
+
635
+ @classmethod
636
+ def remote (
637
+ cls ,
638
+ urls : Iterable [str ],
639
+ memory_spooling_max_size_in_mb : int = 10 ,
640
+ ):
641
+ return FileExtractor .from_file_data (
642
+ [
643
+ {
644
+ "type" : "http" ,
645
+ "urls" : urls ,
646
+ "memory_spooling_max_size_in_mb" : memory_spooling_max_size_in_mb ,
647
+ }
648
+ ]
649
+ )
650
+
651
+ @classmethod
652
+ def from_file_data (cls , sources : List [Dict [str , Any ]]) -> "FileExtractor" :
511
653
return cls (
512
654
[FileSource .from_file_data_with_type_label (source ) for source in sources ]
513
655
)
@@ -570,51 +712,3 @@ async def extract_records(self) -> AsyncGenerator[Any, Any]:
570
712
self .logger .warning (
571
713
f"No files found for source: { file_source .describe ()} "
572
714
)
573
-
574
-
575
- # DEPRECATED CODE BELOW ##
576
- #
577
- # The classes below are slated to be removed in the future.
578
- # Additionally, there are aliases from the old class names to the new class
579
- # names to ensure backwards compatibility. These aliases will be removed in
580
- # the future.
581
-
582
-
583
- class FileExtractor (UnifiedFileExtractor ):
584
- """A class that extracts records from local files.
585
-
586
- This class is slated to be removed in the future. It is a subclass of
587
- UnifiedFileExtractor that is used to extract records from local files
588
- """
589
-
590
- @classmethod
591
- def from_file_data (cls , globs : Iterable [str ]):
592
- return UnifiedFileExtractor .from_file_data ([{"type" : "local" , "globs" : globs }])
593
-
594
-
595
- class RemoteFileExtractor (UnifiedFileExtractor ):
596
- """A class that extracts records from remote files.
597
-
598
- This class is slated to be removed in the future. It is a subclass of
599
- UnifiedFileExtractor that is used to extract records from remote files.
600
- """
601
-
602
- @classmethod
603
- def from_file_data (
604
- cls ,
605
- urls : Iterable [str ],
606
- memory_spooling_max_size_in_mb : int = 10 ,
607
- ):
608
- return UnifiedFileExtractor .from_file_data (
609
- [
610
- {
611
- "type" : "http" ,
612
- "urls" : urls ,
613
- "memory_spooling_max_size_in_mb" : memory_spooling_max_size_in_mb ,
614
- }
615
- ]
616
- )
617
-
618
-
619
- SupportedFileFormat = FileCodec
620
- SupportedCompressedFileFormat = CompressionCodec
0 commit comments