Commit 2b91890

feat/support iteratively deleting cache (#123)
* Support iteratively deleting cache
* Update download step to return download dir for cache dir
* Have each step define a method to delete its cache
* Conform all logs to use lowercase
1 parent 5879abf commit 2b91890

Large commits have some content hidden by default; only part of this diff is shown below.
64 files changed: +202 -133 lines

CHANGELOG.md

Lines changed: 2 additions & 1 deletion

@@ -1,4 +1,4 @@
-## 0.0.15-dev2
+## 0.0.15-dev3

 ### Fixes

@@ -8,6 +8,7 @@
 ### Enhancements

 * **Migrate airtable connector to v2**
+* **Support iteratively deleting cached content** Add a flag to delete cached content once it's no longer needed for systems that are limited in memory.

 ## 0.0.14
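
The changelog entry above is the heart of this commit: each pipeline step gains a way to drop its cached artifacts once downstream steps are done with them. A minimal sketch of that pattern, assuming hypothetical names (PipelineStep, the delete_cache flag, delete_cache_dir); the library's actual classes and method names may differ:

    # Sketch only: names below are illustrative, not this library's actual API.
    import shutil
    from pathlib import Path

    class PipelineStep:
        def __init__(self, cache_dir: Path, delete_cache: bool = False):
            self.cache_dir = cache_dir
            self.delete_cache = delete_cache

        def run(self) -> Path:
            # Do the step's work, writing intermediate results under cache_dir,
            # and return the directory so the next step knows where to read from
            # (mirroring "update download step to return download dir").
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            return self.cache_dir

        def delete_cache_dir(self) -> None:
            # Each step defines how to delete its own cache; a runner calls
            # this as soon as the step's output is no longer needed.
            if self.delete_cache and self.cache_dir.is_dir():
                shutil.rmtree(self.cache_dir)

Deleting eagerly like this bounds peak disk usage by the steps still in flight rather than by the whole run, which is the constraint the changelog entry calls out.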

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.0.15-dev2"  # pragma: no cover
+__version__ = "0.0.15-dev3"  # pragma: no cover

unstructured_ingest/connector/astradb.py

Lines changed: 1 addition & 1 deletion

@@ -222,7 +222,7 @@ def check_connection(self):
             raise DestinationConnectionError(f"failed to validate connection: {e}")

     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-        logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")
+        logger.info(f"inserting / updating {len(elements_dict)} documents to Astra DB.")

         astra_db_batch_size = self.write_config.batch_size

unstructured_ingest/connector/biomed.py

Lines changed: 4 additions & 4 deletions

@@ -123,7 +123,7 @@ def cleanup_file(self):
             and self.filename.is_file()
             and not self.read_config.download_only
         ):
-            logger.debug(f"Cleaning up {self}")
+            logger.debug(f"cleaning up {self}")
             Path.unlink(self.filename)

     @SourceConnectionError.wrap
@@ -132,12 +132,12 @@ def get_file(self):
         download_path = self.file_meta.download_filepath  # type: ignore
         dir_ = Path(os.path.dirname(download_path))  # type: ignore
         if not dir_.is_dir():
-            logger.debug(f"Creating directory: {dir_}")
+            logger.debug(f"creating directory: {dir_}")

             if dir_:
                 dir_.mkdir(parents=True, exist_ok=True)
         self._retrieve()
-        logger.debug(f"File downloaded: {self.file_meta.download_filepath}")
+        logger.debug(f"file downloaded: {self.file_meta.download_filepath}")

     @SourceConnectionNetworkError.wrap
     def _retrieve(self):
@@ -229,7 +229,7 @@ def _list_objects(self) -> t.List[BiomedFileMeta]:

         def traverse(path, download_dir, output_dir):
             full_path = Path(PMC_DIR) / path
-            logger.debug(f"Traversing directory: {full_path}")
+            logger.debug(f"traversing directory: {full_path}")

             ftp = FTP(DOMAIN)
             ftp.login()

unstructured_ingest/connector/chroma.py

Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ def prepare_chroma_list(chunk: t.Tuple[t.Dict[str, t.Any]]) -> t.Dict[str, t.List[t.Any]]:
         return chroma_dict

     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-        logger.info(f"Inserting / updating {len(elements_dict)} documents to destination ")
+        logger.info(f"inserting / updating {len(elements_dict)} documents to destination ")

         chroma_batch_size = self.write_config.batch_size
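
astradb, chroma, and kafka all write in slices of write_config.batch_size; kafka's write_dict (below) does it through a batch_generator helper. The helper's implementation is not part of this diff, but a generator along these lines would behave the way the call sites assume:

    from typing import Any, Generator, List

    def batch_generator(items: List[Any], batch_size: int) -> Generator[List[Any], None, None]:
        # Yield successive batch_size-sized slices; the final slice may be shorter.
        for i in range(0, len(items), batch_size):
            yield items[i : i + batch_size]

    # e.g. list(batch_generator([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]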

unstructured_ingest/connector/fsspec/fsspec.py

Lines changed: 2 additions & 2 deletions

@@ -221,7 +221,7 @@ def does_path_match_glob(self, path: str) -> bool:
         for pattern in patterns:
             if fnmatch.filter([path], pattern):
                 return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
         return False

     def get_ingest_docs(self):
@@ -328,7 +328,7 @@ def write_dict(
             **self.connector_config.get_access_config(),
         )

-        logger.info(f"Writing content using filesystem: {type(fs).__name__}")
+        logger.info(f"writing content using filesystem: {type(fs).__name__}")

         output_folder = self.connector_config.path_without_protocol
         output_folder = os.path.join(output_folder)  # Make sure folder ends with file separator
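
The glob check touched here (and its twin in git.py below) leans on fnmatch.filter, which returns a non-empty list when the path matches a shell-style pattern. A self-contained version of the same helper, for experimentation outside the connector:

    import fnmatch
    from typing import List

    def does_path_match_glob(path: str, patterns: List[str]) -> bool:
        # Keep the path if it matches at least one glob; note that fnmatch
        # does not treat "/" specially, so "*.md" also matches nested paths.
        for pattern in patterns:
            if fnmatch.filter([path], pattern):
                return True
        return False

    print(does_path_match_glob("docs/readme.md", ["*.md"]))   # True
    print(does_path_match_glob("docs/readme.md", ["*.txt"]))  # False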

unstructured_ingest/connector/git.py

Lines changed: 1 addition & 1 deletion

@@ -120,5 +120,5 @@ def does_path_match_glob(self, path: str) -> bool:
         for pattern in patterns:
             if fnmatch.filter([path], pattern):
                 return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
         return False

unstructured_ingest/connector/google_drive.py

Lines changed: 3 additions & 3 deletions

@@ -222,15 +222,15 @@ def get_file(self):
         dir_ = Path(self.meta["download_dir"])
         if dir_:
             if not dir_.is_dir():
-                logger.debug(f"Creating directory: {self.meta.get('download_dir')}")
+                logger.debug(f"creating directory: {self.meta.get('download_dir')}")

                 if dir_:
                     dir_.mkdir(parents=True, exist_ok=True)

         with open(self.filename, "wb") as handler:
             handler.write(file.getbuffer())
             saved = True
-            logger.debug(f"File downloaded: {self.filename}.")
+            logger.debug(f"file downloaded: {self.filename}.")
         if not saved:
             logger.error(f"Error while downloading and saving file: {self.filename}.")

@@ -241,7 +241,7 @@ def write_result(self):
         self._output_filename.parent.mkdir(parents=True, exist_ok=True)
         with open(self._output_filename, "w") as output_f:
             output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
-        logger.info(f"Wrote {self._output_filename}")
+        logger.info(f"wrote {self._output_filename}")


 @dataclass

unstructured_ingest/connector/hubspot.py

Lines changed: 1 addition & 1 deletion

@@ -271,7 +271,7 @@ def get_ingest_docs(self):

         ingest_docs: t.List[HubSpotIngestDoc] = []
         for obj_name, obj_method in obj_method_resolver.items():
-            logger.info(f"Retrieving - {obj_name}")
+            logger.info(f"retrieving - {obj_name}")
             results: t.List[HubSpotIngestDoc] = obj_method()  # type: ignore
             ingest_docs += results  # type: ignore
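
get_ingest_docs drives everything from obj_method_resolver, a plain mapping of object names to fetch callables. A sketch of that dispatch shape with stand-in fetchers (the real connector calls the HubSpot API, not these stubs):

    from typing import Callable, Dict, List

    def fetch_contacts() -> List[str]:
        return ["contact-1", "contact-2"]  # stand-in for a HubSpot API call

    def fetch_tickets() -> List[str]:
        return ["ticket-1"]  # stand-in for a HubSpot API call

    obj_method_resolver: Dict[str, Callable[[], List[str]]] = {
        "contacts": fetch_contacts,
        "tickets": fetch_tickets,
    }

    ingest_docs: List[str] = []
    for obj_name, obj_method in obj_method_resolver.items():
        print(f"retrieving - {obj_name}")  # lowercase, per this commit's convention
        ingest_docs += obj_method()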

unstructured_ingest/connector/kafka.py

Lines changed: 8 additions & 8 deletions

@@ -114,7 +114,7 @@ def check_connection(self):

     def initialize(self):
         topic = self.connector_config.topic
-        logger.info(f"Subscribing to topic: {topic}")
+        logger.info(f"subscribing to topic: {topic}")
         self.kafka_consumer.subscribe([topic])

     @property
@@ -149,7 +149,7 @@ def create_consumer(self) -> "Consumer":
             conf["sasl.password"] = secret

         consumer = Consumer(conf)
-        logger.debug(f"Kafka Consumer connected to bootstrap: {bootstrap}")
+        logger.debug(f"kafka consumer connected to bootstrap: {bootstrap}")
         return consumer

     @SourceConnectionError.wrap
@@ -161,7 +161,7 @@ def get_ingest_docs(self):

         collected = []
         num_messages_to_consume = self.connector_config.num_messages_to_consume
-        logger.info(f"Config set for blocking on {num_messages_to_consume} messages")
+        logger.info(f"config set for blocking on {num_messages_to_consume} messages")
         # Consume specified number of messages
         while running:
             msg = consumer.poll(timeout=self.connector_config.timeout)
@@ -178,7 +178,7 @@ def get_ingest_docs(self):
             else:
                 collected.append(json.loads(msg.value().decode("utf8")))
                 if len(collected) >= num_messages_to_consume:
-                    logger.debug(f"Found {len(collected)} messages, stopping")
+                    logger.debug(f"found {len(collected)} messages, stopping")
                     consumer.commit(asynchronous=False)
                     break

@@ -243,7 +243,7 @@ def create_producer(self) -> "Producer":
             conf["sasl.password"] = secret

         producer = Producer(conf)
-        logger.debug(f"Connected to bootstrap: {bootstrap}")
+        logger.debug(f"connected to bootstrap: {bootstrap}")
         return producer

     def check_connection(self):
@@ -255,7 +255,7 @@ def check_connection(self):

     @DestinationConnectionError.wrap
     def upload_msg(self, batch) -> int:
-        logger.debug(f"Uploading batch: {batch}")
+        logger.debug(f"uploading batch: {batch}")
         topic = self.connector_config.topic
         producer = self.kafka_producer
         uploaded = 0
@@ -267,15 +267,15 @@

     @DestinationConnectionError.wrap
     def write_dict(self, *args, dict_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
-        logger.info(f"Writing {len(dict_list)} documents to Kafka")
+        logger.info(f"writing {len(dict_list)} documents to Kafka")
         num_uploaded = 0

         for chunk in batch_generator(dict_list, self.write_config.batch_size):
             num_uploaded += self.upload_msg(chunk)  # noqa: E203

         producer = self.kafka_producer
         producer.flush()
-        logger.info(f"Uploaded {num_uploaded} documents to Kafka")
+        logger.info(f"uploaded {num_uploaded} documents to Kafka")

     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         content_list: t.List[t.Dict[str, t.Any]] = []
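
The consumer-side hunks above implement bounded consumption: poll until num_messages_to_consume messages are collected, commit synchronously, and stop. A runnable approximation using confluent-kafka; the broker address, topic, group id, and message count below are placeholders, not this connector's configuration:

    from confluent_kafka import Consumer

    conf = {
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "ingest-example",           # placeholder group
        "auto.offset.reset": "earliest",
        "enable.auto.commit": False,            # commit manually once done
    }
    consumer = Consumer(conf)
    consumer.subscribe(["ingest-topic"])  # placeholder topic

    collected = []
    num_messages_to_consume = 100
    try:
        while True:
            msg = consumer.poll(timeout=3.0)
            if msg is None:
                continue  # no message within the timeout; keep polling
            if msg.error():
                print(f"consumer error: {msg.error()}")
                continue
            collected.append(msg.value().decode("utf8"))
            if len(collected) >= num_messages_to_consume:
                print(f"found {len(collected)} messages, stopping")
                consumer.commit(asynchronous=False)  # synchronous commit, as in the diff
                break
    finally:
        consumer.close()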
