Skip to content

Commit 48d51d7

Browse files
committed
fix: implement 25 bug fixes across request flow, AWS provider, and config
P0 critical: - Enforce state machine in Request.update_status (add PENDING→FAILED transition) - Fix model_copy result discarded in machine_handlers (machines never reached TERMINATED) - Zero-instance provisioning: fail fast when no resources created, stay IN_PROGRESS when fleet exists P1 high: - Fix double-failure leaving return request stuck in IN_PROGRESS (force-write FAILED) - Poll all provider groups on multi-provider return (was only polling first) - Add PARTIAL and TIMEOUT to _TERMINAL_STATUSES in both orchestrators - determine_status_from_machines returns safe default instead of (None, None) - Machines skipped due to missing resource_id now mark request PARTIAL not COMPLETED - Fix SpotFleet retry: inject retry_fn as constructor arg, no silent fallback - Fix LT in deleting state: catch InvalidLaunchTemplateId errors, fall through to create - Zero-fulfillment retry loop: break after 3 consecutive zero-fulfillment attempts - Collapse 3-UoW race window in return request creation into single atomic transaction LT strategy (Scenarios B, D, E): - Support creating new LT version when launchTemplateId + override fields both specified - Add on_update_failure config flag (fail/warn) for LT version creation failures - Make resource tagging non-fatal: _tag_resource_safe and _tag_resources_safe helpers - Separate tag application from LT create call Medium bugs: - Fix direct dict mutation on Request bypassing Pydantic (use set_provider_data/update_metadata) - Replace ValueError for missing provider_api with RequestValidationError (maps to 400) - Deduplicate find_active_requests to prevent double-processing - Normalise tz before loop in find_by_date_range (prevent TypeError) - CancelRequestHandler now uses UoW factory for atomicity - completed_at now set for PARTIAL and TIMEOUT terminal states - Always re-hoist provider_defaults after config dict merge - Warn when env var overrides explicit SDK config_dict value - Event publish retried 3x with backoff before logging ERROR - _persist_acquiring returns (Request, bool) so caller can log without aborting - Re-describe ASG after detach to get live DesiredCapacity for MinSize update Log analysis bugs: - Created tickets 2210-2213 for UnauthorizedOperation retry, debug log at INFO, jinja2 warning per-handler, and missing IAM permissions docs
1 parent 93f1d33 commit 48d51d7

File tree

21 files changed

+2757
-2338
lines changed

21 files changed

+2757
-2338
lines changed

src/orb/application/commands/machine_handlers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ async def execute_command(self, command: CleanupMachineResourcesCommand):
260260
if self.logger:
261261
self.logger.warning("Machine not found for cleanup: %s", machine_id)
262262
continue
263-
machine.model_copy(update={"status": MachineStatus.TERMINATED}) # type: ignore[attr-defined]
263+
machine = machine.model_copy(update={"status": MachineStatus.TERMINATED}) # type: ignore[attr-defined]
264264
self._machine_repository.save(machine)
265265

266266

@@ -324,6 +324,6 @@ async def execute_command(self, command: DeregisterMachineCommand):
324324
if self.logger:
325325
self.logger.warning("Machine not found for deregistration: %s", command.machine_id)
326326
return None
327-
machine.model_copy(update={"status": MachineStatus.TERMINATED}) # type: ignore[attr-defined]
327+
machine = machine.model_copy(update={"status": MachineStatus.TERMINATED}) # type: ignore[attr-defined]
328328
self._machine_repository.save(machine)
329329
return None

src/orb/application/commands/request_creation_handlers.py

Lines changed: 165 additions & 86 deletions
Large diffs are not rendered by default.

src/orb/application/commands/request_lifecycle_handlers.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,13 @@ class CancelRequestHandler(BaseCommandHandler[CancelRequestCommand, None]): # t
9494

9595
def __init__(
9696
self,
97-
request_repository: RequestRepository,
97+
uow_factory: UnitOfWorkFactory,
9898
logger: LoggingPort,
9999
event_publisher: EventPublisherPort,
100100
error_handler: ErrorHandlingPort,
101101
) -> None:
102102
super().__init__(logger, event_publisher, error_handler)
103-
self._request_repository = request_repository
103+
self.uow_factory = uow_factory
104104

105105
async def validate_command(self, command: CancelRequestCommand) -> None:
106106
"""Validate cancel request command."""
@@ -113,15 +113,16 @@ async def execute_command(self, command: CancelRequestCommand) -> None:
113113
self.logger.info("Canceling request: %s", command.request_id)
114114

115115
try:
116-
request = self._request_repository.find_by_id(command.request_id)
117-
if not request:
118-
raise EntityNotFoundError("Request", command.request_id)
116+
with self.uow_factory.create_unit_of_work() as uow:
117+
request = uow.requests.find_by_id(command.request_id)
118+
if not request:
119+
raise EntityNotFoundError("Request", command.request_id)
119120

120-
cancelled_request = request.cancel(reason=command.reason)
121+
cancelled_request = request.cancel(reason=command.reason)
121122

122-
events = self._request_repository.save(cancelled_request)
123-
for event in events or []:
124-
self.event_publisher.publish(event) # type: ignore[union-attr]
123+
events = uow.requests.save(cancelled_request)
124+
for event in events or []:
125+
self.event_publisher.publish(event) # type: ignore[union-attr]
125126

126127
self.logger.info("Request canceled: %s", command.request_id)
127128
command.cancelled = True

src/orb/application/services/machine_grouping_service.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def group_by_provider(self, machine_ids: list[str]) -> dict[tuple[str, str], lis
5858

5959
return dict(provider_groups)
6060

61-
def group_by_resource(self, machine_ids: list[str]) -> dict[tuple[str, str, str], list[Any]]:
61+
def group_by_resource(
62+
self, machine_ids: list[str]
63+
) -> tuple[dict[tuple[str, str, str], list[Any]], list[str]]:
6264
"""Group machines by (provider_name, provider_api, resource_id).
6365
6466
This grouping is used for parallel deprovisioning operations where
@@ -68,12 +70,15 @@ def group_by_resource(self, machine_ids: list[str]) -> dict[tuple[str, str, str]
6870
machine_ids: List of machine IDs to group
6971
7072
Returns:
71-
Dictionary mapping (provider_name, provider_api, resource_id) to list of machine objects
73+
Tuple of:
74+
- Dictionary mapping (provider_name, provider_api, resource_id) to list of machine objects
75+
- List of machine IDs that were skipped (missing provider_api or resource_id)
7276
7377
Raises:
7478
ValueError: If machine context cannot be determined
7579
"""
7680
resource_groups: dict[tuple[str, str, str], list[Any]] = defaultdict(list)
81+
skipped_ids: list[str] = []
7782

7883
for machine_id in machine_ids:
7984
try:
@@ -88,12 +93,14 @@ def group_by_resource(self, machine_ids: list[str]) -> dict[tuple[str, str, str]
8893
"Machine %s has no provider_api — skipping",
8994
machine_id,
9095
)
96+
skipped_ids.append(machine_id)
9197
continue
9298
if not machine.resource_id:
9399
self.logger.warning(
94100
"Machine %s has no resource_id — skipping",
95101
machine_id,
96102
)
103+
skipped_ids.append(machine_id)
97104
continue
98105
group_key = (
99106
machine.provider_name,
@@ -116,4 +123,4 @@ def group_by_resource(self, machine_ids: list[str]) -> dict[tuple[str, str, str]
116123
},
117124
)
118125

119-
return dict(resource_groups)
126+
return dict(resource_groups), skipped_ids

src/orb/application/services/orchestration/acquire_machines.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from orb.domain.base.exceptions import ApplicationError
1414
from orb.domain.base.ports.logging_port import LoggingPort
1515

16-
_TERMINAL_STATUSES = {"completed", "complete", "failed", "error", "cancelled", "canceled"}
16+
_TERMINAL_STATUSES = {"completed", "complete", "failed", "error", "cancelled", "canceled", "partial", "timeout"}
1717
_MAX_CONSECUTIVE_POLL_ERRORS = 3
1818

1919

src/orb/application/services/orchestration/return_machines.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
from orb.domain.base.exceptions import ApplicationError
1414
from orb.domain.base.ports.logging_port import LoggingPort
1515

16-
_TERMINAL_STATUSES = {"completed", "complete", "failed", "error", "cancelled", "canceled"}
16+
_TERMINAL_STATUSES = {"completed", "complete", "failed", "error", "cancelled", "canceled", "partial", "timeout"}
1717
_MAX_CONSECUTIVE_POLL_ERRORS = 3
1818

19+
_STATUS_RANK = {"failed": 3, "error": 3, "partial": 2, "timeout": 2, "completed": 1, "complete": 1}
20+
1921

2022
class ReturnMachinesOrchestrator(OrchestratorBase[ReturnMachinesInput, ReturnMachinesOutput]):
2123
"""Orchestrator for returning machines to the provider."""
@@ -68,14 +70,24 @@ async def execute(self, input: ReturnMachinesInput) -> ReturnMachinesOutput: #
6870
status="no_op",
6971
skipped_machines=skipped,
7072
)
71-
request_id = command.created_request_ids[0]
73+
74+
# Use the first request ID as the primary for the output, but poll ALL
75+
# provider-group requests so a failure in any group is not silently ignored.
76+
primary_request_id = command.created_request_ids[0]
7277
status = "pending"
7378

7479
if input.wait:
75-
status = await self._poll_until_terminal(request_id, input.timeout_seconds)
80+
statuses = await asyncio.gather(
81+
*[
82+
self._poll_until_terminal(rid, input.timeout_seconds)
83+
for rid in command.created_request_ids
84+
]
85+
)
86+
# Return the worst status: failed > partial/timeout > completed
87+
status = max(statuses, key=lambda s: _STATUS_RANK.get(s.lower(), 0))
7688

7789
return ReturnMachinesOutput(
78-
request_id=request_id,
90+
request_id=primary_request_id,
7991
status=status,
8092
)
8193

src/orb/application/services/provisioning_orchestration_service.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ async def execute_provisioning(
6969
started_at = datetime.now(timezone.utc)
7070
remaining = request.requested_count
7171
attempt_number = 0
72+
consecutive_zero_fulfillments = 0
7273

7374
accumulated_resource_ids: list[str] = []
7475
accumulated_machine_ids: list[str] = []
@@ -127,6 +128,17 @@ async def execute_provisioning(
127128
fulfilled_this_attempt = last_result.fulfilled_count
128129
remaining -= fulfilled_this_attempt
129130

131+
if fulfilled_this_attempt == 0 and last_result.success:
132+
consecutive_zero_fulfillments += 1
133+
if consecutive_zero_fulfillments >= 3:
134+
self._logger.warning(
135+
"Breaking retry loop after %d consecutive zero-fulfillment attempts",
136+
consecutive_zero_fulfillments,
137+
)
138+
break
139+
else:
140+
consecutive_zero_fulfillments = 0
141+
130142
# Append to fulfillment_attempts audit trail
131143
attempt_record = {
132144
"attempt": attempt_number,
@@ -156,7 +168,14 @@ async def execute_provisioning(
156168
request.requested_count,
157169
remaining,
158170
)
159-
request = self._persist_acquiring(request)
171+
request, persist_ok = self._persist_acquiring(request)
172+
if not persist_ok:
173+
self._logger.warning(
174+
"ACQUIRING persist failed for request %s on attempt %d — "
175+
"continuing retry loop with in-memory state",
176+
request.request_id,
177+
attempt_number,
178+
)
160179
elif last_result.is_final:
161180
# No point retrying
162181
break
@@ -175,8 +194,14 @@ async def execute_provisioning(
175194
is_final=last_result.is_final if last_result else True,
176195
)
177196

178-
def _persist_acquiring(self, request: Request) -> Request:
179-
"""Persist request with ACQUIRING status between retry attempts."""
197+
def _persist_acquiring(self, request: Request) -> tuple[Request, bool]:
198+
"""Persist request with ACQUIRING status between retry attempts.
199+
200+
Returns:
201+
(updated_request, success) — success is False when the DB write
202+
failed. The caller should log a warning but continue the retry loop
203+
because the in-memory request is still valid.
204+
"""
180205
from orb.domain.base import UnitOfWorkFactory
181206

182207
try:
@@ -186,10 +211,10 @@ def _persist_acquiring(self, request: Request) -> Request:
186211
uow_factory = self._container.get(UnitOfWorkFactory)
187212
with uow_factory.create_unit_of_work() as uow:
188213
uow.requests.save(updated)
189-
return updated
214+
return updated, True
190215
except Exception as e:
191216
self._logger.warning("Failed to persist ACQUIRING status: %s", e)
192-
return request
217+
return request, False
193218

194219
def _record_provider_success(self, provider_name: str) -> None:
195220
"""Reset circuit breaker failure count after a successful dispatch."""

src/orb/application/services/request_creation_service.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from orb.domain.base.ports import LoggingPort
55
from orb.domain.base.results import ProviderSelectionResult
66
from orb.domain.request.aggregate import Request
7+
from orb.domain.request.exceptions import RequestValidationError
78
from orb.domain.request.value_objects import RequestType
89
from orb.domain.template.template_aggregate import Template
910

@@ -56,7 +57,9 @@ def create_machine_request(
5657

5758
# Store provider API in domain field
5859
if not template.provider_api:
59-
raise ValueError(f"Template {template.template_id} has no provider_api configured")
60+
raise RequestValidationError(
61+
f"Template {template.template_id} has no provider_api configured"
62+
)
6063
request.provider_api = template.provider_api
6164

6265
self._logger.info(

src/orb/application/services/request_status_management_service.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ async def update_request_from_provisioning(
5151

5252
# Store provider-specific data
5353
if provider_data:
54-
request.provider_data.update(provider_data)
54+
request = request.set_provider_data({**request.provider_data, **provider_data})
5555

5656
# Handle provider errors for partial success
5757
provider_errors = (
@@ -60,7 +60,7 @@ async def update_request_from_provisioning(
6060
has_api_errors = bool(provider_errors)
6161

6262
if has_api_errors and not request.metadata.get("fleet_errors"):
63-
request.metadata["fleet_errors"] = provider_errors
63+
request = request.update_metadata({"fleet_errors": provider_errors})
6464

6565
# Create and save machine aggregates
6666
if instances:
@@ -91,8 +91,9 @@ def _handle_provisioning_failure(self, request: Any, provisioning_result: Any) -
9191
RequestStatus.FAILED, f"Provisioning failed: {error_message}"
9292
)
9393

94-
request.metadata["error_message"] = error_message
95-
request.metadata["error_type"] = "ProvisioningFailure"
94+
request = request.update_metadata(
95+
{"error_message": error_message, "error_type": "ProvisioningFailure"}
96+
)
9697

9798
return request
9899

@@ -139,11 +140,16 @@ def _update_request_status(
139140
RequestStatus.PARTIAL,
140141
f"Partially fulfilled: {instance_count}/{requested_count} instances",
141142
)
142-
else:
143+
elif request.resource_ids:
143144
request = request.update_status(
144145
RequestStatus.IN_PROGRESS,
145146
"Resources created, instances pending",
146147
)
148+
else:
149+
request = request.update_status(
150+
RequestStatus.FAILED,
151+
"No instances provisioned and no cloud resources created",
152+
)
147153

148154
return request
149155

src/orb/application/services/request_status_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def determine_status_from_machines(
7979
machines_to_check = provider_machines if provider_machines else db_machines
8080

8181
if not machines_to_check:
82-
return None, None
82+
return RequestStatus.IN_PROGRESS.value, "Status determination failed — will retry"
8383

8484
running_count = sum(1 for m in machines_to_check if m.status.value == "running")
8585
pending_count = sum(
@@ -147,7 +147,7 @@ def determine_status_from_machines(
147147

148148
except Exception as e:
149149
self.logger.error(f"Failed to determine status from machines: {e}")
150-
return None, None
150+
return RequestStatus.IN_PROGRESS.value, "Status determination failed — will retry"
151151

152152
async def update_request_status(self, request: Request, status: str, message: str) -> Request:
153153
"""Update request status."""

0 commit comments

Comments (0)