|
17 | 17 | import pytest |
18 | 18 |
|
19 | 19 | from iris.cluster.providers.gcp.fake import InMemoryGcpService |
20 | | -from iris.cluster.providers.gcp.handles import _build_gce_resource_name |
21 | | -from iris.cluster.providers.gcp.workers import GcpWorkerProvider, _validate_slice_config |
| 20 | +from iris.cluster.providers.gcp.handles import GcpVmSliceHandle, _build_gce_resource_name |
| 21 | +from iris.cluster.providers.gcp.workers import ( |
| 22 | + GcpWorkerProvider, |
| 23 | + _run_vm_slice_bootstrap, |
| 24 | + _validate_slice_config, |
| 25 | +) |
22 | 26 | from iris.cluster.providers.manual.provider import ManualControllerProvider, ManualWorkerProvider |
23 | 27 | from iris.cluster.providers.types import ( |
24 | 28 | CloudSliceState, |
| 29 | + InfraError, |
25 | 30 | Labels, |
26 | 31 | QuotaExhaustedError, |
27 | 32 | ) |
28 | 33 | from iris.cluster.service_mode import ServiceMode |
29 | 34 | from iris.rpc import config_pb2 |
| 35 | +from iris.time_utils import Timestamp |
30 | 36 |
|
31 | 37 | # ============================================================================= |
32 | 38 | # Fixture infrastructure |
@@ -633,3 +639,183 @@ def test_gcp_tpu_slice_passes_startup_script_metadata(): |
633 | 639 | assert "startup-script" in metadata |
634 | 640 | assert "[iris-init]" in metadata["startup-script"] |
635 | 641 | assert "test-image:latest" in metadata["startup-script"] |
| 642 | + |
| 643 | + |
| 644 | +# ============================================================================= |
| 645 | +# Section 6: VM Slice Bootstrap Tests |
| 646 | +# |
| 647 | +# Tests for _run_vm_slice_bootstrap with split timeouts and health probing. |
| 648 | +# ============================================================================= |
| 649 | + |
| 650 | + |
def _make_vm_slice_for_bootstrap(
    gcp_service: InMemoryGcpService,
    zone: str = "us-central2-b",
) -> tuple[GcpVmSliceHandle, str]:
    """Register a VM with the in-memory GCP service and wrap it in a slice handle.

    The VM always gets the fixed name ``test-bootstrap-vm``; that same string is
    used as the slice id and as the value of the iris slice-id label.

    Args:
        gcp_service: In-memory GCP service in which the VM is created.
        zone: Zone used for both the VM and the handle.

    Returns:
        A ``(handle, vm_name)`` pair; the handle is built with ``_bootstrapping=True``.
    """
    from iris.cluster.providers.gcp.service import VmCreateRequest

    vm_name = "test-bootstrap-vm"

    def _slice_labels() -> dict:
        # A fresh dict per call, so the create request and the handle never share one.
        return {Labels("iris").iris_slice_id: vm_name}

    create_request = VmCreateRequest(
        name=vm_name,
        zone=zone,
        machine_type="n2-standard-4",
        labels=_slice_labels(),
    )
    gcp_service.vm_create(create_request)

    slice_handle = GcpVmSliceHandle(
        _slice_id=vm_name,
        _vm_name=vm_name,
        _zone=zone,
        _project_id="test-project",
        _gcp_service=gcp_service,
        _labels=_slice_labels(),
        _created_at=Timestamp.now(),
        _label_prefix="iris",
        _bootstrapping=True,
    )
    return slice_handle, vm_name
| 679 | + |
| 680 | + |
def test_vm_bootstrap_health_probe_succeeds_without_serial_port():
    """A passing health probe alone is enough to finish bootstrap.

    No serial-port output is registered for the VM, so 'Bootstrap complete'
    is never seen; the handle must still end up READY.
    """
    service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
    slice_handle, _ = _make_vm_slice_for_bootstrap(service)
    cfg = config_pb2.WorkerConfig(port=10001)

    # Force the worker health probe to report healthy on every call.
    probe_target = "iris.cluster.providers.gcp.workers._probe_worker_health"
    healthy_probe = unittest.mock.patch(probe_target, return_value=True)

    with healthy_probe:
        _run_vm_slice_bootstrap(
            service,
            slice_handle,
            cfg,
            poll_interval=0.01,
            cloud_ready_timeout=5.0,
            bootstrap_timeout=5.0,
        )

    assert slice_handle._bootstrap_state == CloudSliceState.READY
| 701 | + |
| 702 | + |
def test_vm_bootstrap_serial_port_succeeds_without_health_probe():
    """'Bootstrap complete' on the serial port finishes bootstrap even if the probe keeps failing."""
    zone = "us-central2-b"
    service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
    slice_handle, name = _make_vm_slice_for_bootstrap(service, zone)
    cfg = config_pb2.WorkerConfig(port=10001)

    # Serial log already contains the completion marker before bootstrap is polled.
    serial_log = "[iris-init] Starting bootstrap\n[iris-init] Bootstrap complete\n"
    service.set_serial_port_output(name, zone, serial_log)

    # The health probe is forced to fail, leaving the serial port as the only signal.
    failing_probe = unittest.mock.patch(
        "iris.cluster.providers.gcp.workers._probe_worker_health",
        return_value=False,
    )
    with failing_probe:
        _run_vm_slice_bootstrap(
            service,
            slice_handle,
            cfg,
            poll_interval=0.01,
            cloud_ready_timeout=5.0,
            bootstrap_timeout=5.0,
        )

    assert slice_handle._bootstrap_state == CloudSliceState.READY
| 729 | + |
| 730 | + |
def test_vm_bootstrap_serial_port_error_raises():
    """An '[iris-init] ERROR' line on the serial port makes bootstrap raise InfraError."""
    zone = "us-central2-b"
    service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
    slice_handle, name = _make_vm_slice_for_bootstrap(service, zone)
    cfg = config_pb2.WorkerConfig(port=10001)

    service.set_serial_port_output(name, zone, "[iris-init] ERROR: Docker pull failed\n")

    # Probe always fails, so the serial-port error is the only outcome available.
    failing_probe = unittest.mock.patch(
        "iris.cluster.providers.gcp.workers._probe_worker_health",
        return_value=False,
    )
    with failing_probe, pytest.raises(InfraError, match="bootstrap failed"):
        _run_vm_slice_bootstrap(
            service,
            slice_handle,
            cfg,
            poll_interval=0.01,
            cloud_ready_timeout=5.0,
            bootstrap_timeout=5.0,
        )
| 756 | + |
| 757 | + |
def test_vm_bootstrap_phase2_has_independent_timeout():
    """Phase 2 is bounded by ``bootstrap_timeout``, not by whatever phase 1 left over.

    The cloud-ready budget is large (600s) while the bootstrap budget is tiny
    (0.05s); the error must quote the bootstrap budget.
    """
    service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
    slice_handle, _ = _make_vm_slice_for_bootstrap(service)
    cfg = config_pb2.WorkerConfig(port=10001)

    # Neither completion signal ever fires: the probe is forced to fail and no
    # serial output is registered, so only the phase 2 timeout can end the call.
    failing_probe = unittest.mock.patch(
        "iris.cluster.providers.gcp.workers._probe_worker_health",
        return_value=False,
    )
    phase2_timeout = pytest.raises(InfraError, match=r"bootstrap did not complete within 0\.05s")

    with failing_probe, phase2_timeout:
        _run_vm_slice_bootstrap(
            service,
            slice_handle,
            cfg,
            poll_interval=0.01,
            cloud_ready_timeout=600.0,
            bootstrap_timeout=0.05,
        )
| 779 | + |
| 780 | + |
def test_vm_bootstrap_cloud_not_ready_raises_phase1_timeout():
    """Phase 1 timeout triggers when the VM never reaches READY.

    The VM and handle come from ``_make_vm_slice_for_bootstrap`` rather than a
    hand-rolled copy of the same setup, so this test cannot drift from the
    other bootstrap tests. The cloud-ready budget is tiny (0.05s) and the
    bootstrap budget is large (300s); the error must quote the cloud-ready budget.
    """
    zone = "us-central2-b"
    gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")
    handle, vm_name = _make_vm_slice_for_bootstrap(gcp_service, zone)

    # Set VM to STAGING so it never reaches READY. The fake exposes no public
    # setter for VM status in the code visible here, hence the private access.
    gcp_service._vms[(vm_name, zone)].status = "STAGING"

    worker_config = config_pb2.WorkerConfig(port=10001)

    with pytest.raises(InfraError, match=r"did not reach cloud READY within 0\.05s"):
        _run_vm_slice_bootstrap(
            gcp_service,
            handle,
            worker_config,
            poll_interval=0.01,
            cloud_ready_timeout=0.05,
            bootstrap_timeout=300.0,
        )
0 commit comments