diff --git a/output.txt b/output.txt
new file mode 100644
index 0000000..cff5199
--- /dev/null
+++ b/output.txt
@@ -0,0 +1,616 @@
+============================= test session starts ==============================
+platform linux -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /home/howardhuang/.conda/envs/torchft/bin/python
+cachedir: .pytest_cache
+rootdir: /home/howardhuang/local/torchft
+configfile: pytest.ini
+plugins: typeguard-2.13.3
+collecting ... collected 8 items / 7 deselected / 1 selected
+
+torchft/manager_integ_test.py::ManagerIntegTest::test_diloco_healthy torchft::lighthouse: 2025-01-28T07:50:47.294-08:00 - INFO Lighthouse listening on: http://devvm2170.rva0.facebook.com:43041
+torchft::lighthouse: 2025-01-28T07:50:47.294-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.396-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.496-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.597-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.699-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.800-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:47.902-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.003-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.105-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.206-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.308-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.408-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.509-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:48.515-08:00 - INFO LighthouseClient: establishing connection to http://devvm2170.rva0.facebook.com:43041
+torchft::manager: 2025-01-28T07:50:48.516-08:00 - INFO Manager 1e9ec0b67-12c9-42d2-846a-77880287183a listening on http://devvm2170.rva0.facebook.com:19531
+torchft::manager: 2025-01-28T07:50:48.518-08:00 - INFO LighthouseClient: establishing connection to http://devvm2170.rva0.facebook.com:43041
+torchft::manager: 2025-01-28T07:50:48.519-08:00 - INFO Manager 0ca125a5f-91b5-4a5f-9d60-47491b73043b listening on http://devvm2170.rva0.facebook.com:19530
+torchft::manager: 2025-01-28T07:50:48.549-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19531
+torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530
+torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:48.552-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a
+torchft::lighthouse: 2025-01-28T07:50:48.552-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b
+torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Next quorum status: Valid quorum found [2/2 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Detected quorum change, bumping quorum_id to 1
+torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) }
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) }) }
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) }) }
+torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO healing is required step=0, max_step=0
+torchft::manager: 2025-01-28T07:50:48.556-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:48.582-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530
+torchft::lighthouse: 2025-01-28T07:50:48.610-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.712-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.814-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:48.914-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.015-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.117-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.219-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+2401:db00:eef0:1120:3520:0:740e:f755 - - [28/Jan/2025 07:50:49] "GET /checkpoint/0 HTTP/1.1" 200 -
+torchft::lighthouse: 2025-01-28T07:50:49.320-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.422-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.524-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.559-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.559-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.560-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b
+torchft::lighthouse: 2025-01-28T07:50:49.560-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.560-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.560-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a
+torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) }
+torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) }) }
+torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO healing is required step=0, max_step=0
+torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) }) }
+torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.582-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530
+2401:db00:eef0:1120:3520:0:740e:f755 - - [28/Jan/2025 07:50:49] "GET /checkpoint/0 HTTP/1.1" 200 -
+torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit completed should_commit=true
+torchft::lighthouse: 2025-01-28T07:50:49.625-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a
+torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.626-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b
+torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) }
+torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) }) }
+torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) }) }
+torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.656-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.656-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.660-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.660-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.661-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a
+torchft::lighthouse: 2025-01-28T07:50:49.661-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.661-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.661-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b
+torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) }
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) }) }
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) }) }
+torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO returning quorum for rank 0
+starting replica group self.replica_id=0 self.world_size=1 attempt 0
+starting replica group self.replica_id=1 self.world_size=1 attempt 0
+worker runner.replica_id=1 rank=0 runner.world_size=1 starting
+worker runner.replica_id=0 rank=0 runner.world_size=1 starting
+param=Parameter containing:
+tensor([[ 0.4414,  0.4792, -0.1353],
+        [ 0.5304, -0.1265,  0.1165],
+        [-0.2811,  0.3391,  0.5090],
+        [-0.4236,  0.5018,  0.1081]], requires_grad=True) vs. tensor([[ 0.4414,  0.4792, -0.1353],
+        [ 0.5304, -0.1265,  0.1165],
+        [-0.2811,  0.3391,  0.5090],
+        [-0.4236,  0.5018,  0.1081]])
+param=Parameter containing:
+tensor([[ 0.4266,  0.0782,  0.2784],
+        [-0.0815,  0.4451,  0.0853],
+        [-0.2695,  0.1472, -0.2660],
+        [-0.0677, -0.2345,  0.3830]], requires_grad=True) vs. tensor([[ 0.4266,  0.0782,  0.2784],
+        [-0.0815,  0.4451,  0.0853],
+        [-0.2695,  0.1472, -0.2660],
+        [-0.0677, -0.2345,  0.3830]])
+param=Parameter containing:
+tensor([-0.4557, -0.2662, -0.1630, -0.3471], requires_grad=True) vs. tensor([-0.4557, -0.2662, -0.1630, -0.3471])
+param=Parameter containing:
+tensor([ 0.0545, -0.5702,  0.5214, -0.4904], requires_grad=True) vs. tensor([ 0.0545, -0.5702,  0.5214, -0.4904])
+True list(self._model.parameters())=[Parameter containing:
+tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]], requires_grad=True), Parameter containing:
+tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)]
+True list(self._model.parameters())=[Parameter containing:
+tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]], requires_grad=True), Parameter containing:
+tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]])
+name='model.0.weight' p.data=tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]])
+name='model.0.bias' p.data=tensor([-0.2279, -0.1331, -0.0815, -0.1736])
+in diloco CM manager.current_step()=0
+name='model.0.bias' p.data=tensor([-0.2279, -0.1331, -0.0815, -0.1736])
+in diloco CM manager.current_step()=0
+inner optimizer step
+inner optimizer step
+in diloco CM manager.current_step()=0
+in diloco CM manager.current_step()=0
+inner optimizer step
+inner optimizer step
+name='model.0.weight', p.grad=tensor([[ 2.3217e-04,  2.8212e-04,  7.1485e-05],
+        [-4.0449e-04, -3.6760e-04, -3.9917e-04],
+        [ 4.0477e-04,  3.7535e-04,  2.5970e-04],
+        [-3.0579e-04, -3.2146e-04, -1.1927e-04]])
+name='model.0.weight', p.grad=tensor([[ 2.3217e-04,  2.8212e-04,  7.1485e-05],
+        [-4.0449e-04, -3.6760e-04, -3.9917e-04],
+        [ 4.0477e-04,  3.7535e-04,  2.5970e-04],
+        [-3.0579e-04, -3.2146e-04, -1.1927e-04]])
+name='model.0.bias', p.grad=tensor([ 0.0002, -0.0004,  0.0004, -0.0002])
+name='model.0.bias', p.grad=tensor([ 0.0002, -0.0004,  0.0004, -0.0002])
+list(self._model.parameters())=[Parameter containing:
+tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]], requires_grad=True), Parameter containing:
+tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[ 0.2207,  0.2396, -0.0676],
+        [ 0.2652, -0.0632,  0.0583],
+        [-0.1405,  0.1695,  0.2545],
+        [-0.2118,  0.2509,  0.0540]], requires_grad=True), Parameter containing:
+tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-0.4793, -0.4604, -0.7675],
+        [ 0.9652,  0.6367,  0.7582],
+        [-0.8405, -0.5304, -0.4455],
+        [ 0.4882,  0.9509,  0.7540]], requires_grad=True), Parameter containing:
+tensor([-0.9278,  0.5669, -0.7815,  0.5264], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-0.4793, -0.4604, -0.7675],
+        [ 0.9652,  0.6367,  0.7582],
+        [-0.8405, -0.5304, -0.4455],
+        [ 0.4882,  0.9509,  0.7540]], requires_grad=True), Parameter containing:
+tensor([-0.9278,  0.5669, -0.7815,  0.5264], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[ 0.2211,  0.2400, -0.0672],
+        [ 0.2648, -0.0636,  0.0578],
+        [-0.1401,  0.1699,  0.2549],
+        [-0.2122,  0.2505,  0.0536]])
+name='model.0.weight' p.data=tensor([[-0.4793, -0.4604, -0.7675],
+        [ 0.9652,  0.6367,  0.7582],
+        [-0.8405, -0.5304, -0.4455],
+        [ 0.4882,  0.9509,  0.7540]])
+name='model.0.bias' p.data=tensor([-0.2275, -0.1335, -0.0811, -0.1740])
+in diloco CM manager.current_step()=1
+name='model.0.bias' p.data=tensor([-0.9278,  0.5669, -0.7815,  0.5264])
+in diloco CM manager.current_step()=1
+inner optimizer step
+in diloco CM manager.current_step()=1
+inner optimizer step
+in diloco CM manager.current_step()=1
+inner optimizer step
+inner optimizer step
+name='model.0.weight', p.grad=tensor([[ 1.4991e-04,  2.5948e-04, -4.4014e-04],
+        [-2.4478e-04, -8.2199e-05, -3.2015e-05],
+        [ 4.2802e-04,  2.3373e-04,  9.2387e-05],
+        [-6.4364e-04, -5.9976e-04, -2.6279e-04]])
+name='model.0.weight', p.grad=tensor([[ 1.4991e-04,  2.5948e-04, -4.4014e-04],
+        [-2.4478e-04, -8.2199e-05, -3.2015e-05],
+        [ 4.2802e-04,  2.3373e-04,  9.2387e-05],
+        [-6.4364e-04, -5.9976e-04, -2.6279e-04]])
+name='model.0.bias', p.grad=tensor([ 6.3330e-07, -6.8381e-05,  2.6295e-04, -3.6267e-04])
+name='model.0.bias', p.grad=tensor([ 6.3330e-07, -6.8381e-05,  2.6295e-04, -3.6267e-04])
+list(self._model.parameters())=[Parameter containing:
+tensor([[-0.4793, -0.4604, -0.7675],
+        [ 0.9652,  0.6367,  0.7582],
+        [-0.8405, -0.5304, -0.4455],
+        [ 0.4882,  0.9509,  0.7540]], requires_grad=True), Parameter containing:
+tensor([-0.9278,  0.5669, -0.7815,  0.5264], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[ 0.2211,  0.2400, -0.0672],
+        [ 0.2648, -0.0636,  0.0578],
+        [-0.1401,  0.1699,  0.2549],
+        [-0.2122,  0.2505,  0.0536]], requires_grad=True), Parameter containing:
+tensor([-0.2275, -0.1335, -0.0811, -0.1740], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-0.4789, -0.4600,  0.6328],
+        [ 0.9647,  0.6363,  0.7576],
+        [-0.8401, -0.5300, -0.4451],
+        [ 0.4878,  0.9505,  0.7536]], requires_grad=True), Parameter containing:
+tensor([-0.9166,  0.5664, -0.7811,  0.5260], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[-0.4789, -0.4600,  0.6328],
+        [ 0.9647,  0.6363,  0.7576],
+        [-0.8401, -0.5300, -0.4451],
+        [ 0.4878,  0.9505,  0.7536]])
+name='model.0.bias' p.data=tensor([-0.9166,  0.5664, -0.7811,  0.5260])
+in diloco CM manager.current_step()=2
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.1559, -1.1582, -0.3285],
+        [ 1.6362,  1.2082,  1.2674],
+        [-1.5413, -1.2040, -1.0620],
+        [ 1.1599,  1.6315,  1.4221]], requires_grad=True), Parameter containing:
+tensor([-1.3985,  1.1180, -1.4589,  1.1988], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[-1.1559, -1.1582, -0.3285],
+        [ 1.6362,  1.2082,  1.2674],
+        [-1.5413, -1.2040, -1.0620],
+        [ 1.1599,  1.6315,  1.4221]])
+name='model.0.bias' p.data=tensor([-1.3985,  1.1180, -1.4589,  1.1988])
+in diloco CM manager.current_step()=2
+inner optimizer step
+in diloco CM manager.current_step()=2
+inner optimizer step
+in diloco CM manager.current_step()=2
+inner optimizer step
+inner optimizer step
+name='model.0.weight', p.grad=tensor([[ 1.4319e-04,  2.9729e-04, -4.0483e-04],
+        [-1.8367e-04, -2.0355e-05,  4.6968e-05],
+        [ 2.8500e-04,  6.5982e-05,  2.9102e-05],
+        [-4.9986e-04, -5.7098e-04, -1.6165e-04]])
+torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.699-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.702-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.702-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.702-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b
+torchft::lighthouse: 2025-01-28T07:50:49.702-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got quorum request for rank 0
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO all workers joined -- starting quorum
+torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a
+torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) }
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) }) }
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO returning quorum for rank 0
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) }) }
+torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO returning quorum for rank 0
+torchft::lighthouse: 2025-01-28T07:50:49.726-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit completed should_commit=true
+torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit request from 0 should_commit=true
+torchft::manager: 2025-01-28T07:50:49.738-08:00 - INFO should_commit completed should_commit=true
+torchft::lighthouse: 2025-01-28T07:50:49.828-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:49.929-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:50.031-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+torchft::lighthouse: 2025-01-28T07:50:50.133-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false]
+name='model.0.weight', p.grad=tensor([[ 1.4319e-04,  2.9729e-04, -4.0483e-04],
+        [-1.8367e-04, -2.0355e-05,  4.6968e-05],
+        [ 2.8500e-04,  6.5982e-05,  2.9102e-05],
+        [-4.9986e-04, -5.7098e-04, -1.6165e-04]])
+name='model.0.bias', p.grad=tensor([ 8.6367e-05, -6.9797e-05,  1.4833e-04, -2.5073e-04])
+name='model.0.bias', p.grad=tensor([ 8.6367e-05, -6.9797e-05,  1.4833e-04, -2.5073e-04])
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.1559, -1.1582, -0.3285],
+        [ 1.6362,  1.2082,  1.2674],
+        [-1.5413, -1.2040, -1.0620],
+        [ 1.1599,  1.6315,  1.4221]], requires_grad=True), Parameter containing:
+tensor([-1.3985,  1.1180, -1.4589,  1.1988], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-0.4789, -0.4600,  0.6328],
+        [ 0.9647,  0.6363,  0.7576],
+        [-0.8401, -0.5300, -0.4451],
+        [ 0.4878,  0.9505,  0.7536]], requires_grad=True), Parameter containing:
+tensor([-0.9166,  0.5664, -0.7811,  0.5260], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.8260, -1.8585,  0.2233],
+        [ 2.2816,  1.6734,  1.6062],
+        [-2.2236, -1.7849, -1.5829],
+        [ 1.8418,  2.3220,  2.0912]], requires_grad=True), Parameter containing:
+tensor([-1.9107,  1.6145, -2.0911,  1.8764], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.1778, -1.1608,  1.3306],
+        [ 1.6525,  1.2167,  0.5913],
+        [-1.5193, -1.1230, -1.0489],
+        [ 1.1778,  1.6494,  1.4260]], requires_grad=True), Parameter containing:
+tensor([-1.4408,  1.2667, -1.4455,  1.2081], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[-1.8260, -1.8585,  0.2233],
+        [ 2.2816,  1.6734,  1.6062],
+        [-2.2236, -1.7849, -1.5829],
+        [ 1.8418,  2.3220,  2.0912]])
+name='model.0.weight' p.data=tensor([[-1.1778, -1.1608,  1.3306],
+        [ 1.6525,  1.2167,  0.5913],
+        [-1.5193, -1.1230, -1.0489],
+        [ 1.1778,  1.6494,  1.4260]])
+name='model.0.bias' p.data=tensor([-1.9107,  1.6145, -2.0911,  1.8764])
+in diloco CM manager.current_step()=3
+name='model.0.bias' p.data=tensor([-1.4408,  1.2667, -1.4455,  1.2081])
+in diloco CM manager.current_step()=3
+inner optimizer step
+in diloco CM manager.current_step()=3
+inner optimizer step
+in diloco CM manager.current_step()=3
+inner optimizer step
+inner optimizer step
+name='model.0.weight', p.grad=tensor([[ 2.6709e-04,  3.4326e-04, -1.7107e-04],
+        [-2.6494e-04, -1.3125e-04, -3.5435e-05],
+        [ 2.6774e-04,  8.8334e-05,  6.3241e-05],
+        [-3.7980e-04, -4.9281e-04, -1.2958e-04]])
+name='model.0.weight', p.grad=tensor([[ 2.6709e-04,  3.4326e-04, -1.7107e-04],
+        [-2.6494e-04, -1.3125e-04, -3.5435e-05],
+        [ 2.6774e-04,  8.8334e-05,  6.3241e-05],
+        [-3.7980e-04, -4.9281e-04, -1.2958e-04]])
+name='model.0.bias', p.grad=tensor([ 0.0002, -0.0002,  0.0001, -0.0002])
+name='model.0.bias', p.grad=tensor([ 0.0002, -0.0002,  0.0001, -0.0002])
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.1778, -1.1608,  1.3306],
+        [ 1.6525,  1.2167,  0.5913],
+        [-1.5193, -1.1230, -1.0489],
+        [ 1.1778,  1.6494,  1.4260]], requires_grad=True), Parameter containing:
+tensor([-1.4408,  1.2667, -1.4455,  1.2081], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.8260, -1.8585,  0.2233],
+        [ 2.2816,  1.6734,  1.6062],
+        [-2.2236, -1.7849, -1.5829],
+        [ 1.8418,  2.3220,  2.0912]], requires_grad=True), Parameter containing:
+tensor([-1.9107,  1.6145, -2.0911,  1.8764], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-2.5075, -2.5618,  0.7693],
+        [ 2.9387,  2.1671,  1.9183],
+        [-2.8951, -2.3307, -2.0895],
+        [ 2.5165,  3.0140,  2.7500]], requires_grad=True), Parameter containing:
+tensor([-2.4930,  2.1568, -2.7003,  2.5372], requires_grad=True)]
+list(self._model.parameters())=[Parameter containing:
+tensor([[-1.8622, -1.8630,  1.9723],
+        [ 2.3472,  1.8356,  0.7188],
+        [-2.1914, -1.7066, -1.6842],
+        [ 1.8502,  2.3423,  2.0785]], requires_grad=True), Parameter containing:
+tensor([-1.9961,  1.9095, -2.1010,  1.8620], requires_grad=True)]
+name='model.0.weight' p.data=tensor([[-2.5075, -2.5618,  0.7693],
+        [ 2.9387,  2.1671,  1.9183],
+        [-2.8951, -2.3307, -2.0895],
+        [ 2.5165,  3.0140,  2.7500]])
+name='model.0.weight' p.data=tensor([[-1.8622, -1.8630,  1.9723],
+        [ 2.3472,  1.8356,  0.7188],
+        [-2.1914, -1.7066, -1.6842],
+        [ 1.8502,  2.3423,  2.0785]])
+name='model.0.bias' p.data=tensor([-2.4930,  2.1568, -2.7003,  2.5372])
+name='model.0.bias' p.data=tensor([-1.9961,  1.9095, -2.1010,  1.8620])
+[{'model': OrderedDict([('model.0.weight', tensor([[-1.8622, -1.8630,  1.9723],
+        [ 2.3472,  1.8356,  0.7188],
+        [-2.1914, -1.7066, -1.6842],
+        [ 1.8502,  2.3423,  2.0785]])), ('model.0.bias', tensor([-1.9961,  1.9095, -2.1010,  1.8620]))]), 'inner_optim': {'state': {0: {'step': tensor(7.), 'exp_avg': tensor([[-0.0061, -0.0078, -0.0039],
+        [-0.0020,  0.0016, -0.0043],
+        [ 0.0023,  0.0029,  0.0027],
+        [ 0.0034,  0.0040,  0.0014]]), 'exp_avg_sq': tensor([[0.0002, 0.0003, 0.0004],
+        [0.0004, 0.0002, 0.0006],
+        [0.0001, 0.0001, 0.0003],
+        [0.0002, 0.0001, 0.0002]])}, 1: {'step': tensor(7.), 'exp_avg': tensor([-0.0104, -0.0035,  0.0103,  0.0004]), 'exp_avg_sq': tensor([0.0009, 0.0018, 0.0005, 0.0007])}}, 'param_groups': [{'lr': 0.0004, 'betas': (0.9, 0.95), 'eps': 1e-08, 'weight_decay': 0.1, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'outer_optim': {'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 5.1738e-05,  8.2101e-05, -8.9194e-05],
+        [-6.2852e-05, -2.1615e-05, -1.9096e-06],
+        [ 8.7094e-05,  3.3704e-05,  1.6427e-05],
+        [-1.3510e-04, -1.4925e-04, -4.8792e-05]]), 'exp_avg_sq': tensor([[1.1424e-10, 2.7332e-10, 3.8633e-10],
+        [1.6369e-10, 2.4383e-11, 4.4824e-12],
+        [3.3567e-10, 6.6674e-11, 1.3364e-11],
+        [8.0730e-10, 9.2755e-10, 1.1181e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 3.2709e-05, -3.1210e-05,  4.8912e-05, -6.9191e-05]), 'exp_avg_sq': tensor([6.9378e-11, 4.7128e-11, 1.1133e-10, 2.2382e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'backup_parameters': {'model.0.weight': tensor([[-1.8622, -1.8630,  1.9723],
+        [ 2.3472,  1.8356,  0.7188],
+        [-2.1914, -1.7066, -1.6842],
+        [ 1.8502,  2.3423,  2.0785]]), 'model.0.bias': tensor([-1.9961,  1.9095, -2.1010,  1.8620])}}]
+[{'model': OrderedDict([('model.0.weight', tensor([[-2.5075, -2.5618,  0.7693],
+        [ 2.9387,  2.1671,  1.9183],
+        [-2.8951, -2.3307, -2.0895],
+        [ 2.5165,  3.0140,  2.7500]])), ('model.0.bias', tensor([-2.4930,  2.1568, -2.7003,  2.5372]))]), 'inner_optim': {'state': {0: {'step': tensor(8.), 'exp_avg': tensor([[-0.0006, -0.0005,  0.0010],
+        [ 0.0052, -0.0006,  0.0014],
+        [-0.0087, -0.0018, -0.0008],
+        [ 0.0041,  0.0030, -0.0025]]), 'exp_avg_sq': tensor([[1.7226e-04, 5.7628e-05, 5.0178e-05],
+        [2.0129e-04, 1.7710e-04, 2.2114e-04],
+        [3.2548e-04, 6.1146e-05, 2.3409e-04],
+        [1.2215e-04, 7.6481e-05, 4.1236e-04]])}, 1: {'step': tensor(8.), 'exp_avg': tensor([-0.0007,  0.0078, -0.0093,  0.0019]), 'exp_avg_sq': tensor([0.0004, 0.0007, 0.0004, 0.0004])}}, 'param_groups': [{'lr': 0.0004, 'betas': (0.9, 0.95), 'eps': 1e-08, 'weight_decay': 0.1, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'outer_optim': {'state': {0: {'step': tensor(4.), 'exp_avg': tensor([[ 6.8663e-05,  1.0267e-04, -8.3983e-05],
+        [-9.2339e-05, -4.8413e-05, -3.1009e-05],
+        [ 1.1660e-04,  6.1067e-05,  3.5359e-05],
+        [-1.5739e-04, -1.7268e-04, -5.7487e-05]]), 'exp_avg_sq': tensor([[1.6798e-10, 3.5267e-10, 3.9143e-10],
+        [3.2682e-10, 1.5911e-10, 1.6334e-10],
+        [4.9901e-10, 2.0714e-10, 8.0604e-11],
+        [9.0053e-10, 1.0306e-09, 1.2600e-10]])}, 1: {'step': tensor(4.), 'exp_avg': tensor([ 4.7159e-05, -5.9983e-05,  7.8308e-05, -8.1859e-05]), 'exp_avg_sq': tensor([1.0855e-10, 2.0245e-10, 2.7344e-10, 2.5393e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'backup_parameters': {'model.0.weight': tensor([[-2.5075, -2.5618,  0.7693],
+        [ 2.9387,  2.1671,  1.9183],
+        [-2.8951, -2.3307, -2.0895],
+        [ 2.5165,  3.0140,  2.7500]]), 'model.0.bias': tensor([-2.4930,  2.1568, -2.7003,  2.5372])}}]
+FAILED
+
+=================================== FAILURES ===================================
+_____________________ ManagerIntegTest.test_diloco_healthy _____________________
+
+self = <torchft.manager_integ_test.ManagerIntegTest testMethod=test_diloco_healthy>
+
+    def test_diloco_healthy(self) -> None:
+        lighthouse = Lighthouse(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 2
+        futures = []
+    
+        with ThreadPoolExecutor(max_workers=num_replicas) as executor:
+            for replica_id in range(num_replicas):
+                failure_injector = FailureInjector()
+                runner = Runner(
+                    replica_id=replica_id,
+                    lighthouse_address=lighthouse.address(),
+                    failure_injector=failure_injector,
+                    train_loop=diloco_train_loop,
+                )
+                futures.append(executor.submit(runner.run_replica))
+    
+        state_dicts = []
+    
+        for fut in as_completed(futures):
+            state_dicts.append(fut.result())
+    
+        lighthouse.shutdown()
+    
+        for state_dict in state_dicts:
+            print(state_dict)
+>           torch.testing.assert_close(state_dict, state_dicts[0])
+E           AssertionError: Tensor-likes are not close!
+E           
+E           Mismatched elements: 4 / 4 (100.0%)
+E           Greatest absolute difference: 0.6751507520675659 at index (3,) (up to 1e-05 allowed)
+E           Greatest relative difference: 0.36258822679519653 at index (3,) (up to 1.3e-06 allowed)
+E           
+E           The failure occurred for item [0]['backup_parameters']['model.0.bias']
+
+torchft/manager_integ_test.py:556: AssertionError
+------------------------------ Captured log call -------------------------------
+2025-01-28 07:50:48 INFO Started CheckpointServer on http://devvm2170.rva0.facebook.com:43127/checkpoint/-1...
+2025-01-28 07:50:48 INFO Started CheckpointServer on http://devvm2170.rva0.facebook.com:34997/checkpoint/-1...
+2025-01-28 07:50:48 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 0] reconfiguring for quorum_id=1 store_prefixed_addr='localhost:47765/torchft/1/0'
+2025-01-28 07:50:48 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] reconfiguring for quorum_id=1 store_prefixed_addr='localhost:47765/torchft/1/0'
+2025-01-28 07:50:48 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] healing required, fetching checkpoint server address from address='http://devvm2170.rva0.facebook.com:19530' max_step=0
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] fetching checkpoint from checkpoint_server_address='http://devvm2170.rva0.facebook.com:34997/checkpoint/0'
+2025-01-28 07:50:49 INFO fetching checkpoint from http://devvm2170.rva0.facebook.com:34997/checkpoint/0
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] healing required, fetching checkpoint server address from address='http://devvm2170.rva0.facebook.com:19530' max_step=0
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] fetching checkpoint from checkpoint_server_address='http://devvm2170.rva0.facebook.com:34997/checkpoint/0'
+2025-01-28 07:50:49 INFO fetching checkpoint from http://devvm2170.rva0.facebook.com:34997/checkpoint/0
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05,  2.8212e-05,  7.1485e-06],
+        [-4.0449e-05, -3.6760e-05, -3.9917e-05],
+        [ 4.0477e-05,  3.7535e-05,  2.5970e-05],
+        [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12],
+        [1.6361e-10, 1.3513e-10, 1.5934e-10],
+        [1.6384e-10, 1.4089e-10, 6.7443e-11],
+        [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05,  4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05,  2.8212e-05,  7.1485e-06],
+        [-4.0449e-05, -3.6760e-05, -3.9917e-05],
+        [ 4.0477e-05,  3.7535e-05,  2.5970e-05],
+        [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12],
+        [1.6361e-10, 1.3513e-10, 1.5934e-10],
+        [1.6384e-10, 1.4089e-10, 6.7443e-11],
+        [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05,  4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] applying pending state dict
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 0] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05,  2.8212e-05,  7.1485e-06],
+        [-4.0449e-05, -3.6760e-05, -3.9917e-05],
+        [ 4.0477e-05,  3.7535e-05,  2.5970e-05],
+        [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12],
+        [1.6361e-10, 1.3513e-10, 1.5934e-10],
+        [1.6384e-10, 1.4089e-10, 6.7443e-11],
+        [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05,  4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 1.4991e-05,  2.5948e-05, -4.4014e-05],
+        [-2.4478e-05, -8.2199e-06, -3.2015e-06],
+        [ 4.2802e-05,  2.3373e-05,  9.2387e-06],
+        [-6.4364e-05, -5.9976e-05, -2.6279e-05]]), 'exp_avg_sq': tensor([[2.2472e-11, 6.7331e-11, 1.9373e-10],
+        [5.9918e-11, 6.7566e-12, 1.0250e-12],
+        [1.8320e-10, 5.4631e-11, 8.5354e-12],
+        [4.1427e-10, 3.5971e-10, 6.9057e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 6.3330e-08, -6.8381e-06,  2.6295e-05, -3.6267e-05]), 'exp_avg_sq': tensor([4.0107e-16, 4.6760e-12, 6.9143e-11, 1.3153e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 1] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 3.5886e-05,  5.1339e-05, -3.7581e-05],
+        [-6.0882e-05, -4.1304e-05, -3.9127e-05],
+        [ 7.9231e-05,  5.7155e-05,  3.2611e-05],
+        [-9.1885e-05, -8.8907e-05, -3.7013e-05]]), 'exp_avg_sq': tensor([[7.6320e-11, 1.4684e-10, 1.9883e-10],
+        [2.2337e-10, 1.4175e-10, 1.6020e-10],
+        [3.4688e-10, 1.9538e-10, 7.5911e-11],
+        [5.0769e-10, 4.6294e-10, 8.3269e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 1.7903e-05, -4.2361e-05,  6.2586e-05, -5.1906e-05]), 'exp_avg_sq': tensor([3.9250e-11, 1.6030e-10, 2.3158e-10, 1.6169e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 1] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 3.5886e-05,  5.1339e-05, -3.7581e-05],
+        [-6.0882e-05, -4.1304e-05, -3.9127e-05],
+        [ 7.9231e-05,  5.7155e-05,  3.2611e-05],
+        [-9.1885e-05, -8.8907e-05, -3.7013e-05]]), 'exp_avg_sq': tensor([[7.6320e-11, 1.4684e-10, 1.9883e-10],
+        [2.2337e-10, 1.4175e-10, 1.6020e-10],
+        [3.4688e-10, 1.9538e-10, 7.5911e-11],
+        [5.0769e-10, 4.6294e-10, 8.3269e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 1.7903e-05, -4.2361e-05,  6.2586e-05, -5.1906e-05]), 'exp_avg_sq': tensor([3.9250e-11, 1.6030e-10, 2.3158e-10, 1.6169e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 1.4991e-05,  2.5948e-05, -4.4014e-05],
+        [-2.4478e-05, -8.2199e-06, -3.2015e-06],
+        [ 4.2802e-05,  2.3373e-05,  9.2387e-06],
+        [-6.4364e-05, -5.9976e-05, -2.6279e-05]]), 'exp_avg_sq': tensor([[2.2472e-11, 6.7331e-11, 1.9373e-10],
+        [5.9918e-11, 6.7566e-12, 1.0250e-12],
+        [1.8320e-10, 5.4631e-11, 8.5354e-12],
+        [4.1427e-10, 3.5971e-10, 6.9057e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 6.3330e-08, -6.8381e-06,  2.6295e-05, -3.6267e-05]), 'exp_avg_sq': tensor([4.0107e-16, 4.6760e-12, 6.9143e-11, 1.3153e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 4.6616e-05,  7.5934e-05, -7.4306e-05],
+        [-7.3161e-05, -3.9209e-05, -3.0517e-05],
+        [ 9.9808e-05,  5.8038e-05,  3.2261e-05],
+        [-1.3268e-04, -1.3711e-04, -4.9477e-05]]), 'exp_avg_sq': tensor([[9.6745e-11, 2.3508e-10, 3.6252e-10],
+        [2.5688e-10, 1.4203e-10, 1.6225e-10],
+        [4.2775e-10, 1.9954e-10, 7.6682e-11],
+        [7.5704e-10, 7.8850e-10, 1.0932e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 2.4749e-05, -4.5104e-05,  7.1160e-05, -7.1788e-05]), 'exp_avg_sq': tensor([4.6670e-11, 1.6502e-10, 2.5335e-10, 2.2440e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 2.7810e-05,  5.3083e-05, -8.0096e-05],
+        [-4.0397e-05, -9.4334e-06,  1.8155e-06],
+        [ 6.7022e-05,  2.7634e-05,  1.1225e-05],
+        [-1.0791e-04, -1.1108e-04, -3.9816e-05]]), 'exp_avg_sq': tensor([[4.2951e-11, 1.5565e-10, 3.5742e-10],
+        [9.3593e-11, 7.1642e-12, 3.2300e-12],
+        [2.6424e-10, 5.8930e-11, 9.3738e-12],
+        [6.6372e-10, 6.8537e-10, 9.5118e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 8.6937e-06, -1.3134e-05,  3.8498e-05, -5.7713e-05]), 'exp_avg_sq': tensor([7.4597e-12, 9.5430e-12, 9.1074e-11, 1.9426e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 2] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 2] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO START
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 4.6616e-05,  7.5934e-05, -7.4306e-05],
+        [-7.3161e-05, -3.9209e-05, -3.0517e-05],
+        [ 9.9808e-05,  5.8038e-05,  3.2261e-05],
+        [-1.3268e-04, -1.3711e-04, -4.9477e-05]]), 'exp_avg_sq': tensor([[9.6745e-11, 2.3508e-10, 3.6252e-10],
+        [2.5688e-10, 1.4203e-10, 1.6225e-10],
+        [4.2775e-10, 1.9954e-10, 7.6682e-11],
+        [7.5704e-10, 7.8850e-10, 1.0932e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 2.4749e-05, -4.5104e-05,  7.1160e-05, -7.1788e-05]), 'exp_avg_sq': tensor([4.6670e-11, 1.6502e-10, 2.5335e-10, 2.2440e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 2.7810e-05,  5.3083e-05, -8.0096e-05],
+        [-4.0397e-05, -9.4334e-06,  1.8155e-06],
+        [ 6.7022e-05,  2.7634e-05,  1.1225e-05],
+        [-1.0791e-04, -1.1108e-04, -3.9816e-05]]), 'exp_avg_sq': tensor([[4.2951e-11, 1.5565e-10, 3.5742e-10],
+        [9.3593e-11, 7.1642e-12, 3.2300e-12],
+        [2.6424e-10, 5.8930e-11, 9.3738e-12],
+        [6.6372e-10, 6.8537e-10, 9.5118e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 8.6937e-06, -1.3134e-05,  3.8498e-05, -5.7713e-05]), 'exp_avg_sq': tensor([7.4597e-12, 9.5430e-12, 9.1074e-11, 1.9426e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!!
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(4.), 'exp_avg': tensor([[ 6.8663e-05,  1.0267e-04, -8.3983e-05],
+        [-9.2339e-05, -4.8413e-05, -3.1009e-05],
+        [ 1.1660e-04,  6.1067e-05,  3.5359e-05],
+        [-1.5739e-04, -1.7268e-04, -5.7487e-05]]), 'exp_avg_sq': tensor([[1.6798e-10, 3.5267e-10, 3.9143e-10],
+        [3.2682e-10, 1.5911e-10, 1.6334e-10],
+        [4.9901e-10, 2.0714e-10, 8.0604e-11],
+        [9.0053e-10, 1.0306e-09, 1.2600e-10]])}, 1: {'step': tensor(4.), 'exp_avg': tensor([ 4.7159e-05, -5.9983e-05,  7.8308e-05, -8.1859e-05]), 'exp_avg_sq': tensor([1.0855e-10, 2.0245e-10, 2.7344e-10, 2.5393e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 5.1738e-05,  8.2101e-05, -8.9194e-05],
+        [-6.2852e-05, -2.1615e-05, -1.9096e-06],
+        [ 8.7094e-05,  3.3704e-05,  1.6427e-05],
+        [-1.3510e-04, -1.4925e-04, -4.8792e-05]]), 'exp_avg_sq': tensor([[1.1424e-10, 2.7332e-10, 3.8633e-10],
+        [1.6369e-10, 2.4383e-11, 4.4824e-12],
+        [3.3567e-10, 6.6674e-11, 1.3364e-11],
+        [8.0730e-10, 9.2755e-10, 1.1181e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 3.2709e-05, -3.1210e-05,  4.8912e-05, -6.9191e-05]), 'exp_avg_sq': tensor([6.9378e-11, 4.7128e-11, 1.1133e-10, 2.2382e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}
+2025-01-28 07:50:49 INFO END
+2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 3] should_commit=True enough_replicas=True, errored=None
+2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 3] should_commit=True enough_replicas=True, errored=None
+=============================== warnings summary ===============================
+../../.conda/envs/torchft/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py:295
+  /home/howardhuang/.conda/envs/torchft/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py:295: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:84.)
+    cpu = _conversion_method_template(device=torch.device("cpu"))
+
+-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
+=========================== short test summary info ============================
+FAILED torchft/manager_integ_test.py::ManagerIntegTest::test_diloco_healthy
+================== 1 failed, 7 deselected, 1 warning in 4.77s ==================
diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py
index 1458f07..a486e1a 100644
--- a/torchft/local_sgd.py
+++ b/torchft/local_sgd.py
@@ -3,25 +3,29 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 """
 LocalSGD
 =========
-
 This module implements a fault tolerant version of LocalSGD and related methods.
 """
-
-from typing import Any, Dict, List, Mapping, Optional
+import logging
+from types import TracebackType
+from typing import Any, Callable, Dict, Iterator, List, Mapping, Optional, Type
 
 import torch
 from torch import nn, optim
+from torch.nn.parameter import Parameter
+from torch.optim.optimizer import Optimizer
+from torch.utils.hooks import RemovableHandle
 
 from torchft.manager import Manager
 
+logger: logging.Logger = logging.getLogger(__name__)
 
-class LocalSGD(nn.Module):
+
+class LocalSGD:
     """
-    LocalSGD is a model wrapper similar to DistributedDataParallel that
+    LocalSGD is a context manager that
     implements the algorithm described in https://arxiv.org/pdf/1805.09767
 
     This will synchronize the model parameters periodically in a fault tolerant
@@ -60,26 +64,22 @@ def __init__(
     ) -> None:
         """
         Args:
-            manager: The manager to use.
-            model: The model to wrap.
-            optimizer: The optimizer used by the model.
-            sync_every: How often to sync the model weights.
-            backup_device: The device to store the backup of the model parameters on. (default cpu)
-            pin_memory: Whether to pin the memory used for the backup of the model parameters.
+            manager (Manager): The manager to use.
+            model (nn.Module): The model to wrap.
+            optimizer (optim.Optimizer): The optimizer used by the model.
+            sync_every (int): How often to sync the model weights.
+            backup_device (Optional[torch.device]): The device to store the backup of the model parameters on. (default cpu)
+            pin_memory (bool): Whether to pin the memory used for the backup of the model parameters.
         """
         super().__init__()
-
         self._manager = manager
         self._model = model
+        self._local_optimizer = optimizer
         self._local_step = 0
-        self._started_step = False
         self._sync_every = sync_every
         assert sync_every >= 1, "sync_every must be greater than or equal to 1"
-
         device = backup_device or torch.device("cpu")
-
         self._backup_parameters: Dict[str, torch.Tensor] = {}
-
         for name, p in self._model.named_parameters():
             t = torch.empty(*tuple(p.shape), dtype=p.dtype, device=device)
             if (
@@ -89,87 +89,101 @@ def __init__(
             ):
                 t = t.pin_memory()
             self._backup_parameters[name] = t
+        print(f"{self._manager._use_async_quorum} {list(self._model.parameters())=}")
 
+        self._hooks: List[RemovableHandle] = []
         # Need to copy the parameters to the host to be safe if we are on the first step.
         self._save_parameters()
 
-        optimizer.register_step_post_hook(self._step_post_hook)
+    def __enter__(self) -> "LocalSGD":
+        # Add optimizer hook which increments the local step counter and syncs if necessary
+        self._hooks.append(
+            self._local_optimizer.register_step_post_hook(self._step_post_hook)
+        )
+        # Register a forward prehook to check for quorum
+        self._hooks.append(
+            self._model.register_forward_pre_hook(self._forward_step_pre_hook)
+        )
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool:
+        # Handle any cleanup or error handling here
+        if exc_type is not None:
+            # If an exception occurred, restore parameters
+            self._restore_parameters()
+        # Clean up hooks
+        for hook in self._hooks:
+            hook.remove()
+        self._hooks.clear()
+
+        return False  # Propagate exceptions
 
     def _save_parameters(self) -> None:
-        # TODO: consider running copy on a separate stream
-        for name, p in self._model.named_parameters():
-            self._backup_parameters[name].copy_(p.data, non_blocking=True)
+        with torch.no_grad():
+            # TODO: consider running copy on a separate stream
+            for name, p in self._model.named_parameters():
+                print(f"{name=} {p.data=}")
+                self._backup_parameters[name] = p.detach().clone()
 
     def _restore_parameters(self) -> None:
-        # TODO: consider running copy on a separate stream
-        for name, p in self._model.named_parameters():
-            p.data.copy_(self._backup_parameters[name], non_blocking=True)
+        with torch.no_grad():
+            # TODO: consider running copy on a separate stream
+            for name, p in self._model.named_parameters():
+                p.copy_(self._backup_parameters[name], non_blocking=False)
 
-    # pyre-fixme[14]: support state_dict args
-    def state_dict(self) -> Dict[str, object]:
-        """
-        state_dict returns the state_dict from the last time LocalSGD
-        synchronized and not the current weights.
-        """
-        state_dict = self._model.state_dict()
-        for name, p in self._backup_parameters.items():
-            assert name in state_dict
-            state_dict[name] = p
-        return state_dict
-
-    def load_state_dict(
-        self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+    def _step_post_hook(
+        self, _optim: optim.Optimizer, _args: List[object], _kwargs: Dict[str, object]
     ) -> None:
         """
-        Loads the state dict to the model and the backup parameters.
-
-        This must be called while the model weights aren't being modified to
-        avoid corrupting the backup weights.
+        This hook is registered on the optimizer and is called after the optimizer step.
         """
-        self._model.load_state_dict(state_dict, strict=strict, assign=assign)
-        self._save_parameters()
+        self._local_step += 1
+        if self._local_step >= self._sync_every:
+            self.sync()
 
-    def forward(self, *args: object, **kwargs: object) -> object:
+    def _forward_step_pre_hook(self, _module: nn.Module, _args: List[object]) -> None:
         """
-        Run the model parameters.
-
-        This should be called before the optimizer step.
-
-        This will start the quorum and save the parameters if this is the first step.
+        Start the quorum before the module forward when the local step counter is 0.
         """
         if self._local_step == 0:
             self._manager.start_quorum()
 
-        self._started_step = True
-
-        return self._model.forward(*args, **kwargs)
-
-    def _step_post_hook(
-        self, _optim: optim.Optimizer, _args: List[object], _kwargs: Dict[str, object]
-    ) -> None:
+    def sync(self) -> None:
         """
-        This hook is registered on the optimizer and is called after the optimizer step.
-
-        This will call the allreduce on the model weights every sync_every steps.
-        If any errors occur it will restore to the weights from the previous sync.
-
-        ``forward`` must be called before this function.
+        Synchronizes and averages the model weights across the manager.
         """
-        assert self._started_step, "forward must be called before step"
-        self._started_step = False
-
-        self._local_step += 1
+        self._perform_sync()
+
+        if self._manager.should_commit():
+            # print(
+            #     f"saving the parameters at {self._local_step=} on manager step {self._manager.current_step()=}"
+            # )
+            # # save the parameters so we can restore from them later if necessary.
+            # print(
+            #     f"{self._manager._rank=} {self._backup_parameters=}, {list(self._model.parameters())=}"
+            # )
+            self._save_parameters()
+            # print(
+            #     f"AFTER SAVE PARAMS: {self._manager._rank=} {self._backup_parameters=}"
+            # )
+        else:
+            # commit failed, restore from the backup parameters
+            self._restore_parameters()
 
-        if self._local_step >= self._sync_every:
-            self._local_step = 0
-            self._average()
+        self._local_step = 0
 
-            if self._manager.should_commit():
-                # save the parameters so we can restore from them later if necessary.
-                self._save_parameters()
-            else:
-                # commit failed, restore from the backup parameters
-                self._restore_parameters()
+    def _perform_sync(self) -> None:
+        """
+        Performs the synchronization of the model weights across the manager.
+        This method is intended to be overridden by subclasses to implement custom
+        synchronization logic.
+        """
+        self._average()
 
     def _average(self) -> None:
         # TODO: do we need to broadcast buffers like DDP does?
@@ -182,3 +196,71 @@ def _average(self) -> None:
 
         for work in works:
             work.wait()
+
+
+class DiLoCo(LocalSGD):
+    """
+    DiLoCo is a subclass of LocalSGD that overrides the synchronization
+    mechanism to average and synchronize the pseudogradients (the delta between the previous global weights and the current local weights).
+
+    diloco: https://arxiv.org/pdf/2311.08105
+    """
+
+    def __init__(
+        self,
+        manager: Manager,
+        model: nn.Module,
+        inner_optimizer: optim.Optimizer,
+        outer_optimizer: optim.Optimizer,
+        sync_every: int,
+        backup_device: Optional[torch.device] = None,
+        pin_memory: bool = True,
+    ) -> None:
+        super().__init__(
+            manager, model, inner_optimizer, sync_every, backup_device, pin_memory
+        )
+        self._outer_optimizer = outer_optimizer
+
+    def _perform_sync(self) -> None:
+        """
+        Overrides the synchronization step to calculate the pseudogradients, average them across the manager group, and
+        step using the outer optimizer.
+        """
+
+        # Set the .grad field of each parameter to its pseudogradient
+        for name, p in self._model.named_parameters():
+            assert name in self._backup_parameters
+            pseudogradient = p.data - self._backup_parameters[name]
+            p.grad = pseudogradient
+
+        self._average_grads()
+
+        # Restore the parameters back to the previous state
+        self._restore_parameters()
+
+        # Use the outer optimizer to update the model parameters
+        for name, p in self._model.named_parameters():
+            print(f"{name=}, {p.grad=}")
+        print(f"{list(self._model.parameters())=}")
+        logger.info("START")
+        logger.info(f"{self._outer_optimizer.state_dict()=}")
+        logger.info("PERFORMING OPTIMIZER STEP!!")
+        self._outer_optimizer.step()
+        logger.info(f"{self._outer_optimizer.state_dict()=}")
+        logger.info("END")
+        print(f"{list(self._model.parameters())=}")
+        self._outer_optimizer.zero_grad()
+
+    def _average_grads(self) -> None:
+        """
+        Average the gradients across the diloco group.
+        """
+        works = []
+        for p in self._model.parameters():
+            # Perform allreduce on the pseudogradients
+            assert p.grad is not None
+            work = self._manager.allreduce(p.grad)
+            works.append(work)
+        # Wait for all allreduce operations to complete
+        for work in works:
+            work.wait()
diff --git a/torchft/local_sgd_test.py b/torchft/local_sgd_test.py
index d2b73cd..7872fc2 100644
--- a/torchft/local_sgd_test.py
+++ b/torchft/local_sgd_test.py
@@ -11,7 +11,7 @@
 import torch
 from torch import nn, optim
 
-from torchft.local_sgd import LocalSGD
+from torchft.local_sgd import DiLoCo, LocalSGD
 from torchft.manager import Manager
 
 
@@ -40,57 +40,103 @@ def _copy_state_dict(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten
 
 class LocalSGDTest(TestCase):
     def test_local_sgd_healthy(self) -> None:
-        base_m = SimpleModel()
-        optimizer = optim.SGD(base_m.parameters())
+        model = SimpleModel()
+        optimizer = optim.SGD(model.parameters())
         manager = create_autospec(Manager)
-
-        m = LocalSGD(manager, base_m, optimizer, sync_every=2)
-        self.assertEqual(m._local_step, 0)
-
-        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
-
-        inp = torch.rand(2, 3)
-
-        loss = m(inp).mean()
-        loss.backward()
-        optimizer.step()
-
-        self.assertEqual(m._local_step, 1)
-        self.assertEqual(manager.start_quorum.call_count, 1)
-
-        loss = m(inp).mean()
-        loss.backward()
-        optimizer.step()
-
-        manager.should_commit.return_value = True
-        self.assertEqual(m._local_step, 0)
-
-        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
-        self.assertEqual(manager.should_commit.call_count, 1)
-        self.assertEqual(manager.allreduce.call_count, 4)
+        with LocalSGD(manager, model, optimizer, sync_every=2) as local_sgd:
+            self.assertEqual(local_sgd._local_step, 0)
+            torch.testing.assert_close(
+                local_sgd._backup_parameters, _params_dict(model)
+            )
+            inp = torch.rand(2, 3)
+            loss = model(inp).mean()
+            loss.backward()
+            optimizer.step()
+
+            self.assertEqual(local_sgd._local_step, 1)
+            self.assertEqual(manager.start_quorum.call_count, 1)
+            loss = model(inp).mean()
+            loss.backward()
+            optimizer.step()
+
+            manager.should_commit.return_value = True
+            self.assertEqual(local_sgd._local_step, 0)
+            torch.testing.assert_close(
+                local_sgd._backup_parameters, _params_dict(model)
+            )
+            self.assertEqual(manager.should_commit.call_count, 1)
+            self.assertEqual(manager.allreduce.call_count, 4)
 
     def test_local_sgd_recovery(self) -> None:
-        base_m = SimpleModel()
-        optimizer = optim.SGD(base_m.parameters())
+        model = SimpleModel()
+        optimizer = optim.SGD(model.parameters())
         manager = create_autospec(Manager)
 
-        m = LocalSGD(manager, base_m, optimizer, sync_every=2)
+        with LocalSGD(manager, model, optimizer, sync_every=2) as local_sgd:
+            torch.testing.assert_close(
+                local_sgd._backup_parameters, _params_dict(model)
+            )
+            og_state_dict = _copy_state_dict(model.state_dict())
+
+            inp = torch.rand(2, 3)
+
+            loss = model(inp).mean()
+            loss.backward()
+            optimizer.step()
 
-        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
-        og_state_dict = _copy_state_dict(base_m.state_dict())
+            # Check that the model's state dict has been updated
+            for name, param in model.state_dict().items():
+                # Ensure the parameter has changed
+                self.assertFalse(
+                    torch.equal(og_state_dict[name], param),
+                    f"Parameter {name} did not change.",
+                )
+            self.assertEqual(local_sgd._local_step, 1)
 
-        inp = torch.rand(2, 3)
+            local_sgd._restore_parameters()
+            torch.testing.assert_close(
+                local_sgd._backup_parameters, _params_dict(model)
+            )
 
-        loss = m(inp).mean()
-        loss.backward()
-        optimizer.step()
 
-        self.assertEqual(m._local_step, 1)
+class DiLoCoTest(TestCase):
+    def test_diloco_healt(self) -> None:
+        model = SimpleModel()
 
-        state_dict = m.state_dict()
-        torch.testing.assert_close(state_dict, m._backup_parameters)
-        torch.testing.assert_close(state_dict, og_state_dict)
+        # Setup optimizers
+        inner_optimizer = torch.optim.AdamW(
+            model.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95)
+        )
+        outer_optimizer = torch.optim.SGD(
+            model.parameters(), lr=0.7, momentum=0.9, nesterov=True
+        )
 
-        m.load_state_dict(state_dict)
-        torch.testing.assert_close(_params_dict(base_m), state_dict)
-        torch.testing.assert_close(m._backup_parameters, _params_dict(base_m))
+        manager = create_autospec(Manager)
+        with DiLoCo(
+            manager, model, inner_optimizer, outer_optimizer, sync_every=2
+        ) as diloco:
+            parameter_count = len(list(model.parameters()))
+            initial_outer_opt_state = outer_optimizer.state_dict()
+            self.assertEqual(initial_outer_opt_state["state"], {})
+
+            self.assertEqual(diloco._local_step, 0)
+            torch.testing.assert_close(diloco._backup_parameters, _params_dict(model))
+            inp = torch.rand(2, 3)
+            loss = model(inp).mean()
+            loss.backward()
+            inner_optimizer.step()
+
+            self.assertEqual(diloco._local_step, 1)
+            self.assertEqual(manager.start_quorum.call_count, 1)
+            loss = model(inp).mean()
+            loss.backward()
+            inner_optimizer.step()
+
+            manager.should_commit.return_value = True
+            self.assertEqual(diloco._local_step, 0)
+            torch.testing.assert_close(diloco._backup_parameters, _params_dict(model))
+            self.assertEqual(manager.should_commit.call_count, 1)
+            self.assertEqual(manager.allreduce.call_count, parameter_count)
+
+            outer_opt_state = outer_optimizer.state_dict()
+            self.assertEqual(len(outer_opt_state["state"]), parameter_count)
diff --git a/torchft/manager_integ_test.py b/torchft/manager_integ_test.py
index d6e7bde..f095f7e 100644
--- a/torchft/manager_integ_test.py
+++ b/torchft/manager_integ_test.py
@@ -1,11 +1,11 @@
 import logging
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from contextlib import ExitStack, contextmanager
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from contextlib import contextmanager, ExitStack
 from dataclasses import dataclass, field
 from datetime import timedelta
-from typing import Dict, Generator, List, Protocol, Set, Tuple
+from typing import Dict, Generator, List, Optional, Protocol, Set, Tuple
 from unittest import TestCase
 
 import torch
@@ -14,7 +14,7 @@
 from torch import nn, optim
 
 from torchft.ddp import DistributedDataParallel
-from torchft.local_sgd import LocalSGD
+from torchft.local_sgd import DiLoCo, LocalSGD
 from torchft.manager import Manager
 from torchft.optim import OptimizerWrapper
 from torchft.process_group import ProcessGroupGloo
@@ -227,30 +227,124 @@ def state_dict() -> Dict[str, Dict[str, object]]:
 
         m: nn.Module = MyModel()
         optimizer: optim.Optimizer = optim.Adam(m.parameters())
-        m = LocalSGD(manager, m, optimizer, sync_every=2)
         criterion = nn.CrossEntropyLoss()
 
-        while True:
-            inputs = torch.rand(2, 3)
-            labels = torch.randint(4, (2,))
+        with LocalSGD(manager, m, optimizer, sync_every=2):
+            while True:
+                inputs = torch.rand(2, 3)
+                labels = torch.randint(4, (2,))
 
-            optimizer.zero_grad()
-            out = m(inputs)
-            loss = criterion(out, labels)
+                optimizer.zero_grad()
+                out = m(inputs)
+                loss = criterion(out, labels)
 
-            loss.backward()
+                loss.backward()
 
-            optimizer.step()
+                optimizer.step()
 
-            if manager.current_step() >= 4:
-                break
+                if manager.current_step() >= 4:
+                    break
 
-            runner.failure_injector.check(rank, manager.current_step())
+                runner.failure_injector.check(rank, manager.current_step())
 
         # return state_dict so we can check consistency
         return state_dict()
 
 
+def diloco_train_loop(
+    rank: int,
+    store_port: int,
+    runner: Runner,
+) -> Dict[str, Dict[str, object]]:
+    with ExitStack() as stack:
+        torch.manual_seed(42)
+
+        # Declare the model and optimizers
+        m: nn.Module = MyModel()
+
+        # Setup optimizers
+        inner_optimizer: optim.Optimizer = torch.optim.AdamW(
+            m.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95)
+        )
+        outer_optimizer: optim.Optimizer = torch.optim.Adam(
+            m.parameters(), lr=0.7
+        )
+
+        def load_state_dict(state_dict: Dict[str, Dict[str, object]]) -> None:
+            m.load_state_dict(state_dict["model"])
+            inner_optimizer.load_state_dict(state_dict["inner_optim"])
+            outer_optimizer.load_state_dict(state_dict["outer_optim"])
+
+        def state_dict() -> Dict[str, Dict[str, object]]:
+            return {
+                "model": m.state_dict(),
+                "inner_optim": inner_optimizer.state_dict(),
+                "outer_optim": outer_optimizer.state_dict(),
+            }
+
+        print(f"worker {runner.replica_id=} {rank=} {runner.world_size=} starting")
+
+        pg = ProcessGroupGloo()
+        manager = Manager(
+            pg=pg,
+            min_replica_size=2,
+            load_state_dict=load_state_dict,
+            state_dict=state_dict,
+            replica_id=str(runner.replica_id),
+            store_addr="localhost",
+            store_port=store_port,
+            rank=rank,
+            world_size=runner.world_size,
+            lighthouse_addr=runner.lighthouse_address,
+            port=19530 + runner.replica_id,
+            # pyre-fixme[6]: Incompatible parameter type
+            **runner.manager_args,
+        )
+        stack.callback(manager.shutdown)
+
+        # TODO: where in the training loop should we do this?
+        # Ensure all models have the same starting state
+        # We set manual seed so the models start with the same weights
+        manager.start_quorum()
+        for param in m.parameters():
+            print(f"{param=} vs. {param.data}")
+            manager.allreduce(param.data)
+
+        criterion = nn.CrossEntropyLoss()
+        backup_parameters = None
+        with DiLoCo(
+            manager, m, inner_optimizer, outer_optimizer, sync_every=2
+        ) as diloco:
+            while True:
+                print(f"in diloco CM {manager.current_step()=}")
+                inputs = torch.rand(2, 3)
+                labels = torch.randint(4, (2,))
+
+                inner_optimizer.zero_grad()
+                out = m(inputs)
+                loss = criterion(out, labels)
+
+                loss.backward()
+                print("inner optimizer step")
+                inner_optimizer.step()
+
+                # record backup parameters while the manager is at step 1 (TODO confirm: this comment previously said sync count 3)
+                if manager.current_step() == 1:
+                    backup_parameters = diloco._backup_parameters
+
+                # break after 4 model updates (manager step >= 4)
+                if manager.current_step() >= 4:
+                    break
+
+                runner.failure_injector.check(rank, manager.current_step())
+
+        return_state_dict = state_dict()
+        assert backup_parameters is not None
+        return_state_dict["backup_parameters"] = backup_parameters
+        # return state_dict so we can check consistency
+        return return_state_dict
+
+
 class ManagerIntegTest(TestCase):
     @contextmanager
     def assertElapsedLessThan(
@@ -431,6 +525,89 @@ def test_local_sgd_recovery(self) -> None:
 
         self.assertEqual(failure_injectors[1].count, 1)
 
+    def test_diloco_healthy(self) -> None:
+        lighthouse = Lighthouse(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 2
+        futures = []
+
+        with ThreadPoolExecutor(max_workers=num_replicas) as executor:
+            for replica_id in range(num_replicas):
+                failure_injector = FailureInjector()
+                runner = Runner(
+                    replica_id=replica_id,
+                    lighthouse_address=lighthouse.address(),
+                    failure_injector=failure_injector,
+                    train_loop=diloco_train_loop,
+                )
+                futures.append(executor.submit(runner.run_replica))
+
+        state_dicts = []
+
+        for fut in as_completed(futures):
+            state_dicts.append(fut.result())
+
+        lighthouse.shutdown()
+
+        for state_dict in state_dicts:
+            print(state_dict)
+            torch.testing.assert_close(state_dict, state_dicts[0])
+
+    def test_diloco_recovery(self) -> None:
+        lighthouse = Lighthouse(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 2
+        futures = []
+
+        failure_injectors = [
+            FailureInjector(),
+            FailureInjector().fail_at(0, 2),
+        ]
+
+        with ThreadPoolExecutor(max_workers=num_replicas) as executor:
+            for replica_id, failure_injector in zip(
+                range(num_replicas), failure_injectors
+            ):
+                runner = Runner(
+                    replica_id=replica_id,
+                    lighthouse_address=lighthouse.address(),
+                    failure_injector=failure_injector,
+                    train_loop=diloco_train_loop,
+                    manager_args={
+                        "use_async_quorum": False,
+                    },
+                )
+                futures.append(executor.submit(runner.run_replica))
+
+            state_dicts = []
+
+            for fut in as_completed(futures):
+                try:
+                    state_dicts.append(fut.result())
+                except Exception as e:
+                    print(e)
+                    raise
+
+        lighthouse.shutdown()
+
+        print(state_dicts[0])
+
+        print(state_dicts[1])
+
+        # global model states should be equivalent
+
+        # outer optimizers should be equivalent
+        for state_dict in state_dicts:
+            torch.testing.assert_close(
+                state_dict[0]["outer_optim"], state_dicts[0][0]["outer_optim"]
+            )
+
+        self.assertEqual(failure_injectors[1].count, 1)
+
     def test_quorum_timeout(self) -> None:
         with ExitStack() as stack:
             lighthouse = Lighthouse(