diff --git a/output.txt b/output.txt new file mode 100644 index 0000000..cff5199 --- /dev/null +++ b/output.txt @@ -0,0 +1,616 @@ +============================= test session starts ============================== +platform linux -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /home/howardhuang/.conda/envs/torchft/bin/python +cachedir: .pytest_cache +rootdir: /home/howardhuang/local/torchft +configfile: pytest.ini +plugins: typeguard-2.13.3 +collecting ... collected 8 items / 7 deselected / 1 selected + +torchft/manager_integ_test.py::ManagerIntegTest::test_diloco_healthy torchft::lighthouse: 2025-01-28T07:50:47.294-08:00 - INFO Lighthouse listening on: http://devvm2170.rva0.facebook.com:43041 +torchft::lighthouse: 2025-01-28T07:50:47.294-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.396-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.496-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.597-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.699-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.800-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:47.902-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.003-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.105-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.206-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.308-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.408-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.509-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][0 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:48.515-08:00 - INFO LighthouseClient: establishing connection to http://devvm2170.rva0.facebook.com:43041 +torchft::manager: 2025-01-28T07:50:48.516-08:00 - INFO Manager 1e9ec0b67-12c9-42d2-846a-77880287183a listening on http://devvm2170.rva0.facebook.com:19531 +torchft::manager: 2025-01-28T07:50:48.518-08:00 - INFO LighthouseClient: establishing connection to http://devvm2170.rva0.facebook.com:43041 +torchft::manager: 2025-01-28T07:50:48.519-08:00 - INFO Manager 0ca125a5f-91b5-4a5f-9d60-47491b73043b listening on http://devvm2170.rva0.facebook.com:19530 +torchft::manager: 2025-01-28T07:50:48.549-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19531 +torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530 +torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:48.552-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:48.552-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a +torchft::lighthouse: 2025-01-28T07:50:48.552-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b +torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Next quorum status: Valid quorum found [2/2 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Detected quorum change, bumping quorum_id to 1 +torchft::lighthouse: 2025-01-28T07:50:48.555-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) } +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) }) } +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079448, nanos: 555509353 }) }) } +torchft::manager: 2025-01-28T07:50:48.555-08:00 - INFO healing is required step=0, max_step=0 +torchft::manager: 2025-01-28T07:50:48.556-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:48.582-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530 +torchft::lighthouse: 2025-01-28T07:50:48.610-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.712-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.814-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:48.914-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.015-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.117-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.219-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +2401:db00:eef0:1120:3520:0:740e:f755 - - [28/Jan/2025 07:50:49] "GET /checkpoint/0 HTTP/1.1" 200 - +torchft::lighthouse: 2025-01-28T07:50:49.320-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.422-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.524-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.559-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.559-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.560-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b +torchft::lighthouse: 2025-01-28T07:50:49.560-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.560-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.560-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a +torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.561-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) } +torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) }) } +torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO healing is required step=0, max_step=0 +torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 0, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 0, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 561226349 }) }) } +torchft::manager: 2025-01-28T07:50:49.561-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.582-08:00 - INFO ManagerClient: establishing connection to http://devvm2170.rva0.facebook.com:19530 +2401:db00:eef0:1120:3520:0:740e:f755 - - [28/Jan/2025 07:50:49] "GET /checkpoint/0 HTTP/1.1" 200 - +torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.622-08:00 - INFO should_commit completed should_commit=true +torchft::lighthouse: 2025-01-28T07:50:49.625-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a +torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.626-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.626-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b +torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.627-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) } +torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) }) } +torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 1, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 1, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 627176917 }) }) } +torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.627-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.656-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.656-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.660-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.660-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.661-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a +torchft::lighthouse: 2025-01-28T07:50:49.661-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.661-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.661-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b +torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.664-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) } +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) }) } +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 2, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 2, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 664398852 }) }) } +torchft::manager: 2025-01-28T07:50:49.664-08:00 - INFO returning quorum for rank 0 +starting replica group self.replica_id=0 self.world_size=1 attempt 0 +starting replica group self.replica_id=1 self.world_size=1 attempt 0 +worker runner.replica_id=1 rank=0 runner.world_size=1 starting +worker runner.replica_id=0 rank=0 runner.world_size=1 starting +param=Parameter containing: +tensor([[ 0.4414, 0.4792, -0.1353], + [ 0.5304, -0.1265, 0.1165], + [-0.2811, 0.3391, 0.5090], + [-0.4236, 0.5018, 0.1081]], requires_grad=True) vs. tensor([[ 0.4414, 0.4792, -0.1353], + [ 0.5304, -0.1265, 0.1165], + [-0.2811, 0.3391, 0.5090], + [-0.4236, 0.5018, 0.1081]]) +param=Parameter containing: +tensor([[ 0.4266, 0.0782, 0.2784], + [-0.0815, 0.4451, 0.0853], + [-0.2695, 0.1472, -0.2660], + [-0.0677, -0.2345, 0.3830]], requires_grad=True) vs. tensor([[ 0.4266, 0.0782, 0.2784], + [-0.0815, 0.4451, 0.0853], + [-0.2695, 0.1472, -0.2660], + [-0.0677, -0.2345, 0.3830]]) +param=Parameter containing: +tensor([-0.4557, -0.2662, -0.1630, -0.3471], requires_grad=True) vs. tensor([-0.4557, -0.2662, -0.1630, -0.3471]) +param=Parameter containing: +tensor([ 0.0545, -0.5702, 0.5214, -0.4904], requires_grad=True) vs. tensor([ 0.0545, -0.5702, 0.5214, -0.4904]) +True list(self._model.parameters())=[Parameter containing: +tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]], requires_grad=True), Parameter containing: +tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)] +True list(self._model.parameters())=[Parameter containing: +tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]], requires_grad=True), Parameter containing: +tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)] +name='model.0.weight' p.data=tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]]) +name='model.0.weight' p.data=tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]]) +name='model.0.bias' p.data=tensor([-0.2279, -0.1331, -0.0815, -0.1736]) +in diloco CM manager.current_step()=0 +name='model.0.bias' p.data=tensor([-0.2279, -0.1331, -0.0815, -0.1736]) +in diloco CM manager.current_step()=0 +inner optimizer step +inner optimizer step +in diloco CM manager.current_step()=0 +in diloco CM manager.current_step()=0 +inner optimizer step +inner optimizer step +name='model.0.weight', p.grad=tensor([[ 2.3217e-04, 2.8212e-04, 7.1485e-05], + [-4.0449e-04, -3.6760e-04, -3.9917e-04], + [ 4.0477e-04, 3.7535e-04, 2.5970e-04], + [-3.0579e-04, -3.2146e-04, -1.1927e-04]]) +name='model.0.weight', p.grad=tensor([[ 2.3217e-04, 2.8212e-04, 7.1485e-05], + [-4.0449e-04, -3.6760e-04, -3.9917e-04], + [ 4.0477e-04, 3.7535e-04, 2.5970e-04], + [-3.0579e-04, -3.2146e-04, -1.1927e-04]]) +name='model.0.bias', p.grad=tensor([ 0.0002, -0.0004, 0.0004, -0.0002]) +name='model.0.bias', p.grad=tensor([ 0.0002, -0.0004, 0.0004, -0.0002]) +list(self._model.parameters())=[Parameter containing: +tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]], requires_grad=True), Parameter containing: +tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[ 0.2207, 0.2396, -0.0676], + [ 0.2652, -0.0632, 0.0583], + [-0.1405, 0.1695, 0.2545], + [-0.2118, 0.2509, 0.0540]], requires_grad=True), Parameter containing: +tensor([-0.2279, -0.1331, -0.0815, -0.1736], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-0.4793, -0.4604, -0.7675], + [ 0.9652, 0.6367, 0.7582], + [-0.8405, -0.5304, -0.4455], + [ 0.4882, 0.9509, 0.7540]], requires_grad=True), Parameter containing: +tensor([-0.9278, 0.5669, -0.7815, 0.5264], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-0.4793, -0.4604, -0.7675], + [ 0.9652, 0.6367, 0.7582], + [-0.8405, -0.5304, -0.4455], + [ 0.4882, 0.9509, 0.7540]], requires_grad=True), Parameter containing: +tensor([-0.9278, 0.5669, -0.7815, 0.5264], requires_grad=True)] +name='model.0.weight' p.data=tensor([[ 0.2211, 0.2400, -0.0672], + [ 0.2648, -0.0636, 0.0578], + [-0.1401, 0.1699, 0.2549], + [-0.2122, 0.2505, 0.0536]]) +name='model.0.weight' p.data=tensor([[-0.4793, -0.4604, -0.7675], + [ 0.9652, 0.6367, 0.7582], + [-0.8405, -0.5304, -0.4455], + [ 0.4882, 0.9509, 0.7540]]) +name='model.0.bias' p.data=tensor([-0.2275, -0.1335, -0.0811, -0.1740]) +in diloco CM manager.current_step()=1 +name='model.0.bias' p.data=tensor([-0.9278, 0.5669, -0.7815, 0.5264]) +in diloco CM manager.current_step()=1 +inner optimizer step +in diloco CM manager.current_step()=1 +inner optimizer step +in diloco CM manager.current_step()=1 +inner optimizer step +inner optimizer step +name='model.0.weight', p.grad=tensor([[ 1.4991e-04, 2.5948e-04, -4.4014e-04], + [-2.4478e-04, -8.2199e-05, -3.2015e-05], + [ 4.2802e-04, 2.3373e-04, 9.2387e-05], + [-6.4364e-04, -5.9976e-04, -2.6279e-04]]) +name='model.0.weight', p.grad=tensor([[ 1.4991e-04, 2.5948e-04, -4.4014e-04], + [-2.4478e-04, -8.2199e-05, -3.2015e-05], + [ 4.2802e-04, 2.3373e-04, 9.2387e-05], + [-6.4364e-04, -5.9976e-04, -2.6279e-04]]) +name='model.0.bias', p.grad=tensor([ 6.3330e-07, -6.8381e-05, 2.6295e-04, -3.6267e-04]) +name='model.0.bias', p.grad=tensor([ 6.3330e-07, -6.8381e-05, 2.6295e-04, -3.6267e-04]) +list(self._model.parameters())=[Parameter containing: +tensor([[-0.4793, -0.4604, -0.7675], + [ 0.9652, 0.6367, 0.7582], + [-0.8405, -0.5304, -0.4455], + [ 0.4882, 0.9509, 0.7540]], requires_grad=True), Parameter containing: +tensor([-0.9278, 0.5669, -0.7815, 0.5264], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[ 0.2211, 0.2400, -0.0672], + [ 0.2648, -0.0636, 0.0578], + [-0.1401, 0.1699, 0.2549], + [-0.2122, 0.2505, 0.0536]], requires_grad=True), Parameter containing: +tensor([-0.2275, -0.1335, -0.0811, -0.1740], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-0.4789, -0.4600, 0.6328], + [ 0.9647, 0.6363, 0.7576], + [-0.8401, -0.5300, -0.4451], + [ 0.4878, 0.9505, 0.7536]], requires_grad=True), Parameter containing: +tensor([-0.9166, 0.5664, -0.7811, 0.5260], requires_grad=True)] +name='model.0.weight' p.data=tensor([[-0.4789, -0.4600, 0.6328], + [ 0.9647, 0.6363, 0.7576], + [-0.8401, -0.5300, -0.4451], + [ 0.4878, 0.9505, 0.7536]]) +name='model.0.bias' p.data=tensor([-0.9166, 0.5664, -0.7811, 0.5260]) +in diloco CM manager.current_step()=2 +list(self._model.parameters())=[Parameter containing: +tensor([[-1.1559, -1.1582, -0.3285], + [ 1.6362, 1.2082, 1.2674], + [-1.5413, -1.2040, -1.0620], + [ 1.1599, 1.6315, 1.4221]], requires_grad=True), Parameter containing: +tensor([-1.3985, 1.1180, -1.4589, 1.1988], requires_grad=True)] +name='model.0.weight' p.data=tensor([[-1.1559, -1.1582, -0.3285], + [ 1.6362, 1.2082, 1.2674], + [-1.5413, -1.2040, -1.0620], + [ 1.1599, 1.6315, 1.4221]]) +name='model.0.bias' p.data=tensor([-1.3985, 1.1180, -1.4589, 1.1988]) +in diloco CM manager.current_step()=2 +inner optimizer step +in diloco CM manager.current_step()=2 +inner optimizer step +in diloco CM manager.current_step()=2 +inner optimizer step +inner optimizer step +name='model.0.weight', p.grad=tensor([[ 1.4319e-04, 2.9729e-04, -4.0483e-04], + [-1.8367e-04, -2.0355e-05, 4.6968e-05], + [ 2.8500e-04, 6.5982e-05, 2.9102e-05], + [-4.9986e-04, -5.7098e-04, -1.6165e-04]]) +torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.698-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.699-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.702-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.702-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.702-08:00 - INFO got quorum request for replica 0ca125a5f-91b5-4a5f-9d60-47491b73043b +torchft::lighthouse: 2025-01-28T07:50:49.702-08:00 - INFO Next quorum status: New quorum not ready, only have 1 participants, need min_replicas 2 [1/1 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got quorum request for rank 0 +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO all workers joined -- starting quorum +torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO got quorum request for replica 1e9ec0b67-12c9-42d2-846a-77880287183a +torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO Next quorum status: Fast quorum found! [2/2 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.703-08:00 - INFO Quorum! Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) } +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) }) } +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO returning quorum for rank 0 +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO got lighthouse quorum LighthouseQuorumResponse { quorum: Some(Quorum { quorum_id: 1, participants: [QuorumMember { replica_id: "0ca125a5f-91b5-4a5f-9d60-47491b73043b", address: "http://devvm2170.rva0.facebook.com:19530", store_address: "localhost:47765", step: 3, world_size: 1, shrink_only: false }, QuorumMember { replica_id: "1e9ec0b67-12c9-42d2-846a-77880287183a", address: "http://devvm2170.rva0.facebook.com:19531", store_address: "localhost:40861", step: 3, world_size: 1, shrink_only: false }], created: Some(Timestamp { seconds: 1738079449, nanos: 703374025 }) }) } +torchft::manager: 2025-01-28T07:50:49.703-08:00 - INFO returning quorum for rank 0 +torchft::lighthouse: 2025-01-28T07:50:49.726-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit completed should_commit=true +torchft::manager: 2025-01-28T07:50:49.737-08:00 - INFO should_commit request from 0 should_commit=true +torchft::manager: 2025-01-28T07:50:49.738-08:00 - INFO should_commit completed should_commit=true +torchft::lighthouse: 2025-01-28T07:50:49.828-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:49.929-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:50.031-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +torchft::lighthouse: 2025-01-28T07:50:50.133-08:00 - INFO Next quorum status: New quorum not ready, only have 0 participants, need min_replicas 2 [0/0 participants healthy][2 heartbeating][shrink_only=false] +name='model.0.weight', p.grad=tensor([[ 1.4319e-04, 2.9729e-04, -4.0483e-04], + [-1.8367e-04, -2.0355e-05, 4.6968e-05], + [ 2.8500e-04, 6.5982e-05, 2.9102e-05], + [-4.9986e-04, -5.7098e-04, -1.6165e-04]]) +name='model.0.bias', p.grad=tensor([ 8.6367e-05, -6.9797e-05, 1.4833e-04, -2.5073e-04]) +name='model.0.bias', p.grad=tensor([ 8.6367e-05, -6.9797e-05, 1.4833e-04, -2.5073e-04]) +list(self._model.parameters())=[Parameter containing: +tensor([[-1.1559, -1.1582, -0.3285], + [ 1.6362, 1.2082, 1.2674], + [-1.5413, -1.2040, -1.0620], + [ 1.1599, 1.6315, 1.4221]], requires_grad=True), Parameter containing: +tensor([-1.3985, 1.1180, -1.4589, 1.1988], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-0.4789, -0.4600, 0.6328], + [ 0.9647, 0.6363, 0.7576], + [-0.8401, -0.5300, -0.4451], + [ 0.4878, 0.9505, 0.7536]], requires_grad=True), Parameter containing: +tensor([-0.9166, 0.5664, -0.7811, 0.5260], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-1.8260, -1.8585, 0.2233], + [ 2.2816, 1.6734, 1.6062], + [-2.2236, -1.7849, -1.5829], + [ 1.8418, 2.3220, 2.0912]], requires_grad=True), Parameter containing: +tensor([-1.9107, 1.6145, -2.0911, 1.8764], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-1.1778, -1.1608, 1.3306], + [ 1.6525, 1.2167, 0.5913], + [-1.5193, -1.1230, -1.0489], + [ 1.1778, 1.6494, 1.4260]], requires_grad=True), Parameter containing: +tensor([-1.4408, 1.2667, -1.4455, 1.2081], requires_grad=True)] +name='model.0.weight' p.data=tensor([[-1.8260, -1.8585, 0.2233], + [ 2.2816, 1.6734, 1.6062], + [-2.2236, -1.7849, -1.5829], + [ 1.8418, 2.3220, 2.0912]]) +name='model.0.weight' p.data=tensor([[-1.1778, -1.1608, 1.3306], + [ 1.6525, 1.2167, 0.5913], + [-1.5193, -1.1230, -1.0489], + [ 1.1778, 1.6494, 1.4260]]) +name='model.0.bias' p.data=tensor([-1.9107, 1.6145, -2.0911, 1.8764]) +in diloco CM manager.current_step()=3 +name='model.0.bias' p.data=tensor([-1.4408, 1.2667, -1.4455, 1.2081]) +in diloco CM manager.current_step()=3 +inner optimizer step +in diloco CM manager.current_step()=3 +inner optimizer step +in diloco CM manager.current_step()=3 +inner optimizer step +inner optimizer step +name='model.0.weight', p.grad=tensor([[ 2.6709e-04, 3.4326e-04, -1.7107e-04], + [-2.6494e-04, -1.3125e-04, -3.5435e-05], + [ 2.6774e-04, 8.8334e-05, 6.3241e-05], + [-3.7980e-04, -4.9281e-04, -1.2958e-04]]) +name='model.0.weight', p.grad=tensor([[ 2.6709e-04, 3.4326e-04, -1.7107e-04], + [-2.6494e-04, -1.3125e-04, -3.5435e-05], + [ 2.6774e-04, 8.8334e-05, 6.3241e-05], + [-3.7980e-04, -4.9281e-04, -1.2958e-04]]) +name='model.0.bias', p.grad=tensor([ 0.0002, -0.0002, 0.0001, -0.0002]) +name='model.0.bias', p.grad=tensor([ 0.0002, -0.0002, 0.0001, -0.0002]) +list(self._model.parameters())=[Parameter containing: +tensor([[-1.1778, -1.1608, 1.3306], + [ 1.6525, 1.2167, 0.5913], + [-1.5193, -1.1230, -1.0489], + [ 1.1778, 1.6494, 1.4260]], requires_grad=True), Parameter containing: +tensor([-1.4408, 1.2667, -1.4455, 1.2081], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-1.8260, -1.8585, 0.2233], + [ 2.2816, 1.6734, 1.6062], + [-2.2236, -1.7849, -1.5829], + [ 1.8418, 2.3220, 2.0912]], requires_grad=True), Parameter containing: +tensor([-1.9107, 1.6145, -2.0911, 1.8764], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-2.5075, -2.5618, 0.7693], + [ 2.9387, 2.1671, 1.9183], + [-2.8951, -2.3307, -2.0895], + [ 2.5165, 3.0140, 2.7500]], requires_grad=True), Parameter containing: +tensor([-2.4930, 2.1568, -2.7003, 2.5372], requires_grad=True)] +list(self._model.parameters())=[Parameter containing: +tensor([[-1.8622, -1.8630, 1.9723], + [ 2.3472, 1.8356, 0.7188], + [-2.1914, -1.7066, -1.6842], + [ 1.8502, 2.3423, 2.0785]], requires_grad=True), Parameter containing: +tensor([-1.9961, 1.9095, -2.1010, 1.8620], requires_grad=True)] +name='model.0.weight' p.data=tensor([[-2.5075, -2.5618, 0.7693], + [ 2.9387, 2.1671, 1.9183], + [-2.8951, -2.3307, -2.0895], + [ 2.5165, 3.0140, 2.7500]]) +name='model.0.weight' p.data=tensor([[-1.8622, -1.8630, 1.9723], + [ 2.3472, 1.8356, 0.7188], + [-2.1914, -1.7066, -1.6842], + [ 1.8502, 2.3423, 2.0785]]) +name='model.0.bias' p.data=tensor([-2.4930, 2.1568, -2.7003, 2.5372]) +name='model.0.bias' p.data=tensor([-1.9961, 1.9095, -2.1010, 1.8620]) +[{'model': OrderedDict([('model.0.weight', tensor([[-1.8622, -1.8630, 1.9723], + [ 2.3472, 1.8356, 0.7188], + [-2.1914, -1.7066, -1.6842], + [ 1.8502, 2.3423, 2.0785]])), ('model.0.bias', tensor([-1.9961, 1.9095, -2.1010, 1.8620]))]), 'inner_optim': {'state': {0: {'step': tensor(7.), 'exp_avg': tensor([[-0.0061, -0.0078, -0.0039], + [-0.0020, 0.0016, -0.0043], + [ 0.0023, 0.0029, 0.0027], + [ 0.0034, 0.0040, 0.0014]]), 'exp_avg_sq': tensor([[0.0002, 0.0003, 0.0004], + [0.0004, 0.0002, 0.0006], + [0.0001, 0.0001, 0.0003], + [0.0002, 0.0001, 0.0002]])}, 1: {'step': tensor(7.), 'exp_avg': tensor([-0.0104, -0.0035, 0.0103, 0.0004]), 'exp_avg_sq': tensor([0.0009, 0.0018, 0.0005, 0.0007])}}, 'param_groups': [{'lr': 0.0004, 'betas': (0.9, 0.95), 'eps': 1e-08, 'weight_decay': 0.1, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'outer_optim': {'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 5.1738e-05, 8.2101e-05, -8.9194e-05], + [-6.2852e-05, -2.1615e-05, -1.9096e-06], + [ 8.7094e-05, 3.3704e-05, 1.6427e-05], + [-1.3510e-04, -1.4925e-04, -4.8792e-05]]), 'exp_avg_sq': tensor([[1.1424e-10, 2.7332e-10, 3.8633e-10], + [1.6369e-10, 2.4383e-11, 4.4824e-12], + [3.3567e-10, 6.6674e-11, 1.3364e-11], + [8.0730e-10, 9.2755e-10, 1.1181e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 3.2709e-05, -3.1210e-05, 4.8912e-05, -6.9191e-05]), 'exp_avg_sq': tensor([6.9378e-11, 4.7128e-11, 1.1133e-10, 2.2382e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'backup_parameters': {'model.0.weight': tensor([[-1.8622, -1.8630, 1.9723], + [ 2.3472, 1.8356, 0.7188], + [-2.1914, -1.7066, -1.6842], + [ 1.8502, 2.3423, 2.0785]]), 'model.0.bias': tensor([-1.9961, 1.9095, -2.1010, 1.8620])}}] +[{'model': OrderedDict([('model.0.weight', tensor([[-2.5075, -2.5618, 0.7693], + [ 2.9387, 2.1671, 1.9183], + [-2.8951, -2.3307, -2.0895], + [ 2.5165, 3.0140, 2.7500]])), ('model.0.bias', tensor([-2.4930, 2.1568, -2.7003, 2.5372]))]), 'inner_optim': {'state': {0: {'step': tensor(8.), 'exp_avg': tensor([[-0.0006, -0.0005, 0.0010], + [ 0.0052, -0.0006, 0.0014], + [-0.0087, -0.0018, -0.0008], + [ 0.0041, 0.0030, -0.0025]]), 'exp_avg_sq': tensor([[1.7226e-04, 5.7628e-05, 5.0178e-05], + [2.0129e-04, 1.7710e-04, 2.2114e-04], + [3.2548e-04, 6.1146e-05, 2.3409e-04], + [1.2215e-04, 7.6481e-05, 4.1236e-04]])}, 1: {'step': tensor(8.), 'exp_avg': tensor([-0.0007, 0.0078, -0.0093, 0.0019]), 'exp_avg_sq': tensor([0.0004, 0.0007, 0.0004, 0.0004])}}, 'param_groups': [{'lr': 0.0004, 'betas': (0.9, 0.95), 'eps': 1e-08, 'weight_decay': 0.1, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'outer_optim': {'state': {0: {'step': tensor(4.), 'exp_avg': tensor([[ 6.8663e-05, 1.0267e-04, -8.3983e-05], + [-9.2339e-05, -4.8413e-05, -3.1009e-05], + [ 1.1660e-04, 6.1067e-05, 3.5359e-05], + [-1.5739e-04, -1.7268e-04, -5.7487e-05]]), 'exp_avg_sq': tensor([[1.6798e-10, 3.5267e-10, 3.9143e-10], + [3.2682e-10, 1.5911e-10, 1.6334e-10], + [4.9901e-10, 2.0714e-10, 8.0604e-11], + [9.0053e-10, 1.0306e-09, 1.2600e-10]])}, 1: {'step': tensor(4.), 'exp_avg': tensor([ 4.7159e-05, -5.9983e-05, 7.8308e-05, -8.1859e-05]), 'exp_avg_sq': tensor([1.0855e-10, 2.0245e-10, 2.7344e-10, 2.5393e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}, 'backup_parameters': {'model.0.weight': tensor([[-2.5075, -2.5618, 0.7693], + [ 2.9387, 2.1671, 1.9183], + [-2.8951, -2.3307, -2.0895], + [ 2.5165, 3.0140, 2.7500]]), 'model.0.bias': tensor([-2.4930, 2.1568, -2.7003, 2.5372])}}] +FAILED + +=================================== FAILURES =================================== +_____________________ ManagerIntegTest.test_diloco_healthy _____________________ + +self = + + def test_diloco_healthy(self) -> None: + lighthouse = Lighthouse( + bind="[::]:0", + min_replicas=2, + ) + num_replicas = 2 + futures = [] + + with ThreadPoolExecutor(max_workers=num_replicas) as executor: + for replica_id in range(num_replicas): + failure_injector = FailureInjector() + runner = Runner( + replica_id=replica_id, + lighthouse_address=lighthouse.address(), + failure_injector=failure_injector, + train_loop=diloco_train_loop, + ) + futures.append(executor.submit(runner.run_replica)) + + state_dicts = [] + + for fut in as_completed(futures): + state_dicts.append(fut.result()) + + lighthouse.shutdown() + + for state_dict in state_dicts: + print(state_dict) +> torch.testing.assert_close(state_dict, state_dicts[0]) +E AssertionError: Tensor-likes are not close! +E +E Mismatched elements: 4 / 4 (100.0%) +E Greatest absolute difference: 0.6751507520675659 at index (3,) (up to 1e-05 allowed) +E Greatest relative difference: 0.36258822679519653 at index (3,) (up to 1.3e-06 allowed) +E +E The failure occurred for item [0]['backup_parameters']['model.0.bias'] + +torchft/manager_integ_test.py:556: AssertionError +------------------------------ Captured log call ------------------------------- +2025-01-28 07:50:48 INFO Started CheckpointServer on http://devvm2170.rva0.facebook.com:43127/checkpoint/-1... +2025-01-28 07:50:48 INFO Started CheckpointServer on http://devvm2170.rva0.facebook.com:34997/checkpoint/-1... +2025-01-28 07:50:48 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 0] reconfiguring for quorum_id=1 store_prefixed_addr='localhost:47765/torchft/1/0' +2025-01-28 07:50:48 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] reconfiguring for quorum_id=1 store_prefixed_addr='localhost:47765/torchft/1/0' +2025-01-28 07:50:48 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] healing required, fetching checkpoint server address from address='http://devvm2170.rva0.facebook.com:19530' max_step=0 +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] fetching checkpoint from checkpoint_server_address='http://devvm2170.rva0.facebook.com:34997/checkpoint/0' +2025-01-28 07:50:49 INFO fetching checkpoint from http://devvm2170.rva0.facebook.com:34997/checkpoint/0 +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] healing required, fetching checkpoint server address from address='http://devvm2170.rva0.facebook.com:19530' max_step=0 +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] fetching checkpoint from checkpoint_server_address='http://devvm2170.rva0.facebook.com:34997/checkpoint/0' +2025-01-28 07:50:49 INFO fetching checkpoint from http://devvm2170.rva0.facebook.com:34997/checkpoint/0 +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05, 2.8212e-05, 7.1485e-06], + [-4.0449e-05, -3.6760e-05, -3.9917e-05], + [ 4.0477e-05, 3.7535e-05, 2.5970e-05], + [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12], + [1.6361e-10, 1.3513e-10, 1.5934e-10], + [1.6384e-10, 1.4089e-10, 6.7443e-11], + [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05, 4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05, 2.8212e-05, 7.1485e-06], + [-4.0449e-05, -3.6760e-05, -3.9917e-05], + [ 4.0477e-05, 3.7535e-05, 2.5970e-05], + [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12], + [1.6361e-10, 1.3513e-10, 1.5934e-10], + [1.6384e-10, 1.4089e-10, 6.7443e-11], + [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05, 4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] applying pending state dict +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 0] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 0] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 2.3217e-05, 2.8212e-05, 7.1485e-06], + [-4.0449e-05, -3.6760e-05, -3.9917e-05], + [ 4.0477e-05, 3.7535e-05, 2.5970e-05], + [-3.0579e-05, -3.2146e-05, -1.1927e-05]]), 'exp_avg_sq': tensor([[5.3902e-11, 7.9590e-11, 5.1100e-12], + [1.6361e-10, 1.3513e-10, 1.5934e-10], + [1.6384e-10, 1.4089e-10, 6.7443e-11], + [9.3506e-11, 1.0334e-10, 1.4226e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 1.9822e-05, -3.9469e-05, 4.0324e-05, -1.7376e-05]), 'exp_avg_sq': tensor([3.9289e-11, 1.5578e-10, 1.6260e-10, 3.0193e-11])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 1.4991e-05, 2.5948e-05, -4.4014e-05], + [-2.4478e-05, -8.2199e-06, -3.2015e-06], + [ 4.2802e-05, 2.3373e-05, 9.2387e-06], + [-6.4364e-05, -5.9976e-05, -2.6279e-05]]), 'exp_avg_sq': tensor([[2.2472e-11, 6.7331e-11, 1.9373e-10], + [5.9918e-11, 6.7566e-12, 1.0250e-12], + [1.8320e-10, 5.4631e-11, 8.5354e-12], + [4.1427e-10, 3.5971e-10, 6.9057e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 6.3330e-08, -6.8381e-06, 2.6295e-05, -3.6267e-05]), 'exp_avg_sq': tensor([4.0107e-16, 4.6760e-12, 6.9143e-11, 1.3153e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 1] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 3.5886e-05, 5.1339e-05, -3.7581e-05], + [-6.0882e-05, -4.1304e-05, -3.9127e-05], + [ 7.9231e-05, 5.7155e-05, 3.2611e-05], + [-9.1885e-05, -8.8907e-05, -3.7013e-05]]), 'exp_avg_sq': tensor([[7.6320e-11, 1.4684e-10, 1.9883e-10], + [2.2337e-10, 1.4175e-10, 1.6020e-10], + [3.4688e-10, 1.9538e-10, 7.5911e-11], + [5.0769e-10, 4.6294e-10, 8.3269e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 1.7903e-05, -4.2361e-05, 6.2586e-05, -5.1906e-05]), 'exp_avg_sq': tensor([3.9250e-11, 1.6030e-10, 2.3158e-10, 1.6169e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 1] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 3.5886e-05, 5.1339e-05, -3.7581e-05], + [-6.0882e-05, -4.1304e-05, -3.9127e-05], + [ 7.9231e-05, 5.7155e-05, 3.2611e-05], + [-9.1885e-05, -8.8907e-05, -3.7013e-05]]), 'exp_avg_sq': tensor([[7.6320e-11, 1.4684e-10, 1.9883e-10], + [2.2337e-10, 1.4175e-10, 1.6020e-10], + [3.4688e-10, 1.9538e-10, 7.5911e-11], + [5.0769e-10, 4.6294e-10, 8.3269e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 1.7903e-05, -4.2361e-05, 6.2586e-05, -5.1906e-05]), 'exp_avg_sq': tensor([3.9250e-11, 1.6030e-10, 2.3158e-10, 1.6169e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(1.), 'exp_avg': tensor([[ 1.4991e-05, 2.5948e-05, -4.4014e-05], + [-2.4478e-05, -8.2199e-06, -3.2015e-06], + [ 4.2802e-05, 2.3373e-05, 9.2387e-06], + [-6.4364e-05, -5.9976e-05, -2.6279e-05]]), 'exp_avg_sq': tensor([[2.2472e-11, 6.7331e-11, 1.9373e-10], + [5.9918e-11, 6.7566e-12, 1.0250e-12], + [1.8320e-10, 5.4631e-11, 8.5354e-12], + [4.1427e-10, 3.5971e-10, 6.9057e-11]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([ 6.3330e-08, -6.8381e-06, 2.6295e-05, -3.6267e-05]), 'exp_avg_sq': tensor([4.0107e-16, 4.6760e-12, 6.9143e-11, 1.3153e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 4.6616e-05, 7.5934e-05, -7.4306e-05], + [-7.3161e-05, -3.9209e-05, -3.0517e-05], + [ 9.9808e-05, 5.8038e-05, 3.2261e-05], + [-1.3268e-04, -1.3711e-04, -4.9477e-05]]), 'exp_avg_sq': tensor([[9.6745e-11, 2.3508e-10, 3.6252e-10], + [2.5688e-10, 1.4203e-10, 1.6225e-10], + [4.2775e-10, 1.9954e-10, 7.6682e-11], + [7.5704e-10, 7.8850e-10, 1.0932e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 2.4749e-05, -4.5104e-05, 7.1160e-05, -7.1788e-05]), 'exp_avg_sq': tensor([4.6670e-11, 1.6502e-10, 2.5335e-10, 2.2440e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 2.7810e-05, 5.3083e-05, -8.0096e-05], + [-4.0397e-05, -9.4334e-06, 1.8155e-06], + [ 6.7022e-05, 2.7634e-05, 1.1225e-05], + [-1.0791e-04, -1.1108e-04, -3.9816e-05]]), 'exp_avg_sq': tensor([[4.2951e-11, 1.5565e-10, 3.5742e-10], + [9.3593e-11, 7.1642e-12, 3.2300e-12], + [2.6424e-10, 5.8930e-11, 9.3738e-12], + [6.6372e-10, 6.8537e-10, 9.5118e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 8.6937e-06, -1.3134e-05, 3.8498e-05, -5.7713e-05]), 'exp_avg_sq': tensor([7.4597e-12, 9.5430e-12, 9.1074e-11, 1.9426e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 2] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 2] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO START +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 4.6616e-05, 7.5934e-05, -7.4306e-05], + [-7.3161e-05, -3.9209e-05, -3.0517e-05], + [ 9.9808e-05, 5.8038e-05, 3.2261e-05], + [-1.3268e-04, -1.3711e-04, -4.9477e-05]]), 'exp_avg_sq': tensor([[9.6745e-11, 2.3508e-10, 3.6252e-10], + [2.5688e-10, 1.4203e-10, 1.6225e-10], + [4.2775e-10, 1.9954e-10, 7.6682e-11], + [7.5704e-10, 7.8850e-10, 1.0932e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 2.4749e-05, -4.5104e-05, 7.1160e-05, -7.1788e-05]), 'exp_avg_sq': tensor([4.6670e-11, 1.6502e-10, 2.5335e-10, 2.2440e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(2.), 'exp_avg': tensor([[ 2.7810e-05, 5.3083e-05, -8.0096e-05], + [-4.0397e-05, -9.4334e-06, 1.8155e-06], + [ 6.7022e-05, 2.7634e-05, 1.1225e-05], + [-1.0791e-04, -1.1108e-04, -3.9816e-05]]), 'exp_avg_sq': tensor([[4.2951e-11, 1.5565e-10, 3.5742e-10], + [9.3593e-11, 7.1642e-12, 3.2300e-12], + [2.6424e-10, 5.8930e-11, 9.3738e-12], + [6.6372e-10, 6.8537e-10, 9.5118e-11]])}, 1: {'step': tensor(2.), 'exp_avg': tensor([ 8.6937e-06, -1.3134e-05, 3.8498e-05, -5.7713e-05]), 'exp_avg_sq': tensor([7.4597e-12, 9.5430e-12, 9.1074e-11, 1.9426e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO PERFORMING OPTIMIZER STEP!! +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(4.), 'exp_avg': tensor([[ 6.8663e-05, 1.0267e-04, -8.3983e-05], + [-9.2339e-05, -4.8413e-05, -3.1009e-05], + [ 1.1660e-04, 6.1067e-05, 3.5359e-05], + [-1.5739e-04, -1.7268e-04, -5.7487e-05]]), 'exp_avg_sq': tensor([[1.6798e-10, 3.5267e-10, 3.9143e-10], + [3.2682e-10, 1.5911e-10, 1.6334e-10], + [4.9901e-10, 2.0714e-10, 8.0604e-11], + [9.0053e-10, 1.0306e-09, 1.2600e-10]])}, 1: {'step': tensor(4.), 'exp_avg': tensor([ 4.7159e-05, -5.9983e-05, 7.8308e-05, -8.1859e-05]), 'exp_avg_sq': tensor([1.0855e-10, 2.0245e-10, 2.7344e-10, 2.5393e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO self._outer_optimizer.state_dict()={'state': {0: {'step': tensor(3.), 'exp_avg': tensor([[ 5.1738e-05, 8.2101e-05, -8.9194e-05], + [-6.2852e-05, -2.1615e-05, -1.9096e-06], + [ 8.7094e-05, 3.3704e-05, 1.6427e-05], + [-1.3510e-04, -1.4925e-04, -4.8792e-05]]), 'exp_avg_sq': tensor([[1.1424e-10, 2.7332e-10, 3.8633e-10], + [1.6369e-10, 2.4383e-11, 4.4824e-12], + [3.3567e-10, 6.6674e-11, 1.3364e-11], + [8.0730e-10, 9.2755e-10, 1.1181e-10]])}, 1: {'step': tensor(3.), 'exp_avg': tensor([ 3.2709e-05, -3.1210e-05, 4.8912e-05, -6.9191e-05]), 'exp_avg_sq': tensor([6.9378e-11, 4.7128e-11, 1.1133e-10, 2.2382e-10])}}, 'param_groups': [{'lr': 0.7, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'params': [0, 1]}]} +2025-01-28 07:50:49 INFO END +2025-01-28 07:50:49 INFO [0ca125a5f-91b5-4a5f-9d60-47491b73043b/0 - step 3] should_commit=True enough_replicas=True, errored=None +2025-01-28 07:50:49 INFO [1e9ec0b67-12c9-42d2-846a-77880287183a/0 - step 3] should_commit=True enough_replicas=True, errored=None +=============================== warnings summary =============================== +../../.conda/envs/torchft/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py:295 + /home/howardhuang/.conda/envs/torchft/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py:295: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:84.) + cpu = _conversion_method_template(device=torch.device("cpu")) + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html +=========================== short test summary info ============================ +FAILED torchft/manager_integ_test.py::ManagerIntegTest::test_diloco_healthy +================== 1 failed, 7 deselected, 1 warning in 4.77s ================== diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py index 1458f07..a486e1a 100644 --- a/torchft/local_sgd.py +++ b/torchft/local_sgd.py @@ -3,25 +3,29 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - """ LocalSGD ========= - This module implements a fault tolerant version of LocalSGD and related methods. """ - -from typing import Any, Dict, List, Mapping, Optional +import logging +from types import TracebackType +from typing import Any, Callable, Dict, Iterator, List, Mapping, Optional, Type import torch from torch import nn, optim +from torch.nn.parameter import Parameter +from torch.optim.optimizer import Optimizer +from torch.utils.hooks import RemovableHandle from torchft.manager import Manager +logger: logging.Logger = logging.getLogger(__name__) -class LocalSGD(nn.Module): + +class LocalSGD: """ - LocalSGD is a model wrapper similar to DistributedDataParallel that + LocalSGD is a context manager that implements the algorithm described in https://arxiv.org/pdf/1805.09767 This will synchronize the model parameters periodically in a fault tolerant @@ -60,26 +64,22 @@ def __init__( ) -> None: """ Args: - manager: The manager to use. - model: The model to wrap. - optimizer: The optimizer used by the model. - sync_every: How often to sync the model weights. - backup_device: The device to store the backup of the model parameters on. (default cpu) - pin_memory: Whether to pin the memory used for the backup of the model parameters. + manager (Manager): The manager to use. + model (nn.Module): The model to wrap. + optimizer (optim.Optimizer): The optimizer used by the model. + sync_every (int): How often to sync the model weights. + backup_device (Optional[torch.device]): The device to store the backup of the model parameters on. (default cpu) + pin_memory (bool): Whether to pin the memory used for the backup of the model parameters. """ super().__init__() - self._manager = manager self._model = model + self._local_optimizer = optimizer self._local_step = 0 - self._started_step = False self._sync_every = sync_every assert sync_every >= 1, "sync_every must be greater than or equal to 1" - device = backup_device or torch.device("cpu") - self._backup_parameters: Dict[str, torch.Tensor] = {} - for name, p in self._model.named_parameters(): t = torch.empty(*tuple(p.shape), dtype=p.dtype, device=device) if ( @@ -89,87 +89,101 @@ def __init__( ): t = t.pin_memory() self._backup_parameters[name] = t + print(f"{self._manager._use_async_quorum} {list(self._model.parameters())=}") + self._hooks: List[RemovableHandle] = [] # Need to copy the parameters to the host to be safe if we are on the first step. self._save_parameters() - optimizer.register_step_post_hook(self._step_post_hook) + def __enter__(self) -> "LocalSGD": + # Add optimizer hook which increments the local step counter and syncs if necessary + self._hooks.append( + self._local_optimizer.register_step_post_hook(self._step_post_hook) + ) + # Register a forward prehook to check for quorum + self._hooks.append( + self._model.register_forward_pre_hook(self._forward_step_pre_hook) + ) + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> bool: + # Handle any cleanup or error handling here + if exc_type is not None: + # If an exception occurred, restore parameters + self._restore_parameters() + # Clean up hooks + for hook in self._hooks: + hook.remove() + self._hooks.clear() + + return False # Propagate exceptions def _save_parameters(self) -> None: - # TODO: consider running copy on a separate stream - for name, p in self._model.named_parameters(): - self._backup_parameters[name].copy_(p.data, non_blocking=True) + with torch.no_grad(): + # TODO: consider running copy on a separate stream + for name, p in self._model.named_parameters(): + print(f"{name=} {p.data=}") + self._backup_parameters[name] = p.detach().clone() def _restore_parameters(self) -> None: - # TODO: consider running copy on a separate stream - for name, p in self._model.named_parameters(): - p.data.copy_(self._backup_parameters[name], non_blocking=True) + with torch.no_grad(): + # TODO: consider running copy on a separate stream + for name, p in self._model.named_parameters(): + p.copy_(self._backup_parameters[name], non_blocking=False) - # pyre-fixme[14]: support state_dict args - def state_dict(self) -> Dict[str, object]: - """ - state_dict returns the state_dict from the last time LocalSGD - synchronized and not the current weights. - """ - state_dict = self._model.state_dict() - for name, p in self._backup_parameters.items(): - assert name in state_dict - state_dict[name] = p - return state_dict - - def load_state_dict( - self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False + def _step_post_hook( + self, _optim: optim.Optimizer, _args: List[object], _kwargs: Dict[str, object] ) -> None: """ - Loads the state dict to the model and the backup parameters. - - This must be called while the model weights aren't being modified to - avoid corrupting the backup weights. + This hook is registered on the optimizer and is called after the optimizer step. """ - self._model.load_state_dict(state_dict, strict=strict, assign=assign) - self._save_parameters() + self._local_step += 1 + if self._local_step >= self._sync_every: + self.sync() - def forward(self, *args: object, **kwargs: object) -> object: + def _forward_step_pre_hook(self, _module: nn.Module, _args: List[object]) -> None: """ - Run the model parameters. - - This should be called before the optimizer step. - - This will start the quorum and save the parameters if this is the first step. + Start the quorum before each module forward. """ if self._local_step == 0: self._manager.start_quorum() - self._started_step = True - - return self._model.forward(*args, **kwargs) - - def _step_post_hook( - self, _optim: optim.Optimizer, _args: List[object], _kwargs: Dict[str, object] - ) -> None: + def sync(self) -> None: """ - This hook is registered on the optimizer and is called after the optimizer step. - - This will call the allreduce on the model weights every sync_every steps. - If any errors occur it will restore to the weights from the previous sync. - - ``forward`` must be called before this function. + Synchronizes and averages the model weights across the manager. """ - assert self._started_step, "forward must be called before step" - self._started_step = False - - self._local_step += 1 + self._perform_sync() + + if self._manager.should_commit(): + # print( + # f"saving the parameters at {self._local_step=} on manager step {self._manager.current_step()=}" + # ) + # # save the parameters so we can restore from them later if necessary. + # print( + # f"{self._manager._rank=} {self._backup_parameters=}, {list(self._model.parameters())=}" + # ) + self._save_parameters() + # print( + # f"AFTER SAVE PARAMS: {self._manager._rank=} {self._backup_parameters=}" + # ) + else: + # commit failed, restore from the backup parameters + self._restore_parameters() - if self._local_step >= self._sync_every: - self._local_step = 0 - self._average() + self._local_step = 0 - if self._manager.should_commit(): - # save the parameters so we can restore from them later if necessary. - self._save_parameters() - else: - # commit failed, restore from the backup parameters - self._restore_parameters() + def _perform_sync(self) -> None: + """ + Performs the synchronization of the model weights across the manager. + This method is intended to be overridden by subclasses to implement custom + synchronization logic. + """ + self._average() def _average(self) -> None: # TODO: do we need to broadcast buffers like DDP does? @@ -182,3 +196,71 @@ def _average(self) -> None: for work in works: work.wait() + + +class DiLoCo(LocalSGD): + """ + DiLoCo is a subclass of LocalSGD that overrides the synchronization + mechanism to average and synchronize the pseudogradients (delta of the previous global weight and current local weights). + + diloco: https://arxiv.org/pdf/2311.08105 + """ + + def __init__( + self, + manager: Manager, + model: nn.Module, + inner_optimizer: optim.Optimizer, + outer_optimizer: optim.Optimizer, + sync_every: int, + backup_device: Optional[torch.device] = None, + pin_memory: bool = True, + ) -> None: + super().__init__( + manager, model, inner_optimizer, sync_every, backup_device, pin_memory + ) + self._outer_optimizer = outer_optimizer + + def _perform_sync(self) -> None: + """ + Overrides the sync method to calculate the pseugradient, average them across the manager group, and + step using the outer optimizer. + """ + + # Set the .grad field of each parameter to its pseudogradient + for name, p in self._model.named_parameters(): + assert name in self._backup_parameters + pseudogradient = p.data - self._backup_parameters[name] + p.grad = pseudogradient + + self._average_grads() + + # Restore the parameters back to the previous state + self._restore_parameters() + + # Use the outer optimizer to update the model parameters + for name, p in self._model.named_parameters(): + print(f"{name=}, {p.grad=}") + print(f"{list(self._model.parameters())=}") + logger.info("START") + logger.info(f"{self._outer_optimizer.state_dict()=}") + logger.info("PERFORMING OPTIMIZER STEP!!") + self._outer_optimizer.step() + logger.info(f"{self._outer_optimizer.state_dict()=}") + logger.info("END") + print(f"{list(self._model.parameters())=}") + self._outer_optimizer.zero_grad() + + def _average_grads(self) -> None: + """ + Average the gradients across the diloco group. + """ + works = [] + for p in self._model.parameters(): + # Perform allreduce on the pseudogradients + assert p.grad is not None + work = self._manager.allreduce(p.grad) + works.append(work) + # Wait for all allreduce operations to complete + for work in works: + work.wait() diff --git a/torchft/local_sgd_test.py b/torchft/local_sgd_test.py index d2b73cd..7872fc2 100644 --- a/torchft/local_sgd_test.py +++ b/torchft/local_sgd_test.py @@ -11,7 +11,7 @@ import torch from torch import nn, optim -from torchft.local_sgd import LocalSGD +from torchft.local_sgd import DiLoCo, LocalSGD from torchft.manager import Manager @@ -40,57 +40,103 @@ def _copy_state_dict(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten class LocalSGDTest(TestCase): def test_local_sgd_healthy(self) -> None: - base_m = SimpleModel() - optimizer = optim.SGD(base_m.parameters()) + model = SimpleModel() + optimizer = optim.SGD(model.parameters()) manager = create_autospec(Manager) - - m = LocalSGD(manager, base_m, optimizer, sync_every=2) - self.assertEqual(m._local_step, 0) - - torch.testing.assert_close(m._backup_parameters, _params_dict(base_m)) - - inp = torch.rand(2, 3) - - loss = m(inp).mean() - loss.backward() - optimizer.step() - - self.assertEqual(m._local_step, 1) - self.assertEqual(manager.start_quorum.call_count, 1) - - loss = m(inp).mean() - loss.backward() - optimizer.step() - - manager.should_commit.return_value = True - self.assertEqual(m._local_step, 0) - - torch.testing.assert_close(m._backup_parameters, _params_dict(base_m)) - self.assertEqual(manager.should_commit.call_count, 1) - self.assertEqual(manager.allreduce.call_count, 4) + with LocalSGD(manager, model, optimizer, sync_every=2) as local_sgd: + self.assertEqual(local_sgd._local_step, 0) + torch.testing.assert_close( + local_sgd._backup_parameters, _params_dict(model) + ) + inp = torch.rand(2, 3) + loss = model(inp).mean() + loss.backward() + optimizer.step() + + self.assertEqual(local_sgd._local_step, 1) + self.assertEqual(manager.start_quorum.call_count, 1) + loss = model(inp).mean() + loss.backward() + optimizer.step() + + manager.should_commit.return_value = True + self.assertEqual(local_sgd._local_step, 0) + torch.testing.assert_close( + local_sgd._backup_parameters, _params_dict(model) + ) + self.assertEqual(manager.should_commit.call_count, 1) + self.assertEqual(manager.allreduce.call_count, 4) def test_local_sgd_recovery(self) -> None: - base_m = SimpleModel() - optimizer = optim.SGD(base_m.parameters()) + model = SimpleModel() + optimizer = optim.SGD(model.parameters()) manager = create_autospec(Manager) - m = LocalSGD(manager, base_m, optimizer, sync_every=2) + with LocalSGD(manager, model, optimizer, sync_every=2) as local_sgd: + torch.testing.assert_close( + local_sgd._backup_parameters, _params_dict(model) + ) + og_state_dict = _copy_state_dict(model.state_dict()) + + inp = torch.rand(2, 3) + + loss = model(inp).mean() + loss.backward() + optimizer.step() - torch.testing.assert_close(m._backup_parameters, _params_dict(base_m)) - og_state_dict = _copy_state_dict(base_m.state_dict()) + # Check that the model's state dict has been updated + for name, param in model.state_dict().items(): + # Ensure the parameter has changed + self.assertFalse( + torch.equal(og_state_dict[name], param), + f"Parameter {name} did not change.", + ) + self.assertEqual(local_sgd._local_step, 1) - inp = torch.rand(2, 3) + local_sgd._restore_parameters() + torch.testing.assert_close( + local_sgd._backup_parameters, _params_dict(model) + ) - loss = m(inp).mean() - loss.backward() - optimizer.step() - self.assertEqual(m._local_step, 1) +class DiLoCoTest(TestCase): + def test_diloco_healt(self) -> None: + model = SimpleModel() - state_dict = m.state_dict() - torch.testing.assert_close(state_dict, m._backup_parameters) - torch.testing.assert_close(state_dict, og_state_dict) + # Setup optimizers + inner_optimizer = torch.optim.AdamW( + model.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95) + ) + outer_optimizer = torch.optim.SGD( + model.parameters(), lr=0.7, momentum=0.9, nesterov=True + ) - m.load_state_dict(state_dict) - torch.testing.assert_close(_params_dict(base_m), state_dict) - torch.testing.assert_close(m._backup_parameters, _params_dict(base_m)) + manager = create_autospec(Manager) + with DiLoCo( + manager, model, inner_optimizer, outer_optimizer, sync_every=2 + ) as diloco: + parameter_count = len(list(model.parameters())) + initial_outer_opt_state = outer_optimizer.state_dict() + self.assertEqual(initial_outer_opt_state["state"], {}) + + self.assertEqual(diloco._local_step, 0) + torch.testing.assert_close(diloco._backup_parameters, _params_dict(model)) + inp = torch.rand(2, 3) + loss = model(inp).mean() + loss.backward() + inner_optimizer.step() + + self.assertEqual(diloco._local_step, 1) + self.assertEqual(manager.start_quorum.call_count, 1) + loss = model(inp).mean() + loss.backward() + inner_optimizer.step() + + manager.should_commit.return_value = True + self.assertEqual(diloco._local_step, 0) + torch.testing.assert_close(diloco._backup_parameters, _params_dict(model)) + self.assertEqual(manager.should_commit.call_count, 1) + self.assertEqual(manager.allreduce.call_count, parameter_count) + + outer_opt_state = outer_optimizer.state_dict() + self.assertEqual(len(outer_opt_state["state"]), parameter_count) diff --git a/torchft/manager_integ_test.py b/torchft/manager_integ_test.py index d6e7bde..f095f7e 100644 --- a/torchft/manager_integ_test.py +++ b/torchft/manager_integ_test.py @@ -1,11 +1,11 @@ import logging import threading import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from contextlib import ExitStack, contextmanager +from concurrent.futures import as_completed, ThreadPoolExecutor +from contextlib import contextmanager, ExitStack from dataclasses import dataclass, field from datetime import timedelta -from typing import Dict, Generator, List, Protocol, Set, Tuple +from typing import Dict, Generator, List, Optional, Protocol, Set, Tuple from unittest import TestCase import torch @@ -14,7 +14,7 @@ from torch import nn, optim from torchft.ddp import DistributedDataParallel -from torchft.local_sgd import LocalSGD +from torchft.local_sgd import DiLoCo, LocalSGD from torchft.manager import Manager from torchft.optim import OptimizerWrapper from torchft.process_group import ProcessGroupGloo @@ -227,30 +227,124 @@ def state_dict() -> Dict[str, Dict[str, object]]: m: nn.Module = MyModel() optimizer: optim.Optimizer = optim.Adam(m.parameters()) - m = LocalSGD(manager, m, optimizer, sync_every=2) criterion = nn.CrossEntropyLoss() - while True: - inputs = torch.rand(2, 3) - labels = torch.randint(4, (2,)) + with LocalSGD(manager, m, optimizer, sync_every=2): + while True: + inputs = torch.rand(2, 3) + labels = torch.randint(4, (2,)) - optimizer.zero_grad() - out = m(inputs) - loss = criterion(out, labels) + optimizer.zero_grad() + out = m(inputs) + loss = criterion(out, labels) - loss.backward() + loss.backward() - optimizer.step() + optimizer.step() - if manager.current_step() >= 4: - break + if manager.current_step() >= 4: + break - runner.failure_injector.check(rank, manager.current_step()) + runner.failure_injector.check(rank, manager.current_step()) # return state_dict so we can check consistency return state_dict() +def diloco_train_loop( + rank: int, + store_port: int, + runner: Runner, +) -> Dict[str, Dict[str, object]]: + with ExitStack() as stack: + torch.manual_seed(42) + + # Declare the model and optimizers + m: nn.Module = MyModel() + + # Setup optimizers + inner_optimizer: optim.Optimizer = torch.optim.AdamW( + m.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95) + ) + outer_optimizer: optim.Optimizer = torch.optim.Adam( + m.parameters(), lr=0.7 + ) + + def load_state_dict(state_dict: Dict[str, Dict[str, object]]) -> None: + m.load_state_dict(state_dict["model"]) + inner_optimizer.load_state_dict(state_dict["inner_optim"]) + outer_optimizer.load_state_dict(state_dict["outer_optim"]) + + def state_dict() -> Dict[str, Dict[str, object]]: + return { + "model": m.state_dict(), + "inner_optim": inner_optimizer.state_dict(), + "outer_optim": outer_optimizer.state_dict(), + } + + print(f"worker {runner.replica_id=} {rank=} {runner.world_size=} starting") + + pg = ProcessGroupGloo() + manager = Manager( + pg=pg, + min_replica_size=2, + load_state_dict=load_state_dict, + state_dict=state_dict, + replica_id=str(runner.replica_id), + store_addr="localhost", + store_port=store_port, + rank=rank, + world_size=runner.world_size, + lighthouse_addr=runner.lighthouse_address, + port=19530 + runner.replica_id, + # pyre-fixme[6]: Incompatible parameter type + **runner.manager_args, + ) + stack.callback(manager.shutdown) + + # TODO: where in the training loop should we do this? + # Ensure all models have the same starting state + # We set manual seed so the models start with the same weights + manager.start_quorum() + for param in m.parameters(): + print(f"{param=} vs. {param.data}") + manager.allreduce(param.data) + + criterion = nn.CrossEntropyLoss() + backup_parameters = None + with DiLoCo( + manager, m, inner_optimizer, outer_optimizer, sync_every=2 + ) as diloco: + while True: + print(f"in diloco CM {manager.current_step()=}") + inputs = torch.rand(2, 3) + labels = torch.randint(4, (2,)) + + inner_optimizer.zero_grad() + out = m(inputs) + loss = criterion(out, labels) + + loss.backward() + print("inner optimizer step") + inner_optimizer.step() + + # record backup parameters at sync count 3 + if manager.current_step() == 1: + backup_parameters = diloco._backup_parameters + + # after 4 model updates then break + if manager.current_step() >= 4: + break + + runner.failure_injector.check(rank, manager.current_step()) + + return_state_dict = state_dict() + assert backup_parameters is not None + return_state_dict["backup_parameters"] = backup_parameters + # return state_dict so we can check consistency + return return_state_dict + + class ManagerIntegTest(TestCase): @contextmanager def assertElapsedLessThan( @@ -431,6 +525,89 @@ def test_local_sgd_recovery(self) -> None: self.assertEqual(failure_injectors[1].count, 1) + def test_diloco_healthy(self) -> None: + lighthouse = Lighthouse( + bind="[::]:0", + min_replicas=2, + ) + num_replicas = 2 + futures = [] + + with ThreadPoolExecutor(max_workers=num_replicas) as executor: + for replica_id in range(num_replicas): + failure_injector = FailureInjector() + runner = Runner( + replica_id=replica_id, + lighthouse_address=lighthouse.address(), + failure_injector=failure_injector, + train_loop=diloco_train_loop, + ) + futures.append(executor.submit(runner.run_replica)) + + state_dicts = [] + + for fut in as_completed(futures): + state_dicts.append(fut.result()) + + lighthouse.shutdown() + + for state_dict in state_dicts: + print(state_dict) + torch.testing.assert_close(state_dict, state_dicts[0]) + + def test_diloco_recovery(self) -> None: + lighthouse = Lighthouse( + bind="[::]:0", + min_replicas=2, + ) + num_replicas = 2 + futures = [] + + failure_injectors = [ + FailureInjector(), + FailureInjector().fail_at(0, 2), + ] + + with ThreadPoolExecutor(max_workers=num_replicas) as executor: + for replica_id, failure_injector in zip( + range(num_replicas), failure_injectors + ): + runner = Runner( + replica_id=replica_id, + lighthouse_address=lighthouse.address(), + failure_injector=failure_injector, + train_loop=diloco_train_loop, + manager_args={ + "use_async_quorum": False, + }, + ) + futures.append(executor.submit(runner.run_replica)) + + state_dicts = [] + + for fut in as_completed(futures): + try: + state_dicts.append(fut.result()) + except Exception as e: + print(e) + raise + + lighthouse.shutdown() + + print(state_dicts[0]) + + print(state_dicts[1]) + + # global model states should be equivalent + + # outer optimizers should be equivalent + for state_dict in state_dicts: + torch.testing.assert_close( + state_dict[0]["outer_optim"], state_dicts[0][0]["outer_optim"] + ) + + self.assertEqual(failure_injectors[1].count, 1) + def test_quorum_timeout(self) -> None: with ExitStack() as stack: lighthouse = Lighthouse(