Skip to content

Commit 186865c

Browse files
committed
Fix issues with stale entries
1 parent c26242d commit 186865c

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,9 @@ def get_correct_seq_id(collective):
446446
if c.state != 'scheduled':
447447
continue
448448
rank_counts['appeared'].append(c.file_id)
449-
if get_correct_seq_id(c) <= max_completed_collective_seq_id:
450-
rank_counts['mismatched'].append(c.file_id)
449+
if self.args.use_stale_entries_as_failures:
450+
if get_correct_seq_id(c) <= max_completed_collective_seq_id:
451+
rank_counts['mismatched'].append(c.file_id)
451452
appeared_rank_counts = Counter(rank_counts['appeared'])
452453
# Ranks with fewer enqueued collectives than max_enqueued_collective_seq_id -> host not making expected progress
453454
for rank_id in self.pg_configs[process_group]['ranks']:
@@ -975,6 +976,13 @@ def main():
975976
help='Convert the trace file to json file, if the trace is binary, for debugging',
976977
)
977978

979+
parser.add_argument(
980+
'--use-stale-entries-as-failures',
981+
action='store_true',
982+
help='Use stale entries as failures in the analysis,'
983+
'which are older than the last completed collective for the process group',
984+
)
985+
978986
args = parser.parse_args()
979987

980988
analyzer = CollectiveAnalyzer(args)

0 commit comments

Comments
 (0)