
Commit 5153dbe

Author: Amarnath K (committed)
[TFA] tier-2_cephfs_test-metrics suite failures
Signed-off-by: Amarnath K <[email protected]>
1 parent e905c8c commit 5153dbe

3 files changed: +180 -71 lines changed

tests/cephfs/cephfs_metrics/cephfs_metrics_scale.py

Lines changed: 163 additions & 56 deletions
@@ -1,11 +1,14 @@
+import json
 import random
 import string
 import time
 import traceback
+from json import JSONDecodeError

 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils
 from utility.log import Log
+from utility.retry import retry

 log = Log(__name__)

@@ -24,6 +27,109 @@ class Metrics_Value_Not_Matching(Exception):
     pass


+@retry((JSONDecodeError, CommandFailed), tries=5, delay=5)
+def get_client_id(client, rank=0, mounted_dir="", fs_name="cephfs"):
+    ranked_mds, _ = client.exec_command(
+        sudo=True,
+        cmd=f"ceph fs status {fs_name} -f json | jq '.mdsmap[] | select(.rank == {rank}) | .name'",
+    )
+    log.info("Executing MDS name with rank command: %s", ranked_mds)
+    ranked_mds = ranked_mds.replace('"', "").replace("\n", "")
+    client_id_cmd = (
+        f"ceph tell mds.{ranked_mds} session ls | jq '.[] | select(.client_metadata.mount_point"
+        f' != null and (.client_metadata.mount_point | contains("{mounted_dir}"))) | .id\''
+    )
+    log.info(f"Executing Client ID Command : {client_id_cmd}")
+    client_id, _ = client.exec_command(sudo=True, cmd=client_id_cmd)
+    client_id = client_id.replace('"', "").replace("\n", "")
+    if client_id == "":
+        log.error(f"Client not found for Mounted Directory : {mounted_dir}")
+        raise CommandFailed(f"Client not found for Mounted Directory : {mounted_dir}")
+    log.info(f"Client ID :[{client_id}] for Mounted Directory : [{mounted_dir}]")
+    return client_id, rank
+
+
+@retry((JSONDecodeError, CommandFailed), tries=5, delay=10)
+def get_mds_metrics_for_client(
+    client, client_id, rank, mds_rank=0, mounted_dir="", fs_name="cephfs"
+):
+    ranked_mds, _ = client.exec_command(
+        sudo=True,
+        cmd=f"ceph fs status {fs_name} -f json | jq '.mdsmap[] | select(.rank == {mds_rank}) | .name'",
+    )
+    log.info(f"Executing MDS name with rank command: {ranked_mds}")
+    ranked_mds = ranked_mds.replace('"', "").replace("\n", "")
+    log.info(f"Client ID :[{client_id}] for Mounted Directory : [{mounted_dir}]")
+    cmd = f""" ceph tell mds.{ranked_mds} counter dump 2>/dev/null | \
+        jq -r '. | to_entries | map(select(.key | match("mds_client_metrics"))) | \
+        .[].value[] | select(.labels.client != null and (.labels.client | contains("{client_id}"))
+        and (.labels.rank == "{rank}"))'
+    """
+    metrics_out, _ = client.exec_command(sudo=True, cmd=cmd)
+    log.info(
+        f"Metrics for MDS : {ranked_mds} Mounted Directory: {mounted_dir} and Client : {client_id} is {metrics_out}"
+    )
+    if metrics_out == "":
+        log.error(f"Metrics not found for MDS : {ranked_mds}")
+        raise CommandFailed(f"Client not found for Mounted Directory : {mounted_dir}")
+    metrics_out = json.loads(str(metrics_out))
+    return metrics_out
+
+
+def get_mds_metrics_from_ranks(ranks, fs_util, client, mount_dir, cephfs):
+    """
+    Try fetching MDS metrics for the given client and mount_dir from the list of ranks.
+
+    Returns:
+        dict: MDS metrics if found.
+
+    Raises:
+        CommandFailed: If no metrics are found from any rank.
+    """
+    client_id = None
+    client_rank = None
+
+    # Step 1: Get client_id from one of the MDS ranks
+    for rank in ranks:
+        try:
+            client_id, client_rank = get_client_id(client, rank, mount_dir, cephfs)
+            if client_id:
+                log.info(f"Found client ID '{client_id}' from rank {client_rank}")
+                break
+        except Exception as e:
+            log.warning(
+                f"Rank {rank}: Failed to get client ID for mount {mount_dir}: {e}"
+            )
+            continue
+
+    if not client_id:
+        raise CommandFailed(f"Client not found in any MDS ranks for mount {mount_dir}")
+
+    # Step 2: Use client_id and try to collect metrics from all MDS ranks
+    for rank in ranks:
+        try:
+            mds_metric = get_mds_metrics_for_client(
+                client,
+                client_id,
+                client_rank,
+                mds_rank=rank,
+                mounted_dir=mount_dir,
+                fs_name=cephfs,
+            )
+            if mds_metric and mds_metric != 1:
+                log.info(f"Successfully got MDS metrics from rank {rank}")
+                return mds_metric
+        except Exception as e:
+            log.warning(
+                f"Rank {rank}: Failed to fetch metrics for client {client_id}: {e}"
+            )
+            continue
+
+    raise CommandFailed(
+        f"Metrics not found for client {client_id} in any of the MDS ranks for mount {mount_dir}"
+    )
+
+
 def run(ceph_cluster, **kw):
     try:
         tc = "CEPH-83588355"
@@ -104,17 +210,18 @@ def run(ceph_cluster, **kw):
         fs_util.fuse_mount([client4], fuse_mounting_dir_4)
         # Get initial MDS metrics
         # pdb.set_trace()
-        mds_metric_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+
+        mds_metric_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )
-        mds_metric_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        mds_metric_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )
-        mds_metric_client3 = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        mds_metric_client3 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )
-        mds_metric_client4 = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        mds_metric_client4 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )

         log.info(f"mds_metric_client1: {mds_metric_client1}")
@@ -132,11 +239,11 @@ def run(ceph_cluster, **kw):
         inode_list = ["opened_inodes", "pinned_icaps", "total_inodes"]

         # Get initial inode metrics for client1 and client3
-        client1_pre_inode_metrics = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        client1_pre_inode_metrics = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )
-        client3_pre_inode_metrics = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        client3_pre_inode_metrics = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )
         log.info(f"client1_pre_inode_metrics: {client1_pre_inode_metrics}")
         log.info(f"client3_pre_inode_metrics: {client3_pre_inode_metrics}")
@@ -165,11 +272,11 @@ def run(ceph_cluster, **kw):

         log.info("Writing files is done for client1 and client3")
         log.info("Get metrics only for client1 and client3")
-        client1_post_inode_metrics = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        client1_post_inode_metrics = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )
-        client3_post_inode_metrics = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        client3_post_inode_metrics = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )
         log.info(f"client1_post_inode_metrics: {client1_post_inode_metrics}")
         log.info(f"client3_post_inode_metrics: {client3_post_inode_metrics}")
@@ -197,11 +304,11 @@ def run(ceph_cluster, **kw):
         )
         file_paths_client2 = []
         file_paths_client4 = []
-        pre_opened_files_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        pre_opened_files_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]["opened_files"]
-        pre_opened_files_client4 = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        pre_opened_files_client4 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )["counters"]["opened_files"]
         log.info(f"pre_opened_files_client2: {pre_opened_files_client2}")
         log.info(f"pre_opened_files_client4: {pre_opened_files_client4}")
@@ -240,11 +347,11 @@ def run(ceph_cluster, **kw):
         log.info(f"Number of PID4s from opening files: {pids4}")
         time.sleep(10)
         log.info("Get final MDS metrics after opening files")
-        client2_post_opened_files = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        client2_post_opened_files = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]["opened_files"]
-        client4_post_opened_files = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        client4_post_opened_files = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )["counters"]["opened_files"]
         log.info(f"client2_post_opened_files: {client2_post_opened_files}")
         log.info(f"client4_post_opened_files: {client4_post_opened_files}")
@@ -268,11 +375,11 @@ def run(ceph_cluster, **kw):
             log.error(f"Failed to kill tail processes: {e}")
         time.sleep(5)
         log.info("Get final MDS metrics after killing the PIDs")
-        post_opened_files_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        post_opened_files_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]["opened_files"]
-        post_opened_files_client4 = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        post_opened_files_client4 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )["counters"]["opened_files"]
         log.info(f"post_opened_files_client2: {post_opened_files_client2}")
         log.info(f"post_opened_files_client4: {post_opened_files_client4}")
@@ -288,11 +395,11 @@ def run(ceph_cluster, **kw):
                 "Failed to verify opened_files for client4"
             )
         log.info("Verify if other clients opened_files metrics remain same")
-        post_opened_files_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        post_opened_files_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )["counters"]["opened_files"]
-        post_opened_files_client3 = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        post_opened_files_client3 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )["counters"]["opened_files"]
         log.info(f"post_opened_files_client1: {post_opened_files_client1}")
         log.info(f"post_opened_files_client3: {post_opened_files_client3}")
@@ -315,17 +422,17 @@ def run(ceph_cluster, **kw):
         log.info(
             "Increase only Client2 and Client4 dentry metrics and other clients should remain same"
         )
-        pre_dentry_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        pre_dentry_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )["counters"]
-        pre_dentry_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        pre_dentry_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]
-        pre_dentry_client3 = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        pre_dentry_client3 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )["counters"]
-        pre_dentry_client4 = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        pre_dentry_client4 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )["counters"]
         log.info(f"pre_dentry_client2: {pre_dentry_client2}")
         log.info(f"pre_dentry_client4: {pre_dentry_client4}")
@@ -368,11 +475,11 @@ def run(ceph_cluster, **kw):
             client4.exec_command(sudo=True, cmd=f"ls {fuse_mounting_dir_4}/{dir}/")
         time.sleep(5)
         log.info("Get final MDS metrics after creating directories and files")
-        post_dentry_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        post_dentry_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]
-        post_dentry_client4 = fs_util.get_mds_metrics(
-            client4, 0, fuse_mounting_dir_4, cephfs
+        post_dentry_client4 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client4, fuse_mounting_dir_4, cephfs
         )["counters"]
         log.info(f"post_dentry_client2: {post_dentry_client2}")
         log.info(f"post_dentry_client4: {post_dentry_client4}")
@@ -388,11 +495,11 @@ def run(ceph_cluster, **kw):
                 f"Failed to verify {dentry} for client4"
             )
         log.info("Verify if other clients dentry metrics remain same")
-        post_dentry_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        post_dentry_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )["counters"]
-        post_dentry_client3 = fs_util.get_mds_metrics(
-            client3, 0, fuse_mounting_dir_3, cephfs
+        post_dentry_client3 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client3, fuse_mounting_dir_3, cephfs
         )["counters"]
         log.info(f"post_dentry_client1: {post_dentry_client1}")
         log.info(f"post_dentry_client3: {post_dentry_client3}")
@@ -415,11 +522,11 @@ def run(ceph_cluster, **kw):
         )
         # Using scp from client1 to client2, it will increase total_read_ops and total_read_size in client1
         # In client2, it will increase total_write_ops and total_write_size
-        pre_read_ops_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        pre_read_ops_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )["counters"]
-        pre_write_ops_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        pre_write_ops_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]
         log.info(f"pre_read_ops_client1: {pre_read_ops_client1}")
         log.info(f"pre_write_ops_client2: {pre_write_ops_client2}")
@@ -456,11 +563,11 @@ def run(ceph_cluster, **kw):
             "total_write_size",
         ]
         log.info("Get final MDS metrics after copying a file")
-        post_read_ops_client1 = fs_util.get_mds_metrics(
-            client1, 0, fuse_mounting_dir_1, cephfs
+        post_read_ops_client1 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client1, fuse_mounting_dir_1, cephfs
         )["counters"]
-        post_write_ops_client2 = fs_util.get_mds_metrics(
-            client2, 0, fuse_mounting_dir_2, cephfs
+        post_write_ops_client2 = get_mds_metrics_from_ranks(
+            [0, 1, 2], fs_util, client2, fuse_mounting_dir_2, cephfs
         )["counters"]
         log.info(f"post_read_ops_client1: {post_read_ops_client1}")
         log.info(f"post_write_ops_client2: {post_write_ops_client2}")

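Note: the new helpers above lean on the repo-internal `utility.retry.retry` decorator, applied as `@retry((JSONDecodeError, CommandFailed), tries=5, delay=5)`. For readers without access to that module, a minimal sketch of a decorator with the signature these call sites assume (an exception tuple plus `tries`/`delay` keywords) might look like the following; this is an illustration under those assumptions, not the repository's actual implementation.

import time
from functools import wraps


def retry(exceptions, tries=5, delay=5):
    """Illustrative retry decorator matching the call sites above (assumed semantics).

    Re-runs the wrapped callable up to `tries` times, sleeping `delay` seconds
    between attempts. Only the exception types passed in are retried; once the
    attempts are exhausted, the last exception is re-raised.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_exc = None
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as exc:  # only listed exception types trigger a retry
                    last_exc = exc
                    if attempt < tries:
                        time.sleep(delay)
            raise last_exc

        return wrapper

    return decorator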
tests/cephfs/lib/cephfs_common_lib.py

Lines changed: 9 additions & 2 deletions
@@ -45,13 +45,20 @@ def wait_for_healthy_ceph(self, client, wait_time):
         """
         ceph_healthy = 0
         end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
+        accepted_list = [
+            "experiencing slow operations in BlueStore",
+            "Slow OSD heartbeats",
+        ]
         while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
             if self.check_ceph_status(client, "HEALTH_OK"):
                 ceph_healthy = 1
             else:
                 out, _ = client.exec_command(sudo=True, cmd="ceph health detail")
-                if "experiencing slow operations in BlueStore" in str(out):
-                    log.info("Ignoring the known warning for Bluestore Slow ops")
+                if any(msg in str(out) for msg in accepted_list):
+                    log.info(
+                        "Ignoring the known warning for Bluestore Slow ops and OSD heartbeats"
+                    )
+                    log.warning("Cluster health can be OK, current state : %s", out)
                     ceph_healthy = 1
                 else:
                     log.info(

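Note: the health-check change above reduces to a substring test over a small allow-list of tolerated warnings. A self-contained sketch of that pattern follows; the sample health strings are illustrative, not captured cluster output. One design consequence of the committed logic is that the state is treated as acceptable as soon as any one tolerated substring appears in `ceph health detail`, even if other warnings are present alongside it.

ACCEPTED_WARNINGS = [
    "experiencing slow operations in BlueStore",
    "Slow OSD heartbeats",
]


def is_tolerated(health_detail: str) -> bool:
    """Return True when at least one tolerated warning appears in the health output."""
    return any(msg in health_detail for msg in ACCEPTED_WARNINGS)


# Illustrative usage with made-up health lines:
print(is_tolerated("HEALTH_WARN ... OSDs experiencing slow operations in BlueStore"))  # True
print(is_tolerated("HEALTH_WARN 1 pool(s) nearfull"))  # False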