Switch from flaky to pytest-rerunfailures for test retries

alexander-akhmetov · alexander-akhmetov · commit bc33f3b3eb20 · 2026-03-05T10:40:05.000+01:00
The flaky plugin doesn't work with async tests — it never retries them.
Meanwhile, pytest-rerunfailures (pulled in by deepeval) hijacks the
@pytest.mark.flaky marker and defaults to 1 rerun because it doesn't
understand max_runs. The two plugins conflict on the same firstresult
hook (pytest_runtest_protocol).

Replace all max_runs=3 with reruns=2 (1 initial run + 2 reruns, i.e. the
same 3 total attempts), disable the flaky plugin, and remove it from
dependencies.
diff --git a/tests/admin_test.py b/tests/admin_test.py
@@ -71,7 +71,7 @@ async def grafana_team():
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_list_users_by_org(
     model: str,
     mcp_client: ClientSession,
@@ -97,7 +97,7 @@ async def test_list_users_by_org(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_list_teams(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/clickhouse_test.py b/tests/clickhouse_test.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_clickhouse_list_tables(
     model: str,
     mcp_client: ClientSession,
@@ -36,7 +36,7 @@ async def test_clickhouse_list_tables(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_clickhouse_describe_table(
     model: str,
     mcp_client: ClientSession,
@@ -64,7 +64,7 @@ async def test_clickhouse_describe_table(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_clickhouse_query_logs(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/cloudwatch_test.py b/tests/cloudwatch_test.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_cloudwatch_list_namespaces(
     model: str,
     mcp_client: ClientSession,
@@ -33,7 +33,7 @@ async def test_cloudwatch_list_namespaces(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_cloudwatch_list_metrics(
     model: str,
     mcp_client: ClientSession,
@@ -58,7 +58,7 @@ async def test_cloudwatch_list_metrics(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_cloudwatch_query_metrics(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/dashboards_test.py b/tests/dashboards_test.py
@@ -10,7 +10,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_dashboard_panel_queries_tool(
     model: str,
     mcp_client: ClientSession,
@@ -40,7 +40,7 @@ async def test_dashboard_panel_queries_tool(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_dashboard_update_with_patch_operations(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/elasticsearch_test.py b/tests/elasticsearch_test.py
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_elasticsearch_query_logs(
     model: str,
     mcp_client: ClientSession,
@@ -37,7 +37,7 @@ async def test_elasticsearch_query_logs(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_elasticsearch_query_errors(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/loki_test.py b/tests/loki_test.py
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_loki_logs_tool(
     model: str,
     mcp_client: ClientSession,
@@ -37,7 +37,7 @@ async def test_loki_logs_tool(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_loki_container_labels(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/navigation_test.py b/tests/navigation_test.py
@@ -61,7 +61,7 @@ async def _run_deeplink_test_with_expected_args(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_generate_dashboard_deeplink(
     model: str,
     mcp_client: ClientSession,
@@ -80,7 +80,7 @@ async def test_generate_dashboard_deeplink(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_generate_panel_deeplink(
     model: str,
     mcp_client: ClientSession,
@@ -103,7 +103,7 @@ async def test_generate_panel_deeplink(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_generate_explore_deeplink(
     model: str,
     mcp_client: ClientSession,
@@ -122,7 +122,7 @@ async def test_generate_explore_deeplink(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_generate_deeplink_with_time_range(
     model: str,
     mcp_client: ClientSession,
@@ -145,7 +145,7 @@ async def test_generate_deeplink_with_time_range(
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_generate_deeplink_with_query_params(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/pyproject.toml b/tests/pyproject.toml
@@ -15,11 +15,11 @@ dependencies = []
 dev = [
     "anyio>=4.9.0",
     "deepeval>=1.0.0",
-    "flaky>=3.8.1",
     "litellm>=1.63.12",
     "mcp>=1.9.3",
     "pytest>=8.3.5",
     "python-dotenv>=1.0.0",
 ]
 
 [tool.pytest.ini_options]
+addopts = "-p no:flaky"
diff --git a/tests/rendering_test.py b/tests/rendering_test.py
@@ -10,7 +10,7 @@
 
 
 @pytest.mark.parametrize("model", models)
-@pytest.mark.flaky(max_runs=3)
+@pytest.mark.flaky(reruns=2)
 async def test_get_panel_image(
     model: str,
     mcp_client: ClientSession,
diff --git a/tests/tempo_test.py b/tests/tempo_test.py
@@ -200,7 +200,7 @@ class TestTempoProxiedToolsWithLLM:
     """LLM integration tests for Tempo proxied tools."""
 
     @pytest.mark.parametrize("model", models)
-    @pytest.mark.flaky(max_runs=3)
+    @pytest.mark.flaky(reruns=2)
     async def test_llm_can_list_trace_attributes(
         self, model: str, mcp_client: ClientSession, mcp_transport: str
     ):
diff --git a/tests/uv.lock b/tests/uv.lock