Skip to content

Commit aef0005

Browse files
committed
Fix: Connection test to not just check 200
1 parent 5c7deab commit aef0005

File tree

1 file changed

+47
-9
lines changed

1 file changed

+47
-9
lines changed

src/operator/utils/node_validation_test/connection_validator.py

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,12 @@ class URLTestConfig(pydantic.BaseModel):
3737
method: str = pydantic.Field(default='GET', description='HTTP method to use')
3838
timeout: int = pydantic.Field(
3939
default=30, description='Timeout in seconds for the connection test')
40-
expected_status_code: int = pydantic.Field(
41-
default=200, description='Expected HTTP status code')
40+
expected_status_code: Optional[int] = pydantic.Field(
41+
default=None, description='Expected HTTP status code (None means any non-5xx is success)')
4242
condition_name: Optional[str] = pydantic.Field(
4343
default='ServiceConnectionTestFailure', description='Custom condition name for this URL')
44+
retriable_status_codes: List[int] = pydantic.Field(
45+
default=[429, 503], description='Status codes that should trigger retry')
4446

4547

4648
class ConnectionTestConfig(test_base.NodeTestConfig):
@@ -119,6 +121,13 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition
119121
120122
Returns:
121123
NodeCondition on success, None on failure (to trigger retry/backoff).
124+
125+
Status code handling:
126+
- If expected_status_code is set, only that code is considered success
127+
- If expected_status_code is None (default):
128+
- Codes listed in retriable_status_codes (default: 429, 503) trigger retry
129+
- Any other 5xx indicates service is down
130+
- All remaining codes (2xx, 3xx, and 4xx that are not configured as retriable) indicate service is reachable
122131
"""
123132
try:
124133
logging.info('Testing URL: %s', url_config.url)
@@ -128,21 +137,50 @@ def _connection_test(self, url_config: URLTestConfig) -> test_base.NodeCondition
128137
timeout=url_config.timeout,
129138
)
130139

131-
if response.status_code != url_config.expected_status_code:
132-
logging.error(
133-
'Unexpected status code from %s: %s != %s',
140+
status_code = response.status_code
141+
142+
# If expected_status_code is explicitly set, use strict matching
143+
if url_config.expected_status_code is not None:
144+
if status_code != url_config.expected_status_code:
145+
logging.error(
146+
'Unexpected status code from %s: %s != %s',
147+
url_config.url,
148+
status_code,
149+
url_config.expected_status_code,
150+
)
151+
return None
152+
else:
153+
# Check if status code is retriable (e.g., 429 rate limiting, 503 unavailable)
154+
if status_code in url_config.retriable_status_codes:
155+
logging.warning(
156+
'Retriable status code from %s: %s, will retry',
157+
url_config.url,
158+
status_code,
159+
)
160+
return None
161+
162+
# Any 5xx not already caught by retriable_status_codes is a service failure
163+
if status_code >= 500:
164+
logging.error(
165+
'Service failure status code from %s: %s',
166+
url_config.url,
167+
status_code,
168+
)
169+
return None
170+
171+
# Any other status code (2xx, 3xx, 4xx) means service is reachable
172+
logging.info(
173+
'Service reachable at %s with status code %s',
134174
url_config.url,
135-
response.status_code,
136-
url_config.expected_status_code,
175+
status_code,
137176
)
138-
return None
139177

140178
logging.info('URL test passed: %s (%s)', url_config.url, url_config.condition_name)
141179
return test_base.NodeCondition(
142180
type=url_config.condition_name or self.config.condition_name,
143181
status='False',
144182
reason='ServiceConnectionSuccess',
145-
message=f'Connection test passed: {url_config.url}',
183+
message=f'Connection test passed: {url_config.url} (status: {status_code})',
146184
)
147185
except requests.RequestException as e:
148186
logging.error('Connection test failed for %s: %s', url_config.url, str(e))

0 commit comments

Comments
 (0)