Commit 5283366

Author: dhruv (committed)
feat: Implement ML-based recommendation system for chaos scenarios
1 parent d9671fb commit 5283366

File tree: 5 files changed, +250 -0 lines changed


krkn_ai/cli/cmd.py

Lines changed: 78 additions & 0 deletions
@@ -15,6 +15,8 @@
from krkn_ai.utils.fs import read_config_from_file
from krkn_ai.templates.generator import create_krkn_ai_template
from krkn_ai.utils.cluster_manager import ClusterManager
from krkn_ai.recommendation import ScenarioRecommender
from krkn_ai.utils.prometheus import create_prometheus_client


@click.group(context_settings={"show_default": True})

@@ -201,3 +203,79 @@ def discover(
        f.write(template)

    logger.info("Saved component configuration to %s", output)


@main.command(help="Recommend a Krkn chaos scenario based on telemetry")
@click.option(
    "--prometheus-url",
    help="Prometheus URL to fetch telemetry from.",
    envvar="PROMETHEUS_URL",
    required=False,
)
@click.option(
    "--prometheus-token",
    help="Prometheus Token.",
    envvar="PROMETHEUS_TOKEN",
    required=False,
)
@click.option(
    "--kubeconfig",
    "-k",
    help="Path to cluster kubeconfig file.",
    default=os.getenv("KUBECONFIG", None),
)
@click.option(
    "--model-path",
    "-m",
    help="Path to the trained ML model.",
    default="krkn_model.pkl",
)
@click.option("-v", "--verbose", count=True, help="Increase verbosity of output.")
@click.pass_context
def recommend(
    ctx,
    prometheus_url: str,
    prometheus_token: str,
    kubeconfig: str,
    model_path: str,
    verbose: int = 0,
):
    init_logger(None, verbose >= 2)
    logger = get_logger(__name__)

    if not os.path.exists(model_path):
        logger.error(
            f"Model not found at {model_path}. Please train a model first or specify a valid path."
        )
        exit(1)

    # Set env vars for prometheus client creation if provided explicitly
    if prometheus_url:
        os.environ["PROMETHEUS_URL"] = prometheus_url
    if prometheus_token:
        os.environ["PROMETHEUS_TOKEN"] = prometheus_token

    try:
        # Create prometheus client using existing utility
        prom_client = create_prometheus_client(kubeconfig)

        recommender = ScenarioRecommender(prom_client, model_path)

        # Collect data
        telemetry_df = recommender.collect_telemetry()

        logger.info("Collected Telemetry:\n%s", telemetry_df.to_string(index=False))

        # Predict
        recommendation = recommender.recommend(telemetry_df)

        click.echo(f"\nRecommended Chaos Scenario: {recommendation}")
        logger.info(f"Recommendation: {recommendation}")

    except PrometheusConnectionError as e:
        logger.error(f"Prometheus Connection Error: {e}")
        exit(1)
    except Exception as e:
        logger.exception(f"An error occurred: {e}")
        exit(1)
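
For reference, the new command can be exercised end to end with click's built-in test runner. The sketch below is illustrative only: it assumes the krkn_ai package is installed, that a trained model already exists at krkn_model.pkl, and that the Prometheus URL and token shown are placeholders to be replaced with real values.

# Illustrative smoke test for the recommend command via click.testing.CliRunner.
# The URL, token, and model path are placeholders; a reachable Prometheus and a
# previously trained model are assumed.
from click.testing import CliRunner

from krkn_ai.cli.cmd import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "recommend",
        "--prometheus-url", "https://prometheus.example.com",  # placeholder
        "--prometheus-token", "example-token",                 # placeholder
        "--model-path", "krkn_model.pkl",
    ],
)
print(result.exit_code)  # 0 on success; 1 if the model is missing or Prometheus is unreachable
print(result.output)     # includes "Recommended Chaos Scenario: <scenario>"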

krkn_ai/recommendation/__init__.py

Lines changed: 3 additions & 0 deletions
from .recommender import ScenarioRecommender

__all__ = ["ScenarioRecommender"]

krkn_ai/recommendation/recommender.py

Lines changed: 89 additions & 0 deletions
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from krkn_ai.utils.logger import get_logger
from krkn_ai.utils.prometheus import KrknPrometheus

logger = get_logger(__name__)


class ScenarioRecommender:
    def __init__(self, prom_client: KrknPrometheus, model_path: str = None):
        self.prom_client = prom_client
        self.model_path = model_path
        self.model = None
        self.feature_names = ["cpu_usage", "memory_usage", "network_io"]

        if model_path and os.path.exists(model_path):
            self.load_model(model_path)

    def collect_telemetry(self, duration_minutes: int = 15) -> pd.DataFrame:
        """
        Collects telemetry data from Prometheus for the last N minutes.
        Returns a DataFrame with aggregated metrics.
        """
        logger.info("Collecting telemetry data for recommendation...")

        # Define queries for basic cluster health/state
        queries = {
            "cpu_usage": 'avg(cluster:node:cpu:ratio)',
            "memory_usage": 'avg(cluster:node:memory:utilization:ratio)',
            # Basic network I/O sum across cluster
            "network_io": 'sum(rate(container_network_receive_bytes_total[5m]))'
        }

        data = {}

        # We'll just take the current values for now as a snapshot.
        # In a real system you might want time-series features.
        for name, query in queries.items():
            try:
                # We use process_query to get an instant vector
                result = self.prom_client.process_query(query)
                if result and len(result) > 0 and 'value' in result[0]:
                    # result[0]['value'] is [timestamp, "value"]
                    val = float(result[0]['value'][1])
                    data[name] = val
                else:
                    logger.warning(f"No data found for {name}, defaulting to 0")
                    data[name] = 0.0
            except Exception as e:
                logger.error(f"Failed to query {name}: {e}")
                data[name] = 0.0

        return pd.DataFrame([data])

    def train(self, X: pd.DataFrame, y: list, save_path: str = None):
        """
        Trains the Random Forest model.
        """
        logger.info("Training recommendation model...")
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.model.fit(X[self.feature_names], y)

        if save_path:
            self.save_model(save_path)

    def recommend(self, telemetry_data: pd.DataFrame) -> str:
        """
        Returns a recommended chaos scenario based on telemetry.
        """
        if not self.model:
            raise Exception("Model not loaded or trained.")

        prediction = self.model.predict(telemetry_data[self.feature_names])
        return prediction[0]

    def save_model(self, path: str):
        logger.info(f"Saving model to {path}")
        joblib.dump(self.model, path)
        self.model_path = path

    def load_model(self, path: str):
        logger.info(f"Loading model from {path}")
        try:
            self.model = joblib.load(path)
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
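
ScenarioRecommender is a thin wrapper over scikit-learn and joblib. The standalone sketch below (made-up sample values, hypothetical demo_model.pkl path) illustrates the fit/persist/predict cycle it relies on; selecting columns through the feature-name list is what keeps feature order identical between training and prediction.

# Standalone illustration of the workflow ScenarioRecommender wraps: fit a
# RandomForestClassifier on the three telemetry features, persist it with joblib,
# reload it, and predict on a one-row DataFrame snapshot. All values are made up.
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

feature_names = ["cpu_usage", "memory_usage", "network_io"]

X = pd.DataFrame([
    {"cpu_usage": 0.9, "memory_usage": 0.3, "network_io": 120.0},
    {"cpu_usage": 0.2, "memory_usage": 0.9, "network_io": 80.0},
    {"cpu_usage": 0.3, "memory_usage": 0.4, "network_io": 950.0},
    {"cpu_usage": 0.4, "memory_usage": 0.4, "network_io": 200.0},
])
y = ["cpu-hog", "memory-hog", "network-chaos", "pod-delete"]

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X[feature_names], y)        # column selection fixes the feature order

joblib.dump(model, "demo_model.pkl")  # hypothetical path
restored = joblib.load("demo_model.pkl")

snapshot = pd.DataFrame([{"cpu_usage": 0.95, "memory_usage": 0.2, "network_io": 100.0}])
print(restored.predict(snapshot[feature_names])[0])  # most likely "cpu-hog" here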

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ click
krkn-lib@git+https://github.com/krkn-chaos/krkn-lib
pyyaml
seaborn
scikit-learn

scripts/train_model.py

Lines changed: 79 additions & 0 deletions
import os
import sys
import pandas as pd
import numpy as np

# Add parent directory to path to allow importing krkn_ai
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from krkn_ai.recommendation import ScenarioRecommender
from krkn_ai.utils.logger import init_logger, get_logger

init_logger(None, True)
logger = get_logger("train_model")


def generate_synthetic_data(n_samples=1000):
    """
    Generates synthetic telemetry data with labeled chaos scenarios.

    Logic:
    - High CPU, Low Memory -> cpu-hog
    - Low CPU, High Memory -> memory-hog
    - High Network -> network-chaos
    - Balanced/Normal -> random/pod-delete (as a generic fallback)
    """
    data = []
    labels = []

    for _ in range(n_samples):
        # Generate random base metrics
        cpu = np.random.uniform(0, 1)  # 0 to 100% normalized
        memory = np.random.uniform(0, 1)  # 0 to 100% normalized
        network = np.random.uniform(0, 1000)  # MB/s roughly

        # Rule-based labeling for "ground truth"
        if cpu > 0.8 and memory < 0.5:
            label = "cpu-hog"
        elif memory > 0.8 and cpu < 0.5:
            label = "memory-hog"
        elif network > 800:
            label = "network-chaos"
        else:
            # If nothing stands out, maybe suggest checking general resilience
            label = "pod-delete"

        data.append({
            "cpu_usage": cpu,
            "memory_usage": memory,
            "network_io": network
        })
        labels.append(label)

    return pd.DataFrame(data), labels


def main():
    logger.info("Generating synthetic data...")
    X, y = generate_synthetic_data()

    target_path = "krkn_model.pkl"

    # Initialize recommender (mocking prom client as None for training)
    recommender = ScenarioRecommender(prom_client=None)

    logger.info(f"Training model on {len(X)} samples...")
    recommender.train(X, y, save_path=target_path)

    logger.info(f"Model saved to {target_path}")

    # Test a prediction
    test_sample = pd.DataFrame([{
        "cpu_usage": 0.95,
        "memory_usage": 0.2,
        "network_io": 100
    }])
    prediction = recommender.recommend(test_sample)
    logger.info(f"Test Prediction for High CPU: {prediction} (Expected: cpu-hog)")


if __name__ == "__main__":
    main()
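
Because the labels come from simple thresholds, it is straightforward to sanity-check how closely the trained forest reproduces them. A rough agreement check, assuming scripts/train_model.py has already been run from the repository root (so krkn_model.pkl exists there) and that the scripts directory is importable from that location (the import path is an assumption):

# Rough sanity check: regenerate labeled synthetic data and measure how often the
# trained forest agrees with the rule-based labels. Assumes krkn_model.pkl was
# written by scripts/train_model.py and that this is run from the repository root.
import joblib
from sklearn.metrics import accuracy_score

from scripts.train_model import generate_synthetic_data  # assumed import path

model = joblib.load("krkn_model.pkl")
X_test, y_test = generate_synthetic_data(n_samples=200)
y_pred = model.predict(X_test[["cpu_usage", "memory_usage", "network_io"]])
print(f"Agreement with rule-based labels: {accuracy_score(y_test, y_pred):.2%}")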
