WIP: cloud GPU training

Wonters · Wonters · commit d6aed1c3790c · 2025-04-22T16:38:01.000+02:00
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -36,10 +36,10 @@ services:
     volumes:
       - grafana-storage:/var/lib/grafana
       - ./grafana/dashboards:/etc/grafana/dashboards
-      - ./grafana/dashboards.yml:/etc/grafana/provisioning/dashboards/tweet_dashboards.yml
+      - ./grafana/dashboards.yaml:/etc/grafana/provisioning/dashboards/tweet_dashboards.yaml
       - ./grafana/datasources:/etc/grafana/provisioning/datasources
       - ./grafana/alerting:/etc/grafana/alerting
-      - ./grafana/alerting.yml:/etc/grafana/provisioning/alerting/tweet_alerts.yml
+      - ./grafana/alertings.yaml:/etc/grafana/provisioning/alerting/tweet_alerts.yaml
     environment:
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/dashboards/tweet_dashboard.json
diff --git a/grafana/alerting/rules.json b/grafana/alerting/rules.json
@@ -0,0 +1,95 @@
+{
+    "apiVersion": 1,
+    "groups": [
+        {
+            "orgId": 1,
+            "name": "job alert",
+            "folder": "bert",
+            "interval": "1m",
+            "rules": [
+                {
+                    "uid": "dej6cljqgxekge",
+                    "title": "Alert max jobs",
+                    "condition": "C",
+                    "data": [
+                        {
+                            "refId": "A",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "cej41rlo2r11cd",
+                            "model": {
+                                "disableTextWrap": false,
+                                "editorMode": "builder",
+                                "expr": "prediction_status{model=\"bert\"}",
+                                "fullMetaSearch": false,
+                                "includeNullMetadata": true,
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "A",
+                                "useBackend": false
+                            }
+                        },
+                        {
+                            "refId": "C",
+                            "relativeTimeRange": {
+                                "from": 0,
+                                "to": 0
+                            },
+                            "datasourceUid": "__expr__",
+                            "model": {
+                                "conditions": [
+                                    {
+                                        "evaluator": {
+                                            "params": [
+                                                5
+                                            ],
+                                            "type": "gt"
+                                        },
+                                        "operator": {
+                                            "type": "and"
+                                        },
+                                        "query": {
+                                            "params": [
+                                                "C"
+                                            ]
+                                        },
+                                        "reducer": {
+                                            "params": [],
+                                            "type": "last"
+                                        },
+                                        "type": "query"
+                                    }
+                                ],
+                                "datasource": {
+                                    "type": "__expr__",
+                                    "uid": "__expr__"
+                                },
+                                "expression": "A",
+                                "intervalMs": 1000,
+                                "maxDataPoints": 43200,
+                                "refId": "C",
+                                "type": "threshold"
+                            }
+                        }
+                    ],
+                    "noDataState": "NoData",
+                    "execErrState": "Error",
+                    "for": "1m",
+                    "annotations": {
+                        "description": "Task is up to 5 jobs",
+                        "summary": "Alert Sentiment analysis"
+                    },
+                    "isPaused": false,
+                    "notification_settings": {
+                        "receiver": "admin-email"
+                    }
+                }
+            ]
+        }
+    ]
+}
diff --git a/grafana/alerting/rules.yaml b/grafana/alerting/rules.yaml
diff --git a/grafana/alertings.yaml b/grafana/alertings.yaml
@@ -1,9 +1,9 @@
-apiVersion: 1
+# apiVersion: 1
 
-groups:
-  - name: alert-rules
-    folder: Tweet Alerts
-    orgId: 1
-    interval: 30s
-    rules:
-      - file: /etc/grafana/alerting/rules.yml
+# groups:
+#   - name: alert-rules
+#     folder: Tweet Alerts
+#     orgId: 1
+#     interval: 30s
+#     rules:
+#       - file: /etc/grafana/alerting/rules.json
diff --git a/grafana/datasources/datasources.yaml b/grafana/datasources/datasources.yaml
@@ -3,6 +3,7 @@ apiVersion: 1
 datasources:
   - name: Prometheus
     type: prometheus
+    uid: DS_TWEET_SENTIMENT SERVER METRICS
     access: proxy
     orgId: 1
     url: http://prometheus:9090
@@ -11,6 +12,7 @@ datasources:
 
   - name: Loki
     type: loki
+    uid: DS_LOKI
     access: proxy
     orgId: 1
     url: http://loki:3100
diff --git a/mlruns/0/meta.yaml b/mlruns/0/meta.yaml
@@ -1,4 +1,4 @@
-artifact_location: file:///workspace/sentimental_analyses/mlruns/0
+artifact_location: file:///Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/mlruns/0
 creation_time: 1744984721760
 experiment_id: '0'
 last_update_time: 1744984721760
diff --git a/requirements.txt b/requirements.txt
@@ -101,4 +101,5 @@ scikit-optimize==0.10.2
 lightgbm==4.1.0
 pymongo==4.12.0
 prometheus-client==0.21.1
+optuna==3.4.0
 
diff --git a/scripts/cloud-gpu.sh b/scripts/cloud-gpu.sh
@@ -0,0 +1,5 @@
+#! /bin/bash
+sudo apt-get install gcc make -y
+sudo apt install build-essential linux-headers-$(uname -r)
+wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+sudo sh ./cuda_11.8.0_520.61.05_linux.run --silent --driver --toolkit
diff --git a/src/mixins.py b/src/mixins.py
@@ -64,6 +64,7 @@ def _train_batch(self, x, y):
         self.optimizer.zero_grad()
         inputs = inputs.to(self.device)
         labels = labels.to(self.device)
+        # Pass input_ids and attention masks to the model as keyword arguments
         outputs = self.model(**inputs)
         try:
             loss = self.criterion(outputs.logits, labels)
diff --git a/src/ml.py b/src/ml.py
@@ -1,6 +1,8 @@
 import numpy as np
 import joblib
 import os 
+import re
+import string
 from functools import partial
 from pathlib import Path
 from abc import ABC
@@ -13,7 +15,7 @@
 from torch.utils.data import DataLoader
 import torch.nn.functional as F
 from sklearn.linear_model import LogisticRegression
-from lightgbm import LGBMClassifier
+import lightgbm as lgm
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.metrics import confusion_matrix, classification_report
@@ -112,7 +114,6 @@ def init_mlflow(self, name:str = ""):
         self.run = mlflow.start_run(run_name=name if name else self.name)
         self.run_id = self.run.info.run_id
 
-
     def load_checkpoint(self) -> object:
         """
         Logic to load the model from a checkpoint or create a new one
@@ -147,7 +148,7 @@ def confusion_matrix(self):
             plt.ylabel("Cluster réels")
             plt.savefig(f.name)
             plt.close()
-            mlflow.log_artifact(f.name)#, "confusion_matrix.png")
+            mlflow.log_artifact(f.name, "confusion_matrix.png")
 
     def train(self):
         """
@@ -170,6 +171,13 @@ def train(self):
                 signature=signature,
                 registered_model_name=f"{self.name}-quickstart"
             )
+        elif isinstance(self.model, lgm.LGBMClassifier):
+            mlflow.lightgbm.log_model(
+                lgb_model=self.model,
+                artifact_path=self.name,
+                signature=signature,
+                registered_model_name=f"{self.name}-quickstart",
+            )
         else:
             mlflow.sklearn.log_model(
                 sk_model=self.model,
@@ -189,7 +197,7 @@ def predict(self, x: Union[pd.Series, np.ndarray]):
 class SklearnBaseModel(BaseModelABC):
     def log_metrics(self):
         super().log_metrics()
-        mlflow.sklearn.log_model(self.model, self.name)
+        #mlflow.sklearn.log_model(self.model, self.name)
         mlflow.log_params(self.model.get_params())
 
 class TorchBaseModel(TorchModelTrainMixin, BaseModelABC):
@@ -257,27 +265,34 @@ class LightGBMModel(SklearnBaseModel):
     def log_metrics(self):
         """Callback pour logger les métriques dans MLflow"""
         super().log_metrics()
-        mlflow.log_metric("oob_score", self.model.oob_score_)
+
 
     def init_items(self):
-        self.model = LGBMClassifier(
-            n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42
+        self.model = lgm.LGBMClassifier(
+            #n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42
         )
         self.tokenizer = self.tokenizer_class(
-            max_features=1000, ngram_range=(1, 2), binary=True
+            max_features=1000, min_df=2, max_df=0.95
         )
 
+    def clean(self, tweet):
+        translator = str.maketrans('','', string.punctuation)
+        tweet = tweet.translate(translator)
+        tweet = re.sub("^[a-z][A-Z]", " ",tweet)
+        tweet = tweet.lower()
+        tweet = ' '.join(tweet.split())
+        return tweet
+
     def train(self):
         """
         Train the Random Forest model with progress tracking
         """
         self.init_mlflow()
         try:
             # Vectorisation du texte
-            X_train = self.tokenizer.fit_transform(self.x_train)
+            X_train = self.tokenizer.fit_transform(self.x_train.apply(self.clean))
             self.model.fit(X_train, self.y_train)
             super().train()
-            # self.model.n_estimators += 10
         except Exception as e:
             logger.error(f"Erreur pendant l'entraînement: {str(e)}")
             raise
@@ -513,4 +528,5 @@ def load_data(path):
     df_tweets = pd.read_csv(path, names=headers, encoding="latin-1")
     # On prend target 0 negatif 1 positif
     df_tweets.loc[:, "target"] = df_tweets.target.map({0: int(0), 4: int(1)})
+
     return df_tweets
diff --git a/src/tests/tests.py b/src/tests/tests.py
@@ -1,6 +1,6 @@
+import lightgbm
 import pytest
 import time
-import os
 from fastapi.testclient import TestClient
 from ..ml import (
     LogisticRegressionModel,
diff --git a/train.py b/train.py
@@ -1,11 +1,8 @@
-from src.ml import load_data, RandomForestModel, LogisticRegressionModel, BertModel, RobertaModel, LSTMModel
-import logging
-from rich.logging import RichHandler
+# Segfault workaround: import lightgbm before the other imports (currently disabled)
+#import lightgbm as lgb
+from src.ml import LightGBMModel, load_data, LSTMModel
 
-logging.basicConfig(
-    level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
-)
-file = "../data/training.1600000.processed.noemoticon.csv"
-original_df = load_data(file)
-model = LSTMModel(original_df)
-model.optuna_train(n_trials=5, frac=0.001)
+df  = load_data('../data/training.1600000.processed.noemoticon.csv')
+#df = df.sample(frac=1, random_state=42)
+model = LSTMModel(df)
+model.train()

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-artifact_location: file:///workspace/sentimental_analyses/mlruns/0`
	`1`	`+artifact_location: file:///Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/mlruns/0`
`2`	`2`	`creation_time: 1744984721760`
`3`	`3`	`experiment_id: '0'`
`4`	`4`	`last_update_time: 1744984721760`