WIP

Wonters · Wonters · commit 17f20d412401 · 2025-04-04T15:39:48.000+02:00
diff --git a/.github/workflows/deploy-action.yml b/.github/workflows/deploy-action.yml
@@ -0,0 +1,20 @@
+# CI/CD pipeline: run the test suite on pushes to develop/main, then deploy main.
+name: Deploy alerting
+run-name: ${{ github.actor }} is testing out GitHub Actions 🚀
+on:
+  push:
+    # Jobs have no `branch:` key; branch filtering belongs on the trigger
+    # (and on a job-level `if:` for the deploy job below).
+    branches: [develop, main]
+jobs:
+  test:
+    # NOTE(review): `python` is not a GitHub-hosted runner label; this assumes a
+    # self-hosted runner registered with that label — confirm.
+    runs-on: python
+    steps:
+      # The repository must be checked out before requirements.txt and tests.py exist.
+      - uses: actions/checkout@v4
+      - run: pip install -r requirements.txt
+      - run: pytest --version
+      - run: pytest tests.py
+  deploy:
+    # Deploy only once the tests have passed, and only from main.
+    needs: test
+    if: github.ref == 'refs/heads/main'
+    # NOTE(review): `docker` is likewise assumed to be a self-hosted runner label — confirm.
+    runs-on: docker
+    steps:
+      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
+      - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
+      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
+      - run: echo "🖥️ The workflow is now ready to test your code on the runner."
+      # NOTE(review): logs in as root with a key stored on the runner; consider a
+      # dedicated deploy user and a key injected from repository secrets.
+      - run: ssh root@38.0.101.76 -i ~/.ssh/id_rsa "cd /home/ubuntu/sentimental_analyses && git pull origin main && ./install.sh"
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,7 @@
+# Official slim CPython image. The bare name `python-slim` is not a published
+# image; 3.11 matches the interpreter recorded in the MLflow model metadata.
+FROM python:3.11-slim
+LABEL authors="wonters"
+# procps supplies ps/top inside the container; the apt lists are removed to
+# keep the layer small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends procps \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install dependencies before copying the sources so this layer stays cached
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# NOTE(review): expects gunicorn in requirements.txt and an `app` module
+# exposing `server`; neither is visible in this change — confirm.
+ENTRYPOINT ["gunicorn", "--bind", "0.0.0.0:5000", "app:server"]
diff --git a/checkpoints/Logistic_regression_tokenizer.pkl b/checkpoints/Logistic_regression_tokenizer.pkl
diff --git a/checkpoints/logistic_regression.pkl b/checkpoints/logistic_regression.pkl
diff --git a/install.sh b/install.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+docker build -t sentimental_analyses:latest .
+docker run -it -name server -p 5000:5000 sentimental_analyses:latest
diff --git a/ml.py b/ml.py
@@ -0,0 +1,142 @@
+import numpy
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+import joblib
+from pathlib import Path
+from abc import ABC
+from typing import Union
+from sklearn.feature_extraction.text import TfidfVectorizer
+import torch
+if torch.cuda.is_available():
+    DEVICE = torch.device('cuda')
+elif torch.backends.mps.is_available():
+    DEVICE = torch.device("mps")
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import pandas as pd
+import logging
+import torch.nn.functional as F
+from tqdm import tqdm
+import mlflow
+
+logger = logging.getLogger(__name__)
+
+
+
+# Maps a predicted class value to the label shown to the user.
+# NOTE(review): keys 0 and 4 look like the Sentiment140 `target` encoding
+# (0 = negative, 4 = positive) — confirm against the training CSV.
+SENTIMENT_LABELS = {
+    0: "😡 unsatisfy",
+    4: "😊 satisfy",
+}
+
+class TweetDataset(torch.utils.data.Dataset):
+    def __init__(self, tokenizer, tweets, labels):
+        self.tokenizer = tokenizer
+        self.tweets = tweets
+        self.labels = labels
+
+    def __len__(self):
+        return len(self.tweets)
+
+    def __getitem__(self, idx):
+        tweet = self.tweets[idx]
+        label = self.labels[idx]
+        inputs = self.tokenizer(tweet, return_tensors="pt", truncation=True, padding=True)
+        return tweet, inputs, label
+
+
+class BaseModel(ABC):
+    checkpoint: str = ""
+    tokenizer = None
+    def __init__(self):
+        self.model = None
+        self.dataset = None
+
+    def mlflow_record(self, params: dict, metrics: dict, model, model_name: str, **kwargs):
+        with mlflow.start_run():
+            mlflow.log_params(params)
+            for k,v in metrics.items():
+                mlflow.log_metric(k, v)
+            mlflow.sklearn.log_model(model, model_name)
+            mlflow.log_artifact(self.checkpoint)
+
+    def train(self, x_train, y_train):
+        """"""
+
+    def predict(self, x:Union[pd.Series, numpy.ndarray]):
+        """"""
+
+
+class LogisticRegressionModel(BaseModel):
+    checkpoint = "checkpoints/logistic_regression.pkl"
+    checkpoint_tokenizer = "checkpoints/Logistic_regression_tokenizer.pkl"
+    tokenizer = TfidfVectorizer()
+    def __init__(self):
+        super().__init__()
+        if Path(self.checkpoint).exists():
+            self.model = joblib.load(self.checkpoint)
+        else:
+            self.model = LogisticRegression()
+        if Path(self.checkpoint_tokenizer).exists():
+            self.tokenizer = joblib.load(self.checkpoint_tokenizer)
+
+
+    def predict(self, x: Union[pd.Series, numpy.ndarray]):
+        x = self.tokenizer.transform(x)
+        predicted_class = self.model.predict(x)
+        logger.info(f"predicted {x.shape}")
+        return [SENTIMENT_LABELS[p] for p in predicted_class]
+
+    def train(self, x_train, y_train):
+        x_train = self.tokenizer.fit_transform(x_train)
+        self.model.fit(x_train, y_train)
+        params = self.model.get_params()
+        metrics = {"score": self.model.score(x_train, y_train)}
+        self.mlflow_record(params,
+                           metrics,
+                           self.model,
+                           "logistic_regression")
+        joblib.dump(self.model, self.checkpoint)
+        joblib.dump(self.tokenizer, self.checkpoint_tokenizer)
+
+
+class BertModel(BaseModel):
+    checkpoint = "checkpoints/bert.pkl"
+    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
+    def __init__(self):
+        super().__init__()
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            "nlptown/bert-base-multilingual-uncased-sentiment")
+        self.model.classifier = torch.nn.Linear(768, 2)
+        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-5)
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+    def train(self, x_train, y_train):
+        dataset = TweetDataset(self.tokenizer, x_train, y_train)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
+        self.model.train()
+        self.model.to("mps")
+        for epoch in range(3):
+            for tweet, inputs, label in tqdm(dataloader):
+                self.optimizer.zero_grad()
+                outputs = self.model(**inputs)
+                loss = self.criterion(outputs.logits, label)
+                loss.backward()
+                self.optimizer.step()
+        self.model.save("sentiment_model_checkpoint.pkl")
+
+    def predict(self, x):
+        tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
+        inputs = tokenizer(x, return_tensor='pt', truncation=True, padding=True)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            probs = F.softmax(outputs.logits, dim=1)
+            predicted_class = torch.argmax(probs, dim=1).item()
+        return SENTIMENT_LABELS[predicted_class]
+
+
+def load_data(path):
+    headers = ['target', 'ids', 'date', 'flag', 'user', 'text']
+    df_tweets = pd.read_csv(path, names=headers, encoding="latin-1")
+    train, test, y_train, y_test = train_test_split(df_tweets['text'], df_tweets['target'], test_size=0.2,
+                                                    random_state=42)
+    return train, test, y_train, y_test
+
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression.pkl b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression.pkl
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/MLmodel b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/MLmodel
@@ -0,0 +1,21 @@
+artifact_path: logistic_regression
+flavors:
+  python_function:
+    env:
+      conda: conda.yaml
+      virtualenv: python_env.yaml
+    loader_module: mlflow.sklearn
+    model_path: model.pkl
+    predict_fn: predict
+    python_version: 3.11.9
+  sklearn:
+    code: null
+    pickled_model: model.pkl
+    serialization_format: cloudpickle
+    sklearn_version: 1.6.1
+mlflow_version: 2.21.3
+model_size_bytes: 4714109
+model_uuid: 0e5004f63f9c48109043be7db41fa2bb
+prompts: null
+run_id: 27cce8096d4f4875b725a4e734cc0e9f
+utc_time_created: '2025-04-04 11:02:08.032030'
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/conda.yaml b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/conda.yaml
@@ -0,0 +1,13 @@
+channels:
+- conda-forge
+dependencies:
+- python=3.11.9
+- pip
+- pip:
+  - mlflow==2.21.3
+  - cloudpickle==3.1.1
+  - numpy==2.2.4
+  - pandas==2.2.3
+  - scikit-learn==1.6.1
+  - scipy==1.15.2
+name: mlflow-env
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/model.pkl b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/model.pkl
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/python_env.yaml b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/python_env.yaml
@@ -0,0 +1,7 @@
+python: 3.11.9
+build_dependencies:
+- pip
+- setuptools
+- wheel
+dependencies:
+- -r requirements.txt
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/requirements.txt b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts/logistic_regression/requirements.txt
@@ -0,0 +1,6 @@
+mlflow==2.21.3
+cloudpickle==3.1.1
+numpy==2.2.4
+pandas==2.2.3
+scikit-learn==1.6.1
+scipy==1.15.2
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/meta.yaml b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/meta.yaml
@@ -0,0 +1,15 @@
+artifact_uri: file:///Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/artifacts
+end_time: 1743764535101
+entry_point_name: ''
+experiment_id: '0'
+lifecycle_stage: active
+run_id: 27cce8096d4f4875b725a4e734cc0e9f
+run_name: delicate-colt-380
+run_uuid: 27cce8096d4f4875b725a4e734cc0e9f
+source_name: ''
+source_type: 4
+source_version: ''
+start_time: 1743764527947
+status: 3
+tags: []
+user_id: wonters
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/metrics/score b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/metrics/score
@@ -0,0 +1 @@
+1743764527988 0.823046875 0
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/C b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/C
@@ -0,0 +1 @@
+1.0
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/class_weight b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/class_weight
@@ -0,0 +1 @@
+None
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/dual b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/dual
@@ -0,0 +1 @@
+False
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/fit_intercept b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/fit_intercept
@@ -0,0 +1 @@
+True
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/intercept_scaling b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/intercept_scaling
@@ -0,0 +1 @@
+1
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/l1_ratio b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/l1_ratio
@@ -0,0 +1 @@
+None
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/max_iter b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/max_iter
@@ -0,0 +1 @@
+100
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/multi_class b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/multi_class
@@ -0,0 +1 @@
+deprecated
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/n_jobs b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/n_jobs
@@ -0,0 +1 @@
+None
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/penalty b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/penalty
@@ -0,0 +1 @@
+l2
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/random_state b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/random_state
@@ -0,0 +1 @@
+None
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/solver b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/solver
@@ -0,0 +1 @@
+lbfgs
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/tol b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/tol
@@ -0,0 +1 @@
+0.0001
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/verbose b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/verbose
@@ -0,0 +1 @@
+0
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/warm_start b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/params/warm_start
@@ -0,0 +1 @@
+False
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.log-model.history b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.log-model.history
@@ -0,0 +1 @@
+[{"run_id": "27cce8096d4f4875b725a4e734cc0e9f", "artifact_path": "logistic_regression", "utc_time_created": "2025-04-04 11:02:08.032030", "model_uuid": "0e5004f63f9c48109043be7db41fa2bb", "flavors": {"python_function": {"model_path": "model.pkl", "predict_fn": "predict", "loader_module": "mlflow.sklearn", "python_version": "3.11.9", "env": {"conda": "conda.yaml", "virtualenv": "python_env.yaml"}}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "1.6.1", "serialization_format": "cloudpickle", "code": null}}}]
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.runName b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.runName
@@ -0,0 +1 @@
+delicate-colt-380
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.source.name b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.source.name
@@ -0,0 +1 @@
+/Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/venv/bin/pytest
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.source.type b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.source.type
@@ -0,0 +1 @@
+LOCAL
diff --git a/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.user b/mlruns/0/27cce8096d4f4875b725a4e734cc0e9f/tags/mlflow.user
@@ -0,0 +1 @@
+wonters
diff --git a/mlruns/0/meta.yaml b/mlruns/0/meta.yaml
@@ -0,0 +1,6 @@
+artifact_location: file:///Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/mlruns/0
+creation_time: 1743764527424
+experiment_id: '0'
+last_update_time: 1743764527424
+lifecycle_stage: active
+name: Default
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO
+log_format = %(levelname)s [%(name)s] %(message)s
+log_date_format = %H:%M:%S
+addopts = -p no:hydra
diff --git a/requirements.txt b/requirements.txt
diff --git a/server.py b/server.py
@@ -0,0 +1,57 @@
+import tornado
+from tornado_swagger.setup import setup_swagger
+from rich.logging import RichHandler
+import logging
+
+# Route all log records through Rich so console output is styled.
+logging.basicConfig(
+    level=logging.INFO,  # Log level (DEBUG, INFO, WARNING...)
+    format="%(message)s",  # Minimal format; Rich takes care of the styling
+    datefmt="[%X]",
+    handlers=[RichHandler()]  # 👈 list holding the Rich handler
+)
+
+logger = logging.getLogger(__name__)
+
+class PredictView(tornado.web.RequestHandler):
+
+    def post(self):
+        text = self.get_argument("text")
+
+
+class TornadoApplication(tornado.web.Application):
+    _routes = [
+        tornado.web.url(r"/api/predict", PredictView),
+    ]
+    security_definition = {
+    }
+    security = [{"TokenQueryAuth": []}]
+
+    def __init__(self):
+        settings = {"debug": True}
+        setup_swagger(
+            self._routes,
+            swagger_url="/api/doc",
+            api_base_url="/",
+            description="Documentation API pour le serveur alerting",
+            api_version="1.0.0",
+            contact="shift.python.software@gmail.com",
+            title="API Tornado Alerting",
+            security_definitions=self.security_definition,
+        )
+        super().__init__(self._routes, **settings)
+
+def start_server(address="0.0.0.0", port=8888):
+    """
+    Start the tornado server
+    :param directory:
+    :param address:
+    :param port:
+    :return:
+    """
+    app = TornadoApplication()
+    app.listen(address=address, port=port)
+    logger.info(f"Serving server on {address}:{port}")
+    tornado.ioloop.IOLoop.current().start()
+
+# Script entry point: serve on the loopback interface only, port 8004.
+if __name__ == "__main__":
+    start_server(address="127.0.0.1", port=8004)
diff --git a/tests.py b/tests.py