distribute

Wonters · Wonters · commit a56fb4091ea7 · 2025-04-22T17:09:31.000Z
diff --git a/mlruns/0/meta.yaml b/mlruns/0/meta.yaml
@@ -1,6 +1,6 @@
-artifact_location: file:///Users/wonters/Desktop/openclassroom/projets/sentiment_analyses/sentimental_analyses/mlruns/0
-creation_time: 1744984721760
+artifact_location: file:///home/debian/sentimental_analyses/mlruns/0
+creation_time: 1745341000952
 experiment_id: '0'
-last_update_time: 1744984721760
+last_update_time: 1745341000952
 lifecycle_stage: active
 name: Default
diff --git a/src/ml.py b/src/ml.py
@@ -432,7 +432,7 @@ class LSTMModel(TorchBaseModel):
     name = "LSTM"
     dataset_class = TweetDataset
     epoch = 1
-    batch_size = 120
+    batch_size = 100
     # test with BCEWithLogitLoss -> 1 logit -> post traitment sigmoïd
     out_features = 1
     lr = 1e-4
@@ -473,6 +473,16 @@ def load_checkpoint(self):
             }
             self.model.load_state_dict(embedding_weights, strict=False)
             self.model.eval()
+        import torch
+        import torch.nn as nn
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+        local_rank = torch.distributed.get_rank()
+        torch.cuda.set_device(local_rank)
+
+        self.model = self.model.cuda(local_rank)
+        self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[local_rank], output_device=local_rank,find_unused_parameters=True)
         self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
         self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
             self.optimizer, mode="min", factor=0.5, patience=2
diff --git a/train.py b/train.py
@@ -1,8 +1,11 @@
 ## Import lightgbm avoiding segfault error, protection against segfault
 #import lightgbm as lgb
+import logging
 from src.ml import LightGBMModel, load_data, LSTMModel
 
+logging.basicConfig(level=logging.INFO)
+
 df  = load_data('../data/training.1600000.processed.noemoticon.csv')
-#df = df.sample(frac=1, random_state=42)
+df = df.sample(frac=0.1, random_state=42)
 model = LSTMModel(df)
 model.train()