dataforgoodfr · montanier · Jan 1, 2026 · Jan 23, 2026 · cgoudet · Jan 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -164,4 +164,5 @@ cython_debug/
 
 nohup.out
 data/
-notebooks/
+notebooks/
+images/
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "label-studio-ml-backend"]
+	path = label-studio-ml-backend
+	url = https://github.com/HumanSignal/label-studio-ml-backend.git
diff --git a/README.md b/README.md
@@ -18,6 +18,20 @@ Une fois installé, il suffit de lancer la commande suivante pour installer la v
 uv sync
 ```
 
+### Git Submodules
+
+Ce projet utilise des submodules Git. Après avoir cloné le repo, initialisez-les :
+
+```bash
+git submodule update --init --recursive
+```
+
+Ou clonez directement avec les submodules :
+
+```bash
+git clone --recurse-submodules <repo-url>
+```
+
 A l'usage, si vous utilisez VSCode, l'environnement virtuel sera automatiquement activé lorsque vous ouvrirez le projet. Sinon, il suffit de l'activer manuellement avec la commande suivante :
 
 ```bash
@@ -30,6 +44,76 @@ Ou alors, utilisez la commande `uv run ...` (au lieu de `python ...`) pour lance
 uv run pipelines/run.py run build_database
 ```
 
+## Préparer les données pour Label Studio
+
+Le script `cmd/prepare_labelstudio.py` télécharge les images, crée le JSON des tâches et démarre le serveur HTTP :
+
+```bash
+# Traiter toutes les images
+uv run cmd/prepare_labelstudio.py
+
+# Limiter à un nombre spécifique
+uv run cmd/prepare_labelstudio.py --limit 10
+
+# Forcer le re-téléchargement des images existantes
+uv run cmd/prepare_labelstudio.py --force
+```
+
+Cela va :
+1. Télécharger les images dans le répertoire `images/`
+2. Créer `labelstudio_tasks.json` avec les annotations
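+3. Démarrer un serveur HTTP avec CORS activé sur le port 8000 (toutes les interfaces) ; les URLs d'images du JSON utilisent l'IP LAN détectée automatiquement, ou celle passée via `--host`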
+
+## Lancer Label Studio
+
+Dans un autre terminal :
+
+```bash
+uv run label-studio
+```
+
+Ensuite, importez `labelstudio_tasks.json` dans l'interface de Label Studio.
+
+## ML Backend YOLO (pré-annotations automatiques)
+
+Le ML backend utilise YOLOv8 pour générer des pré-annotations automatiques.
+
+### Configuration
+
+1. Récupérez votre token API Label Studio :
+   - Ouvrez Label Studio → Account & Settings → Access Token
+   - Copiez le token
+
+2. Créez un fichier `.env` à la racine du projet :
+   ```bash
+   cp .env.example .env
+   # Editez .env avec votre token
+   ```
+
+3. Créez un lien symbolique pour que docker-compose puisse lire le `.env` :
+   ```bash
+   ln -s $(pwd)/.env label-studio-ml-backend/label_studio_ml/examples/yolo/.env
+   ```
+
+4. Lancez le ML backend :
+   ```bash
+   cd label-studio-ml-backend/label_studio_ml/examples/yolo
+   docker compose up
+   ```
+
+   Pour relancer sans rebuild (plus rapide) :
+   ```bash
+   docker compose up --no-build
+   ```
+
+   Pour arrêter le backend :
+   ```bash
+   docker compose down
+   ```
+
+5. Connectez le backend dans Label Studio :
+   - Project Settings → Model → Add Model
+   - URL: `http://localhost:9090`
 
 ## Lancer les precommit-hook localement
 

diff --git a/cmd/prepare_labelstudio.py b/cmd/prepare_labelstudio.py
@@ -0,0 +1,167 @@
+import csv
+import json
+import argparse
+import urllib.request
+import os
+import time
+import subprocess
+import socket
+from pathlib import Path
+
+
+def get_lan_ip():
+    """Get LAN IP address, preferring ethernet over WiFi."""
+    try:
+        # Parse ip addr output to find interfaces
+        result = subprocess.run(['ip', 'addr'], capture_output=True, text=True)
+        lines = result.stdout.split('\n')
+
+        interfaces = {}
+        current_iface = None
+
+        for line in lines:
+            # Interface line (e.g., "2: eth0: <BROADCAST...")
+            if ': ' in line and not line.startswith(' '):
+                parts = line.split(': ')
+                if len(parts) >= 2:
+                    current_iface = parts[1].split('@')[0]
+            # IPv4 address line
+            elif 'inet ' in line and current_iface:
+                ip = line.strip().split()[1].split('/')[0]
+                if not ip.startswith('127.'):
+                    interfaces[current_iface] = ip
+
+        # Prefer ethernet (eth*, enp*, eno*) over wifi (wlan*, wlp*)
+        for iface, ip in interfaces.items():
+            if iface.startswith(('eth', 'enp', 'eno')):
+                return ip
+
+        # Fallback to wifi
+        for iface, ip in interfaces.items():
+            if iface.startswith(('wlan', 'wlp')):
+                return ip
+
+        # Fallback to any non-loopback
+        if interfaces:
+            return list(interfaces.values())[0]
+
+    except Exception:
+        pass
+
+    # Last resort: use socket trick
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        s.connect(('8.8.8.8', 80))
+        ip = s.getsockname()[0]
+        s.close()
+        return ip
+    except Exception:
+        return 'localhost'
+
+def convert_csv_to_labelstudio(csv_file, output_file, limit=None, force=False, host=None):
+    tasks = []
+
+    # Get host IP for image URLs
+    if host is None:
+        host = get_lan_ip()
+    print(f"Using host IP: {host}")
+
+    # Create images directory
+    images_dir = Path('images')
+    images_dir.mkdir(exist_ok=True)
+
+    with open(csv_file, 'r', encoding='utf-8-sig') as f:
+        reader = csv.DictReader(f, delimiter=';')
+
+        for row in reader:
+            # Check limit
+            if limit and len(tasks) >= limit:
+                break
+
+            # Get first image URL
+            images = row.get('images - observation', '')
+            if not images:
+                continue
+
+            # Split by pipe and take first URL
+            image_url = images.split(' | ')[0].strip()
+
+            # Get common name
+            nom_commun = row.get('Nom commun - observation', '').strip()
+
+            # Skip if no name
+            if not nom_commun:
+                continue
+
+            # Download image
+            filename = Path(image_url).name
+            local_path = images_dir / filename
+
+            # Skip if file exists and not forcing
+            if local_path.exists() and not force:
+                print(f"Skipping {filename} (already exists)")
+            else:
+                try:
+                    print(f"Downloading {image_url}...")
+                    urllib.request.urlretrieve(image_url, local_path)
+                    time.sleep(1)  # Be nice to the server
+                except Exception as e:
+                    print(f"Failed to download {image_url}: {e}")
+                    continue
+
+            # Create task with annotation using LAN IP URL
+            task = {
+                "data": {
+                    "captioning": f"http://{host}:8000/images/{filename}"
+                },
+                "annotations": [{
+                    "result": [
+                        {
+                            "value": {
+                                "choices": [nom_commun]
+                            },
+                            "from_name": "choice",
+                            "to_name": "image",
+                            "type": "choices"
+                        }
+                    ]
+                }]
+            }
+
+            tasks.append(task)
+
+    # Write to JSON file
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(tasks, f, indent=2, ensure_ascii=False)
+
+    print(f"Converted {len(tasks)} tasks to {output_file}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Download images, format CSV to Label Studio JSON, and start HTTP server'
+    )
+    parser.add_argument('--limit', type=int, help='Limit number of images to convert')
+    parser.add_argument('--input', default='export_n1-obs_DataForGood.csv', help='Input CSV file')
+    parser.add_argument('--output', default='labelstudio_tasks.json', help='Output JSON file')
+    parser.add_argument('--force', action='store_true', help='Force re-download of existing images')
+    parser.add_argument('--host', type=str, help='Override host IP for image URLs (default: auto-detect LAN IP)')
+
+    args = parser.parse_args()
+
+    # Get host IP
+    host = args.host if args.host else get_lan_ip()
+
+    # Download images and create JSON
+    convert_csv_to_labelstudio(args.input, args.output, args.limit, args.force, host)
+
+    # Start CORS-enabled HTTP server
+    print("\n" + "="*60)
+    print(f"Starting CORS-enabled HTTP server on http://{host}:8000")
+    print("Import labelstudio_tasks.json in Label Studio UI")
+    print("Press Ctrl+C to stop the server")
+    print("="*60 + "\n")
+
+    try:
+        subprocess.run(['python3', 'cors_server.py', host])
+    except KeyboardInterrupt:
+        print("\nHTTP server stopped.")
diff --git a/cors_server.py b/cors_server.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+"""Simple HTTP server with CORS support for serving images to Label Studio."""
+
+import sys
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+
+
+class CORSRequestHandler(SimpleHTTPRequestHandler):
+    def end_headers(self):
+        self.send_header('Access-Control-Allow-Origin', '*')
+        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
+        self.send_header('Access-Control-Allow-Headers', '*')
+        super().end_headers()
+
+    def do_OPTIONS(self):
+        self.send_response(200)
+        self.end_headers()
+
+
+if __name__ == '__main__':
+    display_host = sys.argv[1] if len(sys.argv) > 1 else '0.0.0.0'
+    server = HTTPServer(('0.0.0.0', 8000), CORSRequestHandler)
+    print(f'Serving on http://{display_host}:8000')
+    server.serve_forever()
diff --git a/label-studio-ml-backend b/label-studio-ml-backend
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,4 +15,6 @@ dependencies = [
     "matplotlib>=3.10.0",
     "pandas>=2.2.3",
     "requests>=2.32.3",
-]
+    "label-studio>=1.22.0",
+    "ultralytics>=8.3.252",
+]