From 21fe185683dc5b0722e75aa882c3ed3ab2c59f7a Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 16:25:46 +0100
Subject: [PATCH 1/7] This is a Model-Agnostic Meta-Learning (MAML)
 implementation, which is designed to learn how to quickly adapt to new tasks
 with minimal training data. Here's what it aims to achieve: Meta-Learning
 Purpose: Learn a good initialization point for neural networks Enable fast
 adaptation to new, unseen tasks Require minimal fine-tuning data for new
 tasks How it Works: Trains on a collection of related but different tasks
 Uses a nested optimization loop: Inner loop: Simulates adaptation to specific
 tasks Outer loop: Updates the meta-model to find better initial parameters
 Practical Applications: Few-shot learning problems Quick adaptation to new
 environments Transfer learning with minimal data Personalization of models
 with limited user data Example Use Cases: Regression tasks with different
 functions Classification with few examples per class Robotics with different
 environments Personalized recommendations with limited user interaction

---
 src/model_agnostic_meta_learning.py | 303 ++++++++++++++++++++++++++++
 1 file changed, 303 insertions(+)
 create mode 100644 src/model_agnostic_meta_learning.py

diff --git a/src/model_agnostic_meta_learning.py b/src/model_agnostic_meta_learning.py
new file mode 100644
index 0000000..4f36a1d
--- /dev/null
+++ b/src/model_agnostic_meta_learning.py
@@ -0,0 +1,303 @@
+from typing import Dict, List, Optional, Tuple, Type
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+
+class MetaModelGenerator(nn.Module):
+    def __init__(
+        self,
+        input_size: int,
+        hidden_sizes: List[int],
+        output_size: int,
+        inner_lr: float = 0.05,
+        meta_lr: float = 0.003,
+    ):
+        super().__init__()
+        self.inner_lr = inner_lr
+        self.input_size = input_size
+        self.output_size = output_size
+
+        # Simple architecture with skip connections
+        self.input_layer = nn.Linear(input_size, hidden_sizes[0])
+        self.hidden_layers = nn.ModuleList(
+            [
+                nn.Linear(hidden_sizes[i], hidden_sizes[i + 1])
+                for i in range(len(hidden_sizes) - 1)
+            ]
+        )
+        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)
+
+        # Initialize weights with smaller values
+        self.apply(self._init_weights)
+
+        # Use SGD with momentum for meta-optimization
+        self.meta_optimizer = optim.SGD(
+            self.parameters(), lr=meta_lr, momentum=0.9, nesterov=True
+        )
+
+        # Add learning rate scheduler
+        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+            self.meta_optimizer,
+            mode="min",
+            factor=0.5,
+            patience=2,
+            verbose=True,
+            min_lr=1e-5,
+        )
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, mean=0.0, std=0.01)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.relu(self.input_layer(x))
+        hidden = x
+
+        for layer in self.hidden_layers:
+            hidden = F.relu(layer(hidden) + hidden)  # Skip connection
+
+        return self.output_layer(hidden)
+
+    def meta_train_step(
+        self,
+        task_batch: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
+        device: torch.device,
+    ) -> Tuple[float, float]:
+        total_meta_loss = 0.0
+        total_grad_norm = 0.0
+
+        self.meta_optimizer.zero_grad()
+
+        for support_x, support_y, query_x, query_y in zip(*task_batch):
+            support_x = support_x.to(device)
+            support_y = support_y.to(device)
+            query_x = query_x.to(device)
+            query_y = query_y.to(device)
+
+            # Inner loop optimization
+            fast_weights = {}
+            for name, param in self.named_parameters():
+                fast_weights[name] = param.clone()
+
+            # Multiple inner loop steps
+            for _ in range(3):
+                support_pred = self.forward_with_fast_weights(support_x, fast_weights)
+                inner_loss = F.mse_loss(support_pred, support_y)
+
+                # Manual gradient computation
+                grads = torch.autograd.grad(
+                    inner_loss,
+                    fast_weights.values(),
+                    create_graph=True,
+                    allow_unused=True,
+                )
+
+                # Update fast weights with gradient clipping
+                for (name, weight), grad in zip(fast_weights.items(), grads):
+                    if grad is not None:
+                        clipped_grad = torch.clamp(grad, -1.0, 1.0)
+                        fast_weights[name] = weight - self.inner_lr * clipped_grad
+
+            # Compute meta loss
+            query_pred = self.forward_with_fast_weights(query_x, fast_weights)
+            meta_loss = F.mse_loss(query_pred, query_y)
+
+            # Compute meta gradients
+            meta_loss.backward()
+            total_meta_loss += meta_loss.item()
+
+            # Calculate gradient norm
+            with torch.no_grad():
+                grad_norm = 0.0
+                for param in self.parameters():
+                    if param.grad is not None:
+                        grad_norm += param.grad.norm().item() ** 2
+                total_grad_norm += grad_norm**0.5
+
+        # Average the losses and gradients
+        avg_meta_loss = total_meta_loss / len(task_batch[0])
+        avg_grad_norm = total_grad_norm / len(task_batch[0])
+
+        # Gradient clipping
+        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
+
+        # Update meta parameters
+        self.meta_optimizer.step()
+
+        return avg_meta_loss, avg_grad_norm
+
+    def forward_with_fast_weights(
+        self, x: torch.Tensor, fast_weights: Dict[str, torch.Tensor]
+    ) -> torch.Tensor:
+        x = F.relu(
+            F.linear(
+                x, fast_weights["input_layer.weight"], fast_weights["input_layer.bias"]
+            )
+        )
+        hidden = x
+
+        for i in range(len(self.hidden_layers)):
+            next_hidden = F.relu(
+                F.linear(
+                    hidden,
+                    fast_weights[f"hidden_layers.{i}.weight"],
+                    fast_weights[f"hidden_layers.{i}.bias"],
+                )
+            )
+            hidden = next_hidden + hidden  # Skip connection
+
+        return F.linear(
+            hidden,
+            fast_weights["output_layer.weight"],
+            fast_weights["output_layer.bias"],
+        )
+
+
+def create_synthetic_tasks(
+    num_tasks: int = 100,
+    samples_per_task: int = 50,
+    input_size: int = 10,
+    output_size: int = 1,
+) -> List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]:
+    """Create synthetic tasks for meta-learning."""
+    tasks = []
+    split_idx = samples_per_task // 2
+
+    for _ in range(num_tasks):
+        # Generate more diverse input data
+        x = torch.randn(samples_per_task, input_size)
+        x = (x - x.mean(0)) / (x.std(0) + 1e-8)
+
+        # Create more diverse task functions with multiple non-linearities
+        coefficients = torch.randn(input_size, output_size) * 0.3
+        bias = torch.randn(output_size) * 0.1
+
+        # Balanced task complexity
+        y = torch.matmul(x, coefficients) + bias
+        y += 0.15 * torch.sin(2.0 * torch.matmul(x, coefficients))
+        y += 0.08 * torch.tanh(1.5 * torch.matmul(x, coefficients))
+
+        # Adaptive noise based on signal magnitude
+        noise_scale = 0.02 * torch.std(y)
+        y += noise_scale * torch.randn_like(y)
+
+        tasks.append((x[:split_idx], y[:split_idx], x[split_idx:], y[split_idx:]))
+
+    return tasks
+
+
+def create_task_dataloader(
+    tasks: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
+    batch_size: int = 4,
+) -> DataLoader:
+    """Create a DataLoader for batches of tasks."""
+    # Reorganize tasks into separate lists
+    support_x = [t[0] for t in tasks]
+    support_y = [t[1] for t in tasks]
+    query_x = [t[2] for t in tasks]
+    query_y = [t[3] for t in tasks]
+
+    # Create dataset from lists
+    dataset = list(zip(support_x, support_y, query_x, query_y))
+    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+if __name__ == "__main__":
+    # Configuration
+    INPUT_SIZE = 10
+    OUTPUT_SIZE = 1
+    HIDDEN_SIZES = [64, 64]  # Increased capacity
+    BATCH_SIZE = 8  # Increased batch size
+    NUM_TASKS = 200  # More tasks
+    SAMPLES_PER_TASK = 100  # More samples per task
+
+    # Create synthetic tasks with more controlled complexity
+    tasks = create_synthetic_tasks(
+        num_tasks=NUM_TASKS,
+        samples_per_task=SAMPLES_PER_TASK,
+        input_size=INPUT_SIZE,
+        output_size=OUTPUT_SIZE,
+    )
+
+    # Create task dataloader with fixed batch size
+    task_dataloader = create_task_dataloader(tasks, batch_size=BATCH_SIZE)
+
+    # Initialize meta-model
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    meta_model = MetaModelGenerator(
+        input_size=INPUT_SIZE, hidden_sizes=HIDDEN_SIZES, output_size=OUTPUT_SIZE
+    ).to(device)
+
+    # Training loop
+    num_epochs = 20
+    best_loss = float("inf")
+    patience_counter = 0
+    patience_limit = 5  # Early stopping patience
+
+    for epoch in range(num_epochs):
+        total_loss = 0.0
+        total_grad_norm = 0.0
+
+        for batch in task_dataloader:
+            loss, grad_norm = meta_model.meta_train_step(batch, device)
+            total_loss += loss
+            total_grad_norm += grad_norm
+
+        avg_loss = total_loss / len(task_dataloader)
+        avg_grad_norm = total_grad_norm / len(task_dataloader)
+        print(
+            f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Grad Norm: {avg_grad_norm:.4f}"
+        )
+
+        # Learning rate scheduling
+        meta_model.scheduler.step(avg_loss)
+
+        # Early stopping check
+        if avg_loss < best_loss:
+            best_loss = avg_loss
+            patience_counter = 0
+        else:
+            patience_counter += 1
+
+        if patience_counter >= patience_limit:
+            print(f"Early stopping triggered after {epoch+1} epochs")
+            break
+
+        # Optional: Stop if loss is very low
+        if avg_loss < 0.001:
+            print("Reached target loss. Stopping training.")
+            break
+
+    # Generate new task-specific model
+    new_task = create_synthetic_tasks(num_tasks=1)[0]
+    support_x, support_y, query_x, query_y = new_task
+    support_x, support_y = support_x.to(device), support_y.to(device)
+
+    # Adapt to new task
+    fast_weights = {}
+    for name, param in meta_model.named_parameters():
+        fast_weights[name] = param.clone()
+
+    # Quick adaptation
+    for _ in range(5):
+        support_pred = meta_model.forward_with_fast_weights(support_x, fast_weights)
+        adapt_loss = F.mse_loss(support_pred, support_y)
+        grads = torch.autograd.grad(
+            adapt_loss, fast_weights.values(), create_graph=False
+        )
+
+        for (name, weight), grad in zip(fast_weights.items(), grads):
+            if grad is not None:
+                fast_weights[name] = weight - meta_model.inner_lr * grad
+
+    # Evaluate on query set
+    query_x, query_y = query_x.to(device), query_y.to(device)
+    query_pred = meta_model.forward_with_fast_weights(query_x, fast_weights)
+    final_loss = F.mse_loss(query_pred, query_y)
+    print(f"\nNew Task Adaptation - Query Loss: {final_loss.item():.4f}")

From 52a9b8728b5201a310ee77830ccf005b65c951a0 Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 16:31:36 +0100
Subject: [PATCH 2/7] feat(maml): implement model-agnostic meta-learning with
 synthetic task generation

This commit introduces a complete MAML implementation with the following features:
- Bi-level optimization for meta-learning
- Synthetic task generation with controlled complexity
- Gradient clipping and skip connections for stability
- Learning rate scheduling and early stopping
- Comprehensive type hints and documentation

Technical details:
- Multi-component non-linear task generation
- Adaptive noise scaling for robustness
- Higher-order gradients for meta-optimization
- Task-specific fast weight adaptation
- Efficient batch processing with DataLoader

Breaking changes: None
Related issues: None
---
 src/model_agnostic_meta_learning.py | 126 +++++++++++++++++-----------
 1 file changed, 78 insertions(+), 48 deletions(-)

diff --git a/src/model_agnostic_meta_learning.py b/src/model_agnostic_meta_learning.py
index 4f36a1d..60eb273 100644
--- a/src/model_agnostic_meta_learning.py
+++ b/src/model_agnostic_meta_learning.py
@@ -69,6 +69,21 @@ def meta_train_step(
         task_batch: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
         device: torch.device,
     ) -> Tuple[float, float]:
+        """
+        Core MAML training step that implements the bi-level optimization:
+        1. Inner Loop: Adapt to individual tasks using gradient descent
+        2. Outer Loop: Update meta-parameters to optimize post-adaptation performance
+        
+        Args:
+            task_batch: List of (support_x, support_y, query_x, query_y) tuples
+                - support_x/y: Used for task adaptation (inner loop)
+                - query_x/y: Used for meta-update (outer loop)
+            device: Computation device (CPU/GPU)
+        
+        Returns:
+            avg_meta_loss: Average loss across all tasks after adaptation
+            avg_grad_norm: Average gradient norm for monitoring training
+        """
         total_meta_loss = 0.0
         total_grad_norm = 0.0
 
@@ -80,54 +95,50 @@ def meta_train_step(
             query_x = query_x.to(device)
             query_y = query_y.to(device)
 
-            # Inner loop optimization
-            fast_weights = {}
-            for name, param in self.named_parameters():
-                fast_weights[name] = param.clone()
-
-            # Multiple inner loop steps
-            for _ in range(3):
+            fast_weights = {name: param.clone() for name, param in self.named_parameters()}
+            # Multiple gradient steps for task adaptation
+            for _ in range(3):  # Inner loop steps
                 support_pred = self.forward_with_fast_weights(support_x, fast_weights)
                 inner_loss = F.mse_loss(support_pred, support_y)
 
-                # Manual gradient computation
+                # Compute gradients w.r.t fast_weights (create_graph=True enables higher-order gradients)
                 grads = torch.autograd.grad(
                     inner_loss,
                     fast_weights.values(),
-                    create_graph=True,
+                    create_graph=True,  # Required for meta-learning
                     allow_unused=True,
                 )
 
                 # Update fast weights with gradient clipping
                 for (name, weight), grad in zip(fast_weights.items(), grads):
                     if grad is not None:
-                        clipped_grad = torch.clamp(grad, -1.0, 1.0)
+                        clipped_grad = torch.clamp(grad, -1.0, 1.0)  # Stability
                         fast_weights[name] = weight - self.inner_lr * clipped_grad
 
-            # Compute meta loss
+            # Outer Loop: Meta-Update
+            # Evaluate performance on query set using adapted weights
             query_pred = self.forward_with_fast_weights(query_x, fast_weights)
             meta_loss = F.mse_loss(query_pred, query_y)
 
-            # Compute meta gradients
-            meta_loss.backward()
+            # Accumulate meta-gradients
+            meta_loss.backward()  # This propagates through the entire inner loop
             total_meta_loss += meta_loss.item()
 
-            # Calculate gradient norm
+            # Monitor gradient norms
             with torch.no_grad():
-                grad_norm = 0.0
-                for param in self.parameters():
-                    if param.grad is not None:
-                        grad_norm += param.grad.norm().item() ** 2
-                total_grad_norm += grad_norm**0.5
-
-        # Average the losses and gradients
-        avg_meta_loss = total_meta_loss / len(task_batch[0])
-        avg_grad_norm = total_grad_norm / len(task_batch[0])
-
-        # Gradient clipping
+                grad_norm = sum(
+                    param.grad.norm().item() ** 2 
+                    for param in self.parameters() 
+                    if param.grad is not None
+                ) ** 0.5
+                total_grad_norm += grad_norm
+
+        # Average and apply meta-update
+        avg_meta_loss = total_meta_loss / len(task_batch)
+        avg_grad_norm = total_grad_norm / len(task_batch)
+
+        # Gradient clipping for stable meta-updates
         torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
-
-        # Update meta parameters
         self.meta_optimizer.step()
 
         return avg_meta_loss, avg_grad_norm
@@ -165,29 +176,47 @@ def create_synthetic_tasks(
     input_size: int = 10,
     output_size: int = 1,
 ) -> List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]:
-    """Create synthetic tasks for meta-learning."""
+    """
+    Creates a set of synthetic regression tasks for meta-learning training.
+    Each task represents a different non-linear function with controlled complexity.
+    
+    Task Generation Process:
+    1. Generate normalized input features
+    2. Create task-specific transformation
+    3. Add controlled noise for robustness
+    4. Split into support (training) and query (testing) sets
+    
+    Returns:
+        List of (support_x, support_y, query_x, query_y) tuples for each task
+    """
     tasks = []
-    split_idx = samples_per_task // 2
+    split_idx = samples_per_task // 2  # 50/50 split between support and query sets
 
     for _ in range(num_tasks):
-        # Generate more diverse input data
+        # 1. Generate and normalize input features
         x = torch.randn(samples_per_task, input_size)
-        x = (x - x.mean(0)) / (x.std(0) + 1e-8)
+        x = (x - x.mean(0)) / (x.std(0) + 1e-8)  # Standardize inputs
 
-        # Create more diverse task functions with multiple non-linearities
-        coefficients = torch.randn(input_size, output_size) * 0.3
-        bias = torch.randn(output_size) * 0.1
+        # 2. Create task-specific transformation
+        coefficients = torch.randn(input_size, output_size) * 0.3  # Random linear transformation
+        bias = torch.randn(output_size) * 0.1  # Random bias term
 
-        # Balanced task complexity
-        y = torch.matmul(x, coefficients) + bias
-        y += 0.15 * torch.sin(2.0 * torch.matmul(x, coefficients))
-        y += 0.08 * torch.tanh(1.5 * torch.matmul(x, coefficients))
+        # 3. Generate outputs with multiple non-linearities
+        y = torch.matmul(x, coefficients) + bias  # Linear component
+        y += 0.15 * torch.sin(2.0 * torch.matmul(x, coefficients))  # Sinusoidal component
+        y += 0.08 * torch.tanh(1.5 * torch.matmul(x, coefficients))  # Tanh component
 
-        # Adaptive noise based on signal magnitude
-        noise_scale = 0.02 * torch.std(y)
+        # 4. Add adaptive noise based on signal magnitude
+        noise_scale = 0.02 * torch.std(y)  # Noise proportional to output variance
         y += noise_scale * torch.randn_like(y)
 
-        tasks.append((x[:split_idx], y[:split_idx], x[split_idx:], y[split_idx:]))
+        # 5. Split into support and query sets
+        tasks.append((
+            x[:split_idx],    # support_x: First half of inputs
+            y[:split_idx],    # support_y: First half of outputs
+            x[split_idx:],    # query_x:   Second half of inputs
+            y[split_idx:]     # query_y:   Second half of outputs
+        ))
 
     return tasks
 
@@ -196,14 +225,17 @@ def create_task_dataloader(
     tasks: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
     batch_size: int = 4,
 ) -> DataLoader:
-    """Create a DataLoader for batches of tasks."""
+    """
+    Organizes tasks into batches for efficient training.
+    Shuffles tasks to prevent learning order dependencies.
+    """
     # Reorganize tasks into separate lists
     support_x = [t[0] for t in tasks]
     support_y = [t[1] for t in tasks]
     query_x = [t[2] for t in tasks]
     query_y = [t[3] for t in tasks]
 
-    # Create dataset from lists
+    # Create dataset and return DataLoader with shuffling
     dataset = list(zip(support_x, support_y, query_x, query_y))
     return DataLoader(dataset, batch_size=batch_size, shuffle=True)
 
@@ -279,11 +311,9 @@ def create_task_dataloader(
     support_x, support_y, query_x, query_y = new_task
     support_x, support_y = support_x.to(device), support_y.to(device)
 
-    # Adapt to new task
-    fast_weights = {}
-    for name, param in meta_model.named_parameters():
-        fast_weights[name] = param.clone()
-
+    fast_weights = {
+        name: param.clone() for name, param in meta_model.named_parameters()
+    }
     # Quick adaptation
     for _ in range(5):
         support_pred = meta_model.forward_with_fast_weights(support_x, fast_weights)

From b4e06ee94442a8164c2b62e5824d798c8c948584 Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 16:46:44 +0100
Subject: [PATCH 3/7] Add wandb and matplotlib to requirements.txt

---
 requirements.txt                    |   2 +
 src/model_agnostic_meta_learning.py | 261 +++++++++++++++++++++++-----
 2 files changed, 222 insertions(+), 41 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1fb3e7e..46805bc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,5 @@ transformers
 PyWavelets
 scikit-learn
 sentence-transformers
+wandb
+matplotlib
diff --git a/src/model_agnostic_meta_learning.py b/src/model_agnostic_meta_learning.py
index 60eb273..01cabcf 100644
--- a/src/model_agnostic_meta_learning.py
+++ b/src/model_agnostic_meta_learning.py
@@ -1,4 +1,10 @@
 from typing import Dict, List, Optional, Tuple, Type
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.metrics import r2_score
+import logging
+import wandb  # Optional: for experiment tracking
+from collections import defaultdict
 
 import torch
 import torch.nn as nn
@@ -73,13 +79,13 @@ def meta_train_step(
         Core MAML training step that implements the bi-level optimization:
         1. Inner Loop: Adapt to individual tasks using gradient descent
         2. Outer Loop: Update meta-parameters to optimize post-adaptation performance
-        
+
         Args:
             task_batch: List of (support_x, support_y, query_x, query_y) tuples
                 - support_x/y: Used for task adaptation (inner loop)
                 - query_x/y: Used for meta-update (outer loop)
             device: Computation device (CPU/GPU)
-        
+
         Returns:
             avg_meta_loss: Average loss across all tasks after adaptation
             avg_grad_norm: Average gradient norm for monitoring training
@@ -89,13 +95,17 @@ def meta_train_step(
 
         self.meta_optimizer.zero_grad()
 
-        for support_x, support_y, query_x, query_y in zip(*task_batch):
+        for task_tuple in task_batch:
+            # Unpack the task tuple
+            support_x, support_y, query_x, query_y = task_tuple
+            
             support_x = support_x.to(device)
             support_y = support_y.to(device)
             query_x = query_x.to(device)
             query_y = query_y.to(device)
 
             fast_weights = {name: param.clone() for name, param in self.named_parameters()}
+            
             # Multiple gradient steps for task adaptation
             for _ in range(3):  # Inner loop steps
                 support_pred = self.forward_with_fast_weights(support_x, fast_weights)
@@ -126,11 +136,14 @@ def meta_train_step(
 
             # Monitor gradient norms
             with torch.no_grad():
-                grad_norm = sum(
-                    param.grad.norm().item() ** 2 
-                    for param in self.parameters() 
-                    if param.grad is not None
-                ) ** 0.5
+                grad_norm = (
+                    sum(
+                        param.grad.norm().item() ** 2
+                        for param in self.parameters()
+                        if param.grad is not None
+                    )
+                    ** 0.5
+                )
                 total_grad_norm += grad_norm
 
         # Average and apply meta-update
@@ -169,6 +182,120 @@ def forward_with_fast_weights(
             fast_weights["output_layer.bias"],
         )
 
+    def compute_metrics(
+        self, y_pred: torch.Tensor, y_true: torch.Tensor
+    ) -> Dict[str, float]:
+        """Compute multiple metrics for model evaluation"""
+        with torch.no_grad():
+            mse = F.mse_loss(y_pred, y_true).item()
+            mae = F.l1_loss(y_pred, y_true).item()
+            r2 = r2_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
+
+        return {"mse": mse, "mae": mae, "r2": r2, "rmse": np.sqrt(mse)}
+
+    def visualize_adaptation(
+        self,
+        support_x: torch.Tensor,
+        support_y: torch.Tensor,
+        query_x: torch.Tensor,
+        query_y: torch.Tensor,
+        fast_weights: Dict[str, torch.Tensor],
+        task_name: str = "Task Adaptation"
+    ):
+        """Enhanced visualization with feature importance and learning curves"""
+        try:
+            plt.figure(figsize=(20, 5))
+            
+            # Plot 1: Predictions (now including support points)
+            plt.subplot(1, 4, 1)
+            with torch.no_grad():
+                initial_pred = self.forward(query_x)
+                adapted_pred = self.forward_with_fast_weights(query_x, fast_weights)
+                support_pred = self.forward(support_x)
+                
+            plt.scatter(query_y.cpu().numpy(), initial_pred.cpu().numpy(), 
+                       alpha=0.5, label='Query (Pre)')
+            plt.scatter(query_y.cpu().numpy(), adapted_pred.cpu().numpy(), 
+                       alpha=0.5, label='Query (Post)')
+            plt.scatter(support_y.cpu().numpy(), support_pred.cpu().numpy(), 
+                       alpha=0.5, label='Support', marker='x')
+            plt.plot([query_y.min().item(), query_y.max().item()], 
+                    [query_y.min().item(), query_y.max().item()], 
+                    'r--', label='Perfect')
+            plt.xlabel('True Values')
+            plt.ylabel('Predicted Values')
+            plt.title("Predictions vs True Values")
+            plt.legend()
+
+            # Plot 2: Feature Importance
+            plt.subplot(1, 4, 2)
+            with torch.no_grad():
+                feature_importance = torch.zeros(query_x.shape[1])
+                for i in range(query_x.shape[1]):
+                    perturbed_x = query_x.clone()
+                    perturbed_x[:, i] = torch.randn_like(perturbed_x[:, i])
+                    perturbed_pred = self.forward_with_fast_weights(perturbed_x, fast_weights)
+                    feature_importance[i] = F.mse_loss(perturbed_pred, adapted_pred)
+                    
+            plt.bar(range(len(feature_importance)), 
+                    feature_importance.cpu().numpy())
+            plt.xlabel('Feature Index')
+            plt.ylabel('Importance (MSE Impact)')
+            plt.title('Feature Importance')
+
+            # Plot 3: Error Distribution
+            plt.subplot(1, 4, 3)
+            initial_errors = (initial_pred - query_y).cpu().numpy()
+            adapted_errors = (adapted_pred - query_y).cpu().numpy()
+            plt.hist(initial_errors, alpha=0.5, label='Pre-Adaptation', bins=20)
+            plt.hist(adapted_errors, alpha=0.5, label='Post-Adaptation', bins=20)
+            plt.xlabel('Prediction Error')
+            plt.ylabel('Count')
+            plt.title('Error Distribution')
+            plt.legend()
+
+            # Plot 4: Adaptation Progress
+            plt.subplot(1, 4, 4)
+            progress_x = query_x[:5]  # Track few points for visualization
+            progress_y = query_y[:5]
+            adaptation_steps = []
+            
+            temp_weights = {name: param.clone() for name, param in self.named_parameters()}
+            for step in range(6):  # Track adaptation progress
+                with torch.no_grad():
+                    pred = self.forward_with_fast_weights(progress_x, temp_weights)
+                    adaptation_steps.append(F.mse_loss(pred, progress_y).item())
+                
+                if step < 5:  # Don't update on last step
+                    support_pred = self.forward_with_fast_weights(support_x, temp_weights)
+                    inner_loss = F.mse_loss(support_pred, support_y)
+                    grads = torch.autograd.grad(inner_loss, temp_weights.values())
+                    
+                    for (name, weight), grad in zip(temp_weights.items(), grads):
+                        temp_weights[name] = weight - self.inner_lr * grad
+
+            plt.plot(adaptation_steps, marker='o')
+            plt.xlabel('Adaptation Step')
+            plt.ylabel('MSE Loss')
+            plt.title('Adaptation Progress')
+            
+            # Add overall metrics
+            plt.suptitle(f"{task_name}\n" 
+                        f"MSE Before: {F.mse_loss(initial_pred, query_y):.4f}, "
+                        f"MSE After: {F.mse_loss(adapted_pred, query_y):.4f}\n"
+                        f"Adaptation Improvement: {((F.mse_loss(initial_pred, query_y) - F.mse_loss(adapted_pred, query_y)) / F.mse_loss(initial_pred, query_y) * 100):.1f}%")
+            
+            plt.tight_layout()
+            save_path = f'adaptation_plot_{task_name.replace(" ", "_")}.png'
+            plt.savefig(save_path, dpi=300, bbox_inches='tight')
+            logger.info(f"Saved visualization to {save_path}")
+            return plt.gcf()
+            
+        except Exception as e:
+            logger.error(f"Error in visualization: {str(e)}")
+            logger.error(f"Shapes - query_x: {query_x.shape}, query_y: {query_y.shape}")
+            return None
+
 
 def create_synthetic_tasks(
     num_tasks: int = 100,
@@ -179,13 +306,13 @@ def create_synthetic_tasks(
     """
     Creates a set of synthetic regression tasks for meta-learning training.
     Each task represents a different non-linear function with controlled complexity.
-    
+
     Task Generation Process:
     1. Generate normalized input features
     2. Create task-specific transformation
     3. Add controlled noise for robustness
     4. Split into support (training) and query (testing) sets
-    
+
     Returns:
         List of (support_x, support_y, query_x, query_y) tuples for each task
     """
@@ -198,12 +325,16 @@ def create_synthetic_tasks(
         x = (x - x.mean(0)) / (x.std(0) + 1e-8)  # Standardize inputs
 
         # 2. Create task-specific transformation
-        coefficients = torch.randn(input_size, output_size) * 0.3  # Random linear transformation
+        coefficients = (
+            torch.randn(input_size, output_size) * 0.3
+        )  # Random linear transformation
         bias = torch.randn(output_size) * 0.1  # Random bias term
 
         # 3. Generate outputs with multiple non-linearities
         y = torch.matmul(x, coefficients) + bias  # Linear component
-        y += 0.15 * torch.sin(2.0 * torch.matmul(x, coefficients))  # Sinusoidal component
+        y += 0.15 * torch.sin(
+            2.0 * torch.matmul(x, coefficients)
+        )  # Sinusoidal component
         y += 0.08 * torch.tanh(1.5 * torch.matmul(x, coefficients))  # Tanh component
 
         # 4. Add adaptive noise based on signal magnitude
@@ -211,12 +342,14 @@ def create_synthetic_tasks(
         y += noise_scale * torch.randn_like(y)
 
         # 5. Split into support and query sets
-        tasks.append((
-            x[:split_idx],    # support_x: First half of inputs
-            y[:split_idx],    # support_y: First half of outputs
-            x[split_idx:],    # query_x:   Second half of inputs
-            y[split_idx:]     # query_y:   Second half of outputs
-        ))
+        tasks.append(
+            (
+                x[:split_idx],  # support_x: First half of inputs
+                y[:split_idx],  # support_y: First half of outputs
+                x[split_idx:],  # query_x:   Second half of inputs
+                y[split_idx:],  # query_y:   Second half of outputs
+            )
+        )
 
     return tasks
 
@@ -226,18 +359,38 @@ def create_task_dataloader(
     batch_size: int = 4,
 ) -> DataLoader:
     """
-    Organizes tasks into batches for efficient training.
-    Shuffles tasks to prevent learning order dependencies.
+    Creates a DataLoader that returns batches of tasks.
+    Each batch contains batch_size tasks, where each task is a tuple of (support_x, support_y, query_x, query_y).
     """
-    # Reorganize tasks into separate lists
-    support_x = [t[0] for t in tasks]
-    support_y = [t[1] for t in tasks]
-    query_x = [t[2] for t in tasks]
-    query_y = [t[3] for t in tasks]
+    return DataLoader(
+        tasks,
+        batch_size=batch_size,
+        shuffle=True,
+        collate_fn=lambda x: [  # Custom collate function to maintain tuple structure
+            (support_x, support_y, query_x, query_y) 
+            for support_x, support_y, query_x, query_y in x
+        ]
+    )
 
-    # Create dataset and return DataLoader with shuffling
-    dataset = list(zip(support_x, support_y, query_x, query_y))
-    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+def analyze_task_difficulty(
+    task: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+) -> Dict[str, float]:
+    """Analyze the complexity and difficulty of a given task"""
+    support_x, support_y, query_x, query_y = task
+
+    metrics = {
+        "input_variance": torch.var(support_x).item(),
+        "output_variance": torch.var(support_y).item(),
+        "input_output_correlation": torch.corrcoef(
+            torch.stack([support_x[:, 0], support_y.squeeze()])
+        )[0, 1].item(),
+        "support_query_distribution_shift": torch.norm(
+            support_x.mean(0) - query_x.mean(0)
+        ).item(),
+    }
+
+    return metrics
 
 
 if __name__ == "__main__":
@@ -266,26 +419,52 @@ def create_task_dataloader(
         input_size=INPUT_SIZE, hidden_sizes=HIDDEN_SIZES, output_size=OUTPUT_SIZE
     ).to(device)
 
-    # Training loop
+    # Setup logging
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    logger = logging.getLogger(__name__)
+
+    # Optional: Initialize wandb for experiment tracking
+    # wandb.init(project="maml-tutorial", config={"batch_size": BATCH_SIZE, "num_tasks": NUM_TASKS})
+
+    # Training loop with enhanced monitoring
     num_epochs = 20
     best_loss = float("inf")
     patience_counter = 0
     patience_limit = 5  # Early stopping patience
 
     for epoch in range(num_epochs):
-        total_loss = 0.0
-        total_grad_norm = 0.0
-
-        for batch in task_dataloader:
-            loss, grad_norm = meta_model.meta_train_step(batch, device)
-            total_loss += loss
-            total_grad_norm += grad_norm
+        logger.info(f"Starting epoch {epoch+1}/{num_epochs}")
+        
+        for batch_idx, task_batch in enumerate(task_dataloader):
+            loss, grad_norm = meta_model.meta_train_step(task_batch, device)
+            
+            if batch_idx % 5 == 0:
+                logger.info(f"Batch {batch_idx} - Loss: {loss:.4f}, Grad Norm: {grad_norm:.4f}")
+                
+                # Get first task from batch for visualization
+                support_x, support_y, query_x, query_y = task_batch[0]
+                
+                logger.info(f"Visualization data shapes:")
+                logger.info(f"Support X: {support_x.shape}, Support Y: {support_y.shape}")
+                logger.info(f"Query X: {query_x.shape}, Query Y: {query_y.shape}")
+                
+                fig = meta_model.visualize_adaptation(
+                    support_x.to(device), 
+                    support_y.to(device),
+                    query_x.to(device), 
+                    query_y.to(device),
+                    {name: param.clone() for name, param in meta_model.named_parameters()},
+                    f"Epoch_{epoch+1}_Batch_{batch_idx}"
+                )
+                
+                if fig is not None:
+                    plt.close(fig)
 
-        avg_loss = total_loss / len(task_dataloader)
-        avg_grad_norm = total_grad_norm / len(task_dataloader)
-        print(
-            f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Grad Norm: {avg_grad_norm:.4f}"
-        )
+        avg_loss = loss / len(task_dataloader)
+        logger.info(f"Epoch {epoch+1} complete - Average Loss: {avg_loss:.4f}")
 
         # Learning rate scheduling
         meta_model.scheduler.step(avg_loss)

From de82cbeaa896cbc6bd73ef4725d04a9a82317642 Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 17:05:17 +0100
Subject: [PATCH 4/7] docs: add Model-Agnostic Meta-Learning (MAML) PoC
 description

Added documentation for the model_agnostic_meta_learning.py proof of concept,
highlighting its key features including:
- Meta-learning architecture with skip connections
- Task generation and adaptation capabilities
- Visualization and analysis tools
- Rapid context adaptation functionality
---
 README.md                                     |  16 +++
 src/maml.md                                   | 129 ++++++++++++++++++
 ...y => maml_model_agnostic_meta_learning.py} |  87 ++++++------
 3 files changed, 188 insertions(+), 44 deletions(-)
 create mode 100644 src/maml.md
 rename src/{model_agnostic_meta_learning.py => maml_model_agnostic_meta_learning.py} (93%)

diff --git a/README.md b/README.md
index 56755c7..f0fb436 100644
--- a/README.md
+++ b/README.md
@@ -96,6 +96,22 @@ This PoC demonstrates a practical application of the Narrative Field System in a
 
 This implementation showcases how the Narrative Field System can be applied to analyze and track narrative dynamics in a specific context.
 
+### model_agnostic_meta_learning.py
+
+This PoC implements a Model-Agnostic Meta-Learning (MAML) approach for rapid adaptation of narrative models to new contexts. Key features include:
+
+- `MetaModelGenerator`: Core meta-learning architecture with:
+  - Skip connections for improved gradient flow
+  - Adaptive learning rate scheduling
+  - Enhanced visualization capabilities
+  - Robust error handling and metrics tracking
+- Task generation system for synthetic narrative scenarios
+- Multi-step adaptation process with gradient clipping
+- Comprehensive visualization suite for adaptation analysis
+- Built-in feature importance analysis and learning curve tracking
+
+This implementation enables the system to quickly adapt to new narrative contexts with minimal data, making it particularly valuable for modeling emerging story dynamics.
+
 ## Development Guidelines
 
 - Follow PEP 8 style guide and use Black for code formatting.
diff --git a/src/maml.md b/src/maml.md
new file mode 100644
index 0000000..a5280f3
--- /dev/null
+++ b/src/maml.md
@@ -0,0 +1,129 @@
+# Model-Agnostic Meta-Learning (MAML) for Adaptive Regression
+
+## Abstract
+
+This paper presents an implementation and analysis of Model-Agnostic Meta-Learning (MAML) applied to adaptive regression tasks. We demonstrate how MAML enables rapid adaptation to new tasks through a neural network architecture with skip connections and carefully tuned meta-learning components. Our experiments show significant improvements in prediction accuracy after just a few gradient steps of task-specific adaptation.
+
+## 1. Introduction
+
+Meta-learning, or "learning to learn", aims to create models that can quickly adapt to new tasks with minimal training data. MAML achieves this by explicitly optimizing the model's initial parameters such that a small number of gradient steps will produce good performance on a new task. Our implementation focuses on regression problems with the following key features:
+
+- Multi-layer neural network with skip connections for improved gradient flow
+- Controlled synthetic task generation for systematic evaluation
+- Comprehensive visualization and analysis tools
+- Robust training with gradient clipping and early stopping
+
+## 2. Architecture
+
+### 2.1 Model Design
+
+The core architecture consists of:
+
+- Input layer: Linear transformation to first hidden layer
+- Hidden layers: Multiple layers with skip connections and ReLU activation
+- Output layer: Linear transformation to prediction space
+
+Skip connections help maintain gradient flow during both meta-training and adaptation. Weight initialization uses small normal distributions (σ=0.01) to prevent initial predictions from being too extreme.
+
+### 2.2 Meta-Learning Components
+
+Key meta-learning elements include:
+
+- Inner loop optimization: Task-specific adaptation using SGD
+- Outer loop optimization: Meta-parameter updates using SGD with momentum
+- Learning rate scheduling: ReduceLROnPlateau for automatic adjustment
+- Gradient clipping: Both inner and outer loops for stability
+
+## 3. Task Generation
+
+Synthetic tasks are generated with controlled complexity:
+
+1. Normalized input features
+2. Task-specific random transformations
+3. Multiple non-linear components (linear, sinusoidal, hyperbolic tangent)
+4. Adaptive noise based on signal magnitude
+5. Support/query set splitting for evaluation
+
+## 4. Training Process
+
+The training pipeline includes:
+
+- Batch processing of multiple tasks
+- Multiple gradient steps for task adaptation
+- Comprehensive metric tracking
+- Early stopping based on validation performance
+- Learning rate adjustment based on loss plateaus
+
+## 5. Results and Analysis
+
+Our implementation demonstrates:
+
+- Rapid adaptation to new tasks (3-5 gradient steps)
+- Robust performance across varying task complexities
+- Effective feature importance identification
+- Clear visualization of adaptation progress
+
+### 5.1 Visualization Components
+
+We provide detailed visualizations including:
+
+1. Pre/post adaptation predictions
+2. Feature importance analysis
+3. Error distribution changes
+4. Adaptation learning curves
+
+## 6. Implementation Details
+
+Key technical features:
+
+- PyTorch implementation with GPU support
+- Type hints for improved code clarity
+- Comprehensive error handling
+- Modular design for easy extension
+- Logging and optional experiment tracking
+
+## 7. Conclusion
+
+Our MAML implementation successfully demonstrates rapid adaptation capabilities for regression tasks. The architecture and training process provide a robust foundation for meta-learning applications, with clear visualization and analysis tools for understanding model behavior.
+
+## References
+
+1. Finn, C., Abbeel, P., & Levine, S. (2017). [Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks.](https://arxiv.org/abs/1703.03400)
+2. Antoniou, A., Edwards, H., & Storkey, A. (2019). [How to train your MAML](https://arxiv.org/abs/1810.09502).
+
+## Appendix: Code Structure
+
+The implementation is organized into key components:
+
+1. MetaModelGenerator: Core meta-learning model
+2. Task generation utilities
+3. Training and evaluation loops
+4. Visualization and analysis tools
+
+For detailed implementation, see the accompanying source code.
+
+## Citations
+
+```bibtex
+@misc{finn2017modelagnosticmetalearningfastadaptation,
+      title={Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks}, 
+      author={Chelsea Finn and Pieter Abbeel and Sergey Levine},
+      year={2017},
+      eprint={1703.03400},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/1703.03400}, 
+}
+```
+
+```bibtex
+@misc{antoniou2019trainmaml,
+      title={How to train your MAML}, 
+      author={Antreas Antoniou and Harrison Edwards and Amos Storkey},
+      year={2019},
+      eprint={1810.09502},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/1810.09502}, 
+}
+```
diff --git a/src/model_agnostic_meta_learning.py b/src/maml_model_agnostic_meta_learning.py
similarity index 93%
rename from src/model_agnostic_meta_learning.py
rename to src/maml_model_agnostic_meta_learning.py
index 01cabcf..21062b1 100644
--- a/src/model_agnostic_meta_learning.py
+++ b/src/maml_model_agnostic_meta_learning.py
@@ -205,14 +205,14 @@ def visualize_adaptation(
         """Enhanced visualization with feature importance and learning curves"""
         try:
             plt.figure(figsize=(20, 5))
-            
+
             # Plot 1: Predictions (now including support points)
             plt.subplot(1, 4, 1)
             with torch.no_grad():
                 initial_pred = self.forward(query_x)
                 adapted_pred = self.forward_with_fast_weights(query_x, fast_weights)
                 support_pred = self.forward(support_x)
-                
+
             plt.scatter(query_y.cpu().numpy(), initial_pred.cpu().numpy(), 
                        alpha=0.5, label='Query (Pre)')
             plt.scatter(query_y.cpu().numpy(), adapted_pred.cpu().numpy(), 
@@ -222,13 +222,9 @@ def visualize_adaptation(
             plt.plot([query_y.min().item(), query_y.max().item()], 
                     [query_y.min().item(), query_y.max().item()], 
                     'r--', label='Perfect')
-            plt.xlabel('True Values')
-            plt.ylabel('Predicted Values')
-            plt.title("Predictions vs True Values")
-            plt.legend()
-
-            # Plot 2: Feature Importance
-            plt.subplot(1, 4, 2)
+            self._extracted_from_visualize_adaptation_30(
+                'True Values', 'Predicted Values', "Predictions vs True Values", 2
+            )
             with torch.no_grad():
                 feature_importance = torch.zeros(query_x.shape[1])
                 for i in range(query_x.shape[1]):
@@ -236,66 +232,74 @@ def visualize_adaptation(
                     perturbed_x[:, i] = torch.randn_like(perturbed_x[:, i])
                     perturbed_pred = self.forward_with_fast_weights(perturbed_x, fast_weights)
                     feature_importance[i] = F.mse_loss(perturbed_pred, adapted_pred)
-                    
+
             plt.bar(range(len(feature_importance)), 
                     feature_importance.cpu().numpy())
-            plt.xlabel('Feature Index')
-            plt.ylabel('Importance (MSE Impact)')
-            plt.title('Feature Importance')
-
+            self._extracted_from_visualize_adaptation_30(
+                'Feature Index', 'Importance (MSE Impact)', 'Feature Importance'
+            )
             # Plot 3: Error Distribution
             plt.subplot(1, 4, 3)
             initial_errors = (initial_pred - query_y).cpu().numpy()
             adapted_errors = (adapted_pred - query_y).cpu().numpy()
             plt.hist(initial_errors, alpha=0.5, label='Pre-Adaptation', bins=20)
             plt.hist(adapted_errors, alpha=0.5, label='Post-Adaptation', bins=20)
-            plt.xlabel('Prediction Error')
-            plt.ylabel('Count')
-            plt.title('Error Distribution')
-            plt.legend()
-
-            # Plot 4: Adaptation Progress
-            plt.subplot(1, 4, 4)
+            self._extracted_from_visualize_adaptation_30(
+                'Prediction Error', 'Count', 'Error Distribution', 4
+            )
             progress_x = query_x[:5]  # Track few points for visualization
             progress_y = query_y[:5]
             adaptation_steps = []
-            
+
             temp_weights = {name: param.clone() for name, param in self.named_parameters()}
             for step in range(6):  # Track adaptation progress
                 with torch.no_grad():
                     pred = self.forward_with_fast_weights(progress_x, temp_weights)
                     adaptation_steps.append(F.mse_loss(pred, progress_y).item())
-                
+
                 if step < 5:  # Don't update on last step
                     support_pred = self.forward_with_fast_weights(support_x, temp_weights)
                     inner_loss = F.mse_loss(support_pred, support_y)
                     grads = torch.autograd.grad(inner_loss, temp_weights.values())
-                    
+
                     for (name, weight), grad in zip(temp_weights.items(), grads):
                         temp_weights[name] = weight - self.inner_lr * grad
 
             plt.plot(adaptation_steps, marker='o')
-            plt.xlabel('Adaptation Step')
-            plt.ylabel('MSE Loss')
-            plt.title('Adaptation Progress')
-            
+            self._extracted_from_visualize_adaptation_30(
+                'Adaptation Step', 'MSE Loss', 'Adaptation Progress'
+            )
             # Add overall metrics
             plt.suptitle(f"{task_name}\n" 
                         f"MSE Before: {F.mse_loss(initial_pred, query_y):.4f}, "
                         f"MSE After: {F.mse_loss(adapted_pred, query_y):.4f}\n"
                         f"Adaptation Improvement: {((F.mse_loss(initial_pred, query_y) - F.mse_loss(adapted_pred, query_y)) / F.mse_loss(initial_pred, query_y) * 100):.1f}%")
-            
+
             plt.tight_layout()
             save_path = f'adaptation_plot_{task_name.replace(" ", "_")}.png'
             plt.savefig(save_path, dpi=300, bbox_inches='tight')
             logger.info(f"Saved visualization to {save_path}")
             return plt.gcf()
-            
+
         except Exception as e:
             logger.error(f"Error in visualization: {str(e)}")
             logger.error(f"Shapes - query_x: {query_x.shape}, query_y: {query_y.shape}")
             return None
 
+    # TODO Rename this here and in `visualize_adaptation`
+    def _extracted_from_visualize_adaptation_30(self, arg0, arg1, arg2, arg3):
+        self._extracted_from_visualize_adaptation_30(arg0, arg1, arg2)
+        plt.legend()
+
+            # Plot 2: Feature Importance
+        plt.subplot(1, 4, arg3)
+
+    # TODO Rename this here and in `visualize_adaptation`
+    def _extracted_from_visualize_adaptation_30(self, arg0, arg1, arg2):
+        plt.xlabel(arg0)
+        plt.ylabel(arg1)
+        plt.title(arg2)
+
 
 def create_synthetic_tasks(
     num_tasks: int = 100,
@@ -366,10 +370,7 @@ def create_task_dataloader(
         tasks,
         batch_size=batch_size,
         shuffle=True,
-        collate_fn=lambda x: [  # Custom collate function to maintain tuple structure
-            (support_x, support_y, query_x, query_y) 
-            for support_x, support_y, query_x, query_y in x
-        ]
+        collate_fn=lambda x: list(x),
     )
 
 
@@ -379,7 +380,7 @@ def analyze_task_difficulty(
     """Analyze the complexity and difficulty of a given task"""
     support_x, support_y, query_x, query_y = task
 
-    metrics = {
+    return {
         "input_variance": torch.var(support_x).item(),
         "output_variance": torch.var(support_y).item(),
         "input_output_correlation": torch.corrcoef(
@@ -390,8 +391,6 @@ def analyze_task_difficulty(
         ).item(),
     }
 
-    return metrics
-
 
 if __name__ == "__main__":
     # Configuration
@@ -437,20 +436,20 @@ def analyze_task_difficulty(
 
     for epoch in range(num_epochs):
         logger.info(f"Starting epoch {epoch+1}/{num_epochs}")
-        
+
         for batch_idx, task_batch in enumerate(task_dataloader):
             loss, grad_norm = meta_model.meta_train_step(task_batch, device)
-            
+
             if batch_idx % 5 == 0:
                 logger.info(f"Batch {batch_idx} - Loss: {loss:.4f}, Grad Norm: {grad_norm:.4f}")
-                
+
                 # Get first task from batch for visualization
                 support_x, support_y, query_x, query_y = task_batch[0]
-                
-                logger.info(f"Visualization data shapes:")
+
+                logger.info("Visualization data shapes:")
                 logger.info(f"Support X: {support_x.shape}, Support Y: {support_y.shape}")
                 logger.info(f"Query X: {query_x.shape}, Query Y: {query_y.shape}")
-                
+
                 fig = meta_model.visualize_adaptation(
                     support_x.to(device), 
                     support_y.to(device),
@@ -459,7 +458,7 @@ def analyze_task_difficulty(
                     {name: param.clone() for name, param in meta_model.named_parameters()},
                     f"Epoch_{epoch+1}_Batch_{batch_idx}"
                 )
-                
+
                 if fig is not None:
                     plt.close(fig)
 

From 0ada788f8d566268e9f7666193149b2a4acb60f7 Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 17:09:41 +0100
Subject: [PATCH 5/7] feat(maml): implement model-agnostic meta-learning with
 synthetic task generation

This commit adds the implementation of model-agnostic meta-learning (MAML) with synthetic task generation. The code changes include the addition of a sequence diagram that illustrates the process of generating meta-models, task batches, and performing fast weight updates. This implementation allows the system to quickly adapt to new narrative contexts with minimal data, making it valuable for modeling emerging story dynamics.
---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index f0fb436..b6c41f4 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,30 @@ This PoC implements a Model-Agnostic Meta-Learning (MAML) approach for rapid ada
 - Comprehensive visualization suite for adaptation analysis
 - Built-in feature importance analysis and learning curve tracking
 
+```mermaid
+sequenceDiagram
+    participant MetaModelGenerator
+    participant TaskBatch
+    participant Device
+    participant FastWeights
+    participant MetaOptimizer
+    participant Scheduler
+
+    MetaModelGenerator->>TaskBatch: Get task batch
+    loop for each task in task_batch
+        MetaModelGenerator->>FastWeights: Clone parameters
+        loop Inner loop steps
+            FastWeights->>MetaModelGenerator: forward_with_fast_weights(support_x)
+            MetaModelGenerator->>FastWeights: Compute gradients
+            FastWeights->>FastWeights: Update fast weights
+        end
+        FastWeights->>MetaModelGenerator: forward_with_fast_weights(query_x)
+        MetaModelGenerator->>MetaOptimizer: Accumulate meta-gradients
+    end
+    MetaOptimizer->>MetaModelGenerator: Apply meta-update
+    MetaModelGenerator->>Scheduler: Step with avg_loss
+```
+
 This implementation enables the system to quickly adapt to new narrative contexts with minimal data, making it particularly valuable for modeling emerging story dynamics.
 
 ## Development Guidelines

From b2fc926be231a427ff59ea6f27215fab0cdf775c Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 17:12:32 +0100
Subject: [PATCH 6/7] Refactor plot formatting in MetaModelGenerator

---
 src/maml_model_agnostic_meta_learning.py | 51 ++++++++++--------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/src/maml_model_agnostic_meta_learning.py b/src/maml_model_agnostic_meta_learning.py
index 21062b1..8634bbe 100644
--- a/src/maml_model_agnostic_meta_learning.py
+++ b/src/maml_model_agnostic_meta_learning.py
@@ -193,6 +193,14 @@ def compute_metrics(
 
         return {"mse": mse, "mae": mae, "r2": r2, "rmse": np.sqrt(mse)}
 
+    def _setup_plot_formatting(self, xlabel: str, ylabel: str, title: str, add_legend: bool = True):
+        """Helper method to set up common plot formatting"""
+        plt.xlabel(xlabel)
+        plt.ylabel(ylabel)
+        plt.title(title)
+        if add_legend:
+            plt.legend()
+
     def visualize_adaptation(
         self,
         support_x: torch.Tensor,
@@ -206,7 +214,7 @@ def visualize_adaptation(
         try:
             plt.figure(figsize=(20, 5))
 
-            # Plot 1: Predictions (now including support points)
+            # Plot 1: Predictions
             plt.subplot(1, 4, 1)
             with torch.no_grad():
                 initial_pred = self.forward(query_x)
@@ -222,9 +230,10 @@ def visualize_adaptation(
             plt.plot([query_y.min().item(), query_y.max().item()], 
                     [query_y.min().item(), query_y.max().item()], 
                     'r--', label='Perfect')
-            self._extracted_from_visualize_adaptation_30(
-                'True Values', 'Predicted Values', "Predictions vs True Values", 2
-            )
+            self._setup_plot_formatting('True Values', 'Predicted Values', 'Predictions vs True Values')
+
+            # Plot 2: Feature Importance
+            plt.subplot(1, 4, 2)
             with torch.no_grad():
                 feature_importance = torch.zeros(query_x.shape[1])
                 for i in range(query_x.shape[1]):
@@ -233,20 +242,19 @@ def visualize_adaptation(
                     perturbed_pred = self.forward_with_fast_weights(perturbed_x, fast_weights)
                     feature_importance[i] = F.mse_loss(perturbed_pred, adapted_pred)
 
-            plt.bar(range(len(feature_importance)), 
-                    feature_importance.cpu().numpy())
-            self._extracted_from_visualize_adaptation_30(
-                'Feature Index', 'Importance (MSE Impact)', 'Feature Importance'
-            )
+            plt.bar(range(len(feature_importance)), feature_importance.cpu().numpy())
+            self._setup_plot_formatting('Feature Index', 'Importance (MSE Impact)', 'Feature Importance', False)
+
             # Plot 3: Error Distribution
             plt.subplot(1, 4, 3)
             initial_errors = (initial_pred - query_y).cpu().numpy()
             adapted_errors = (adapted_pred - query_y).cpu().numpy()
             plt.hist(initial_errors, alpha=0.5, label='Pre-Adaptation', bins=20)
             plt.hist(adapted_errors, alpha=0.5, label='Post-Adaptation', bins=20)
-            self._extracted_from_visualize_adaptation_30(
-                'Prediction Error', 'Count', 'Error Distribution', 4
-            )
+            self._setup_plot_formatting('Prediction Error', 'Count', 'Error Distribution')
+
+            # Plot 4: Adaptation Progress
+            plt.subplot(1, 4, 4)
             progress_x = query_x[:5]  # Track few points for visualization
             progress_y = query_y[:5]
             adaptation_steps = []
@@ -266,9 +274,8 @@ def visualize_adaptation(
                         temp_weights[name] = weight - self.inner_lr * grad
 
             plt.plot(adaptation_steps, marker='o')
-            self._extracted_from_visualize_adaptation_30(
-                'Adaptation Step', 'MSE Loss', 'Adaptation Progress'
-            )
+            self._setup_plot_formatting('Adaptation Step', 'MSE Loss', 'Adaptation Progress')
+
             # Add overall metrics
             plt.suptitle(f"{task_name}\n" 
                         f"MSE Before: {F.mse_loss(initial_pred, query_y):.4f}, "
@@ -286,20 +293,6 @@ def visualize_adaptation(
             logger.error(f"Shapes - query_x: {query_x.shape}, query_y: {query_y.shape}")
             return None
 
-    # TODO Rename this here and in `visualize_adaptation`
-    def _extracted_from_visualize_adaptation_30(self, arg0, arg1, arg2, arg3):
-        self._extracted_from_visualize_adaptation_30(arg0, arg1, arg2)
-        plt.legend()
-
-            # Plot 2: Feature Importance
-        plt.subplot(1, 4, arg3)
-
-    # TODO Rename this here and in `visualize_adaptation`
-    def _extracted_from_visualize_adaptation_30(self, arg0, arg1, arg2):
-        plt.xlabel(arg0)
-        plt.ylabel(arg1)
-        plt.title(arg2)
-
 
 def create_synthetic_tasks(
     num_tasks: int = 100,

From 358fb314040f52debb894aa5977b85a76cc6aafe Mon Sep 17 00:00:00 2001
From: Leon van Bokhorst <post@leonvanbokhorst.com>
Date: Fri, 8 Nov 2024 17:17:11 +0100
Subject: [PATCH 7/7] Refactor plot formatting in MetaModelGenerator

---
 src/maml_model_agnostic_meta_learning.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/maml_model_agnostic_meta_learning.py b/src/maml_model_agnostic_meta_learning.py
index 8634bbe..1165149 100644
--- a/src/maml_model_agnostic_meta_learning.py
+++ b/src/maml_model_agnostic_meta_learning.py
@@ -122,6 +122,8 @@ def meta_train_step(
                 # Update fast weights with gradient clipping
                 for (name, weight), grad in zip(fast_weights.items(), grads):
                     if grad is not None:
+                        torch.nn.utils.clip_grad_norm_(grad, max_norm=1.0)  # Global norm clipping
+                        fast_weights[name] = weight - self.inner_lr * grad
                         clipped_grad = torch.clamp(grad, -1.0, 1.0)  # Stability
                         fast_weights[name] = weight - self.inner_lr * clipped_grad
 
@@ -191,7 +193,14 @@ def compute_metrics(
             mae = F.l1_loss(y_pred, y_true).item()
             r2 = r2_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
 
-        return {"mse": mse, "mae": mae, "r2": r2, "rmse": np.sqrt(mse)}
+        def compute_metrics(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> Dict[str, float]:
+            """Compute multiple metrics for model evaluation"""
+            with torch.no_grad():
+                mse = F.mse_loss(y_pred, y_true).item()
+                mae = F.l1_loss(y_pred, y_true).item()
+                r2 = r2_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
+                rmse = np.sqrt(mse)
+            return {"mse": mse, "mae": mae, "r2": r2, "rmse": rmse}
 
     def _setup_plot_formatting(self, xlabel: str, ylabel: str, title: str, add_legend: bool = True):
         """Helper method to set up common plot formatting"""
@@ -277,6 +286,13 @@ def visualize_adaptation(
             self._setup_plot_formatting('Adaptation Step', 'MSE Loss', 'Adaptation Progress')
 
             # Add overall metrics
+            def setup_visualization_plot(x_data, y_data, plot_title, legend_labels):
+                plt.xlabel(x_data)
+                plt.ylabel(y_data)
+                plt.title(plot_title)
+                if legend_labels:
+                    plt.legend()
+
             plt.suptitle(f"{task_name}\n" 
                         f"MSE Before: {F.mse_loss(initial_pred, query_y):.4f}, "
                         f"MSE After: {F.mse_loss(adapted_pred, query_y):.4f}\n"