# ===----------------------------------------------------------------------=== #
#
# This file is Modular Inc proprietary.
#
# ===----------------------------------------------------------------------=== #
"""
Step 02: Feed-forward Network (MLP)
Implement the MLP used in each transformer block with GELU activation.
Tasks:
1. Import functional (as F), Tensor, Linear, and Module from MAX
2. Create c_fc linear layer (embedding to intermediate dimension)
3. Create c_proj linear layer (intermediate back to embedding dimension)
4. Apply c_fc transformation in forward pass
5. Apply GELU activation function
6. Apply c_proj transformation and return result
Run: pixi run s02
"""
# 1: Import the required modules from MAX
# TODO: Import functional module max.functional with the alias F
# https://docs.modular.com/max/api/python/functional/
# TODO: Import Tensor from max.tensor
# https://docs.modular.com/max/api/python/tensor/
# TODO: Import Linear and Module from max.nn
# https://docs.modular.com/max/api/python/nn/
import max.functional as F
from max.nn import Linear, Module
from max.tensor import Tensor

from step_01 import GPT2Config


class GPT2MLP(Module):
"""Feed-forward network matching HuggingFace GPT-2 structure.
Args:
intermediate_size: Size of the intermediate layer.
config: GPT-2 configuration.
"""
def __init__(self, intermediate_size: int, config: GPT2Config) -> None:
super().__init__()
embed_dim = config.n_embd
# 2: Create the first linear layer (embedding to intermediate)
# TODO: Create self.c_fc as a Linear layer from embed_dim to intermediate_size with bias=True
# https://docs.modular.com/max/api/python/nn/Linear
# Hint: This is the expansion layer in the MLP
self.c_fc = None
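
        # A sketch of one possible answer, spelling out the TODO above
        # (the argument order and bias keyword follow the task hint):
        #   self.c_fc = Linear(embed_dim, intermediate_size, bias=True)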

        # 3: Create the second linear layer (intermediate back to embedding)
        # TODO: Create self.c_proj as a Linear layer from intermediate_size to embed_dim with bias=True
        # https://docs.modular.com/max/api/python/nn/Linear
        # Hint: This is the projection layer that brings us back to the embedding dimension
        self.c_proj = None
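
        # A sketch of one possible answer, mirroring c_fc with the
        # dimensions swapped:
        #   self.c_proj = Linear(intermediate_size, embed_dim, bias=True)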

    def forward(self, hidden_states: Tensor) -> Tensor:
        """Apply feed-forward network.

        Args:
            hidden_states: Input hidden states.

        Returns:
            MLP output.
        """
        # 4: Apply the first linear transformation
        # TODO: Apply self.c_fc to hidden_states
        # Hint: This expands the hidden dimension to the intermediate size
        hidden_states = None
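
        # A sketch of one possible answer, assuming Linear layers are
        # invoked by calling them directly, as the TODO suggests:
        #   hidden_states = self.c_fc(hidden_states)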

        # 5: Apply GELU activation function
        # TODO: Use F.gelu() with hidden_states and approximate="tanh"
        # https://docs.modular.com/max/api/python/functional/#max.functional.gelu
        # Hint: GELU is the non-linear activation used in GPT-2's MLP
        hidden_states = None
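
        # A sketch of one possible answer; the function name and the
        # approximate="tanh" keyword come from the gelu docs linked above:
        #   hidden_states = F.gelu(hidden_states, approximate="tanh")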

        # 6: Apply the second linear transformation and return
        # TODO: Apply self.c_proj to hidden_states and return the result
        # Hint: This projects back to the embedding dimension
        return None
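
        # A sketch of one possible answer for the final step:
        #   return self.c_proj(hidden_states)


# A hypothetical usage sketch once the TODOs are filled in (GPT-2 uses an
# intermediate size of 4 * n_embd; exact call semantics depend on max.nn.Module):
#   mlp = GPT2MLP(4 * config.n_embd, config)
#   output = mlp(hidden_states)  # same shape as the input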