gaarutyunov
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 55 additions & 1 deletion b/‎.github/workflows/test.yml‎
Lines changed: 55 additions & 1 deletion
diff --git a/‎swift/CrossLanguageTests/CrossLanguageTests.swift‎
Lines changed: 288 additions & 0 deletions b/‎swift/CrossLanguageTests/CrossLanguageTests.swift‎
Lines changed: 288 additions & 0 deletions
@@ -79,6 +79,60 @@ jobs:
         pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
         pip install -e .
     
-    - name: Run tests
+    - name: Run Python tests
       run: |
         pytest tests/test_model_comparison.py -v -s --tb=short
+
+    - name: Export test data for cross-language tests
+      run: |
+        python tests/export_test_data.py
+
+  swift-tests:
+    runs-on: macos-14  # macOS with Apple Silicon
+    needs: test  # Run after Python tests to ensure test data is exported
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Swift
+      uses: swift-actions/setup-swift@v1
+      with:
+        swift-version: "5.9"
+
+    - name: Set up Python (for test data export)
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.11"
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install mlx mlx-lm numpy
+        pip install -e .
+
+    - name: Export test data
+      run: |
+        python tests/export_test_data.py
+
+    - name: Cache Swift packages
+      uses: actions/cache@v3
+      with:
+        path: swift/.build
+        key: ${{ runner.os }}-swift-${{ hashFiles('swift/Package.swift') }}
+        restore-keys: |
+          ${{ runner.os }}-swift-
+
+    - name: Build Swift package
+      working-directory: swift
+      run: |
+        swift build -c release
+
+    - name: Run Swift unit tests
+      working-directory: swift
+      run: |
+        swift test --filter VJEPA2Tests
+
+    - name: Run cross-language tests
+      working-directory: swift
+      run: |
+        swift test --filter CrossLanguageTests
@@ -0,0 +1,288 @@
+// Cross-language tests comparing Python and Swift implementations
+// These tests load test data exported from the Python implementation
+// and verify that the Swift implementation produces identical outputs
+
+import XCTest
+import MLX
+@testable import VJEPA2
+
+final class CrossLanguageTests: XCTestCase {
+    let testDataDir: URL = {
+        let currentFile = URL(fileURLWithPath: #file)
+        return currentFile
+            .deletingLastPathComponent()
+            .appendingPathComponent("TestData")
+    }()
+
+    let tolerance: Float = 1e-4  // Tolerance for float comparison
+
+    // MARK: - Helper Functions
+
+    /// Load numpy array from file
+    func loadNumpyArray(_ filename: String, in directory: URL) throws -> MLXArray {
+        let fileURL = directory.appendingPathComponent(filename)
+        let data = try Data(contentsOf: fileURL)
+
+        // Parse .npy file format
+        // This is a simplified parser - production code should use a proper library
+        // For now, we'll use a basic implementation
+        // NOTE: This requires proper .npy parsing which MLX Swift may provide
+        // If not available, you'll need to implement or use a library
+
+        fatalError("Implement .npy parsing for MLX Swift")
+    }
+
+    /// Compare two MLX arrays with tolerance
+    func assertArraysClose(_ a: MLXArray, _ b: MLXArray, tolerance: Float = 1e-4, file: StaticString = #file, line: UInt = #line) {
+        XCTAssertEqual(a.shape, b.shape, "Array shapes don't match", file: file, line: line)
+
+        let diff = abs(a - b)
+        let maxDiff = diff.max().item(Float.self)
+
+        XCTAssertLessThan(maxDiff, tolerance, "Arrays differ by more than tolerance: \(maxDiff)", file: file, line: line)
+    }
+
+    /// Load test case inputs and outputs
+    func loadTestCase(_ name: String) throws -> (inputs: [String: MLXArray], outputs: [String: MLXArray]) {
+        let caseDir = testDataDir.appendingPathComponent(name)
+
+        // Load metadata
+        let metadataURL = caseDir.appendingPathComponent("metadata.json")
+        let metadataData = try Data(contentsOf: metadataURL)
+        let metadata = try JSONDecoder().decode(TestCaseMetadata.self, from: metadataData)
+
+        // Load inputs
+        var inputs: [String: MLXArray] = [:]
+        for inputName in metadata.inputs {
+            inputs[inputName] = try loadNumpyArray("input_\(inputName).npy", in: caseDir)
+        }
+
+        // Load outputs
+        var outputs: [String: MLXArray] = [:]
+        for outputName in metadata.outputs {
+            outputs[outputName] = try loadNumpyArray("output_\(outputName).npy", in: caseDir)
+        }
+
+        return (inputs, outputs)
+    }
+
+    // MARK: - Positional Embedding Tests
+
+    func testPositionalEmbedding1D() throws {
+        let testCase = try loadTestCase("pos_embed_1d")
+
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+        let gridSize = Int(testCase.inputs["grid_size"]!.item(Int32.self))
+
+        let swiftOutput = get1DSinCosPositionEmbed(embedDim: embedDim, gridSize: gridSize)
+        let expectedOutput = testCase.outputs["pos_embed"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    func testPositionalEmbedding2D() throws {
+        let testCase = try loadTestCase("pos_embed_2d")
+
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+        let gridSize = Int(testCase.inputs["grid_size"]!.item(Int32.self))
+
+        let swiftOutput = get2DSinCosPositionEmbed(embedDim: embedDim, gridSize: gridSize)
+        let expectedOutput = testCase.outputs["pos_embed"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    func testPositionalEmbedding3D() throws {
+        let testCase = try loadTestCase("pos_embed_3d")
+
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+        let gridSize = Int(testCase.inputs["grid_size"]!.item(Int32.self))
+        let gridDepth = Int(testCase.inputs["grid_depth"]!.item(Int32.self))
+
+        let swiftOutput = get3DSinCosPositionEmbed(
+            embedDim: embedDim,
+            gridSize: gridSize,
+            gridDepth: gridDepth,
+            uniformPower: false
+        )
+        let expectedOutput = testCase.outputs["pos_embed"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - Patch Embedding Tests
+
+    func testPatchEmbed2D() throws {
+        let testCase = try loadTestCase("patch_embed_2d")
+
+        let patchSize = Int(testCase.inputs["patch_size"]!.item(Int32.self))
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+
+        let patchEmbed = PatchEmbed(patchSize: patchSize, inChannels: 3, embedDim: embedDim)
+        let swiftOutput = patchEmbed(testCase.inputs["image"]!)
+        let expectedOutput = testCase.outputs["patches"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    func testPatchEmbed3D() throws {
+        let testCase = try loadTestCase("patch_embed_3d")
+
+        let patchSize = Int(testCase.inputs["patch_size"]!.item(Int32.self))
+        let tubeletSize = Int(testCase.inputs["tubelet_size"]!.item(Int32.self))
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+
+        let patchEmbed = PatchEmbed3D(
+            patchSize: patchSize,
+            tubeletSize: tubeletSize,
+            inChannels: 3,
+            embedDim: embedDim
+        )
+        let swiftOutput = patchEmbed(testCase.inputs["video"]!)
+        let expectedOutput = testCase.outputs["patches"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - MLP Tests
+
+    func testMLPStandard() throws {
+        let testCase = try loadTestCase("mlp_standard")
+
+        let inFeatures = Int(testCase.inputs["in_features"]!.item(Int32.self))
+        let hiddenFeatures = Int(testCase.inputs["hidden_features"]!.item(Int32.self))
+
+        let mlp = MLP(inFeatures: inFeatures, hiddenFeatures: hiddenFeatures, useSiLU: false)
+        let swiftOutput = mlp(testCase.inputs["x"]!)
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - Attention Tests
+
+    func testAttentionStandard() throws {
+        let testCase = try loadTestCase("attention_standard")
+
+        let dim = Int(testCase.inputs["dim"]!.item(Int32.self))
+        let numHeads = Int(testCase.inputs["num_heads"]!.item(Int32.self))
+
+        let attention = Attention(dim: dim, numHeads: numHeads, qkvBias: true)
+        let swiftOutput = attention(testCase.inputs["x"]!)
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    func testAttentionRoPE() throws {
+        let testCase = try loadTestCase("attention_rope")
+
+        let dim = Int(testCase.inputs["dim"]!.item(Int32.self))
+        let numHeads = Int(testCase.inputs["num_heads"]!.item(Int32.self))
+        let gridSize = Int(testCase.inputs["grid_size"]!.item(Int32.self))
+
+        let ropeAttention = RoPEAttention(
+            dim: dim,
+            numHeads: numHeads,
+            gridSize: gridSize,
+            qkvBias: true
+        )
+        let swiftOutput = ropeAttention(
+            testCase.inputs["x"]!,
+            T: 1,
+            hPatches: 14,
+            wPatches: 14
+        )
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - Block Tests
+
+    func testBlockStandard() throws {
+        let testCase = try loadTestCase("block_standard")
+
+        let dim = Int(testCase.inputs["dim"]!.item(Int32.self))
+        let numHeads = Int(testCase.inputs["num_heads"]!.item(Int32.self))
+
+        let block = Block(
+            dim: dim,
+            numHeads: numHeads,
+            mlpRatio: 4.0,
+            qkvBias: true,
+            useRoPE: false
+        )
+        let swiftOutput = block(testCase.inputs["x"]!)
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - Vision Transformer Tests
+
+    func testVisionTransformerImage() throws {
+        let testCase = try loadTestCase("vit_image")
+
+        let vit = VisionTransformer(
+            imgSize: (224, 224),
+            patchSize: 16,
+            numFrames: 1,
+            embedDim: 768,
+            depth: 12,
+            numHeads: 12,
+            useRoPE: false
+        )
+        let swiftOutput = vit(testCase.inputs["image"]!)
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    func testVisionTransformerVideo() throws {
+        let testCase = try loadTestCase("vit_video")
+
+        let vit = VisionTransformer(
+            imgSize: (224, 224),
+            patchSize: 16,
+            numFrames: 16,
+            tubeletSize: 2,
+            embedDim: 768,
+            depth: 12,
+            numHeads: 12,
+            useRoPE: false
+        )
+        let swiftOutput = vit(testCase.inputs["video"]!)
+        let expectedOutput = testCase.outputs["output"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+
+    // MARK: - Attentive Classifier Tests
+
+    func testAttentiveClassifier() throws {
+        let testCase = try loadTestCase("attentive_classifier")
+
+        let embedDim = Int(testCase.inputs["embed_dim"]!.item(Int32.self))
+        let numClasses = Int(testCase.inputs["num_classes"]!.item(Int32.self))
+
+        let classifier = AttentiveClassifier(
+            embedDim: embedDim,
+            numHeads: 12,
+            depth: 1,
+            numClasses: numClasses
+        )
+        let swiftOutput = classifier(testCase.inputs["tokens"]!)
+        let expectedOutput = testCase.outputs["logits"]!
+
+        assertArraysClose(swiftOutput, expectedOutput, tolerance: tolerance)
+    }
+}
+
+// MARK: - Helper Structures
+
+struct TestCaseMetadata: Codable {
+    let name: String
+    let inputs: [String]
+    let outputs: [String]
+}