cjlin1 · shagunroperia · Oct 9, 2024 · Oct 10, 2024
diff --git a/python/examples/SVMRecipeClassifier/README.md b/python/examples/SVMRecipeClassifier/README.md
@@ -0,0 +1,43 @@
+# Binary Recipe Classifier using LIBSVM
+
+This project implements a binary recipe classifier using LIBSVM. It classifies recipes as either Italian or Mexican cuisine based on their ingredients. It serves as a demonstration of applying Support Vector Machines (SVM) to text classification tasks, specifically in the domain of recipe categorization.
+
+## Key Features
+
+- Binary classification of recipes (Italian vs Mexican)
+- Utilizes LIBSVM for efficient SVM implementation
+- Preprocesses text-based recipe data into numerical features
+- Includes a comprehensive test suite for validation
+
+## Requirements
+
+- Python 3.7+
+- NumPy
+- SciPy
+- pandas (imported by `recipe_classifier.py`)
+- LIBSVM
+
+
+## Usage
+
+To use the RecipeClassifier in your Python script:
+
+```python
+from recipe_classifier import RecipeClassifier
+
+# Initialize the classifier
+classifier = RecipeClassifier()
+
+# Train the classifier
+recipes = [
+    "pasta tomato basil olive_oil garlic",
+    "tortilla beans salsa avocado cilantro",
+    "pizza cheese tomato oregano",
+    "tacos beef lettuce cheese salsa"
+]
+cuisines = ["Italian", "Mexican", "Italian", "Mexican"]
+classifier.train(recipes, cuisines)
+
+# Make predictions
+new_recipes = ["lasagna pasta cheese tomato_sauce beef", "burrito rice beans salsa guacamole"]
+predictions = classifier.predict(new_recipes)
+print(predictions)
+```
diff --git a/python/examples/SVMRecipeClassifier/recipe_classifier.py b/python/examples/SVMRecipeClassifier/recipe_classifier.py
@@ -0,0 +1,152 @@
+"""
+Recipe Classifier using LIBSVM
+
+This module implements a binary classifier for Italian and Mexican recipes
+using LIBSVM. It's intended as a demonstration of how to use LIBSVM for
+text classification tasks.
+
+WARNING: Due to the extremely small dataset, this model overfits and does not
+generalize well. This implementation is for demonstration purposes only and
+should not be used for real-world applications without significant modifications.
+"""
+
+from libsvm.svmutil import *
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+import warnings
+
+class RecipeClassifier:
+    """A binary classifier for Italian and Mexican recipes using LIBSVM."""
+
+    def __init__(self):
+        """Initialize the RecipeClassifier."""
+        self.model = None
+        self.vocabulary = None
+
+    def preprocess_data(self, recipes, cuisines):
+        """
+        Preprocess the recipe data for LIBSVM.
+
+        Args:
+            recipes (list): List of recipe ingredient strings.
+            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').
+
+        Returns:
+            tuple: (X, y) where X is a sparse matrix of features and y is an array of labels.
+        """
+        # Create vocabulary
+        if self.vocabulary is None:
+            all_ingredients = set(' '.join(recipes).split())
+            self.vocabulary = {ingredient: idx for idx, ingredient in enumerate(all_ingredients)}
+
+        # Convert recipes to feature vectors
+        rows, cols, data = [], [], []
+        for idx, recipe in enumerate(recipes):
+            for ingredient in recipe.split():
+                if ingredient in self.vocabulary:
+                    rows.append(idx)
+                    cols.append(self.vocabulary[ingredient])
+                    data.append(1)
+
+        X = csr_matrix((data, (rows, cols)), shape=(len(recipes), len(self.vocabulary)))
+        y = np.array([1 if cuisine == 'Italian' else -1 for cuisine in cuisines])
+        return X, y
+
+    def train(self, recipes, cuisines):
+        """
+        Train the SVM model.
+
+        Args:
+            recipes (list): List of recipe ingredient strings.
+            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').
+        """
+        if len(recipes) < 20:
+            warnings.warn("The dataset is very small. The model is likely to overfit.")
+
+        X, y = self.preprocess_data(recipes, cuisines)
+
+        # Split data into training and validation sets
+        np.random.seed(42)
+        indices = np.random.permutation(len(recipes))
+        split = int(0.8 * len(recipes))
+        train_idx, val_idx = indices[:split], indices[split:]
+
+        X_train, y_train = X[train_idx], y[train_idx]
+        X_val, y_val = X[val_idx], y[val_idx]
+
+        # Convert to LIBSVM format
+        prob = svm_problem(y_train.tolist(), X_train.toarray().tolist())
+        param = svm_parameter('-t 0 -c 0.1')  # Linear kernel, C=0.1 for less overfitting
+        self.model = svm_train(prob, param)
+
+        # Validate the model
+        p_labels, _, _ = svm_predict(y_val.tolist(), X_val.toarray().tolist(), self.model)
+        accuracy = sum(1 for i, j in zip(p_labels, y_val) if i == j) / len(y_val)
+        print(f"Validation Accuracy: {accuracy:.2f}")
+
+        if accuracy == 1.0:
+            warnings.warn("Perfect validation accuracy suggests overfitting.")
+
+    def predict(self, new_recipes):
+        """
+        Predict cuisines for new recipes.
+
+        Args:
+            new_recipes (list): List of new recipe ingredient strings.
+
+        Returns:
+            list: Predicted cuisines ('Italian' or 'Mexican').
+        """
+        if self.model is None:
+            raise ValueError("Model has not been trained. Call train() first.")
+
+        X, _ = self.preprocess_data(new_recipes, [None] * len(new_recipes))
+        p_labels, _, _ = svm_predict([0] * X.shape[0], X.toarray().tolist(), self.model)
+        return ['Italian' if label > 0 else 'Mexican' for label in p_labels]
+
+
+def main():
+    """Demonstrate the usage of RecipeClassifier."""
+    classifier = RecipeClassifier()
+
+    # Sample data
+    recipes = [
+        "pasta tomato basil olive_oil garlic",
+        "tortilla beans salsa avocado cilantro",
+        "spaghetti meatballs tomato_sauce parmesan",
+        "tacos beef lettuce cheese salsa",
+        "pizza mozzarella tomato basil oregano",
+        "enchiladas chicken cheese salsa corn",
+        "lasagna pasta beef tomato cheese",
+        "quesadilla tortilla cheese beans salsa",
+        "risotto rice parmesan white_wine",
+        "guacamole avocado lime cilantro onion"
+    ]
+    cuisines = ["Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican"]
+
+    # Train the model
+    classifier.train(recipes, cuisines)
+
+    # Predict new recipes
+    new_recipes = [
+        "pizza cheese tomato basil oregano",
+        "burrito rice beans salsa guacamole"
+    ]
+    predictions = classifier.predict(new_recipes)
+    print("Predictions for new recipes:", predictions)
+
+    # Evaluate on training data
+    train_predictions = classifier.predict(recipes)
+    accuracy = sum(1 for pred, true in zip(train_predictions, cuisines) if pred == true) / len(cuisines)
+    print(f"Training Accuracy: {accuracy:.2f}")
+
+    print("\nWARNING: This model is overfitting due to the small dataset.")
+    print("For a real-world application, consider the following improvements:")
+    print("1. Collect a much larger and more diverse dataset.")
+    print("2. Use cross-validation for more robust evaluation.")
+    print("3. Implement feature engineering specific to recipe classification.")
+    print("4. Experiment with different ML algorithms and hyperparameters.")
+
+# Run the demonstration only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/python/examples/SVMRecipeClassifier/test_recipe_classifier.py b/python/examples/SVMRecipeClassifier/test_recipe_classifier.py
@@ -0,0 +1,195 @@
+"""
+Test module for RecipeClassifier
+
+This module contains unit tests for the RecipeClassifier class, which implements
+a binary classifier for Italian and Mexican recipes using LIBSVM.
+
+The tests cover the initialization, data preprocessing, training, and prediction
+functionalities of the RecipeClassifier.
+
+Note: These tests assume a small dataset and are meant for demonstration purposes.
+In a real-world scenario, more comprehensive tests with larger datasets would be necessary.
+"""
+
+import unittest
+import numpy as np
+import warnings
+from recipe_classifier import RecipeClassifier
+
+class TestRecipeClassifier(unittest.TestCase):
+    """
+    A test suite for the RecipeClassifier class.
+
+    This class contains various test methods to ensure the correct functionality
+    of the RecipeClassifier, including data preprocessing, model training, and prediction.
+    """
+
+    def setUp(self):
+        """
+        Set up the test environment before each test method.
+
+        This method initializes a RecipeClassifier instance and defines sample
+        recipes and cuisines for testing purposes.
+        """
+        print("\n--- Setting up test environment ---")
+        self.classifier = RecipeClassifier()
+        self.recipes = [
+            "pasta tomato basil olive_oil garlic",
+            "tortilla beans salsa avocado cilantro",
+            "spaghetti meatballs tomato_sauce parmesan",
+            "tacos beef lettuce cheese salsa",
+            "pizza mozzarella tomato basil oregano",
+            "enchiladas chicken cheese salsa corn",
+            "risotto rice parmesan white_wine mushroom",
+            "guacamole avocado lime cilantro onion"
+        ]
+        self.cuisines = ["Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican"]
+        print(f"Initialized classifier with {len(self.recipes)} sample recipes")
+
+    def test_init(self):
+        """
+        Test the initialization of the RecipeClassifier.
+
+        This test ensures that a new RecipeClassifier instance has its model
+        and vocabulary attributes properly initialized to None.
+        """
+        print("\n--- Testing initialization ---")
+        print(f"Model: {self.classifier.model}")
+        print(f"Vocabulary: {self.classifier.vocabulary}")
+        self.assertIsNone(self.classifier.model, "Model should be None upon initialization")
+        self.assertIsNone(self.classifier.vocabulary, "Vocabulary should be None upon initialization")
+        print("Initialization test passed successfully")
+
+    def test_preprocess_data(self):
+        """
+        Test the data preprocessing method of RecipeClassifier.
+
+        This test checks if the preprocess_data method correctly converts
+        the input recipes and cuisines into feature matrices and labels.
+        """
+        X, y = self.classifier.preprocess_data(self.recipes, self.cuisines)
+        print(f"Preprocessed feature matrix shape: {X.shape}")
+        print(f"Label array shape: {y.shape}")
+        print(f"Unique labels: {np.unique(y)}")
+        # Check if X is a sparse matrix with correct dimensions
+        self.assertEqual(X.shape[0], len(self.recipes))
+        self.assertGreater(X.shape[1], 0)
+
+        # Check if y is a numpy array with correct length and values
+        self.assertIsInstance(y, np.ndarray)
+        self.assertEqual(len(y), len(self.cuisines))
+        self.assertTrue(all(label in [1, -1] for label in y))
+        print("Data preprocessing test passed successfully")
+
+    def test_train(self):
+        """
+        Test the training method of RecipeClassifier.
+
+        This test checks if the train method successfully trains a model
+        and sets the model attribute of the classifier.
+        """
+        print("\n--- Testing model training ---")
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            self.classifier.train(self.recipes, self.cuisines)
+            if any("dataset is very small" in str(warning.message) for warning in w):
+                print("Warning: Dataset is very small, as expected")
+            else:
+                print("No warning about small dataset was issued")
+        print(f"Model after training: {self.classifier.model}")
+        self.assertIsNotNone(self.classifier.model, "Model should not be None after training")
+        print("Training test passed successfully")
+
+    def test_predict(self):
+        """
+        Test the prediction method of RecipeClassifier.
+
+        This test checks if the predict method returns the expected output
+        for new recipes after training the model.
+        """
+        print("\n--- Testing prediction ---")
+        self.classifier.train(self.recipes, self.cuisines)
+
+        new_recipes = [
+            "pizza cheese tomato basil",
+            "burrito rice beans salsa"
+        ]
+        print("Predicting cuisines for new recipes:")
+        for recipe in new_recipes:
+            print(f"  - {recipe}")
+        predictions = self.classifier.predict(new_recipes)
+        print("Predictions:", predictions)
+        # Check if predictions are returned for all new recipes
+        self.assertEqual(len(predictions), len(new_recipes), "Number of predictions should match number of new recipes")
+
+        # Check if all predictions are either 'Italian' or 'Mexican'
+        self.assertTrue(all(cuisine in ['Italian', 'Mexican'] for cuisine in predictions), "All predictions should be either Italian or Mexican")
+        print("Prediction test passed successfully")
+
+    def test_predict_without_training(self):
+        """
+        Test prediction without prior training.
+
+        This test ensures that attempting to make predictions without first
+        training the model raises a ValueError.
+        """
+        print("\n--- Testing prediction without training ---")
+        with self.assertRaises(ValueError):
+            self.classifier.predict(["pizza cheese tomato basil"])
+        print(f"Raised exception as expected!")
+        print("Prediction without training test passed successfully")
+
+    def test_train_and_predict_accuracy(self):
+        print("\n--- Testing training and prediction accuracy ---")
+        self.classifier.train(self.recipes, self.cuisines)
+        predictions = self.classifier.predict(self.recipes)
+        accuracy = sum(p == c for p, c in zip(predictions, self.cuisines)) / len(self.cuisines)
+        print(f"Training accuracy: {accuracy:.2%}")
+        self.assertGreater(accuracy, 0.75, "Training accuracy should be above 75%")
+        print("Training and prediction accuracy test passed successfully")
+
+    def test_vocabulary_creation(self):
+        print("\n--- Testing vocabulary creation ---")
+        self.classifier.train(self.recipes, self.cuisines)
+        print(f"Vocabulary size: {len(self.classifier.vocabulary)}")
+        self.assertIsNotNone(self.classifier.vocabulary, "Vocabulary should not be None after training")
+        expected_ingredients = ["pasta", "tomato", "basil", "olive_oil", "garlic", "tortilla", "beans", "salsa",
+                                "avocado", "cilantro"]
+        for ingredient in expected_ingredients:
+            self.assertIn(ingredient, self.classifier.vocabulary, f"{ingredient} should be in the vocabulary")
+            print(f"'{ingredient}' found in vocabulary")
+        print("Vocabulary creation test passed successfully")
+
+    def test_predict_new_recipes(self):
+
+        # Train the classifier
+        print("\nTraining the classifier...")
+        self.classifier.train(self.recipes, self.cuisines)
+        print(f"Vocabulary size after training: {len(self.classifier.vocabulary)}")
+
+        # New recipes to test
+        new_recipes = [
+            "lasagna pasta cheese tomato_sauce beef",
+            "burrito rice beans salsa guacamole"
+        ]
+
+        print("\nPredicting new recipes:")
+        for recipe in new_recipes:
+            print(f"Recipe: {recipe}")
+
+        # Predict new recipes
+        predictions = self.classifier.predict(new_recipes)
+
+        print("\nPrediction results:")
+        for recipe, prediction in zip(new_recipes, predictions):
+            print(f"Recipe: {recipe}")
+            print(f"Predicted cuisine: {prediction}")
+
+        # Check predictions
+        self.assertEqual(predictions[0], "Italian", "Lasagna should be classified as Italian")
+        self.assertEqual(predictions[1], "Mexican", "Burrito should be classified as Mexican")
+
+        print("\nTest passed successfully!")
+
+# Run the unittest runner when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()