Skip to content

Svm recipe classifier #223

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions python/examples/SVMRecipeClassifier/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Binary Recipe Classifier using LIBSVM

This project implements a binary recipe classifier using LIBSVM. It classifies recipes as either Italian or Mexican cuisine based on their ingredients. It serves as a demonstration of applying Support Vector Machines (SVM) to text classification tasks, specifically in the domain of recipe categorization.

## Key Features

- Binary classification of recipes (Italian vs Mexican)
- Utilizes LIBSVM for efficient SVM implementation
- Preprocesses text-based recipe data into numerical features
- Includes a comprehensive test suite for validation

## Requirements

- Python 3.7+
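- NumPy
- SciPy
- pandas (imported by `recipe_classifier.py`)
- LIBSVM with its Python bindings (the code imports `libsvm.svmutil`)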


## Usage

To use the RecipeClassifier in your Python script:

```python
from recipe_classifier import RecipeClassifier

# Labelled training examples: (space-separated ingredients, cuisine)
training_data = [
    ("pasta tomato basil olive_oil garlic", "Italian"),
    ("tortilla beans salsa avocado cilantro", "Mexican"),
    ("pizza cheese tomato oregano", "Italian"),
    ("tacos beef lettuce cheese salsa", "Mexican"),
]
recipes = [ingredients for ingredients, _ in training_data]
cuisines = [cuisine for _, cuisine in training_data]

# Create the classifier and fit it on the labelled recipes
classifier = RecipeClassifier()
classifier.train(recipes, cuisines)

# Classify recipes the model has not seen before
new_recipes = [
    "lasagna pasta cheese tomato_sauce beef",
    "burrito rice beans salsa guacamole",
]
predictions = classifier.predict(new_recipes)
print(predictions)
```
152 changes: 152 additions & 0 deletions python/examples/SVMRecipeClassifier/recipe_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
Recipe Classifier using LIBSVM

This module implements a binary classifier for Italian and Mexican recipes
using LIBSVM. It's intended as a demonstration of how to use LIBSVM for
text classification tasks.

WARNING: Due to the extremely small dataset, this model overfits and does not
generalize well. This implementation is for demonstration purposes only and
should not be used for real-world applications without significant modifications.
"""

from libsvm.svmutil import *
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import warnings

class RecipeClassifier:
    """A binary classifier for Italian and Mexican recipes using LIBSVM.

    Each recipe is a whitespace-separated string of ingredient tokens. Recipes
    are encoded as ingredient-count vectors over a vocabulary learned from the
    training recipes. Labels are encoded as +1 for 'Italian' and -1 for any
    other value.

    Attributes:
        model: The trained LIBSVM model, or None before train() is called.
        vocabulary (dict): Maps ingredient token -> feature column index, or
            None until it has been built.
    """

    def __init__(self):
        """Initialize the RecipeClassifier with no model and no vocabulary."""
        self.model = None
        self.vocabulary = None

    def preprocess_data(self, recipes, cuisines):
        """
        Preprocess the recipe data for LIBSVM.

        If no vocabulary exists yet, one is built from `recipes`. Ingredients
        that are not in the vocabulary are ignored.

        Args:
            recipes (list): List of recipe ingredient strings.
            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').

        Returns:
            tuple: (X, y) where X is a sparse matrix of features and y is an array of labels.
        """
        # Create vocabulary. Sorting makes the ingredient -> column mapping
        # deterministic; iterating a set of strings directly depends on
        # per-process hash randomisation.
        if self.vocabulary is None:
            all_ingredients = sorted(set(' '.join(recipes).split()))
            self.vocabulary = {ingredient: idx for idx, ingredient in enumerate(all_ingredients)}

        # Convert recipes to feature vectors in COO (row, col, value) form.
        # csr_matrix sums duplicate (row, col) entries, so an ingredient that
        # is repeated within one recipe yields a count greater than 1.
        rows, cols, data = [], [], []
        for idx, recipe in enumerate(recipes):
            for ingredient in recipe.split():
                if ingredient in self.vocabulary:
                    rows.append(idx)
                    cols.append(self.vocabulary[ingredient])
                    data.append(1)

        X = csr_matrix((data, (rows, cols)), shape=(len(recipes), len(self.vocabulary)))
        # Any label other than 'Italian' (including None) maps to -1.
        y = np.array([1 if cuisine == 'Italian' else -1 for cuisine in cuisines])
        return X, y

    def train(self, recipes, cuisines):
        """
        Train the SVM model.

        The data is split 80/20 into training and validation sets with a
        fixed seed, a linear-kernel SVM is fitted on the training part, and
        the validation accuracy is printed.

        Args:
            recipes (list): List of recipe ingredient strings.
            cuisines (list): List of cuisine labels ('Italian' or 'Mexican').

        Raises:
            ValueError: If `recipes` and `cuisines` differ in length, or if
                fewer than two recipes are given (the 80/20 split would leave
                the training or the validation set empty).
        """
        if len(recipes) != len(cuisines):
            raise ValueError(
                f"recipes and cuisines must have the same length "
                f"(got {len(recipes)} and {len(cuisines)})."
            )
        if len(recipes) < 2:
            raise ValueError("At least two recipes are required to train and validate the model.")

        if len(recipes) < 20:
            warnings.warn("The dataset is very small. The model is likely to overfit.")

        # Rebuild the vocabulary from this training set so that retraining
        # does not silently reuse the feature space of an earlier call.
        self.vocabulary = None
        X, y = self.preprocess_data(recipes, cuisines)

        # Split data into training and validation sets. A private RandomState
        # yields the same permutation as seeding the global generator with 42,
        # without clobbering the caller's global NumPy random state.
        rng = np.random.RandomState(42)
        indices = rng.permutation(len(recipes))
        split = int(0.8 * len(recipes))
        train_idx, val_idx = indices[:split], indices[split:]

        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        # Convert to LIBSVM format
        prob = svm_problem(y_train.tolist(), X_train.toarray().tolist())
        param = svm_parameter('-t 0 -c 0.1')  # Linear kernel, C=0.1 for less overfitting
        self.model = svm_train(prob, param)

        # Validate the model
        p_labels, _, _ = svm_predict(y_val.tolist(), X_val.toarray().tolist(), self.model)
        accuracy = sum(1 for i, j in zip(p_labels, y_val) if i == j) / len(y_val)
        print(f"Validation Accuracy: {accuracy:.2f}")

        if accuracy == 1.0:
            warnings.warn("Perfect validation accuracy suggests overfitting.")

    def predict(self, new_recipes):
        """
        Predict cuisines for new recipes.

        Args:
            new_recipes (list): List of new recipe ingredient strings.

        Returns:
            list: Predicted cuisines ('Italian' or 'Mexican'); empty if
                `new_recipes` is empty.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Nothing to classify; avoid handing an empty problem to LIBSVM.
        if len(new_recipes) == 0:
            return []

        # The labels are placeholders: only the feature matrix is used here.
        X, _ = self.preprocess_data(new_recipes, [None] * len(new_recipes))
        # '-q' silences LIBSVM's accuracy report, which would be computed
        # against the dummy labels and is therefore meaningless.
        p_labels, _, _ = svm_predict([0] * X.shape[0], X.toarray().tolist(), self.model, '-q')
        return ['Italian' if label > 0 else 'Mexican' for label in p_labels]


def main():
    """Run a small end-to-end demo: train, predict, and report accuracy."""
    # Labelled sample data as (ingredients, cuisine) pairs
    samples = [
        ("pasta tomato basil olive_oil garlic", "Italian"),
        ("tortilla beans salsa avocado cilantro", "Mexican"),
        ("spaghetti meatballs tomato_sauce parmesan", "Italian"),
        ("tacos beef lettuce cheese salsa", "Mexican"),
        ("pizza mozzarella tomato basil oregano", "Italian"),
        ("enchiladas chicken cheese salsa corn", "Mexican"),
        ("lasagna pasta beef tomato cheese", "Italian"),
        ("quesadilla tortilla cheese beans salsa", "Mexican"),
        ("risotto rice parmesan white_wine", "Italian"),
        ("guacamole avocado lime cilantro onion", "Mexican"),
    ]
    recipes = [ingredients for ingredients, _ in samples]
    cuisines = [label for _, label in samples]

    # Fit the model on the sample data
    classifier = RecipeClassifier()
    classifier.train(recipes, cuisines)

    # Classify two recipes that are not part of the sample data
    unseen_recipes = [
        "pizza cheese tomato basil oregano",
        "burrito rice beans salsa guacamole",
    ]
    predictions = classifier.predict(unseen_recipes)
    print("Predictions for new recipes:", predictions)

    # Re-classify the sample data itself to report the training accuracy
    train_predictions = classifier.predict(recipes)
    hits = sum(pred == true for pred, true in zip(train_predictions, cuisines))
    accuracy = hits / len(cuisines)
    print(f"Training Accuracy: {accuracy:.2f}")

    # Closing caveats for anyone tempted to reuse the demo as-is
    closing_notes = (
        "\nWARNING: This model is overfitting due to the small dataset.",
        "For a real-world application, consider the following improvements:",
        "1. Collect a much larger and more diverse dataset.",
        "2. Use cross-validation for more robust evaluation.",
        "3. Implement feature engineering specific to recipe classification.",
        "4. Experiment with different ML algorithms and hyperparameters.",
    )
    for note in closing_notes:
        print(note)


if __name__ == "__main__":
    main()
195 changes: 195 additions & 0 deletions python/examples/SVMRecipeClassifier/test_recipe_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""
Test module for RecipeClassifier

This module contains unit tests for the RecipeClassifier class, which implements
a binary classifier for Italian and Mexican recipes using LIBSVM.

The tests cover the initialization, data preprocessing, training, and prediction
functionalities of the RecipeClassifier.

Note: These tests assume a small dataset and are meant for demonstration purposes.
In a real-world scenario, more comprehensive tests with larger datasets would be necessary.
"""

import unittest
import numpy as np
import warnings
from recipe_classifier import RecipeClassifier

class TestRecipeClassifier(unittest.TestCase):
    """
    A test suite for the RecipeClassifier class.

    This class contains various test methods to ensure the correct functionality
    of the RecipeClassifier, including data preprocessing, model training, and prediction.
    """

    def setUp(self):
        """
        Set up the test environment before each test method.

        This method initializes a RecipeClassifier instance and defines sample
        recipes and cuisines for testing purposes.
        """
        print("\n--- Setting up test environment ---")
        self.classifier = RecipeClassifier()
        self.recipes = [
            "pasta tomato basil olive_oil garlic",
            "tortilla beans salsa avocado cilantro",
            "spaghetti meatballs tomato_sauce parmesan",
            "tacos beef lettuce cheese salsa",
            "pizza mozzarella tomato basil oregano",
            "enchiladas chicken cheese salsa corn",
            "risotto rice parmesan white_wine mushroom",
            "guacamole avocado lime cilantro onion"
        ]
        self.cuisines = ["Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican", "Italian", "Mexican"]
        print(f"Initialized classifier with {len(self.recipes)} sample recipes")

    def test_init(self):
        """
        Test the initialization of the RecipeClassifier.

        This test ensures that a new RecipeClassifier instance has its model
        and vocabulary attributes properly initialized to None.
        """
        print("\n--- Testing initialization ---")
        print(f"Model: {self.classifier.model}")
        print(f"Vocabulary: {self.classifier.vocabulary}")
        self.assertIsNone(self.classifier.model, "Model should be None upon initialization")
        self.assertIsNone(self.classifier.vocabulary, "Vocabulary should be None upon initialization")
        print("Initialization test passed successfully")

    def test_preprocess_data(self):
        """
        Test the data preprocessing method of RecipeClassifier.

        This test checks if the preprocess_data method correctly converts
        the input recipes and cuisines into feature matrices and labels.
        """
        # Imported locally so the module's top-level imports stay unchanged.
        from scipy.sparse import issparse

        X, y = self.classifier.preprocess_data(self.recipes, self.cuisines)
        print(f"Preprocessed feature matrix shape: {X.shape}")
        print(f"Label array shape: {y.shape}")
        print(f"Unique labels: {np.unique(y)}")
        # Check if X is a sparse matrix with correct dimensions: one row per
        # recipe and one column per vocabulary entry
        self.assertTrue(issparse(X), "Feature matrix should be sparse")
        self.assertEqual(X.shape[0], len(self.recipes))
        self.assertGreater(X.shape[1], 0)
        self.assertEqual(X.shape[1], len(self.classifier.vocabulary))

        # Check if y is a numpy array with correct length and values
        self.assertIsInstance(y, np.ndarray)
        self.assertEqual(len(y), len(self.cuisines))
        self.assertTrue(all(label in [1, -1] for label in y))
        print("Data preprocessing test passed successfully")

    def test_train(self):
        """
        Test the training method of RecipeClassifier.

        This test checks if the train method successfully trains a model,
        sets the model attribute of the classifier, and warns that the
        sample dataset is very small.
        """
        print("\n--- Testing model training ---")
        with warnings.catch_warnings(record=True) as caught:
            # "always" ensures the warning is recorded even if an identical
            # one was already emitted earlier in the test run.
            warnings.simplefilter("always")
            self.classifier.train(self.recipes, self.cuisines)
        # The sample data has fewer than 20 recipes, so train() must warn.
        self.assertTrue(
            any("dataset is very small" in str(warning.message) for warning in caught),
            "A small-dataset warning should be issued for the sample data",
        )
        print("Warning: Dataset is very small, as expected")
        print(f"Model after training: {self.classifier.model}")
        self.assertIsNotNone(self.classifier.model, "Model should not be None after training")
        print("Training test passed successfully")

    def test_predict(self):
        """
        Test the prediction method of RecipeClassifier.

        This test checks if the predict method returns the expected output
        for new recipes after training the model.
        """
        print("\n--- Testing prediction ---")
        self.classifier.train(self.recipes, self.cuisines)

        new_recipes = [
            "pizza cheese tomato basil",
            "burrito rice beans salsa"
        ]
        print("Predicting cuisines for new recipes:")
        for recipe in new_recipes:
            print(f"  - {recipe}")
        predictions = self.classifier.predict(new_recipes)
        print("Predictions:", predictions)
        # Check if predictions are returned for all new recipes
        self.assertEqual(len(predictions), len(new_recipes), "Number of predictions should match number of new recipes")

        # Check if all predictions are either 'Italian' or 'Mexican'
        self.assertTrue(all(cuisine in ['Italian', 'Mexican'] for cuisine in predictions), "All predictions should be either Italian or Mexican")
        print("Prediction test passed successfully")

    def test_predict_without_training(self):
        """
        Test prediction without prior training.

        This test ensures that attempting to make predictions without first
        training the model raises a ValueError.
        """
        print("\n--- Testing prediction without training ---")
        with self.assertRaises(ValueError):
            self.classifier.predict(["pizza cheese tomato basil"])
        # Printed after the context manager: code following the raising call
        # inside the `with` block would never run.
        print("Raised exception as expected!")
        print("Prediction without training test passed successfully")

    def test_train_and_predict_accuracy(self):
        """
        Test accuracy on the training data.

        This test trains on the sample recipes, predicts those same recipes,
        and requires more than 75% of the predictions to match the labels.
        """
        print("\n--- Testing training and prediction accuracy ---")
        self.classifier.train(self.recipes, self.cuisines)
        predictions = self.classifier.predict(self.recipes)
        accuracy = sum(p == c for p, c in zip(predictions, self.cuisines)) / len(self.cuisines)
        print(f"Training accuracy: {accuracy:.2%}")
        self.assertGreater(accuracy, 0.75, "Training accuracy should be above 75%")
        print("Training and prediction accuracy test passed successfully")

    def test_vocabulary_creation(self):
        """
        Test that training builds the ingredient vocabulary.

        This test checks that the vocabulary exists after training and that
        it contains a sample of ingredients from the training recipes.
        """
        print("\n--- Testing vocabulary creation ---")
        self.classifier.train(self.recipes, self.cuisines)
        # Assert before calling len() so a missing vocabulary is reported as
        # a test failure rather than a TypeError.
        self.assertIsNotNone(self.classifier.vocabulary, "Vocabulary should not be None after training")
        print(f"Vocabulary size: {len(self.classifier.vocabulary)}")
        expected_ingredients = ["pasta", "tomato", "basil", "olive_oil", "garlic", "tortilla", "beans", "salsa",
                                "avocado", "cilantro"]
        for ingredient in expected_ingredients:
            self.assertIn(ingredient, self.classifier.vocabulary, f"{ingredient} should be in the vocabulary")
            print(f"'{ingredient}' found in vocabulary")
        print("Vocabulary creation test passed successfully")

    def test_predict_new_recipes(self):
        """
        Test the predicted cuisine of two unseen recipes.

        This test trains on the sample recipes and expects the lasagna recipe
        to be classified as Italian and the burrito recipe as Mexican.
        """
        # Train the classifier
        print("\nTraining the classifier...")
        self.classifier.train(self.recipes, self.cuisines)
        print(f"Vocabulary size after training: {len(self.classifier.vocabulary)}")

        # New recipes to test
        new_recipes = [
            "lasagna pasta cheese tomato_sauce beef",
            "burrito rice beans salsa guacamole"
        ]

        print("\nPredicting new recipes:")
        for recipe in new_recipes:
            print(f"Recipe: {recipe}")

        # Predict new recipes
        predictions = self.classifier.predict(new_recipes)

        print("\nPrediction results:")
        for recipe, prediction in zip(new_recipes, predictions):
            print(f"Recipe: {recipe}")
            print(f"Predicted cuisine: {prediction}")

        # Check predictions
        self.assertEqual(predictions[0], "Italian", "Lasagna should be classified as Italian")
        self.assertEqual(predictions[1], "Mexican", "Burrito should be classified as Mexican")

        print("\nTest passed successfully!")


if __name__ == '__main__':
    unittest.main()