-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (65 loc) · 2.35 KB
/
main.py
File metadata and controls
81 lines (65 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Sentiment Analysis with LSTM
# Dataset: IMDB Movie Reviews (50k)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
# 1. Load Dataset
# Read the IMDB reviews CSV (download from Kaggle and place in same folder).
df = pd.read_csv("IMDB Dataset.csv")
print(df.head())

# 2. Preprocessing
# Encode the string labels as integers: positive -> 1, negative -> 0.
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
# Clean text
def clean_text(text):
    """Normalize one raw review string.

    Strips HTML tags, replaces every non-letter character with a space,
    and lowercases the result so the tokenizer sees a uniform alphabet.
    """
    no_tags = re.sub(r"<.*?>", "", text)          # drop HTML tags like <br />
    letters_only = re.sub(r"[^a-zA-Z]", " ", no_tags)  # digits/punct -> spaces
    return letters_only.lower()
# Apply the cleaning step to every review.
df['review'] = df['review'].apply(clean_text)

# 3. Tokenization
# Map words to integer ids (top-10k vocabulary, unknown words -> "<OOV>"),
# then pad/truncate every sequence to a fixed length of 200 tokens.
reviews = df['review'].values
labels = df['sentiment'].values

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')

# 4. Train-Test Split
# Hold out 20% for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)
# 5. LSTM Model
# Embedding -> single LSTM -> dense head with dropout; sigmoid output for
# binary sentiment classification.
# NOTE(review): `input_length` is ignored/removed in Keras 3 — confirm the
# installed TensorFlow/Keras version supports it.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=200))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# 6. Training
# Five passes over the training data, validating on the held-out split
# at the end of each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
)

# 7. Evaluation
# Final metrics on the held-out split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc*100:.2f}%")
# 8. Visualization
# Plot train vs. validation accuracy across epochs.
for metric, label in (('accuracy', 'Train Accuracy'),
                      ('val_accuracy', 'Validation Accuracy')):
    plt.plot(history.history[metric], label=label)
plt.legend()
plt.show()
# 9. Test with Custom Review
# Run the trained model on a single hand-written review and report the label.
test_review = ["The movie was fantastic, I really loved it!"]
encoded = tokenizer.texts_to_sequences(test_review)
padded_review = pad_sequences(encoded, maxlen=200)
probability = model.predict(padded_review)[0][0]
# Threshold the sigmoid output at 0.5.
if probability > 0.5:
    print("Positive")
else:
    print("Negative")