-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
81 lines (65 loc) · 2.35 KB
/
main.py
File metadata and controls
81 lines (65 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Sentiment Analysis with LSTM
# Dataset: IMDB Movie Reviews (50k)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
# 1. Load Dataset
# Read the IMDB reviews CSV (download from Kaggle and place in same folder).
df = pd.read_csv("IMDB Dataset.csv")
print(df.head())

# 2. Preprocessing
# Encode the string labels as integers: positive -> 1, negative -> 0.
label_map = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(label_map)
# Clean text
def clean_text(text):
    """Normalize one raw review string.

    Strips HTML tags, replaces every non-letter character with a space,
    and lowercases the result so the tokenizer sees a uniform alphabet.
    """
    no_tags = re.sub(r"<.*?>", "", text)          # drop HTML tags like <br />
    letters_only = re.sub(r"[^a-zA-Z]", " ", no_tags)  # digits/punct -> spaces
    return letters_only.lower()
# Apply the cleaning step to every review.
df['review'] = df['review'].apply(clean_text)

# 3. Tokenization
# Map words to integer ids (top-10k vocabulary, unknown words -> "<OOV>"),
# then pad/truncate every sequence to a fixed length of 200 tokens.
reviews = df['review'].values
labels = df['sentiment'].values

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')

# 4. Train-Test Split
# Hold out 20% for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)
# 5. LSTM Model
# Embedding -> single LSTM -> dense head with dropout; sigmoid output for
# binary sentiment classification.
# NOTE(review): `input_length` is ignored/removed in Keras 3 — confirm the
# installed TensorFlow/Keras version supports it.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=200))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# 6. Training
# Five passes over the training data, validating on the held-out split
# at the end of each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
)

# 7. Evaluation
# Final metrics on the held-out split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc*100:.2f}%")
# 8. Visualization
# Plot train vs. validation accuracy across epochs.
for metric, label in (('accuracy', 'Train Accuracy'),
                      ('val_accuracy', 'Validation Accuracy')):
    plt.plot(history.history[metric], label=label)
plt.legend()
plt.show()
# 9. Test with Custom Review
# Run the trained model on a single hand-written review and report the label.
test_review = ["The movie was fantastic, I really loved it!"]
encoded = tokenizer.texts_to_sequences(test_review)
padded_review = pad_sequences(encoded, maxlen=200)
probability = model.predict(padded_review)[0][0]
# Threshold the sigmoid output at 0.5.
if probability > 0.5:
    print("Positive")
else:
    print("Negative")