-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre-new.py
121 lines (101 loc) · 3.68 KB
/
pre-new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import re
import nltk
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
# Data collection: load the raw reviews dataset from disk.
print("Membaca file CSV...")
df = pd.read_csv("dataset.csv")
print("File CSV berhasil dibaca.")
# English stopword set used later in the stopword-removal step.
stop_words = set(stopwords.words("english"))
# Helper: strip HTML markup from a review string.
def remove_html_tags(text):
    """Return *text* with HTML tags removed.

    ``<br>`` / ``<br/>`` line breaks become a single space so that the
    surrounding words stay separated; every other tag is dropped outright.
    """
    without_breaks = re.sub(r"<br\s*/?>", " ", text)
    return re.sub(r"<.*?>", "", without_breaks)
# Strip HTML from every review into a new working column.
df["cleaned_review"] = df["review"].apply(remove_html_tags)
# Labeling step: score each review's sentiment polarity.
print("Menentukan sentimen untuk teks ulasan...")
# Maps a numeric polarity score to a categorical label.
def label_sentiment(polarity):
    """Map a numeric polarity score to a sentiment category.

    Scores above zero are "Positive", below zero "Negative",
    and exactly zero "Neutral".
    """
    if polarity == 0:
        return "Neutral"
    return "Positive" if polarity > 0 else "Negative"
# Polarity score per review computed by TextBlob (value in [-1.0, 1.0]).
df["sentiment_polarity"] = df["cleaned_review"].apply(
    lambda x: TextBlob(x).sentiment.polarity
)
print("Sentimen berhasil dibuat.")
# Case-folding: lowercase each review, then re-join TextBlob's word list
# (this tokenization pass also discards most punctuation).
print("Melakukan case-folding...")
cleaned_reviews_alpha = [
    " ".join(TextBlob(review.lower()).words) for review in df["cleaned_review"]
]
print("Case-folding selesai.")
# Remove symbols & digits, trim whitespace, and expand contraction suffixes.
print(
    "Membersihkan simbol & angka, menghapus whitespaces, dan mengubah apostrophe/short word..."
)
# Expansions for bare contraction suffixes left over from tokenization
# (e.g. the "'s" token that TextBlob splits off "it's").
# NOTE(review): the replacement values keep the original leading space;
# after " ".join this produces a double space, which the later
# word_tokenize pass absorbs, so output is unchanged.
# Hoisted out of the loops below: the table is loop-invariant and was
# previously rebuilt once per token.
corrections = {
    "'s": " is",
    "'re": " are",
    "'ll": " will",
    "'ve": " have",
    "'d": " would",
    "n't": " not",
}
cleaned_reviews_beta = []
for review in cleaned_reviews_alpha:
    cleaned_tokens = []
    for token in review.split():  # split the review into whitespace tokens
        # Drop every character that is not a letter, apostrophe, or hyphen.
        clean_token = re.sub(r"[^a-zA-Z'-]", "", token)
        if clean_token:
            # Defensive trim (the regex above already removed whitespace).
            clean_token = clean_token.strip()
            # Expand the token when it is exactly a contraction suffix.
            clean_token = corrections.get(clean_token, clean_token)
            cleaned_tokens.append(clean_token)
    cleaned_reviews_beta.append(" ".join(cleaned_tokens))
print("Pembersihan selesai.")
# Tokenization: split each cleaned review into NLTK word tokens.
print("Melakukan tokenisasi...")
cleaned_reviews_gamma = [word_tokenize(review) for review in cleaned_reviews_beta]
print("Tokenisasi selesai.")
# Stopword removal: drop common English function words (the/a/is/...).
print("Menghapus stopwords...")
cleaned_reviews_delta = [
    [word for word in review if word not in stop_words]
    for review in cleaned_reviews_gamma
]
print("Stopwords removal selesai.")
# Lemmatization: reduce each token to its WordNet lemma
# (lemmatize() defaults to treating tokens as nouns).
print("Melakukan lemmatisasi...")
lemmatizer = WordNetLemmatizer()
cleaned_reviews_epsilon = [
    [lemmatizer.lemmatize(word) for word in review] for review in cleaned_reviews_delta
]
print("Lemmatisasi selesai.")
# Assemble the final table: original text, processed tokens, and sentiment.
final_columns = {
    "Original_Review": df["review"],
    "Processed_Review": cleaned_reviews_epsilon,
    "Sentiment_Polarity": df["sentiment_polarity"],
    "Sentiment_Label": df["sentiment_polarity"].apply(label_sentiment),
}
df_final = pd.DataFrame(final_columns)
# Persist the result; index=False keeps the row index out of the CSV.
df_final.to_csv("hasil_preprocessing_9.csv", index=False)
print("Data telah disimpan dalam bentuk CSV.")