-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre-new-v2.py
100 lines (85 loc) · 3.17 KB
/
pre-new-v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK corpora/models the pipeline needs: the Punkt tokenizer,
# the stopword lists, and WordNet (for lemmatization).
for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource)
# Data-collection step: load the 50k-review dataset from disk.
print("Membaca file CSV...")
df = pd.read_csv("dataset-50k.csv")
print("File CSV berhasil dibaca.")
# English stopword set, used later in the stopword-removal step.
stop_words = set(stopwords.words("english"))
def remove_html_tags(text):
    """Strip HTML tags from *text*.

    ``<br>`` variants become a single space (so adjacent words stay
    separated); every other tag is removed outright.
    """
    # Line-break tags act as word separators -> substitute a space.
    without_breaks = re.sub(r"<br\s*/?>", " ", text)
    # Drop any remaining tag (non-greedy, so each tag matches separately).
    return re.sub(r"<.*?>", "", without_breaks)
# Strip the HTML tags out of the "review" column.
df["cleaned_review"] = df["review"].apply(remove_html_tags)

# Case-folding step: lowercase each review, then re-join the tokens
# TextBlob extracts (TextBlob's .words drops punctuation in the process).
print("Melakukan case-folding...")
cleaned_reviews_alpha = []
for review in df["cleaned_review"]:
    lowered = TextBlob(review.lower())
    cleaned_reviews_alpha.append(" ".join(lowered.words))
print("Case-folding selesai.")
# Clean symbols & digits from each token and expand isolated contraction
# fragments (e.g. "n't" -> " not") back to their full word forms.
print(
    "Membersihkan tag HTML, simbol & angka, menghapus whitespaces, dan mengubah apostrophe/short word..."
)
# Contraction fragments produced by TextBlob tokenization, mapped to their
# expansions. Hoisted to module level: the original rebuilt this dict for
# every single token of every review.
CONTRACTION_CORRECTIONS = {
    "'s": " is",
    "'re": " are",
    "'ll": " will",
    "'ve": " have",
    "'d": " would",
    "n't": " not",
}
# Compiled once instead of re-parsed per token inside the hot loop.
# Keeps only letters, apostrophes and hyphens (digits/symbols are dropped;
# this also removes any whitespace, so no extra strip() is needed).
_NON_WORD_RE = re.compile(r"[^a-zA-Z'-]")
cleaned_reviews_beta = []
for review in cleaned_reviews_alpha:
    cleaned_tokens = []
    for token in review.split():  # split the review text into tokens
        clean_token = _NON_WORD_RE.sub("", token)
        if clean_token:
            # Map a contraction fragment to its expansion; other tokens
            # pass through unchanged.
            cleaned_tokens.append(
                CONTRACTION_CORRECTIONS.get(clean_token, clean_token)
            )
    cleaned_reviews_beta.append(" ".join(cleaned_tokens))
print("Pembersihan selesai.")
# Tokenization step: split each cleaned review into NLTK word tokens.
print("Melakukan tokenisasi...")
cleaned_reviews_gamma = []
for review in cleaned_reviews_beta:
    cleaned_reviews_gamma.append(word_tokenize(review))
print("Tokenisasi selesai.")

# Stopword-removal step: drop English stopwords from every token list.
print("Menghapus stopwords...")
cleaned_reviews_delta = []
for tokens in cleaned_reviews_gamma:
    kept_tokens = [tok for tok in tokens if tok not in stop_words]
    cleaned_reviews_delta.append(kept_tokens)
print("Stopwords removal selesai.")
# Lemmatization step: reduce each remaining token to its WordNet lemma.
print("Melakukan lemmatisasi...")
lemmatizer = WordNetLemmatizer()
cleaned_reviews_epsilon = []
for tokens in cleaned_reviews_delta:
    cleaned_reviews_epsilon.append([lemmatizer.lemmatize(tok) for tok in tokens])
print("Lemmatisasi selesai.")
# Assemble the final table pairing each original review with its fully
# processed token list and the original sentiment label.
final_columns = {
    "Original_Review": df["review"],
    "Processed_Review": cleaned_reviews_epsilon,
    "Sentiment_Label": df["sentiment"],
}
df_final = pd.DataFrame(final_columns)
# Persist the result to CSV without the DataFrame index column.
df_final.to_csv("hasil_preprocessing_new_v2.csv", index=False)
print("Data telah disimpan dalam bentuk CSV.")