-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
133 lines (120 loc) · 5.13 KB
/
app.py
File metadata and controls
133 lines (120 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import streamlit as st
import joblib
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
#from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, RegexpStemmer
import re
import string
from string import punctuation
@st.cache_resource
def _load_resources():
    """Fetch the NLTK data and load the trained model artifacts once.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Wrapping the expensive set-up in ``st.cache_resource``
    makes the NLTK download checks and the two unpickling steps run once
    per server process instead of on every button click.

    Returns:
        A ``(model, tfidf)`` tuple: the objects stored in
        'logistic_regression_model.pkl' and 'tfidf_vectorizer.pkl'.
    """
    # NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
    # 'punkt' when tokenizing -- confirm against the deployed NLTK version.
    nltk.download('wordnet')
    nltk.download('punkt')
    # joblib.load unpickles arbitrary objects; only ship trusted .pkl files.
    classifier = joblib.load('logistic_regression_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    return classifier, vectorizer


model, tfidf = _load_resources()
# Text Preprocessing Functions
# Compiled once at import time instead of on every call.
# The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal
# '|' because alternation has no meaning inside a character class.
_EMAIL_PATTERN = re.compile(
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b'
)
_HASHTAG_PATTERN = re.compile(r'#\w+')
_MENTION_PATTERN = re.compile(r'@\w+')


def clean_txt_func1(text):
    """Lower-case the text and strip e-mail addresses, hashtags and mentions.

    Args:
        text: Raw input string.

    Returns:
        The lower-cased string with newlines replaced by spaces and every
        e-mail address, ``#hashtag`` and ``@mention`` removed. Surrounding
        whitespace is left untouched, so removed items may leave runs of
        spaces behind.
    """
    text = text.lower().replace('\n', ' ')
    # E-mails must go first: otherwise the mention pattern would eat the
    # "@domain" part and leave the local part and TLD behind.
    text = _EMAIL_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    return text
# Characters stripped by clean_txt_func2. This rebinds the name imported
# from the string module, so the function below uses this custom set.
punctuation = ".,!?;:'\"()[]{}-–/\\|~`"


def clean_txt_func2(text):
    """Remove URLs, punctuation, boilerplate words and numbers, then tokenize.

    Args:
        text: String to clean.

    Returns:
        The result of ``word_tokenize`` applied to the cleaned string.
    """
    url_regex = re.compile(r'(?:https?://|ftp://|www\.)[^\s@]+(?:[:/?#+&;\w-]*|\([^)]*\))?(?:\s|\Z)')
    without_urls = url_regex.sub('', text)

    # Delete every character listed in the module-level punctuation string.
    without_punct = without_urls.translate(str.maketrans('', '', punctuation))

    # Call-to-action words that should not reach the tokenizer.
    boilerplate = ("subscribe", "subscribers", "like", "comment", "share", "join", "disclaimer", "’")
    boilerplate_regex = r'\b(' + '|'.join(boilerplate) + r')\b|\s*\.\s*'
    without_boilerplate = re.sub(boilerplate_regex, '', without_punct)

    # Drop standalone numbers, optionally followed by one letter (e.g. "4k").
    without_numbers = re.sub(r'\b\d+[a-zA-Z]?\b', '', without_boilerplate)
    return word_tokenize(without_numbers)
# Built once at import time; previously the pattern was reconstructed on
# every call, and this function is invoked per token.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U00002300-\U000023FF"  # miscellaneous technical
    # U+2000-U+20FF also covers typographic punctuation and currency signs
    # (en dash, curly quotes, euro sign, ...), so those are stripped too.
    "\U00002000-\U000020FF"
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U00002B05-\U00002B07"  # arrows
    "\U00002934-\U00002935"  # arrows
    "\U00002190-\U000021AA"  # arrows
    "]+", flags=re.UNICODE
)


def remove_emojis(text):
    """Return ``text`` with every character in the emoji/symbol ranges removed.

    Args:
        text: String to filter.

    Returns:
        The string with all matching characters deleted; other characters,
        including surrounding whitespace, are kept as they are.
    """
    return _EMOJI_PATTERN.sub(r'', text)
def clean_txt_func3(word_list):
    """Strip emojis from each token and drop tokens that end up blank.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        A list of the emoji-free tokens, omitting any token that is empty
        or whitespace-only after emoji removal.
    """
    # Run remove_emojis once per token (it was previously called twice:
    # once for the filter and once for the value).
    stripped = (remove_emojis(word) for word in word_list)
    return [word for word in stripped if word.strip()]
# Stopwords as a frozenset built once at import time: membership tests are
# O(1) instead of scanning a list that was rebuilt on every call.
_STOPWORDS = frozenset({
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "can't", "cannot", "could", "did", "didn't",
    "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for",
    "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if",
    "in", "into", "is", "isn't", "it", "its", "itself", "just", "ll", "m", "ma", "me",
    "might", "mightn't", "more", "most", "must", "mustn't", "my", "myself", "need",
    "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or",
    "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan't",
    "she", "she's", "should", "shouldn't", "so", "some", "such", "t", "than", "that",
    "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
    "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very",
    "was", "wasn't", "we", "were", "weren't", "what", "when", "where", "which",
    "while", "who", "whom", "why", "will", "with", "won't", "would", "y", "you",
    "your", "yours", "yourself", "yourselves",
})


def clean_txt_func4(text):
    """Remove stopwords, strip a trailing "ing", and lemmatize each token.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list with one entry per non-stopword token, after stemming with
        ``RegexpStemmer(r'ing$')`` and then ``WordNetLemmatizer.lemmatize``
        (called with its default arguments).
    """
    stemmer = RegexpStemmer(r'ing$')
    lemmatizer = WordNetLemmatizer()
    # Filter, stem and lemmatize in one pass; each step works per token, so
    # the result matches running three separate passes.
    return [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in text
        if word not in _STOPWORDS
    ]
def preprocess_input(title, description):
    """Clean the title and description and merge them into one string.

    Each field is passed through clean_txt_func1 .. clean_txt_func4 in
    order; the resulting token lists are concatenated (title first).

    Args:
        title: Raw video title string.
        description: Raw video description string.

    Returns:
        The cleaned tokens of both fields joined by single spaces.
    """
    stages = (clean_txt_func1, clean_txt_func2, clean_txt_func3, clean_txt_func4)
    token_lists = []
    for field in (title, description):
        for stage in stages:
            field = stage(field)
        token_lists.append(field)
    title_tokens, description_tokens = token_lists
    return ' '.join(title_tokens + description_tokens)
# Streamlit App
st.title("YouTube Video Category Classifier")

# User Input
title = st.text_input("Enter the video title:")
description = st.text_area("Enter the video description:")


def _cleaned_input_or_none(title, description):
    """Validate both fields and return the preprocessed text, or None.

    Shows an error when either field is empty or whitespace-only, and a
    warning when preprocessing removes every token. In both cases None is
    returned so the caller skips prediction/rendering; an empty string
    would otherwise be vectorized and classified, which is meaningless.
    """
    if not (title.strip() and description.strip()):
        st.error("Please provide both a title and a description.")
        return None
    cleaned = preprocess_input(title, description)
    if not cleaned.strip():
        st.warning(
            "Nothing is left of the input after cleaning. "
            "Please provide a more descriptive title and description."
        )
        return None
    return cleaned


if st.button("Predict Category"):
    processed_input = _cleaned_input_or_none(title, description)
    if processed_input is not None:
        # Vectorize the cleaned text and predict its category
        vectorized_input = tfidf.transform([processed_input])
        prediction = model.predict(vectorized_input)
        st.success(f"The predicted category is: {prediction[0]}")

if st.button("Show Cleaned Text"):
    cleaned_text = _cleaned_input_or_none(title, description)
    if cleaned_text is not None:
        # Display the text exactly as it would be fed to the vectorizer
        st.markdown("### Cleaned Text:")
        st.markdown(f"**{cleaned_text}**")