text_stats.py
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import underthesea
import matplotlib.pyplot as plt
import os
import glob


def ensure_nltk_resources():
    """Download the required NLTK tokenizer and stopword resources if missing."""
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")
        nltk.download("punkt_tab")
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords")


def import_data(filename, rootpath):
    """Load a text file from rootpath and return its contents as a string."""
    fullpath = os.path.join(rootpath, filename)
    data = ''
    if os.path.exists(fullpath):
        with open(fullpath, encoding='utf-8') as f:
            data = f.read()
    return data


def load_vi_stopwords(patternpath, rootpath):
    """Load Vietnamese stopwords from multiple txt files."""
    vi_stopwords = []
    for filepath in glob.glob(pathname=patternpath, root_dir=rootpath):
        fullpath = os.path.join(rootpath, filepath)
        if os.path.exists(fullpath):
            with open(fullpath, encoding='utf-8') as f:
                vi_stopwords.extend([line.strip() for line in f if line.strip()])
        else:
            print(f"File not found: {filepath}")
    return vi_stopwords


def preprocessing(text):
    """Return individual word tokens after preprocessing."""
    # Lowercase the text and remove punctuation
    text = re.sub(r"[^\w\s\-]", " ", text).lower()
    # Stopword lists
    # English
    en_stopwords = stopwords.words('english')
    # Vietnamese
    rootpath = 'data'
    pattern = '*stopwords.txt'
    vi_stopwords = load_vi_stopwords(pattern, rootpath)
    # Tokenization for both English and Vietnamese text
    tokens = underthesea.word_tokenize(text)
    # Remove both English and Vietnamese stopwords
    tokens = [token for token in tokens if token not in en_stopwords and token not in vi_stopwords]
    return tokens


def statistics(tokens):
    """Return the number of occurrences of each word in the text."""
    # Use unigrams to count the number of occurrences of each word
    unigrams = pd.Series(nltk.ngrams(tokens, 1))
    # Convert each unigram from a 1-tuple to a string
    nwords = unigrams.apply(lambda words: words[0])
    # Build a DataFrame from the nwords Series:
    # count the occurrences of each word and reset the index
    df = nwords.value_counts().reset_index()
    df.columns = ['words', 'counts']
    return df


def export_results(dataset, filename, rootpath):
    """Export a DataFrame to a CSV file in the specified directory."""
    # Create the output directory if it does not already exist
    os.makedirs(rootpath, exist_ok=True)
    fullpath = os.path.join(rootpath, filename)
    # Build a DataFrame from the dataset
    df = pd.DataFrame(dataset)
    # Use utf-8-sig to ensure proper encoding
    df.to_csv(fullpath, index=False, encoding='utf-8-sig')


def visualize_results(dataset, filename, rootpath, top_n=10):
    """Visualize the top N most frequent words from the dataset and save the plot as an image file."""
    # Create the output directory if it does not already exist
    os.makedirs(rootpath, exist_ok=True)
    fullpath = os.path.join(rootpath, filename)
    # Build a DataFrame from the dataset and keep the top N rows
    df = pd.DataFrame(dataset)
    top_words = df.head(top_n)
    # Use matplotlib.pyplot to visualize the results as a bar chart
    plt.figure(figsize=(12, 6))
    plt.bar(top_words['words'], top_words['counts'])
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.savefig(fullpath)
    plt.close()
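

# A minimal usage sketch, not part of the original module: it strings the
# functions above together end to end. The input filename 'sample.txt' and the
# 'results' output directory are assumptions for illustration; only the 'data'
# root directory comes from preprocessing() above. Adjust paths to your layout.
if __name__ == "__main__":
    ensure_nltk_resources()
    text = import_data('sample.txt', 'data')  # 'sample.txt' is a hypothetical input file
    tokens = preprocessing(text)
    word_counts = statistics(tokens)
    export_results(word_counts, 'word_counts.csv', 'results')  # 'results' is an assumed output dir
    visualize_results(word_counts, 'word_counts.png', 'results', top_n=10)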