nltk_analyse.py
import json
import re
import string
import collections
import nltk
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from utils import remove_chars_from_text, remove_emojis, read_conf
import stopwords_list
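# word_tokenize and stopwords rely on NLTK data packages; if they are
# missing, download them once before running this module, e.g.:
#   nltk.download('punkt')
#   nltk.download('stopwords')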
# Initialize the stemmer for Russian
stemmer = SnowballStemmer("russian")

# Special characters to strip from the text
spec_chars = string.punctuation + '\n\xa0«»\t—…"<>?!.,;:꧁@#$%^&*()_-+=№%༺༺\\༺/༺-•'

# Service actions to ignore in the text
action_map = ['Invite Member', 'Kicked Members', 'Joined by Link', 'Pinned Message']
def analyse(data, most_com):
    # Set up stop words
    russian_stopwords = stopwords.words("russian")
    russian_stopwords.extend(['это', 'ну', 'но', 'еще', 'ещё', 'оно', 'типа'])
    english_stopwords = stopwords.words("english")
    # Lowercase the text and strip list punctuation and hyphens
    text = str(data).lower().replace("'", "").replace(",", "").replace("[", "").replace("]", "").replace("-", " ")
    for action in action_map:
        text = text.replace(action.lower(), "")
    text = remove_chars_from_text(text, spec_chars)
    text = remove_chars_from_text(text, string.digits)
    # Make sure the text is not empty
    if len(text) < 1:
        return [], []
    # Tokenize the text
    text_tokens = word_tokenize(text)
    # Stem the tokens if stemming is enabled
    if read_conf('select_type_stem') == 'On':
        text_tokens = [stemmer.stem(word) for word in text_tokens]
    # Filter the tokens
    text_tokens = [token.strip() for token in text_tokens if
                   token not in russian_stopwords and
                   len(token) >= 3 and
                   len(token) < 26 and
                   token not in english_stopwords and
                   'http' not in token and
                   token not in stopwords_list.stopword_txt]
    # Frequency distribution
    text = nltk.Text(text_tokens)
    fdist = FreqDist(text)
    fdist = fdist.most_common(most_com)
    return fdist, text_tokens
def analyse_all(data, most_com):
    # Set up stop words
    russian_stopwords = stopwords.words("russian")
    english_stopwords = stopwords.words("english")
    russian_stopwords.extend(['это', 'ну', 'но', 'еще', 'ещё', 'оно', 'типа'])
    # Lowercase the text and strip list punctuation and hyphens
    text = str(data).lower().replace("'", "").replace(",", "").replace("[", "").replace("]", "").replace("-", " ")
    text = remove_chars_from_text(text, spec_chars)
    text = remove_chars_from_text(text, string.digits)
    #text = remove_emojis(text)
    if len(text) >= 1:
        text_tokens = word_tokenize(text)
    else:
        return [], []
    # Stem the tokens if stemming is enabled
    if read_conf('select_type_stem') == 'On':
        text_tokens = [stemmer.stem(word) for word in text_tokens]
    # Filter the tokens
    text_tokens = [token.strip() for token in text_tokens if
                   token not in russian_stopwords and
                   len(token) >= 4 and
                   len(token) < 26 and
                   token not in english_stopwords and
                   'http' not in token and
                   token not in stopwords_list.stopword_txt]
    # Frequency distribution
    text = nltk.Text(text_tokens)
    fdist = FreqDist(text)
    fdist = fdist.most_common(most_com)
    data = [i[0] for i in fdist]
    return fdist, data
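
# A minimal usage sketch, assuming `data` is an iterable of raw message
# strings (the functions stringify the whole collection before cleaning)
# and `most_com` caps the number of (token, count) pairs returned.
# The sample messages below are hypothetical.
if __name__ == "__main__":
    sample_messages = ["Привет, как дела?", "Ну это типа тест", "Привет ещё раз!"]
    fdist, tokens = analyse(sample_messages, most_com=10)
    print(fdist)  # e.g. [('привет', 2), ...]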