-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalize.py
More file actions
executable file
·82 lines (64 loc) · 2.11 KB
/
Copy pathnormalize.py
File metadata and controls
executable file
·82 lines (64 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from logging import getLogger
from rhyme import get_rhyme_mask
from w2v_model import extract_key_words
from grammar_utils import *
# Module-level logger, named after this module via __name__.
logger = getLogger(__name__)
def _split_fragments(text):
    """Cut *text* into raw sentence candidates.

    A '|' marker is inserted after each of ']', '}', '>', newline and every
    member of ``ENDS``; ' - ' is replaced by a marker; the text is then
    split on the markers.
    """
    for mark in frozenset(']}>\n') | ENDS:
        text = text.replace(mark, mark + '|')
    return text.replace(' - ', '|').split('|')


def _clean_fragment(fragment):
    """Return *fragment* as a tidy sentence, or None if nothing is left.

    Whitespace runs are collapsed to single spaces (which also trims both
    ends), the first character is upper-cased, and a '.' is appended unless
    the last character is already a member of ``ENDS``.
    """
    collapsed = ' '.join(fragment.split())
    if not collapsed:
        return None
    sentence = collapsed[0].upper() + collapsed[1:]
    if sentence[-1] not in ENDS:
        sentence += '.'
    return sentence


def get_normal_sentences(post):
    """Extract rhyme-ready sentences from a post.

    The post text is split into short fragments, each fragment is cleaned
    up, and every cleaned sentence that passes the length, ending, rhyme
    mask, morphology tag and key-word filters yields one result item.

    Args:
        post: mapping with at least the keys 'text' and 'id'. It is copied,
            not modified.

    Returns:
        A list of dicts. Each one is a copy of *post* with these keys
        set or overridden: 'text' (the cleaned sentence), 'id'
        ('<post id>-<index>'), 'words', 'norm', 'length', 'last_word',
        'norm_last_word', 'mask', 'key_words' and 'morphy_tag'.
        An empty list is returned when the post has no text.
    """
    text = post['text']
    if not text:
        # None or empty text cannot yield any fragment.
        return []

    # De-duplicate on the *cleaned* form, so fragments that differ only in
    # spacing or in the case of the first letter collapse into one, and keep
    # first-occurrence order so the index baked into each id does not depend
    # on set iteration order.
    cleaned_sentences = []
    seen = set()
    for fragment in _split_fragments(text):
        # Cheap length test first; only raw fragments of 1..39 characters
        # that contain Russian characters are considered.
        if not (0 < len(fragment) < 40 and has_russian_chars(fragment)):
            continue
        # Fragments containing '#' or '_' are skipped outright.
        if '#' in fragment or '_' in fragment:
            continue
        sentence = _clean_fragment(fragment)
        if sentence is None or sentence in seen:
            continue
        seen.add(sentence)
        cleaned_sentences.append(sentence)

    results = []
    for index, sentence in enumerate(cleaned_sentences):
        words = normalize_sentence(sentence)
        norm = ' '.join(words)
        length = get_length(norm)
        # Keep only sentences whose get_length() value is within 3..16.
        if length < 3 or length > 16:
            continue

        last_word = words[-1]
        if last_word in BAD_ENDINGS:
            continue

        mask = get_rhyme_mask(last_word)
        if not mask:
            continue

        norm_last_word, morphy_tag = try_normalize_word(last_word)
        if not morphy_tag or BAD_END_TAGS & morphy_tag:
            continue

        key_words = extract_key_words(
            [try_normalize_word(word)[0] for word in words])
        if not key_words:
            continue

        item = dict(post)
        item.update({
            'text': sentence,
            # The index is the sentence's position among the cleaned
            # sentences, counted before the filters above are applied.
            'id': '%s-%d' % (post['id'], index),
            'words': words,
            'norm': norm,
            'length': length,
            'last_word': last_word,
            'norm_last_word': norm_last_word,
            'mask': mask,
            'key_words': key_words,
            'morphy_tag': morphy_tag,
        })
        results.append(item)
    return results
# Script entry point: currently a no-op, so running this file directly
# does nothing.
if __name__ == '__main__':
    pass