-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspliting.py
More file actions
22 lines (18 loc) · 932 Bytes
/
spliting.py
File metadata and controls
22 lines (18 loc) · 932 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import re
from razdel import sentenize
def split_list_merge_by_pattern(lst: list, pattern: str) -> list[str]:
    """Split every string in *lst* by *pattern* and flatten the pieces.

    Each element of *lst* is split with ``re.split``; the resulting pieces
    are stripped of surrounding whitespace, and pieces of length 0 or 1
    after stripping are discarded. The surviving pieces of all elements
    are returned in their original order as one flat list.

    Args:
        lst: Strings to split.
        pattern: Regular expression used as the separator.

    Returns:
        A flat list of the stripped pieces longer than one character.

    Raises:
        re.error: If *pattern* is not a valid regular expression.
    """
    # Compile once instead of re-resolving the pattern for every element.
    splitter = re.compile(pattern)

    result: list[str] = []
    for token in lst:
        # NOTE(review): as written this replaces a double quote with itself
        # and changes nothing; possibly the first argument was once an HTML
        # entity such as '&quot;' -- confirm against the intended input.
        token = token.replace('"', '"')

        # When the pattern contains capturing groups, re.split also returns
        # the captured text, and None for groups that did not take part in
        # the match. Skip those Nones instead of crashing on None.strip().
        stripped = (
            piece.strip()
            for piece in splitter.split(token)
            if piece is not None
        )
        result.extend(piece for piece in stripped if len(piece) > 1)
    return result
def split(string_to_split: str) -> list[str]:
    """Split text into sentences, then into smaller fragments.

    The text is first cut into sentences with ``razdel.sentenize``. Each
    sentence is then split further by ``split_list_merge_by_pattern`` using
    a regular expression of list markers, punctuation and keywords.

    Args:
        string_to_split: The text to break apart.

    Returns:
        The fragments produced by ``split_list_merge_by_pattern`` for the
        sentence list.
    """
    sentences = [sentence.text for sentence in sentenize(string_to_split)]

    # Pattern fragments that were left commented out in the original code:
    # |(?<=[а-я]):
    # |(?= [А-Я][а-я])
    # |; +-|; +—|•|; +~|; +\+|—|-

    # Raw string literals: the previous plain literal contained '\+', which
    # is an invalid string escape (a warning today, an error in future
    # Python versions). The regex itself is unchanged; '\n' is now resolved
    # by the regex engine instead of the string parser and still matches a
    # newline. Alternation order is preserved.
    separator_pattern = (
        # A semicolon, one or more spaces, then '-', '—', '~' or '+'
        # (the whole sequence is consumed).
        r'; +-|; +—|; +~|; +\+'
        # Zero-width split right after a lowercase Cyrillic letter + colon.
        r'|(?<=[а-я]:)'
        # A newline (consumed).
        r'|\n'
        # Zero-width splits just before these characters, which therefore
        # stay at the start of the following fragment.
        r'|(?=•)|(?=·)|(?=— )|(?=- )|(?=⠇)|(?=°)|(?=⠂)|(?=;)'
        # The Russian words for "requirements", "responsibilities" and
        # "conditions", with either-case first letter (consumed).
        r'|[Тт]ребования|[Оо]бязанности|[Уу]словия'
    )
    return split_list_merge_by_pattern(sentences, separator_pattern)