-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdivide.py
More file actions
115 lines (95 loc) · 3.59 KB
/
Copy pathdivide.py
File metadata and controls
115 lines (95 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import re
from docx import Document
from dotenv import load_dotenv
# Load environment variables from the .env file
load_dotenv()
# Marker kinds recognised at the start of a paragraph, tested in this order.
# The patterns are applied with ``match`` and therefore only describe the
# prefix; anything (including line breaks) may follow the marker.
_MARKER_PATTERNS = (
    # Chinese-numeral heading, e.g. 一, 二
    ('grandfafa', re.compile(r'[一二三四五六七八九十]')),
    # Arabic-numeral heading, e.g. 1, 2
    ('grandfa', re.compile(r'\d')),
    # Parenthesised number, e.g. (1); ASCII or full-width brackets
    ('fa', re.compile(r'[(\uff08]\d+[)\uff09]')),
    # Circled number, e.g. ①, ②
    ('circle', re.compile(r'[①②③④⑤⑥⑦⑧⑨⑩]')),
)

# Context levels that become stale once a new paragraph of the given kind
# starts (a new heading must not inherit sub-levels of the previous section).
_LOWER_LEVELS = {
    'grandfafa': ('grandfa', 'fa'),
    'grandfa': ('fa',),
    'fa': (),
}


def _marker_kind(text):
    """Return the marker kind *text* starts with, or ``None`` if unmarked."""
    for kind, pattern in _MARKER_PATTERNS:
        if pattern.match(text):
            return kind
    return None


def _render_task(texts, context, members, member_kind):
    """Join the current context paragraphs and the grouped items into one task."""
    # A group of parenthesised items is prefixed by both heading levels; a
    # group of circled items is additionally prefixed by its parenthesised
    # parent. Levels that have not been seen contribute nothing.
    if member_kind == 'fa':
        levels = ('grandfafa', 'grandfa')
    else:
        levels = ('grandfafa', 'grandfa', 'fa')
    pieces = [texts[context[level]] for level in levels
              if context[level] is not None]
    pieces.extend(texts[j] for j in members)
    return ''.join(pieces)


def _group_tasks(paragraphs):
    """Group marked paragraph texts into task strings (pure, no file access)."""
    texts = list(paragraphs)
    tasks = []
    # Index of the most recent paragraph seen for each context level.
    context = {'grandfafa': None, 'grandfa': None, 'fa': None}
    anchor = None      # latest marked paragraph; unmarked text is appended to it
    previous = None    # index of the previous marked paragraph
    run_kind = None    # 'fa' / 'circle' if the previous marked paragraph was an item
    members = []       # indices of the items collected for the pending task

    for i, para in enumerate(texts):
        if para == '':
            continue

        kind = _marker_kind(para)
        if kind is None:
            # Unmarked text continues the latest marked paragraph. Text that
            # precedes the first marker has nothing to attach to and is dropped.
            if anchor is not None:
                texts[anchor] += para
            texts[i] = ''
            continue

        anchor = i
        item_kind = kind if kind in ('fa', 'circle') else None

        if item_kind is not None and item_kind == run_kind:
            # Same item kind as the previous marked paragraph: extend the
            # group. The first item of a group is only added once a sibling
            # confirms it. Indices (not text copies) are stored so that
            # continuation text merged later is still included on output.
            if not members:
                members.append(previous)
            members.append(i)
        elif members:
            # The level changed: emit the pending group with the context that
            # was current while it was collected.
            tasks.append(_render_task(texts, context, members, run_kind))
            members = []

        previous = i
        if kind in context:
            context[kind] = i
            for lower in _LOWER_LEVELS[kind]:
                context[lower] = None
        run_kind = item_kind

    # Emit the last group only if there is one; otherwise nothing is pending.
    if members:
        tasks.append(_render_task(texts, context, members, run_kind))
    return tasks


def extract_tasks(file_path):
    """Extract task strings from the paragraphs of a .docx document.

    Paragraphs are classified by the marker they start with: a Chinese
    numeral, an Arabic numeral, a parenthesised number or a circled number.
    Empty paragraphs are skipped and unmarked paragraphs are appended to the
    most recent marked paragraph. Runs of parenthesised items, or of circled
    items, that follow each other among the marked paragraphs are joined into
    one task, prefixed with the text of the current Chinese-numeral and
    Arabic-numeral paragraphs (and, for circled items, the current
    parenthesised paragraph).

    NOTE(review): as in the original logic, an item that is not directly
    followed by a sibling of the same kind is never emitted as a task of its
    own; confirm that this is intended.

    Args:
        file_path: Location of the document, passed unchanged to ``Document``.

    Returns:
        A list of task strings in document order; empty if no group of items
        was found.
    """
    doc = Document(file_path)
    # Only the plain paragraph text is needed for the grouping.
    return _group_tasks([para.text for para in doc.paragraphs])
def main():
    """Print every task extracted from the document named by ``FILE_PATH``.

    The document path is read from the ``FILE_PATH`` environment variable and
    each task returned by ``extract_tasks`` is printed on its own line.

    Raises:
        SystemExit: If ``FILE_PATH`` is unset or empty, so the problem is
            reported clearly instead of surfacing as a failure to open ''.
    """
    file_path = os.getenv("FILE_PATH", "")
    if not file_path:
        raise SystemExit(
            "FILE_PATH is not set; point it at the .docx file to process"
        )
    for task in extract_tasks(file_path):
        print(task)
# Run main() only when this file is executed as a script
if __name__ == "__main__":
    main()