Skip to content

Commit 6f571bb

Browse files
authored
Merge pull request #1839 from opendatalab/release-1.2.2
Release 1.2.2
2 parents 5db81bf + 380cb4d commit 6f571bb

File tree

2 files changed

+17
-14
lines changed

2 files changed

+17
-14
lines changed

magic_pdf/post_proc/para_split_v3.py

+16 -13
Original file line number | Diff line number | Diff line change
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
108108
):
109109
multiple_para_flag = True
110110

111-
for line in block['lines']:
112-
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
113-
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
114-
if (
115-
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
116-
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
117-
):
118-
external_sides_not_close_num += 1
119-
if abs(line_mid_x - block_mid_x) < line_height / 2:
120-
center_close_num += 1
111+
block_text = ''
121112

113+
for line in block['lines']:
122114
line_text = ''
123115

124116
for span in line['spans']:
125117
span_type = span['type']
126118
if span_type == ContentType.Text:
127119
line_text += span['content'].strip()
128-
129120
# 添加所有文本,包括空行,保持与block['lines']长度一致
130121
lines_text_list.append(line_text)
131122
block_text = ''.join(lines_text_list)
132-
block_lang = detect_lang(block_text)
133-
# logger.info(f"block_lang: {block_lang}")
123+
124+
block_lang = detect_lang(block_text)
125+
# logger.info(f"block_lang: {block_lang}")
126+
127+
for line in block['lines']:
128+
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
129+
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
130+
if (
131+
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
132+
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
133+
):
134+
external_sides_not_close_num += 1
135+
if abs(line_mid_x - block_mid_x) < line_height / 2:
136+
center_close_num += 1
134137

135138
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
136139
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:

requirements.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
boto3>=1.28.43
22
Brotli>=1.1.0
33
click>=8.1.7
4-
fast-langdetect>=0.2.3
4+
fast-langdetect>=0.2.3,<0.3.0
55
loguru>=0.6.0
66
numpy>=1.21.6,<2.0.0
77
pydantic>=2.7.2

0 commit comments

Comments
 (0)