-
Notifications
You must be signed in to change notification settings - Fork 38
feat: add mineru for document parsing tool #181
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 5 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
070abd8
feat: add mineru plugin for document parsing
gary-Shen 6c521b1
update: return images as default
gary-Shen ca6cc1f
update: update input type
gary-Shen f0a37e8
update: update tool name and description
gary-Shen 31e6ba8
update: update tool description
gary-Shen 717f56a
update: fix comments from copilot
gary-Shen 299bef8
update: update tool name
gary-Shen 703ec42
update: fix filename for local parsing
gary-Shen 4113ac8
update: remove unused imports
gary-Shen 87ced9c
fix: some typo and type definition
FinleyGe File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
117 changes: 117 additions & 0 deletions
117
modules/tool/packages/mineru/children/parseLocal/config.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import { defineTool } from '@tool/type'; | ||
import { FlowNodeInputTypeEnum, WorkflowIOValueTypeEnum } from '@tool/type/fastgpt'; | ||
import { ToolTypeEnum } from '@tool/type/tool'; | ||
|
||
/**
 * Tool definition for parsing documents with a locally deployed MinerU api v2.
 *
 * Declares the display name/description, the workflow inputs rendered in the
 * editor and the single `result` output produced by the tool callback.
 */
export default defineTool({
  type: ToolTypeEnum.tools,
  name: {
    'zh-CN': '本地部署解析',
    en: 'Parse file by local MinerU api v2'
  },
  description: {
    'zh-CN': '使用本地部署的 MinerU api v2 解析文件,支持 pdf、png、jpg、jpeg 等多种格式',
    // The English text previously ended with untranslated Chinese ("jpeg等多种格式").
    en: 'Parse the file using the local MinerU api v2, supports pdf, png, jpg, jpeg and other formats'
  },
  courseUrl: 'https://github.com/opendatalab/MinerU/blob/master/mineru/cli/fast_api.py#L63',
  versionList: [
    {
      value: '0.1.0',
      description: 'Default version',
      inputs: [
        // Files to parse; selectable from uploads or referenced from another node.
        {
          key: 'files',
          label: 'files',
          renderTypeList: [FlowNodeInputTypeEnum.fileSelect, FlowNodeInputTypeEnum.reference],
          valueType: WorkflowIOValueTypeEnum.arrayString,
          required: true,
          description: '需要解析的文件(支持.pdf、.png、.jpg、.jpeg 多种格式)',
          canSelectFile: true,
          canSelectImg: true
        },
        // Parse method: auto / ocr / txt.
        {
          key: 'parse_method',
          label: '解析方法',
          renderTypeList: [FlowNodeInputTypeEnum.select],
          valueType: WorkflowIOValueTypeEnum.string,
          list: [
            { label: 'auto', value: 'auto' },
            { label: 'ocr', value: 'ocr' },
            { label: 'txt', value: 'txt' }
          ],
          required: false,
          description: '解析方法,默认 auto',
          defaultValue: 'auto'
        },
        // Toggle for formula recognition.
        {
          key: 'formula_enable',
          label: '开启公式识别',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否启动公式识别功能,默认 true',
          defaultValue: true
        },
        // Toggle for table recognition.
        {
          key: 'table_enable',
          label: '开启表格识别',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否启动表格识别功能,默认 true',
          defaultValue: true
        },
        // Toggle for returning the structured JSON content list.
        {
          key: 'return_content_list',
          label: '返回结构化 json',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否返回结构化 json,默认 false',
          defaultValue: false
        },
        // Comma separated document languages.
        {
          key: 'lang_list',
          label: '文档语言',
          renderTypeList: [FlowNodeInputTypeEnum.textarea],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description:
            '指定文档语言,默认 ch,长度跟文件数量一致,否则取第一个,按逗号分隔,其他可选值列表详见:https://www.paddleocr.ai/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html#4-supported-languages-and-abbreviations',
          defaultValue: 'ch'
        },
        // MinerU parsing backend.
        {
          key: 'backend',
          label: '解析后端',
          renderTypeList: [FlowNodeInputTypeEnum.select],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description: 'mineru 解析后端,默认pipeline。',
          list: [
            { label: 'pipeline', value: 'pipeline' },
            { label: 'vlm-transformers', value: 'vlm-transformers' },
            { label: 'vlm-sglang-engine', value: 'vlm-sglang-engine' },
            { label: 'vlm-sglang-client', value: 'vlm-sglang-client' }
          ],
          defaultValue: 'pipeline'
        },
        // sglang server address, only relevant for the vlm-sglang-client backend.
        {
          key: 'sglang_server_url',
          label: 'sglang 服务地址',
          renderTypeList: [FlowNodeInputTypeEnum.input],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description: 'sglang 服务地址,当 backend 为 vlm-sglang-client 时必填。',
          defaultValue: ''
        }
      ],
      outputs: [
        {
          valueType: WorkflowIOValueTypeEnum.arrayObject,
          key: 'result',
          label: '解析结果',
          description: '解析后的数据'
        }
      ]
    }
  ]
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import config from './config'; | ||
import { InputType, OutputType, tool as toolCb } from './src'; | ||
import { exportTool } from '@tool/utils/tool'; | ||
|
||
// Bundle the tool callback, its input/output schemas and the static config
// into the single default export of this tool package.
export default exportTool({
  toolCb,
  InputType,
  OutputType,
  config
});
172 changes: 172 additions & 0 deletions
172
modules/tool/packages/mineru/children/parseLocal/src/index.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
import { uploadFile } from '@tool/utils/uploadFile'; | ||
import { z } from 'zod'; | ||
import path from 'path'; | ||
|
||
/**
 * Runtime schema for the tool input.
 *
 * Every field except `base_url` and `files` is optional and falls back to the
 * default declared here.
 */
export const InputType = z.object({
  // Base url of the MinerU service; `/file_parse` is appended to it.
  base_url: z.string(),
  // Optional bearer token; an empty string means no Authorization header.
  token: z.string().optional().default(''),
  // Locations of the files to parse; each entry is downloaded with fetch.
  files: z.array(z.string()),
  parse_method: z.string().optional().default('auto'),
  formula_enable: z.boolean().optional().default(true),
  table_enable: z.boolean().optional().default(true),
  return_md: z.boolean().optional().default(true),
  return_content_list: z.boolean().optional().default(false),
  // Comma separated language codes.
  lang_list: z.string().optional().default('ch'),
  backend: z.string().optional().default('pipeline'),
  // Sent to the service as the `server_url` form field.
  sglang_server_url: z.string().optional().default('')
});
|
||
/** Validated tool input plus the HTTP headers derived from the token. */
interface InnerPropsType extends z.infer<typeof InputType> {
  headers: Record<string, string>;
}

/** Per-file entry of the parse response as this tool reads it. */
interface ParsedResultItemType {
  /** Image name mapped to its content; each value is uploaded as base64. */
  images: Record<string, string>;
  /** JSON-encoded string; decoded with `JSON.parse` before being returned. */
  content_list?: string;
  md_content: string;
}

/** Parse response body: results keyed by file name. */
interface ParsedResultType {
  results: Record<string, ParsedResultItemType>;
}

/** Per-file entry returned by the tool after post-processing. */
interface ResultItemType {
  filename: string;
  /** Access urls of the uploaded images. */
  images?: string[];
  /**
   * Decoded `content_list`. Typed as `unknown[]` rather than `any[]` because
   * the element shape is not validated here.
   */
  content_list?: unknown[];
  /** Markdown with `images/<name>` references replaced by access urls. */
  md_content?: string;
}
|
||
// Schema of a single parsed file entry.
const ParsedFileItemSchema = z.object({
  filename: z.string(),
  images: z.array(z.string()).optional(),
  content_list: z.array(z.any()).optional(),
  md_content: z.string().optional()
});

/**
 * Runtime schema for the tool output: `result` maps a file name to the list
 * of entries parsed for it.
 */
export const OutputType = z.object({
  result: z.record(z.array(ParsedFileItemSchema))
});
|
||
/**
 * Builds the request headers for the MinerU service.
 *
 * The explicit return type keeps both branches assignable to a plain header
 * map, so callers do not need a type assertion.
 *
 * @param token - Optional bearer token; empty or missing means no auth header.
 * @returns `{ Authorization: 'Bearer <token>' }` when a token is given, otherwise `{}`.
 */
function buildHeaders(token?: string): Record<string, string> {
  if (!token) {
    return {};
  }

  return {
    Authorization: `Bearer ${token}`
  };
}
|
||
/**
 * Uploads one base64 encoded image through `uploadFile`.
 *
 * @param filename - Passed to `uploadFile` as `defaultFilename`.
 * @param content - Base64 payload of the image.
 * @returns The `accessUrl` reported by `uploadFile`.
 */
async function uploadBase64Image(filename: string, content: string) {
  const uploaded = await uploadFile({ base64: content, defaultFilename: filename });
  return uploaded.accessUrl;
}
|
||
/**
 * Replaces every `images/<name>` reference in `content` with the mapped url.
 *
 * Matching is literal: the image name is not treated as a regular expression
 * (names contain `.` and may contain `(`, `+`, ...), and the url is inserted
 * verbatim, so sequences such as `$&` are not expanded.
 *
 * @param content - Text (markdown) containing `images/<name>` references.
 * @param images - Image name mapped to its replacement url.
 * @returns The text with all known references replaced.
 */
function replaceImageUrl(content: string, images: Record<string, string>): string {
  let replaced = content;
  for (const [name, url] of Object.entries(images)) {
    // split/join performs a global, literal replacement.
    replaced = replaced.split(`images/${name}`).join(url);
  }
  return replaced;
}
|
||
/**
 * Parses the given files with a MinerU `file_parse` endpoint.
 *
 * All files are downloaded and sent in one multipart request. Images returned
 * by the service are uploaded via `uploadBase64Image`, and `images/<name>`
 * references inside the markdown are rewritten to the resulting access urls.
 *
 * @param props - Validated tool input (see `InputType`).
 * @returns `result`: parsed entries keyed by the file name reported by the service.
 * @throws Rejects with a string message when the base url is missing, a file
 *   cannot be downloaded, the service answers with a non-200 status or the
 *   response has no `results`. Throws an `Error` when `content_list` is not
 *   valid JSON.
 */
export async function tool(props: z.infer<typeof InputType>): Promise<z.infer<typeof OutputType>> {
  const { base_url, token, lang_list } = props;

  if (!base_url) {
    return Promise.reject('MinerU base url is required');
  }

  const innerProps: InnerPropsType = {
    ...props,
    headers: buildHeaders(token) as Record<string, string>
  };

  // `split` never returns an empty array, so blank entries are dropped first;
  // only then can the default-language fallback take effect.
  let langList = lang_list
    .split(',')
    .map((lang) => lang.trim())
    .filter((lang) => lang.length > 0);
  if (langList.length === 0) {
    langList = ['ch'];
  }

  // Strip trailing slashes so `http://host/` does not produce `//file_parse`.
  const url = `${base_url.replace(/\/+$/, '')}/file_parse`;

  const result: Record<string, ResultItemType[]> = {};

  // Build a single multipart form that carries all files.
  const formData = new FormData();
  for (const filePath of innerProps.files) {
    const baseName = path.basename(filePath.split('?')[0]);
    const fileRes = await fetch(filePath);
    // Without this check an error page would be uploaded as the document.
    if (!fileRes.ok) {
      return Promise.reject(
        `Download file failed: ${fileRes.status} ${fileRes.statusText} ${baseName}`
      );
    }
    formData.append('files', await fileRes.blob(), baseName);
  }
  formData.append('server_url', innerProps.sglang_server_url);
  // One `lang_list` field per language instead of a single comma joined value.
  // NOTE(review): assumes the endpoint declares `lang_list` as a list form field — confirm.
  for (const lang of langList) {
    formData.append('lang_list', lang);
  }
  formData.append('backend', innerProps.backend);
  formData.append('parse_method', innerProps.parse_method);
  formData.append('formula_enable', innerProps.formula_enable.toString());
  formData.append('table_enable', innerProps.table_enable.toString());
  formData.append('return_md', innerProps.return_md.toString());
  formData.append('return_content_list', innerProps.return_content_list.toString());
  // Images are always requested so markdown references can be rewritten.
  formData.append('return_images', true.toString());

  const res = await fetch(url, {
    method: 'POST',
    headers: { ...innerProps.headers },
    body: formData
  });

  if (res.status !== 200) {
    return Promise.reject(`Parse failed: ${res.status} ${res.statusText} ${await res.text()}`);
  }

  const data: ParsedResultType = await res.json();

  if (!data.results) {
    return Promise.reject('Parsed result is empty');
  }

  for (const [parsedFilename, resultItem] of Object.entries(data.results)) {
    const item: ResultItemType = {
      filename: parsedFilename
    };
    // Image name -> access url, used to rewrite the markdown below.
    const images: Record<string, string> = {};

    if (resultItem.images) {
      item.images = [];
      for (const [key, value] of Object.entries(resultItem.images)) {
        const accessUrl = await uploadBase64Image(key, value);
        item.images.push(accessUrl);
        images[key] = accessUrl;
      }
    }

    if (resultItem.content_list) {
      try {
        item.content_list = JSON.parse(resultItem.content_list);
      } catch {
        throw new Error('content_list is not a valid JSON string');
      }
    }

    if (resultItem.md_content) {
      item.md_content = replaceImageUrl(resultItem.md_content, images);
    }

    if (!result[parsedFilename]) {
      result[parsedFilename] = [];
    }
    result[parsedFilename].push(item);
  }

  return {
    result
  };
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.