Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 56 additions & 29 deletions bun.lock

Large diffs are not rendered by default.

117 changes: 117 additions & 0 deletions modules/tool/packages/mineru/children/parseLocal/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { defineTool } from '@tool/type';
import { FlowNodeInputTypeEnum, WorkflowIOValueTypeEnum } from '@tool/type/fastgpt';
import { ToolTypeEnum } from '@tool/type/tool';

/**
 * Static definition of the "parse with a locally deployed MinerU" child tool:
 * display names, documentation link and the versioned list of inputs/outputs.
 *
 * The input keys below match the fields of the zod `InputType` schema in
 * `./src/index.ts`, which forwards them to MinerU's `file_parse` endpoint.
 */
export default defineTool({
  type: ToolTypeEnum.tools,
  name: {
    'zh-CN': '本地部署解析',
    en: 'Parse file by local MinerU api v2'
  },
  description: {
    'zh-CN': '使用本地部署的 MinerU api v2 解析文件,支持 pdf、png、jpg、jpeg 等多种格式',
    // The English text previously ended with an untranslated Chinese fragment.
    en: 'Parse the file using the local MinerU api v2, support pdf, png, jpg, jpeg and other formats'
  },
  courseUrl: 'https://github.com/opendatalab/MinerU/blob/master/mineru/cli/fast_api.py#L63',
  versionList: [
    {
      value: '0.1.0',
      description: 'Default version',
      inputs: [
        // Files to parse; the runtime schema receives them as an array of strings.
        {
          key: 'files',
          label: 'files',
          renderTypeList: [FlowNodeInputTypeEnum.fileSelect, FlowNodeInputTypeEnum.reference],
          valueType: WorkflowIOValueTypeEnum.arrayString,
          required: true,
          description: '需要解析的文件(支持.pdf、.png、.jpg、.jpeg 多种格式)',
          canSelectFile: true,
          canSelectImg: true
        },
        // Parsing strategy, defaults to "auto".
        {
          key: 'parse_method',
          label: '解析方法',
          renderTypeList: [FlowNodeInputTypeEnum.select],
          valueType: WorkflowIOValueTypeEnum.string,
          list: [
            { label: 'auto', value: 'auto' },
            { label: 'ocr', value: 'ocr' },
            { label: 'txt', value: 'txt' }
          ],
          required: false,
          description: '解析方法,默认 auto',
          defaultValue: 'auto'
        },
        // Toggle formula recognition, on by default.
        {
          key: 'formula_enable',
          label: '开启公式识别',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否启动公式识别功能,默认 true',
          defaultValue: true
        },
        // Toggle table recognition, on by default.
        {
          key: 'table_enable',
          label: '开启表格识别',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否启动表格识别功能,默认 true',
          defaultValue: true
        },
        // Whether the structured JSON content list is requested, off by default.
        {
          key: 'return_content_list',
          label: '返回结构化 json',
          renderTypeList: [FlowNodeInputTypeEnum.switch],
          valueType: WorkflowIOValueTypeEnum.boolean,
          required: false,
          description: '是否返回结构化 json,默认 false',
          defaultValue: false
        },
        // Comma-separated document language codes, defaults to "ch".
        {
          key: 'lang_list',
          label: '文档语言',
          renderTypeList: [FlowNodeInputTypeEnum.textarea],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description:
            '指定文档语言,默认 ch,长度跟文件数量一致,否则取第一个,按逗号分隔,其他可选值列表详见:https://www.paddleocr.ai/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html#4-supported-languages-and-abbreviations',
          defaultValue: 'ch'
        },
        // MinerU parsing backend, defaults to "pipeline".
        {
          key: 'backend',
          label: '解析后端',
          renderTypeList: [FlowNodeInputTypeEnum.select],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description: 'mineru 解析后端,默认pipeline。',
          list: [
            { label: 'pipeline', value: 'pipeline' },
            { label: 'vlm-transformers', value: 'vlm-transformers' },
            { label: 'vlm-sglang-engine', value: 'vlm-sglang-engine' },
            { label: 'vlm-sglang-client', value: 'vlm-sglang-client' }
          ],
          defaultValue: 'pipeline'
        },
        // sglang server address; the tool callback sends it as the `server_url` form field.
        {
          key: 'sglang_server_url',
          label: 'sglang 服务地址',
          renderTypeList: [FlowNodeInputTypeEnum.input],
          valueType: WorkflowIOValueTypeEnum.string,
          required: false,
          description: 'sglang 服务地址,当 backend 为 vlm-sglang-client 时必填。',
          defaultValue: ''
        }
      ],
      outputs: [
        // NOTE(review): the runtime `OutputType` schema defines `result` as a record
        // (parsed file name -> array of items) while this declares `arrayObject` —
        // confirm which shape workflow consumers expect.
        {
          valueType: WorkflowIOValueTypeEnum.arrayObject,
          key: 'result',
          label: '解析结果',
          description: '解析后的数据'
        }
      ]
    }
  ]
});
10 changes: 10 additions & 0 deletions modules/tool/packages/mineru/children/parseLocal/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import config from './config';
import { InputType, OutputType, tool as toolCb } from './src';
import { exportTool } from '@tool/utils/tool';

// Bundle the tool callback, its input/output schemas and the static config
// into the module's default export.
const parseLocalTool = exportTool({
  toolCb,
  InputType,
  OutputType,
  config
});

export default parseLocalTool;
172 changes: 172 additions & 0 deletions modules/tool/packages/mineru/children/parseLocal/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import { uploadFile } from '@tool/utils/uploadFile';
import { z } from 'zod';
import path from 'path';

/** Optional string field that resolves to `fallback` when the caller omits it. */
const optionalString = (fallback: string) => z.string().optional().default(fallback);

/** Optional boolean field that resolves to `fallback` when the caller omits it. */
const optionalBoolean = (fallback: boolean) => z.boolean().optional().default(fallback);

/**
 * Runtime schema for the tool input. Only `base_url` and `files` are
 * mandatory; every other field falls back to the default shown here.
 */
export const InputType = z.object({
  base_url: z.string(),
  token: optionalString(''),
  files: z.array(z.string()),
  parse_method: optionalString('auto'),
  formula_enable: optionalBoolean(true),
  table_enable: optionalBoolean(true),
  return_md: optionalBoolean(true),
  return_content_list: optionalBoolean(false),
  lang_list: optionalString('ch'),
  backend: optionalString('pipeline'),
  sglang_server_url: optionalString('')
});

/**
 * Validated tool input extended with the HTTP headers used for the MinerU request.
 */
interface InnerPropsType extends z.infer<typeof InputType> {
// Request headers; `tool` fills this from `buildHeaders(token)`.
headers: Record<string, string>;
}

/**
 * Parse result for a single file as read from the MinerU response.
 *
 * Every field is optional: `tool` checks each one for presence before using
 * it, and `md_content` / `content_list` are only requested when the
 * `return_md` / `return_content_list` flags are enabled.
 */
interface ParsedResultItemType {
  /** Extracted images keyed by image file name; `tool` uploads each value as base64 content. */
  images?: Record<string, string>;
  /** Structured content list, delivered as a JSON-encoded string that `tool` parses. */
  content_list?: string;
  /** Markdown rendering of the document; references images as `images/<name>`. */
  md_content?: string;
}

/**
 * JSON body of the MinerU `file_parse` response, as consumed by `tool`.
 */
interface ParsedResultType {
// Per-file parse results; `tool` uses each key as the output `filename`.
results: Record<string, ParsedResultItemType>;
}

/**
 * One entry of the tool output, describing a single parsed file.
 */
interface ResultItemType {
  /** Name of the parsed file, taken from the key of the MinerU `results` map. */
  filename: string;
  /** Access URLs of the uploaded images, present when the response carried images. */
  images?: string[];
  /**
   * Structured content decoded from MinerU's JSON string. The element shape is
   * not validated here, so it is typed as `unknown` rather than `any`.
   */
  content_list?: unknown[];
  /** Markdown content with image references rewritten to the uploaded URLs. */
  md_content?: string;
}

/** Schema of a single parsed-file entry inside the tool output. */
const parsedFileSchema = z.object({
  filename: z.string(),
  images: z.array(z.string()).optional(),
  content_list: z.array(z.any()).optional(),
  md_content: z.string().optional()
});

/**
 * Runtime schema for the tool output: `result` maps a string key to the list
 * of parsed-file entries collected under that key.
 */
export const OutputType = z.object({
  result: z.record(z.array(parsedFileSchema))
});

/**
 * Builds the request headers for the MinerU call.
 *
 * @param token Optional bearer token.
 * @returns An object carrying a `Bearer` Authorization header when `token` is
 *   non-empty, otherwise an empty object.
 */
function buildHeaders(token?: string) {
  return token ? { Authorization: `Bearer ${token}` } : {};
}

/**
 * Uploads one base64-encoded image through `uploadFile`.
 *
 * @param filename Name passed to the upload as its default file name.
 * @param content Base64 image content.
 * @returns The `accessUrl` reported by `uploadFile`.
 */
async function uploadBase64Image(filename: string, content: string) {
  const uploaded = await uploadFile({ base64: content, defaultFilename: filename });
  return uploaded.accessUrl;
}

/**
 * Rewrites every `images/<name>` reference in `content` to the URL mapped to
 * that image name.
 *
 * Matching and substitution are literal: the image name is not treated as a
 * regular expression (so `.` or `(` in a file name only match themselves) and
 * the URL is not treated as a replacement pattern (so `$&` or `$1` inside a
 * URL are kept verbatim).
 *
 * @param content Text containing the image references, e.g. parsed markdown.
 * @param images Map from image file name to its replacement URL.
 * @returns The content with all known image references replaced.
 */
function replaceImageUrl(content: string, images: Record<string, string>) {
  let rewritten = content;
  for (const [imageName, accessUrl] of Object.entries(images)) {
    // split/join replaces all occurrences without regex or `$` interpretation.
    rewritten = rewritten.split(`images/${imageName}`).join(accessUrl);
  }
  return rewritten;
}

/**
 * Parses the given files with a self-hosted MinerU service.
 *
 * Steps: download every input file, send all of them in a single multipart
 * POST to `<base_url>/file_parse`, upload the returned base64 images, and
 * rewrite the `images/<name>` references in the markdown to the uploaded URLs.
 *
 * @param props Validated tool input (see `InputType`).
 * @returns `result`, keyed by the file name MinerU reports, each holding the
 *   parsed entries for that file.
 * @throws Rejects with a string message when `base_url` or `files` is empty,
 *   when a file cannot be downloaded, when MinerU answers with a non-200
 *   status, or when the response has no `results`. Throws an `Error` when a
 *   returned `content_list` is not valid JSON.
 */
export async function tool(props: z.infer<typeof InputType>): Promise<z.infer<typeof OutputType>> {
  const { base_url, token, files } = props;

  if (!base_url) {
    return Promise.reject('MinerU base url is required');
  }
  if (files.length === 0) {
    return Promise.reject('At least one file is required');
  }

  // Tolerate a trailing slash in the configured base url.
  const url = `${base_url.replace(/\/+$/, '')}/file_parse`;

  // Build one multipart form carrying every file.
  const formData = new FormData();
  for (const filePath of files) {
    // Drop the query string (e.g. signed-url parameters) before taking the file name.
    const baseName = path.basename(filePath.split('?')[0]);
    const fileRes = await fetch(filePath);
    if (!fileRes.ok) {
      // Never forward an error page to MinerU as if it were the document.
      return Promise.reject(
        `Download file failed: ${baseName} ${fileRes.status} ${fileRes.statusText}`
      );
    }
    formData.append('files', await fileRes.blob(), baseName);
  }

  // MinerU's file_parse endpoint takes `lang_list` as a list form field, so each
  // language is sent as its own entry instead of one comma-joined string.
  const langList = props.lang_list
    .split(',')
    .map((lang) => lang.trim())
    .filter((lang) => lang.length > 0);
  for (const lang of langList.length > 0 ? langList : ['ch']) {
    formData.append('lang_list', lang);
  }

  formData.append('server_url', props.sglang_server_url);
  formData.append('backend', props.backend);
  formData.append('parse_method', props.parse_method);
  formData.append('formula_enable', props.formula_enable.toString());
  formData.append('table_enable', props.table_enable.toString());
  formData.append('return_md', props.return_md.toString());
  formData.append('return_content_list', props.return_content_list.toString());
  // Images are always requested so markdown image references can be rewritten below.
  formData.append('return_images', true.toString());

  const res = await fetch(url, {
    method: 'POST',
    headers: buildHeaders(token) as Record<string, string>,
    body: formData
  });

  if (res.status !== 200) {
    return Promise.reject(`Parse failed: ${res.status} ${res.statusText} ${await res.text()}`);
  }

  const data = (await res.json()) as ParsedResultType;

  if (!data.results) {
    return Promise.reject('Parsed result is empty');
  }

  const result: Record<string, ResultItemType[]> = {};

  for (const [parsedFilename, parsedItem] of Object.entries(data.results)) {
    const item: ResultItemType = { filename: parsedFilename };
    // Image name -> uploaded access URL, used to rewrite the markdown.
    const uploadedImages: Record<string, string> = {};

    if (parsedItem.images) {
      item.images = [];
      for (const [imageName, base64Content] of Object.entries(parsedItem.images)) {
        const accessUrl = await uploadBase64Image(imageName, base64Content);
        item.images.push(accessUrl);
        uploadedImages[imageName] = accessUrl;
      }
    }

    if (parsedItem.content_list) {
      try {
        item.content_list = JSON.parse(parsedItem.content_list);
      } catch {
        throw new Error('content_list is not a valid JSON string');
      }
    }

    if (parsedItem.md_content) {
      item.md_content = replaceImageUrl(parsedItem.md_content, uploadedImages);
    }

    if (!result[parsedFilename]) {
      result[parsedFilename] = [];
    }
    result[parsedFilename].push(item);
  }

  return { result };
}
Loading