Skip to content

Commit dc39574

Browse files
committed
add markitdown
1 parent 399a62b commit dc39574

File tree

7 files changed

+423
-0
lines changed

7 files changed

+423
-0
lines changed

tools/markitdown/.shed.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
name: markitdown
2+
owner: bgruening
3+
type: tool
4+
description: Convert documents to Markdown using markitdown
5+
homepage_url: https://github.com/microsoft/markitdown

tools/markitdown/markitdown.xml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
<tool id="markitdown" name="Markitdown" version="@TOOL_VERSION@" profile="22.05">
2+
<description>Convert documents to Markdown</description>
3+
4+
<!-- Macros inline -->
5+
<macros>
6+
<token name="@TOOL_VERSION@">0.1.1</token>
7+
</macros>
8+
9+
<requirements>
10+
<container type="docker">quay.io/bgruening/markitdown:@TOOL_VERSION@</container>
11+
</requirements>
12+
13+
<command detect_errors="exit_code"><![CDATA[
    ## Build the markitdown command line.
    ##
    ## ext_map translates the Galaxy datatype extension of the input dataset
    ## into the file extension passed to markitdown via -x.
    #set ext_map = {
        'pdf': 'pdf', 'docx': 'docx', 'pptx': 'pptx', 'xlsx': 'xlsx',
        'html': 'html', 'txt': 'txt', 'ipynb': 'ipynb',
        'markdown': 'md', 'zip': 'zip', 'tabular': 'csv', 'csv': 'csv'
    }

    ## Datatypes missing from the map (e.g. subtypes of the accepted formats)
    ## yield an empty string, in which case no -x hint is emitted at all.
    #set file_ext = ext_map.get($input.ext, '')
    ## A user-supplied extension override takes precedence over the mapped one.
    #set final_ext = $ext_hint if $ext_hint else $file_ext

    markitdown
        '$input'
        ## Only pass -x when there is a value; a bare -x would swallow the next argument.
        #if $final_ext:
            -x '$final_ext'
        #end if
        #if $mime_type:
            -m '$mime_type'
        #end if
        #if $charset:
            -c '$charset'
        #end if
        $keep_data_uris
        -o '$output'
]]></command>
35+
36+
<inputs>
    <!-- csv is listed explicitly so that the 'csv' entry of the extension map in the
         command section is reachable; NOTE(review): Galaxy's csv datatype does not appear
         to be matched by format="tabular", confirm against the datatype registry. -->
    <param name="input" type="data" format="pdf,docx,pptx,xlsx,html,txt,ipynb,markdown,zip,tabular,csv"
           label="Input file"/>
    <!-- Optional free-text hints; each one is only added to the command line when non-empty. -->
    <param name="ext_hint" type="text" optional="true" label="Extension override"
           help="Passed to markitdown as -x. Leave empty to derive the extension from the dataset's datatype."/>
    <param name="mime_type" type="text" optional="true" label="MIME type hint"
           help="Passed to markitdown as -m."/>
    <param name="charset" type="text" optional="true" label="Character set (e.g. UTF-8)"
           help="Passed to markitdown as -c."/>
    <param name="keep_data_uris" type="boolean" truevalue="--keep-data-uris" falsevalue="" label="Keep embedded data URIs"/>
</inputs>
44+
45+
<outputs>
46+
<data name="output" format="markdown" label="Converted Markdown output"/>
47+
</outputs>
48+
49+
<tests>
50+
<test>
51+
<param name="input" value="EAR.pdf" ftype="pdf"/>
52+
<output name="output">
53+
<assert_contents>
54+
<has_text text="Tags: ERGA-BGE"/>
55+
<has_text text="Lineage: mammalia_odb10"/>
56+
</assert_contents>
57+
</output>
58+
</test>
59+
60+
<test>
61+
<param name="input" value="example.docx" ftype="docx"/>
62+
<output name="output">
63+
<assert_contents>
64+
<has_text text="# Lorem ipsum dolor sit amet, consectetur adipiscing elit."/>
65+
</assert_contents>
66+
</output>
67+
</test>
68+
69+
<!--test>
70+
<param name="input" value="example.odt"/>
71+
<param name="ext_hint" value="odt"/>
72+
<output name="output">
73+
<assert_contents>
74+
<has_text text="This is a Word document"/>
75+
</assert_contents>
76+
</output>
77+
</test-->
78+
79+
<test>
80+
<param name="input" value="report_4.html" ftype="html"/>
81+
<param name="keep_data_uris" value="true"/>
82+
<output name="output">
83+
<assert_contents>
84+
<has_text text="is the contig length such that using longer or equal length contigs produces"/>
85+
</assert_contents>
86+
</output>
87+
</test>
88+
89+
<test>
90+
<param name="input" value="example.txt" ftype="txt"/>
91+
<param name="ext_hint" value="txt"/>
92+
<output name="output">
93+
<assert_contents>
94+
<has_text text="This is a plain text file"/>
95+
</assert_contents>
96+
</output>
97+
</test>
98+
99+
<test>
100+
<param name="input" value="example.ipynb" ftype="ipynb"/>
101+
<output name="output">
102+
<assert_contents>
103+
<has_text text="print(&quot;Hello, world!&quot;)"/>
104+
</assert_contents>
105+
</output>
106+
</test>
107+
</tests>
108+
109+
<help><![CDATA[
**Markitdown** converts rich document formats (PDF, DOCX, HTML, etc.) to Markdown.

-----

**Supported formats**

- PDF, DOCX, PPTX, XLSX
- HTML, TXT, Markdown
- Jupyter Notebooks (IPYNB)
- ZIP containing supported formats
- Tabular (CSV)

-----

**Options**

- **Extension override** (``-x``): hint for file type if not obvious
- **MIME type** (``-m``): manual MIME hint
- **Charset** (``-c``): text encoding hint
- **Keep data URIs**: retain base64-encoded images

Project: https://github.com/microsoft/markitdown
]]></help>
133+
134+
<citations>
135+
<citation type="bibtex">
136+
@misc{markitdown2024,
137+
author = {Microsoft},
138+
title = {markitdown: Convert documents to markdown},
139+
year = {2024},
140+
howpublished = {\url{https://github.com/microsoft/markitdown}}
141+
}
142+
</citation>
143+
</citations>
144+
</tool>

tools/markitdown/test-data/EAR.pdf

329 KB
Binary file not shown.
109 KB
Binary file not shown.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"print(\"Hello, world!\")"
10+
]
11+
}
12+
],
13+
"metadata": {
14+
"language_info": {
15+
"name": "python"
16+
}
17+
},
18+
"nbformat": 4,
19+
"nbformat_minor": 5
20+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is a plain text file.
2+

0 commit comments

Comments
 (0)