-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkreuzberg.js
More file actions
101 lines (93 loc) · 2.9 KB
/
Copy pathkreuzberg.js
File metadata and controls
101 lines (93 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
* Kreuzberg html-to-markdown integration module.
*
* Provides high-performance HTML to Markdown conversion using the
* @kreuzberg/html-to-markdown-node library (Rust-powered, 150-280 MB/s).
*
* This converter is available as an alternative to the default Turndown-based
* converter, selectable via the `converter=kreuzberg` query parameter.
*
* @module kreuzberg
* @see https://github.com/kreuzberg-dev/html-to-markdown
*/
// Module-level cache: the load promise (created at most once) and the resolved
// `convert` function, or null while not loaded / after a failed load.
let _initPromise = null;
let _convert = null;

/**
 * Lazily import the kreuzberg converter, at most once per process.
 *
 * The first call starts the dynamic import; calls made while it is in flight
 * share the same promise. Any failure during the import is treated as "not
 * available" and resolves to null rather than rejecting. The settled promise
 * is kept, so a failed load is not retried.
 *
 * @returns {Promise<Function|null>} The library's `convert` export, or null
 *   if the import failed
 */
async function ensureLoaded() {
  if (_convert) {
    return _convert;
  }

  if (!_initPromise) {
    _initPromise = (async () => {
      try {
        const { convert } = await import('@kreuzberg/html-to-markdown-node');
        _convert = convert;
        return convert;
      } catch {
        // Deliberate best-effort: the package is an optional dependency.
        _convert = null;
        return null;
      }
    })();
  }

  return _initPromise;
}
/**
 * Check if the kreuzberg converter is available.
 *
 * "Available" means the loader produced a callable converter. A load that
 * yields null (import failed) or undefined (module imported but exposes no
 * `convert` export) is reported as unavailable, which matches the falsy
 * check convertWithKreuzberg performs before converting.
 *
 * @returns {Promise<boolean>} Whether the converter is available
 */
export async function isKreuzbergAvailable() {
  const fn = await ensureLoaded();
  // `fn !== null` would wrongly report true for an undefined `convert` export.
  return typeof fn === 'function';
}
/**
 * Convert HTML to Markdown using the kreuzberg html-to-markdown library.
 *
 * Caller options are merged over this module's defaults. Options whose value
 * is `undefined` are ignored, so forwarding an absent setting (for example an
 * unset query parameter) keeps the default instead of erasing it.
 *
 * Returns a normalized result in which every field is always present.
 *
 * @param {string} html - HTML content to convert
 * @param {Object} [options] - Conversion options, passed through to the library
 * @param {string} [options.headingStyle='Atx'] - Heading style ('Atx' or 'Setext')
 * @param {string} [options.bulletListMarker] - Bullet character ('-', '*', '+')
 * @param {string} [options.codeBlockStyle='Backticks'] - Code block style
 *   (e.g. 'Backticks' or 'Indented'; accepted values are defined by the library)
 * @returns {Promise<Object>} Structured conversion result
 * @returns {string} result.content - The converted markdown content ('' if none)
 * @returns {Object|null} result.metadata - Metadata reported by the library,
 *   parsed from JSON when delivered as a string; null if absent or unparsable
 * @returns {Array} result.tables - Extracted table data ([] if none)
 * @returns {Array} result.images - Extracted image data ([] if none)
 * @returns {Array} result.warnings - Non-fatal conversion warnings ([] if none)
 * @throws {Error} If the kreuzberg converter is not available
 */
export async function convertWithKreuzberg(html, options = {}) {
  const convert = await ensureLoaded();
  if (!convert) {
    throw new Error(
      'Kreuzberg html-to-markdown is not installed. ' +
        'Run: npm install @kreuzberg/html-to-markdown-node'
    );
  }

  // A plain spread would let `{ headingStyle: undefined }` overwrite the
  // defaults below, so keys with undefined values are dropped first.
  // `options ?? {}` keeps an explicit `null` argument working as before.
  const definedOptions = Object.fromEntries(
    Object.entries(options ?? {}).filter(([, value]) => value !== undefined)
  );
  const convertOptions = {
    headingStyle: 'Atx',
    codeBlockStyle: 'Backticks',
    ...definedOptions,
  };

  const result = convert(html, convertOptions);

  // Metadata may arrive as a JSON string or as an already-parsed object.
  let metadata = null;
  if (result.metadata) {
    try {
      metadata =
        typeof result.metadata === 'string'
          ? JSON.parse(result.metadata)
          : result.metadata;
    } catch {
      // Best-effort: malformed metadata must not fail an otherwise
      // successful conversion, so it is reported as absent.
      metadata = null;
    }
  }

  return {
    content: result.content || '',
    metadata,
    tables: result.tables || [],
    images: result.images || [],
    warnings: result.warnings || [],
  };
}