Skip to content

Commit 0834ed9

Browse files
authored
Merge pull request #129 from link-assistant/issue-117-6f9cc6105c70
Stop deleting Google Docs images on `--keep-original-links` (empty-alt strip)
2 parents b0a5a4e + 112f2f9 commit 0834ed9

9 files changed

Lines changed: 96 additions & 14 deletions

File tree

.gitkeep

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# .gitkeep file auto-generated at 2026-04-06T22:55:07.509Z for PR creation at branch issue-31-e38e38b91777 for issue https://github.com/link-assistant/web-capture/issues/31
22
# Updated: 2026-04-14T15:38:45.969Z
3-
# Updated: 2026-05-11T11:39:08.824Z
3+
# Updated: 2026-05-11T11:39:08.824Z
4+
# Updated: 2026-05-11T12:31:37.540Z
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@link-assistant/web-capture': patch
3+
---
4+
5+
Fix `stripBase64Images` (used by `--keep-original-links`) dropping every base64 image with empty alt text instead of leaving a visible placeholder. Google Docs HTML exports emit `<img alt="" src="data:image/png;base64,...">` for every image, so the previous behaviour silently deleted all images from the rendered markdown. Empty-alt now renders as `![]()` (a valid empty image reference); non-empty alt continues to render as `*[image: <alt>]*`.

js/src/extract-images.js

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,16 @@ export function extractBase64ToBuffers(markdown, imagesDir = 'images') {
9595
}
9696

9797
/**
98-
* Strip base64 data URI images from markdown, leaving only the alt text
99-
* as a placeholder. Used when keepOriginalLinks is enabled — base64 images
100-
* have no original URL to restore, so we remove the heavy data URI.
98+
* Strip base64 data URI images from markdown, leaving a visible placeholder.
99+
* Used when keepOriginalLinks is enabled — base64 images have no original URL
100+
* to restore, so we remove the heavy data URI but still leave a marker so the
101+
* reader can see that an image was here.
102+
*
103+
* Non-empty alt becomes `*[image: <alt>]*`. Empty alt — common for Google Docs
104+
* HTML exports, which emit `<img alt="" src="data:...">` for every image —
105+
* becomes `![]()`, an empty markdown image reference that keeps the image's
106+
* position visible. Emitting `''` for empty-alt would silently delete every
107+
* image in the document (see issue #117).
101108
*
102109
* @param {string} markdown - Markdown content with data URI images
103110
* @returns {{markdown: string, stripped: number}}
@@ -108,7 +115,7 @@ export function stripBase64Images(markdown) {
108115
BASE64_IMAGE_REGEX,
109116
(_match, altText) => {
110117
stripped++;
111-
return altText ? `*[image: ${altText}]*` : '';
118+
return altText ? `*[image: ${altText}]*` : '![]()';
112119
}
113120
);
114121
return { markdown: updatedMarkdown, stripped };

js/tests/unit/extract-images.test.js

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,18 @@ describe('extract-images module', () => {
204204
expect(result.markdown).not.toContain('data:image');
205205
});
206206

207-
it('strips base64 images with empty alt text', () => {
208-
const md = `![](data:image/png;base64,${TINY_PNG})`;
207+
it('leaves a visible placeholder when stripping a base64 image with empty alt', () => {
208+
// Google Docs HTML exports emit `<img alt="" src="data:image/png;base64,...">`,
209+
// which renders as `![](data:...)`. If stripping drops the markdown image
210+
// entirely, every image in the doc vanishes from the output with no
211+
// indication that anything was lost. Leave a visible placeholder so the
212+
// reader can see that an image was here.
213+
const md = `Hi.\n\n![](data:image/png;base64,${TINY_PNG})\n\nBye.\n`;
209214
const result = stripBase64Images(md);
210215

211216
expect(result.stripped).toBe(1);
212-
expect(result.markdown).toBe('');
217+
expect(result.markdown).not.toMatch(/data:image/);
218+
expect(result.markdown).toMatch(/!\[|\[image/);
213219
});
214220

215221
it('preserves remote image URLs', () => {

rust/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust/src/extract_images.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,15 +157,21 @@ pub struct ImageBuffer {
157157
pub data: Vec<u8>,
158158
}
159159

160-
/// Strip base64 data URI images from markdown, leaving alt text placeholders.
160+
/// Strip base64 data URI images from markdown, leaving a visible placeholder.
161+
///
162+
/// Non-empty alt becomes `*[image: <alt>]*`. Empty alt — common for Google
163+
/// Docs HTML exports, which emit `<img alt="" src="data:...">` for every
164+
/// image — becomes `![]()`, an empty markdown image reference that keeps the
165+
/// image's position visible. Emitting `""` for empty-alt would silently delete
166+
/// every image in the document (see issue #117).
161167
#[must_use]
162168
pub fn strip_base64_images(markdown: &str) -> StrippedResult {
163169
let mut stripped = 0;
164170
let updated = base64_md_image_pattern().replace_all(markdown, |caps: &regex::Captures<'_>| {
165171
stripped += 1;
166172
let alt_text = &caps[1];
167173
if alt_text.is_empty() {
168-
String::new()
174+
"![]()".to_string()
169175
} else {
170176
format!("*[image: {alt_text}]*")
171177
}

rust/tests/integration/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ mod html2md_br_in_list_item;
1212
mod html2md_ol_numbering;
1313
mod markdown_no_empty_title;
1414
mod paragraph_vs_line_break;
15+
mod strip_base64_empty_alt;
1516
mod themed_image;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
//! Regression test for #117 — `--keep-original-links` on Google Docs API
2+
//! exports stripped every image entirely because the HTML export emits
3+
//! `<img alt="" src="data:image/png;base64,...">`, the markdown converter
4+
//! rendered that as `![](data:...)`, and the strip helper only emitted a
5+
//! placeholder when `alt` was non-empty. The result was a silently
6+
//! image-less document with no indication that anything was lost.
7+
8+
use web_capture::extract_images::strip_base64_images;
9+
10+
// 1x1 red PNG pixel as base64.
11+
const TINY_PNG: &str = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==";
12+
13+
#[test]
14+
fn strip_keeps_a_visible_placeholder_for_empty_alt() {
15+
let md = format!("P1.\n\n![](data:image/png;base64,{TINY_PNG})\n\nP2.\n");
16+
let r = strip_base64_images(&md);
17+
assert_eq!(r.stripped, 1);
18+
assert!(!r.markdown.contains("data:image"));
19+
assert!(
20+
r.markdown.contains("![") || r.markdown.contains("[image"),
21+
"stripping must leave a visible placeholder; got:\n{}",
22+
r.markdown
23+
);
24+
}
25+
26+
#[test]
27+
fn strip_keeps_empty_alt_placeholder_distinct_from_non_empty_alt() {
28+
// Non-empty alt still produces the `*[image: ...]*` form so authors can
29+
// read the alt text. Empty alt produces a structural placeholder so a
30+
// human can still tell that an image was here.
31+
let md = format!(
32+
"![](data:image/png;base64,{TINY_PNG})\n\n![photo](data:image/png;base64,{TINY_PNG})\n"
33+
);
34+
let r = strip_base64_images(&md);
35+
assert_eq!(r.stripped, 2);
36+
assert!(!r.markdown.contains("data:image"));
37+
assert!(r.markdown.contains("*[image: photo]*"));
38+
// Empty-alt branch emits a literal `![]()` placeholder, so grepping for
39+
// `![` still finds the empty-alt image (non-empty alt uses `*[image: ...]*`).
40+
let placeholder_count = r.markdown.matches("![").count();
41+
assert!(
42+
placeholder_count >= 1,
43+
"expected at least one `![` placeholder for the empty-alt image; got:\n{}",
44+
r.markdown
45+
);
46+
}

rust/tests/unit/extract_images.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -206,13 +206,23 @@ fn test_strip_base64_images_with_alt() {
206206
}
207207

208208
#[test]
209-
fn test_strip_base64_images_empty_alt() {
210-
let md = format!("![](data:image/png;base64,{TINY_PNG})");
209+
fn test_strip_base64_images_empty_alt_leaves_visible_placeholder() {
210+
// Google Docs HTML exports emit `<img alt="" src="data:image/png;base64,...">`,
211+
// which renders as `![](data:...)`. If the stripper drops the markdown image
212+
// entirely, every image in the doc vanishes from the output with no
213+
// indication that anything was lost. Leave a visible placeholder so the
214+
// reader can see that an image was here.
215+
let md = format!("Hi.\n\n![](data:image/png;base64,{TINY_PNG})\n\nBye.\n");
211216

212217
let result = strip_base64_images(&md);
213218

214219
assert_eq!(result.stripped, 1);
215-
assert!(result.markdown.is_empty());
220+
assert!(!result.markdown.contains("data:image"));
221+
assert!(
222+
result.markdown.contains("![") || result.markdown.contains("[image"),
223+
"stripping must leave a visible placeholder; got:\n{}",
224+
result.markdown
225+
);
216226
}
217227

218228
#[test]

0 commit comments

Comments
 (0)