Skip to content

Commit 10d0f9b

Browse files
authored
Implement ImageProcessingMode which dictates how images should be handled during conversion. (#216)
This commit provides a working example of how to dictate the way HtmlToOpenXml handles images referenced over HTTP/HTTPS. It also adds handling for data URIs, so that a mode exists in which only data URI images are embedded.

Example implementation (adapted from examples\Demo\Program.cs):

    string inputFile = "C:\\ConvertedHtml.html";
    string htmlContent = await File.ReadAllTextAsync(inputFile);
    string outputPath = "C:\\ConvertedHtml.docx";
    using WordprocessingDocument wordDoc = WordprocessingDocument.Create(outputPath, WordprocessingDocumentType.Document);
    MainDocumentPart mainPart = wordDoc.AddMainDocumentPart();
    mainPart.Document = new Document(new DocumentFormat.OpenXml.Wordprocessing.Body());
    HtmlConverter htmlConverter = new(mainPart, new DefaultWebRequest()) { ImageProcessing = ImageProcessingMode.LinkExternal };
    await htmlConverter.ParseBody(htmlContent);
    AssertThatOpenXmlDocumentIsValid(wordDoc);
    mainPart.Document.Save();
1 parent 3e3ccdd commit 10d0f9b

File tree

6 files changed

+334
-31
lines changed

6 files changed

+334
-31
lines changed

src/Html2OpenXml/Configuration enum.cs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,27 @@ public readonly struct QuoteChars(string begin, string end)
4646

4747
internal string Prefix { get; } = begin;
4848
internal string Suffix { get; } = end;
49-
}
49+
}
50+
51+
/// <summary>
52+
/// Specifies how images should be processed during HTML to OpenXML conversion.
53+
/// </summary>
54+
public enum ImageProcessingMode
55+
{
56+
/// <summary>
57+
/// Downloads and embeds all images into the document (default behaviour).
58+
/// This creates self-contained documents but may result in large file sizes.
59+
/// </summary>
60+
Embed = 0,
61+
/// <summary>
62+
/// Links to external images via external relationships instead of downloading them.
63+
/// This keeps document size small but images won't display offline or if URLs become unavailable.
64+
/// Data URI images (base64 encoded) are still embedded.
65+
/// </summary>
66+
LinkExternal = 1,
67+
/// <summary>
68+
/// Only embeds data URI images (base64 encoded inline images).
69+
/// External images (http/https/file) are skipped entirely.
70+
/// </summary>
71+
EmbedDataUriOnly = 2,
72+
}

src/Html2OpenXml/Expressions/Image/ImageExpression.cs

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,14 @@ class ImageExpression(IHtmlImageElement node) : ImageExpressionBase(node)
9494
preferredSize = ImageHeader.KeepAspectRatio(actualSize, preferredSize);
9595
}
9696

97+
// If size is still empty (e.g., for external linked images), use default dimensions
98+
if (preferredSize.IsEmpty || preferredSize.Width <= 0 || preferredSize.Height <= 0)
99+
{
100+
// Use default size for external images or when size cannot be determined
101+
// Default to a reasonable size (similar to how browsers handle images with unknown dimensions)
102+
preferredSize = new Size(300, 200);
103+
}
104+
97105
long widthInEmus = new Unit(UnitMetric.Pixel, preferredSize.Width).ValueInEmus;
98106
long heightInEmus = new Unit(UnitMetric.Pixel, preferredSize.Height).ValueInEmus;
99107

@@ -103,22 +111,26 @@ class ImageExpression(IHtmlImageElement node) : ImageExpressionBase(node)
103111
new wp.Extent() { Cx = widthInEmus, Cy = heightInEmus },
104112
new wp.EffectExtent() { LeftEdge = 19050L, TopEdge = 0L, RightEdge = 0L, BottomEdge = 0L },
105113
new wp.DocProperties() { Id = drawingObjId, Name = "Picture " + imageObjId, Description = string.Empty },
106-
new wp.NonVisualGraphicFrameDrawingProperties {
114+
new wp.NonVisualGraphicFrameDrawingProperties
115+
{
107116
GraphicFrameLocks = new a.GraphicFrameLocks() { NoChangeAspect = true }
108117
},
109118
new a.Graphic(
110119
new a.GraphicData(
111120
new pic.Picture(
112-
new pic.NonVisualPictureProperties {
113-
NonVisualDrawingProperties = new pic.NonVisualDrawingProperties() {
121+
new pic.NonVisualPictureProperties
122+
{
123+
NonVisualDrawingProperties = new pic.NonVisualDrawingProperties()
124+
{
114125
Id = imageObjId,
115126
Name = DataUri.IsWellFormed(src) ? string.Empty : src,
116-
Description = alt },
127+
Description = alt
128+
},
117129
NonVisualPictureDrawingProperties = new pic.NonVisualPictureDrawingProperties(
118130
new a.PictureLocks() { NoChangeAspect = true, NoChangeArrowheads = true })
119131
},
120132
new pic.BlipFill(
121-
new a.Blip() { Embed = iinfo.ImagePartId },
133+
CreateBlip(iinfo, src),
122134
new a.SourceRectangle(),
123135
new a.Stretch(
124136
new a.FillRectangle())),
@@ -128,10 +140,14 @@ class ImageExpression(IHtmlImageElement node) : ImageExpressionBase(node)
128140
new a.Extents() { Cx = widthInEmus, Cy = heightInEmus }),
129141
new a.PresetGeometry(
130142
new a.AdjustValueList()
131-
) { Preset = a.ShapeTypeValues.Rectangle }
132-
) { BlackWhiteMode = a.BlackWhiteModeValues.Auto })
133-
) { Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture" })
134-
) { DistanceFromTop = (UInt32Value) 0U, DistanceFromBottom = (UInt32Value) 0U, DistanceFromLeft = (UInt32Value) 0U, DistanceFromRight = (UInt32Value) 0U }
143+
)
144+
{ Preset = a.ShapeTypeValues.Rectangle }
145+
)
146+
{ BlackWhiteMode = a.BlackWhiteModeValues.Auto })
147+
)
148+
{ Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture" })
149+
)
150+
{ DistanceFromTop = (UInt32Value)0U, DistanceFromBottom = (UInt32Value)0U, DistanceFromLeft = (UInt32Value)0U, DistanceFromRight = (UInt32Value)0U }
135151
);
136152

137153
return img;
@@ -147,11 +163,28 @@ private static int GetDimension(HtmlAttributeCollection styles, string primarySt
147163

148164
if (unit.IsValid)
149165
{
150-
return unit.Type == UnitMetric.Percent?
166+
return unit.Type == UnitMetric.Percent ?
151167
(int)(unit.Value * percentageBase / 100) :
152168
unit.ValueInPx;
153169
}
154170

155171
return 0;
156172
}
157-
}
173+
174+
/// <summary>
175+
/// Creates a Blip element with either an embedded or external image reference.
176+
/// </summary>
177+
private static a.Blip CreateBlip(HtmlImageInfo iinfo, string? src)
178+
{
179+
if (iinfo.IsExternal)
180+
{
181+
// Use Link property for external images
182+
return new a.Blip() { Link = iinfo.ImagePartId };
183+
}
184+
else
185+
{
186+
// Use Embed property for embedded images (default behaviour)
187+
return new a.Blip() { Embed = iinfo.ImagePartId };
188+
}
189+
}
190+
}

src/Html2OpenXml/HtmlConverter.cs

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -58,20 +58,20 @@ public HtmlConverter(MainDocumentPart mainPart, IWebRequest? webRequester = null
5858
}
5959

6060
/// <summary>
61-
/// Parse some HTML content where the output is intented to be inserted in <see cref="MainDocumentPart"/>.
61+
/// Parse some HTML content where the output is intended to be inserted in <see cref="MainDocumentPart"/>.
6262
/// </summary>
6363
/// <param name="html">The HTML content to parse</param>
6464
/// <returns>Returns a list of parsed paragraph.</returns>
6565
public IList<OpenXmlCompositeElement> Parse(string html)
6666
{
67-
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester);
67+
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester, ImageProcessing);
6868
return ParseCoreAsync(html, mainPart, bodyImageLoader,
6969
new ParallelOptions() { CancellationToken = CancellationToken.None })
7070
.ConfigureAwait(false).GetAwaiter().GetResult().ToList();
7171
}
7272

7373
/// <summary>
74-
/// Start the asynchroneous parse processing where the output is intented to be inserted in <see cref="MainDocumentPart"/>.
74+
/// Start the asynchronous parse processing where the output is intended to be inserted in <see cref="MainDocumentPart"/>.
7575
/// </summary>
7676
/// <param name="html">The HTML content to parse</param>
7777
/// <param name="cancellationToken">The cancellation token.</param>
@@ -84,7 +84,7 @@ public Task<IEnumerable<OpenXmlCompositeElement>> Parse(string html, Cancellatio
8484
}
8585

8686
/// <summary>
87-
/// Start the asynchroneous parse processing where the output is intented to be inserted in <see cref="MainDocumentPart"/>.
87+
/// Start the asynchronous parse processing where the output is intended to be inserted in <see cref="MainDocumentPart"/>.
8888
/// </summary>
8989
/// <param name="html">The HTML content to parse</param>
9090
/// <param name="cancellationToken">The cancellation token.</param>
@@ -95,20 +95,20 @@ public Task<IEnumerable<OpenXmlCompositeElement>> ParseAsync(string html, Cancel
9595
}
9696

9797
/// <summary>
98-
/// Start the asynchroneous parse processing where the output is intented to be inserted in <see cref="MainDocumentPart"/>.
98+
/// Start the asynchronous parse processing where the output is intended to be inserted in <see cref="MainDocumentPart"/>.
9999
/// </summary>
100100
/// <param name="html">The HTML content to parse</param>
101101
/// <param name="parallelOptions">The configuration of parallelism while downloading the remote resources.</param>
102102
/// <returns>Returns a list of parsed paragraph.</returns>
103103
public Task<IEnumerable<OpenXmlCompositeElement>> ParseAsync(string html, ParallelOptions parallelOptions)
104104
{
105-
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester);
105+
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester, ImageProcessing);
106106

107107
return ParseCoreAsync(html, mainPart, bodyImageLoader, parallelOptions);
108108
}
109109

110110
/// <summary>
111-
/// Parse asynchroneously the Html and append the output into the Header of the document.
111+
/// Parse asynchronously the Html and append the output into the Header of the document.
112112
/// </summary>
113113
/// <param name="html">The HTML content to parse</param>
114114
/// <param name="headerType">Determines the page(s) on which the current header shall be displayed.
@@ -122,7 +122,7 @@ public async Task ParseHeader(string html, HeaderFooterValues? headerType = null
122122
var headerPart = ResolveHeaderFooterPart<HeaderReference, HeaderPart>(headerType);
123123

124124
headerPart.Header ??= new();
125-
headerImageLoader ??= new ImagePrefetcher<HeaderPart>(headerPart, webRequester);
125+
headerImageLoader ??= new ImagePrefetcher<HeaderPart>(headerPart, webRequester, ImageProcessing);
126126

127127
var paragraphs = await ParseCoreAsync(html, headerPart, headerImageLoader,
128128
new ParallelOptions() { CancellationToken = cancellationToken },
@@ -133,7 +133,7 @@ public async Task ParseHeader(string html, HeaderFooterValues? headerType = null
133133
}
134134

135135
/// <summary>
136-
/// Parse asynchroneously the Html and append the output into the Footer of the document.
136+
/// Parse asynchronously the Html and append the output into the Footer of the document.
137137
/// </summary>
138138
/// <param name="html">The HTML content to parse</param>
139139
/// <param name="footerType">Determines the page(s) on which the current footer shall be displayed.
@@ -147,7 +147,7 @@ public async Task ParseFooter(string html, HeaderFooterValues? footerType = null
147147
var footerPart = ResolveHeaderFooterPart<FooterReference, FooterPart>(footerType);
148148

149149
footerPart.Footer ??= new();
150-
footerImageLoader ??= new ImagePrefetcher<FooterPart>(footerPart, webRequester);
150+
footerImageLoader ??= new ImagePrefetcher<FooterPart>(footerPart, webRequester, ImageProcessing);
151151

152152
var paragraphs = await ParseCoreAsync(html, footerPart, footerImageLoader,
153153
new ParallelOptions() { CancellationToken = cancellationToken },
@@ -158,14 +158,14 @@ public async Task ParseFooter(string html, HeaderFooterValues? footerType = null
158158
}
159159

160160
/// <summary>
161-
/// Parse asynchroneously the Html and append the output into the Body of the document.
161+
/// Parse asynchronously the Html and append the output into the Body of the document.
162162
/// </summary>
163163
/// <param name="html">The HTML content to parse</param>
164164
/// <param name="cancellationToken">The cancellation token.</param>
165165
/// <seealso cref="MainDocumentPart"/>
166166
public async Task ParseBody(string html, CancellationToken cancellationToken = default)
167167
{
168-
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester);
168+
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester, ImageProcessing);
169169
var paragraphs = await ParseCoreAsync(html, mainPart, bodyImageLoader,
170170
new ParallelOptions() { CancellationToken = cancellationToken },
171171
htmlStyles.GetParagraphStyle(htmlStyles.DefaultStyles.Paragraph))
@@ -201,7 +201,7 @@ public async Task ParseBody(string html, CancellationToken cancellationToken = d
201201
}
202202

203203
/// <summary>
204-
/// Start the asynchroneous parse processing. Use this overload if you want to control the downloading of images.
204+
/// Start the asynchronous parse processing. Use this overload if you want to control the downloading of images.
205205
/// </summary>
206206
/// <param name="html">The HTML content to parse</param>
207207
/// <param name="parallelOptions">The configuration of parallelism while downloading the remote resources.</param>
@@ -210,13 +210,13 @@ public async Task ParseBody(string html, CancellationToken cancellationToken = d
210210
[System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage]
211211
public Task<IEnumerable<OpenXmlCompositeElement>> Parse(string html, ParallelOptions parallelOptions)
212212
{
213-
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester);
213+
bodyImageLoader ??= new ImagePrefetcher<MainDocumentPart>(mainPart, webRequester, ImageProcessing);
214214

215215
return ParseCoreAsync(html, mainPart, bodyImageLoader, parallelOptions);
216216
}
217217

218218
/// <summary>
219-
/// Start the asynchroneous parse processing and append the output into the Body of the document.
219+
/// Start the asynchronous parse processing and append the output into the Body of the document.
220220
/// </summary>
221221
/// <param name="html">The HTML content to parse</param>
222222
/// <param name="cancellationToken">The cancellation token.</param>
@@ -236,7 +236,7 @@ public void RefreshStyles()
236236
}
237237

238238
/// <summary>
239-
/// Start the asynchroneous parse processing. Use this overload if you want to control the downloading of images.
239+
/// Start the asynchronous parse processing. Use this overload if you want to control the downloading of images.
240240
/// </summary>
241241
/// <param name="html">The HTML content to parse</param>
242242
/// <param name="hostingPart">The OpenXml container where the content will be inserted into.</param>
@@ -394,6 +394,26 @@ public WordDocumentStyle HtmlStyles
394394
/// <remarks>The table will contains only one cell.</remarks>
395395
public bool RenderPreAsTable { get; set; }
396396

397+
/// <summary>
398+
/// Gets or sets how images should be processed during conversion.
399+
/// </summary>
400+
/// <remarks>
401+
/// <para>
402+
/// Use <see cref="ImageProcessingMode.Embed"/> (default) to download and embed all images,
403+
/// creating self-contained documents but potentially large file sizes.
404+
/// </para>
405+
/// <para>
406+
/// Use <see cref="ImageProcessingMode.LinkExternal"/> to link to external images via relationships,
407+
/// keeping document size small but requiring internet access to view images.
408+
/// Data URI images (base64 encoded) are still embedded.
409+
/// </para>
410+
/// <para>
411+
/// Use <see cref="ImageProcessingMode.EmbedDataUriOnly"/> to only embed data URI images
412+
/// and skip external images entirely.
413+
/// </para>
414+
/// </remarks>
415+
public ImageProcessingMode ImageProcessing { get; set; } = ImageProcessingMode.Embed;
416+
397417
/// <summary>
398418
/// Defines whether ordered lists (<c>ol</c>) continue incrementing existing numbering
399419
/// or restarts to 1 (defaults continues numbering).

src/Html2OpenXml/IO/ImagePrefetcher.cs

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ sealed class ImagePrefetcher<T> : IImageLoader
5353
private readonly IWebRequest resourceLoader;
5454
private readonly HtmlImageInfoCollection prefetchedImages;
5555
private readonly object lockObject = new object();
56+
private readonly ImageProcessingMode processingMode;
5657

5758

5859
/// <summary>
@@ -61,10 +62,12 @@ sealed class ImagePrefetcher<T> : IImageLoader
6162
/// <param name="hostingPart">The image will be linked to that hosting part.
6263
/// Images are not shared between header, footer and body.</param>
6364
/// <param name="resourceLoader">Service to resolve an image.</param>
64-
public ImagePrefetcher(T hostingPart, IWebRequest resourceLoader)
65+
/// <param name="processingMode">Specifies how images should be processed (embed, link, or data URI only).</param>
66+
public ImagePrefetcher(T hostingPart, IWebRequest resourceLoader, ImageProcessingMode processingMode = ImageProcessingMode.Embed)
6567
{
6668
this.hostingPart = hostingPart;
6769
this.resourceLoader = resourceLoader;
70+
this.processingMode = processingMode;
6871
this.prefetchedImages = new HtmlImageInfoCollection();
6972
}
7073

@@ -91,7 +94,22 @@ public ImagePrefetcher(T hostingPart, IWebRequest resourceLoader)
9194
}
9295
else
9396
{
94-
iinfo = await DownloadRemoteImage(imageUri, cancellationToken).ConfigureAwait(false);
97+
// Handle external images based on processing mode
98+
if (processingMode == ImageProcessingMode.EmbedDataUriOnly)
99+
{
100+
// Skip external images entirely
101+
return null;
102+
}
103+
else if (processingMode == ImageProcessingMode.LinkExternal)
104+
{
105+
// Create external link without downloading
106+
iinfo = CreateExternalImageLink(imageUri);
107+
}
108+
else
109+
{
110+
// Default: Download and embed
111+
iinfo = await DownloadRemoteImage(imageUri, cancellationToken).ConfigureAwait(false);
112+
}
95113
}
96114

97115
// Add to cache using thread-safe operation
@@ -168,6 +186,48 @@ public ImagePrefetcher(T hostingPart, IWebRequest resourceLoader)
168186
}
169187
}
170188

189+
/// <summary>
190+
/// Create an external relationship to an image without downloading it.
191+
/// </summary>
192+
private HtmlImageInfo? CreateExternalImageLink(string src)
193+
{
194+
Uri imageUri = new Uri(src, UriKind.RelativeOrAbsolute);
195+
196+
// Resolve relative URIs if possible (only for DefaultWebRequest which has BaseImageUrl)
197+
if (!imageUri.IsAbsoluteUri && resourceLoader is DefaultWebRequest defaultWebRequest
198+
&& defaultWebRequest.BaseImageUrl != null)
199+
{
200+
string url1 = defaultWebRequest.BaseImageUrl.AbsoluteUri.TrimEnd('/', '\\');
201+
string path = src.TrimStart('/', '\\');
202+
imageUri = new Uri(string.Format("{0}/{1}", url1, path), UriKind.Absolute);
203+
}
204+
205+
// Only create external links for absolute URIs with supported protocols
206+
if (!imageUri.IsAbsoluteUri || !resourceLoader.SupportsProtocol(imageUri.Scheme))
207+
return null;
208+
209+
// Generate a unique GUID-based relationship ID for the external relationship
210+
string relationshipId = "imgext_" + Guid.NewGuid().ToString("N");
211+
212+
// Create external relationship
213+
lock (lockObject)
214+
{
215+
hostingPart.AddExternalRelationship(
216+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
217+
imageUri,
218+
relationshipId);
219+
}
220+
221+
// Return image info with external flag set
222+
// Note: Size will be empty as we don't download the image
223+
return new HtmlImageInfo(src, relationshipId)
224+
{
225+
IsExternal = true,
226+
Size = Size.Empty,
227+
TypeInfo = ImagePartType.Png // Default type, actual type doesn't matter for external links
228+
};
229+
}
230+
171231
/// <summary>
172232
/// Parse the Data inline image.
173233
/// </summary>

src/Html2OpenXml/Primitives/HtmlImageInfo.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ sealed class HtmlImageInfo(string source, string partId)
3737
/// Gets the content type of the image.
3838
/// </summary>
3939
public PartTypeInfo TypeInfo { get; set; }
40+
41+
/// <summary>
42+
/// Gets or sets whether this image is linked externally rather than embedded.
43+
/// When true, <see cref="ImagePartId"/> contains an external relationship ID instead of an embedded image part ID.
44+
/// </summary>
45+
public bool IsExternal { get; set; }
4046
}
4147

4248
/// <summary>

0 commit comments

Comments
 (0)