Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
b5d3935
fix: PhrasingElementExpression.cs - check empty string in text element
Jan 20, 2025
7e963ff
Merge pull request #186 from galyni/check-empty-text
onizet Jan 20, 2025
41e09d8
Unit test for PR #186
onizet Jan 20, 2025
42d5cfe
Use FrozenDictionary to improve searching for a known color
onizet Jan 20, 2025
2c4d0e5
Forgot a couple of ConfigureAwait(false)
onizet Jan 20, 2025
c17f8ad
Improve support of table alignment #187
onizet Jan 28, 2025
1bc74ee
Heading with digits only should not be considered as a numbering #189
onizet Jan 28, 2025
b290bf1
Improve ImageHeader and guessing file type. Move img unit testing to …
onizet Feb 24, 2025
6e02b55
Fix nested table breaking the Word validation #190
onizet Feb 27, 2025
f14f1d0
Improved collection check. Tips for future: do not blindly trust CoPi…
onizet Feb 27, 2025
da3dcb6
Revisit whitespace handling when parsing #185 #179
onizet Mar 17, 2025
0e544fe
Update vscode settings
onizet Mar 17, 2025
d3606b9
Update changelog
onizet Mar 17, 2025
5ba83c9
Support percentage size for image #188
onizet Mar 23, 2025
586efde
Update changelog
onizet Mar 23, 2025
d413895
Defensive code to prevent a crash #191
onizet Mar 27, 2025
44747db
Fix another crash on whitespace #191
onizet Mar 31, 2025
56941ca
Table inside a list must be aligned #192
onizet Mar 31, 2025
905abbd
Update changelog and version
onizet Mar 31, 2025
af87dd0
Correct regex and add emf data uri support
Tranquilite0 Apr 4, 2025
e951813
Fix crash when the html contains 2 images with identical source path …
onizet Apr 4, 2025
7bd128b
Merge branch 'onizet:dev' into embedded-image-fix
Tranquilite0 Apr 4, 2025
114a5d4
Update ImagePrefetcher.cs
Tranquilite0 Apr 6, 2025
71ca1b7
Merge pull request #196 from Tranquilite0/embedded-image-fix
onizet Apr 7, 2025
bfaad95
Fix handling of whitespace between runs #195
onizet Apr 9, 2025
e809010
Defensive code when downloaded image stream is broken #201
onizet Apr 9, 2025
90bf035
Constraint table within a numbering list to not exceed page margin #202
onizet Apr 10, 2025
e46adfb
Fix <figcaption> Conversion (#197)
Tranquilite0 Apr 10, 2025
d55a00d
Fix table borders being removed (#199)
Tranquilite0 Apr 10, 2025
9c1fe98
Mock HttpClient to avoid some proxy errors during running the pipeline
onizet Apr 10, 2025
1228eb4
Yet another exception during parsing of whitespaces #191
onizet Apr 14, 2025
1fd6109
Fix the unit testing after new changes from #197 and #199
onizet Apr 15, 2025
1a14761
Fix nested lists and allow list style attribute (#198)
Tranquilite0 Apr 15, 2025
462e507
Table now supports width:auto for auto-fit content #202
onizet Apr 18, 2025
9f4cdbf
Support margin auto for table alignment #194
onizet Apr 24, 2025
d4174a6
Prepare next version
onizet Apr 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"omnisharp.organizeImportsOnFormat": true,
"dotnet.completion.showCompletionItemsFromUnimportedNamespaces": false,
"coverage-gutters.coverageFileNames":[
"coverage.info"
],
"coverage-gutters.showGutterCoverage": false,
"coverage-gutters.showLineCoverage": true
"coverage-gutters.showLineCoverage": true,
"dotnet.formatting.organizeImportsOnFormat": true
}
29 changes: 29 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
# Changelog

## 3.2.5

- Fix a crash with the new whitespace handling introduced in 3.2.3 #191
- Fix crash when the html contains 2 images with identical source path #193
- Support margin auto for table alignment #194
- Fix handling whitespace between runs #195
- Whitelist more mime-types as specified by the IANA standard #196
- Support EMF file #196
- Correct handling of `figcaption` (allow nested phrasings) #197
- Numbering list now supports type attribute `<ol type="1|a|A|i|I">` #198
- Always restart nested numbering list #198
- Fix table borders being removed even when the specified word table style has borders #199
- Defensive code when the downloaded image stream is truncated #201
- Table inside list is constrained to not exceed page margin #202
- Table now supports width:auto for auto-fit content #202

## 3.2.4

- Fix a crash with the new whitespace handling introduced in 3.2.3 #191
- Table inside list must be aligned with the list item #192

## 3.2.3

- Improve support of table alignment #187
- Fix a crash if a span is empty
- Heading with only digits should not be considered as a numbering #189
- Fix whitespaces inserted between spans #179 and #185
- Support percentage size (typically width:100%) for img node #188

## 3.2.2

- Supports a feature to disable heading numbering #175
Expand Down
1 change: 1 addition & 0 deletions examples/Demo/Demo.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

<ItemGroup>
<EmbeddedResource Include="Resources\*" />
<Content Include="images\*" CopyToOutputDirectory="PreserveNewest"/>
</ItemGroup>

</Project>
1 change: 1 addition & 0 deletions examples/Demo/Resources/LargeImg.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img src="The-Song-of-the-World.jpg" style="max-width:100%"/>
13 changes: 0 additions & 13 deletions examples/Demo/app.config

This file was deleted.

Binary file added examples/Demo/images/The-Song-of-the-World.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
52 changes: 39 additions & 13 deletions src/Html2OpenXml/Expressions/BlockElementExpression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class BlockElementExpression: PhrasingElementExpression
{
private readonly OpenXmlLeafElement[]? defaultStyleProperties;
protected readonly ParagraphProperties paraProperties = new();
protected TableProperties? tableProperties;
// some style attributes, such as borders or bgcolor, will convert this node to a framed container
protected bool renderAsFramed;
private HtmlBorder styleBorder;
Expand Down Expand Up @@ -115,22 +116,44 @@ protected override IEnumerable<OpenXmlElement> Interpret (
public override void CascadeStyles(OpenXmlElement element)
{
base.CascadeStyles(element);
if (!paraProperties.HasChildren || element is not Paragraph paragraph)
if (!paraProperties.HasChildren)
return;

paragraph.ParagraphProperties ??= new ParagraphProperties();

var knownTags = new HashSet<string>();
foreach (var prop in paragraph.ParagraphProperties)
if (element is Paragraph paragraph)
{
if (!knownTags.Contains(prop.LocalName))
knownTags.Add(prop.LocalName);
}
paragraph.ParagraphProperties ??= new ParagraphProperties();

foreach (var prop in paraProperties)
var knownTags = new HashSet<string>();
foreach (var prop in paragraph.ParagraphProperties)
{
if (!knownTags.Contains(prop.LocalName))
knownTags.Add(prop.LocalName);
}

foreach (var prop in paraProperties)
{
if (!knownTags.Contains(prop.LocalName))
paragraph.ParagraphProperties.AddChild(prop.CloneNode(true));
}
}
else if (tableProperties != null && element is Table table)
{
if (!knownTags.Contains(prop.LocalName))
paragraph.ParagraphProperties.AddChild(prop.CloneNode(true));
var props = table.GetFirstChild<TableProperties>();
if (props is null)
return;

var knownTags = new HashSet<string>();
foreach (var prop in props)
{
if (!knownTags.Contains(prop.LocalName))
knownTags.Add(prop.LocalName);
}

foreach (var prop in tableProperties)
{
if (!knownTags.Contains(prop.LocalName))
props.AddChild(prop.CloneNode(true));
}
}
}

Expand Down Expand Up @@ -170,9 +193,12 @@ protected override void ComposeStyles (ParsingContext context)

JustificationValues? align = Converter.ToParagraphAlign(styleAttributes!["text-align"]);
if (!align.HasValue) align = Converter.ToParagraphAlign(node.GetAttribute("align"));
if (!align.HasValue) align = Converter.ToParagraphAlign(styleAttributes["justify-content"]);
if (align.HasValue)
{
paraProperties.Justification = new() { Val = align };
tableProperties ??= new();
tableProperties.TableJustification = new() { Val = align.Value.ToTableRowAlignment() };
}


Expand All @@ -194,7 +220,7 @@ protected override void ComposeStyles (ParsingContext context)
}

var margin = styleAttributes.GetMargin("margin");
Indentation? indentation = null;
Indentation? indentation = null;
if (!margin.IsEmpty)
{
if (margin.Top.IsFixed || margin.Bottom.IsFixed)
Expand Down Expand Up @@ -345,7 +371,7 @@ private static Paragraph CreateParagraph(ParsingContext context, IList<OpenXmlEl

context.CascadeStyles(p);

p.Append(CombineRuns(runs));
p.Append(runs);

// in Html, if a paragraph is ending with a line break, it is ignored
if (p.LastChild is Run run && run.LastChild is Break lineBreak)
Expand Down
206 changes: 125 additions & 81 deletions src/Html2OpenXml/Expressions/FigureCaptionExpression.cs
Original file line number Diff line number Diff line change
@@ -1,81 +1,125 @@
/* Copyright (C) Olivier Nizet https://github.com/onizet/html2openxml - All Rights Reserved
*
* This source is subject to the Microsoft Permissive License.
* Please see the License.txt file for more information.
* All other rights reserved.
*
* THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
* KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
* PARTICULAR PURPOSE.
*/
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using AngleSharp.Html.Dom;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Wordprocessing;

namespace HtmlToOpenXml.Expressions;

/// <summary>
/// Process the parsing of a <c>figcaption</c> element, which is used to describe an image.
/// </summary>
sealed class FigureCaptionExpression(IHtmlElement node) : PhrasingElementExpression(node)
{

    /// <summary>
    /// Builds a single caption paragraph made of the literal "Figure ", a <c>SEQ</c> field
    /// holding the figure number, and the interpreted content of the <c>figcaption</c> node.
    /// </summary>
    /// <param name="context">The current parsing context.</param>
    /// <returns>
    /// An empty sequence when the node produces no content; otherwise one caption paragraph.
    /// </returns>
    public override IEnumerable<OpenXmlElement> Interpret (ParsingContext context)
    {
        ComposeStyles(context);

        // Materialize once: the sequence is inspected several times below and its
        // elements are mutated, which is unsafe on a lazily evaluated enumerable.
        var childElements = Interpret(context.CreateChild(this), node.ChildNodes).ToList();
        if (childElements.Count == 0)
            return [];

        var p = new Paragraph (
            new Run(
                new Text("Figure ") { Space = SpaceProcessingModeValues.Preserve }
            ),
            new SimpleField(
                new Run(
                    new Text(AddFigureCaption(context).ToString(CultureInfo.InvariantCulture)))
            ) { Instruction = " SEQ Figure \\* ARABIC " }
        ) {
            ParagraphProperties = new ParagraphProperties {
                ParagraphStyleId = context.DocumentStyle.GetParagraphStyle(context.DocumentStyle.DefaultStyles.CaptionStyle),
                KeepNext = new KeepNext()
            }
        };

        if (childElements[0] is Run run) // any caption?
        {
            Text? t = run.GetFirstChild<Text>();
            if (t != null)
            {
                // Prepend a space so the caption text is separated from the figure number.
                t.Text = " " + t.InnerText;
                // Leading whitespace is only kept when the text is flagged as preserved.
                t.Space = SpaceProcessingModeValues.Preserve;
            }
        }

        // Attach the caption content; without this the interpreted text is dropped.
        // NOTE(review): assumes the children are inline (run-level) elements, as expected
        // from the phrasing base class - confirm that no block element can reach here.
        p.Append(childElements);

        return [p];
    }

    /// <summary>
    /// Add a new figure caption to the document.
    /// </summary>
    /// <param name="context">The parsing context that stores the running caption counter.</param>
    /// <returns>Returns the id of the new figure caption.</returns>
    private static int AddFigureCaption(ParsingContext context)
    {
        var figCaptionRef = context.Properties<int?>("figCaptionRef");
        if (!figCaptionRef.HasValue)
        {
            // First caption of this run: start after the figure fields already in the document.
            figCaptionRef = 0;
            foreach (var p in context.MainPart.Document.Descendants<SimpleField>())
            {
                if (p.Instruction == " SEQ Figure \\* ARABIC ")
                    figCaptionRef++;
            }
        }
        figCaptionRef++;

        context.Properties("figCaptionRef", figCaptionRef);
        return figCaptionRef.Value;
    }
}
/* Copyright (C) Olivier Nizet https://github.com/onizet/html2openxml - All Rights Reserved
*
* This source is subject to the Microsoft Permissive License.
* Please see the License.txt file for more information.
* All other rights reserved.
*
* THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
* KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
* PARTICULAR PURPOSE.
*/
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using AngleSharp.Html.Dom;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Wordprocessing;

namespace HtmlToOpenXml.Expressions;

/// <summary>
/// Process the parsing of a <c>figcaption</c> element, which is used to describe an image.
/// </summary>
sealed class FigureCaptionExpression(IHtmlElement node) : BlockElementExpression(node)
{
    /// <summary>Field instruction identifying a figure sequence number.</summary>
    private const string FigureSequenceInstruction = " SEQ Figure \\* ARABIC ";

    /// <summary>
    /// Interprets the <c>figcaption</c> content and prefixes it with "Figure " and the
    /// figure sequence field. Every resulting paragraph receives the caption style (unless
    /// it already has a style) and is chained to the next one with <see cref="KeepNext"/>.
    /// </summary>
    /// <param name="context">The current parsing context.</param>
    /// <returns>
    /// The interpreted elements, or a single paragraph holding only the figure number
    /// when the node produces no content.
    /// </returns>
    public override IEnumerable<OpenXmlElement> Interpret (ParsingContext context)
    {
        ComposeStyles(context);

        // Materialize once: the result is inspected and mutated several times below, and
        // re-enumerating a lazily evaluated sequence could yield different element instances.
        var childElements = Interpret(context.CreateChild(this), node.ChildNodes).ToList();

        var figNumRef = new List<OpenXmlElement>() {
            new Run(
                new Text("Figure ") { Space = SpaceProcessingModeValues.Preserve }
            ),
            new SimpleField(
                new Run(
                    new Text(AddFigureCaption(context).ToString(CultureInfo.InvariantCulture)))
            ) { Instruction = FigureSequenceInstruction }
        };

        if (childElements.Count == 0)
        {
            return [new Paragraph(figNumRef) {
                ParagraphProperties = new ParagraphProperties {
                    ParagraphStyleId = context.DocumentStyle.GetParagraphStyle(context.DocumentStyle.DefaultStyles.CaptionStyle),
                    KeepNext = DetermineKeepNext(node),
                }
            }];
        }

        // Add the figure number references to the start of the first paragraph.
        if (childElements[0] is Paragraph p)
        {
            // Each insertion targets the same anchor, so insert in reverse reading order:
            // the result reads "Figure ", number field, separating space, caption content.
            var properties = p.GetFirstChild<ParagraphProperties>();
            p.InsertAfter(new Run(
                new Text(" ") { Space = SpaceProcessingModeValues.Preserve }
            ), properties);
            p.InsertAfter(figNumRef[1], properties);
            p.InsertAfter(figNumRef[0], properties);
        }
        else
        {
            // The first child of the figure caption is a table or something.
            // Just prepend a new paragraph with the figure number reference.
            childElements.Insert(0, new Paragraph(figNumRef));
        }

        Paragraph? lastPara = null;
        foreach (var paragraph in childElements.OfType<Paragraph>())
        {
            paragraph.ParagraphProperties ??= new ParagraphProperties();
            paragraph.ParagraphProperties.ParagraphStyleId ??= context.DocumentStyle.GetParagraphStyle(context.DocumentStyle.DefaultStyles.CaptionStyle);
            // Keep caption paragraphs together.
            paragraph.ParagraphProperties.KeepNext = new KeepNext();
            lastPara = paragraph;
        }

        // The last caption paragraph is only kept with what follows when something follows.
        if (lastPara is not null)
        {
            lastPara.ParagraphProperties!.KeepNext = DetermineKeepNext(node);
        }

        return childElements;
    }

    /// <summary>
    /// Add a new figure caption to the document.
    /// </summary>
    /// <param name="context">The parsing context that stores the running caption counter.</param>
    /// <returns>Returns the id of the new figure caption.</returns>
    private static int AddFigureCaption(ParsingContext context)
    {
        var figCaptionRef = context.Properties<int?>("figCaptionRef");
        if (!figCaptionRef.HasValue)
        {
            // First caption of this run: start after the figure fields already in the document.
            figCaptionRef = 0;
            foreach (var p in context.MainPart.Document.Descendants<SimpleField>())
            {
                if (p.Instruction == FigureSequenceInstruction)
                    figCaptionRef++;
            }
        }
        figCaptionRef++;

        context.Properties("figCaptionRef", figCaptionRef);
        return figCaptionRef.Value;
    }

    /// <summary>
    /// Determines whether the KeepNext property should apply to this caption.
    /// </summary>
    /// <param name="node">The <c>figcaption</c> element being converted.</param>
    /// <returns>A new <see cref="KeepNext"/> or null.</returns>
    private static KeepNext? DetermineKeepNext(IHtmlElement node)
    {
        // A caption at the end of a figure will have no next sibling.
        return node.NextElementSibling is null ? null : new KeepNext();
    }
}
Loading