Skip to content

Commit ba40afe

Browse files
authored
fix: use marker-width indent for ordered-list continuation (#80)
1 parent 2a7baf5 commit ba40afe

8 files changed

Lines changed: 313 additions & 175 deletions

File tree

crates/core/src/convert.rs

Lines changed: 85 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ static HEADING_PREFIXES: [&str; 6] = ["# ", "## ", "### ", "#### ", "##### ", "#
1818

1919
/// Pre-computed blockquote prefixes for depths 1-6 (avoids `"> ".repeat()`)
2020
static BQ_PREFIXES: [&str; 7] = ["", "> ", "> > ", "> > > ", "> > > > ", "> > > > > ", "> > > > > > "];
21-
/// Pre-computed unordered list item prefixes for indent depths 0-5
22-
static UL_PREFIXES: [&str; 6] = ["- ", "  - ", "    - ", "      - ", "        - ", "          - "];
2321

2422
// Clean mode bitmask flags
2523
const CLEAN_EMPTY_LINKS: u8 = 1;
@@ -247,6 +245,16 @@ pub struct ConvertState {
247245
in_heading: bool,
248246
/// Buffer position at heading start (for extracting heading text)
249247
heading_buffer_start: usize,
248+
249+
/// Cumulative indent string for list-item continuation content. Grows by
250+
/// each ancestor `<li>`'s marker width (`"- "` = 2, `"N. "` = digits(N)+2),
251+
/// so code blocks, paragraphs, and nested blocks inside a list item land
252+
/// in the content column that CommonMark requires. Pushed on `<li>` enter,
253+
/// popped on `<li>` close.
254+
list_indent: String,
255+
/// Per-`<li>` contribution width stack, parallel to `list_indent`. Used to
256+
/// truncate the correct number of bytes on close without re-walking ancestors.
257+
list_indent_widths: Vec<u8>,
250258
}
251259

252260
impl ConvertState {
@@ -322,6 +330,9 @@ impl ConvertState {
322330
fragment_links: Vec::new(),
323331
in_heading: false,
324332
heading_buffer_start: 0,
333+
334+
list_indent: String::new(),
335+
list_indent_widths: Vec::with_capacity(8),
325336
};
326337
// Resolve clean config into bitmask
327338
let effective_clean_urls;
@@ -968,36 +979,33 @@ impl ConvertState {
968979
return;
969980
}
970981

971-
// Indent code block content inside a list item so the fenced block stays
972-
// within the list item's content column. Skip indent for blank lines and
973-
// lines that already begin with whitespace — preserves the original
974-
// indentation structure from the HTML and matches the JS engine.
982+
// Indent code block content inside a list item so every line starts at
983+
// the list item's content column. CommonMark closes the list item when
984+
// a line is indented less than that column, so we prepend list_indent
985+
// on top of any existing in-source indentation. Blank lines are left
986+
// alone so they stay blank.
975987
let li_depth = self.depth_map[TAG_LI as usize] as usize;
976988
let indented_storage;
977989
let text = if self.depth_map[TAG_PRE as usize] > 0 && li_depth > 0
978990
&& (text.contains('\n') || last_char == b'\n') {
979-
let indent = " ".repeat(li_depth);
991+
let indent = self.list_indent.as_str();
980992
let mut out = String::with_capacity(text.len() + indent.len() * 2);
981993
let bytes = text.as_bytes();
982994
// Prepend indent for the first line when the buffer ended with a
983-
// newline (code fence opener) and this text doesn't already start
984-
// with leading whitespace.
995+
// newline (code fence opener). Blank first line stays blank.
985996
if last_char == b'\n' {
986997
let first = bytes.first().copied().unwrap_or(0);
987-
if first != b' ' && first != b'\t' && first != b'\n' {
988-
out.push_str(&indent);
998+
if first != b'\n' && first != 0 {
999+
out.push_str(indent);
9891000
}
9901001
}
9911002
let mut prev = 0usize;
9921003
for (i, &b) in bytes.iter().enumerate() {
9931004
if b == b'\n' {
9941005
out.push_str(&text[prev..=i]);
9951006
let next = i + 1;
996-
if next < bytes.len() {
997-
let c = bytes[next];
998-
if c != b' ' && c != b'\t' && c != b'\n' {
999-
out.push_str(&indent);
1000-
}
1007+
if next < bytes.len() && bytes[next] != b'\n' {
1008+
out.push_str(indent);
10011009
}
10021010
prev = next;
10031011
}
@@ -1104,8 +1112,7 @@ impl ConvertState {
11041112
"> ".repeat(depth)
11051113
};
11061114
if self.depth_map[TAG_LI as usize] > 0 {
1107-
let indent = " ".repeat(self.depth_map[TAG_LI as usize] as usize);
1108-
prefix = format!("\n{indent}{prefix}");
1115+
prefix = format!("\n{}{}", self.list_indent, prefix);
11091116
}
11101117
Some(Cow::Owned(prefix))
11111118
}
@@ -1115,14 +1122,14 @@ impl ConvertState {
11151122
let lang = Self::get_language_from_class(node.attributes.get("class"));
11161123
let li_depth = self.depth_map[TAG_LI as usize] as usize;
11171124
if li_depth > 0 {
1118-
let indent = " ".repeat(li_depth);
1125+
let indent = self.list_indent.as_str();
11191126
let mut s = String::with_capacity(2 + indent.len() * 2 + 4 + lang.len() + 1);
11201127
s.push_str("\n\n");
1121-
s.push_str(&indent);
1128+
s.push_str(indent);
11221129
s.push_str("```");
11231130
s.push_str(lang);
11241131
s.push('\n');
1125-
s.push_str(&indent);
1132+
s.push_str(indent);
11261133
Some(Cow::Owned(s))
11271134
} else if lang.is_empty() {
11281135
Some(Cow::Borrowed("```\n"))
@@ -1170,23 +1177,21 @@ impl ConvertState {
11701177
if self.in_table_cell() {
11711178
return Some(Cow::Borrowed("<li>"));
11721179
}
1173-
let ul_depth = self.depth_map[TAG_UL as usize] as usize;
1174-
let ol_depth = self.depth_map[TAG_OL as usize] as usize;
1175-
let depth = if ul_depth + ol_depth > 0 { ul_depth + ol_depth - 1 } else { 0 };
1176-
let is_ordered = ol_depth > 0 && _ancestors.last().is_some_and(|p| p.tag_id == Some(TAG_OL));
1177-
if !is_ordered && depth < UL_PREFIXES.len() {
1178-
Some(Cow::Borrowed(UL_PREFIXES[depth]))
1180+
// Parent determines marker: <ol> → "N. " (digits of N + 2
1181+
// columns), else "- " (2 columns). The indent emitted here is
1182+
// the parent's accumulated list_indent — this LI's own marker
1183+
// contribution is pushed onto list_indent AFTER this output
1184+
// is written to the buffer.
1185+
let is_ordered = _ancestors.last().is_some_and(|p| p.tag_id == Some(TAG_OL));
1186+
let mut s = String::with_capacity(self.list_indent.len() + 6);
1187+
s.push_str(&self.list_indent);
1188+
if is_ordered {
1189+
use std::fmt::Write;
1190+
let _ = write!(s, "{}. ", node.index + 1);
11791191
} else {
1180-
let mut s = String::with_capacity(depth * 2 + 6);
1181-
for _ in 0..depth { s.push_str("  "); }
1182-
if is_ordered {
1183-
use std::fmt::Write;
1184-
let _ = write!(s, "{}. ", node.index + 1);
1185-
} else {
1186-
s.push_str("- ");
1187-
}
1188-
Some(Cow::Owned(s))
1192+
s.push_str("- ");
11891193
}
1194+
Some(Cow::Owned(s))
11901195
}
11911196
TAG_A => {
11921197
if node.attributes.contains_key("href") { Some(Cow::Borrowed("[")) } else { None }
@@ -1273,12 +1278,12 @@ impl ConvertState {
12731278
if self.depth_map[TAG_PRE as usize] > 0 {
12741279
let li_depth = self.depth_map[TAG_LI as usize] as usize;
12751280
if li_depth > 0 {
1276-
let indent = " ".repeat(li_depth);
1281+
let indent = self.list_indent.as_str();
12771282
let mut s = String::with_capacity(1 + indent.len() * 2 + 5);
12781283
s.push('\n');
1279-
s.push_str(&indent);
1284+
s.push_str(indent);
12801285
s.push_str("```\n\n");
1281-
s.push_str(&indent);
1286+
s.push_str(indent);
12821287
Some(Cow::Owned(s))
12831288
} else {
12841289
Some(Cow::Borrowed("\n```"))
@@ -1838,6 +1843,35 @@ impl ConvertState {
18381843
self.emit_enter_element();
18391844
}
18401845

1846+
// After the LI prefix is emitted, push this LI's marker-width worth of
1847+
// spaces to list_indent so subsequent continuation content (code blocks,
1848+
// paragraphs, nested blocks) lands in the correct content column. The
1849+
// width depends on the marker: "- " = 2, "N. " = digits(N) + 2.
1850+
// Push for every LI open so close_node can pop unconditionally; width 0
1851+
// when skipped or in a table cell keeps the stack balanced without
1852+
// affecting the indent string.
1853+
if tag_id == Some(TAG_LI)
1854+
&& let Some(li) = self.stack.last()
1855+
{
1856+
let width: usize = if !skip_node && !self.in_table_cell() {
1857+
let stack_len = self.stack.len();
1858+
let parent_is_ordered = stack_len >= 2
1859+
&& self.stack[stack_len - 2].tag_id == Some(TAG_OL);
1860+
if parent_is_ordered {
1861+
let n = li.index + 1;
1862+
// n >= 1 so ilog10 never panics; +1 converts floor(log10) to digit count.
1863+
let digits = (n.ilog10() + 1) as usize;
1864+
digits + 2
1865+
} else {
1866+
2
1867+
}
1868+
} else {
1869+
0
1870+
};
1871+
self.list_indent_widths.push(u8::try_from(width).unwrap_or(u8::MAX));
1872+
for _ in 0..width { self.list_indent.push(' '); }
1873+
}
1874+
18411875
self.has_encoded_html_entity = false;
18421876

18431877
if self.stack.last().is_some_and(|n| n.is_non_nesting) && !self_closing {
@@ -1924,6 +1958,12 @@ impl ConvertState {
19241958
self.depth_map[id as usize] = self.depth_map[id as usize].saturating_sub(1);
19251959
}
19261960
self.update_escape_ctx_on_close(id);
1961+
if id == TAG_LI
1962+
&& let Some(w) = self.list_indent_widths.pop()
1963+
{
1964+
let new_len = self.list_indent.len().saturating_sub(w as usize);
1965+
self.list_indent.truncate(new_len);
1966+
}
19271967
}
19281968
self.depth -= 1;
19291969
self.has_encoded_html_entity = false;
@@ -1945,6 +1985,12 @@ impl ConvertState {
19451985
self.depth_map[id as usize] = self.depth_map[id as usize].saturating_sub(1);
19461986
}
19471987
self.update_escape_ctx_on_close(id);
1988+
if id == TAG_LI
1989+
&& let Some(w) = self.list_indent_widths.pop()
1990+
{
1991+
let new_len = self.list_indent.len().saturating_sub(w as usize);
1992+
self.list_indent.truncate(new_len);
1993+
}
19481994
}
19491995

19501996
self.in_non_nesting = self.stack.last().is_some_and(|n| n.is_non_nesting);

crates/core/tests/conversion.rs

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -224,9 +224,12 @@ fn nested_unordered_list() {
224224

225225
#[test]
226226
fn nested_ordered_list() {
227+
// Nested ordered lists require 3-space continuation indent (length of
228+
// the outer "1. " marker) so CommonMark parses the inner list as nested
229+
// rather than as peer items of the outer.
227230
assert_eq!(
228231
convert("<ol><li>Level 1<ol><li>Level 1.1</li></ol></li><li>Level 2</li></ol>"),
229-
"1. Level 1\n  1. Level 1.1\n2. Level 2"
232+
"1. Level 1\n   1. Level 1.1\n2. Level 2"
230233
);
231234
}
232235

@@ -238,9 +241,48 @@ fn mixed_nested_lists() {
238241
);
239242
}
240243

244+
#[test]
245+
fn ordered_list_with_code_block_uses_marker_width_indent() {
246+
// Ordered list continuation must be indented by the marker width
247+
// (3 columns for "1. ") so the fenced code block parses as part of the
248+
// list item. 2-space indent would dump the code block outside the list.
249+
let html = "<ol><li><p>x</p><pre><code>y</code></pre><p>z</p></li></ol>";
250+
assert_eq!(
251+
convert(html),
252+
"1. x\n\n   ```\n   y\n   ```\n\n   z"
253+
);
254+
}
255+
256+
#[test]
257+
fn ordered_list_double_digit_marker_uses_wider_indent() {
258+
// Once the marker reaches 2 digits ("10. " = 4 columns), continuation
259+
// indent must widen to match.
260+
let html = "<ol>\
261+
<li>a</li><li>b</li><li>c</li><li>d</li><li>e</li>\
262+
<li>f</li><li>g</li><li>h</li><li>i</li>\
263+
<li>j<ol><li>nested</li></ol></li></ol>";
264+
let md = convert(html);
265+
assert!(md.ends_with("10. j\n    1. nested"),
266+
"expected 4-space indent before nested item, got: {md:?}");
267+
}
268+
269+
#[test]
270+
fn nested_ul_inside_ol_uses_ordered_parent_indent() {
271+
// <ol><li><ul><li>inner</li></ul></li></ol>: the inner "- " must be
272+
// indented by the outer "1. " width (3), not 2.
273+
let html = "<ol><li>outer<ul><li>inner</li></ul></li></ol>";
274+
assert_eq!(
275+
convert(html),
276+
"1. outer\n   - inner"
277+
);
278+
}
279+
241280
// https://github.com/harlan-zw/mdream/issues/77
242281
#[test]
243-
fn loose_ordered_list_items_separated_by_blank_line() {
282+
fn loose_ordered_list_with_code_block_renders_as_commonmark_loose_list() {
283+
// The user's reproducer from issue #77. With 3-space indent the markdown
284+
// renders in CommonMark as a 2-item list with nested code block; with the
285+
// old 2-space indent the code block fell outside the list entirely.
244286
let html = r#"
245287
<ol>
246288
<li>
@@ -253,17 +295,10 @@ fn loose_ordered_list_items_separated_by_blank_line() {
253295
</li>
254296
</ol>
255297
"#;
256-
let expected = r#"
257-
1. text
258-
259-
```
260-
text
261-
```
262-
263-
text
264-
2. text
265-
"#;
266-
assert_eq!(convert(html), expected.trim());
298+
assert_eq!(
299+
convert(html),
300+
"1. text\n\n   ```\n   text\n   ```\n\n   text\n2. text"
301+
);
267302
}
268303

269304
// https://github.com/harlan-zw/mdream/issues/76
@@ -308,7 +343,7 @@ fn inline_code_after_whitespace_in_list_item_does_not_duplicate_separator() {
308343
#[test]
309344
fn inline_code_inside_wrappers_inside_list_no_stray_space() {
310345
// No leading space should be injected when the wrapper opener is the last
311-
// thing emitted otherwise pairing breaks for strikethrough and link
346+
// thing emitted, otherwise pairing breaks for strikethrough and link
312347
// text, and the space leaks into HTML passthrough content.
313348
assert_eq!(
314349
convert("<ul><li><del><code>x</code></del></li></ul>"),

0 commit comments

Comments
 (0)