Skip to content

Commit 3f4a8bc

Browse files
committed
Refactor emit_html
1 parent f4a8d8b commit 3f4a8bc

File tree

1 file changed

+163
-186
lines changed

1 file changed

+163
-186
lines changed

pdfsyntax/markdown.py

Lines changed: 163 additions & 186 deletions
Original file line numberDiff line numberDiff line change
@@ -307,205 +307,182 @@ def render_htmlV2(md: str):
307307

308308
def tokenize(md: str) -> list:
    """Split markdown text into lines of tokens.

    The result is a list of lines, each line being a list of string tokens.
    It always starts with two placeholder lines ``['']`` so that consumers
    can begin reading at index 2. A blank input line is stored as ``['']``.

    Transformations applied while splitting:

    * a run of ``#`` at the start of a line becomes one token (atx heading),
      and the single space that follows it is dropped;
    * consecutive ``-``, backquotes (up to three) and ``*``/``_`` characters
      are merged into one token when nothing separates them;
    * a line made only of ``=`` (resp. ``-``) is a setext underline: it is
      not stored, and ``'#'`` (resp. ``'##'``) is inserted at the front of
      the previous line instead.

    Args:
        md: Markdown source text.

    Returns:
        The list of token lines described above.
    """
    special = '\n[]()-+.>*_`'
    res = [[''], ['']]
    tok = ''
    line = []

    # Lines are only flushed when a newline is seen; make sure the last
    # line is terminated so that its tokens are not silently dropped.
    if md and not md.endswith('\n'):
        md += '\n'

    for c in md:
        if c != '#' and not line and tok and tok == len(tok) * '#':
            # End of an atx heading marker at the start of the line.
            line.append(tok)
            tok = ''
            if c == ' ':
                # The separating space is not kept.
                continue
            # Any other character is processed normally below, so that a
            # special character (including a newline) is not buried in tok.

        if c == '-' and line and not tok and line[-1] == len(line[-1]) * '-':
            # Concatenate adjacent dashes; "not tok" keeps a hyphen that
            # follows pending text from being merged into an earlier dash.
            line[-1] += c
        elif c == '`' and line and not tok and line[-1] in '``':
            # Concatenate backquote signs for fenced code
            line[-1] += c
        elif c in '*_' and line and not tok and line[-1][-1] in '*_':
            # Concatenate signs for em & strong styles
            line[-1] += c
        elif c in special:
            if tok:
                line.append(tok)
                tok = ''
            line.append(c)
        else:
            tok += c

        if c != '\n':
            continue

        # End of line: the newline itself is not stored as a token.
        line.pop()
        # A setext underline must be the only token of its line.
        underline = line[0] if len(line) == 1 else ''
        if underline and underline == len(underline) * '=':
            res[-1].insert(0, '#')
        elif underline and underline == len(underline) * '-':
            res[-1].insert(0, '##')
        else:
            res.append(line or [''])
        line = []
    return res
372351

373352

374-
def emit_html(toks: list, start = 2, stack = ['']):
353+
def putLineInContext(line: list, stack: list):
    """Match the leading tokens of a line against the stack of open elements.

    Tokens of ``line`` are examined from the left while they can be paired
    with entries of ``stack`` (strings whose first character identifies the
    element, e.g. ``'h1'``, ``'>'``, ``'ul0'``, ``'li'``, ``'p'``).

    Args:
        line: Tokens of the current line.
        stack: Currently open elements, outermost first.

    Returns:
        A tuple ``(j, k)``: ``j`` is the index of the first token that was
        not consumed, ``k`` is the stack position reached (``-1`` when
        nothing matched). ``emit_html`` unwinds when
        ``0 <= k <= len(stack) - 1``.
    """
    k = -1
    j = 0
    # Every branch below may read stack[k+1], so k must stay strictly below
    # the last index; the former "<=" bound allowed an IndexError, e.g. for
    # line ['>', '>'] with stack ['>'].
    while j < len(line) and k < len(stack) - 1:
        if not line[0] and stack[-1][0] in 'pul':
            # Blank line while a paragraph or list element is innermost.
            k += 1
            break
        elif line[j] == ' ' and stack[k+1][0] == 'u':
            k += 2
        elif stack[k+1][0] == 'h':
            k += 1
            break
        elif line[j] in '-*+' and k < len(stack) - 2 and stack[k+2][0] == 'l':
            k += 1
            break
        elif line[j] == '>' and stack[k+1][0] == '>':
            k += 1
        else:
            break
        j += 1
    return (j, k)
375+
376+
377+
def detectNewBlock(tok, stack: list):
    """Tell whether a token opens a new block-level element.

    Args:
        tok: Token under examination (may be the empty string).
        stack: Currently open elements, outermost first.

    Returns:
        A tuple ``(node, tag, offset)``: ``node`` is the name to push on the
        stack, ``tag`` the HTML tag to emit and ``offset`` the number of
        tokens consumed by the opening marker. ``(None, None, 0)`` is
        returned when the token does not open a block.
    """
    list_markers = ('-', '*', '+')
    in_ul = bool(stack) and stack[-1][:2] == 'ul'

    if tok and tok == len(tok) * '#':
        heading = f'h{len(tok)}'
        return heading, heading, 1
    if tok == '>':
        return '>', 'blockquote', 1
    if tok in list_markers:
        if in_ul:
            # Marker inside an open list: new item, marker consumed.
            return 'li', 'li', 1
        # Marker outside a list: open the list first and leave the marker
        # in place so that it then opens the first item.
        return 'ul0', 'ul', 0
    if not stack and tok:
        return 'p', 'p', 0
    # Includes the empty token, which must never open a list item.
    return None, None, 0
391+
392+
393+
def detectNewOrClosingSpan(tok, stack: list):
    """Tell whether a token opens or closes an inline (span) element.

    Args:
        tok: Token under examination.
        stack: Currently open elements, outermost first.

    Returns:
        ``('opening', element)`` or ``('closing', element)`` when the token
        is a span delimiter, ``(None, None)`` otherwise.
    """
    # (opening delimiter, element name, closing delimiter)
    delimiters = (
        ('`', 'code', '`'),
        ('_', 'em', '_'),
    )
    for opening, element, closing in delimiters:
        if opening == closing and tok == opening:
            # Symmetric delimiter: it closes the element if one is already
            # open anywhere in the stack, otherwise it opens a new one.
            if element not in stack:
                return 'opening', element
            return 'closing', element
        if tok == opening:
            return 'opening', element
        if tok == closing:
            return 'closing', element
    return None, None
410+
411+
412+
#######################################################################
413+
# OLD CODE TO RECYCLE
414+
#
415+
# elif y == '\n' and (z == '```' or z == '~~~'):
416+
# if i < len(toks) -1 and toks[i+1] != '\n':
417+
# i, r = emit_html(toks, i+2, stack + ['fenced'])
418+
# else:
419+
# i, r = emit_html(toks, i+1, stack + ['fenced'])
420+
# res += f'\n<pre><code>{r}</code></pre>'
421+
# elif z == '[':
422+
# i, r1 = emit_html(toks, i+1, stack + ['link_text'])
423+
# i, r2 = emit_html(toks, i+1, stack + ['link_url'])
424+
# res += f'<a href="{r2}">{r1}</a>'
425+
# elif y != '\n' and z and z in '*_':
426+
# i, r = emit_html(toks, i+1, stack + ['em'])
427+
# res += f'<em>{r}</em>'
428+
# elif y != '\n' and z.replace('_', '*') == '**':
429+
# i, r = emit_html(toks, i+1, stack + ['strong'])
430+
# res += f'<strong>{r}</strong>'
431+
# elif y != '\n' and z.replace('_', '*') == '***':
432+
# i, r = emit_html(toks, i+1, stack + ['em&strong'])
433+
# res += f'<em><strong>{r}</strong></em>'
434+
# elif stack[-1][:1] == '>' and y == '\n' and z == ' ':
435+
# i, r = emit_html(toks, i+1, stack + ['indented'])
436+
# res += f'\n<pre><code>{r}</code></pre>'
437+
########################################################################
438+
439+
440+
def html_text(element: str, text: str):
    """Wrap ``text`` in an ``element`` tag pair, preceded by a newline.

    Neither argument is escaped: ``text`` is inserted verbatim between the
    opening and closing tags.

    Args:
        element: Tag name, without angle brackets.
        text: Content placed between the tags.

    Returns:
        The string ``'\\n<element>text</element>'``.
    """
    return f'\n<{element}>{text}</{element}>'
444+
445+
446+
def emit_html(toks: list, lstart=2, tstart=0, stack=None, isLineCtx=True):
    """Recursively turn token lines into HTML.

    Each call renders the content of the innermost element of ``stack`` and
    returns to its caller when that element ends.

    Args:
        toks: List of token lines (lists of strings).
        lstart: Index of the line to start from. The default of 2
            presumably skips the two placeholder lines that ``tokenize``
            puts at the head of its result.
        tstart: Index of the token to start from within that line.
        stack: Currently open elements, outermost first. Defaults to an
            empty stack; the list passed in is never modified.
        isLineCtx: True when the current line has already been matched
            against ``stack``; reset to False each time a new line starts.

    Returns:
        A tuple ``(i, j, res)``: the line index and token index at which
        the caller must resume, and the HTML produced by this call.
    """
    import logging
    log = logging.getLogger(__name__)

    # A fresh list per call instead of a shared mutable default argument.
    stack = [] if stack is None else stack
    res = ''
    i = lstart
    j = tstart
    while i < len(toks) and j <= len(toks[i]):
        log.debug('%s %s | %s | %s | %s', i, j, isLineCtx, stack, toks[i])
        line = toks[i]
        if not isLineCtx:
            j, k = putLineInContext(line, stack)
            if 0 <= k <= len(stack) - 1:
                # The new line does not continue the innermost element:
                # hand control back to the caller at the start of the line.
                log.debug('back! with k=%s', k)
                return i, 0, res

        tok = line[j] if j < len(line) else ''
        tst = None
        node, ht, offset = detectNewBlock(tok, stack)
        if not node:
            tst = detectNewOrClosingSpan(tok, stack)

        if node:
            i, j, r = emit_html(toks, i, j+offset, stack + [node])
            res += html_text(ht, r)
        elif tst and tst[0] == 'opening':
            i, j, r = emit_html(toks, i, j+1, stack + [tst[1]])
            res += html_text(tst[1], r)
        elif tst and tst[0] == 'closing':
            return i, j+1, res
        elif stack:
            res += tok
            j += 1
        else:
            j += 1

        # NOTE(review): after a recursive call `i` may have moved on while
        # `line` still refers to the line read at the top of this
        # iteration - confirm this comparison is intended.
        if j >= len(line):
            i += 1
            j = 0
        if j == 0:
            isLineCtx = False
    log.debug('ret %s %s', i, j)
    return i, j, res
510487

511488

0 commit comments

Comments
 (0)