@@ -307,205 +307,182 @@ def render_htmlV2(md: str):
307307
def tokenize(md: str) -> list:
    """Split Markdown text into a list of lines, each a list of string tokens.

    The result always starts with two sentinel lines ``['']`` so that callers
    can start reading at index 2.  Within a line, the characters
    ``[]()-+.>*_``` are emitted as their own tokens and the text between them
    is emitted verbatim.  A few tokens are merged or rewritten to simplify
    later processing:

    * a leading run of ``#`` becomes a single token and the one space that
      follows it is dropped;
    * consecutive ``-``, consecutive backquotes (up to three) and consecutive
      ``*``/``_`` are concatenated into one token;
    * a line made only of ``=`` (resp. ``-``) is not stored; instead ``'#'``
      (resp. ``'##'``) is inserted at the front of the previous line, turning
      a setext heading into an ATX-style one;
    * an empty line is stored as ``['']``.

    Args:
        md: The Markdown source.  A trailing newline is optional.

    Returns:
        A list of token lists, one per stored line, preceded by the two
        sentinel lines.
    """
    res = [[''], ['']]
    tok = ''
    line = []
    for c in md:
        if c != '#' and not line and tok and tok == len(tok) * '#':
            # A run of '#' at the start of the line just ended: store it as
            # the heading marker.  Only the separating space is consumed;
            # any other character still has to go through the normal
            # processing below (it may be a newline or a delimiter).
            line.append(tok)
            tok = ''
            if c == ' ':
                continue

        if c == '\n':
            if tok:
                line.append(tok)
                tok = ''
            _store_line(res, line)
            line = []
        elif c == '-' and line and not tok and line[-1] == len(line[-1]) * '-':
            # Concatenate dashes; ``not tok`` keeps pending text in order.
            line[-1] += c
        elif c == '`' and line and not tok and line[-1] in '``':
            # Concatenate backquotes signs for fenced code
            line[-1] += c
        elif c in '*_' and line and not tok and line[-1][-1] in '*_':
            # Concatenate signs for em & strong styles
            line[-1] += c
        elif c in '[]()-+.>*_`':
            if tok:
                line.append(tok)
                tok = ''
            line.append(c)
        else:
            tok += c

    # The input may not end with a newline: flush whatever is still pending
    # so that the last line is not silently dropped.
    if tok:
        line.append(tok)
    if line:
        _store_line(res, line)
    return res


def _store_line(res: list, line: list) -> None:
    """Append a finished *line* to *res*, or fold a setext underline into the previous line."""
    # Only a line consisting of a single all-'=' / all-'-' token is an
    # underline; a line that merely ends with such a token is regular content.
    marker = line[0] if len(line) == 1 else ''
    if marker and marker == len(marker) * '=':
        res[-1].insert(0, '#')
    elif marker and marker == len(marker) * '-':
        res[-1].insert(0, '##')
    else:
        res.append(line if line else [''])
372351
373352
374- def emit_html ( toks : list , start = 2 , stack = [ '' ] ):
def putLineInContext(line: list, stack: list):
    """Match the leading tokens of *line* against the open blocks in *stack*.

    The tokens are walked from the left while the stack is walked from the
    bottom; each rule below either advances in the stack, or stops the walk.

    Args:
        line: Tokens of one line, as produced by the tokenizer.
        stack: Names of the currently open elements, outermost first
            (entries seen here start with ``p``, ``u``, ``l``, ``h`` or ``>``).

    Returns:
        A tuple ``(j, k)``: ``j`` is the number of leading tokens that were
        consumed and ``k`` is the stack index reached by the walk (``-1``
        when no rule advanced in the stack).
        NOTE(review): the caller treats ``0 <= k < len(stack)`` as "return
        to the enclosing call" - confirm that this is the intended meaning
        of ``k``.
    """
    k = -1
    j = 0
    # ``stack[k + 1]`` is read below, so the walk must stop as soon as the
    # last stack entry has been reached; this also guarantees that the stack
    # is not empty inside the loop.
    while j < len(line) and k < len(stack) - 1:
        if not line[0] and stack[-1][0] in 'pul':
            # Blank line while a paragraph or a list is the innermost block.
            k += 1
            break
        elif line[j] == ' ' and stack[k + 1][0] == 'u':
            # Indentation token in front of a list: skip two stack entries.
            k += 2
        elif stack[k + 1][0] == 'h':
            k += 1
            break
        elif line[j] in '-*+' and k < len(stack) - 2 and stack[k + 2][0] == 'l':
            # Bullet sign while a list item is open one level deeper.
            k += 1
            break
        elif line[j] == '>' and stack[k + 1][0] == '>':
            k += 1
        else:
            break
        j += 1
    return (j, k)
375+
376+
def detectNewBlock(tok, stack: list):
    """Tell whether *tok* opens a new block-level element in the current context.

    Args:
        tok: The token under examination (may be an empty string).
        stack: Names of the currently open elements, outermost first.

    Returns:
        A tuple ``(node, tag, offset)``: ``node`` is the name to push on the
        stack, ``tag`` the HTML element to emit, and ``offset`` the number of
        tokens to skip before reading the content of the new element.
        ``(None, None, 0)`` is returned when no block starts here.
    """
    in_list = bool(stack) and stack[-1][:2] == 'ul'
    if not tok:
        # An empty token never opens a block (``'' in '-*+'`` is true, so it
        # must be ruled out before the membership tests below).
        return None, None, 0
    if tok == len(tok) * '#':
        level = f'h{len(tok)}'
        return level, level, 1
    if tok in '>':
        return '>', 'blockquote', 1
    if in_list and tok in '-*+':
        return 'li', 'li', 1
    if not in_list and tok in '*+-':
        # The bullet is not consumed (offset 0): it is read again as the
        # first list item.  The indentation suffix is currently always 0.
        return 'ul0', 'ul', 0
    if not stack:
        return 'p', 'p', 0
    return None, None, 0
391+
392+
def detectNewOrClosingSpan(tok, stack: list):
    """Classify *tok* as the start or the end of an inline span.

    Returns ``('opening', element)`` or ``('closing', element)`` when *tok*
    is a span delimiter, and ``(None, None)`` otherwise.  For delimiters that
    are identical on both sides, the token closes the span when its element
    is already present in *stack* and opens it otherwise.
    """
    span_table = (
        ('`', 'code', '`'),
        ('_', 'em', '_'),
    )
    for start_mark, tag, end_mark in span_table:
        if tok == start_mark:
            # Symmetric delimiter: the stack tells which side this one is.
            if start_mark == end_mark and tag in stack:
                return 'closing', tag
            return 'opening', tag
        if tok == end_mark:
            return 'closing', tag
    return None, None
410+
411+
412+ #######################################################################
413+ # OLD CODE TO RECYCLE
414+ #
415+ # elif y == '\n' and (z == '```' or z == '~~~'):
416+ # if i < len(toks) -1 and toks[i+1] != '\n':
417+ # i, r = emit_html(toks, i+2, stack + ['fenced'])
418+ # else:
419+ # i, r = emit_html(toks, i+1, stack + ['fenced'])
420+ # res += f'\n<pre><code>{r}</code></pre>'
421+ # elif z == '[':
422+ # i, r1 = emit_html(toks, i+1, stack + ['link_text'])
423+ # i, r2 = emit_html(toks, i+1, stack + ['link_url'])
424+ # res += f'<a href="{r2}">{r1}</a>'
425+ # elif y != '\n' and z and z in '*_':
426+ # i, r = emit_html(toks, i+1, stack + ['em'])
427+ # res += f'<em>{r}</em>'
428+ # elif y != '\n' and z.replace('_', '*') == '**':
429+ # i, r = emit_html(toks, i+1, stack + ['strong'])
430+ # res += f'<strong>{r}</strong>'
431+ # elif y != '\n' and z.replace('_', '*') == '***':
432+ # i, r = emit_html(toks, i+1, stack + ['em&strong'])
433+ # res += f'<em><strong>{r}</strong></em>'
434+ # elif stack[-1][:1] == '>' and y == '\n' and z == ' ':
435+ # i, r = emit_html(toks, i+1, stack + ['indented'])
436+ # res += f'\n<pre><code>{r}</code></pre>'
437+ ########################################################################
438+
439+
def html_text(element: str, text: str):
    """Wrap *text* in an ``<element>`` tag pair, preceded by a newline."""
    return '\n<{0}>{1}</{0}>'.format(element, text)
444+
445+
def emit_html(toks: list, lstart=2, tstart=0, stack=None, isLineCtx=True):
    """Recursively turn tokenized lines into HTML.

    Each call renders the content of one element: the element pushed last on
    *stack* (or the top level when the stack is empty).  Nested blocks and
    spans are rendered by recursive calls with an extended copy of the stack.

    Args:
        toks: List of lines, each a list of string tokens.
        lstart: Index of the line to start from (defaults to 2).
        tstart: Index of the token to start from inside that line.
        stack: Names of the currently open elements, outermost first.
            ``None`` means an empty stack.
        isLineCtx: When false, the current line is first matched against
            the stack with ``putLineInContext`` before any token is read.

    Returns:
        A tuple ``(i, j, res)``: the line index and token index at which the
        caller must resume, and the HTML produced by this call.
    """
    # Avoid a mutable default argument shared between calls.
    if stack is None:
        stack = []
    res = ''
    i = lstart
    j = tstart
    while i < len(toks) and j <= len(toks[i]):
        # Debug trace.
        print(f'{i} {j} | {isLineCtx} | {stack} | {toks[i]}')
        line = toks[i]
        if not isLineCtx:
            j, k = putLineInContext(line, stack)
            if 0 <= k <= len(stack) - 1:
                # Hand the line back to the enclosing call, restarting at
                # its first token.
                print(f"back! with k={k}")
                return i, 0, res

        tok = line[j] if j < len(line) else ''
        tst = None
        node, ht, offset = detectNewBlock(tok, stack)
        if not node:
            # Spans are only looked for when the token opens no block.
            tst = detectNewOrClosingSpan(tok, stack)

        if node:
            i, j, r = emit_html(toks, i, j + offset, stack + [node])
            res += html_text(ht, r)
        elif tst and tst[0] == 'opening':
            i, j, r = emit_html(toks, i, j + 1, stack + [tst[1]])
            res += html_text(tst[1], r)
        elif tst and tst[0] == 'closing':
            return i, j + 1, res
        elif stack:
            res += tok
            j += 1
        else:
            # Outside of any element the token is skipped.
            j += 1

        if j >= len(line):
            i += 1
            j = 0
        if j == 0:
            # A new line starts: it must be matched against the stack first.
            isLineCtx = False
    # Debug trace.
    print(f'ret {i} {j}')
    return i, j, res
510487
511488
0 commit comments