Skip to content

Commit b576e42

Browse files
committed
Refactor links
1 parent 6759c72 commit b576e42

File tree

1 file changed

+140
-99
lines changed

1 file changed

+140
-99
lines changed

pdfsyntax/markdown.py

Lines changed: 140 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def render_html(md: str):
301301
def render_htmlV2(md: str):
    """Render HTML from markdown string.

    The markdown text is split into token lines by ``tokenize`` and those
    lines are handed to ``emit_html``.  ``emit_html`` returns a 3-tuple
    whose last item is the accumulated HTML; only that item is returned.
    """
    t = tokenize(md)
    # emit_html returns (line index, token index, html); the two indices
    # are of no use to the caller, so they are discarded here.
    _, _, html = emit_html(t)
    return html
306306

307307

@@ -311,22 +311,45 @@ def tokenize(md: str) -> list:
311311
i = 0
312312
tok = ''
313313
line = []
314+
html = False
314315
while i < len(md):
315316
c = md[i]
316-
#print(f"{i:2} | {'\\n' if c == '\n' else c} | {tok} | {line}")
317-
if c != '#' and not line and tok and tok == len(tok) * '#':
318-
#
317+
#print(f"#{tok}# @{c}@")
318+
if c == '<' or html:
319+
#
320+
html = True
321+
tok += c
322+
if c == '>' and html:
323+
html = False
324+
elif c != '#' and not line and tok and tok == len(tok) * '#':
325+
# Isolate header level
319326
line.append(tok)
320327
tok = '' if c == ' ' else c
328+
elif c != ' ' and not line and tok and tok == len(tok) * ' ':
329+
# Isolate consecutive spaces at bol
330+
if len(tok) >= 4:
331+
line.append(tok)
332+
tok = c
321333
elif c in '-' and line and line[-1][0] in '-' and line[-1] == len(line[-1]) * '-':
322-
#
334+
# Concatenate setext underline
323335
line[-1] = line[-1] + c
324336
elif c == '`' and line and not tok and line[-1] in '``':
325337
# Concatenate backquotes signs for fenced code
326338
line[-1] = line[-1] + c
327339
elif c in '*_' and line and line[-1][-1] in '*_' and tok == '':
328340
# Concatenate signs for em & strong styles
329341
line[-1] = line[-1] + c
342+
elif c in '![' and checkLinkOrImage(md, i):
343+
if c == '[':
344+
t = 'lnk['
345+
else:
346+
t = 'img['
347+
i += 1
348+
if tok:
349+
line.append(tok)
350+
tok = ''
351+
line.append(t)
352+
line.append('[')
330353
elif c in '\n[]()-+.>*_`':
331354
if tok:
332355
line.append(tok)
@@ -347,94 +370,134 @@ def tokenize(md: str) -> list:
347370
res.append(line)
348371
line = []
349372
i += 1
373+
res.append([''])
350374
return res
351375

352376

353-
def putLineInContext(stack: list, line: list):
377+
def putLineInContext(stack: list, line: list, list_indent: int):
354378
"""Match the tokens of a new line against the stack of open elements.

Walks `line` (index j) and `stack` (index k) in parallel and stops at
the first stack entry that the line does not continue, or when either
sequence is exhausted.

Returns a tuple (bol, keepers):
bol     -- the token index j reached, or 0 once j has moved past the
           last token of `line`; j is decremented in several branches,
           so the value can be negative.
keepers -- the stack index k at which the walk stopped, or -1 when
           every stack entry was walked.

NOTE(review): this span is a rendered diff; lines preceded by an
"NNN-" gutter number are the superseded version. Indentation was
lost, so the nesting of the `else:` / `break` lines below must be
confirmed against the real file.
NOTE(review): `list_indent` is normalised to 4 when falsy but is not
read again in the lines visible here.
"""
355379
j = 0
356380
k = 0
381+
list_indent = 4 if not list_indent else list_indent
357382
while j < len(line) and k < len(stack):
358383
if stack[k][0] == 'u':
359-
j -= 1
360-
elif stack[k] == 'li':
361-
if line[j] in '-*+':
384+
if not line[j]:
362385
break
363-
elif stack[k] == '>':
386+
else:
387+
j -= 1
388+
elif stack[k] == 'li':
389+
if not line[j] or line[j] in '-*+':
390+
if j>0 and len(line[j-1]) < len(stack) / 2 * 4 :
391+
break
392+
else:
393+
j -= 1
394+
elif stack[k] == 'blockquote':
364395
if line[j] != '>':
365396
break
366397
elif stack[k][0] == 'h':
367398
break
399+
elif stack[k] == 'fenced':
400+
if line[j] == '```' or line[j] == '~~~':
401+
j -= 1
402+
break
403+
elif stack[k] == 'indented':
404+
if line[j] != ' ':
405+
j -= 1
406+
break
407+
#elif stack[k] == 'a':
408+
# if line[j-1] == ')':
409+
# break
368410
elif stack[k][0] == 'p':
369411
if line[j] in ['', '#', '##']:
370412
break
371413
else:
372414
break
373415
j += 1
374416
k += 1
375-
bol = 0 if j >= len(line) -1 else j
417+
if j > len(line) -1:
418+
bol = 0
419+
else:
420+
bol = j
376421
keepers = -1 if k >= len(stack) else k
377422
return (bol, keepers)
378423

379424

380-
def detectNewBlock(tok, prev_tok, stack: list, list_indent: int):
    """Decide whether `tok` opens a new block-level (or link/image) node.

    Parameters:
        tok         -- current token ('' when past the end of the line).
        prev_tok    -- previous token on the line; not used by the rules
                       below, kept for the caller's signature.
        stack       -- names of the currently open elements, innermost last.
        list_indent -- not used by the rules below, kept for the caller's
                       signature.

    Returns a tuple (node, offset): `node` is the name of the element to
    open ('h1'..'hN', 'fenced', 'indented', 'blockquote', 'a', 'img',
    'li', 'ul', 'p') or None when no block starts here; `offset` is the
    number of tokens the caller should skip (1 when the marker token
    itself is consumed, 0 when it must be looked at again).
    """
    # Innermost open element, or None: guards every stack[-1] access so an
    # empty stack can no longer raise IndexError.
    top = stack[-1] if stack else None

    if tok and tok == len(tok) * '#':
        # A run of '#' only: header whose level is the run length.
        return f'h{len(tok)}', 1
    elif tok in ('```', '~~~'):
        return 'fenced', 1
    elif tok == ' ':
        # NOTE(review): the literal is a single space as seen in this view;
        # tokenize only emits leading-space runs of 4+ characters, so the
        # original literal may have been wider -- confirm against the file.
        return 'indented', 1
    elif tok == '>':
        return 'blockquote', 1
    elif tok == 'lnk[' and stack and top != 'a':
        # With nothing open yet, fall through so a paragraph is opened
        # first; the same token is then re-examined with 'p' on the stack.
        return 'a', 0
    elif tok == 'img[' and stack and top != 'img':
        return 'img', 0
    elif stack and top[:2] == 'ul' and tok in '-*+':
        # NOTE(review): '' in '-*+' is True, so an empty token also
        # matches here -- behaviour kept as is.
        return 'li', 1
    elif (not stack or top[:2] != 'ul') and tok and tok in '*+-':
        return 'ul', 0
    elif not stack and tok:
        return 'p', 0
    else:
        return None, 0
447+
394448

449+
def checkLinkOrImage(md: str, start: int):
    """Tell whether an inline link or image starts at ``md[start]``.

    A link is recognised as ``[`` ... ``](`` ... ``)`` and an image as the
    same construct preceded by ``!``; everything must sit on the line that
    contains `start`.

    Parameters:
        md    -- the full markdown text.
        start -- index of the candidate ``[`` or ``!`` character.

    Returns True when the construct is complete on the current line,
    False otherwise (including when `start` is outside `md`).
    """
    if not 0 <= start < len(md):
        return False

    pos = start
    if md[pos] == '!':
        # An image needs its '[' right after the '!'; a lone '!' is text.
        pos += 1
        if pos >= len(md) or md[pos] != '[':
            return False
    elif md[pos] != '[':
        return False

    # Only the remainder of the current line is searched.
    eol = md.find('\n', pos)
    segment = md[pos:] if eol == -1 else md[pos:eol]

    sep = segment.find('](')
    # The closing ')' only counts when it comes after the ']('.
    return sep != -1 and segment.find(')', sep + 2) != -1
395467

396-
def detectNewOrClosingSpan(tok: str, stack: list):
    """Classify `tok` as the opening or closing delimiter of an inline span.

    Parameters:
        tok   -- current token.
        stack -- names of the currently open elements, innermost last.

    Returns a tuple (kind, element): `kind` is 'opening' or 'closing' and
    `element` is one of 'code', 'em', 'strong', 'obj', 'url'; (None, None)
    when `tok` is not a usable delimiter in the current context.

    Rules:
      * Symmetric delimiters (same opening and closing text) open the
        element when it is not on the stack and close it when it is.
      * '[' / '(' open 'obj' / 'url' only directly inside an 'a' or 'img'
        element, since their content is stored into that element's dict.
      * ']' / ')' close 'obj' / 'url' only when that element is open;
        otherwise the caller would pop an unrelated element.
    """
    # (opening, element, closing, parent whitelist). The whitelist is the
    # author's planned data and is not enforced yet.
    h = [
        ('`', 'code', '`', ['p']),
        ('_', 'em', '_', ['em', 'p', 'obj']),  # or *
        ('__', 'strong', '__', ['p']),  # or **
        ('[', 'obj', ']', ['a']),
        ('(', 'url', ')', ['a']),
    ]
    in_link = bool(stack) and stack[-1] in ('a', 'img')
    for opening, element, closing, _auths in h:
        if opening == closing:
            if tok == opening:
                kind = 'closing' if element in stack else 'opening'
                return kind, element
        elif tok == opening:
            if in_link:
                return 'opening', element
        elif tok == closing:
            if element in stack:
                return 'closing', element
    return None, None
414489

415490

416-
#######################################################################
417-
# OLD CODE TO RECYCLE
418-
#
419-
# elif y == '\n' and (z == '```' or z == '~~~'):
420-
# if i < len(toks) -1 and toks[i+1] != '\n':
421-
# i, r = emit_html(toks, i+2, stack + ['fenced'])
422-
# else:
423-
# i, r = emit_html(toks, i+1, stack + ['fenced'])
424-
# res += f'\n<pre><code>{r}</code></pre>'
425-
# elif z == '[':
426-
# i, r1 = emit_html(toks, i+1, stack + ['link_text'])
427-
# i, r2 = emit_html(toks, i+1, stack + ['link_url'])
428-
# res += f'<a href="{r2}">{r1}</a>'
429-
# elif stack[-1][:1] == '>' and y == '\n' and z == ' ':
430-
# i, r = emit_html(toks, i+1, stack + ['indented'])
431-
# res += f'\n<pre><code>{r}</code></pre>'
432-
########################################################################
433-
434-
435-
def html_text(element: str, content):
    """Wrap `content` in the HTML markup for `element`.

    Parameters:
        element -- name of the element being closed ('p', 'h1', 'em',
                   'fenced', 'indented', 'a', 'img', ...).
        content -- the inner HTML as a string, or, for 'a' and 'img', a
                   dict with the keys 'url', 'obj' (link text / alt text)
                   and optionally 'title'.

    Returns the HTML string.  Elements whose name starts with 'p', 'h' or
    'b', and code blocks, are preceded by a newline.
    """
    from html import escape  # local import keeps the block self-contained

    if element in ['fenced', 'indented']:
        return f'\n<pre><code>{content}</code></pre>'

    if element in ['a', 'img']:
        # Missing keys degrade to empty strings instead of raising KeyError.
        url = escape(content.get('url', ''), quote=True)
        label = content.get('obj', '')
        title = ''
        if 'title' in content:
            title = f' title="{escape(content["title"], quote=True)}"'
        if element == 'img':
            # <img> is a void element: src/alt attributes, no closing tag.
            return f'<img src="{url}" alt="{escape(label, quote=True)}"{title}>'
        # Link text may already hold rendered inline HTML, so it is not escaped.
        return f'<a href="{url}"{title}>{label}</a>'

    # Slice instead of index so an empty element name cannot raise.
    isBlock = '\n' if element[:1] in ['p', 'h', 'b'] else ''
    return f'{isBlock}<{element}>{content}</{element}>'
439502

440503

@@ -445,14 +508,17 @@ def emit_html(toks: list, lstart = 2, tstart = 0):
445508
isLineCtx = True
446509
i = lstart
447510
j = tstart
511+
list_indent = 0
448512
while i < len(toks) and j <= len(toks[i]):
449-
print(f'{i} {j} | {isLineCtx:1} | {".".join(stack):15} | | {toks[i]} ')
450-
print(accu[-1])
513+
print(f'{i:2} {j:2} | {isLineCtx:1} | {".".join(stack):20} | {str(accu[-1])[:40].replace('\n','.')} ')
451514
line = toks[i]
452515
if not isLineCtx:
453-
j, k = putLineInContext(stack, line)
516+
j, k = putLineInContext(stack, line, list_indent)
454517
if 0 <= k <= len(stack) - 1:
455-
j = 0
518+
if j < 0:
519+
i += 1
520+
else:
521+
j = 0
456522
elt = stack.pop()
457523
last = accu.pop()
458524
accu[-1] += html_text(elt, last)
@@ -461,17 +527,21 @@ def emit_html(toks: list, lstart = 2, tstart = 0):
461527
tok = line[j] if j < len(line) else ''
462528
prev_tok = line[j-1] if j > 0 else ''
463529
tst = None
464-
node, ht, offset = detectNewBlock(tok, prev_tok, stack)
530+
node, offset = detectNewBlock(tok, prev_tok, stack, list_indent)
531+
list_indent = len(prev_tok) if node == 'li' and not list_indent else list_indent
465532
if not node:
466533
tst = detectNewOrClosingSpan(tok, stack)
467534

468535
if node:
469536
isLineCtx = True
470537
j += offset
471538
stack += [node]
472-
accu += ['']
539+
if node in ['a', 'img']:
540+
accu += [{'url':''}]
541+
else:
542+
accu += ['']
473543
elif tst and tst[0] == 'opening':
474-
isLineCtx = True
544+
#isLineCtx = True
475545
j += 1
476546
stack += [tst[1]]
477547
accu += ['']
@@ -480,10 +550,23 @@ def emit_html(toks: list, lstart = 2, tstart = 0):
480550
j += 1
481551
elt = stack.pop()
482552
last = accu.pop()
483-
accu[-1] += html_text(elt, last)
553+
if elt == 'obj':
554+
accu[-1][elt] = last
555+
elif elt == 'url':
556+
tmp = last.split('"')
557+
url = tmp[0]
558+
if len(tmp) > 1:
559+
accu[-1]['title'] = tmp[1]
560+
url = url[:-1]
561+
accu[-1]['url'] += url
562+
else:
563+
accu[-1] += html_text(elt, last)
564+
if elt in ['a', 'img']:
565+
j -= 1
484566
continue
485567
elif stack:
486-
accu[-1] += tok
568+
if stack[-1] not in ['a', 'img']:
569+
accu[-1] += tok
487570
j += 1
488571
else:
489572
j += 1
@@ -492,49 +575,7 @@ def emit_html(toks: list, lstart = 2, tstart = 0):
492575
i += 1
493576
j = 0
494577
isLineCtx = False
495-
return i, j, accu[0]
496-
497578

498-
def emit_html_recursive(toks: list, lstart = 2, tstart = 0, stack = [], isLineCtx = True):
499-
"""."""
500-
res = ''
501-
i = lstart
502-
j = tstart
503-
while i < len(toks) and j <= len(toks[i]):
504-
print(f'{i} {j} | {isLineCtx:1} | {".".join(stack):15} | {toks[i]} ')
505-
line = toks[i]
506-
if not isLineCtx:
507-
j, k = putLineInContext(stack, line)
508-
if 0 <= k <= len(stack) - 1:
509-
#print(f"back! k={k}")
510-
return i, 0, res
511-
512-
tok = line[j] if j < len(line) else ''
513-
prev_tok = line[j-1] if j > 0 else ''
514-
tst = None
515-
node, ht, offset = detectNewBlock(tok, prev_tok, stack)
516-
if not node:
517-
tst = detectNewOrClosingSpan(tok, stack)
518-
519-
if node:
520-
i, j, r = emit_html_recursive(toks, i, j+offset, stack + [node])
521-
res += html_text(ht, r)
522-
elif tst and tst[0] == 'opening':
523-
i, j, r = emit_html_recursive(toks, i, j+1, stack + [tst[1]])
524-
res += html_text(tst[1], r)
525-
elif tst and tst[0] == 'closing':
526-
return i, j+1, res
527-
elif stack:
528-
res += tok
529-
j += 1
530-
else:
531-
j += 1
579+
return i, j, accu[0]
532580

533-
if j >= len(line):
534-
i += 1
535-
j = 0
536-
if j == 0:
537-
isLineCtx = False
538-
print(f'ret {i} {j}')
539-
return i, j, res
540581

0 commit comments

Comments
 (0)