@@ -117,6 +117,8 @@ function cleanupNodes(element) {
117117 // Daum/Naver specific noise
118118 '.layer_util' , '.box_setting' , '.util_view' , '.wrap_util' ,
119119 '.box_layer' , '.img_mask' , '.btn_util' ,
120+ // Daum/Kakao Headers
121+ '#kakaoHead' , '.gnb_comm' , '.d_head' , '#kakaoGnb' ,
120122 // Daum/Naver Footers & Related
121123 '.foot_view' , '.box_recommend' , '.txt_copyright' , '.box_etc' ,
122124 '#foot_view' , '.kakao_ad' , '.art_copy' ,
@@ -141,7 +143,10 @@ function cleanupNodes(element) {
141143 '[class*="copyright"]' , '[class*="footer"]' , '[class*="related"]' ,
142144 '[id*="copyright"]' , '[id*="footer"]' , '[id*="related"]' ,
143145 // NYT / Ad noise
144- '#top-wrapper' , '#top-slug' , 'div[class*="ad-"]'
146+ '#top-wrapper' , '#top-slug' , 'div[class*="ad-"]' ,
147+ // Wikipedia Cleanup
148+ '.mw-indicators' , '#siteSub' , '.mw-editsection' , '.hatnote' ,
149+ '.shortdescription' , '#catlinks' , '.navbox' , '.infobox'
145150 ] ;
146151
147152 element . querySelectorAll ( unwantedSelectors . join ( ', ' ) ) . forEach ( el => el . remove ( ) ) ;
@@ -439,7 +444,9 @@ function createIconSvg(name) {
439444 'margin-h' : '<path d="M3 12h18"/><path d="m7 8-4 4 4 4"/><path d="m17 8 4 4-4 4"/>' ,
440445 view : '<rect width="18" height="18" x="3" y="3" rx="2" ry="2"/><line x1="9" x2="15" y1="3" y2="3"/><line x1="9" x2="15" y1="21" y2="21"/>' ,
441446 'page-1' : '<rect x="5" y="4" width="14" height="16" rx="2" />' ,
442- 'page-2' : '<rect x="4" y="4" width="16" height="16" rx="2" /><line x1="12" y1="4" x2="12" y2="20" />'
447+ 'page-2' : '<rect x="4" y="4" width="16" height="16" rx="2" /><line x1="12" y1="4" x2="12" y2="20" />' ,
448+ // Markdown Mark (M + Down Arrow)
449+ download : '<rect x="3" y="3" width="18" height="18" rx="2" ry="2"></rect><path d="M7 8v8"></path><path d="M17 8v8"></path><path d="M7 8l5 5l5-5"></path><line x1="12" y1="8" x2="12" y2="13"></line>'
443450 } ;
444451 return `<svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">${ icons [ name ] || '' } </svg>` ;
445452}
@@ -453,6 +460,148 @@ if (typeof module !== 'undefined' && module.exports) {
453460 cleanupNodes,
454461 sanitizeAndFixContent,
455462 extractContent,
463+ extractContent,
464+ domToMarkdown,
465+ addFrontmatter,
456466 createIconSvg
457467 } ;
458468}
/**
 * Converts a DOM node (element or document fragment) or an HTML string to
 * Markdown.
 *
 * Supported constructs: headings, paragraphs, ordered/unordered lists,
 * blockquotes, fenced and inline code, simple table rows, bold/italic,
 * links, images, line breaks and horizontal rules. Unknown elements are
 * transparent: only their children are rendered.
 *
 * @param {Node|string} root - Node whose children are converted, or an HTML
 *   string that is parsed into a temporary container first.
 * @returns {string} Markdown text, trimmed, with runs of three or more blank
 *   lines collapsed to a single blank line. Empty string for null/undefined.
 */
function domToMarkdown(root) {
    if (root === null || root === undefined) return '';

    let output = '';
    // One entry per enclosing <ul>/<ol>, so <li> knows which marker to emit.
    const listStack = [];

    function visitChildren(node) {
        node.childNodes.forEach((child) => visit(child));
    }

    function visit(node) {
        if (node.nodeType === Node.TEXT_NODE) {
            let text = node.textContent;
            // Collapse whitespace everywhere except inside <pre>. Text nodes
            // without a parent element (direct fragment children) are
            // collapsed too, since they cannot be inside a <pre>.
            const insidePre = Boolean(node.parentElement && node.parentElement.closest('pre'));
            if (!insidePre) {
                text = text.replace(/[\r\n\t]+/g, ' ').replace(/\s{2,}/g, ' ');
            }
            output += text;
            return;
        }

        if (node.nodeType !== Node.ELEMENT_NODE) return;

        const tag = node.tagName.toLowerCase();
        const parentTag = node.parentElement ? node.parentElement.tagName.toLowerCase() : '';
        let prefix = '';
        let suffix = '';

        if (/^h[1-6]$/.test(tag)) {
            // Blank line before the heading so it never glues to prior text.
            prefix = `\n\n${'#'.repeat(Number.parseInt(tag[1], 10))} `;
            suffix = '\n\n';
        } else if (tag === 'p') {
            prefix = '\n\n';
            suffix = '\n\n';
        } else if (tag === 'ul' || tag === 'ol') {
            listStack.push({ ordered: tag === 'ol', count: 0 });
            output += '\n\n';
            visitChildren(node);
            output += '\n\n';
            listStack.pop();
            return;
        } else if (tag === 'li') {
            // Number items of the nearest enclosing <ol>; bullets otherwise.
            const list = listStack[listStack.length - 1];
            if (list && list.ordered) {
                list.count += 1;
                prefix = `\n${list.count}. `;
            } else {
                prefix = '\n- ';
            }
        } else if (tag === 'blockquote') {
            prefix = '\n> ';
            suffix = '\n\n';
        } else if (tag === 'pre') {
            prefix = '\n```\n';
            suffix = '\n```\n\n';
        } else if (tag === 'code') {
            // <pre><code> is already fenced by the <pre> branch.
            if (parentTag !== 'pre') {
                prefix = '`';
                suffix = '`';
            }
        } else if (tag === 'tr') {
            prefix = '\n| ';
        } else if (tag === 'td' || tag === 'th') {
            suffix = ' | ';
        } else if (tag === 'div') {
            prefix = '\n';
            suffix = '\n';
        } else if (tag === 'b' || tag === 'strong') {
            prefix = '**';
            suffix = '**';
        } else if (tag === 'i' || tag === 'em') {
            prefix = '*';
            suffix = '*';
        } else if (tag === 'a') {
            // `node.href` is the resolved URL and is non-empty even for
            // href="", so the raw attribute is checked as well. Anchors
            // without a usable target fall through and render as plain text.
            const rawHref = node.getAttribute('href');
            if (node.href && rawHref !== null && rawHref.trim() !== '') {
                output += '[';
                visitChildren(node);
                output += `](${node.href})`;
                return;
            }
        } else if (tag === 'img') {
            const src = node.src;
            if (src) {
                const image = `![${node.alt || ''}](${src})`;
                // Keep images inline inside links and table cells, where a
                // newline would break the surrounding Markdown construct.
                const inline = parentTag === 'a' || parentTag === 'td' || parentTag === 'th';
                output += inline ? image : `\n${image}\n`;
            }
            return;
        } else if (tag === 'br') {
            // Two trailing spaces + newline is a Markdown hard line break.
            output += '  \n';
            return;
        } else if (tag === 'hr') {
            output += '\n---\n';
            return;
        }

        output += prefix;
        visitChildren(node);
        output += suffix;
    }

    // If root is a string, parse it inside a detached container.
    // NOTE(review): innerHTML on a detached element does not run <script>,
    // but may still trigger resource loads / inline handlers such as
    // <img onerror>. Sanitize untrusted HTML before passing it in.
    if (typeof root === 'string') {
        const container = document.createElement('div');
        container.innerHTML = root;
        root = container;
    }

    if (root.childNodes) {
        visitChildren(root);
    } else {
        visit(root);
    }

    // Collapse runs of 3+ line breaks (with interleaved whitespace) into one
    // blank line, then strip leading/trailing whitespace.
    return output.replace(/(\n\s*){3,}/g, '\n\n').trim();
}
588+
/**
 * Prepends a YAML frontmatter block to Markdown content.
 *
 * `title` and `url` are written as YAML double-quoted scalars, so
 * backslashes, double quotes and line breaks are escaped to keep the
 * frontmatter parseable regardless of the page's title.
 *
 * @param {string} markdown - Markdown body placed after the frontmatter.
 * @param {Object} [metadata] - Optional document metadata.
 * @param {string} [metadata.title] - Document title; defaults to "Untitled".
 * @param {string} [metadata.url] - Source URL; defaults to an empty string.
 * @param {string} [metadata.date] - Date string; defaults to today's date
 *   (UTC) in YYYY-MM-DD form.
 * @returns {string} The frontmatter, a blank line, then the markdown.
 */
function addFrontmatter(markdown, { title, url, date } = {}) {
    // Escape for a YAML double-quoted scalar. Backslashes must be handled
    // first so the escapes added afterwards are not escaped again.
    const escapeYaml = (value) =>
        String(value)
            .replace(/\\/g, '\\\\')
            .replace(/"/g, '\\"')
            .replace(/\r/g, '\\r')
            .replace(/\n/g, '\\n');

    const safeTitle = escapeYaml(title || 'Untitled');
    const safeUrl = escapeYaml(url || '');
    const safeDate = date || new Date().toISOString().split('T')[0];

    return [
        '---',
        `title: "${safeTitle}"`,
        `url: "${safeUrl}"`,
        `date: ${safeDate}`,
        '---',
        '',
        `${markdown}`,
    ].join('\n');
}
0 commit comments