@@ -584,3 +584,206 @@ export function getNewsCategories(): string {
584584 const categories = [ ...new Set ( active . map ( ( f ) => f . category ) ) ] . sort ( )
585585 return `Categorias: ${ categories . join ( ', ' ) } \nUso: /news [categoria]`
586586}
587+
588+ // ─── News Content Fetcher ───────────────────────────────────
589+
/** Maximum time, in milliseconds, allowed for fetching an article page (15 s). */
const CONTENT_FETCH_TIMEOUT_MS = 15 * 1000
/** Maximum number of response-body bytes accepted for one page (5 MiB). */
const MAX_CONTENT_BYTES = 5 * 1024 ** 2
592+
/**
 * Fetch a news article page and extract its title and readable text.
 *
 * The download is bounded in time (CONTENT_FETCH_TIMEOUT_MS covers both the
 * request and the body read) and in size (MAX_CONTENT_BYTES).
 *
 * @param url - Absolute `http://` or `https://` URL of the article.
 * @returns The extracted `{ title, content }` on success, or a string starting
 *   with `Error:` describing why the page could not be fetched. This function
 *   does not throw for network or HTTP failures.
 */
export async function fetchNewsContent(url: string): Promise<{ title: string; content: string } | string> {
  // Only plain HTTP(S) URLs are accepted.
  // NOTE(review): if `url` can come from untrusted users, requests to internal
  // hosts are not blocked here — confirm whether callers restrict it.
  if (!url.startsWith('http://') && !url.startsWith('https://')) {
    return 'Error: URL invalida'
  }

  const controller = new AbortController()
  // The timer stays armed until `finally`, so a server that sends headers
  // quickly but then stalls the body is still aborted.
  const timeout = setTimeout(() => controller.abort(), CONTENT_FETCH_TIMEOUT_MS)

  try {
    const resp = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'pt-BR,pt;q=0.9,en;q=0.8',
      },
    })

    if (!resp.ok) {
      // Discard the unread body so the underlying connection can be released.
      await resp.body?.cancel().catch(() => undefined)
      return `Error: HTTP ${resp.status}`
    }

    // Fast rejection when the server announces an oversized body.
    const contentLength = resp.headers.get('content-length')
    if (contentLength && Number(contentLength) > MAX_CONTENT_BYTES) {
      await resp.body?.cancel().catch(() => undefined)
      return 'Error: pagina muito grande'
    }

    const reader = resp.body?.getReader()
    if (!reader) {
      return 'Error: sem corpo de resposta'
    }

    // Stream the body and enforce the size cap ourselves, because the
    // content-length header may be absent or wrong.
    const chunks: Uint8Array[] = []
    let totalBytes = 0

    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      totalBytes += value.byteLength
      if (totalBytes > MAX_CONTENT_BYTES) {
        // Await the cancellation so a rejection cannot surface as unhandled.
        await reader.cancel().catch(() => undefined)
        return 'Error: pagina muito grande'
      }
      chunks.push(value)
    }

    const raw = Buffer.concat(chunks)
    const html = decodeHtml(raw, resp.headers.get('content-type'))

    return extractArticleContent(html)
  } catch (err) {
    // The controller is only ever aborted by the timeout timer above.
    if (controller.signal.aborted || (err instanceof Error && err.name === 'AbortError')) {
      return 'Error: timeout ao buscar pagina'
    }
    if (err instanceof Error) {
      return `Error: ${err.message.slice(0, 100)}`
    }
    return 'Error: falha ao buscar pagina'
  } finally {
    clearTimeout(timeout)
  }
}
666+
/**
 * Turn raw page bytes into a string.
 *
 * The charset reported by `detectHtmlEncoding` is tried first; if the decoder
 * rejects it, `latin1` is tried, and finally a non-fatal UTF-8 decoder.
 *
 * @param raw - Undecoded response body.
 * @param contentType - Value of the HTTP `Content-Type` header, if any.
 * @returns The decoded HTML text.
 */
function decodeHtml(raw: Buffer, contentType: string | null): string {
  const detected = detectHtmlEncoding(raw, contentType)

  for (const label of [detected, 'latin1']) {
    try {
      return new TextDecoder(label).decode(raw)
    } catch {
      // Decoder rejected this label — move on to the next candidate.
    }
  }

  return new TextDecoder('utf-8', { fatal: false }).decode(raw)
}
682+
/**
 * Detect the character encoding of an HTML document.
 *
 * Sources are consulted in this order, and the first hit wins:
 *   0. a byte-order mark at the start of the body,
 *   1. the `charset` parameter of the HTTP `Content-Type` header,
 *   2. a `<meta ... charset=...>` tag within the first 2 KB,
 *   3. an `<?xml ... encoding="..."?>` declaration within the first 2 KB.
 *
 * @param raw - Undecoded response body.
 * @param contentType - Value of the HTTP `Content-Type` header, if any.
 * @returns An encoding label; `'utf-8'` when nothing is detected.
 */
function detectHtmlEncoding(raw: Buffer, contentType: string | null): string {
  // 0) Byte-order mark. It is checked first because it describes the bytes
  //    actually sent, and because UTF-16 content can never match the
  //    ASCII-based sniffing below.
  if (raw.length >= 3 && raw[0] === 0xef && raw[1] === 0xbb && raw[2] === 0xbf) {
    return 'utf-8'
  }
  if (raw.length >= 2) {
    if (raw[0] === 0xff && raw[1] === 0xfe) return 'utf-16le'
    if (raw[0] === 0xfe && raw[1] === 0xff) return 'utf-16be'
  }

  // 1) HTTP Content-Type header
  if (contentType) {
    const match = contentType.match(/charset\s*=\s*["']?([^\s;"']+)/i)
    if (match) return normalizeEncoding(match[1])
  }

  // 2) Meta charset in HTML (first 2KB). This also matches the charset given
  //    inside the content attribute of an http-equiv meta tag.
  const head = raw.subarray(0, 2048).toString('ascii')
  const metaMatch = head.match(/<meta[^>]+charset\s*=\s*["']?([^"'\s>]+)/i)
  if (metaMatch) return normalizeEncoding(metaMatch[1])

  // 3) XML-style declaration
  const xmlMatch = head.match(/<\?xml[^?]+encoding\s*=\s*["']([^"']+)["']/i)
  if (xmlMatch) return normalizeEncoding(xmlMatch[1])

  return 'utf-8'
}
704+
/**
 * Extract a title and readable article text from an HTML document.
 *
 * The title comes from the first of `<title>`, `<h1>`, or the `og:title` meta
 * tag that matches, falling back to `'Sem titulo'`.
 *
 * The body is located by trying a series of heuristics. A heuristic is only
 * accepted when it actually yields text; otherwise the next one is tried:
 *   1. the first `<article>` element,
 *   2. a `<div>` with a well-known article class name,
 *   3. a `<div itemprop="articleBody">`,
 *   4. the `<main>` element,
 *   5. every `<p>` in the document longer than 50 characters.
 *
 * @param html - Decoded HTML of the page.
 * @returns The title and content; `content` is a fixed explanatory message
 *   when nothing could be extracted.
 */
function extractArticleContent(html: string): { title: string; content: string } {
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i)
    || html.match(/<h1[^>]*>([^<]+)<\/h1>/i)
    || html.match(/<meta[^>]+property="og:title"[^>]+content="([^"]+)"/i)
  const title = titleMatch ? cleanHtml(titleMatch[1]) : 'Sem titulo'

  // Container heuristics, most specific first.
  const containerPatterns = [
    /<article[^>]*>([\s\S]*?)<\/article>/i,
    /<div[^>]+class="[^"]*(?:article-body|post-content|entry-content|story-body|content-body|article-content|news-content|materia-corpo)[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
    /<div[^>]+itemprop="articleBody"[^>]*>([\s\S]*?)<\/div>/i,
    /<main[^>]*>([\s\S]*?)<\/main>/i,
  ]

  let content = ''
  for (const pattern of containerPatterns) {
    const match = html.match(pattern)
    if (!match || !match[1]) continue
    // A matched container may be a teaser card or an empty wrapper; keep
    // looking when it contributes no text instead of giving up.
    content = extractParagraphs(match[1]).trim()
    if (content) break
  }

  // Last resort: collect every sufficiently long paragraph in the page.
  if (!content) {
    const paragraphs: string[] = []
    const pRegex = /<p[^>]*>([\s\S]*?)<\/p>/gi
    for (let pMatch = pRegex.exec(html); pMatch !== null; pMatch = pRegex.exec(html)) {
      const text = cleanHtml(pMatch[1]).trim()
      // Short paragraphs are likely navigation or ads.
      if (text.length > 50) {
        paragraphs.push(text)
      }
    }
    content = paragraphs.join('\n\n').trim()
  }

  return { title, content: content || 'Nao foi possivel extrair o conteudo do artigo.' }
}
764+
/**
 * Collect the text of the `<p>` elements inside an HTML fragment.
 *
 * Paragraphs whose cleaned text is 20 characters or shorter are dropped. When
 * no paragraph survives, the whole fragment is cleaned and returned if it is
 * longer than 50 characters.
 *
 * @param html - HTML fragment to scan.
 * @returns Paragraphs joined by blank lines, or an empty string when nothing
 *   usable was found.
 */
function extractParagraphs(html: string): string {
  const paragraphPattern = /<p[^>]*>([\s\S]*?)<\/p>/gi
  const kept: string[] = []

  for (let hit = paragraphPattern.exec(html); hit !== null; hit = paragraphPattern.exec(html)) {
    const text = cleanHtml(hit[1]).trim()
    if (text.length <= 20) continue // too short to be real content
    kept.push(text)
  }

  if (kept.length > 0) {
    return kept.join('\n\n')
  }

  // No paragraph markup: fall back to the fragment's plain text.
  const wholeText = cleanHtml(html).trim()
  return wholeText.length > 50 ? wholeText : ''
}
0 commit comments