11import FirecrawlApp from "@mendable/firecrawl-js" ;
22import dotenv from "dotenv" ;
3- // Removed Together import
43import { z } from "zod" ;
5- // Removed zodToJsonSchema import since we no longer enforce JSON output via Together
64
// Populate process.env from the local .env file before any variable is read.
dotenv.config();

// Firecrawl client for this module; the API key comes from the
// FIRECRAWL_API_KEY environment variable (undefined when it is not set).
const { FIRECRAWL_API_KEY: firecrawlApiKey } = process.env;
const app = new FirecrawlApp({ apiKey: firecrawlApiKey });
119
12- // 1. Define the schema for our expected JSON
10+ // 1. Define the schema for our expected JSON - Added content field
const StorySchema = z.object({
  // Title of the story or post.
  headline: z.string().describe("Story or post headline"),
  // Where the story or post can be read.
  link: z.string().describe("A link to the post or story"),
  // Publication date, kept as a plain string (no format is enforced here).
  date_posted: z
    .string()
    .describe("The date the story or post was published"),
  // Optional: absent when no separate article body is available.
  content: z
    .optional(z.string())
    .describe("The full article content or summary"),
});
1817
1918const StoriesSchema = z . object ( {
@@ -25,9 +24,34 @@ const StoriesSchema = z.object({
// Static type of a single story, derived from StorySchema so the type and
// the runtime validator cannot drift apart.
type Story = z.output<typeof StorySchema>;
2726
/**
 * Scrape a single article page with Firecrawl and return its main content
 * as markdown.
 *
 * This is best-effort: an unsuccessful scrape, an empty result, or a thrown
 * error all yield `undefined` (thrown errors are logged), so one bad link
 * does not reject a surrounding `Promise.all`.
 *
 * @param url - Address of the article to scrape.
 * @param maxLength - Maximum number of UTF-16 code units of markdown to keep;
 *   longer content is cut and suffixed with "...". Defaults to 5000.
 * @returns The (possibly truncated) markdown, or `undefined` when nothing
 *   usable could be scraped.
 */
async function scrapeArticleContent(
  url: string,
  maxLength = 5000,
): Promise<string | undefined> {
  try {
    const scrapeResult = await app.scrapeUrl(url, {
      formats: ["markdown"],
      onlyMainContent: true,
    });

    if (scrapeResult.success && scrapeResult.markdown) {
      const content = scrapeResult.markdown;
      // Normalise the limit so a negative or fractional value cannot produce
      // a surprising cut position.
      const limit = Math.max(0, Math.floor(maxLength));
      if (content.length <= limit) {
        return content;
      }

      // Cap the length to keep downstream payloads (e.g. LLM prompts) small.
      // If the cut would land between the two halves of a surrogate pair,
      // back off by one unit so no lone surrogate is emitted.
      let end = limit;
      const lastUnit = content.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      return content.substring(0, end) + "...";
    }
    return undefined;
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed a message.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`Error scraping content from ${url}:`, reason);
    return undefined;
  }
}
51+
2852/**
2953 * Scrape sources using Firecrawl (for non-Twitter URLs) and the Twitter API.
30- * Returns a combined array of story objects.
54+ * Returns a combined array of story objects with content .
3155 */
3256export async function scrapeSources (
3357 sources : { identifier : string } [ ] ,
@@ -80,6 +104,7 @@ export async function scrapeSources(
80104 headline : tweet . text ,
81105 link : `https://x.com/i/status/${ tweet . id } ` ,
82106 date_posted : tweetStartTime ,
107+ // Tweets don't have separate content, headline is the content
83108 } ) ,
84109 ) ;
85110 combinedText . stories . push ( ...stories ) ;
@@ -134,7 +159,20 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
134159 console . log (
135160 `Found ${ todayStories . stories . length } stories from ${ source } ` ,
136161 ) ;
137- combinedText . stories . push ( ...todayStories . stories ) ;
162+
163+ // Scrape content for each article
164+ const storiesWithContent = await Promise . all (
165+ todayStories . stories . map ( async ( story ) => {
166+ console . log ( `Scraping content from ${ story . link } ...` ) ;
167+ const content = await scrapeArticleContent ( story . link ) ;
168+ return {
169+ ...story ,
170+ content : content ,
171+ } ;
172+ } )
173+ ) ;
174+
175+ combinedText . stories . push ( ...storiesWithContent ) ;
138176 } catch ( error : any ) {
139177 if ( error . statusCode === 429 ) {
140178 console . error (
@@ -150,4 +188,4 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
150188
151189 console . log ( "Combined Stories:" , combinedText . stories ) ;
152190 return combinedText . stories ;
153- }
191+ }
0 commit comments