Cập nhật code để lấy nội dung bài viết

thanhtantran · thanhtantran · commit ccb73bb34a37 · 2026-02-05T00:08:17.000+07:00
diff --git a/src/services/scrapeSources.ts b/src/services/scrapeSources.ts
@@ -1,19 +1,18 @@
 import FirecrawlApp from "@mendable/firecrawl-js";
 import dotenv from "dotenv";
-// Removed Together import
 import { z } from "zod";
-// Removed zodToJsonSchema import since we no longer enforce JSON output via Together
 
 dotenv.config();
 
 // Initialize Firecrawl
 const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
 
-// 1. Define the schema for our expected JSON
+// 1. Define the schema for our expected JSON, including the optional article content
 const StorySchema = z.object({
   headline: z.string().describe("Story or post headline"),
   link: z.string().describe("A link to the post or story"),
   date_posted: z.string().describe("The date the story or post was published"),
+  content: z.string().optional().describe("The full article content or summary"),
 });
 
 const StoriesSchema = z.object({
@@ -25,9 +24,34 @@ const StoriesSchema = z.object({
 // Define the TypeScript type for a story using the schema
 type Story = z.infer<typeof StorySchema>;
 
+/**
+ * Scrapes one article URL via Firecrawl and returns its main content as markdown,
+ * truncated to 5000 characters (plus "...") to keep downstream payloads bounded.
+ * Best-effort: a thrown error is logged; any failed or empty scrape yields undefined.
+ */
+async function scrapeArticleContent(url: string): Promise<string | undefined> {
+  const maxLength = 5000;
+  try {
+    const scrapeResult = await app.scrapeUrl(url, {
+      formats: ["markdown"],
+      onlyMainContent: true,
+    });
+    const content = scrapeResult.success ? scrapeResult.markdown : undefined;
+    if (!content) return undefined;
+    return content.length > maxLength
+      ? content.substring(0, maxLength) + "..."
+      : content;
+  } catch (error: unknown) {
+    // Thrown values need not be Error instances, so narrow before reading `.message`.
+    const reason = error instanceof Error ? error.message : String(error);
+    console.error(`Error scraping content from ${url}:`, reason);
+    return undefined;
+  }
+}
+
 /**
  * Scrape sources using Firecrawl (for non-Twitter URLs) and the Twitter API.
- * Returns a combined array of story objects.
+ * Returns a combined array of story objects with content.
  */
 export async function scrapeSources(
   sources: { identifier: string }[],
@@ -80,6 +104,7 @@ export async function scrapeSources(
                 headline: tweet.text,
                 link: `https://x.com/i/status/${tweet.id}`,
                 date_posted: tweetStartTime,
+                // Tweets don't have separate content, headline is the content
               }),
             );
             combinedText.stories.push(...stories);
@@ -134,7 +159,20 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
           console.log(
             `Found ${todayStories.stories.length} stories from ${source}`,
           );
-          combinedText.stories.push(...todayStories.stories);
+
+          // Scrape content for each article
+          const storiesWithContent = await Promise.all(
+            todayStories.stories.map(async (story) => {
+              console.log(`Scraping content from ${story.link}...`);
+              const content = await scrapeArticleContent(story.link);
+              return {
+                ...story,
+                content: content,
+              };
+            })
+          );
+
+          combinedText.stories.push(...storiesWithContent);
         } catch (error: any) {
           if (error.statusCode === 429) {
             console.error(
@@ -150,4 +188,4 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
 
   console.log("Combined Stories:", combinedText.stories);
   return combinedText.stories;
-}
+}