Skip to content

Commit ccb73bb

Browse files
committed
Cập nhật code để lấy nội dung bài viết
1 parent 2ae0e22 commit ccb73bb

File tree

1 file changed

+44
-6
lines changed

1 file changed

+44
-6
lines changed

src/services/scrapeSources.ts

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
11
import FirecrawlApp from "@mendable/firecrawl-js";
22
import dotenv from "dotenv";
3-
// Removed Together import
43
import { z } from "zod";
5-
// Removed zodToJsonSchema import since we no longer enforce JSON output via Together
64

75
dotenv.config();
86

97
// Initialize Firecrawl
108
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
119

12-
// 1. Define the schema for our expected JSON
10+
// 1. Define the schema for our expected JSON - Added content field
1311
// Zod schema for a single story. `headline`, `link` and `date_posted` are
// required strings; `content` is an optional string described as the full
// article content or summary.
const StorySchema = z.object({
1412
headline: z.string().describe("Story or post headline"),
1513
link: z.string().describe("A link to the post or story"),
1614
date_posted: z.string().describe("The date the story or post was published"),
15+
content: z.string().optional().describe("The full article content or summary"),
1716
});
1817

1918
const StoriesSchema = z.object({
@@ -25,9 +24,34 @@ const StoriesSchema = z.object({
2524
// Define the TypeScript type for a story using the schema
2625
type Story = z.infer<typeof StorySchema>;
2726

27+
/**
 * Scrape a single article URL with Firecrawl and return its main content as
 * markdown.
 *
 * This is best-effort: any thrown error is logged and reported as `undefined`
 * instead of propagating to the caller.
 *
 * @param url - Address of the article to scrape.
 * @param maxLength - Maximum number of UTF-16 code units of markdown to keep
 *   before "..." is appended. Defaults to 5000.
 * @returns The (possibly truncated) markdown, or `undefined` when the scrape
 *   is unsuccessful, yields no markdown, or throws.
 */
async function scrapeArticleContent(
  url: string,
  maxLength = 5000,
): Promise<string | undefined> {
  try {
    const scrapeResult = await app.scrapeUrl(url, {
      formats: ["markdown"],
      onlyMainContent: true,
    });

    if (scrapeResult.success && scrapeResult.markdown) {
      const content = scrapeResult.markdown;
      if (content.length <= maxLength) {
        return content;
      }

      // Limit content length to avoid token limits.
      let end = maxLength;
      // Never cut between the two halves of a surrogate pair: if the last
      // kept code unit is a high surrogate, drop it so the result stays
      // well-formed UTF-16.
      const lastUnit = content.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      return content.substring(0, end) + "...";
    }
    return undefined;
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error; narrow before reading
    // `.message` so non-Error throws are still logged meaningfully.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`Error scraping content from ${url}:`, message);
    return undefined;
  }
}
51+
2852
/**
2953
* Scrape sources using Firecrawl (for non-Twitter URLs) and the Twitter API.
30-
* Returns a combined array of story objects.
54+
* Returns a combined array of story objects with content.
3155
*/
3256
export async function scrapeSources(
3357
sources: { identifier: string }[],
@@ -80,6 +104,7 @@ export async function scrapeSources(
80104
headline: tweet.text,
81105
link: `https://x.com/i/status/${tweet.id}`,
82106
date_posted: tweetStartTime,
107+
// Tweets don't have separate content, headline is the content
83108
}),
84109
);
85110
combinedText.stories.push(...stories);
@@ -134,7 +159,20 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
134159
console.log(
135160
`Found ${todayStories.stories.length} stories from ${source}`,
136161
);
137-
combinedText.stories.push(...todayStories.stories);
162+
163+
// Scrape content for each article
164+
const storiesWithContent = await Promise.all(
165+
todayStories.stories.map(async (story) => {
166+
console.log(`Scraping content from ${story.link}...`);
167+
const content = await scrapeArticleContent(story.link);
168+
return {
169+
...story,
170+
content: content,
171+
};
172+
})
173+
);
174+
175+
combinedText.stories.push(...storiesWithContent);
138176
} catch (error: any) {
139177
if (error.statusCode === 429) {
140178
console.error(
@@ -150,4 +188,4 @@ Return only pure JSON in the specified format (no extra text, no markdown, no \`
150188

151189
console.log("Combined Stories:", combinedText.stories);
152190
return combinedText.stories;
153-
}
191+
}

0 commit comments

Comments
 (0)