team-openpm · maccman · Jun 21, 2023
diff --git a/examples/scrape-pricing-plans.ts b/examples/scrape-pricing-plans.ts
@@ -0,0 +1,53 @@
+import { z } from 'zod'
+import { Api, invokable } from '../src/apis'
+import { OpenAiAgent } from '../src/chat-agents/open-ai'
+import { WorkGptRunner } from '../src/runners/workgpt'
+import { haltProgram } from '../src/runners/control'
+import { TextBrowser } from '../src/apis/text-browser'
+
+// Shape of one extracted plan; only the description may be omitted.
+const pricingPlan = z.object({
+  planName: z.string(),
+  planAmount: z.string(),
+  planDescription: z.string().optional(),
+})
+
+/** Control API exposing one invokable action that forwards its argument to haltProgram. */
+export class WorkGptControl extends Api {
+  @invokable({
+    usage: 'Finishes the program. Call when you have an answer.',
+    schema: z.object({ pricingPlans: z.array(pricingPlan) }),
+  })
+  onFinish(result: any) {
+    haltProgram(result)
+  }
+}
+
+/**
+ * Runs the example: a gpt-4-0613 agent gets TextBrowser and WorkGptControl and the result is printed as JSON.
+ */
+async function main() {
+  const agent = new OpenAiAgent({
+    verbose: true,
+    temperature: 0,
+    model: 'gpt-4-0613',
+  })
+
+  const apis = await Promise.all([new TextBrowser(), new WorkGptControl()])
+  const runner = new WorkGptRunner({ agent, apis })
+
+  // The final step names WorkGptControl.onFinish, whose declared schema lists the pricing plans.
+  const result = await runner.runWithDirective(
+    `
+    Your purpose is to extract pricing plans from SAAS service websites.
+    Follow the instructions below. Think step by step.
+    1. Navigate to hubspot.com
+    2. Find the pricing page for their CRM.
+    3. Extract the text of the page to understand the service's different pricing plans.
+    4. Call WorkGptControl.onFinish with the parsed pricing plans.`
+  )
+
+  console.log('Result', JSON.stringify(result, null, 2))
+}
+
+main()
diff --git a/src/apis/text-browser.ts b/src/apis/text-browser.ts
@@ -5,19 +5,74 @@ import puppeteer, { Page } from 'puppeteer'
 
 export class TextBrowser extends Api {
+  /** Loads `url` in a newly launched headless browser and returns extractText's result ('' if nullish). */
   @invokable({
-    usage: `Useful for getting text contents of a website.`,
+    usage: `Useful for getting plain text contents of a website.`,
     schema: z.object({
       url: z.string(),
     }),
   })
-  async browse({ url }: { url: string }): Promise<string> {
+  async getSiteText({ url }: { url: string }): Promise<string> {
     const browser = await puppeteer.launch({ headless: 'new' })
     const page = await browser.newPage()
     await page.goto(url)
     const text = await extractText(page)
     await browser.close()
     return text ?? ''
   }
+
+  /**
+   * Lists the unique links on the page at `url` whose hostname equals the hostname of `url`.
+   * @param url Absolute URL to load; an unparsable value throws before a browser is launched.
+   * @returns Links in document order, each with trimmed link text and an absolute URL.
+   */
+  @invokable({
+    usage: `Get sitemap of a website. Useful for getting a list of pages of a website.`,
+    schema: z.object({ url: z.string() }),
+  })
+  async getSitemap({ url }: { url: string }) {
+    const base = new URL(url).hostname
+    const browser = await puppeteer.launch({ headless: 'new' })
+    let pageUrl: string
+    let links: { href: string | null; linkText: string | null }[]
+    try {
+      const page = await browser.newPage()
+      await page.goto(url)
+      pageUrl = page.url()
+      links = await page.evaluate(() =>
+        Array.from(document.querySelectorAll('a[href]')).map((node) => ({
+          href: node.getAttribute('href'),
+          linkText: node.textContent,
+        }))
+      )
+    } finally {
+      // Close the browser even when navigation or evaluation throws.
+      await browser.close()
+    }
+
+    const seen = new Set<string>()
+    const normalizedLinks: { linkText: string; linkUrl: string }[] = []
+    for (const { href, linkText } of links) {
+      if (!href || !linkText) {
+        continue
+      }
+
+      let resolved: URL
+      try {
+        // Resolve against the loaded page so relative hrefs like "/pricing" are kept.
+        resolved = new URL(href, pageUrl)
+      } catch (e) {
+        continue
+      }
+
+      if (resolved.hostname !== base || seen.has(resolved.href)) {
+        continue
+      }
+      seen.add(resolved.href)
+      normalizedLinks.push({ linkText: linkText.trim(), linkUrl: resolved.href })
+    }
+
+    return normalizedLinks
+  }
 }
 
 async function extractText(page: Page) {