olimorris · Davidyz · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025
diff --git a/doc/usage/chat-buffer/tools.md b/doc/usage/chat-buffer/tools.md
@@ -100,6 +100,16 @@ This tools enables an LLM to fetch the content from a specific webpage. It will
 Use @{fetch_webpage} to tell me what the latest version on neovim.io is
 ```
 
+This tool supports 3 modes when fetching a website: 
+
+- `text` (default): Returns `document.body.innerText`.
+- `screenshot`: Returns the image URL of a screenshot of the first screen.
+- `pageshot`: Returns the image URL of the full-page screenshot. 
+
+The LLM chooses which mode to use when it calls the tool, and you can ask it to use a specific mode in the chat.
+Keep in mind that the `screenshot` and `pageshot` modes only make sense if you're using a multi-modal LLM, in which case you should also give it the `@{fetch_images}` tool so that it can fetch the screenshot/pageshot from the returned URL.
+
+
 **Options:**
 - `adapter` The adapter used to fetch, process and format the webpage's content (Default: `jina`)
 
@@ -238,6 +248,20 @@ Use @{web_search} to search neovim.io and explain how I can configure a new lang
 
 
 Currently, the tool uses [tavily](https://www.tavily.com) and you'll need to ensure that an API key has been set accordingly, as per the [adapter](https://github.com/olimorris/codecompanion.nvim/blob/main/lua/codecompanion/adapters/tavily.lua).
+This tool also supports image results in the search that can be consumed by multi-modal LLMs.
+To use them, you also need to give the `@{fetch_images}` tool to the LLM so that it can fetch the images from the returned URLs.
+
+### `fetch_images`
+
+This tool allows the LLM to fetch images from URLs. 
+Any URL that directly points to an image would work with this tool. 
+While you could certainly copy-paste URLs to the chat buffer, it's probably more convenient to use this with the `@{web_search}` tool:
+
+```md
+Using the @{web_search} and @{fetch_images} tools, tell me what the logo of codecompanion.nvim looks like.
+```
+
+**You should only use this tool with a multi-modal LLM.**
 
 ## Tool Groups
 

diff --git a/lua/codecompanion/adapters/http/jina.lua b/lua/codecompanion/adapters/http/jina.lua
@@ -30,7 +30,7 @@ return {
 
           self.headers = vim.tbl_deep_extend("force", self.headers, {
             ["Content-Type"] = "application/json",
-            ["X-Return-Format"] = "text",
+            ["X-Return-Format"] = data.content_format or "text",
             ["Accept"] = "application/json",
           })
 
@@ -103,7 +103,11 @@ return {
           end
           return {
             success = "success",
-            content = data.data.text,
+            content = {
+              text = data.data.text,
+              screenshot = data.data.screenshotUrl,
+              pageshot = data.data.pageshotUrl,
+            },
           }
         end,
       },

diff --git a/lua/codecompanion/adapters/http/tavily.lua b/lua/codecompanion/adapters/http/tavily.lua
@@ -44,6 +44,8 @@ return {
               include_answer = opts.include_answer or false,
               include_raw_content = opts.include_raw_content or false,
               include_domains = data.domains,
+              include_images = data.include_images,
+              include_image_descriptions = data.include_images, -- always include descriptions when searching for images.
             }
 
             if opts.topic == "news" then
@@ -75,15 +77,15 @@ return {
           end
 
           -- Process results (move existing output logic here)
-          if body.results == nil or #body.results == 0 then
+          if (body.results == nil or #body.results == 0) and (body.images == nil or #body.images == 0) then
             return {
               status = "error",
               content = "No results found",
             }
           end
 
-          local output = vim
-            .iter(body.results)
+          local text_output = vim
+            .iter(body.results or {})
             :map(function(result)
               return {
                 content = result.content or "",
@@ -93,9 +95,24 @@ return {
             end)
             :totable()
 
+          local images = vim
+            .iter(body.images or {})
+            :map(function(result)
+              -- https://docs.tavily.com/documentation/api-reference/endpoint/search#response-images
+              if type(result) == "string" then
+                return { url = result }
+              elseif type(result) == "table" then
+                return { url = result.url, description = result.description }
+              end
+            end)
+            :totable()
+
           return {
             status = "success",
-            content = output,
+            content = {
+              text = text_output,
+              images = images,
+            },
           }
         end,
       },

diff --git a/lua/codecompanion/config.lua b/lua/codecompanion/config.lua
@@ -164,6 +164,10 @@ local defaults = {
             adapter = "jina",
           },
         },
+        ["fetch_images"] = {
+          callback = "interactions.chat.tools.builtin.fetch_images",
+          description = "Fetches images from the given URL(s).",
+        },
         ["file_search"] = {
           callback = "interactions.chat.tools.builtin.file_search",
           description = "Search for files in the current working directory by glob pattern",

diff --git a/lua/codecompanion/interactions/chat/init.lua b/lua/codecompanion/interactions/chat/init.lua
@@ -953,13 +953,14 @@ end
 
 ---Add an image to the chat buffer
 ---@param image CodeCompanion.Image The image object containing the path and other metadata
----@param opts? {role?: "user"|string, source?: string, bufnr?: integer} Options for adding the image
+---@param opts? {role?: "user"|string, source?: string, bufnr?: integer, add_context?: boolean} Options for adding the image
 ---@return nil
 function Chat:add_image_message(image, opts)
   opts = vim.tbl_deep_extend("force", {
     role = config.constants.USER_ROLE,
     source = "codecompanion.interactions.chat.slash_commands.image",
     bufnr = image.bufnr,
+    add_context = true,
   }, opts or {})
 
   local id = "<image>" .. (image.id or image.path) .. "</image>"
@@ -973,12 +974,14 @@ function Chat:add_image_message(image, opts)
     visible = false,
   })
 
-  self.context:add({
-    bufnr = opts.bufnr,
-    id = id,
-    path = image.path,
-    source = opts.source,
-  })
+  if opts.add_context then
+    self.context:add({
+      bufnr = opts.bufnr,
+      id = id,
+      path = image.path,
+      source = opts.source,
+    })
+  end
 end
 
 ---Apply any tools or variables that a user has tagged in their message

diff --git a/lua/codecompanion/interactions/chat/slash_commands/builtin/image.lua b/lua/codecompanion/interactions/chat/slash_commands/builtin/image.lua
@@ -127,7 +127,7 @@ local choice = {
         return
       end
 
-      image_utils.from_url(url, { chat_bufnr = SlashCommand.Chat.bufnr }, function(_res)
+      image_utils.from_url(url, { chat_bufnr = SlashCommand.Chat.bufnr, from = "slash_command" }, function(_res)
         if type(_res) == "string" then
           return log:error(_res)
         end

diff --git a/lua/codecompanion/interactions/chat/tools/builtin/fetch_images.lua b/lua/codecompanion/interactions/chat/tools/builtin/fetch_images.lua
@@ -0,0 +1,106 @@
+local im_utils = require("codecompanion.utils.images")
+
+---@class CodeCompanion.Tool.FetchImages: CodeCompanion.Tools.Tool
+return {
+  name = "fetch_images",
+  cmds = {
+    ---Fetch every URL in `args.urls` and report all results through `cb` in a single call
+    ---@param tools CodeCompanion.Tools
+    ---@param args {urls: string[]} The arguments from the LLM's tool call
+    ---@param cb function Async callback for completion
+    function(tools, args, _, cb)
+      if args.urls == nil or #args.urls == 0 then
+        return cb({ status = "success", data = {} }) -- no fetch callback would ever fire, so finish now
+      end
+      ---@type table<string, (CodeCompanion.Image|string)> keyed by URL; a string value marks a failed fetch
+      local images = {}
+      local has_image = false
+
+      local processed_count = 0
+      vim.iter(args.urls):each(
+        ---@param url string
+        function(url)
+          im_utils.from_url(url, { chat_bufnr = tools.chat.bufnr, from = "tool" }, function(result)
+            processed_count = processed_count + 1
+            images[url] = result
+            if type(result) == "table" then
+              has_image = true
+            end
+            if processed_count == #args.urls then
+              local status = "success"
+              if not has_image then
+                -- set status to error iff all images failed to load.
+                status = "error"
+              end
+              cb({ status = status, data = images })
+            end
+          end)
+        end
+      )
+    end,
+  },
+  schema = {
+    type = "function",
+    ["function"] = {
+      name = "fetch_images",
+      description = "Fetches images from the given URL(s).",
+      parameters = {
+        type = "object",
+        properties = {
+          urls = {
+            type = "array",
+            items = { type = "string" },
+            description = "The URL of the images to fetch from. The URLs must come from the context or previous tool calls.",
+          },
+        },
+        required = { "urls" },
+      },
+    },
+  },
+  output = {
+    ---@param self CodeCompanion.Tool.FetchImages
+    ---@param tools CodeCompanion.Tools
+    ---@param cmd table The command that was executed
+    ---@param stdout table The output from the command
+    success = function(self, tools, cmd, stdout)
+      local chat = tools.chat
+      local total_count = #(cmd.urls or {})
+      local failed_urls = {}
+
+      ---@type table<string, (CodeCompanion.Image|string)>
+      local results = stdout[#stdout] or {} -- nothing to iterate when no result table was recorded
+      for url, item in pairs(results) do
+        if type(item) == "table" then
+          chat:add_image_message(
+            item,
+            { source = "codecompanion.strategies.chat.tools.fetch_images", add_context = false }
+          )
+        else
+          failed_urls[#failed_urls + 1] = url
+        end
+      end
+
+      if #failed_urls > 0 then
+        chat:add_tool_output(
+          self,
+          "Failed to fetch images from the following URLs: " .. table.concat(failed_urls, ", "),
+          string.format(
+            "Successfully fetched %d images. Failed to fetch from %d URLs",
+            total_count - #failed_urls,
+            #failed_urls
+          )
+        )
+      else
+        chat:add_tool_output(self, string.format("Successfully fetched %d image(s).", total_count))
+      end
+    end,
+
+    ---@param self CodeCompanion.Tool.FetchImages
+    ---@param tools CodeCompanion.Tools
+    ---@param cmd table The command that was executed
+    ---@param stderr table The output from the command
+    error = function(self, tools, cmd, stderr)
+      tools.chat:add_tool_output(self, "Failed to fetch all images.")
+    end,
+  },
+}
diff --git a/lua/codecompanion/interactions/chat/tools/builtin/fetch_webpage.lua b/lua/codecompanion/interactions/chat/tools/builtin/fetch_webpage.lua
@@ -17,6 +17,7 @@ return {
     function(self, args, _, cb)
       local opts = self.tool.opts
       local url = args.url
+      args.content_format = args.content_format or "text"
 
       if not opts or not opts.adapter then
         log:error("[Fetch Webpage Tool] No adapter set for `fetch_webpage`")
@@ -35,6 +36,13 @@ return {
       local adapter = vim.deepcopy(adapters.resolve(tool_adapter))
       adapter.methods.tools.fetch_webpage.setup(adapter, args)
 
+      if args.content_format ~= "text" then
+        if type(self.chat.adapter.opts) == "table" and not self.chat.adapter.opts.vision then
+          log:warn("[Fetcg Webpage Tool] Setting `content_format` to text because the chat adapter disabled vision.")
+          args.content_format = "text"
+        end
+      end
+
       if not url:match("^https?://") then
         log:error("[Fetch Webpage Tool] Invalid URL: `%s`", url)
         return cb({ status = "error", data = fmt("Invalid URL: `%s`", url) })
@@ -58,7 +66,14 @@ return {
                 return cb({ status = "error", data = fmt("Error processing `%s`\n%s", url, output.content) })
               end
 
-              return cb({ status = "success", data = output.content })
+              return cb({
+                status = "success",
+                data = {
+                  text = (args.content_format == "text") and output.content.text or nil,
+                  screenshot = (args.content_format == "screenshot") and output.content.screenshot or nil,
+                  pageshot = (args.content_format == "pageshot") and output.content.pageshot or nil,
+                },
+              })
             end
           end,
         })
@@ -76,8 +91,20 @@ return {
             type = "string",
             description = "The URL of the webpage to fetch content from",
           },
+          content_format = {
+            type = "string",
+            enum = { "text", "screenshot", "pageshot" },
+            description = [[How the result should be presented.
+- `text`: Returns `document.body.innerText`.
+- `screenshot`: Returns the image URL of a screenshot of the first screen.
+- `pageshot`: Returns the image URL of the full-page screenshot.
+Choose `screenshot` or `pageshot` if you need to know the layout, design or image information of the website AND you have vision capability.
+Otherwise, stick to `text`.
+When you receive a URL to the screenshot or pageshot, you should call the `fetch_images` tool to see the image.
+        ]],
+          },
         },
-        required = { "url" },
+        required = { "url", "content_format" },
       },
     },
   },
@@ -90,41 +117,20 @@ return {
       local args = self.args
       local chat = tools.chat
 
-      local content
-      if type(stdout) == "table" then
-        if #stdout == 1 and type(stdout[1]) == "string" then
-          content = stdout[1]
-        elseif #stdout == 1 and type(stdout[1]) == "table" then
-          -- If stdout[1] is a table, try to extract content
-          local first_item = stdout[1]
-          if type(first_item) == "table" and first_item.content then
-            content = first_item.content
-          else
-            -- Fallback: convert to string representation
-            content = vim.inspect(first_item)
-          end
-        else
-          -- Multiple items or other structure
-          content = vim
-            .iter(stdout)
-            :map(function(item)
-              if type(item) == "string" then
-                return item
-              elseif type(item) == "table" and item.content then
-                return item.content
-              else
-                return vim.inspect(item)
-              end
-            end)
-            :join("\n")
-        end
-      else
-        content = tostring(stdout)
+      local llm_output
+      local user_output
+      local output = stdout[#stdout]
+      if type(output.text) == "string" then
+        llm_output = fmt([[<attachment url="%s">%s</attachment>]], args.url, output.text)
+        user_output = fmt("Fetched content from `%s`", args.url)
+      elseif type(output.screenshot) == "string" then
+        llm_output = fmt([[<attachment image_url="%s">Screenshot of %s</attachment>]], output.screenshot, args.url)
+        user_output = fmt("Fetched screenshot of `%s`", args.url)
+      elseif type(output.pageshot) == "string" then
+        llm_output = fmt([[<attachment image_url="%s">Pageshot of %s</attachment>]], output.pageshot, args.url)
+        user_output = fmt("Fetched pageshot of `%s`", args.url)
       end
 
-      local llm_output = fmt([[<attachment url="%s">%s</attachment>]], args.url, content)
-      local user_output = fmt("Fetched content from `%s`", args.url)
-
       chat:add_tool_output(self, llm_output, user_output)
     end,