# Source: lib/llms_txt_builder.rb — coredevices/sdk-docs @ 1fd1f92965206cade573c054ec386e8043f911c4

require 'fileutils'

module LlmsTxt
  # Builds LLM-friendly plain-text exports from the content of a Jekyll site.
  #
  # Every exported page becomes a small Markdown section (title heading,
  # absolute URL, cleaned body). Sections of a combined export are joined
  # with a SEPARATOR line.
  class Builder
    SEPARATOR = '---'.freeze

    # Text inserted between two formatted pages in a combined export.
    PAGE_JOINER = "\n\n#{SEPARATOR}\n\n".freeze

    # Pages whose whitespace-stripped raw content is shorter than this many
    # characters are treated as stubs and skipped.
    MIN_CONTENT_LENGTH = 100

    # A whole `{% comment %}...{% endcomment %}` block, body included. Also
    # matches the whitespace-control forms (`{%- comment -%}`).
    LIQUID_COMMENT_BLOCK = /\{%-?\s*comment\s*-?%\}.*?\{%-?\s*endcomment\s*-?%\}/m

    # Any single Liquid tag (`{% ... %}`), possibly spanning several lines.
    LIQUID_TAG = /\{%.*?%\}/m

    # Any Liquid output expression (`{{ ... }}`), possibly spanning several lines.
    LIQUID_OUTPUT = /\{\{.*?\}\}/m

    # site - the Jekyll site object. This class reads its `config`,
    #        `collections`, `pages`, `source` and `static_files`.
    def initialize(site)
      @site = site
      # Page URLs start with "/", so drop a trailing slash from the configured
      # site URL to avoid emitting "https://host//path".
      @base_url = (site.config['url'] || '').chomp('/')
    end

    # Process a single Jekyll collection and return its formatted content.
    #
    # Returns '' when the collection does not exist or every document in it
    # is excluded.
    def process_collection(collection_name)
      process_collections([collection_name])
    end

    # Process several collections, in the given order, and return their
    # formatted content as one string. Unknown collection names are skipped.
    def process_collections(collection_names)
      docs = collection_names.flat_map { |name| @site.collections[name]&.docs || [] }
      join_pages(docs)
    end

    # Process the site's pages whose URL matches url_pattern (anything
    # accepted by String#match?) and return their formatted content.
    def process_pages(url_pattern)
      join_pages(@site.pages.select { |page| page.url.match?(url_pattern) })
    end

    # Format a single page/document as:
    #
    #   # <title>
    #   URL: <absolute url>
    #   <blank line>
    #   <cleaned content>
    def format_page(page)
      [
        "# #{extract_title(page)}",
        "URL: #{build_url(page.url)}",
        '',
        extract_content(page)
      ].join("\n")
    end

    # Write content to <site source>/../tmp/llms/<filename> and register the
    # file with Jekyll.
    #
    # filename     - name of the file to create inside the llms directory.
    # content      - text to write.
    # section_name - optional heading; when given, a short introductory header
    #                followed by a SEPARATOR line is written before content.
    def write_file(filename, content, section_name = nil)
      # Files are staged in a temp directory next to the site source.
      tmp_dir = File.join(@site.source, '../tmp/')
      llms_dir = File.join(tmp_dir, 'llms')
      FileUtils.mkdir_p(llms_dir)

      lines = section_header(section_name) << content
      File.write(File.join(llms_dir, filename), lines.join("\n"))

      # Register the file with Jekyll as a static file so it gets copied to output
      @site.static_files << Jekyll::StaticFile.new(@site, tmp_dir, 'llms', filename)

      Jekyll.logger.info('LLMS.txt:', "Generated #{filename}")
    end

    private

    # Drop excluded pages, format the remaining ones and join them with
    # PAGE_JOINER. Returns '' when nothing is left.
    def join_pages(pages)
      pages.reject { |page| should_exclude?(page) }
           .map { |page| format_page(page) }
           .join(PAGE_JOINER)
    end

    # Header lines written at the top of a section file, or an empty array
    # when no section name was given.
    def section_header(section_name)
      return [] unless section_name

      [
        "# #{section_name}",
        '',
        "This file contains all #{section_name.downcase} from the Pebble Developer Documentation.",
        '',
        SEPARATOR,
        ''
      ]
    end

    # Check if a page should be excluded from LLM files.
    def should_exclude?(page)
      # Only a literal `true` opts a page out; other values are ignored.
      return true if page.data['llms_exclude'] == true

      # Redirect pages have no useful content.
      return true if page.data['layout'] == 'redirect'

      # Very short pages are likely stubs or redirects. The length is measured
      # on the raw content, before Liquid markup is removed.
      (page.content || '').strip.length < MIN_CONTENT_LENGTH
    end

    # Title for a page: front-matter `title`, then `name`, then the basename
    # of the page URL without its extension.
    def extract_title(page)
      page.data['title'] || page.data['name'] || File.basename(page.url, '.*')
    end

    # Build the full URL for a site-relative path.
    def build_url(path)
      "#{@base_url}#{path}"
    end

    # Extract a page's content with Liquid markup removed and blank-line runs
    # collapsed. Returns '' for a page without content.
    def extract_content(page)
      content = page.content || ''

      # Comment blocks go first and go entirely: Liquid never renders their
      # body, so stripping only the delimiters would leak it into the output.
      content = content.gsub(LIQUID_COMMENT_BLOCK, '')

      # Remove the remaining liquid tags and variables (including multiline).
      content = content.gsub(LIQUID_TAG, '').gsub(LIQUID_OUTPUT, '')

      # Collapse runs of three or more newlines into one blank line.
      content.gsub(/\n{3,}/, "\n\n").strip
    end
  end
end