Commit a5dc6d5
feat: add gh-pages connector
Signed-off-by: Akhilender Bongirwar <[email protected]>
1 parent 0d3c72a commit a5dc6d5

File tree

10 files changed: +368 -1 lines changed

.pre-commit-config.yaml
+1 -1

@@ -3,7 +3,7 @@ repos:
     rev: 23.3.0
     hooks:
       - id: black
-        language_version: python3.11
+        language_version: python3.12
 
   - repo: https://github.com/asottile/reorder_python_imports
     rev: v3.9.0

backend/onyx/configs/constants.py
+1

@@ -134,6 +134,7 @@ class DocumentSource(str, Enum):
     GMAIL = "gmail"
     REQUESTTRACKER = "requesttracker"
     GITHUB = "github"
+    GITHUB_PAGES = "github_pages"
     GITBOOK = "gitbook"
     GITLAB = "gitlab"
     GURU = "guru"
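Because DocumentSource subclasses both str and Enum, the new member is interchangeable with its raw string value, which is what lets it round-trip through configs and API payloads unchanged. A minimal illustration:

from onyx.configs.constants import DocumentSource

# str-valued Enum: members compare equal to their string values.
assert DocumentSource.GITHUB_PAGES == "github_pages"
assert DocumentSource("github_pages") is DocumentSource.GITHUB_PAGES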

backend/onyx/connectors/factory.py
+2

@@ -24,6 +24,7 @@
 from onyx.connectors.freshdesk.connector import FreshdeskConnector
 from onyx.connectors.gitbook.connector import GitbookConnector
 from onyx.connectors.github.connector import GithubConnector
+from onyx.connectors.github_pages.connector import GitHubPagesConnector
 from onyx.connectors.gitlab.connector import GitlabConnector
 from onyx.connectors.gmail.connector import GmailConnector
 from onyx.connectors.gong.connector import GongConnector
@@ -78,6 +79,7 @@ def identify_connector_class(
         InputType.SLIM_RETRIEVAL: SlackConnector,
     },
     DocumentSource.GITHUB: GithubConnector,
+    DocumentSource.GITHUB_PAGES: GitHubPagesConnector,
     DocumentSource.GMAIL: GmailConnector,
     DocumentSource.GITLAB: GitlabConnector,
     DocumentSource.GITBOOK: GitbookConnector,
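With the enum member and factory registration in place, the new connector resolves by source like any other. A minimal sketch of the lookup; note that identify_connector_class may also accept an InputType argument not visible in this hunk, so treat the call below as an approximation:

from onyx.configs.constants import DocumentSource
from onyx.connectors.factory import identify_connector_class

# Resolve the class registered for the source, then construct it with
# its connector-specific config (base_url is the only required field).
connector_cls = identify_connector_class(DocumentSource.GITHUB_PAGES)
connector = connector_cls(base_url="https://username.github.io")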

backend/onyx/connectors/github_pages/__init__.py

Whitespace-only changes.

backend/onyx/connectors/github_pages/connector.py
+169 (new file)

import os
import time
from typing import Any
from typing import List
from typing import Optional
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlunparse

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.utils.logger import setup_logger

logger = setup_logger()

_TIMEOUT = 60
_MAX_DEPTH = 5


class GitHubPagesConnector(LoadConnector, PollConnector):
    def __init__(self, base_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
        self.base_url = base_url
        self.batch_size = batch_size
        self.visited_urls: set[str] = set()
        self.auth: Optional[HTTPBasicAuth] = None

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """
        Optionally use credentials for HTTP Basic Auth.
        For public GitHub Pages sites, these are not required.
        """
        github_username = credentials.get("github_username")
        github_token = credentials.get("github_personal_access_token")
        if not github_username or not github_token:
            logger.warning(
                "GitHub credentials are missing. Requests may fail for private pages."
            )
        self.auth = (
            HTTPBasicAuth(github_username, github_token)
            if github_username and github_token
            else None
        )

    def load_from_state(self, state: dict) -> None:
        """Restore connector state (e.g., already visited URLs)."""
        self.visited_urls = set(state.get("visited_urls", []))

    def _normalize_url(self, url: str) -> str:
        """Strip fragments and query parameters so each page has one canonical URL."""
        parsed = urlparse(url)
        return urlunparse(parsed._replace(fragment="", query=""))

    def _fetch_with_retry(
        self, url: str, retries: int = 3, delay: int = 2
    ) -> Optional[str]:
        """Fetch a URL, retrying transient failures with a fixed delay."""
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
                response.raise_for_status()
                return response.text
            except requests.exceptions.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                # Only sleep between attempts, not after the final failure.
                if attempt < retries - 1:
                    time.sleep(delay)
        logger.error(f"All attempts failed for {url}")
        return None

    def _crawl_github_pages(
        self, url: str, batch_size: int, depth: int = 0
    ) -> List[str]:
        """Crawl pages starting at 'url', following links up to _MAX_DEPTH levels
        deep and returning at most 'batch_size' page URLs per call."""
        # Track each URL together with its link depth so the depth limit
        # is actually enforced while traversing the frontier.
        to_visit: List[tuple[str, int]] = [(url, depth)]
        crawled_urls: List[str] = []

        while to_visit and len(crawled_urls) < batch_size:
            current_url, current_depth = to_visit.pop()
            if current_url in self.visited_urls or current_depth > _MAX_DEPTH:
                continue

            content = self._fetch_with_retry(current_url)
            if content:
                soup = BeautifulSoup(content, "html.parser")
                self.visited_urls.add(current_url)
                crawled_urls.append(current_url)

                # Follow in-domain links only
                for link in soup.find_all("a"):
                    href = link.get("href")
                    if href:
                        full_url = self._normalize_url(urljoin(self.base_url, href))
                        if (
                            full_url.startswith(self.base_url)
                            and full_url not in self.visited_urls
                        ):
                            to_visit.append((full_url, current_depth + 1))
        return crawled_urls

    def _index_pages(self, urls: List[str]) -> List[Document]:
        """Convert a list of URLs into Document objects by fetching their content."""
        documents = []
        for url in urls:
            content = self._fetch_with_retry(url)
            if content:
                soup = BeautifulSoup(content, "html.parser")
                text_content = soup.get_text(separator="\n", strip=True)
                metadata = {
                    "url": url,
                    "crawl_time": str(time.time()),
                    "content_length": str(len(text_content)),
                }
                documents.append(
                    Document(
                        id=url,
                        sections=[Section(link=url, text=text_content)],
                        source=DocumentSource.GITHUB_PAGES,
                        semantic_identifier=url,
                        metadata=metadata,
                    )
                )
        return documents

    def _get_all_crawled_urls(self) -> List[str]:
        """Crawl repeatedly until no new pages are found."""
        all_crawled_urls: List[str] = []
        while True:
            crawled_urls = self._crawl_github_pages(self.base_url, self.batch_size)
            if not crawled_urls:
                break
            all_crawled_urls.extend(crawled_urls)
        return all_crawled_urls

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Yield batches of Document objects from crawled pages."""
        crawled_urls = self._get_all_crawled_urls()
        yield self._index_pages(crawled_urls)

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """
        Poll the source. This simple crawler does not support time filtering,
        so 'start' and 'end' are ignored and all unvisited pages are fetched.
        """
        yield from self._pull_all_pages()


if __name__ == "__main__":
    connector = GitHubPagesConnector(base_url=os.environ["GITHUB_PAGES_BASE_URL"])

    credentials = {
        "github_username": os.getenv("GITHUB_USERNAME", ""),
        "github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
    }
    connector.load_credentials(credentials)

    document_batches = connector.poll_source(0, 0)
    print(next(document_batches))
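Because poll_source skips anything already in visited_urls, a caller that wants incremental crawls across process restarts has to persist that set itself and restore it via load_from_state. A minimal sketch of one way to do that, assuming a local state.json file (the filename and layout are illustrative, not part of the connector):

import json

from onyx.connectors.github_pages.connector import GitHubPagesConnector

connector = GitHubPagesConnector(base_url="https://username.github.io")

# Restore visited URLs from a previous run, if a state file exists.
try:
    with open("state.json") as f:
        connector.load_from_state(json.load(f))
except FileNotFoundError:
    pass

for batch in connector.poll_source(0, 0):
    for doc in batch:
        print(doc.id)

# Persist crawl state so the next run skips already-indexed pages.
with open("state.json", "w") as f:
    json.dump({"visited_urls": sorted(connector.visited_urls)}, f)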

+152 (new file: unit tests for GitHubPagesConnector)

import time
from unittest.mock import MagicMock
from unittest.mock import patch
from urllib.parse import urljoin

import pytest
import requests

from onyx.configs.constants import DocumentSource
from onyx.connectors.github_pages.connector import GitHubPagesConnector
from onyx.connectors.models import Document


@pytest.fixture
def github_pages_connector() -> GitHubPagesConnector:
    connector = GitHubPagesConnector(base_url="https://test.github.io", batch_size=10)
    connector.load_credentials(
        {
            "github_username": "test_user",
            "github_personal_access_token": "test_token",
        }
    )
    return connector


def test_normalize_url(github_pages_connector: GitHubPagesConnector):
    url = "https://test.github.io/page?query=abc#fragment"
    normalized = github_pages_connector._normalize_url(url)
    assert normalized == "https://test.github.io/page"


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_fetch_with_retry_success(
    mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
    fake_response = MagicMock()
    fake_response.status_code = 200
    fake_response.text = "<html>Test page</html>"
    fake_response.raise_for_status.return_value = None
    mock_get.return_value = fake_response

    result = github_pages_connector._fetch_with_retry("https://test.github.io/")
    assert result is not None
    assert "Test page" in result


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_fetch_with_retry_failure(
    mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
    fake_response = MagicMock()
    fake_response.status_code = 404
    fake_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
        "Not Found"
    )
    mock_get.return_value = fake_response

    result = github_pages_connector._fetch_with_retry(
        "https://test.github.io/nonexistent"
    )
    assert result is None


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_crawl_github_pages(
    mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
    base_page_html = "<html><body><a href='/page2'>Link to Page 2</a></body></html>"
    page2_html = "<html><body><p>Content of Page 2</p></body></html>"

    def fake_get(url, timeout, auth):
        fake_resp = MagicMock()
        fake_resp.raise_for_status.return_value = None
        if url.startswith(urljoin("https://test.github.io", "/page2")):
            fake_resp.status_code = 200
            fake_resp.text = page2_html
        else:
            fake_resp.status_code = 200
            fake_resp.text = base_page_html
        return fake_resp

    mock_get.side_effect = fake_get

    crawled_urls = github_pages_connector._crawl_github_pages(
        "https://test.github.io", batch_size=10
    )
    assert (
        "https://test.github.io" in crawled_urls
        or "https://test.github.io/" in crawled_urls
    )
    assert urljoin("https://test.github.io", "page2") in crawled_urls


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_index_pages(mock_get: MagicMock, github_pages_connector: GitHubPagesConnector):
    base_page_html = "<html><body><h1>Base Page</h1></body></html>"
    page2_html = "<html><body><h1>Page 2</h1></body></html>"

    def fake_get(url, timeout, auth):
        fake_resp = MagicMock()
        fake_resp.raise_for_status.return_value = None
        if url.endswith("/page2"):
            fake_resp.status_code = 200
            fake_resp.text = page2_html
        else:
            fake_resp.status_code = 200
            fake_resp.text = base_page_html
        return fake_resp

    mock_get.side_effect = fake_get

    urls = ["https://test.github.io", urljoin("https://test.github.io", "page2")]
    documents = github_pages_connector._index_pages(urls)
    assert len(documents) == 2
    for doc in documents:
        assert isinstance(doc, Document)
        assert doc.source == DocumentSource.GITHUB_PAGES
        # The semantic_identifier here is the URL used to fetch the document.
        assert doc.semantic_identifier in urls


def test_load_from_state(github_pages_connector: GitHubPagesConnector):
    state = {"visited_urls": ["https://test.github.io", "https://test.github.io/page2"]}
    github_pages_connector.load_from_state(state)
    assert "https://test.github.io" in github_pages_connector.visited_urls
    assert "https://test.github.io/page2" in github_pages_connector.visited_urls


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_poll_source(mock_get: MagicMock, github_pages_connector: GitHubPagesConnector):
    base_page_html = "<html><body><a href='/page2'>Link to Page 2</a></body></html>"
    page2_html = "<html><body><p>Content of Page 2</p></body></html>"

    def fake_get(url, timeout, auth):
        fake_resp = MagicMock()
        fake_resp.raise_for_status.return_value = None
        if url.startswith(urljoin("https://test.github.io", "/page2")):
            fake_resp.status_code = 200
            fake_resp.text = page2_html
        else:
            fake_resp.status_code = 200
            fake_resp.text = base_page_html
        return fake_resp

    mock_get.side_effect = fake_get

    generator = github_pages_connector.poll_source(0, time.time())
    batch = next(generator)
    assert isinstance(batch, list)
    assert len(batch) >= 2
    for doc in batch:
        assert isinstance(doc, Document)
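One path the suite does not cover is the unauthenticated case, where load_credentials receives empty values and leaves auth as None. A small addition in the same style could look like this (the fixture and test names are illustrative, reusing the imports already at the top of the file):

@pytest.fixture
def anonymous_connector() -> GitHubPagesConnector:
    connector = GitHubPagesConnector(base_url="https://test.github.io", batch_size=10)
    # Empty credentials: load_credentials logs a warning and leaves auth as None.
    connector.load_credentials({})
    return connector


def test_no_auth_configured(anonymous_connector: GitHubPagesConnector):
    assert anonymous_connector.auth is None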

web/src/lib/connectors/connectors.tsx
+27

@@ -233,6 +233,28 @@ export const connectorConfigs: Record<
     ],
     advanced_values: [],
   },
+  github_pages: {
+    description: "Configure GitHub Pages connector",
+    values: [
+      {
+        type: "text",
+        query:
+          "Enter the base URL of the GitHub Pages site (e.g., https://username.github.io/):",
+        label: "Base URL",
+        name: "base_url",
+        optional: false,
+      },
+      {
+        type: "number",
+        query: "Set the batch size for indexing (default is 10):",
+        label: "Batch Size",
+        name: "batch_size",
+        optional: true,
+        default: 10,
+      },
+    ],
+    advanced_values: [],
+  },
   gitlab: {
     description: "Configure GitLab connector",
     values: [
@@ -1365,6 +1387,11 @@ export interface GithubConfig {
   include_issues: boolean;
 }
 
+export interface GitHubPagesConfig {
+  base_url: string;
+  batch_size?: number;
+}
+
 export interface GitlabConfig {
   project_owner: string;
   project_name: string;
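On submit, these form values become the connector-specific config that the backend hands to the connector's constructor. A hedged sketch of the resulting instantiation on the Python side, assuming the config dict is forwarded as keyword arguments (which matches the constructor signature above):

from onyx.connectors.github_pages.connector import GitHubPagesConnector

# Values collected by the web form above.
config = {"base_url": "https://username.github.io", "batch_size": 10}
connector = GitHubPagesConnector(**config)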
