GitHub Pages Connector #4233

Open · wants to merge 6 commits into main
1 change: 1 addition & 0 deletions backend/onyx/configs/constants.py
@@ -140,6 +140,7 @@ class DocumentSource(str, Enum):
GMAIL = "gmail"
REQUESTTRACKER = "requesttracker"
GITHUB = "github"
GITHUB_PAGES = "github_pages"
GITBOOK = "gitbook"
GITLAB = "gitlab"
GURU = "guru"
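Because DocumentSource subclasses str, the new member compares equal to its string value and can be reconstructed from a persisted config string. A minimal sketch, using only names visible in the diff above:

from onyx.configs.constants import DocumentSource

# str-backed enum: members compare equal to their values, and the stored
# string round-trips back to the member when config is loaded.
assert DocumentSource.GITHUB_PAGES == "github_pages"
assert DocumentSource("github_pages") is DocumentSource.GITHUB_PAGES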
2 changes: 2 additions & 0 deletions backend/onyx/connectors/factory.py
@@ -25,6 +25,7 @@
from onyx.connectors.freshdesk.connector import FreshdeskConnector
from onyx.connectors.gitbook.connector import GitbookConnector
from onyx.connectors.github.connector import GithubConnector
from onyx.connectors.github_pages.connector import GitHubPagesConnector
from onyx.connectors.gitlab.connector import GitlabConnector
from onyx.connectors.gmail.connector import GmailConnector
from onyx.connectors.gong.connector import GongConnector
@@ -80,6 +81,7 @@ def identify_connector_class(
InputType.SLIM_RETRIEVAL: SlackConnector,
},
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GITHUB_PAGES: GitHubPagesConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GITBOOK: GitbookConnector,
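The mapping above is how the factory resolves a DocumentSource to its connector class. A hedged sketch of the lookup; the full identify_connector_class signature is only partially visible in this diff, so the single-argument call is an assumption:

from onyx.configs.constants import DocumentSource
from onyx.connectors.factory import identify_connector_class
from onyx.connectors.github_pages.connector import GitHubPagesConnector

# Assumed call shape: the function may also take an InputType in practice
# (the SLACK entry above maps InputType values to connector classes).
connector_cls = identify_connector_class(DocumentSource.GITHUB_PAGES)
assert connector_cls is GitHubPagesConnector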
Empty file.
169 changes: 169 additions & 0 deletions backend/onyx/connectors/github_pages/connector.py
@@ -0,0 +1,169 @@
import os
import time
from typing import Any
from typing import List
from typing import Optional
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlunparse

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.utils.logger import setup_logger

logger = setup_logger()

_TIMEOUT = 60
_MAX_DEPTH = 5


class GitHubPagesConnector(LoadConnector, PollConnector):
    def __init__(self, base_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
        self.base_url = base_url
        self.batch_size = batch_size
        self.visited_urls: set[str] = set()
        self.auth: Optional[HTTPBasicAuth] = None
        # Frontier of (url, depth) pairs kept on the instance so that links
        # discovered beyond one batch are not lost between crawl calls.
        self.to_visit: List[tuple[str, int]] = []

def load_credentials(self, credentials: dict[str, Any]) -> None:
"""
Optionally use credentials for HTTP Basic Auth.
For public GitHub Pages, these are not required.
"""
github_username = credentials.get("github_username")
github_token = credentials.get("github_personal_access_token")
if not github_username or not github_token:
logger.warning(
"GitHub credentials are missing. Requests may fail for private pages."
)
self.auth = (
HTTPBasicAuth(github_username, github_token)
if github_username and github_token
else None
)

def load_from_state(self, state: dict) -> None:
"""Restore connector state (e.g., already visited URLs)."""
self.visited_urls = set(state.get("visited_urls", []))

def _normalize_url(self, url: str) -> str:
"""Remove fragments and query parameters for uniformity."""
parsed = urlparse(url)
return urlunparse(parsed._replace(fragment="", query=""))

    def _fetch_with_retry(
        self, url: str, retries: int = 3, delay: int = 2
    ) -> Optional[str]:
        """Fetch a URL, retrying transient failures with a fixed delay."""
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=_TIMEOUT, auth=self.auth)
                response.raise_for_status()
                return response.text
            except requests.exceptions.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                # Only sleep if another attempt is coming.
                if attempt < retries - 1:
                    time.sleep(delay)
        logger.error(f"All {retries} attempts failed for {url}")
        return None

    def _crawl_github_pages(
        self, url: str, batch_size: int, depth: int = 0
    ) -> List[str]:
        """Crawl up to batch_size pages, following in-domain links to _MAX_DEPTH."""
        # Seed the persistent frontier on the first call; later calls resume
        # from links discovered during earlier batches.
        if not self.to_visit:
            self.to_visit = [(url, depth)]
        crawled_urls: List[str] = []

        while self.to_visit and len(crawled_urls) < batch_size:
            current_url, current_depth = self.to_visit.pop()
            # Depth is tracked per URL so the _MAX_DEPTH limit applies as the
            # crawl fans out, not just to the start page.
            if current_url in self.visited_urls or current_depth > _MAX_DEPTH:
                continue

            content = self._fetch_with_retry(current_url)
            if content:
                soup = BeautifulSoup(content, "html.parser")
                self.visited_urls.add(current_url)
                crawled_urls.append(current_url)

                # Follow in-domain links one level deeper.
                for link in soup.find_all("a"):
                    href = link.get("href")
                    if href:
                        full_url = self._normalize_url(urljoin(self.base_url, href))
                        if (
                            full_url.startswith(self.base_url)
                            and full_url not in self.visited_urls
                        ):
                            self.to_visit.append((full_url, current_depth + 1))
        return crawled_urls

def _index_pages(self, urls: List[str]) -> List[Document]:
"""Convert a list of URLs into Document objects by fetching their content."""
documents = []
for url in urls:
content = self._fetch_with_retry(url)
if content:
soup = BeautifulSoup(content, "html.parser")
text_content = soup.get_text(separator="\n", strip=True)
metadata = {
"url": url,
"crawl_time": str(time.time()),
"content_length": str(len(text_content)),
}
documents.append(
Document(
id=url,
sections=[Section(link=url, text=text_content)],
source=DocumentSource.GITHUB_PAGES,
semantic_identifier=url,
metadata=metadata,
)
)
return documents

def _get_all_crawled_urls(self) -> List[str]:
"""Crawl repeatedly until no new pages are found."""
all_crawled_urls: List[str] = []
while True:
crawled_urls = self._crawl_github_pages(self.base_url, self.batch_size)
if not crawled_urls:
break
all_crawled_urls.extend(crawled_urls)
return all_crawled_urls

    def _pull_all_pages(self) -> GenerateDocumentsOutput:
        """Yield all crawled pages as a single batch of Document objects."""
        crawled_urls = self._get_all_crawled_urls()
        yield self._index_pages(crawled_urls)

def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
"""
Poll the source. This simple crawler does not support time filtering.
"""
yield from self._pull_all_pages()


if __name__ == "__main__":
connector = GitHubPagesConnector(base_url=os.environ["GITHUB_PAGES_BASE_URL"])

credentials = {
"github_username": os.getenv("GITHUB_USERNAME", ""),
"github_personal_access_token": os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN", ""),
}
connector.load_credentials(credentials)

document_batches = connector.poll_source(0, 0)
print(next(document_batches))
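For completeness, a minimal consumption sketch of the connector defined above; the base URL is a placeholder, and public Pages sites need no credentials:

import time

from onyx.connectors.github_pages.connector import GitHubPagesConnector

connector = GitHubPagesConnector(base_url="https://example.github.io", batch_size=5)
connector.load_credentials({})  # public sites: auth stays None

# poll_source ignores the time window (the crawler has no change feed),
# so any start/end values produce a full crawl.
for batch in connector.poll_source(0, time.time()):
    for doc in batch:
        print(doc.id, len(doc.sections[0].text))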
152 changes: 152 additions & 0 deletions (new test file; path not shown)
@@ -0,0 +1,152 @@
import time
from unittest.mock import MagicMock
from unittest.mock import patch
from urllib.parse import urljoin

import pytest
import requests

from onyx.configs.constants import DocumentSource
from onyx.connectors.github_pages.connector import GitHubPagesConnector
from onyx.connectors.models import Document


@pytest.fixture
def github_pages_connector() -> GitHubPagesConnector:
connector = GitHubPagesConnector(base_url="https://test.github.io", batch_size=10)
connector.load_credentials(
{
"github_username": "test_user",
"github_personal_access_token": "test_token",
}
)
return connector


def test_normalize_url(github_pages_connector: GitHubPagesConnector):
url = "https://test.github.io/page?query=abc#fragment"
normalized = github_pages_connector._normalize_url(url)
assert normalized == "https://test.github.io/page"


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_fetch_with_retry_success(
mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
fake_response = MagicMock()
fake_response.status_code = 200
fake_response.text = "<html>Test page</html>"
fake_response.raise_for_status.return_value = None
mock_get.return_value = fake_response

result = github_pages_connector._fetch_with_retry("https://test.github.io/")
assert result is not None
assert "Test page" in result


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_fetch_with_retry_failure(
mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
fake_response = MagicMock()
fake_response.status_code = 404
fake_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
"Not Found"
)
mock_get.return_value = fake_response

result = github_pages_connector._fetch_with_retry(
"https://test.github.io/nonexistent"
)
assert result is None


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_crawl_github_pages(
mock_get: MagicMock, github_pages_connector: GitHubPagesConnector
):
base_page_html = "<html><body><a href='/page2'>Link to Page 2</a></body></html>"
page2_html = "<html><body><p>Content of Page 2</p></body></html>"

def fake_get(url, timeout, auth):
fake_resp = MagicMock()
fake_resp.raise_for_status.return_value = None
if url.startswith(urljoin("https://test.github.io", "/page2")):
fake_resp.status_code = 200
fake_resp.text = page2_html
else:
fake_resp.status_code = 200
fake_resp.text = base_page_html
return fake_resp

mock_get.side_effect = fake_get

crawled_urls = github_pages_connector._crawl_github_pages(
"https://test.github.io", batch_size=10
)
assert (
"https://test.github.io" in crawled_urls
or "https://test.github.io/" in crawled_urls
)
assert urljoin("https://test.github.io", "page2") in crawled_urls


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_index_pages(mock_get: MagicMock, github_pages_connector: GitHubPagesConnector):
base_page_html = "<html><body><h1>Base Page</h1></body></html>"
page2_html = "<html><body><h1>Page 2</h1></body></html>"

def fake_get(url, timeout, auth):
fake_resp = MagicMock()
fake_resp.raise_for_status.return_value = None
if url.endswith("/page2"):
fake_resp.status_code = 200
fake_resp.text = page2_html
else:
fake_resp.status_code = 200
fake_resp.text = base_page_html
return fake_resp

mock_get.side_effect = fake_get

urls = ["https://test.github.io", urljoin("https://test.github.io", "page2")]
documents = github_pages_connector._index_pages(urls)
assert len(documents) == 2
for doc in documents:
assert isinstance(doc, Document)
assert doc.source == DocumentSource.GITHUB_PAGES
# The semantic_identifier here is the URL used to fetch the document.
assert doc.semantic_identifier in urls


def test_load_from_state(github_pages_connector: GitHubPagesConnector):
state = {"visited_urls": ["https://test.github.io", "https://test.github.io/page2"]}
github_pages_connector.load_from_state(state)
assert "https://test.github.io" in github_pages_connector.visited_urls
assert "https://test.github.io/page2" in github_pages_connector.visited_urls


@patch("onyx.connectors.github_pages.connector.requests.get")
def test_poll_source(mock_get: MagicMock, github_pages_connector: GitHubPagesConnector):
base_page_html = "<html><body><a href='/page2'>Link to Page 2</a></body></html>"
page2_html = "<html><body><p>Content of Page 2</p></body></html>"

def fake_get(url, timeout, auth):
fake_resp = MagicMock()
fake_resp.raise_for_status.return_value = None
if url.startswith(urljoin("https://test.github.io", "/page2")):
fake_resp.status_code = 200
fake_resp.text = page2_html
else:
fake_resp.status_code = 200
fake_resp.text = base_page_html
return fake_resp

mock_get.side_effect = fake_get

generator = github_pages_connector.poll_source(0, time.time())
batch = next(generator)
assert isinstance(batch, list)
assert len(batch) >= 2
for doc in batch:
assert isinstance(doc, Document)
27 changes: 27 additions & 0 deletions web/src/lib/connectors/connectors.tsx
@@ -233,6 +233,28 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
github_pages: {
description: "Configure GitHub Pages connector",
values: [
{
type: "text",
query:
"Enter the base URL of the GitHub Pages site (e.g., https://username.github.io/):",
label: "Base URL",
name: "base_url",
optional: false,
},
{
type: "number",
query: "Set the batch size for indexing (default is 10):",
label: "Batch Size",
name: "batch_size",
optional: true,
default: 10,
},
],
advanced_values: [],
},
gitlab: {
description: "Configure GitLab connector",
values: [
@@ -1406,6 +1428,11 @@ export interface GithubConfig {
include_issues: boolean;
}

export interface GitHubPagesConfig {
  base_url: string;
  batch_size?: number;
}

export interface GitlabConfig {
project_owner: string;
project_name: string;