diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py new file mode 100644 index 0000000000000..b9dd564470341 --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py @@ -0,0 +1,17 @@ +from langchain_community.graph_vectorstores.extractors.html_link_extractor import ( + HtmlInput, + HtmlLinkExtractor, +) +from langchain_community.graph_vectorstores.extractors.link_extractor import ( + LinkExtractor, +) +from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( + LinkExtractorAdapter, +) + +__all__ = [ + "LinkExtractor", + "LinkExtractorAdapter", + "HtmlInput", + "HtmlLinkExtractor", +] diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/html_link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/html_link_extractor.py new file mode 100644 index 0000000000000..49ff1703a3a03 --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/html_link_extractor.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional, Set, Union +from urllib.parse import urldefrag, urljoin, urlparse + +from langchain_core.documents import Document +from langchain_core.graph_vectorstores import Link + +from langchain_community.graph_vectorstores.extractors.link_extractor import ( + LinkExtractor, +) +from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( + LinkExtractorAdapter, +) + +if TYPE_CHECKING: + from bs4 import BeautifulSoup + from bs4.element import Tag + + +def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]: + href = link.get("href") + if href is None: + return None + url = urlparse(href) + if url.scheme not in ["http", "https", ""]: + return None + + # Join the HREF 
with the page_url to convert relative paths to absolute. + url = str(urljoin(page_url, href)) + + # Fragments would be useful if we chunked a page based on section. + # Then, each chunk would have a different URL based on the fragment. + # Since we aren't doing that yet, they just "break" links. So, drop + # the fragment. + if drop_fragments: + return urldefrag(url).url + return url + + +def _parse_hrefs( + soup: BeautifulSoup, url: str, drop_fragments: bool = True +) -> Set[str]: + soup_links: List[Tag] = soup.find_all("a") + links: Set[str] = set() + + for link in soup_links: + parse_url = _parse_url(link, page_url=url, drop_fragments=drop_fragments) + # Remove self links and entries for any 'a' tag that failed to parse + # (didn't have href, or invalid domain, etc.) + if parse_url and parse_url != url: + links.add(parse_url) + + return links + + +@dataclass +class HtmlInput: + content: Union[str, BeautifulSoup] + base_url: str + + +class HtmlLinkExtractor(LinkExtractor[HtmlInput]): + def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True): + """Extract hyperlinks from HTML content. + + Expects the input to be an HTML string or a `BeautifulSoup` object. + + Args: + kind: The kind of edge to extract. Defaults to "hyperlink". + drop_fragments: Whether fragments in URLs and links should be + dropped. Defaults to `True`. + """ + try: + import bs4 # noqa:F401 + except ImportError as e: + raise ImportError( + "BeautifulSoup4 is required for HtmlLinkExtractor. " + "Please install it with `pip install beautifulsoup4`." + ) from e + + self._kind = kind + self.drop_fragments = drop_fragments + + def as_document_extractor( + self, url_metadata_key: str = "source" + ) -> LinkExtractor[Document]: + """Return a LinkExtractor that applies to documents. + + NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar + link extractors it may be more efficient to call the link extractors directly + on the parsed BeautifulSoup object. 
+ + Args: + url_metadata_key: The name of the filed in document metadata with the URL of + the document. + """ + return LinkExtractorAdapter( + underlying=self, + transform=lambda doc: HtmlInput( + doc.page_content, doc.metadata[url_metadata_key] + ), + ) + + def extract_one( + self, + input: HtmlInput, # noqa: A002 + ) -> Set[Link]: + content = input.content + if isinstance(content, str): + from bs4 import BeautifulSoup + + content = BeautifulSoup(content, "html.parser") + + base_url = input.base_url + if self.drop_fragments: + base_url = urldefrag(base_url).url + + hrefs = _parse_hrefs(content, base_url, self.drop_fragments) + + links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs} + links.add(Link.incoming(kind=self._kind, tag=base_url)) + return links diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py new file mode 100644 index 0000000000000..2eaa94bd86b23 --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Generic, Iterable, Set, TypeVar + +from langchain_core.graph_vectorstores import Link + +InputT = TypeVar("InputT") + +METADATA_LINKS_KEY = "links" + + +class LinkExtractor(ABC, Generic[InputT]): + """Interface for extracting links (incoming, outgoing, bidirectional).""" + + @abstractmethod + def extract_one(self, input: InputT) -> set[Link]: # noqa: A002 + """Add edges from each `input` to the corresponding documents. + + Args: + input: The input content to extract edges from. + + Returns: + Set of links extracted from the input. + """ + + def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]: + """Add edges from each `input` to the corresponding documents. + + Args: + inputs: The input content to extract edges from. 
+ + Returns: + Iterable over the set of links extracted from the input. + """ + return map(self.extract_one, inputs) diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py new file mode 100644 index 0000000000000..19af73bab980a --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py @@ -0,0 +1,27 @@ +from typing import Callable, Iterable, Set, TypeVar + +from langchain_core.graph_vectorstores import Link + +from langchain_community.graph_vectorstores.extractors.link_extractor import ( + LinkExtractor, +) + +InputT = TypeVar("InputT") +UnderlyingInputT = TypeVar("UnderlyingInputT") + + +class LinkExtractorAdapter(LinkExtractor[InputT]): + def __init__( + self, + underlying: LinkExtractor[UnderlyingInputT], + transform: Callable[[InputT], UnderlyingInputT], + ) -> None: + self._underlying = underlying + self._transform = transform + + def extract_one(self, input: InputT) -> Set[Link]: # noqa: A002 + return self._underlying.extract_one(self._transform(input)) + + def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]: + underlying_inputs = map(self._transform, inputs) + return self._underlying.extract_many(underlying_inputs) diff --git a/libs/community/tests/unit_tests/graph_vectorstores/__init__.py b/libs/community/tests/unit_tests/graph_vectorstores/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/community/tests/unit_tests/graph_vectorstores/extractors/__init__.py b/libs/community/tests/unit_tests/graph_vectorstores/extractors/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_html_link_extractor.py b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_html_link_extractor.py new file mode 100644 index 
# file: libs/community/tests/unit_tests/graph_vectorstores/__init__.py (empty)
# file: libs/community/tests/unit_tests/graph_vectorstores/extractors/
#       __init__.py (empty)
# file: libs/community/tests/unit_tests/graph_vectorstores/extractors/
#       test_html_link_extractor.py
#
# NOTE(review): the HTML fixtures below lost their <a href=...> tags when the
# patch was whitespace-mangled (only the link text survived). They are
# reconstructed here from the link text and the expected Link assertions —
# confirm against the original patch.
import pytest
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)

# Exercises: relative href, root-relative href, absolute href, and a
# scheme-relative ("//") href that inherits the page's scheme.
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Absolute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""

# Single link carrying a fragment, for the drop/include-fragment tests.
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</body>
</html>
"""


@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    extractor = HtmlLinkExtractor()

    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }

    # Same page served over http: the scheme-relative link follows suit.
    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="http://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    from bs4 import BeautifulSoup

    extractor = HtmlLinkExtractor()
    soup = BeautifulSoup(PAGE_1, "html.parser")
    results = extractor.extract_one(HtmlInput(soup, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=True)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }


@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=False)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )

    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }


@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    extractor = HtmlLinkExtractor()
    results = list(
        extractor.extract_many(
            [
                HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
                HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
            ]
        )
    )

    assert results[0] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert results[1] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }