Skip to content

Commit

Permalink
community[minor]: Add graph store extractors (#24065)
Browse files Browse the repository at this point in the history
This adds an extractor interface and an implementation for HTML pages.
Extractors are used to create GraphVectorStore Links on loaded content.

**Twitter handle:** cbornet_
  • Loading branch information
cbornet authored Jul 11, 2024
1 parent 9bcf8f8 commit 5fc5ef2
Show file tree
Hide file tree
Showing 7 changed files with 321 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
LinkExtractorAdapter,
)

# Public API of the extractors package.
__all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
    "HtmlInput",
    "HtmlLinkExtractor",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
LinkExtractorAdapter,
)

if TYPE_CHECKING:
from bs4 import BeautifulSoup
from bs4.element import Tag


def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
href = link.get("href")
if href is None:
return None
url = urlparse(href)
if url.scheme not in ["http", "https", ""]:
return None

# Join the HREF with the page_url to convert relative paths to absolute.
url = str(urljoin(page_url, href))

# Fragments would be useful if we chunked a page based on section.
# Then, each chunk would have a different URL based on the fragment.
# Since we aren't doing that yet, they just "break" links. So, drop
# the fragment.
if drop_fragments:
return urldefrag(url).url
return url


def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the set of absolute URLs linked from all anchors on a page."""
    found: Set[str] = set()
    for anchor in soup.find_all("a"):
        target = _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        # Skip self links and any 'a' tag that failed to parse
        # (didn't have href, or invalid domain, etc.)
        if target and target != url:
            found.add(target)
    return found


@dataclass
class HtmlInput:
    """Input bundle for `HtmlLinkExtractor`: HTML content plus its base URL."""

    # Raw HTML text, or an already-parsed BeautifulSoup document.
    content: Union[str, BeautifulSoup]
    # URL of the page, used to resolve relative hrefs to absolute URLs.
    base_url: str


class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Extract hyperlinks from HTML content as graph-vector-store links."""

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If `beautifulsoup4` is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar
        link extractors it may be more efficient to call the link extractors directly
        on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata with the URL of
                the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Extract links from a single HTML input.

        Args:
            input: The HTML content (string or parsed soup) plus its base URL.

        Returns:
            One outgoing link per hyperlink found on the page, plus one
            incoming link tagged with the page's own (base) URL.
        """
        content = input.content
        if isinstance(content, str):
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            # Normalize the page's own URL the same way outgoing links are.
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

InputT = TypeVar("InputT")

# Document-metadata key under which extracted links are stored.
METADATA_LINKS_KEY = "links"


class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract the links from a single input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of several inputs.

        Args:
            inputs: The input contents to extract links from.

        Returns:
            Iterable over the sets of links extracted from each input,
            in input order (lazily evaluated).
        """
        return map(self.extract_one, inputs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Callable, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)

InputT = TypeVar("InputT")
UnderlyingInputT = TypeVar("UnderlyingInputT")


class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt an extractor over one input type to accept another.

    Each incoming input is converted with ``transform`` and delegated to the
    ``underlying`` extractor.
    """

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        # Convert a single input, then delegate.
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        # Convert lazily, then delegate the whole batch.
        return self._underlying.extract_many(
            self._transform(one) for one in inputs
        )
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pytest
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import (
HtmlInput,
HtmlLinkExtractor,
)

# Fixture with relative, root-relative, absolute, and scheme-relative links.
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Aboslute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""

# Fixture with a fragment-bearing link; deliberately left as slightly
# malformed HTML (no closing </body>) — presumably to check parser
# tolerance. TODO confirm.
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</html>
"""


@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    """Links from an HTML string resolve against the base URL's scheme."""
    extractor = HtmlLinkExtractor()

    for scheme in ("https", "http"):
        base = f"{scheme}://foo.com/bar/"
        results = extractor.extract_one(HtmlInput(PAGE_1, base_url=base))
        assert results == {
            Link.incoming(kind="hyperlink", tag=base),
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://foo.com/bar/relative"),
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://foo.com/relative-base"),
            Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
            Link.outgoing(kind="hyperlink", tag=f"{scheme}://same.foo"),
        }


@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    """A pre-parsed BeautifulSoup document yields the same links as raw HTML."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(PAGE_1, "html.parser")
    results = HtmlLinkExtractor().extract_one(
        HtmlInput(parsed, base_url="https://foo.com/bar/")
    )

    expected = {
        Link.outgoing(kind="hyperlink", tag=url)
        for url in (
            "https://foo.com/bar/relative",
            "https://foo.com/relative-base",
            "http://cnn.com",
            "https://same.foo",
        )
    }
    expected.add(Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"))
    assert results == expected


@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    """With drop_fragments=True, fragments are stripped from every URL."""
    page_input = HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    results = HtmlLinkExtractor(drop_fragments=True).extract_one(page_input)

    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }


@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    """With drop_fragments=False, fragments are preserved on every URL."""
    page_input = HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    results = HtmlLinkExtractor(drop_fragments=False).extract_one(page_input)

    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }


@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    """extract_many yields one link set per input, in input order."""
    batch = [
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
    ]
    first, second = list(HtmlLinkExtractor().extract_many(batch))

    assert first == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert second == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }

0 comments on commit 5fc5ef2

Please sign in to comment.