-
Notifications
You must be signed in to change notification settings - Fork 15k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
community[minor]: Add graph store extractors (#24065)
This adds an extractor interface and an implementation for HTML pages. Extractors are used to create GraphVectorStore Links on loaded content. **Twitter handle:** cbornet_
- Loading branch information
Showing
7 changed files
with
321 additions
and
0 deletions.
There are no files selected for viewing
17 changes: 17 additions & 0 deletions
17
libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from langchain_community.graph_vectorstores.extractors.html_link_extractor import ( | ||
HtmlInput, | ||
HtmlLinkExtractor, | ||
) | ||
from langchain_community.graph_vectorstores.extractors.link_extractor import ( | ||
LinkExtractor, | ||
) | ||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( | ||
LinkExtractorAdapter, | ||
) | ||
|
||
# Public API of the extractors subpackage.
__all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
    "HtmlInput",
    "HtmlLinkExtractor",
]
124 changes: 124 additions & 0 deletions
124
libs/community/langchain_community/graph_vectorstores/extractors/html_link_extractor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING, List, Optional, Set, Union | ||
from urllib.parse import urldefrag, urljoin, urlparse | ||
|
||
from langchain_core.documents import Document | ||
from langchain_core.graph_vectorstores import Link | ||
|
||
from langchain_community.graph_vectorstores.extractors.link_extractor import ( | ||
LinkExtractor, | ||
) | ||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( | ||
LinkExtractorAdapter, | ||
) | ||
|
||
if TYPE_CHECKING: | ||
from bs4 import BeautifulSoup | ||
from bs4.element import Tag | ||
|
||
|
||
def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]: | ||
href = link.get("href") | ||
if href is None: | ||
return None | ||
url = urlparse(href) | ||
if url.scheme not in ["http", "https", ""]: | ||
return None | ||
|
||
# Join the HREF with the page_url to convert relative paths to absolute. | ||
url = str(urljoin(page_url, href)) | ||
|
||
# Fragments would be useful if we chunked a page based on section. | ||
# Then, each chunk would have a different URL based on the fragment. | ||
# Since we aren't doing that yet, they just "break" links. So, drop | ||
# the fragment. | ||
if drop_fragments: | ||
return urldefrag(url).url | ||
return url | ||
|
||
|
||
def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the distinct outgoing URLs from a parsed page.

    Anchors that fail to resolve (no href, unsupported scheme, ...) and
    links pointing back at the page itself are excluded.
    """
    hrefs: Set[str] = set()
    for anchor in soup.find_all("a"):
        target = _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        # Keep only successfully-parsed, non-self links.
        if target and target != url:
            hrefs.add(target)
    return hrefs
|
||
|
||
@dataclass
class HtmlInput:
    """Input to the HTML link extractor: the page content plus its URL."""

    # Raw HTML markup, or an already-parsed BeautifulSoup document.
    content: Union[str, BeautifulSoup]
    # URL the page was fetched from; used to resolve relative hrefs.
    base_url: str
|
||
|
||
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Extract hyperlinks from HTML content.

    For each input page this produces one incoming link tagged with the page's
    own URL and one outgoing link per distinct, non-self hyperlink found in
    the page.
    """

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar
        link extractors it may be more efficient to call the link extractors directly
        on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata with the URL of
                the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Extract the incoming page link and all outgoing hyperlinks.

        Returns:
            A set containing one incoming link for the page URL plus one
            outgoing link per distinct hyperlink found in the content.
        """
        content = input.content
        if isinstance(content, str):
            # Parse lazily; callers that pass a pre-parsed soup skip this cost.
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            # Normalize the page URL the same way hrefs are normalized, so
            # self-link filtering and the incoming tag stay consistent.
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        # The page itself is addressable at base_url: record it as incoming.
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
36 changes: 36 additions & 0 deletions
36
libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from __future__ import annotations | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Generic, Iterable, Set, TypeVar | ||
|
||
from langchain_core.graph_vectorstores import Link | ||
|
||
# Type of input a given LinkExtractor implementation consumes.
InputT = TypeVar("InputT")

# Document-metadata key under which extracted links are stored.
METADATA_LINKS_KEY = "links"
|
||
|
||
class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of the given inputs.

        The default implementation applies :meth:`extract_one` lazily to
        each input; subclasses may override this to batch work.

        Args:
            inputs: The input contents to extract links from.

        Returns:
            Iterable over the set of links extracted from each input.
        """
        return map(self.extract_one, inputs)
27 changes: 27 additions & 0 deletions
27
libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import Callable, Iterable, Set, TypeVar | ||
|
||
from langchain_core.graph_vectorstores import Link | ||
|
||
from langchain_community.graph_vectorstores.extractors.link_extractor import ( | ||
LinkExtractor, | ||
) | ||
|
||
# Input type the adapter exposes to its callers.
InputT = TypeVar("InputT")
# Input type expected by the wrapped (underlying) extractor.
UnderlyingInputT = TypeVar("UnderlyingInputT")
|
||
|
||
class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt a LinkExtractor to accept a different input type.

    Each incoming value is first converted with ``transform`` and then
    handed to the wrapped extractor.
    """

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        # Convert, then delegate a single extraction.
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        # Delegate in bulk so the underlying extractor can batch if it wants.
        return self._underlying.extract_many(map(self._transform, inputs))
Empty file.
Empty file.
117 changes: 117 additions & 0 deletions
117
libs/community/tests/unit_tests/graph_vectorstores/extractors/test_html_link_extractor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import pytest | ||
from langchain_core.graph_vectorstores import Link | ||
|
||
from langchain_community.graph_vectorstores.extractors import ( | ||
HtmlInput, | ||
HtmlLinkExtractor, | ||
) | ||
|
||
# Fixture page exercising relative, root-relative, absolute, and
# scheme-relative hrefs.
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Aboslute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""
|
||
# Fixture page with a single fragment-bearing href (and no closing </body>,
# which the parser tolerates).
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</html>
"""
|
||
|
||
@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    """Raw-HTML input yields the page's incoming link plus resolved hrefs."""
    extractor = HtmlLinkExtractor()

    https_links = extractor.extract_one(
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/")
    )
    assert https_links == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }

    # The scheme-relative "//same.foo" href follows the page's scheme.
    http_links = extractor.extract_one(
        HtmlInput(PAGE_1, base_url="http://foo.com/bar/")
    )
    assert http_links == {
        Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="http://same.foo"),
    }
|
||
|
||
@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    """A pre-parsed soup must yield the same links as the raw HTML string."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(PAGE_1, "html.parser")
    links = HtmlLinkExtractor().extract_one(
        HtmlInput(soup, base_url="https://foo.com/bar/")
    )
    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert links == expected
|
||
|
||
@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    """With drop_fragments=True, fragments vanish from base and href URLs."""
    links = HtmlLinkExtractor(drop_fragments=True).extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
    assert links == expected
|
||
|
||
@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    """With drop_fragments=False, fragments survive in base and href URLs."""
    links = HtmlLinkExtractor(drop_fragments=False).extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }
    assert links == expected
|
||
|
||
@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    """extract_many yields one link set per input, in input order."""
    batch = [
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
    ]
    first, second = list(HtmlLinkExtractor().extract_many(batch))

    assert first == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert second == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }