Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions tests/parsers/test_url_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,11 @@ def test_url_parser(engine):
logger.info(f"Test for engine '{engine}' took {run_time:.4f} seconds")


def test_crw_engine_registered():
    """Check that "crw" is listed among ``UrlParser.SUPPORTED_ENGINES``.

    fastCRW (crw) is a Firecrawl-compatible engine and, like firecrawl, needs
    network access and credentials to actually run. This test therefore only
    verifies that the engine name is registered, without parsing any URL.
    """
    registered_engines = UrlParser.SUPPORTED_ENGINES
    assert "crw" in registered_engines


if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly reports
    # test failures to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__]))
7 changes: 6 additions & 1 deletion wisup_e2m/configs/parsers/url_parser_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,9 @@

class UrlParserConfig(BaseParserConfig):
    """Parser configuration that adds API credential and endpoint fields
    on top of ``BaseParserConfig``.
    """

    # Single declaration of the credential field; both fields default to None.
    api_key: Optional[str] = Field(None, description="API key for FireCrawl / fastCRW API")
    # Optional endpoint override; see the field description for its meaning.
    api_url: Optional[str] = Field(
        None,
        description="Base URL for the fastCRW (crw) engine. Defaults to the managed "
        "cloud (https://fastcrw.com/api); set to a self-hosted server to override.",
    )
40 changes: 40 additions & 0 deletions wisup_e2m/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ def _load_engine(self):
self._load_pandoc_engine()
elif self.config.engine == "firecrawl":
self._load_firecrawl_engine()
elif self.config.engine == "crw":
self._load_crw_engine()

def _load_surya_layout_engine(self):
logger.info("Loading Surya engine...")
Expand Down Expand Up @@ -305,6 +307,44 @@ def _load_firecrawl_engine(self):

self.firecrawl_app = FirecrawlApp(api_key=self.config.api_key) # FIRECRAWL_API_KEY

def _load_crw_engine(self):
    """
    Build the client for the fastCRW (crw) engine and store it on ``self.crw_app``.

    fastCRW is a Firecrawl-compatible web scraper (self-hosted or managed cloud),
    so the ``FirecrawlApp`` client is reused and pointed at a fastCRW base URL.

    Settings are resolved in this order:
      - base URL: ``self.config.api_url`` when set, otherwise the managed cloud
        endpoint ``https://fastcrw.com/api``;
      - API key: ``self.config.api_key`` when set, otherwise the value of the
        ``CRW_API_KEY`` environment variable (``None`` if that is unset too).

    demo:
        from firecrawl import FirecrawlApp

        app = FirecrawlApp(
            api_url="https://fastcrw.com/api",
            api_key="<CRW_API_KEY>",
        )

        crawl_result = app.crawl_url(
            "https://alexyancey.com/lost-airpods"
        )

        # Get the markdown
        for result in crawl_result:
            print(result["markdown"])

    :raises ImportError: if the ``firecrawl`` client package cannot be imported
    """
    import os

    try:
        from firecrawl import FirecrawlApp
    except ImportError:
        message = (
            "Firecrawl client not installed. The crw engine reuses the "
            "Firecrawl-compatible client; please install it by `pip install firecrawl`"
        )
        # Drop the original traceback context; the message above is the actionable part.
        raise ImportError(message) from None

    # Fall back to the managed cloud when no (or an empty) base URL is configured.
    endpoint = self.config.api_url
    if not endpoint:
        endpoint = "https://fastcrw.com/api"

    # An explicitly configured key wins over the environment variable.
    credential = self.config.api_key
    if not credential:
        credential = os.environ.get("CRW_API_KEY")

    self.crw_app = FirecrawlApp(api_url=endpoint, api_key=credential)

def _load_pandoc_engine(self):
import shutil

Expand Down
59 changes: 57 additions & 2 deletions wisup_e2m/parsers/doc/url_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@


class UrlParser(BaseParser):
SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl"]
SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl", "crw"]
SUPPORTED_FILE_TYPES = ["url"]

def __init__(self, config: Optional[BaseParserConfig] = None, **config_kwargs):
"""
:param config: BaseParserConfig

:param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl']
:param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl', 'crw']
:param api_key: str, the api key for the firecrawl or crw engine
:param langs: List[str], the languages to use for parsing, default is ['en', 'zh']
:param client_timeout: int, the client timeout, default is 30
Expand Down Expand Up @@ -187,6 +187,52 @@ def _parse_by_firecrawl(
relative_path=relative_path,
)

def _parse_by_crw(
    self,
    url: str = None,
    include_image_link_in_text: bool = True,
    download_image: bool = False,
    work_dir: str = "./",
    image_dir: str = "./figures",
    relative_path: bool = True,
):
    """
    Crawl ``url`` with the crw engine and convert the markdown into parsed data.

    The client stored on ``self.crw_app`` is asked to crawl the URL. The
    ``"markdown"`` entry of every returned item is joined with newlines, and the
    combined text is handed to ``_prepare_jina_data_to_e2m_parsed_data``.

    demo:
        from firecrawl import FirecrawlApp

        # fastCRW is Firecrawl-compatible; point the client at the fastCRW base URL.
        app = FirecrawlApp(
            api_url="https://fastcrw.com/api",
            api_key="<CRW_API_KEY>",
        )

        crawl_result = app.crawl_url(
            "https://alexyancey.com/lost-airpods"
        )

        # Get the markdown
        for result in crawl_result:
            print(result["markdown"])

    :param url: the url passed to ``crawl_url``
    :param include_image_link_in_text: forwarded unchanged to the data-preparation helper
    :param download_image: forwarded unchanged to the data-preparation helper
    :param work_dir: forwarded unchanged to the data-preparation helper
    :param image_dir: forwarded unchanged to the data-preparation helper
    :param relative_path: forwarded unchanged to the data-preparation helper
    :return: whatever ``_prepare_jina_data_to_e2m_parsed_data`` returns
    """

    logger.info(f"Parsing url: {url} using crw engine")

    crawled_pages = self.crw_app.crawl_url(url)
    # One markdown document per crawled item, separated by a newline.
    markdown = "\n".join(page["markdown"] for page in crawled_pages)

    forwarded_options = dict(
        include_image_link_in_text=include_image_link_in_text,
        download_image=download_image,
        work_dir=work_dir,
        image_dir=image_dir,
        relative_path=relative_path,
    )
    return self._prepare_jina_data_to_e2m_parsed_data(markdown, **forwarded_options)

def get_parsed_data(
self,
url: Optional[str] = None,
Expand Down Expand Up @@ -240,6 +286,15 @@ def get_parsed_data(
image_dir=image_dir,
relative_path=relative_path,
)
elif self.config.engine == "crw":
return self._parse_by_crw(
url=url,
include_image_link_in_text=include_image_link_in_text,
download_image=download_image,
work_dir=work_dir,
image_dir=image_dir,
relative_path=relative_path,
)
else:
raise NotImplementedError(f"Engine {self.config.engine} not supported")

Expand Down