From ff24b325337691e05c47c7124889849f8b8ef830 Mon Sep 17 00:00:00 2001
From: us
Date: Mon, 15 Jun 2026 02:47:36 +0300
Subject: [PATCH] feat: add fastCRW URL parser engine

Register "crw" as a UrlParser engine. The engine reuses the Firecrawl
client (FirecrawlApp), pointed at a configurable fastCRW base URL.

- UrlParserConfig gains `api_url`; when unset the engine uses
  https://fastcrw.com/api.
- The API key comes from `api_key`, falling back to $CRW_API_KEY.
- A trailing "/" on `api_url` is stripped before it reaches the client.
- Crawled pages that carry no markdown are skipped when the page text
  is joined, instead of failing the whole crawl.
---
 tests/parsers/test_url_parser.py              |  6 ++
 .../configs/parsers/url_parser_config.py      |  7 ++-
 wisup_e2m/parsers/base.py                     | 40 +++++++++++++
 wisup_e2m/parsers/doc/url_parser.py           | 59 ++++++++++++++++++-
 4 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/tests/parsers/test_url_parser.py b/tests/parsers/test_url_parser.py
index cebe29d..c353775 100644
--- a/tests/parsers/test_url_parser.py
+++ b/tests/parsers/test_url_parser.py
@@ -27,5 +27,11 @@ def test_url_parser(engine):
     logger.info(f"Test for engine '{engine}' took {run_time:.4f} seconds")
 
 
+def test_crw_engine_registered():
+    # fastCRW (crw) is a Firecrawl-compatible engine; like firecrawl it requires
+    # network/credentials, so only assert it is a registered, dispatchable engine.
+    assert "crw" in UrlParser.SUPPORTED_ENGINES
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/wisup_e2m/configs/parsers/url_parser_config.py b/wisup_e2m/configs/parsers/url_parser_config.py
index df3e884..db0f9a4 100644
--- a/wisup_e2m/configs/parsers/url_parser_config.py
+++ b/wisup_e2m/configs/parsers/url_parser_config.py
@@ -5,4 +5,9 @@
 
 
 class UrlParserConfig(BaseParserConfig):
-    api_key: Optional[str] = Field(None, description="API key for FireCrawl API")
+    api_key: Optional[str] = Field(None, description="API key for FireCrawl / fastCRW API")
+    api_url: Optional[str] = Field(
+        None,
+        description="Base URL for the fastCRW (crw) engine. Defaults to the managed "
+        "cloud (https://fastcrw.com/api); set to a self-hosted server to override.",
+    )
diff --git a/wisup_e2m/parsers/base.py b/wisup_e2m/parsers/base.py
index c45a91a..c01396b 100644
--- a/wisup_e2m/parsers/base.py
+++ b/wisup_e2m/parsers/base.py
@@ -176,6 +176,8 @@ def _load_engine(self):
             self._load_pandoc_engine()
         elif self.config.engine == "firecrawl":
             self._load_firecrawl_engine()
+        elif self.config.engine == "crw":
+            self._load_crw_engine()
 
     def _load_surya_layout_engine(self):
         logger.info("Loading Surya engine...")
@@ -305,6 +307,44 @@ def _load_firecrawl_engine(self):
 
         self.firecrawl_app = FirecrawlApp(api_key=self.config.api_key)  # FIRECRAWL_API_KEY
 
+    def _load_crw_engine(self):
+        """
+        fastCRW (crw) engine: a Firecrawl-compatible web scraper shipped as a single
+        binary; self-host or use the managed cloud. Because the API is
+        Firecrawl-compatible, the FirecrawlApp client works as-is when pointed at the
+        fastCRW base URL.
+
+        from firecrawl import FirecrawlApp
+
+        app = FirecrawlApp(
+            api_url="https://fastcrw.com/api",
+            api_key="",
+        )
+
+        crawl_result = app.crawl_url(
+            "https://alexyancey.com/lost-airpods"
+        )
+
+        # Get the markdown
+        for result in crawl_result:
+            print(result["markdown"])
+        """
+        import os
+
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError:
+            raise ImportError(
+                "Firecrawl client not installed. The crw engine reuses the "
+                "Firecrawl-compatible client; please install it by `pip install firecrawl`"
+            ) from None
+
+        # Managed cloud unless overridden; drop a trailing "/" so endpoint paths join cleanly.
+        api_url = (self.config.api_url or "https://fastcrw.com/api").rstrip("/")
+        api_key = self.config.api_key or os.environ.get("CRW_API_KEY")
+
+        self.crw_app = FirecrawlApp(api_url=api_url, api_key=api_key)
+
     def _load_pandoc_engine(self):
         import shutil
 
diff --git a/wisup_e2m/parsers/doc/url_parser.py b/wisup_e2m/parsers/doc/url_parser.py
index b891115..651c525 100644
--- a/wisup_e2m/parsers/doc/url_parser.py
+++ b/wisup_e2m/parsers/doc/url_parser.py
@@ -24,14 +24,14 @@
 
 
 class UrlParser(BaseParser):
-    SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl"]
+    SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl", "crw"]
     SUPPORTED_FILE_TYPES = ["url"]
 
     def __init__(self, config: Optional[BaseParserConfig] = None, **config_kwargs):
         """
         :param config: BaseParserConfig
 
-        :param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl']
+        :param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl', 'crw']
         :param api_key: str, the api key for the firecrawl engine
         :param langs: List[str], the languages to use for parsing, default is ['en', 'zh']
         :param client_timeout: int, the client timeout, default is 30
@@ -187,6 +187,52 @@ def _parse_by_firecrawl(
             relative_path=relative_path,
         )
 
+    def _parse_by_crw(
+        self,
+        url: str = None,
+        include_image_link_in_text: bool = True,
+        download_image: bool = False,
+        work_dir: str = "./",
+        image_dir: str = "./figures",
+        relative_path: bool = True,
+    ):
+        """
+        demo:
+        from firecrawl import FirecrawlApp
+
+        # fastCRW is Firecrawl-compatible; point the client at the fastCRW base URL.
+        app = FirecrawlApp(
+            api_url="https://fastcrw.com/api",
+            api_key="",
+        )
+
+        crawl_result = app.crawl_url(
+            "https://alexyancey.com/lost-airpods"
+        )
+
+        # Get the markdown
+        for result in crawl_result:
+            print(result["markdown"])
+        """
+
+        logger.info(f"Parsing url: {url} using crw engine")
+
+        parsed_text_list = self.crw_app.crawl_url(url)
+        # A page without "markdown" is skipped so that one bad page does not
+        # fail the whole crawl with a KeyError/TypeError.
+        text = "\n".join(
+            page["markdown"] for page in parsed_text_list if page.get("markdown")
+        )
+
+        return self._prepare_jina_data_to_e2m_parsed_data(
+            text,
+            include_image_link_in_text=include_image_link_in_text,
+            download_image=download_image,
+            work_dir=work_dir,
+            image_dir=image_dir,
+            relative_path=relative_path,
+        )
+
     def get_parsed_data(
         self,
         url: Optional[str] = None,
@@ -240,6 +286,15 @@ def get_parsed_data(
                 image_dir=image_dir,
                 relative_path=relative_path,
             )
+        elif self.config.engine == "crw":
+            return self._parse_by_crw(
+                url=url,
+                include_image_link_in_text=include_image_link_in_text,
+                download_image=download_image,
+                work_dir=work_dir,
+                image_dir=image_dir,
+                relative_path=relative_path,
+            )
         else:
             raise NotImplementedError(f"Engine {self.config.engine} not supported")
 