From ff24b325337691e05c47c7124889849f8b8ef830 Mon Sep 17 00:00:00 2001
From: us <rahmetsaritekin@gmail.com>
Date: Mon, 15 Jun 2026 02:47:36 +0300
Subject: [PATCH] feat: add fastCRW URL parser engine

---
 tests/parsers/test_url_parser.py              |  6 ++
 .../configs/parsers/url_parser_config.py      |  7 ++-
 wisup_e2m/parsers/base.py                     | 40 +++++++++++++
 wisup_e2m/parsers/doc/url_parser.py           | 59 ++++++++++++++++++-
 4 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/tests/parsers/test_url_parser.py b/tests/parsers/test_url_parser.py
index cebe29d..c353775 100644
--- a/tests/parsers/test_url_parser.py
+++ b/tests/parsers/test_url_parser.py
@@ -27,5 +27,11 @@ def test_url_parser(engine):
     logger.info(f"Test for engine '{engine}' took {run_time:.4f} seconds")
 
 
+def test_crw_engine_registered():
+    # fastCRW (crw) is a Firecrawl-compatible engine; like firecrawl it requires
+    # network/credentials, so this only checks that "crw" is listed as supported.
+    assert "crw" in UrlParser.SUPPORTED_ENGINES
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/wisup_e2m/configs/parsers/url_parser_config.py b/wisup_e2m/configs/parsers/url_parser_config.py
index df3e884..db0f9a4 100644
--- a/wisup_e2m/configs/parsers/url_parser_config.py
+++ b/wisup_e2m/configs/parsers/url_parser_config.py
@@ -5,4 +5,9 @@
 
 class UrlParserConfig(BaseParserConfig):
 
-    api_key: Optional[str] = Field(None, description="API key for FireCrawl API")
+    api_key: Optional[str] = Field(None, description="API key for FireCrawl / fastCRW API")
+    api_url: Optional[str] = Field(
+        None,
+        description="Base URL for the fastCRW (crw) engine. Defaults to the managed "
+        "cloud (https://fastcrw.com/api); set to a self-hosted server to override.",
+    )
diff --git a/wisup_e2m/parsers/base.py b/wisup_e2m/parsers/base.py
index c45a91a..c01396b 100644
--- a/wisup_e2m/parsers/base.py
+++ b/wisup_e2m/parsers/base.py
@@ -176,6 +176,8 @@ def _load_engine(self):
             self._load_pandoc_engine()
         elif self.config.engine == "firecrawl":
             self._load_firecrawl_engine()
+        elif self.config.engine == "crw":
+            self._load_crw_engine()
 
     def _load_surya_layout_engine(self):
         logger.info("Loading Surya engine...")
@@ -305,6 +307,44 @@ def _load_firecrawl_engine(self):
 
         self.firecrawl_app = FirecrawlApp(api_key=self.config.api_key)  # FIRECRAWL_API_KEY
 
+    def _load_crw_engine(self):
+        """
+        Load the fastCRW (crw) engine and store the client on ``self.crw_app``.
+
+        fastCRW is a Firecrawl-compatible web scraper shipped as a single binary;
+        self-host it or use the managed cloud. Because the API is
+        Firecrawl-compatible, the FirecrawlApp client is reused as-is and simply
+        pointed at the fastCRW base URL.
+
+        Settings are resolved as follows:
+
+        - api_url: ``self.config.api_url`` when truthy, otherwise the managed
+          cloud endpoint "https://fastcrw.com/api".
+        - api_key: ``self.config.api_key`` when truthy, otherwise the value of
+          the ``CRW_API_KEY`` environment variable (``None`` when it is unset).
+
+        Equivalent stand-alone usage:
+            FirecrawlApp(api_url="https://fastcrw.com/api", api_key="<CRW_API_KEY>")
+
+        :raises ImportError: if the ``firecrawl`` package cannot be imported; the
+            original import error context is suppressed (``from None``).
+        """
+        import os
+
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError:
+            raise ImportError(
+                "Firecrawl client not installed. The crw engine reuses the "
+                "Firecrawl-compatible client; please install it by `pip install firecrawl`"
+            ) from None
+
+        # Default to the managed cloud; allow overriding for self-hosted servers.
+        api_url = self.config.api_url or "https://fastcrw.com/api"
+        api_key = self.config.api_key or os.environ.get("CRW_API_KEY")
+
+        self.crw_app = FirecrawlApp(api_url=api_url, api_key=api_key)
+
     def _load_pandoc_engine(self):
         import shutil
 
diff --git a/wisup_e2m/parsers/doc/url_parser.py b/wisup_e2m/parsers/doc/url_parser.py
index b891115..651c525 100644
--- a/wisup_e2m/parsers/doc/url_parser.py
+++ b/wisup_e2m/parsers/doc/url_parser.py
@@ -24,14 +24,14 @@
 
 
 class UrlParser(BaseParser):
-    SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl"]
+    SUPPORTED_ENGINES = ["unstructured", "jina", "firecrawl", "crw"]
     SUPPORTED_FILE_TYPES = ["url"]
 
     def __init__(self, config: Optional[BaseParserConfig] = None, **config_kwargs):
         """
         :param config: BaseParserConfig
 
-        :param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl']
+        :param engine: str, the engine to use for conversion, default is jina, options are ['unstructured', 'jina', 'firecrawl', 'crw']
         :param api_key: str, the api key for the firecrawl engine
         :param langs: List[str], the languages to use for parsing, default is ['en', 'zh']
         :param client_timeout: int, the client timeout, default is 30
@@ -187,6 +187,52 @@ def _parse_by_firecrawl(
             relative_path=relative_path,
         )
 
+    def _parse_by_crw(
+        self,
+        url: str = None,
+        include_image_link_in_text: bool = True,
+        download_image: bool = False,
+        work_dir: str = "./",
+        image_dir: str = "./figures",
+        relative_path: bool = True,
+    ):
+        """
+        Crawl ``url`` with the fastCRW (crw) engine and wrap the markdown result.
+
+        Calls ``self.crw_app.crawl_url(url)``, reads the ``"markdown"`` entry of
+        every returned item and joins them with newlines. The joined text is then
+        handed to ``_prepare_jina_data_to_e2m_parsed_data`` together with the
+        image/path options below, and that helper's result is returned.
+
+        :param url: str, the url to crawl
+        :param include_image_link_in_text: bool, forwarded to the helper
+        :param download_image: bool, forwarded to the helper
+        :param work_dir: str, forwarded to the helper, default is "./"
+        :param image_dir: str, forwarded to the helper, default is "./figures"
+        :param relative_path: bool, forwarded to the helper
+
+        NOTE(review): assumes ``crawl_url`` yields dict-like items that each
+        have a "markdown" key; a missing key raises KeyError - confirm.
+        """
+
+        logger.info(f"Parsing url: {url} using crw engine")
+
+        text = []
+        parsed_text_list = self.crw_app.crawl_url(url)
+        for parsed_text in parsed_text_list:
+            text.append(parsed_text["markdown"])
+
+        text = "\n".join(text)
+
+        return self._prepare_jina_data_to_e2m_parsed_data(
+            text,
+            include_image_link_in_text=include_image_link_in_text,
+            download_image=download_image,
+            work_dir=work_dir,
+            image_dir=image_dir,
+            relative_path=relative_path,
+        )
+
     def get_parsed_data(
         self,
         url: Optional[str] = None,
@@ -240,6 +286,15 @@ def get_parsed_data(
                 image_dir=image_dir,
                 relative_path=relative_path,
             )
+        elif self.config.engine == "crw":
+            return self._parse_by_crw(
+                url=url,
+                include_image_link_in_text=include_image_link_in_text,
+                download_image=download_image,
+                work_dir=work_dir,
+                image_dir=image_dir,
+                relative_path=relative_path,
+            )
         else:
             raise NotImplementedError(f"Engine {self.config.engine} not supported")