Skip to content

Commit e5c2a42

Browse files
fix: lazy imports to prevent torchcodec FFmpeg DLL crash on Windows (#1089) (#1092)
Remove BaseLoader inheritance from ChromiumLoader and use lazy imports for PyPDFLoader and AsyncChromiumLoader to avoid triggering the sentence_transformers -> torchcodec -> FFmpeg native DLL loading chain at import time, which crashes on systems where the FFmpeg DLLs are not available. Also add a torchcodec mock to conftest.py for the test suite. Changes: (1) chromium.py: remove the BaseLoader import/inheritance and add load()/aload(); (2) docloaders/__init__.py: lazy __getattr__ for ChromiumLoader and PlasmateLoader; (3) fetch_node.py: lazy PyPDFLoader import; (4) robots_node.py: lazy AsyncChromiumLoader import; (5) conftest.py: torchcodec module mock. Co-authored-by: Ege BULUT <egebulut.iletisim@outlook.com>
1 parent 71ab440 commit e5c2a42

5 files changed

Lines changed: 43 additions & 6 deletions

File tree

scrapegraphai/docloaders/__init__.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,27 @@
11
"""
22
This module handles document loading functionalities for the ScrapeGraphAI application.
3+
4+
Note: ChromiumLoader and PlasmateLoader are lazy-imported to avoid triggering
5+
torchcodec/FFmpeg DLL loading at import time (sentence_transformers -> torchcodec chain).
36
"""
47

58
from .browser_base import browser_base_fetch
6-
from .chromium import ChromiumLoader
7-
from .plasmate import PlasmateLoader
89
from .scrape_do import scrape_do_fetch
910

11+
# Public names that are resolved on first access instead of at package import
# time, mapped to the relative submodule that defines each of them.
_LAZY_MODULES = {
    "ChromiumLoader": ".chromium",
    "PlasmateLoader": ".plasmate",
}


def __getattr__(name):
    """Resolve a lazily exported name on first attribute access (PEP 562).

    Python calls this hook only when normal lookup in the module namespace
    fails. For names listed in ``_LAZY_MODULES`` the defining submodule is
    imported on demand and the attribute of the same name is returned.

    Args:
        name: Attribute name requested on this module.

    Returns:
        The object called ``name`` from the submodule registered for it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name not in _LAZY_MODULES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported locally so that merely importing this package stays cheap.
    import importlib

    module = importlib.import_module(_LAZY_MODULES[name], __package__)
    value = getattr(module, name)
    # Cache the resolved object in the module namespace: later lookups then
    # succeed through the normal path and never re-enter this hook.
    globals()[name] = value
    return value
23+
24+
1025
__all__ = [
1126
"browser_base_fetch",
1227
"ChromiumLoader",

scrapegraphai/docloaders/chromium.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33

44
import aiohttp
55
import async_timeout
6-
from langchain_community.document_loaders.base import BaseLoader
76
from langchain_core.documents import Document
87

98
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
109

1110
logger = get_logger("web-loader")
1211

1312

14-
class ChromiumLoader(BaseLoader):
13+
class ChromiumLoader:
1514
"""Scrapes HTML pages from URLs using a (headless) instance of the
1615
Chromium web driver with proxy protection.
1716
@@ -436,6 +435,14 @@ async def ascrape_with_js_support(
436435
finally:
437436
await browser.close()
438437

438+
def load(self) -> List[Document]:
    """Eagerly collect every document produced by ``lazy_load`` into a list."""
    documents = []
    documents.extend(self.lazy_load())
    return documents
441+
442+
async def aload(self) -> List[Document]:
    """Eagerly collect every document produced by ``alazy_load`` into a list."""
    documents = []
    async for document in self.alazy_load():
        documents.append(document)
    return documents
445+
439446
def lazy_load(self) -> Iterator[Document]:
440447
"""
441448
Lazily load text content from the provided URLs.

scrapegraphai/nodes/fetch_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import concurrent.futures
88

99
import requests
10-
from langchain_community.document_loaders import PyPDFLoader
1110
from langchain_core.documents import Document
1211
from langchain_openai import AzureChatOpenAI, ChatOpenAI
1312

@@ -182,6 +181,7 @@ def load_file_content(self, source, input_type):
182181
"""
183182

184183
if input_type == "pdf":
184+
from langchain_community.document_loaders import PyPDFLoader
185185
loader = PyPDFLoader(source)
186186
# PyPDFLoader.load() can be blocking for large PDFs. Run it in a thread and
187187
# enforce the configured timeout if provided.

scrapegraphai/nodes/robots_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from langchain_core.output_parsers import CommaSeparatedListOutputParser
99
from langchain_core.prompts import PromptTemplate
10-
from langchain_community.document_loaders import AsyncChromiumLoader
1110

1211
from ..helpers import robots_dictionary
1312
from ..prompts import TEMPLATE_ROBOT
@@ -90,6 +89,7 @@ def execute(self, state: dict) -> dict:
9089
else:
9190
parsed_url = urlparse(source)
9291
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
92+
from langchain_community.document_loaders import AsyncChromiumLoader
9393
loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
9494
document = loader.load()
9595
if "ollama" in self.llm_model.model:

tests/conftest.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,24 @@
1414
from typing import Any, Dict
1515
from unittest.mock import Mock
1616

17+
import sys
18+
import types
19+
1720
import pytest
1821
from dotenv import load_dotenv
1922

23+
# Mock torchcodec to prevent FFmpeg DLL crashes at import time.
24+
# sentence_transformers -> torchcodec -> FFmpeg native DLLs can't load on some systems.
25+
from importlib.machinery import ModuleSpec

# Minimal stand-in for the real ``torchcodec`` package.
_tc = types.ModuleType("torchcodec")
_tc.__version__ = "0.0.0"
_tc.__file__ = "<mock>"
# Use a genuine ModuleSpec rather than an ad-hoc object so that code reading
# spec attributes such as ``origin`` or ``parent`` (e.g. the module's repr)
# keeps working. ``is_package=True`` gives ``submodule_search_locations == []``.
_tc.__spec__ = ModuleSpec(
    "torchcodec", loader=None, origin="<mock>", is_package=True
)
# Never shadow a torchcodec module that has already been imported.
if "torchcodec" not in sys.modules:
    sys.modules["torchcodec"] = _tc
34+
2035
# Load environment variables
2136
load_dotenv()
2237

0 commit comments

Comments
 (0)