1212logger = logging .getLogger (__name__ )
1313
1414
class SandboxExecutor:
    """Run external commands with the HTTP(S) proxy variables forced to one URL.

    The only isolation applied here is environmental: the child process
    inherits the current environment with ``http_proxy``/``https_proxy``
    (both lower- and upper-case spellings) overridden. No filesystem or
    process-level confinement is performed by this class.
    """

    # Proxy used when neither the constructor argument nor WEB_PROXY gives one.
    # NOTE(review): 128.0.0.1 is not a loopback address. If the intent is a
    # dead local endpoint that blocks outbound HTTP, this was probably meant
    # to be 127.0.0.1 -- confirm before changing the value.
    _FALLBACK_PROXY = "http://128.0.0.1:1"

    def __init__(self, proxy: Optional[str] = None, default_timeout: int = 60):
        """Initialize sandbox executor with configuration.

        Args:
            proxy: Proxy URL handed to child processes. When ``None`` or
                empty, the ``WEB_PROXY`` environment variable is used; when
                that is unset or empty too, a fallback proxy URL is used.
            default_timeout: Maximum number of seconds a command may run
                before it is killed.
        """
        # ``or`` (rather than a ``.get`` default) makes empty strings fall
        # through to the next source as well as missing values.
        self.proxy = proxy or os.environ.get("WEB_PROXY") or self._FALLBACK_PROXY
        self.default_timeout = default_timeout

    def execute_in_sandbox(self, cmd: List[str]) -> tuple:
        """Execute command in sandbox with proxy configuration.

        Each available sandbox method is tried in order of preference and
        the result of the first one that succeeds is returned.

        Args:
            cmd: Command to execute, as an argument list (no shell involved).

        Returns:
            Tuple of (stdout, stderr, returncode); stdout and stderr are bytes.

        Raises:
            RuntimeError: If every sandbox method failed. The exception from
                the last attempted method is chained as ``__cause__``.
        """
        sandbox_methods = [
            self._execute_with_proxy,
        ]

        last_error = None
        for method in sandbox_methods:
            try:
                return method(cmd)
            except Exception as e:
                # Best-effort fallback: record the failure and try the next
                # method instead of aborting on the first error.
                last_error = e
                logger.warning(f"Sandbox method {method.__name__} failed: {e}")

        # Chain the last failure so callers can see why nothing worked.
        raise RuntimeError("All sandbox methods failed") from last_error

    def _execute_with_proxy(self, cmd: List[str]) -> tuple:
        """Execute command with proxy configuration.

        Args:
            cmd: Command to execute, as an argument list.

        Returns:
            Tuple of (stdout, stderr, returncode); stdout and stderr are bytes.

        Raises:
            RuntimeError: If the command does not finish within
                ``self.default_timeout`` seconds.
        """
        # Start from the parent's environment so PATH etc. are preserved.
        env = os.environ.copy()
        if self.proxy:
            # Tools disagree on which spelling they honour, so set them all.
            for name in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"):
                env[name] = self.proxy

        logger.info(f"Executing command with proxy: {' '.join(cmd)}")
        if self.proxy:
            logger.info(f"Using proxy: {self.proxy}")

        # The context manager closes both pipes and waits for the child on
        # every exit path, so no file descriptors or zombies are left behind.
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env,
        ) as process:
            try:
                stdout, stderr = process.communicate(timeout=self.default_timeout)
            except subprocess.TimeoutExpired as exc:
                process.kill()
                # Reap the killed child. wait() is used instead of
                # communicate() so that descendants still holding the pipes
                # open cannot block us, mirroring subprocess.run().
                process.wait()
                raise RuntimeError(
                    f"Command execution timeout after {self.default_timeout} seconds"
                ) from exc
            return stdout, stderr, process.returncode
90+
91+ logger = logging .getLogger (__name__ )
92+
93+
1594class DocParser (Docx2Parser ):
1695 """DOC document parser"""
1796
97+ def __init__ (self , * args , ** kwargs ):
98+ """Initialize DOC parser with sandbox executor"""
99+ super ().__init__ (* args , ** kwargs )
100+ self .sandbox_executor = SandboxExecutor ()
101+
18102 def parse_into_text (self , content : bytes ) -> Document :
19103 logger .info (f"Parsing DOC document, content size: { len (content )} bytes" )
20104
@@ -25,7 +109,8 @@ def parse_into_text(self, content: bytes) -> Document:
25109 # try using antiword to extract text
26110 self ._parse_with_antiword ,
27111 # 3. If antiword extraction fails, use textract
28- self ._parse_with_textract ,
112+ # NOTE: _parse_with_textract is disabled due to SSRF vulnerability
113+ # self._parse_with_textract,
29114 ]
30115
31116 # Save byte content as a temporary file
@@ -61,14 +146,13 @@ def _parse_with_antiword(self, temp_file_path: str) -> Document:
61146 if not antiword_path :
62147 raise RuntimeError ("antiword not found in PATH" )
63148
64- # Use antiword to extract text directly
65- process = subprocess .Popen (
66- [antiword_path , temp_file_path ],
67- stdout = subprocess .PIPE ,
68- stderr = subprocess .PIPE ,
69- )
70- stdout , stderr = process .communicate ()
71- if process .returncode != 0 :
149+ # Use antiword to extract text directly in sandbox
150+ cmd = [antiword_path , temp_file_path ]
151+ logger .info ("Executing antiword in sandbox with proxy configuration" )
152+
153+ stdout , stderr , returncode = self .sandbox_executor .execute_in_sandbox (cmd )
154+
155+ if returncode != 0 :
72156 raise RuntimeError (
73157 f"antiword extraction failed: { stderr .decode ('utf-8' , errors = 'ignore' )} "
74158 )
@@ -114,13 +198,12 @@ def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
114198 temp_dir ,
115199 doc_path ,
116200 ]
117- logger .info (f"Running command: { ' ' .join (cmd )} " )
118- process = subprocess .Popen (
119- cmd , stdout = subprocess .PIPE , stderr = subprocess .PIPE
120- )
121- stdout , stderr = process .communicate ()
201+ logger .info (f"Running command in sandbox: { ' ' .join (cmd )} " )
202+
203+ # Execute in sandbox with proxy configuration
204+ stdout , stderr , returncode = self .sandbox_executor .execute_in_sandbox (cmd )
122205
123- if process . returncode != 0 :
206+ if returncode != 0 :
124207 logger .warning (
125208 f"Error converting DOC to DOCX: { stderr .decode ('utf-8' )} "
126209 )
0 commit comments