Skip to content

Commit 91ec651

Browse files
committed
fix(doc_parser): Add secure command execution with sandbox
1 parent 9af1fc1 commit 91ec651

File tree

1 file changed

+98
-15
lines changed

1 file changed

+98
-15
lines changed

docreader/parser/doc_parser.py

Lines changed: 98 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,93 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
class SandboxExecutor:
    """Run external commands with HTTP(S) proxy variables forced in their environment.

    The child process inherits the current environment, with the four
    ``http_proxy``/``https_proxy`` variables (lower- and upper-case) overridden
    so that proxy-aware tools route outbound requests through ``self.proxy``.
    """

    def __init__(self, proxy: Optional[str] = None, default_timeout: int = 60):
        """Initialize sandbox executor with configuration

        Args:
            proxy: Proxy URL to use for network access. If None or empty, the
                WEB_PROXY environment variable is used; if that is unset or
                empty too, a loopback address that nothing listens on is used
                so proxied requests fail instead of reaching the network.
            default_timeout: Default timeout in seconds for command execution
        """
        # `or` treats both None and "" as "not provided", so the chain falls
        # through parameter -> WEB_PROXY -> blocking default.
        # The default must be a loopback address (127.0.0.1); a routable
        # address would send traffic to an external host instead of blocking.
        self.proxy = proxy or os.environ.get("WEB_PROXY") or "http://127.0.0.1:1"
        self.default_timeout = default_timeout

    def execute_in_sandbox(self, cmd: List[str]) -> tuple:
        """Execute command in sandbox with proxy configuration

        Args:
            cmd: Command to execute, as an argument list (no shell is involved)

        Returns:
            Tuple of (stdout, stderr, returncode); stdout and stderr are bytes

        Raises:
            RuntimeError: If every sandbox method fails. The last underlying
                error is attached as ``__cause__``.
        """
        # Try different sandbox methods in order of preference
        sandbox_methods = [
            self._execute_with_proxy,
        ]

        last_error = None
        for method in sandbox_methods:
            try:
                return method(cmd)
            except Exception as e:
                logger.warning(f"Sandbox method {method.__name__} failed: {e}")
                last_error = e

        # Chain the last failure so callers can still see the root cause.
        raise RuntimeError("All sandbox methods failed") from last_error

    def _execute_with_proxy(self, cmd: List[str]) -> tuple:
        """Execute command with proxy configuration

        Args:
            cmd: Command to execute

        Returns:
            Tuple of (stdout, stderr, returncode)

        Raises:
            RuntimeError: If the command does not finish within
                ``self.default_timeout`` seconds.
        """
        # Start from the parent's environment and override the proxy settings.
        # NOTE(review): an inherited NO_PROXY/no_proxy value is left untouched
        # and may let some hosts bypass the proxy - confirm that is intended.
        env = os.environ.copy()

        logger.info(f"Executing command with proxy: {' '.join(cmd)}")
        if self.proxy:
            for var in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY"):
                env[var] = self.proxy
            logger.info(f"Using proxy: {self.proxy}")

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env,
        )

        try:
            stdout, stderr = process.communicate(timeout=self.default_timeout)
        except subprocess.TimeoutExpired as exc:
            process.kill()
            # Reap the killed child and drain/close its pipes; without this the
            # process lingers as a zombie and the pipe descriptors leak.
            process.communicate()
            raise RuntimeError(
                f"Command execution timeout after {self.default_timeout} seconds"
            ) from exc
        return stdout, stderr, process.returncode
89+
90+
91+
logger = logging.getLogger(__name__)
92+
93+
1594
class DocParser(Docx2Parser):
1695
"""DOC document parser"""
1796

97+
def __init__(self, *args, **kwargs):
98+
"""Initialize DOC parser with sandbox executor"""
99+
super().__init__(*args, **kwargs)
100+
self.sandbox_executor = SandboxExecutor()
101+
18102
def parse_into_text(self, content: bytes) -> Document:
19103
logger.info(f"Parsing DOC document, content size: {len(content)} bytes")
20104

@@ -25,7 +109,8 @@ def parse_into_text(self, content: bytes) -> Document:
25109
# try using antiword to extract text
26110
self._parse_with_antiword,
27111
# 3. (Disabled) textract used to be the fallback when antiword extraction fails
28-
self._parse_with_textract,
112+
# NOTE: _parse_with_textract is disabled due to SSRF vulnerability
113+
# self._parse_with_textract,
29114
]
30115

31116
# Save byte content as a temporary file
@@ -61,14 +146,13 @@ def _parse_with_antiword(self, temp_file_path: str) -> Document:
61146
if not antiword_path:
62147
raise RuntimeError("antiword not found in PATH")
63148

64-
# Use antiword to extract text directly
65-
process = subprocess.Popen(
66-
[antiword_path, temp_file_path],
67-
stdout=subprocess.PIPE,
68-
stderr=subprocess.PIPE,
69-
)
70-
stdout, stderr = process.communicate()
71-
if process.returncode != 0:
149+
# Use antiword to extract text directly in sandbox
150+
cmd = [antiword_path, temp_file_path]
151+
logger.info("Executing antiword in sandbox with proxy configuration")
152+
153+
stdout, stderr, returncode = self.sandbox_executor.execute_in_sandbox(cmd)
154+
155+
if returncode != 0:
72156
raise RuntimeError(
73157
f"antiword extraction failed: {stderr.decode('utf-8', errors='ignore')}"
74158
)
@@ -114,13 +198,12 @@ def _try_convert_doc_to_docx(self, doc_path: str) -> Optional[bytes]:
114198
temp_dir,
115199
doc_path,
116200
]
117-
logger.info(f"Running command: {' '.join(cmd)}")
118-
process = subprocess.Popen(
119-
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
120-
)
121-
stdout, stderr = process.communicate()
201+
logger.info(f"Running command in sandbox: {' '.join(cmd)}")
202+
203+
# Execute in sandbox with proxy configuration
204+
stdout, stderr, returncode = self.sandbox_executor.execute_in_sandbox(cmd)
122205

123-
if process.returncode != 0:
206+
if returncode != 0:
124207
logger.warning(
125208
f"Error converting DOC to DOCX: {stderr.decode('utf-8')}"
126209
)

0 commit comments

Comments
 (0)