Skip to content

Commit 295eaa8

Browse files
ndanner-wesleyancs and vringar
authored and committed
feat(browser_manager): enable full XPI cleanup
Set TMPDIR in the environment to a unique directory for each browser instance and delete it when the browser quits. This is a workaround for an issue with geckodriver. When the OpenWPM extension is installed via `WebDriver.install_addon()`, geckodriver makes a copy of the XPI file in TMPDIR. However, geckodriver never deletes that file. So on a stateless crawl, you end up with one copy of the XPI file for each site visited. This workaround sets TMPDIR in the environment before creating the geckodriver service, and then deletes the directory after `driver.quit()` returns in `BrowserManager.run()`. We use this indirection because we don't have access to the name of the temporary file, and it doesn't seem safe to just delete XPI files in /tmp.
1 parent d23fc19 commit 295eaa8

File tree

3 files changed

+77
-0
lines changed

3 files changed

+77
-0
lines changed

openwpm/browser_manager.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,33 @@ def launch_browser_manager(self) -> bool:
131131

132132
crash_recovery = True
133133

134+
# Create a unique temporary directory that we can delete
135+
# when we shut down. Note that this doesn't force anything to
136+
# use `tmpdir`, it just makes it available.
137+
if self.browser_params.tmpdir is not None:
138+
self.logger.debug(
139+
"BROWSER %i: leftover temp directory %s? Deleting it."
140+
% (self.browser_id, self.browser_params.tmpdir)
141+
)
142+
try:
143+
shutil.rmtree(self.browser_params.tmpdir)
144+
except Exception:
145+
self.logger.debug(
146+
"BROWSER %i: error deleting %s"
147+
% (
148+
self.browser_id,
149+
self.browser_params.tmpdir,
150+
),
151+
exc_info=True,
152+
)
153+
self.browser_params.tmpdir = Path(
154+
tempfile.mkdtemp(prefix="openwpm_", dir=os.getenv("TMPDIR", default="/tmp"))
155+
)
156+
self.logger.debug(
157+
"BROWSER %i: Using temp dir %s"
158+
% (self.browser_id, self.browser_params.tmpdir)
159+
)
160+
134161
self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
135162
self.is_fresh = not crash_recovery
136163

@@ -340,6 +367,34 @@ def close_browser_manager(self, force: bool = False) -> None:
340367
if not shutdown_complete:
341368
self.kill_browser_manager()
342369

370+
# Delete the temporary directory used by geckodriver.
371+
if self.browser_params.tmpdir is not None:
372+
try:
373+
t1 = time.time()
374+
self.logger.debug(
375+
"BROWSER %i: deleting temp dir %s"
376+
% (self.browser_id, self.browser_params.tmpdir)
377+
)
378+
shutil.rmtree(self.browser_params.tmpdir)
379+
self.logger.debug(
380+
"BROWSER %i: completed deleting temp dir %s in %d seconds"
381+
% (
382+
self.browser_id,
383+
self.browser_params.tmpdir,
384+
time.time() - t1,
385+
)
386+
)
387+
self.browser_params.tmpdir = None
388+
except Exception as e:
389+
self.logger.warn(
390+
"BROWSER %i: failed to delete temp dir %s"
391+
% (
392+
self.browser_id,
393+
self.browser_params.tmpdir,
394+
),
395+
exc_info=True,
396+
)
397+
343398
def execute_command_sequence(
344399
self,
345400
# Quoting to break cyclic import, see https://stackoverflow.com/a/39757388

openwpm/config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class BrowserParams(DataClassJsonMixin):
103103
default=Path(tempfile.gettempdir()),
104104
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
105105
)
106+
106107
"""
107108
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
108109
browser profiles and residual files are stored.
@@ -139,6 +140,18 @@ class BrowserParams(DataClassJsonMixin):
139140
140141
"""
141142

143+
tmpdir: Optional[Path] = field(
144+
default=None,
145+
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
146+
)
147+
"""
148+
The temporary directory used by `geckodriver`. This is configured in
149+
`BrowserManager.run` and then deleted when the browser is finished. We do
150+
this because it seems that `geckodriver` doesn't clean up its temporary
151+
files (in particular, a copy of the extension XPI file), so we need to do
152+
so ourselves.
153+
"""
154+
142155
recovery_tar: Optional[Path] = None
143156
donottrack: bool = False
144157
tracking_protection: bool = False

openwpm/deploy_browsers/deploy_firefox.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,14 @@ def deploy_firefox(
140140
# Launch the webdriver
141141
status_queue.put(("STATUS", "Launch Attempted", None))
142142

143+
# Use browser_params.tmpdir as the temporary directory. This is so that
144+
# geckodriver makes its copy of the extension XPI file in tmpdir, so
145+
# we can delete it later and not have it left behind. I make a shallow
146+
# copy of `os.environ` because I'm a little nervous about modifying the
147+
# OpenWPM process' environment.
148+
env = os.environ.copy()
149+
env["TMPDIR"] = str(browser_params.tmpdir)
150+
143151
fo.binary_location = firefox_binary_path
144152
geckodriver_path = subprocess.check_output(
145153
"which geckodriver", encoding="utf-8", shell=True
@@ -149,6 +157,7 @@ def deploy_firefox(
149157
service=Service(
150158
executable_path=geckodriver_path,
151159
log_output=open(webdriver_interceptor.fifo, "w"),
160+
env=env,
152161
),
153162
)
154163

0 commit comments

Comments
 (0)