Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 160 additions & 81 deletions brozzler/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,11 @@ def num_in_use(self):
class WebsockReceiverThread(threading.Thread):
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)

def __init__(self, websock, name=None, daemon=True):
def __init__(self, parent, name=None, daemon=True):
super().__init__(name=name, daemon=daemon)

self.websock = websock
self.parent = parent
self.websock = parent.websock

self.calling_thread = threading.current_thread()

Expand Down Expand Up @@ -255,40 +256,34 @@ def _javascript_dialog_opening(self, message):
accept = True
else:
accept = False

payload = dict(
id=0,
method="Page.handleJavaScriptDialog",
params={"accept": accept},
)

if session_id := message.get("sessionId"):
payload["sessionId"] = session_id

self.websock.send(
json.dumps(
dict(
id=0,
method="Page.handleJavaScriptDialog",
params={"accept": accept},
),
payload,
separators=(",", ":"),
)
)

def _should_track_request(self, message) -> bool:
"""
Decides whether or not to include a request in the idle check.
"""
# Workaround for https://github.com/GoogleChrome/lighthouse/issues/11850
# Don't add frame URLs to the set of active requests because they will
# never receive a loadingFinished event.
if (
"params" in message
and "type" in message["params"]
and "frameId" in message["params"]
and message["params"]["type"] == "Document"
):
if self.initial_document is None:
self.initial_document = message["params"]["frameId"]
return self.initial_document == message["params"]["frameId"]
return True
def _attached_to_target(self, message):
if "params" in message and "sessionId" in message["params"]:
self.parent._configure_target(message["params"]["sessionId"])

def _handle_message(self, websock, json_message):
message = json.loads(json_message)
if "method" in message:
if message["method"] == "Page.loadEventFired":
self.got_page_load_event = datetime.datetime.utcnow()
elif message["method"] == "Target.attachedToTarget":
self._attached_to_target(message)
elif message["method"] == "Network.responseReceived":
self._network_response_received(message)
with self.activity_lock:
Expand All @@ -299,8 +294,7 @@ def _handle_message(self, websock, json_message):

if "params" in message and "requestId" in message["params"]:
with self.activity_lock:
if self._should_track_request(message):
self.active_connections.add(message["params"]["requestId"])
self.active_connections.add(message["params"]["requestId"])
self.last_network_activity = time.time()
elif (
message["method"] == "Network.dataReceived"
Expand Down Expand Up @@ -390,6 +384,11 @@ def __init__(self, **kwargs):
self._wait_interval = 0.5
self._max_screenshot_width = kwargs.get("max_screenshot_width", 2000)
self._max_screenshot_height = kwargs.get("max_screenshot_height", 20000)
self.session_id = None

# Set default configuration in case the caller doesn't use
# configure_browser or browse_page
self.configure_browser()

def __enter__(self):
self.start()
Expand All @@ -413,7 +412,23 @@ def _wait_for(self, callback, timeout=None):
)
brozzler.sleep(self._wait_interval)

def send_to_chrome(self, suppress_logging=False, **kwargs):
# Marker value for session_id in send_to_chrome to use the main session (the tab).
_DEFAULT_SESSION = object()

def send_to_chrome(self, session_id=_DEFAULT_SESSION, **kwargs):
"""
Sends a message to Chrome.

Args:
session_id: session id to use; _DEFAULT_SESSION uses the
main tab's session, None omits it
"""
if session_id is self._DEFAULT_SESSION:
session_id = self.session_id

if session_id:
kwargs["sessionId"] = session_id

msg_id = next(self._command_id)
kwargs["id"] = msg_id
msg = json.dumps(kwargs, separators=(",", ":"))
Expand All @@ -433,45 +448,47 @@ def start(self, **kwargs):
**kwargs: arguments for self.chrome.start(...)
"""
if not self.is_running():
self.session_id = None
self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url)
self.websock_thread = WebsockReceiverThread(
self.websock, name="WebsockThread:%s" % self.chrome.port
self, name="WebsockThread:%s" % self.chrome.port
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it's a good idea to pass self into the WebsockReceiverThread constructor: the thread stores that reference as an instance attribute, and we then assign the WebsockReceiverThread to an attribute on the parent. I feel like that circular reference is likely to cause problems down the line, possibly when it hits the GC.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, yeah. Should I move _configure_target to the receiver thread class?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I now remember why I did that. send_to_chrome is a method of the Browser class, so unless we duplicated that logic, we do need a reference to the Browser.

)
self.websock_thread.start()

self._wait_for(lambda: self.websock_thread.is_open, timeout=30)

# tell browser to send us messages we're interested in
self.send_to_chrome(method="Network.enable")
self.send_to_chrome(method="Page.enable")
# Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use
# them in Brozzler logic.
if self.logger.is_enabled_for(logging.DEBUG):
self.send_to_chrome(method="Console.enable")
self.send_to_chrome(method="Runtime.enable")
self.send_to_chrome(method="ServiceWorker.enable")
self.send_to_chrome(method="ServiceWorker.setForceUpdateOnPageLoad")

# disable google analytics and amp analytics
self.send_to_chrome(
method="Network.setBlockedURLs",
params={
"urls": [
"*google-analytics.com/analytics.js*",
"*google-analytics.com/ga.js*",
"*google-analytics.com/ga_exp.js*",
"*google-analytics.com/urchin.js*",
"*google-analytics.com/collect*",
"*google-analytics.com/r/collect*",
"*google-analytics.com/__utm.gif*",
"*google-analytics.com/gtm/js?*",
"*google-analytics.com/cx/api.js*",
"*cdn.ampproject.org/*/amp-analytics*.js",
]
},
# Find the right target
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method="Target.getTargets")
self._wait_for(
lambda: self.websock_thread.received_result(msg_id), timeout=10
)
message = self.websock_thread.pop_result(msg_id)
self.logger.debug("target list", message=message)
for target in message.get("result", {}).get("targetInfos", []):
# Find the first about:blank page that hasn't been attached
if (
"targetId" in target
and target.get("type") == "page"
and target.get("url") == "about:blank"
and not target.get("attached", True)
):
target_id = target["targetId"]
break
else: # No target found
raise Exception("could not find page to attach")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if there's any other useful data we could tuck into this exception message to make it easier to track down?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add the target list. It is an array of TargetInfo objects. Would it make sense to map it to an array of each target's URL and whether it is already attached to another session? (That'd be the url and attached properties.)


self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method="Target.attachToTarget",
params={"targetId": target_id, "flatten": True},
)
self._wait_for(
lambda: self.websock_thread.received_result(msg_id), timeout=10
)
attached_msg = self.websock_thread.pop_result(msg_id)
self.session_id = attached_msg["result"]["sessionId"]

def stop(self):
"""
Expand Down Expand Up @@ -672,7 +689,6 @@ def _try_screenshot(self, on_screenshot, full_page=False):
"""
self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing needs to be done in this PR, but in the long term it's probably a good idea to make this an option. I can look into that later.

params={"expression": "window.scroll(0,0)"},
)
for i in range(3):
Expand Down Expand Up @@ -733,32 +749,95 @@ def is_idle():
def configure_browser(
self, extra_headers=None, user_agent=None, download_throughput=-1, stealth=False
):
headers = extra_headers or {}
self._extra_headers = extra_headers
self._user_agent = user_agent
self._download_throughput = download_throughput
self._stealth = stealth

if self.is_running():
# Assume we only need to update the root frame, as
# this function should only ever be called between pages.
self._configure_target(self._DEFAULT_SESSION)

def _configure_target(self, session_id):
# tell browser to send us messages we're interested in
self.send_to_chrome(method="Network.enable", session_id=session_id)
self.send_to_chrome(method="Page.enable", session_id=session_id)
# Enable Console & Runtime output only when debugging.
# After all, we just print these events with debug(), we don't use
# them in Brozzler logic.
if self.logger.is_enabled_for(logging.DEBUG):
self.send_to_chrome(method="Console.enable", session_id=session_id)
self.send_to_chrome(method="Runtime.enable", session_id=session_id)
self.send_to_chrome(method="ServiceWorker.enable", session_id=session_id)
self.send_to_chrome(
method="ServiceWorker.setForceUpdateOnPageLoad", session_id=session_id
)

self.send_to_chrome(
method="Target.setAutoAttach",
params={
"autoAttach": True,
"waitForDebuggerOnStart": True,
"flatten": True,
},
session_id=session_id,
)

# disable google analytics and amp analytics
self.send_to_chrome(
method="Network.setBlockedURLs",
params={
"urls": [
"*google-analytics.com/analytics.js*",
"*google-analytics.com/ga.js*",
"*google-analytics.com/ga_exp.js*",
"*google-analytics.com/urchin.js*",
"*google-analytics.com/collect*",
"*google-analytics.com/r/collect*",
"*google-analytics.com/__utm.gif*",
"*google-analytics.com/gtm/js?*",
"*google-analytics.com/cx/api.js*",
"*cdn.ampproject.org/*/amp-analytics*.js",
]
},
session_id=session_id,
)

headers = self._extra_headers or {}
headers["Accept-Encoding"] = "gzip" # avoid encodings br, sdch
msg_id = self.send_to_chrome(
method="Network.setExtraHTTPHeaders", params={"headers": headers}
self.send_to_chrome(
method="Network.setExtraHTTPHeaders",
params={"headers": headers},
session_id=session_id,
)
if user_agent:
msg_id = self.send_to_chrome(
method="Network.setUserAgentOverride", params={"userAgent": user_agent}
if self._user_agent:
self.send_to_chrome(
method="Network.setUserAgentOverride",
params={"userAgent": self._user_agent},
session_id=session_id,
)
if download_throughput > -1:
if self._download_throughput > -1:
# traffic shaping already used by SPN2 to aid warcprox resilience
# parameter value as bytes/second, or -1 to disable (default)
msg_id = self.send_to_chrome(
self.send_to_chrome(
method="Network.emulateNetworkConditions",
params={"downloadThroughput": download_throughput},
params={"downloadThroughput": self._download_throughput},
session_id=session_id,
)
if stealth:
if self._stealth:
self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template("stealth.js").render()
msg_id = self.send_to_chrome(
method="Page.addScriptToEvaluateOnNewDocument", params={"source": js}
)
self._wait_for(
lambda: self.websock_thread.received_result(msg_id), timeout=10
self.send_to_chrome(
method="Page.addScriptToEvaluateOnNewDocument",
params={"source": js},
session_id=session_id,
)

self.send_to_chrome(
method="Runtime.runIfWaitingForDebugger", session_id=session_id
)

def navigate_to_page(self, page_url, timeout=300):
self.logger.info("navigating to page", page_url=page_url)
self.websock_thread.got_page_load_event = None
Expand Down Expand Up @@ -860,7 +939,6 @@ def url(self, timeout=30):
def run_behavior(self, behavior_script, timeout=900):
self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
params={"expression": behavior_script},
)

Expand All @@ -877,7 +955,6 @@ def run_behavior(self, behavior_script, timeout=900):
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
params={"expression": "umbraBehaviorFinished()"},
)
try:
Expand Down Expand Up @@ -911,7 +988,6 @@ def try_login(self, username, password, timeout=300):
self.websock_thread.got_page_load_event = None
self.send_to_chrome(
method="Runtime.evaluate",
suppress_logging=True,
params={"expression": try_login_js},
)

Expand Down Expand Up @@ -960,12 +1036,15 @@ def try_login(self, username, password, timeout=300):
class Counter:
def __init__(self):
self.next_value = 0
self.lock = threading.Lock()

def __next__(self):
try:
return self.next_value
finally:
self.next_value += 1
with self.lock:
try:
return self.next_value
finally:
self.next_value += 1

def peek(self):
return self.next_value
with self.lock:
return self.next_value
9 changes: 4 additions & 5 deletions brozzler/chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,20 +248,19 @@ def start(
return self._websocket_url(timeout_sec=websocket_timeout)

def _websocket_url(self, timeout_sec=60):
json_url = "http://localhost:%s/json" % self.port
json_url = "http://localhost:%s/json/version" % self.port
url_logger = self.logger.bind(json_url=json_url)
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode("utf-8"))
debug_info = [x for x in all_debug_info if x["url"] == "about:blank"]
debug_info = json.loads(raw_json.decode("utf-8"))

if debug_info and "webSocketDebuggerUrl" in debug_info[0]:
if debug_info and "webSocketDebuggerUrl" in debug_info:
url_logger.debug("webSocketDebuggerUrl returned", raw_json=raw_json)
url = debug_info[0]["webSocketDebuggerUrl"]
url = debug_info["webSocketDebuggerUrl"]
url_logger.info(
"got chrome window websocket debug url",
debug_url=url,
Expand Down