Skip to content

Commit 7cf23f9

Browse files
committed
behaviors: allow extracting outlinks
Currently, our JavaScript outlink extraction happens purely via our non-configurable extract-outlinks.js script. However, given that many sites can have unpredictable behaviour that we may want special handling for, it would be great to let us configure this on a per-site basis. We already have a per-site mechanism for interacting with sites: our behaviour system. If we expand this to also provide outlinks, we can give ourselves a much more flexible system to handle complex or special-case websites. This extends the behaviour system so that a behaviour can now return a JavaScript object with information about the site. That object should contain at least the "finished" key, which is a boolean that works like the simple boolean returned by older versions. The object can additionally contain an "outlinks" key which, if present, should be an array of links for brozzler to handle as outlinks. I've retained backwards compatibility by checking to see if the returned value is a boolean and handling it like we did previously.
1 parent 33fffdf commit 7cf23f9

File tree

1 file changed

+24
-10
lines changed

1 file changed

+24
-10
lines changed

brozzler/browser.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -631,11 +631,14 @@ def browse_page(
631631
):
632632
run_behaviors = False
633633

634+
behavior_outlinks: frozenset[str] = frozenset()
634635
if run_behaviors and behavior_timeout > 0:
635636
behavior_script = brozzler.behavior_script(
636637
page_url, behavior_parameters, behaviors_dir=behaviors_dir
637638
)
638-
self.run_behavior(behavior_script, timeout=behavior_timeout)
639+
behavior_outlinks = self.run_behavior(
640+
behavior_script, timeout=behavior_timeout
641+
)
639642
final_page_url = self.url()
640643
if on_screenshot:
641644
if simpler404:
@@ -653,7 +656,7 @@ def browse_page(
653656
outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
654657
if run_behaviors and not skip_visit_hashtags:
655658
self.visit_hashtags(final_page_url, hashtags, outlinks)
656-
return final_page_url, outlinks
659+
return final_page_url, outlinks.union(behavior_outlinks)
657660
except brozzler.ReachedLimit:
658661
# websock_thread has stashed the ReachedLimit exception with
659662
# more information, raise that one
@@ -766,7 +769,7 @@ def navigate_to_page(self, page_url, timeout=300):
766769
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
767770
self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)
768771

769-
def extract_outlinks(self, timeout=60):
772+
def extract_outlinks(self, timeout=60) -> frozenset[str]:
770773
self.logger.info("extracting outlinks")
771774
self.websock_thread.expect_result(self._command_id.peek())
772775
js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
@@ -875,7 +878,7 @@ def url(self, timeout=30):
875878
message = self.websock_thread.pop_result(msg_id)
876879
return message["result"]["result"]["value"]
877880

878-
def run_behavior(self, behavior_script, timeout=900):
881+
def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]:
879882
self.send_to_chrome(
880883
method="Runtime.evaluate",
881884
suppress_logging=True,
@@ -888,15 +891,19 @@ def run_behavior(self, behavior_script, timeout=900):
888891
elapsed = time.time() - start
889892
if elapsed > timeout:
890893
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
891-
return
894+
return frozenset()
892895

893896
brozzler.sleep(check_interval)
894897

895898
self.websock_thread.expect_result(self._command_id.peek())
896899
msg_id = self.send_to_chrome(
897900
method="Runtime.evaluate",
898901
suppress_logging=True,
899-
params={"expression": "umbraBehaviorFinished()"},
902+
params={
903+
"expression": "umbraBehaviorFinished()",
904+
# returnByValue ensures we can return more complicated types like dicts
905+
"returnByValue": True,
906+
},
900907
)
901908
try:
902909
self._wait_for(
@@ -911,11 +918,18 @@ def run_behavior(self, behavior_script, timeout=900):
911918
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
912919
)
913920
and "result" in msg["result"]
914-
and isinstance(msg["result"]["result"]["value"], bool)
915-
and msg["result"]["result"]["value"]
916921
):
917-
self.logger.info("behavior decided it has finished")
918-
return
922+
if isinstance(msg["result"]["result"]["value"], bool):
923+
if msg["result"]["result"]["value"]:
924+
self.logger.info("behavior decided it has finished")
925+
return frozenset()
926+
# new-style response dict that has more than just a finished bool
927+
elif isinstance(msg["result"]["result"]["value"], dict):
928+
response = msg["result"]["result"]["value"]
929+
if response["finished"]:
930+
self.logger.info("behavior decided it has finished")
931+
outlinks = frozenset(response.get("outlinks", []))
932+
return outlinks
919933
except BrowsingTimeout:
920934
pass
921935

0 commit comments

Comments
 (0)