Skip to content

Commit 6663825

Browse files
committed
behaviors: allow extracting outlinks
Currently, our JavaScript outlink extraction happens purely via our non-configurable extract-outlinks.js script. However, given that many sites can have unpredictable behaviour that we may want to handle specially, it would be great to be able to configure this on a per-site basis. We already have a system for interacting with sites: our behaviour system. If we expand it to also provide outlinks, we give ourselves a much more flexible way to handle complex or special-case websites. This commit extends the behaviour system so that a behaviour can now return a JavaScript object with information about the site. That object should contain at least the "finished" key, a boolean that works like the simple boolean returned by older versions. The object can additionally contain an "outlinks" key which, if present, should be an array of links for brozzler to handle as outlinks. I've retained backwards compatibility by checking whether the returned value is a boolean and, if so, handling it as we did previously.
1 parent 47f5c06 commit 6663825

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

brozzler/browser.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -631,11 +631,14 @@ def browse_page(
631631
):
632632
run_behaviors = False
633633

634+
behavior_outlinks: frozenset[str] = frozenset()
634635
if run_behaviors and behavior_timeout > 0:
635636
behavior_script = brozzler.behavior_script(
636637
page_url, behavior_parameters, behaviors_dir=behaviors_dir
637638
)
638-
self.run_behavior(behavior_script, timeout=behavior_timeout)
639+
behavior_outlinks = self.run_behavior(
640+
behavior_script, timeout=behavior_timeout
641+
)
639642
final_page_url = self.url()
640643
if on_screenshot:
641644
if simpler404:
@@ -653,7 +656,7 @@ def browse_page(
653656
outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout)
654657
if run_behaviors and not skip_visit_hashtags:
655658
self.visit_hashtags(final_page_url, hashtags, outlinks)
656-
return final_page_url, outlinks
659+
return final_page_url, outlinks.union(behavior_outlinks)
657660
except brozzler.ReachedLimit:
658661
# websock_thread has stashed the ReachedLimit exception with
659662
# more information, raise that one
@@ -766,7 +769,7 @@ def navigate_to_page(self, page_url, timeout=300):
766769
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
767770
self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)
768771

769-
def extract_outlinks(self, timeout=60):
772+
def extract_outlinks(self, timeout=60) -> frozenset[str]:
770773
self.logger.info("extracting outlinks")
771774
self.websock_thread.expect_result(self._command_id.peek())
772775
js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
@@ -857,7 +860,7 @@ def url(self, timeout=30):
857860
message = self.websock_thread.pop_result(msg_id)
858861
return message["result"]["result"]["value"]
859862

860-
def run_behavior(self, behavior_script, timeout=900):
863+
def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]:
861864
self.send_to_chrome(
862865
method="Runtime.evaluate",
863866
suppress_logging=True,
@@ -870,7 +873,7 @@ def run_behavior(self, behavior_script, timeout=900):
870873
elapsed = time.time() - start
871874
if elapsed > timeout:
872875
self.logger.info("behavior reached hard timeout", elapsed=elapsed)
873-
return
876+
return frozenset()
874877

875878
brozzler.sleep(check_interval)
876879

@@ -893,11 +896,21 @@ def run_behavior(self, behavior_script, timeout=900):
893896
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
894897
)
895898
and "result" in msg["result"]
896-
and isinstance(msg["result"]["result"]["value"], bool)
897-
and msg["result"]["result"]["value"]
898899
):
899-
self.logger.info("behavior decided it has finished")
900-
return
900+
if (
901+
isinstance(msg["result"]["result"]["value"], bool)
902+
and msg["result"]["result"]["value"]
903+
):
904+
self.logger.info("behavior decided it has finished")
905+
return frozenset()
906+
# new-style response dict that has more than just a finished bool
907+
else:
908+
response = msg["result"]["result"]["value"]
909+
outlinks = frozenset(response.fetch("outlinks", []))
910+
911+
if response["finished"]:
912+
self.logger.info("behavior decided it has finished")
913+
return outlinks
901914
except BrowsingTimeout:
902915
pass
903916

0 commit comments

Comments
 (0)