Skip to content

Commit 93309f1

Browse files
committed
behaviors: optionally collect outlinks while looping
This solves one of the existing limitations of our outlink gathering system: we run it once, after all behaviours have completed. For some interactive pages, particularly single-page apps with paginated data, this means that we'll completely miss that content, since it won't be in the DOM anymore by the time we get around to it. In a previous PR, #433, I made the outlink gathering logic reusable by ensuring it's possible for us to dynamically call the outlink gathering function at any time. In addition, in #429, I made it possible for behaviours to return outlinks to Brozzler; if the behaviour chooses to return outlinks, then Brozzler will add them to the set it extracts after behaviours complete. This branch uses both of those by introducing new functionality in behaviours. We always inject the outlink gathering code before running behaviours, so they can now run it at will. It also introduces two new behaviour parameters: * `extractOutlinks` - If set to `true`, then the behaviour script will call the outlink gathering logic and return any outlinks it harvested to Brozzler. Defaults to `false`. * `extractOutlinksInLoop` - If set to `true` (and `extractOutlinks` is also `true`), then the behaviour script will gather outlinks on every iteration of the loop instead of once after looping. This combines well with `repeatSameElement`, since it means the behaviour script can click a `next` pagination button and then immediately gather whatever new outlinks have appeared on the page. Defaults to `false`.
1 parent 00edb56 commit 93309f1

File tree

2 files changed

+32
-4
lines changed

2 files changed

+32
-4
lines changed

brozzler/browser.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -769,8 +769,7 @@ def navigate_to_page(self, page_url, timeout=300):
769769
self.send_to_chrome(method="Page.navigate", params={"url": page_url})
770770
self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout)
771771

772-
def extract_outlinks(self, timeout=60) -> frozenset[str]:
773-
self.logger.info("extracting outlinks")
772+
def inject_outlink_extractor(self, timeout=60):
774773
self.websock_thread.expect_result(self._command_id.peek())
775774
js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render()
776775
# This defines the method but doesn't extract outlinks yet
@@ -782,8 +781,14 @@ def extract_outlinks(self, timeout=60) -> frozenset[str]:
782781
self._wait_for(
783782
lambda: self.websock_thread.received_result(msg_id), timeout=timeout
784783
)
785-
self.websock_thread.expect_result(self._command_id.peek())
786784

785+
def extract_outlinks(self, timeout=60) -> frozenset[str]:
786+
self.logger.info("extracting outlinks")
787+
788+
# Define the outlink extractor first before extracting
789+
self.inject_outlink_extractor(timeout=timeout)
790+
791+
self.websock_thread.expect_result(self._command_id.peek())
787792
# Now we actually do outlink extraction
788793
msg_id = self.send_to_chrome(
789794
method="Runtime.evaluate",
@@ -879,6 +884,9 @@ def url(self, timeout=30):
879884
return message["result"]["result"]["value"]
880885

881886
def run_behavior(self, behavior_script, timeout=900) -> frozenset[str]:
887+
# Inject the outlink extractor so it's available to behaviors
888+
self.inject_outlink_extractor(timeout=timeout)
889+
882890
self.send_to_chrome(
883891
method="Runtime.evaluate",
884892
suppress_logging=True,

brozzler/js-templates/umbraBehavior.js.j2

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class UmbraBehavior {
2727
this.intervalId = null;
2828
this.intervalTimeMs = {{interval or 300}};
2929
this.index = 0;
30+
this.outlinks = [];
3031
}
3132

3233
simpleIntervalFunc() {
@@ -43,6 +44,8 @@ class UmbraBehavior {
4344
var didSomething = false;
4445
var somethingLeftAbove = false;
4546
var somethingLeftBelow = false;
47+
var extractOutlinks = this.actions[k].extractOutlinks ? this.actions[k].extractOutlinks : false;
48+
var extractOutlinksInLoop = this.actions[k].extractOutlinksInLoop ? this.actions[k].extractOutlinksInLoop : false;
4649

4750
var documents = [];
4851
documents[0] = document;
@@ -121,6 +124,12 @@ class UmbraBehavior {
121124
childSelectors = documents[j].querySelectorAll(childSelector);
122125
}
123126
}
127+
// If we were asked to extract outlinks within the loop,
128+
// repeat this every time we iterate
129+
if (extractOutlinks && extractOutlinksInLoop) {
130+
this.outlinks = this.outlinks.concat(__brzl_compileOutlinks(window));
131+
}
132+
124133
didSomething = true;
125134
break;
126135
} else if (where > 0) {
@@ -136,6 +145,12 @@ class UmbraBehavior {
136145
}
137146
}
138147

148+
// If we were asked not to extract outlinks within the loop,
149+
// do this once after we've finished looping
150+
if (extractOutlinks && !extractOutlinksInLoop) {
151+
this.outlinks = this.outlinks.concat(__brzl_compileOutlinks(window));
152+
}
153+
139154
if (!didSomething) {
140155
if (somethingLeftBelow || ( (window.scrollY + window.innerHeight) < document.documentElement.scrollHeight)) {
141156
window.scrollBy(0, 200);
@@ -221,7 +236,12 @@ var umbraBehavior = new UmbraBehavior( {{actions|json}} );
221236

222237
// Called from outside of this script.
223238
var umbraBehaviorFinished = function() {
224-
return umbraBehavior.isFinished();
239+
let outlinkStrings = umbraBehavior.outlinks.map(el => el.toString());
240+
241+
return {
242+
finished: umbraBehavior.isFinished(),
243+
outlinks: Array.from(outlinkStrings),
244+
};
225245
};
226246

227247
umbraBehavior.start();

0 commit comments

Comments
 (0)