Skip to content

Commit c66c6b1

Browse files
committed
feat: capture emoji in youtube scraper
Fixes #6. Emoji were not captured by the YouTube scraper because they are not in a text node; they are in an image element instead. As a result, the scraper has to walk through a mix of text and image elements. Fortunately, the actual emoji text is in the alternative text of the image. This commit creates a private function to iterate over the elements of the comment. Whenever a text or an image element is found, its content is added to an array. The array is joined at the end to build the message.
1 parent b378871 commit c66c6b1

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

browser-extension/src/entrypoints/youtube.content/youtube-post-native-scrapper.ts

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,14 +331,13 @@ export class YoutubePostNativeScrapper {
331331
HTMLElement,
332332
).innerText;
333333

334-
// TODO review content capture to include emojis
335334
const commentTextHandle = selectOrThrow(
336335
commentContainer,
337336
"#content-text",
338337
HTMLElement,
339338
);
340339

341-
const commentText = commentTextHandle.innerText.trim();
340+
const commentText = this.scrapCommentText(commentTextHandle);
342341

343342
const boundingBox = commentContainer.getBoundingClientRect();
344343
const commentPre: CommentPreScreenshot = {
@@ -361,6 +360,7 @@ export class YoutubePostNativeScrapper {
361360
};
362361
return commentPre;
363362
}
363+
364364
private async scrapCommentAuthor(
365365
commentContainer: HTMLElement,
366366
): Promise<Author> {
@@ -378,4 +378,20 @@ export class YoutubePostNativeScrapper {
378378
};
379379
return author;
380380
}
381+
382+
/**
 * Builds the comment message from a comment's text container, emoji included.
 *
 * Emoji are rendered as image elements rather than text, so the subtree is
 * walked in document order and, for each node, either the text node's value
 * or the image's alternative text is collected.
 *
 * @param commentTextHandle - Element whose descendants hold the comment content.
 * @returns The collected content concatenated in document order, with leading
 *   and trailing whitespace removed.
 */
private scrapCommentText(commentTextHandle: HTMLElement): string {
  // Only text and element nodes can contribute content; skip every other
  // node type (comments, processing instructions, ...).
  const iterator = document.createNodeIterator(
    commentTextHandle,
    NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT,
  );
  const fragments: string[] = [];

  for (
    let node = iterator.nextNode();
    node !== null;
    node = iterator.nextNode()
  ) {
    if (node instanceof Text) {
      fragments.push(node.nodeValue ?? "");
    } else if (node instanceof HTMLImageElement) {
      fragments.push(node.alt);
    }
  }

  // Join without a separator: the text nodes already carry the comment's own
  // whitespace, so adding one would insert spaces that are not in the
  // original message (e.g. "Nice😀!" would become "Nice 😀 !").
  return fragments.join("").trim();
}
381397
}

0 commit comments

Comments
 (0)