
Commit 2e3bbac

feat: sample storage and report page

1 parent bc1c310

14 files changed: 290 additions, 115 deletions

browser-extension/entrypoints/background/index.ts

Lines changed: 4 additions & 10 deletions
@@ -1,3 +1,4 @@
+import { storePost } from "../shared/storage/posts-storage";
 import { getCurrentTab } from "../shared/utils/getCurrentTab";
 import { scrapTab as scrapPostFromTab } from "./scraping/scrap-tab";

@@ -24,16 +25,9 @@ async function scrapActiveTab() {
   const tab = await getCurrentTab();

   if (tab) {
+    console.log("Scraping post from active tab");
     const socialNetworkPost = await scrapPostFromTab(tab);
-
-    if (socialNetworkPost.comments.length > 0) {
-      const screenshotDataUrl: string =
-        socialNetworkPost.comments[0].screenshotDataUrl;
-      browser.downloads.download({
-        url: screenshotDataUrl, // The object URL can be used as download URL
-        filename: "screenshot.png",
-        //...
-      });
-    }
+    console.log("Storing post to local storage");
+    await storePost(socialNetworkPost);
   }
 }
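
The posts-storage module imported above is one of the 14 files in this commit, but its hunk is not rendered on this page. As a rough sketch only, assuming it persists posts through the WebExtension storage API (the key scheme and the loadPosts helper are my own illustration, not the actual code):

// shared/storage/posts-storage.ts (hypothetical sketch, not the actual file)
import type { Post } from "../model/post";

// Persist a scraped post under a key derived from its network and post id.
export async function storePost(post: Post): Promise<void> {
  const key = `post:${post.socialNetwork}:${post.postId}`;
  await browser.storage.local.set({ [key]: post });
}

// Load every stored post, e.g. for a report page to render.
export async function loadPosts(): Promise<Post[]> {
  const all = await browser.storage.local.get(null);
  return Object.entries(all)
    .filter(([key]) => key.startsWith("post:"))
    .map(([, value]) => value as Post);
}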

browser-extension/entrypoints/background/model.ts

Lines changed: 0 additions & 20 deletions
This file was deleted.

browser-extension/entrypoints/background/scraping/base-scraper.ts

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ import {
   ExtensionTransport,
   Browser as PuppeteerBrowser,
 } from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
-import type { Post } from "../model";
+import type { Post } from "../../shared/model/post";

 export abstract class BaseScraper {
   browser?: PuppeteerBrowser;
browser-extension/entrypoints/background/scraping/create-scraper.ts

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,13 @@
-import { SocialNetworkName } from "@/entrypoints/shared/social-network-url";
+import { SocialNetworkName } from "@/entrypoints/shared/model/social-network-name";
 import { BaseScraper } from "./base-scraper";
 import { InstagramScraper } from "./instagram/instagram-scraper";
 import { YoutubeScraper } from "./youtube/youtube-scraper";

 export function createScraper(sn: SocialNetworkName): BaseScraper {
   switch (sn) {
-    case "youtube":
+    case "YOUTUBE":
       return new YoutubeScraper();
-    case "instagram":
+    case "INSTAGRAM":
       return new InstagramScraper();
   }
 }
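
The SocialNetworkName type now comes from a dedicated model file and the switch matches uppercase literals. A plausible shape for that file, inferred only from the cases above since its hunk is not shown on this page:

// shared/model/social-network-name.ts (hypothetical sketch, inferred from the switch cases)
export type SocialNetworkName = "YOUTUBE" | "INSTAGRAM";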

browser-extension/entrypoints/background/scraping/instagram/instagram-scraper.ts

Lines changed: 32 additions & 7 deletions
@@ -1,12 +1,26 @@
 import { ElementHandle } from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
 import { BaseScraper } from "../base-scraper";
-import { type Autor, type Post, type Comment } from "../../model";
+import {
+  type Author,
+  type Post,
+  type Comment,
+} from "../../../shared/model/post";
+import { parseSocialNetworkUrl } from "@/entrypoints/shared/social-network-url";
+import { currentIsoDate } from "../utils/current-iso-date";

 //TODO: handle scrolling and loading more comments
 //TODO: handle scraping of replies to comments
 export class InstagramScraper extends BaseScraper {
   private INSTAGRAM_URL = "https://www.instagram.com/";

+  extractPostId(url: string): string {
+    const parsed = parseSocialNetworkUrl(url);
+    if (!parsed) {
+      throw new Error("Unexpected");
+    }
+    return parsed.postId;
+  }
+
   async scrapTab(tab: Browser.tabs.Tab): Promise<Post> {
     const page = await this.getBrowserPageFromTab(tab);

@@ -64,18 +78,22 @@ export class InstagramScraper extends BaseScraper {
       );
       // handle scraping of comment replies here
     }
+
     return {
+      postId: this.extractPostId(tab.url!),
+      socialNetwork: "INSTAGRAM",
       url: tab.url!,
       author: auteur,
-      publishedAt: new Date(date_publication),
+      scrapTimestamp: new Date().toISOString(),
+      publishedAt: new Date(date_publication).toISOString(),
       text: texte_publication,
       comments: await Promise.all(commentaires),
     };
   }

   private async get_auteur_from_span(
     span_element: ElementHandle<Element>
-  ): Promise<Autor> {
+  ): Promise<Author> {
     const auteur_elem = (await span_element.$("::-p-xpath(.//a)"))!;
     const auteur_href = (await auteur_elem.$eval("::-p-xpath(.)", (node) =>
       node.getAttribute("href")

@@ -86,7 +104,7 @@ export class InstagramScraper extends BaseScraper {
     ))!;
     return {
       name: auteur_name,
-      accountHref: this.urlJoin(this.INSTAGRAM_URL, auteur_href),
+      accountUrl: this.urlJoin(this.INSTAGRAM_URL, auteur_href),
     };
   }

@@ -109,12 +127,19 @@ export class InstagramScraper extends BaseScraper {
     } catch (_) {
       date_commentaire = undefined;
     }
+
     const screenshot = await comment_element.screenshot({ encoding: "base64" });
+    const screenshotDate = currentIsoDate();
     return {
-      autor: auteur,
-      text: await base_1.$eval("::-p-xpath(.)", (node) => node.textContent!)!,
-      commentDate: date_commentaire,
+      author: auteur,
+      commentText: await base_1.$eval(
+        "::-p-xpath(.)",
+        (node) => node.textContent!
+      )!,
+      commentDate: date_commentaire?.toISOString(),
+      commentDateRelative: false,
       screenshotDataUrl: screenshot,
+      screenshotDate,
       replies: [],
     };
   }
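
Both scrapers now build the shared Post model from shared/model/post.ts, whose hunk is not rendered on this page. Inferred purely from the fields the two scrapers populate (optionality is a guess wherever one scraper omits a field), it could look roughly like this:

// shared/model/post.ts (hypothetical reconstruction, inferred from usage in the scrapers)
import type { SocialNetworkName } from "./social-network-name";

export interface Author {
  name: string;
  accountUrl?: string; // YouTube comment authors currently lack an account URL
}

export interface Comment {
  author: Author;
  commentText?: string;
  commentDate?: string; // ISO-8601 timestamp
  commentDateRelative?: boolean;
  screenshotDataUrl: string;
  screenshotDate: string; // ISO-8601 timestamp
  replies: Comment[];
}

export interface Post {
  postId: string;
  socialNetwork: SocialNetworkName;
  scrapTimestamp: string; // ISO-8601 timestamp
  url: string;
  author: Author;
  publishedAt?: string; // ISO-8601 timestamp
  publishedAtRelative?: boolean;
  text?: string;
  comments: Comment[];
}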

browser-extension/entrypoints/background/scraping/scrap-tab.ts

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 import { parseSocialNetworkUrl } from "../../shared/social-network-url";
-import { Post } from "../model";
+import { Post } from "../../shared/model/post";
 import { createScraper } from "./create-scraper";

 export async function scrapTab(
browser-extension/entrypoints/background/scraping/utils/current-iso-date.ts

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+export function currentIsoDate(): string {
+  return new Date(Date.now()).toISOString();
+}

browser-extension/entrypoints/background/scraping/youtube/youtube-scraper.ts

Lines changed: 91 additions & 22 deletions
@@ -1,11 +1,77 @@
-import { ElementHandle } from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
+import {
+  Page,
+  ElementHandle,
+} from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
 import { BaseScraper } from "../base-scraper";
-import { type Post, type Comment } from "../../model";
+import { type Post, type Comment, Author } from "../../../shared/model/post";
+import { parseSocialNetworkUrl } from "@/entrypoints/shared/social-network-url";
+import { currentIsoDate } from "../utils/current-iso-date";

 export class YoutubeScraper extends BaseScraper {
+  extractPostId(url: string): string {
+    const parsed = parseSocialNetworkUrl(url);
+    if (!parsed) {
+      throw new Error("Unexpected");
+    }
+    return parsed.postId;
+  }
+
   async scrapTab(tab: Browser.tabs.Tab): Promise<Post> {
+    const postUrl = tab.url!;
+    const postId = this.extractPostId(postUrl);
     const page = await this.getBrowserPageFromTab(tab);
+    const scrapTimestamp = currentIsoDate();
+
+    const author = await this.scrapPostAuthor(page);
+    const { publishedAt, publishedAtRelative, text } =
+      await this.scrapPostPublishedAtAndText(page);
+    const comments: Comment[] = await this.scrapPostComments(page);
+
+    return {
+      postId: postId,
+      socialNetwork: "YOUTUBE",
+      scrapTimestamp: scrapTimestamp,
+
+      url: postUrl,
+      author: author,
+      publishedAt: publishedAt,
+      publishedAtRelative: publishedAtRelative,
+      text: text,
+      comments: comments,
+    };
+  }
+
+  private async scrapPostPublishedAtAndText(postPage: Page): Promise<{
+    publishedAt: string | undefined;
+    publishedAtRelative: boolean | undefined;
+    text: string;
+  }> {
+    const snippetText = await innertText(
+      (await postPage.$("#description #snippet-text"))!
+    );
+    const publishedAt =
+      (await ariaLabel((await postPage.$("#description #date-text"))!)) ??
+      undefined;
+    return {
+      publishedAt: publishedAt,
+      publishedAtRelative: true,
+      text: snippetText,
+    };
+  }
+
+  private async scrapPostAuthor(postPage: Page): Promise<Author> {
+    const postOwnerEl = (await postPage.$("#owner"))!;
+    const channelNameEl = (await postOwnerEl.$("#channel-name"))!;
+    const link = (await channelNameEl.$("a"))!;
+    const channelName = await innertText(link);
+    const channelUrl = await anchorHref(link);
+    return {
+      name: channelName,
+      accountUrl: channelUrl,
+    };
+  }

+  private async scrapPostComments(page: Page): Promise<Comment[]> {
     const commentsSectionHandle: ElementHandle = (await page.$("#comments"))!;
     commentsSectionHandle.scrollIntoView();

@@ -21,7 +87,6 @@ export class YoutubeScraper extends BaseScraper {
     // await loadAllTopLevelComments(commentsSectionHandle);
     // await expandReplies(commentsSectionHandle)
     // await expandLongComments(commentsSectionHandle)
-
     await commentsSectionHandle.waitForSelector("#comment-container");
     const commentContainers = await commentsSectionHandle.$$(
       "#comment-container"

@@ -43,40 +108,30 @@ export class YoutubeScraper extends BaseScraper {
            commentTextHandle
          )
        )?.trim();
-        const screenshotData = this.uintArraySCreenshotToBase64Url(
+        const screenshotData = this.uintArrayScreenshotToBase64Url(
          await commentContainer.screenshot()
        );
+        const screenshotDate = currentIsoDate();
+
        const comment: Comment = {
-          autor: {
+          author: {
            name: commentAuthor,
            // TODO extract href
          },
-          text: commentText,
+          commentText: commentText,
          screenshotDataUrl: screenshotData,
-          commentDate: new Date(Date.now()),
+          screenshotDate,
+          // TODO extract comment relative date
          // TODO capture replies
          replies: [],
        };
        return comment;
      })
    );
-
-    console.log("comments", comments);
-
-    return {
-      url: tab.url!,
-      author: {
-        // TODO capture post author
-        name: "Unknown",
-        accountHref: "",
-      },
-      // TODO capture post text
-      publishedAt: new Date(Date.now()),
-      comments: comments,
-    };
+    return comments;
   }

-  private uintArraySCreenshotToBase64Url(pngData: Uint8Array): string {
+  private uintArrayScreenshotToBase64Url(pngData: Uint8Array): string {
     let binary = "";
     const len = pngData.byteLength;
     for (let i = 0; i < len; i++) {

@@ -86,3 +141,17 @@ export class YoutubeScraper extends BaseScraper {
     return "data:image/png;base64," + base64;
   }
 }
+
+async function innertText(element: ElementHandle): Promise<string> {
+  return await element.evaluate((e) => (e as HTMLElement).innerText, element);
+}
+
+async function anchorHref(
+  element: ElementHandle<HTMLAnchorElement>
+): Promise<string> {
+  return await element.evaluate((e) => e.href, element);
+}
+
+async function ariaLabel(element: ElementHandle): Promise<string | null> {
+  return await element.evaluate((e) => (e as HTMLElement).ariaLabel, element);
+}
