|
1 | 1 | import { |
2 | | - ElementHandle, |
3 | | - Page, |
| 2 | + ElementHandle, |
| 3 | + Page, |
4 | 4 | } from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js"; |
5 | 5 | import { PuppeteerBaseScraper } from "../puppeteer/puppeteer-base-scraper"; |
6 | 6 | import { |
7 | | - type Author, |
8 | | - type Post, |
9 | | - type Comment, |
| 7 | + type Author, |
| 8 | + type Post, |
| 9 | + type Comment, |
10 | 10 | } from "../../../shared/model/post"; |
11 | 11 | import { parseSocialNetworkUrl } from "@/entrypoints/shared/social-network-url"; |
12 | 12 | import { currentIsoDate } from "../utils/current-iso-date"; |
13 | 13 |
|
14 | 14 | //TODO: gérer le scroll et le chargement des commentaires |
15 | 15 | //TODO: gérer le scraping des réponses aux commentaires |
/**
 * Scrapes an Instagram post page through Puppeteer: post author, publication
 * date, caption text and the first-level comments (each with a screenshot).
 *
 * Limitations visible in this implementation:
 * - replies to comments are not scraped (`replies` is always an empty array);
 * - `nbLikes` is always 0
 *   (see https://github.com/dataforgoodfr/14_BalanceTesHaters/issues/4);
 * - comment loading relies on scrolling each comment into view followed by a
 *   fixed 500 ms pause.
 *
 * Every selector is a positional XPath expression, so any change of the page
 * layout breaks the lookup. Mandatory elements are therefore checked
 * explicitly and reported with a descriptive error instead of failing later
 * with an anonymous null dereference.
 */
export class InstagramScraper extends PuppeteerBaseScraper {
  private readonly INSTAGRAM_URL = "https://www.instagram.com/";

  /**
   * Extracts the post identifier from a post URL.
   *
   * @param url URL of the post.
   * @returns The `postId` reported by `parseSocialNetworkUrl`.
   * @throws Error when `parseSocialNetworkUrl` returns a falsy value for `url`.
   */
  extractPostId(url: string): string {
    const parsed = parseSocialNetworkUrl(url);
    if (!parsed) {
      throw new Error(`Unexpected: cannot extract a post id from URL "${url}"`);
    }
    return parsed.postId;
  }

  /**
   * Scrapes the post displayed in `page`.
   *
   * @param tab Browser tab showing the post; its `url` is used as the post URL.
   * @param page Puppeteer page attached to that tab.
   * @returns The scraped post with its first-level comments.
   * @throws Error when the tab has no URL, when the URL cannot be parsed, when
   *   a mandatory element of the page is missing, or when the publication
   *   date of the post is absent or unparsable.
   */
  async doScrapTab(tab: Browser.tabs.Tab, page: Page): Promise<Post> {
    // Validate the cheap inputs first so an unusable tab fails before the
    // slow comment walk instead of after it.
    const url = this.required(tab.url, "the URL of the tab");
    const postId = this.extractPostId(url);

    // //main/div/div/div
    const postFrame = this.required(
      await page.$("::-p-xpath(//main/div/div/div)"),
      "the post frame",
    );
    // <post frame>/div[2]/div
    const commentColumn = this.required(
      await postFrame.$("::-p-xpath(./div[2]/div)"),
      "the comment column",
    );
    // <comment column>/div[2]
    const scrollArea = this.required(
      await commentColumn.$("::-p-xpath(./div[2])"),
      "the scrollable area",
    );
    // <scrollable area>/div/div[1]/div/div[2]/div/span
    const postBlock = this.required(
      await scrollArea.$("::-p-xpath(./div/div[1]/div/div[2]/div/span)"),
      "the post block",
    );
    // <post block>/div/div
    const postHeader = this.required(
      await postBlock.$("::-p-xpath(./div/div)"),
      "the post header",
    );

    // <post header>/span[1]
    const author = await this.extractAuthor(
      this.required(
        await postHeader.$("::-p-xpath(./span[1])"),
        "the author block of the post",
      ),
    );

    // <post header>//time
    const rawPublishedAt = await postHeader.$eval(
      "::-p-xpath(.//time)",
      (node) => node.getAttribute("datetime"),
    );
    // A missing attribute would otherwise become `new Date(null)`, i.e. the
    // 1970 epoch, and be stored as if it were the real publication date.
    const publishedAt = this.required(
      this.parseDate(rawPublishedAt),
      `a valid publication date on the post (datetime="${rawPublishedAt}")`,
    );

    // <post block>/div/span
    const postText =
      (await postBlock.$eval(
        "::-p-xpath(./div/span)",
        (node) => node.textContent,
      )) ?? "";

    // <scrollable area>/div/div[3]
    const commentList = this.required(
      await scrollArea.$("::-p-xpath(./div/div[3])"),
      "the comment list",
    );

    // Walk the comments one sibling at a time: each one is scrolled into view
    // before extraction, then a fixed pause is taken before looking up the
    // next sibling.
    // TODO: handle scrolling and the loading time of comment pages better.
    // TODO: scrape the replies to each comment here.
    const comments: Comment[] = [];
    let commentDiv = await commentList.$("::-p-xpath(./div)");
    while (commentDiv) {
      await commentDiv.scrollIntoView();
      comments.push(await this.extractComment(commentDiv));
      await this.sleep(500);
      commentDiv = await commentDiv.$("::-p-xpath(./following-sibling::*)");
    }

    return {
      postId,
      socialNetwork: "INSTAGRAM",
      url,
      author,
      scrapedAt: new Date().toISOString(),
      publishedAt: publishedAt.toISOString(),
      textContent: postText,
      comments,
    };
  }

  /**
   * Reads an author from an element containing a link to the account.
   *
   * @param container Element whose first descendant `<a>` points to the
   *   account and contains a `<span>` holding the displayed name.
   * @returns The author name and the account URL joined to the Instagram base URL.
   * @throws Error when the link, its `href` attribute or the name text is missing.
   */
  private async extractAuthor(
    container: ElementHandle<Element>,
  ): Promise<Author> {
    const link = this.required(
      await container.$("::-p-xpath(.//a)"),
      "the author link",
    );
    const href = this.required(
      await link.$eval("::-p-xpath(.)", (node) => node.getAttribute("href")),
      "the href attribute of the author link",
    );
    const name = this.required(
      await link.$eval("::-p-xpath(.//span)", (node) => node.textContent),
      "the author name",
    );
    return {
      name,
      accountHref: this.urlJoin(this.INSTAGRAM_URL, href),
    };
  }

  /**
   * Extracts one first-level comment and takes a base64 screenshot of it.
   *
   * @param commentElement Root element of the comment in the comment list.
   * @returns The comment; `publishedAt` is undefined when no usable
   *   `<time datetime>` is found, `replies` is always empty and `nbLikes` is
   *   always 0.
   * @throws Error when a mandatory element of the comment is missing.
   */
  private async extractComment(
    commentElement: ElementHandle<Element>,
  ): Promise<Comment> {
    const wrapper = this.required(
      await commentElement.$("::-p-xpath(./div/div/div[2]/div/div)"),
      "the comment wrapper",
    );
    // Grandparent of the first matching span: its div[1] is handed to the
    // author extraction and its div[2] provides the comment text.
    const body = this.required(
      await wrapper.$("::-p-xpath(.//span[1]/../..)"),
      "the comment body",
    );
    const authorBlock = this.required(
      await body.$("::-p-xpath(div[1])"),
      "the author block of the comment",
    );
    const textBlock = this.required(
      await body.$("::-p-xpath(div[2])"),
      "the text block of the comment",
    );
    const author = await this.extractAuthor(authorBlock);

    // The date is best effort: `$eval` throws when there is no <time>
    // element, and a missing or unparsable value is reported as "no date"
    // rather than aborting the whole scrape.
    let publishedAt: Date | undefined;
    try {
      publishedAt = this.parseDate(
        await body.$eval("::-p-xpath(.//time)", (node) =>
          node.getAttribute("datetime"),
        ),
      );
    } catch (_) {
      publishedAt = undefined;
    }

    const screenshot = await commentElement.screenshot({ encoding: "base64" });
    // Taken right after the screenshot so that it dates the capture.
    const screenshotDate = currentIsoDate();
    const text =
      (await textBlock.$eval("::-p-xpath(.)", (node) => node.textContent)) ??
      "";

    return {
      author,
      textContent: text,
      publishedAt: publishedAt?.toISOString(),
      screenshotData: screenshot,
      scrapedAt: screenshotDate,
      replies: [],
      nbLikes: 0, // See https://github.com/dataforgoodfr/14_BalanceTesHaters/issues/4
    };
  }

  /**
   * Returns `value` when it is neither null nor undefined.
   *
   * @param value Value to check.
   * @param description What the value represents, used in the error message.
   * @throws Error when `value` is null or undefined.
   */
  private required<T>(value: T | null | undefined, description: string): T {
    if (value === null || value === undefined) {
      throw new Error(`Instagram scraper: missing ${description}`);
    }
    return value;
  }

  /**
   * Parses a `datetime` attribute value.
   *
   * @param raw Attribute value, possibly absent.
   * @returns The parsed date, or undefined when `raw` is empty or cannot be
   *   parsed (calling `toISOString()` on an invalid date would throw).
   */
  private parseDate(raw: string | null | undefined): Date | undefined {
    if (!raw) {
      return undefined;
    }
    const parsed = new Date(raw);
    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
  }

  /**
   * Joins `relative` to `base` with exactly one slash between them.
   *
   * @param base Base URL; trailing slashes are ignored.
   * @param relative Path to append, with or without a leading slash.
   */
  private urlJoin(base: string, relative: string): string {
    const baseWithoutTrailingSlash = base.replace(/\/+$/, "");
    if (relative.startsWith("/")) {
      return baseWithoutTrailingSlash + relative;
    } else {
      return baseWithoutTrailingSlash + "/" + relative;
    }
  }
}
0 commit comments