Skip to content

Commit 2be6128

Browse files
authored
Merge pull request #26 from algo7/fix/comment_date_of_stay_not_always_there
Fix/comment date of stay not always there
2 parents 7d3278f + 47bf8fa commit 2be6128

File tree

3 files changed

+47
-13
lines changed

3 files changed

+47
-13
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ latest: Pulling from algo7/tripadvisor-review-scraper/scrap
7676
## Known Issues
7777
1. The hotel scraper works for English reviews only.
7878
2. The restaurant scraper can only scrape English reviews or French reviews.
79+
3. The hotel scraper uses date of review instead of date of stay as the date because the date of stay is not always available.
7980

8081
# Container Provisioner
8182
Container Provisioner is a tool written in [Go](https://go.dev/) that provides a UI for the users to interact with the scraper. It uses [Docker API](https://docs.docker.com/engine/api/) to provision the containers and run the scraper. The UI is written in raw HTML and JavaScript while the backend web framework is [Fiber](https://docs.gofiber.io/).

libs/utils.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,26 @@ const monthStringToNumber = (monthString) => {
4646
return 10;
4747
case 'November':
4848
return 11;
49+
case 'Jan':
50+
return 1;
51+
case 'Feb':
52+
return 2;
53+
case 'Mar':
54+
return 3;
55+
case 'Apr':
56+
return 4;
57+
case 'Jun':
58+
return 6;
59+
case 'Jul':
60+
return 7;
61+
case 'Aug':
62+
return 8;
63+
case 'Sep':
64+
return 9;
65+
case 'Oct':
66+
return 10;
67+
case 'Nov':
68+
return 11;
4969
default:
5070
return 12;
5171
}

scrapers/hotel.js

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -193,24 +193,38 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot
193193
});
194194

195195
// Extract date of review (the date of stay is not always available)
196-
const commentDateOfStay = await page.evaluate(async () => {
196+
const commentDateOfReview = await page.evaluate(async () => {
197197

198-
const commentDateOfStayBlocks = document.getElementsByClassName('teHYY')
198+
// const commentDateOfStayBlocks = document.getElementsByClassName('teHYY')
199+
const commentDateBlocks = document.getElementsByClassName("cRVSd")
199200

200-
const dates = [];
201+
// const datesOfStay = [];
202+
const datesOfReview = [];
201203

202-
for (let index = 0; index < commentDateOfStayBlocks.length; index++) {
203204

204-
// Split the date of stay text block into an array
205-
const splitted = commentDateOfStayBlocks[index].innerText.split(' ')
205+
// for (let index = 0; index < commentDateOfStayBlocks.length; index++) {
206206

207-
dates.push({
208-
month: splitted[3],
209-
year: splitted[4],
207+
// // Split the date of stay text block into an array
208+
// const splitted = commentDateOfStayBlocks[index].innerText.split(' ')
209+
210+
// datesOfStay.push({
211+
// month: splitted[3],
212+
// year: splitted[4],
213+
// });
214+
// }
215+
216+
for (let index = 0; index < commentDateBlocks.length; index++) {
217+
218+
// Split the date of comment text block into an array
219+
const splitted = commentDateBlocks[index].children[0].innerText.split('review').pop().split(' ')
220+
221+
datesOfReview.push({
222+
month: splitted[1],
223+
year: splitted[2],
210224
});
211225
}
212226

213-
return dates;
227+
return datesOfReview;
214228
});
215229

216230
// Extract comments text
@@ -230,12 +244,11 @@ const scrape = async (totalReviewCount, reviewPageUrls, position, hotelName, hot
230244

231245
// Format (for CSV processing) the reviews so each review of each page is in an object
232246
const formatted = commentContent.map((comment, index) => {
233-
234247
return {
235248
title: commentTitle[index],
236249
content: comment,
237-
month: monthStringToNumber(commentDateOfStay[index].month),
238-
year: commentDateOfStay[index].year,
250+
month: monthStringToNumber(commentDateOfReview[index].month),
251+
year: commentDateOfReview[index].year,
239252
rating: commentRatingStringToNumber(commentRating[index]),
240253
};
241254
});

0 commit comments

Comments
 (0)