# Source: TripAdvisor.fr-Scraper/TripAdvisor FR.py (BadriAhmed/TripAdvisor.fr-Scraper, master branch, GitHub)
#!/usr/bin/env python
# coding: utf-8

# ## Scraping TripAdvisor FR Reviews

# In[8]:


#import the libraries as needed
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings("ignore")


# In[3]:


# Read the scraping plan: one row per listing, giving its URL ('Url') and the
# number of comments to collect from it ('NbCmnts').
toscrape = pd.read_csv('toscrape.csv')


# In[4]:


# Unpack the plan into two parallel lists: urls[k] is scraped for Nbr[k] comments.
targets = [(str(record['Url']), int(record['NbCmnts']))
           for _, record in toscrape.iterrows()]
urls = [link for link, _ in targets]
Nbr = [count for _, count in targets]


# In[6]:


# Accumulators filled side by side by the scraping loop: reviews[k] is a
# review's text and ratelist[k] is that same review's rating.
reviews, ratelist = [], []


# In[9]:


# Scrape every listed URL. For each one, a Chrome session (with image loading
# disabled) walks the paginated review listing in steps of ten, clicks the
# 'More' links to expand truncated reviews, and appends each review's text and
# bubble rating to `reviews` / `ratelist`. The two lists always grow together,
# so index k in one matches index k in the other.
for j, (url, Nb) in enumerate(zip(urls, Nbr)):
    # Use Chromedriver to open web pages without downloading images.
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    try:
        # One iteration per results page; `i` is the offset of the page's first review.
        for i in range(0, Nb, 10):
            # Derive the page URL from the untouched base URL every time.
            # Rewriting `url` in place would stack offsets on later pages
            # ('Review-or10-or0...') because the rewritten URL still contains 'Review'.
            # NOTE(review): this still rewrites every occurrence of 'Review' in
            # the URL, as the script always did - confirm that matches the
            # site's pagination scheme.
            page_url = url.replace('Review', 'Review-or' + str(i))
            browser.get(page_url)
            # Fixed pause to let the page finish loading before interacting with it.
            time.sleep(5)
            element_list = browser.find_elements_by_xpath("//span[@class='taLnk ulBlueLinks']")
            # Click all of the 'More' links. Clicking is best-effort: a link that
            # cannot be clicked is skipped so the rest of the page is still scraped.
            # `Exception` (not a bare except) keeps Ctrl-C able to stop the run.
            for e in element_list:
                try:
                    e.click()
                except Exception:
                    pass
            # Parse the DOM the browser is holding rather than downloading the
            # page a second time: only the live DOM reflects the clicks above.
            # The parser is named explicitly so results do not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            # Each 'reviewSelector' div is one review container holding the
            # rating bubble and the review text.
            for r in soup.find_all('div', 'reviewSelector'):
                bubble = r.find('span', 'ui_bubble_rating')
                paragraph = r.p
                # Skip containers missing either piece so one odd container
                # cannot abort the run or leave the two lists misaligned.
                if bubble is None or paragraph is None:
                    continue
                # The second CSS class has the form '<prefix>_<NN>'; NN / 10 is the rating.
                rating = int(bubble['class'][1].split('_')[1]) / 10
                reviews.append(paragraph.text)
                ratelist.append(rating)
            print(i+10,' Comments have been collected')
    finally:
        # Always release the browser, even when a page raised part-way through.
        browser.quit()
    print("Url nbr ",j+1,' Scraped successfully')
print("Finished!")


# In[8]:


# Assemble the scraped results into a two-column table: the review text under
# 'Comment' and its rating under 'Rate'.
columns = {'Comment': reviews, 'Rate': ratelist}
data = pd.DataFrame(columns)


# In[9]:


# Preview the first rows (the result is only displayed when run as a notebook cell).
data.head()


# In[ ]: