-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb-top-movies.py
84 lines (62 loc) · 3.18 KB
/
imdb-top-movies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import requests
import time
import csv
import random
import concurrent.futures
from bs4 import BeautifulSoup
# Upper bound on the number of worker threads used to fetch movie pages.
MAX_THREADS = 20

# HTTP headers passed to every requests.get call in this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
}
def extract_movie_details(movie_link, *, timeout=10, output_path='movies.csv'):
    """Fetch one movie page and append its details to a CSV file.

    The title, release date, rating and plot are scraped from the page at
    ``movie_link``. A row is printed and appended to ``output_path`` only
    when all four values were found and are non-empty; otherwise the call
    returns without opening the file.

    Args:
        movie_link: URL of the movie page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the calling thread forever.
        output_path: CSV file the row is appended to.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        OSError: If ``output_path`` cannot be opened for appending.
    """
    # Random pause of up to 0.2 s before each request.
    time.sleep(random.uniform(0, 0.2))
    response = requests.get(movie_link, headers=headers, timeout=timeout)
    movie_soup = BeautifulSoup(response.content, 'html.parser')

    # The title and release date are read from the second direct <div> child
    # of the first 'ipc-page-section' section; without it no row is complete.
    page_section = movie_soup.find('section', attrs={'class': 'ipc-page-section'})
    if page_section is None:
        return
    divs = page_section.find_all('div', recursive=False)
    if len(divs) < 2:
        return
    target_div = divs[1]

    # An <h1> without a nested <span> yields no title instead of raising.
    title_tag = target_div.find('h1')
    title_span = title_tag.find('span') if title_tag else None
    title = title_span.get_text() if title_span is not None else None

    # The release date is the text of the first link pointing at 'releaseinfo'.
    date_tag = target_div.find('a', href=lambda href: href and 'releaseinfo' in href)
    date = date_tag.get_text().strip() if date_tag else None

    rating_tag = movie_soup.find('div', attrs={'data-testid': 'hero-rating-bar__aggregate-rating__score'})
    rating = rating_tag.get_text() if rating_tag else None

    plot_tag = movie_soup.find('span', attrs={'data-testid': 'plot-xs_to_m'})
    plot_text = plot_tag.get_text().strip() if plot_tag else None

    row = [title, date, rating, plot_text]
    if not all(row):
        # Incomplete record: skip it rather than writing a partial row.
        return

    with open(output_path, mode='a', newline='', encoding='utf-8') as file:
        movie_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        print(title, date, rating, plot_text)
        movie_writer.writerow(row)
def extract_movies(soup):
    """Collect movie links from a chart page and scrape each one in a thread pool.

    Args:
        soup: Parsed chart page. It is searched for the ``div`` with
            ``data-testid="chart-layout-main-column"`` and the first ``ul``
            inside it; the first link of every ``li`` is treated as a
            movie page.

    Returns:
        None. Each link is handed to ``extract_movie_details``; a link whose
        worker raised is reported with ``print`` and does not stop the others.
    """
    chart_column = soup.find('div', attrs={'data-testid': 'chart-layout-main-column'})
    movies_table = chart_column.find('ul') if chart_column is not None else None
    if movies_table is None:
        print('No movies found on the chart page')
        return

    movie_links = []
    for movie in movies_table.find_all('li'):
        anchor = movie.find('a')
        # Skip list items that carry no usable link instead of raising.
        if anchor is not None and anchor.get('href'):
            movie_links.append('https://imdb.com' + anchor['href'])

    if not movie_links:
        # ThreadPoolExecutor rejects max_workers=0, so stop before creating it.
        print('No movies found on the chart page')
        return

    threads = min(MAX_THREADS, len(movie_links))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {
            executor.submit(extract_movie_details, link): link
            for link in movie_links
        }
        # Inspect every future so worker exceptions are not silently dropped.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print('Failed to scrape', pending[future], '-', error)
def main():
    """Scrape the IMDb Most Popular Movies chart and print the elapsed time.

    Raises:
        requests.exceptions.RequestException: If the chart page cannot be
            fetched, times out, or is answered with an HTTP error status.
    """
    # perf_counter is monotonic, so the measured duration is unaffected by
    # system clock adjustments during the run.
    start_time = time.perf_counter()

    # IMDB Most Popular Movies - 100 movies
    popular_movies_url = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
    response = requests.get(popular_movies_url, headers=headers, timeout=10)
    # Fail with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract and store the details of every movie listed on the chart.
    extract_movies(soup)

    end_time = time.perf_counter()
    print('Total time taken: ', end_time - start_time)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()