-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebscraper_NoMonitor.py
More file actions
26 lines (25 loc) · 1.44 KB
/
Webscraper_NoMonitor.py
File metadata and controls
26 lines (25 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""
This is a basic Web scraper to get desired content from(WITHOUT MONITORING)
START LINK: http://judis.nic.in/supremecourt/imgst.aspx?filename=1 to
END LINK: http://judis.nic.in/supremecourt/imgst.aspx?filename=44906
This File simply scrape data present on the given Link to the system
in the form a TEXT File for every Judgement.
"""
import os

import requests # install this module by the command: pip install requests in your command line
from bs4 import BeautifulSoup # install this module by the command: pip install beautifulsoup4 in your command line
# Scrape the judgement text from the <textarea id="txtqrydsp"> of every URL
# whose "filename" parameter runs from FIRST_ID to LAST_ID (both inclusive),
# and save each one to its own text file: Data/Data1.txt, Data/Data2.txt, ...
FIRST_ID = 1
LAST_ID = 44906
URL_TEMPLATE = 'http://judis.nic.in/supremecourt/imgst.aspx?filename={}'
OUTPUT_DIR = 'Data'
TEXTAREA_ID = 'txtqrydsp'
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled request hangs the whole run

# open() does not create missing directories, so make sure the folder exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# range() excludes its stop value, hence the +1 so that LAST_ID is fetched too.
for i in range(FIRST_ID, LAST_ID + 1):
    url = URL_TEMPLATE.format(i)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP error pages (4xx/5xx) as failures instead of parsing them.
        response.raise_for_status()
    except requests.RequestException as error:
        # One unreachable page must not abort a run of tens of thousands of pages.
        print('Skipping {}: {}'.format(url, error))
        continue

    # Pass the raw bytes so BeautifulSoup can detect the page encoding itself.
    soup = BeautifulSoup(response.content, 'lxml')
    info = soup.find(id=TEXTAREA_ID)
    if info is None:
        # find() returns None when the page has no such element.
        print('Skipping {}: no element with id "{}"'.format(url, TEXTAREA_ID))
        continue

    # os.path.join picks the right separator on every platform; the old
    # 'Data\Data{}.txt' literal relied on the invalid escape sequence '\D'.
    filename = os.path.join(OUTPUT_DIR, 'Data{}.txt'.format(i))
    # 'w' so that re-running the script replaces a file instead of appending a
    # second copy; explicit UTF-8 so non-ASCII text never fails to encode.
    with open(filename, 'w', encoding='utf-8') as writer:
        writer.write(info.get_text())