# Source: Web-Scraper/Webscraper_NoMonitor.py (Aadit-Bhojgi/Web-Scraper, master branch, GitHub)
"""
    This is a basic web scraper (WITHOUT MONITORING) that gets the desired content from
    START LINK: http://judis.nic.in/supremecourt/imgst.aspx?filename=1 to
    END LINK: http://judis.nic.in/supremecourt/imgst.aspx?filename=44906
    This file simply scrapes the data present at each link and saves it to the
    system in the form of a separate TEXT file for every judgement.
"""
import os

import requests  # install this module by the command: pip install requests in your command line
from bs4 import BeautifulSoup  # install this module by the command: pip install beautifulsoup4 in your command line

# Scrape the judgement text from the <textarea> of every URL whose ``filename``
# parameter runs from 1 to 44906 (both ends included) and save each one to its
# own text file inside the "Data" folder.

# Create the output folder up front so open() cannot fail with
# FileNotFoundError when the script is run from a fresh directory.
os.makedirs('Data', exist_ok=True)

# range() excludes its stop value, so 44907 is needed to reach filename=44906.
for i in range(1, 44907):
    # URL required
    url = 'http://judis.nic.in/supremecourt/imgst.aspx?filename={}'.format(i)
    try:
        # The timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status() turns HTTP error pages into exceptions.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # One unreachable page must not abort the remaining downloads.
        print('Skipping {}: {}'.format(url, error))
        continue
    # Raw bytes of the page that contains <textarea>....content....</textarea>
    html = response.content
    # Parse the page into a navigable tree
    soup = BeautifulSoup(html, 'lxml')
    # The desired content lives in the textarea whose id is "txtqrydsp"
    info = soup.find(id="txtqrydsp")
    if info is None:
        # find() returns None when the element is absent; nothing to save.
        print('Skipping {}: no element with id "txtqrydsp"'.format(url))
        continue
    # One text file per URL: Data/Data1.txt, Data/Data2.txt, ... (the value of i).
    # os.path.join picks the right separator on every platform and avoids the
    # invalid "\D" escape sequence of a hand-written backslash path.
    filename = os.path.join('Data', 'Data{}.txt'.format(i))
    # Explicit UTF-8 so non-ASCII characters do not depend on the platform's
    # default encoding.
    # NOTE(review): append mode is kept from the original; re-running the
    # script therefore adds the text to an existing file again - confirm
    # whether overwrite ('w') is the intended behaviour.
    with open(filename, 'a', encoding='utf-8') as writer:
        # get_text() yields plain text even if the textarea has nested tags,
        # whereas iterating the tag directly could hand non-strings to the file.
        writer.write(info.get_text())