-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtmlfilesfromPatentids.py
30 lines (26 loc) · 1.05 KB
/
htmlfilesfromPatentids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import urllib.request
import time
#Open the file to read the patent ids to get source code of these pages
with open('C:/Users/Desktop/patent-ids.txt', 'r') as infile:
data = infile.read() # Read the contents of the file into memory.
# Return a list of the lines, breaking at line boundaries.
my_list = data.splitlines()
#my_list contains list of patent ids
url = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1='
p1 = '.PN.&OS=PN/'
p2 = '&RS=PN/'
length = len(my_list)
print(length)
for i in range(length):
print("Printed Element",i)
gkl = my_list[i]
#Updated URL will be stored in bv
bv = url+gkl+p1+gkl+p2+gkl
page = urllib.request.urlopen(bv)
time.sleep(1)
#Creating a new html file with the patent id as the file name
text_file = open('F:/work/patenthtml/{}.html'.format(my_list[i]), "wb")
#Writing html source code to the created html file
text_file.write(page.read())
#closing the file
text_file.close()