-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpaperScraper.py
More file actions
51 lines (45 loc) · 2.01 KB
/
paperScraper.py
File metadata and controls
51 lines (45 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from spire.pdf.common import *
from spire.pdf import *
# Names of the target objects; each one owns a sub-folder of downloaded papers.
# (Presumably "DSO" stands for deep-sky object -- confirm with the author.)
DSOs = ['Carina Nebula', 'NGC 1333', 'TW Hya', 'HH 7-11', 'AB Aurigae', 'HD 169142', 'Luhman 16', 'V830 Tau b', 'V 1298 Tau b', 'WASP-18b', 'WASP-39b', 'WASP-43b', 'HR 8799', 'Beta Pictoris', '2M 1207', 'TRAPPIST-1']
# Only the entries from index 13 onward are processed by this run.
DSOs = DSOs[13:]
paperDir = "static/papers/"
graphDir = "static/graphs/"

# Title characters that are swapped for "_" when a title becomes part of a
# file name (one pass instead of five chained str.replace calls).
_TITLE_TO_FILENAME = str.maketrans({ch: "_" for ch in "&.:/\\"})


def _extract_paper_images(dso, dsoDir, dsoGraphDir, paperPath):
    """Save every image embedded in one PDF as a numbered PNG.

    Output files are named
    ``<dsoGraphDir><dso>_<sanitised title>_<NNN>.png`` where ``NNN`` counts
    images across the whole document, starting at 000.

    Args:
        dso: Object name used as the file-name prefix (kept verbatim).
        dsoDir: Folder (with trailing slash) that contains the PDF.
        dsoGraphDir: Folder (with trailing slash) that receives the PNGs.
        paperPath: File name of the PDF inside ``dsoDir``.

    Raises:
        Exception: Whatever the PDF library raises while loading, extracting
            or saving; the caller decides how to report it.

    NOTE(review): the nesting of the loops was reconstructed from a copy of
    the script that had lost its indentation -- confirm against the original.
    """
    doc = PdfDocument()
    try:
        doc.LoadFromFile(dsoDir + paperPath)
        # Many PDFs carry no title metadata. Fall back to the file name so the
        # string concatenations below cannot fail on None and untitled papers
        # do not overwrite each other's images.
        title = doc.DocumentInformation.Title or os.path.splitext(paperPath)[0]
        print("Loaded " + title)
        outputStem = dsoGraphDir + dso + "_" + title.translate(_TITLE_TO_FILENAME)
        images = []
        num = 0
        # Loop through the pages in the document
        for i in range(doc.Pages.Count):
            page = doc.Pages.get_Item(i)
            print(title + ": Grabbed page #" + str(i))
            # Extract the images of this page and write each one to disk
            for image in page.ExtractImages():
                images.append(image)
                graphPath = outputStem + "_{0:03}.png".format(num)
                print("Saving " + graphPath)
                image.Save(graphPath, ImageFormat.get_Png())
                num += 1
        print(title + ": " + str(images))
    finally:
        # Release the document even when loading or extraction failed.
        doc.Close()


for DSO in DSOs:
    print("-------------" + DSO + "-------------")
    folderName = DSO.replace(" ", "_")
    dsoDir = paperDir + folderName + "/"
    dsoGraphDir = graphDir + folderName + "/"
    # exist_ok avoids the separate isdir() check before creating the folders.
    os.makedirs(dsoDir, exist_ok=True)
    os.makedirs(dsoGraphDir, exist_ok=True)
    for paperPath in os.listdir(dsoDir):
        # Best effort: one unreadable paper must not stop the remaining ones.
        try:
            _extract_paper_images(DSO, dsoDir, dsoGraphDir, paperPath)
        except Exception as e:
            print("screwed the pooch: " + str(e))