Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ https://www.scrapingbee.com/blog/selenium-python

https://selenium-python.readthedocs.io/locating-elements.html

wget http://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.337_amd64.deb
wget https://web.archive.org/web/20230716095211/https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.198-1_amd64.deb

Comment on Selenium: I had a hard time finding the right version and syntax to get it working. There are various tweaks online to make Google login work, but they will eventually stop working, given that Google actively updates its login flow to prevent automated logins and routes devs to its API instead. At the time, that API did not contain what I was looking for: the metadata per photo item does not show whether each image takes up Google storage or not.

Expand Down
23 changes: 19 additions & 4 deletions crawlall.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from pathlib import Path

message = f"Get your phone ready to confirm login (via google 2 step verification, gotta have this enabled)! Careful to not mess this up or your account may get locked for a while!!!"

Expand Down Expand Up @@ -41,10 +42,21 @@
lambda x: datetime.datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ')
)
print(df.shape)
df = df[df.tstamp>datetime.datetime(2021,5,30,0,0,0)].reset_index()

outfile = "space2.csv"
appnd = Path(outfile).is_file()

if appnd:
rd = pd.read_csv(outfile)
res = datetime.datetime.strptime(rd.iloc[-1].creationTime,'%Y-%m-%dT%H:%M:%SZ')
else:
res = datetime.datetime.now()

df = df[df.tstamp<res].reset_index()
print(df.shape)

mylist = []
#mylist = []
firstrow = True
for n,row in df.iterrows():
try:
driver.get(row.productUrl)
Expand All @@ -61,11 +73,14 @@
else:
mytext = "This item takes up space."
print(mytext)
mylist = []
headr = not(appnd) and firstrow
myitem = dict(row)
myitem['observation']=mytext
mylist.append(myitem)
newdf = pd.DataFrame(mylist)
newdf.to_csv("space.csv",index=False)
newdf = pd.DataFrame([mylist[-1]])
newdf.to_csv(outfile,index=False,header=headr,mode='a')
firstrow = False
except:
traceback.print_exc()
time.sleep(2)
Expand Down