pangyuteng · PanosChtz · Jan 10, 2024 · Jan 10, 2024 · Jan 14, 2024
diff --git a/README.md b/README.md
@@ -54,7 +54,7 @@ https://www.scrapingbee.com/blog/selenium-python
 
 https://selenium-python.readthedocs.io/locating-elements.html
 
-wget http://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.337_amd64.deb
+wget https://web.archive.org/web/20230716095211/https://dl.google.com/linux/chrome/deb/pool/main/g/google-chrome-stable/google-chrome-stable_114.0.5735.198-1_amd64.deb 
 
 Comment on Selenium: I had a hard time finding a version and syntax that work. There are various tweaks online to make Google login work, but they will eventually stop working, since Google actively updates its login flow to prevent automated logins and routes developers to its API. At the time, that API did not expose what I was looking for: the metadata for each photo item does not show whether the image takes up Google storage.
 

diff --git a/crawlall.py b/crawlall.py
@@ -11,6 +11,7 @@
 
 import undetected_chromedriver as uc
 from selenium.webdriver.common.by import By
+from pathlib import Path
 
 message = f"Get your phone ready to confirm login (via google 2 step verification, gotta have this enabled)! Careful to not mess this up or your account may get locked for a while!!!"
 
@@ -41,10 +42,21 @@
     lambda x: datetime.datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ')
 )
 print(df.shape)
-df = df[df.tstamp>datetime.datetime(2021,5,30,0,0,0)].reset_index()
+
+outfile = "space2.csv"
+appnd = Path(outfile).is_file()
+
+if appnd:
+    rd = pd.read_csv(outfile)
+    res = datetime.datetime.strptime(rd.iloc[-1].creationTime,'%Y-%m-%dT%H:%M:%SZ')
+else:
+    res = datetime.datetime.now()
+
+df = df[df.tstamp<res].reset_index()
 print(df.shape)
 
-mylist = []
+#mylist = []
+firstrow = True
 for n,row in df.iterrows():
 	try:
 		driver.get(row.productUrl)
@@ -61,11 +73,14 @@
 		else:
 			mytext = "This item takes up space."
 		print(mytext)
+		mylist = []
+		headr = not(appnd) and firstrow
 		myitem = dict(row)
 		myitem['observation']=mytext
 		mylist.append(myitem)
-		newdf = pd.DataFrame(mylist)
-		newdf.to_csv("space.csv",index=False)
+		newdf = pd.DataFrame([mylist[-1]])
+		newdf.to_csv(outfile,index=False,header=headr,mode='a')
+		firstrow = False
 	except:
 		traceback.print_exc()
 		time.sleep(2)