-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_scraping.py
More file actions
152 lines (131 loc) · 6.68 KB
/
web_scraping.py
File metadata and controls
152 lines (131 loc) · 6.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import time
import logging
import pandas as pd
from seleniumbase import SB
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import re
# Two-digit day, three-letter month abbreviation, four-digit year
# (e.g. "01 Apr 2023"). ``[^\W\d_]`` matches letters only, so a run of digits
# or underscores is not mistaken for a month name as a bare ``\w`` would allow.
_DATE_PATTERN = re.compile(r'\d{2} [^\W\d_]{3} \d{4}')


def extract_date(string):
    """Return the first ``DD Mon YYYY`` date found in *string*.

    Args:
        string: Text to search. Non-string values (e.g. ``None``) are
            treated as containing no date.

    Returns:
        The matched substring, such as ``"01 Apr 2023"``, or ``None`` when
        no date-shaped substring is present.
    """
    if not isinstance(string, str):
        return None
    match = _DATE_PATTERN.search(string)
    return match.group(0) if match else None
# Root logger setup: report INFO and above, prefixing every record with its
# timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def _row_tx_hash(row, axie_id):
    """Return the tx hash linked from the first cell of *row*, or None if unavailable."""
    cells = row.find_elements(By.CLASS_NAME, "ronin-table-cell")
    if not cells:
        return None
    try:
        anchor = cells[0].find_element(By.TAG_NAME, "a")
        href = anchor.get_attribute("href")
    except NoSuchElementException:
        logging.warning(f"No anchor tag found in row for Axie ID {axie_id}")
        return None
    except Exception as anchor_ex:
        logging.error(f"Error processing anchor tag for Axie ID {axie_id}: {anchor_ex}", exc_info=True)
        return None
    # get_attribute() yields None when the attribute is absent; treat a
    # missing or blank href as "no link" instead of failing on .strip().
    href = (href or "").strip()
    if not href:
        logging.warning(f"Anchor without href in row for Axie ID {axie_id}")
        return None
    # rpartition returns the whole string when "/tx/" does not occur, so a
    # link without that segment is kept unchanged.
    return href.rpartition("/tx/")[2]


def _scrape_axie_transfers(sb, axie_id):
    """Open the token page for *axie_id* and return one record per linked table row."""
    url = f"https://app.roninchain.com/token/0x32950db2a7164ae833121501c797d79e7b79d74c/{axie_id}?p=1&ps=25"
    logging.info(f"Processing Axie ID: {axie_id}")
    sb.open(url)
    # Block (up to 30 s) until the table body exists before reading rows.
    WebDriverWait(sb.driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ronin-table-tbody"))
    )
    sb.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    rows = sb.find_elements(By.CLASS_NAME, "ronin-table-row")
    logging.info(f"Axie ID {axie_id}: Found {len(rows)} transaction rows.")

    records = []
    for row in rows:
        try:
            tx_hash = _row_tx_hash(row, axie_id)
        except Exception as row_ex:
            # One bad row must not discard the rest of the page.
            logging.error(f"Error extracting row data for Axie ID {axie_id}: {row_ex}", exc_info=True)
            continue
        if tx_hash is not None:
            records.append({'Axie ID': axie_id, 'Tx Hash': tx_hash})
    return records


def scrape_ronin_chain_token_transfers(csv_file_with_axie_id):
    """Collect transaction hashes from the Ronin token page of each listed Axie.

    The first column of the CSV is read as the list of Axie IDs (converted to
    strings). For every ID the token page is opened in one shared SeleniumBase
    browser session (``uc=True``, ``headless=False``), and the link in the
    first cell of each table row is reduced to the text after ``/tx/``.
    Only the URL with ``p=1&ps=25`` is requested per ID (presumably the first
    page of 25 rows; confirm against the site).

    Args:
        csv_file_with_axie_id: Path of a CSV file whose first column holds
            the Axie IDs.

    Returns:
        A ``pandas.DataFrame`` with columns ``'Axie ID'`` and ``'Tx Hash'``
        (empty when nothing was collected), or ``None`` when the run fails
        as a whole, e.g. the CSV cannot be read or the browser cannot start.
        Failures for a single ID or row are logged and skipped.
    """
    try:
        id_frame = pd.read_csv(csv_file_with_axie_id)
        axie_ids = id_frame.iloc[:, 0].astype(str).tolist()  # Ensure Axie IDs are strings
        all_data = []
        with SB(uc=True, headless=False) as sb:
            for axie_id in axie_ids:
                try:
                    all_data.extend(_scrape_axie_transfers(sb, axie_id))
                except Exception as id_ex:
                    # Page-level failure (load, wait timeout, ...): move on to the next ID.
                    logging.error(f"Error processing Axie ID {axie_id}: {id_ex}", exc_info=True)
        return pd.DataFrame(all_data)
    except Exception as overall_ex:
        logging.error(f"Scraping failed: {overall_ex}", exc_info=True)
        return None
def extract_dates_from_csv(csv_filename):
    """Look up the date of each transaction in a CSV and append the results to tx_dates.csv.

    The CSV is read with pandas and must provide the columns ``'Tx Hash'`` and
    ``'Axie ID'``. For every row, the page ``https://app.roninchain.com/tx/``
    followed by the hash is opened in one shared SeleniumBase browser session
    (``uc=True``, ``headless=False``); the text of the element matching
    ``div.-mb-8`` is read and passed to ``extract_date``. When that element
    cannot be read, the text ``"N/A"`` is used instead.

    Collected rows (``Tx Hash``, ``Date``, ``Axie_id``) are appended to
    ``tx_dates.csv`` without index or header row. Nothing is written when no
    row was collected.

    Args:
        csv_filename: Path of the CSV file listing the transactions.

    Returns:
        None. Problems are printed or logged rather than raised.
    """
    try:
        tx_frame = pd.read_csv(csv_filename)
        collected = []
        tx_url_prefix = "https://app.roninchain.com/tx/"
        with SB(uc=True, headless=False) as sb:
            for row_number, record in tx_frame.iterrows():
                try:
                    tx_hash = str(record['Tx Hash']).strip()
                    full_url = f"{tx_url_prefix}{tx_hash}"
                    print(f"Processing URL: {full_url}")
                    sb.open(full_url)
                    # Block (up to 30 s) until the date container is in the DOM.
                    page_wait = WebDriverWait(sb.driver, 30)
                    page_wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.-mb-8"))
                    )
                    time.sleep(0)

                    # Start from the placeholder; it is replaced only on a successful read.
                    date_text = "N/A"
                    try:
                        date_text = sb.find_element(By.CSS_SELECTOR, "div.-mb-8").text.strip()
                    except NoSuchElementException as missing_ex:
                        print(f"Error finding date element for tx_hash {tx_hash}: {missing_ex}")
                    except Exception as lookup_ex:
                        logging.error(f"General error finding date element for tx_hash {tx_hash}: {lookup_ex}", exc_info=True)

                    entry = {'Tx Hash': tx_hash}
                    entry['Date'] = extract_date(date_text)
                    entry['Axie_id'] = str(record['Axie ID']).strip()
                    collected.append(entry)
                except Exception as row_ex:
                    # A failing row is reported and skipped; the remaining rows still run.
                    logging.error(f"Error processing row {row_number} in CSV: {row_ex}", exc_info=True)

        result_df = pd.DataFrame(collected)
        if result_df.empty:
            print("No data to save to tx_dates.csv")
        else:
            result_df.to_csv("tx_dates.csv", mode='a', index=False, header=False)
            print("Data saved to tx_dates.csv")
    except FileNotFoundError as fnf_error:
        print(f"CSV file not found: {fnf_error}")
    except Exception as failure:
        print(f"Operation failed: {failure}")
# Usage example (runs when the module is executed or imported).
# Step 1: for batches 1 and 2, scrape the transfer hashes listed for
# axie_ids_N.csv and store them in ronin_transfersN.csv.
for i in range(1, 3):
    try:
        df = scrape_ronin_chain_token_transfers(f"axie_ids_{i}.csv")
        # The scraper returns None when the whole run fails, so rule that out
        # before touching DataFrame attributes; otherwise a failed scrape
        # surfaces as an unrelated AttributeError instead of the message below.
        if df is not None and not df.empty:
            logging.info("Data extracted successfully:")
            print(df)
            df.to_csv(f"ronin_transfers{i}.csv", index=False)
            logging.info(f"Data saved to ronin_transfers{i}.csv")
        else:
            logging.info("No data found or extraction encountered errors!")
    except Exception as e:
        logging.error(f"Error in scraping iteration {i}: {e}", exc_info=True)
# Step 2: resolve the date of every transaction in each ronin_transfersN.csv
# that exists on disk.
for i in range(1, 3):
    try:
        # Check if the file exists before attempting to process it
        if os.path.exists(f"ronin_transfers{i}.csv"):
            extract_dates_from_csv(f"ronin_transfers{i}.csv")
        else:
            logging.warning(f"File ronin_transfers{i}.csv not found, skipping date extraction.")
    except Exception as e:
        logging.error(f"Error in extracting dates for iteration {i}: {e}", exc_info=True)