-
Notifications
You must be signed in to change notification settings - Fork 0
/
fast_people_search.py
127 lines (104 loc) · 4.62 KB
/
fast_people_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import sys
import requests
import base64
from bs4 import BeautifulSoup
def fast_people_search(file_path, proxy_scrape_api_key):
    """Fill the 'FastPeopleSearch' sheet of an Excel workbook with scraped data.

    For every row of that sheet, the URL in the 'FastSearchUrl' column is
    fetched with ``make_request`` and the returned HTML is parsed with
    ``parse_html``. The row's 'FPS Age', 'Full Name', 'Current Address' and
    'Past Address N' cells are then overwritten with the parsed values, and
    the sheet is written back to ``file_path`` so progress survives a crash.

    Args:
        file_path: Path of the Excel workbook, read and updated in place.
        proxy_scrape_api_key: API key forwarded to ``make_request``.

    Returns:
        None. Errors are printed instead of being raised.
    """
    processing_sheet = 'FastPeopleSearch'
    fast_people_search_column = 'FastSearchUrl'
    text_columns = ['FPS Age', 'Full Name', 'Current Address']
    text_columns += [f'Past Address {n}' for n in range(1, 9)]

    try:
        # Use a context manager so the workbook handle is closed before the
        # same file is reopened for writing further down.
        with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
            if processing_sheet not in excel_file.sheet_names:
                print(f"Sheet '{processing_sheet}' not found in the Excel file.")
                return
            # Default header handling: the first row holds the column names.
            df = pd.read_excel(excel_file, sheet_name=processing_sheet)

        # Store the result columns as text. Empty cells become '' rather than
        # the literal string 'nan' that a plain astype(str) would produce.
        for column in text_columns:
            df[column] = df[column].map(
                lambda value: '' if pd.isna(value) else str(value)
            )

        for index, row in df.iterrows():
            url = row[fast_people_search_column]
            print(f"Row {index}: {url}")

            # Nothing to fetch for a blank cell; avoid a pointless API call.
            if pd.isna(url) or not str(url).strip():
                continue

            html = make_request(url, proxy_scrape_api_key)
            if html is False:
                continue

            data = parse_html(html)
            df.at[index, 'FPS Age'] = data['age']
            df.at[index, 'Full Name'] = data['full_name']
            df.at[index, 'Current Address'] = data['current_address']
            for i, address in enumerate(data['past_addresses']):
                df.at[index, f'Past Address {i + 1}'] = address

            # Save after every updated row so a crash loses at most one row.
            # index=False keeps the DataFrame index out of the sheet;
            # otherwise each save would add an extra unnamed column.
            with pd.ExcelWriter(
                file_path,
                mode="a",
                if_sheet_exists='replace',
                engine="openpyxl",
            ) as writer:
                df.to_excel(writer, sheet_name=processing_sheet, index=False)
    except Exception as e:
        # Top-level boundary of the script: report the problem and stop.
        print(f"An error occurred: {e}")
def _sibling_text(tag):
    """Return the stripped text of the node that directly follows *tag*."""
    sibling = tag.next_sibling
    if sibling is None:
        return ''
    # Text nodes are str subclasses; anything else is an element.
    if isinstance(sibling, str):
        return sibling.strip()
    return sibling.get_text(strip=True)


def parse_html(html):
    """Extract age, name and addresses from a FastPeopleSearch result page.

    Each field is located through its ``<h3>`` heading ("Age:",
    "Full Name:", "Current Home Address:", "Past Addresses:"). Fields are
    extracted independently, so a problem with one of them is printed and
    leaves the others intact.

    Args:
        html: HTML source of the page.

    Returns:
        A dict with the keys ``'age'``, ``'full_name'`` and
        ``'current_address'`` (strings, ``''`` when not found) and
        ``'past_addresses'`` (list of strings, empty when not found).
    """
    data = {
        'age': '',
        'full_name': '',
        'current_address': '',
        'past_addresses': [],
    }

    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception as e:
        print(f"An error occurred: {e}")
        return data

    def value_after_heading(label):
        # The value is the node immediately after an <h3> whose text is label.
        heading = soup.find(
            lambda tag: tag.name == "h3" and tag.text.strip() == label
        )
        return _sibling_text(heading) if heading is not None else ''

    def current_address():
        heading = soup.find(
            lambda tag: tag.name == "h3" and "Current Home Address:" in tag.text
        )
        container = heading.find_next('div') if heading is not None else None
        if container is None:
            return ''
        return container.get_text(separator=', ', strip=True)

    def past_addresses():
        heading = soup.find(
            lambda tag: tag.name == "h3" and "Past Addresses:" in tag.text
        )
        container = (
            heading.find_next_sibling('div') if heading is not None else None
        )
        if container is None:
            return []
        address_divs = container.find_all('div', class_='col-sm-12 col-md-6')
        return [div.get_text(separator=', ', strip=True) for div in address_divs]

    extractors = {
        'age': lambda: value_after_heading("Age:"),
        'full_name': lambda: value_after_heading("Full Name:"),
        'current_address': current_address,
        'past_addresses': past_addresses,
    }
    for key, extract in extractors.items():
        # One try per field: a failure keeps that field's default only.
        try:
            data[key] = extract()
        except Exception as e:
            print(f"An error occurred: {e}")

    return data
def make_request(url, api_key, timeout=120):
    """Fetch *url* through the ProxyScrape scraper API and return its HTML.

    Args:
        url: Address of the page to scrape.
        api_key: ProxyScrape API key, sent in the ``X-Api-Key`` header.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The page HTML as a string: the ``browserHtml`` field of the response
        when present, otherwise the base64-decoded ``httpResponseBody``.
        ``False`` when the request fails, times out, or the API answers with
        a status other than 200.
    """
    payload = {
        "url": url,
        "httpResponseBody": True,
    }
    headers = {
        'Content-Type': 'application/json',
        'X-Api-Key': api_key,
    }
    try:
        response = requests.post(
            'https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request',
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        if response.status_code != 200:
            print("Proxy API Error:", response.status_code)
            return False

        body = response.json()['data']
        if 'browserHtml' in body:
            return body['browserHtml']
        # errors='replace' keeps a page usable when it contains a few bytes
        # that are not valid UTF-8, instead of discarding the whole response.
        raw = base64.b64decode(body['httpResponseBody'])
        return raw.decode('utf-8', errors='replace')
    except Exception as e:
        print(f"An error occurred for {url}")
        print(e)
    return False
if __name__ == "__main__":
    # Command line entry point: expects the workbook path and the API key.
    if len(sys.argv) == 3:
        arg_file_path, arg_api_key = sys.argv[1:]
        fast_people_search(arg_file_path, arg_api_key)
    else:
        print("Usage: python fast_people_search.py <path_to_excel_file> <proxyscrape api key>")