The openaddresses links are incomplete/out of sync. Also, there is no way of fetching differences.
Attempted Solutions
I built two Python scripts to solve my issue. The first script builds a hash and updates the pelias.json with all the us, ca, and mx files. The second script builds an index of the DATA_DIR and reconciles it with the links published on the openaddresses web page. It downloads any missing files.
I am still working on this code, but I can dockerize it and add a pelias command if that's useful. It may be helpful just to have an updated pelias.json.
Updated pelias.json:
Build new pelias.json script:
import requests
from bs4 import BeautifulSoup
import json
# Scrape the "runs" table of the source page, collect the processed files for
# the selected countries, and write a copy of pelias.json whose
# imports.openaddresses.files list is replaced by the scraped entries.

# NOTE(review): the source URL is empty in this copy of the script and
# requests.get("") cannot succeed -- fill in the page hosting the "runs" table.
url = ""
response = requests.get(url, timeout=60)
# Fail fast on HTTP errors instead of parsing an error page
response.raise_for_status()
html_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table with id "runs"
runs_table = soup.find('table', {'id': 'runs'})
if runs_table is None:
    raise SystemExit('No table with id "runs" was found at the source URL.')

# Select only the desired countries: us, mx, ca
countries = ["us", "mx", "ca"]

# Entries of the form "<country>/<state>/<name>.csv" taken from the
# "processed" column of the "runs" table
files = []

for row in runs_table.find_all('tr'):
    # Find the "processed" column in each row
    processed_column = row.find('td', {'class': 'processed'})
    if not processed_column:
        continue

    # Find the link in the "processed" column
    link = processed_column.find('a')
    if not link or not link.get('href'):
        continue

    # Split once and reuse the pieces.
    # presumably hrefs look like <scheme>://<host>/<a>/<b>/<country>/<state>/<name>.zip,
    # which is what the fixed indexes 5 and 6 rely on -- TODO confirm
    parts = link['href'].split('/')
    if len(parts) < 7:
        # Too short to carry a country and state segment; skip it
        continue

    country_code = parts[5]
    state_code = parts[6]
    if country_code in countries:
        # The processed link points at a .zip; the list holds the .csv name
        filename = parts[-1].replace('.zip', '.csv')
        files.append(f"{country_code}/{state_code}/{filename}")

# Read the existing pelias.json file
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Modify the imports section with the new data, creating the intermediate
# objects when the existing configuration does not have them yet
pelias_data.setdefault("imports", {}).setdefault("openaddresses", {})["files"] = files

# Write the updated content to a new file
with open('pelias.scraped.json', 'w') as new_pelias_file:
    json.dump(pelias_data, new_pelias_file, indent=2)

print("pelias.scraped.json has been updated.")
Reconcile current open addresses files with URL:
import os
import requests
import zipfile
import json
from dotenv import load_dotenv
from datetime import datetime
from bs4 import BeautifulSoup
# Set-up for the reconcile script: read configuration, scrape the "runs" table
# into a {(country, state, csv_name): download_url} index, and make sure the
# target directory exists.

# Load environment variables from .env file
load_dotenv()

countries = ["us", "mx", "ca"]

# Define the data directory from the environment variable
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    # os.path.join(None, ...) would fail later with an unhelpful TypeError
    raise SystemExit("DATA_DIR is not set in the environment or the .env file.")

# Read the pelias.json file
# NOTE(review): pelias_data is not used again in the visible part of the script
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Create a dictionary to store filename and corresponding link
filename_link_dict = {}

# Fetch links from the webpage
# NOTE(review): the source URL is empty in this copy of the script and
# requests.get("") cannot succeed -- fill in the page hosting the "runs" table.
url = ""
response = requests.get(url, timeout=60)
# Fail fast on HTTP errors instead of parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
runs_table = soup.find('table', {'id': 'runs'})
if runs_table is None:
    raise SystemExit('No table with id "runs" was found at the source URL.')

# Iterate through the rows of the runs table and extract links
for row in runs_table.find_all('tr'):
    processed_column = row.find('td', {'class': 'processed'})
    if not processed_column:
        continue
    link = processed_column.find('a')
    if not link or not link.get('href'):
        continue

    href = link['href']
    parts = href.split('/')
    if len(parts) < 3:
        # Cannot be unpacked into country/state/filename
        continue
    country, state, filename = parts[-3:]

    # Keep only the selected countries (the earlier "not in" test kept
    # everything except them)
    if country in countries:
        filename_link_dict[(country, state, filename.replace('.zip', '.csv'))] = href

# Ensure the data directory and openaddresses subdirectory exist
openaddresses_dir = os.path.join(data_dir, 'openaddresses')
os.makedirs(openaddresses_dir, exist_ok=True)

# Log file path
log_file_path = 'openaddresses.custom_parser.log'
# Function to log events
def log_event(event_type, filename, log_path=None):
    """Append one timestamped event line to the log file.

    The line has the form ``YYYY-MM-DD HH:MM:SS <event_type>: <filename>.``
    and uses the local time at the moment of the call.

    Args:
        event_type: Short event label, e.g. "file_exists" or "download_error".
        filename: Name of the file the event refers to.
        log_path: Optional path of the log file. When omitted, the
            module-level ``log_file_path`` is used.
    """
    if log_path is None:
        log_path = log_file_path
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = f"{timestamp} {event_type}: {filename}."
    # Append mode so earlier events are preserved across calls and runs
    with open(log_path, 'a') as log_file:
        log_file.write(log_message + '\n')
# Reconcile the scraped index with the files on disk: every entry whose CSV is
# missing under the openaddresses directory is downloaded and unpacked.
for file_tuple, url in filename_link_dict.items():
    country, state, filename = file_tuple
    print("Creating:", openaddresses_dir, country, state, filename)
    dir_path = os.path.join(openaddresses_dir, country, state)
    file_path = os.path.join(dir_path, filename)
    zip_path = file_path + '.zip'

    # Create the directory if it does not exist
    os.makedirs(dir_path, exist_ok=True)

    # Check if the file already exists; nothing to download in that case
    if os.path.exists(file_path):
        log_event("file_exists", filename)
        continue

    log_event("file_not_found", filename)
    try:
        # Download the file
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(zip_path, 'wb') as zip_file:
            zip_file.write(response.content)

        # Extract the contents to the correct folder
        # NOTE(review): presumably archive members carry their own
        # <country>/<state>/ prefix, so they are unpacked relative to
        # openaddresses_dir; if the archives are flat, extract into dir_path
        # instead -- verify against a real download.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(openaddresses_dir)

        if not os.path.exists(file_path):
            # The archive unpacked but did not yield the expected CSV
            log_event("download_error", filename)
    except (requests.RequestException, zipfile.BadZipFile, OSError):
        # Best effort: record the failure and carry on with the next file
        log_event("download_error", filename)
    finally:
        # Remove the downloaded zip file, whether or not extraction worked
        if os.path.exists(zip_path):
            os.remove(zip_path)

print("Script execution complete.")