Use-cases
The openaddresses links are incomplete/out of sync. There is also no way of fetching only the differences.
Attempted Solutions
I built two Python scripts to solve my issue. The first script builds a list of all the us, ca, and mx files and updates pelias.json with them. The second script builds an index of the DATA_DIR, reconciles it against the links on openaddresses.io, and downloads any missing files.
Proposal
I am still working on this code, but I can dockerize it and add a pelias command if that's useful. It may be helpful just to have an updated pelias.json.
Updated pelias.json:
pelias.json
Build new pelias.json script:
import json

import requests
from bs4 import BeautifulSoup

url = "https://results.openaddresses.io"
response = requests.get(url)
html_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table with id "runs"
runs_table = soup.find('table', {'id': 'runs'})

# Extract data from the "processed" column in the "runs" table
files = []

# Select only the desired countries: us, mx, ca
countries = ["us", "mx", "ca"]

# Find all rows in the "runs" table
rows = runs_table.find_all('tr')
for row in rows:
    # Find the "processed" column in each row
    processed_column = row.find('td', {'class': 'processed'})
    if processed_column:
        # Find the link in the "processed" column
        link = processed_column.find('a')
        if link:
            # Extract the country and state codes from the link
            country_code = link['href'].split('/')[5]
            state_code = link['href'].split('/')[6]
            if country_code in countries:
                # Extract the filename and add it to the list
                filename = link['href'].split('/')[-1].replace('.zip', '.csv')
                files.append(f"{country_code}/{state_code}/{filename}")

# Read the existing pelias.json file
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Replace the openaddresses file list with the scraped data
pelias_data["imports"]["openaddresses"]["files"] = files

# Write the updated content to a new file
with open('pelias.scraped.json', 'w') as new_pelias_file:
    json.dump(pelias_data, new_pelias_file, indent=2)

print("pelias.scraped.json has been updated.")
Reconcile the current openaddresses files with the URL:
import json
import os
import zipfile
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

countries = ["us", "mx", "ca"]

# Define the data directory from the environment variable
data_dir = os.getenv("DATA_DIR")

# Read the pelias.json file
with open('pelias.json', 'r') as pelias_file:
    pelias_data = json.load(pelias_file)

# Create a dictionary to store (country, state, filename) and the corresponding link
filename_link_dict = {}

# Fetch links from the webpage
url = "https://results.openaddresses.io"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
runs_table = soup.find('table', {'id': 'runs'})

# Iterate through the rows of the runs table and extract links
for row in runs_table.find_all('tr'):
    processed_column = row.find('td', {'class': 'processed'})
    if processed_column:
        link = processed_column.find('a')
        if link:
            country, state, filename = link['href'].split('/')[-3:]
            if country not in countries:
                continue
            filename_link_dict[(country, state, filename.replace('.zip', '.csv'))] = link['href']

# Ensure the data directory and openaddresses subdirectory exist
openaddresses_dir = os.path.join(data_dir, 'openaddresses')
os.makedirs(openaddresses_dir, exist_ok=True)

# Log file path
log_file_path = 'openaddresses.custom_parser.log'

# Function to log events
def log_event(event_type, filename):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_message = f"{timestamp} {event_type}: {filename}."
    with open(log_file_path, 'a') as log_file:
        log_file.write(log_message + '\n')
    print(log_message)

for file_tuple, file_url in filename_link_dict.items():
    country, state, filename = file_tuple
    print("Creating:", openaddresses_dir, country, state, filename)
    file_path = os.path.join(openaddresses_dir, country, state, filename)
    dir_path = os.path.join(openaddresses_dir, country, state)

    # Create the directory if it does not exist
    os.makedirs(dir_path, exist_ok=True)

    # Check if the file already exists
    if os.path.exists(file_path):
        log_event("file_exists", filename)
    else:
        log_event("file_not_found", filename)
        # Download the file
        response = requests.get(file_url)
        try:
            with open(file_path + '.zip', 'wb') as zip_file:
                zip_file.write(response.content)
            # Extract the contents to the correct folder
            with zipfile.ZipFile(file_path + '.zip', 'r') as zip_ref:
                zip_ref.extractall(openaddresses_dir)
        except Exception as e:
            log_event("download_error", filename)
            print(e)
            continue
        # Remove the downloaded zip file
        os.remove(file_path + '.zip')

print("Script execution complete.")
References
No.