-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
155 lines (136 loc) · 6.51 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import requests
from bs4 import BeautifulSoup
import re
import time
# Default HTTP headers passed to requests.get() by scrape_redfin_data().
# The User-Agent is a desktop Firefox string (presumably so the request is
# treated like one from a regular browser — confirm if this ever changes).
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0',
}
def scrape_redfin_data(url, retries=3, delay=5, timeout=30):
    """Scrape property data from a Redfin listing page.

    The page is fetched with the module-level ``headers``. A ``202``
    response is retried up to ``retries`` times, doubling ``delay`` after
    each wait. Any other outcome (success or failure) ends the loop.

    Args:
        url (str): The URL of the Redfin property listing.
        retries (int): The maximum number of retries if a 202 response is
            received.
        delay (int): The initial delay between retries in seconds.
        timeout (float | tuple | None): Timeout in seconds handed to
            ``requests.get``. ``None`` waits indefinitely (the behaviour
            before this parameter existed).

    Returns:
        dict: Field name -> cleaned string value ('' when a field was not
        found on the page), or None if the request failed, kept answering
        202, or parsing raised an unexpected error.
    """
    for attempt in range(retries + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)

            # 202 Accepted: Redfin has not finished preparing the page yet.
            if response.status_code == 202:
                if attempt >= retries:
                    print(f"Max retries reached for {url}. Redfin still processing.")
                    return None
                print(f"Received 202 from Redfin for {url}, retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # exponential back-off between attempts
                continue

            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return _extract_listing_fields(soup)
        except requests.exceptions.RequestException as e:
            print(f"Request error for URL {url}: {e}")
            return None
        except AttributeError as e:
            print(f"Attribute error for URL {url}: {e}")
            return None
        except Exception as e:  # last-resort boundary: report and signal failure
            print(f"Unexpected error for URL {url}: {e}")
            return None
    # Only reachable when ``retries`` is negative and no attempt was made.
    return None


# Output fields, in the order they appear in the returned dict.
_LISTING_FIELDS = (
    'Street Address',
    'City',
    'State',
    'Postal Code',
    'Price',
    'Bed Room',
    'Bath Room',
    'Sq.Ft.',
    'House type',
    'Built Year',
    'Area',
    'Price/Sq.Ft',
    'Car Parking',
    'AC',
    'Agent Name',
    'Brokerage',
    'MLS ID',
    'Time on Redfin',
    "Buyer's Agent Commission",
    "Agent License",
    'Contact',
)

# data-rf-test-id -> output field, for stats whose value sits in a nested <div>.
_STAT_FIELDS = {
    'abp-price': 'Price',
    'abp-beds': 'Bed Room',
    'abp-baths': 'Bath Room',
}

# (svg class, output field, boilerplate removed from the row text).
# Checked in this order; the first matching class wins.
_KEY_DETAIL_RULES = (
    ('clock', 'Time on Redfin', '\xa0on Redfin'),
    ('house', 'House type', ' House type'),
    ('wrench', 'Built Year', 'Built in '),
    ('fence', 'Area', ''),
    ('ruler', 'Price/Sq.Ft', ' per sq ft'),
    ('car', 'Car Parking', ''),
    ('temperature', 'AC', ''),
    ('agent', "Buyer's Agent Commission", '\xa0buyer\'s agent fee'),
)

# <span> class list -> (output field, prefix text removed from the span).
_AGENT_SPAN_FIELDS = {
    'agent-basic-details--heading': ('Agent Name', "Agent Name "),
    'agent-basic-details--broker': ('Brokerage', ''),
    'agentLicenseDisplay': ('Agent License', ''),
}

# Bullets, line breaks and any run of non-ASCII characters become one space.
_NOISE_RE = re.compile(r'[•\n\r]+|[^\x00-\x7F]+')


def _extract_listing_fields(soup):
    """Build the cleaned result dict from a parsed listing page."""
    data = dict.fromkeys(_LISTING_FIELDS, '')
    _fill_address(soup, data)
    _fill_stats(soup, data)
    _fill_key_details(soup, data)
    _fill_agent_info(soup, data)
    _fill_contact_and_mls(soup, data)
    return {key: _NOISE_RE.sub(' ', value).strip() for key, value in data.items()}


def _fill_address(soup, data):
    """Fill street address, city, state and postal code from the page heading."""
    heading = soup.find('h1', class_='full-address')
    if heading is None:
        return
    lines = heading.find_all('div')
    if lines:
        data['Street Address'] = lines[0].text.strip()
    if len(lines) < 2:
        return
    tokens = lines[1].text.split()
    if len(tokens) < 3:
        # Not the "<city>, <state> <zip>" shape; leave the fields empty
        # rather than guessing which token is which.
        return
    # The last two tokens are state and zip; everything before them is the
    # city, which may be several words ("Los Angeles,").
    *city_words, state, postal_code = tokens
    data['City'] = ' '.join(city_words).replace(',', '')
    data['State'] = state
    data['Postal Code'] = postal_code


def _fill_stats(soup, data):
    """Fill price, beds, baths and square footage from the main stats strip."""
    stats = soup.find('div', class_='home-main-stats-variant')
    if stats is None:
        return
    for div in stats.find_all('div'):
        test_id = div.get('data-rf-test-id')
        if test_id in _STAT_FIELDS:
            inner = div.find('div')
            if inner is not None:
                data[_STAT_FIELDS[test_id]] = inner.text.strip()
        elif test_id == 'abp-sqFt':
            span = div.find('span')
            if span is not None:
                data['Sq.Ft.'] = span.text.replace('Sq.Ft.', '').strip()


def _fill_key_details(soup, data):
    """Fill the icon-labelled rows of the key details table."""
    table = soup.find('div', class_='KeyDetailsTable')
    if table is None:
        return
    for row in table.find_all('div', class_='keyDetails-row'):
        icon = row.find('svg')
        value_node = row.find('div')
        if icon is None or value_node is None:
            # A row without its icon or value cannot be classified; skip it
            # instead of discarding everything scraped so far.
            continue
        icon_classes = icon.get('class') or []
        value = value_node.text.strip()
        for marker, field, boilerplate in _KEY_DETAIL_RULES:
            if marker in icon_classes:
                data[field] = value.replace(boilerplate, '') if boilerplate else value
                break


def _fill_agent_info(soup, data):
    """Fill listing agent name, brokerage and license."""
    agent_info = soup.find('div', class_='agent-info-item')
    if agent_info is None:
        return
    for span in agent_info.find_all('span'):
        classes = span.get('class')
        # Only spans whose class attribute is exactly one known name count.
        if not isinstance(classes, list) or len(classes) != 1:
            continue
        rule = _AGENT_SPAN_FIELDS.get(classes[0])
        if rule is None:
            continue
        field, prefix = rule
        text = span.text.replace(prefix, "") if prefix else span.text
        data[field] = text.strip()


def _fill_contact_and_mls(soup, data):
    """Fill the listing contact line and the MLS identifier."""
    contact_section = soup.find('div', class_='listingContactSection')
    if contact_section is not None:
        data['Contact'] = contact_section.text.replace("Contact: ", "").strip()
    mls_id_elem = soup.find('span', class_='ListingSource--mlsId')
    if mls_id_elem is not None:
        data['MLS ID'] = mls_id_elem.text.replace("#", "").strip()
# ... (Example usage)
if __name__ == '__main__':
    # Example run against a single listing. Listing URLs have the form
    # https://www.redfin.com/<STATE>/<City>/<address-zip>/home/<id>;
    # replace this one with the listing you want to scrape.
    url = "https://www.redfin.com/CA/Upland/2530-Mountain-Dr-91784/home/3877330"
    property_data = scrape_redfin_data(url)
    # scrape_redfin_data returns None on failure; print only real results.
    if property_data:
        print(property_data)