Skip to content

Commit

Permalink
rework the hotel price scraping to use the new graphql XHR endpoint i…
Browse files Browse the repository at this point in the history
…nstead of old json one
  • Loading branch information
Granitosaurus committed Jun 28, 2023
1 parent 02b3983 commit ed26d17
Show file tree
Hide file tree
Showing 4 changed files with 423 additions and 425 deletions.
100 changes: 59 additions & 41 deletions bookingcom-scraper/bookingcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ async def scrape_search(
log.info(f"scraped {len(hotel_previews)} total hotel previews for {query} {checkin}-{checkout}")
return hotel_previews


class PriceData(TypedDict):
checkin: str
min_length_of_stay: int
Expand All @@ -116,7 +117,6 @@ class PriceData(TypedDict):
price_pretty: str
price: float



class Hotel(TypedDict):
url: str
Expand Down Expand Up @@ -156,7 +156,7 @@ def parse_hotel(result: ScrapeApiResponse) -> Hotel:
return data


async def scrape_hotel(url: str, checkin: str, price_n_days=30) -> Hotel:
async def scrape_hotel(url: str, checkin: str, price_n_days=61) -> Hotel:
"""
Scrape Booking.com hotel data and pricing information.
"""
Expand All @@ -167,50 +167,68 @@ async def scrape_hotel(url: str, checkin: str, price_n_days=30) -> Hotel:
raise Exception("scrapfly cache cannot be used with sessions when scraping hotel data")
log.info(f"scraping hotel {url} {checkin} with {price_n_days} days of pricing data")
session = str(uuid4()).replace("-", "")
result = await SCRAPFLY.async_scrape(ScrapeConfig(url, session=session, **BASE_CONFIG))
result = await SCRAPFLY.async_scrape(
ScrapeConfig(
url,
session=session,
**BASE_CONFIG,
)
)
hotel = parse_hotel(result)
# csrf token is required to scrape the hidden pricing API
# it can be found hidden in the HTML body
csrf_token = re.findall(r"b_csrf_token:\s*'(.+?)'", result.content)[0]

# body for hidden pricing API
# note this can be customized to your needs like adult visitor number, pets etc.
data = {
"name": "hotel.availability_calendar",
"result_format": "price_histogram",
"hotel_id": hotel["id"],
"search_config": json.dumps(
{
# we can adjust pricing configuration here but this is the default
"b_adults_total": 2,
"b_nr_rooms_needed": 1,
"b_children_total": 0,
"b_children_ages_total": [],
"b_is_group_search": 0,
"b_pets_total": 0,
"b_rooms": [{"b_adults": 2, "b_room_order": 1}],
}
),
"checkin": checkin,
"n_days": price_n_days,
"respect_min_los_restriction": 1,
"los": 1,
}
result = await SCRAPFLY.async_scrape(
# To scrape prices we call Booking.com's GraphQL service,
# specifically the AvailabilityCalendar query.
# First, extract the hotel variables from the page HTML:
_hotel_country = re.findall(r'hotelCountry:\s*"(.+?)"', result.content)[0]
_hotel_name = re.findall(r'hotelName:\s*"(.+?)"', result.content)[0]
_csrf_token = re.findall(r"b_csrf_token:\s*'(.+?)'", result.content)[0]
# then create graphql query
gql_body = json.dumps(
{
"operationName": "AvailabilityCalendar",
# hotel variables go here
# you can adjust number of adults, room number etc.
"variables": {
"input": {
"travelPurpose": 2,
"pagenameDetails": {
"countryCode": _hotel_country,
"pagename": _hotel_name,
},
"searchConfig": {
"searchConfigDate": {
"startDate": checkin,
"amountOfDays": price_n_days,
},
"nbAdults": 2,
"nbRooms": 1,
},
}
},
"extensions": {},
# this is the query itself, don't alter it
"query": "query AvailabilityCalendar($input: AvailabilityCalendarQueryInput!) {\n availabilityCalendar(input: $input) {\n ... on AvailabilityCalendarQueryResult {\n hotelId\n days {\n available\n avgPriceFormatted\n checkin\n minLengthOfStay\n __typename\n }\n __typename\n }\n ... on AvailabilityCalendarQueryError {\n message\n __typename\n }\n __typename\n }\n}\n",
},
# note: this removes unnecessary whitespace in JSON output
separators=(",", ":"),
)
# scrape booking graphql
result_price = await SCRAPFLY.async_scrape(
ScrapeConfig(
url="https://www.booking.com/fragment.json?cur_currency=usd",
"https://www.booking.com/dml/graphql?lang=en-gb",
method="POST",
data=data,
headers={"X-Booking-CSRF": csrf_token}, # add CSRF token as header
session=session, # note: we need to use the same IP, so use scrapfly session
body=gql_body,
session=session,
# note that we need to set headers to avoid being blocked
headers={
"content-type": "application/json",
"x-booking-csrf-token": _csrf_token,
"referer": result.context["url"],
"origin": "https://www.booking.com",
},
**BASE_CONFIG,
)
)
hotel["price"] = []
for day in json.loads(result.content)["data"]['days']:
hotel["price"].append({
# get rid of b_ prefix
k[2:] if k.startswith("b_") else k: v
for k, v in day.items()
})
price_data = json.loads(result_price.content)
hotel["price"] = price_data["data"]["availabilityCalendar"]["days"]
return hotel
143 changes: 61 additions & 82 deletions bookingcom-scraper/results/hotel.json
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
{
"url": "https://www.booking.com/hotel/gb/gardencourthotel.html",
"url": "https://www.booking.com/hotel/gb/gardencourthotel.en-gb.html",
"id": "102764",
"title": "Garden Court Hotel",
"description": "You're eligible for a Genius discount at Garden Court Hotel! To save at this property, all you have to do is \nsign in\n.\n\n\n The 19th-century Garden Court Hotel is superbly located in Kensington Gardens Square. It offers stylish, family-run accommodations, a short walk from Bayswater Underground Station.\n\nEach comfortable room is individually designed, with an LCD Freeview cable TV. All rooms have their own private internal private bathrooms, except for a few cozy single rooms which have their own exclusive private external bathrooms.\n\n\nFree Wi-Fi internet access is available throughout the hotel, and there is also free luggage storage and a safe for guests to use at the 24-hour reception.\n\n\nThe hotel is located in fashionable Notting Hill, close to Portobello Antiques Markets and the Royal Parks. Kings Cross Station is 3 miles away.",
"description": "You're eligible for a Genius discount at Garden Court Hotel! To save at this property, all you have to do is \nsign in\n.\n\n\n \nThe 19th-century Garden Court Hotel is superbly situated in Kensington Gardens Square. It offers stylish, family-run accommodation, a short walk from Bayswater Underground Station.\n\nEach comfortable room is individually designed, with an LCD Freeview cable TV. All rooms have their own private internal en-suite bathrooms, except for a few cosy single rooms which have their own exclusive private external bathrooms.\n\nFree Wi-Fi internet access is available throughout the hotel, and there is also free luggage storage and a safe for guests to use at the 24-hour reception.\n\nThe hotel is located in fashionable Notting Hill, close to Portobello Antiques Markets and the Royal Parks. Kings Cross Station is 3 miles away.",
"address": "30-31 Kensington Gardens Square, Notting Hill, Westminster Borough, London, W2 4BG, United Kingdom",
"images": [
"https://cf.bstatic.com/xdata/images/hotel/max1024x768/116051493.jpg?k=4a3e4106c58d191e54c6127b8974589039f3770455b7fc7729f3cf8a4516d835&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max500/116051515.jpg?k=431f19ceecae7c2d1f0e22a51ca99ac173a1eb15abb52f30f337052e4d484cbd&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max500/116051527.jpg?k=ea848bb54beeb73a94be01c97330bc1b79f2f020a0a82af15255b87dc1faf03d&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max300/116051489.jpg?k=d8a3a96ab8be88edf03d198b50f10739f0624d415cc2fb617785de4b2aab088a&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max300/116051400.jpg?k=ff3dbd5d2019d89a6ee8774d867168ffa96cd96f615a50b5dd97d83af66b63f3&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max300/116051386.jpg?k=ec9a8f6a757c832e626027c7d69d8843b709b72d44a98eb5e06ff8244a92e5b8&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max300/116051502.jpg?k=761542c6deb2bf9dc9fa844236913773a24a2e06af2d4918d258df33bf4162e7&o=&hp=1",
"https://cf.bstatic.com/xdata/images/hotel/max300/116051453.jpg?k=9f5d94503389d94841863a9d6c8bddbe447b48564937fc88370554f1fb64e498&o=&hp=1"
"https://cf2.bstatic.com/xdata/images/hotel/max1024x768/116051493.jpg?k=4a3e4106c58d191e54c6127b8974589039f3770455b7fc7729f3cf8a4516d835&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max500/116051515.jpg?k=431f19ceecae7c2d1f0e22a51ca99ac173a1eb15abb52f30f337052e4d484cbd&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max500/116051527.jpg?k=ea848bb54beeb73a94be01c97330bc1b79f2f020a0a82af15255b87dc1faf03d&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max300/116051489.jpg?k=d8a3a96ab8be88edf03d198b50f10739f0624d415cc2fb617785de4b2aab088a&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max300/116051400.jpg?k=ff3dbd5d2019d89a6ee8774d867168ffa96cd96f615a50b5dd97d83af66b63f3&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max300/116051386.jpg?k=ec9a8f6a757c832e626027c7d69d8843b709b72d44a98eb5e06ff8244a92e5b8&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max300/116051502.jpg?k=761542c6deb2bf9dc9fa844236913773a24a2e06af2d4918d258df33bf4162e7&o=&hp=1",
"https://cf2.bstatic.com/xdata/images/hotel/max300/116051453.jpg?k=9f5d94503389d94841863a9d6c8bddbe447b48564937fc88370554f1fb64e498&o=&hp=1"
],
"lat": "51.51431706",
"lng": "-0.19066349",
"features": {
"Bathroom": [
"Toilet paper",
"Towels",
"Bathtub or shower",
"Private Bathroom",
"Bath or shower",
"Private bathroom",
"Toilet",
"Free toiletries",
"Hairdryer",
"Shower"
],
"Bedroom": [
"Linens",
"Linen",
"Wardrobe or closet"
],
"View": [
Expand Down Expand Up @@ -58,21 +58,21 @@
"Street parking",
"Electric vehicle charging station"
],
"Front Desk Services": [
"Reception services": [
"Invoice provided",
"Lockers",
"Private check-in/out",
"Concierge",
"Baggage storage",
"Express check-in/out",
"Private check-in/check-out",
"Concierge service",
"Luggage storage",
"Express check-in/check-out",
"24-hour front desk"
],
"Cleaning Services": [
"Cleaning services": [
"Daily housekeeping",
"Ironing service"
],
"Business Facilities": [
"Fax/Photocopying",
"Business facilities": [
"Fax/photocopying",
"Additional charge"
],
"Safety & security": [
Expand All @@ -82,19 +82,19 @@
"Smoke alarms",
"Key card access",
"24-hour security",
"Safe"
"Safety deposit box"
],
"General": [
"Shared lounge/TV area",
"Designated smoking area",
"Smoke-free property",
"Non-smoking throughout",
"Wake-up service",
"Heating",
"Soundproof",
"Soundproofing",
"Laptop safe",
"Carpeted",
"Soundproof rooms",
"Elevator",
"Lift",
"Fan",
"Family rooms",
"Non-smoking rooms"
Expand All @@ -103,7 +103,7 @@
"Upper floors accessible by stairs only",
"Upper floors accessible by elevator"
],
"Languages Spoken": [
"Languages spoken": [
"Arabic",
"English",
"Spanish",
Expand All @@ -113,74 +113,53 @@
},
"price": [
{
"min_length_of_stay": 1,
"available": 1,
"price": 263.162570752056,
"checkin": "2023-06-01",
"avg_price_pretty": "$263",
"price_pretty": "$263",
"avg_price_raw": 263.162570752056,
"length_of_stay": 1
"available": true,
"__typename": "AvailabilityCalendarDay",
"checkin": "2023-07-05",
"minLengthOfStay": 1,
"avgPriceFormatted": "386"
},
{
"min_length_of_stay": 1,
"available": 1,
"price": 326.548123692336,
"checkin": "2023-06-02",
"avg_price_pretty": "$327",
"price_pretty": "$327",
"avg_price_raw": 326.548123692336,
"length_of_stay": 1
"checkin": "2023-07-07",
"minLengthOfStay": 1,
"avgPriceFormatted": "623",
"__typename": "AvailabilityCalendarDay",
"available": true
},
{
"min_length_of_stay": 1,
"available": 1,
"price": 356.868329663769,
"checkin": "2023-06-03",
"avg_price_pretty": "$357",
"price_pretty": "$357",
"avg_price_raw": 356.868329663769,
"length_of_stay": 1
"checkin": "2023-07-06",
"minLengthOfStay": 1,
"avgPriceFormatted": "553",
"available": true,
"__typename": "AvailabilityCalendarDay"
},
{
"checkin": "2023-06-04",
"min_length_of_stay": 1,
"available": 1,
"price": 197.044242314351,
"avg_price_raw": 197.044242314351,
"price_pretty": "$197",
"length_of_stay": 1,
"avg_price_pretty": "$197"
"avgPriceFormatted": "358",
"minLengthOfStay": 1,
"checkin": "2023-07-11",
"available": true,
"__typename": "AvailabilityCalendarDay"
},
{
"checkin": "2023-06-05",
"min_length_of_stay": 1,
"available": 1,
"price": 231.482159781904,
"avg_price_raw": 231.482159781904,
"price_pretty": "$231",
"length_of_stay": 1,
"avg_price_pretty": "$231"
"__typename": "AvailabilityCalendarDay",
"available": true,
"checkin": "2023-07-10",
"minLengthOfStay": 1,
"avgPriceFormatted": "315"
},
{
"checkin": "2023-06-06",
"price": 260.417429754642,
"min_length_of_stay": 1,
"available": 1,
"avg_price_raw": 260.417429754642,
"price_pretty": "$260",
"length_of_stay": 1,
"avg_price_pretty": "$260"
"__typename": "AvailabilityCalendarDay",
"available": true,
"avgPriceFormatted": "386",
"checkin": "2023-07-09",
"minLengthOfStay": 1
},
{
"avg_price_pretty": "$404",
"price_pretty": "$404",
"avg_price_raw": 403.91905711944,
"length_of_stay": 1,
"min_length_of_stay": 1,
"available": 1,
"price": 403.91905711944,
"checkin": "2023-06-07"
"__typename": "AvailabilityCalendarDay",
"available": true,
"avgPriceFormatted": "610",
"minLengthOfStay": 1,
"checkin": "2023-07-08"
}
]
}
Loading

0 comments on commit ed26d17

Please sign in to comment.