0673 spider chi ohare noise #972

Open · wants to merge 13 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ src/
logs/*.log

# output files: local gitignore added to city_scrapers/local_outputs/

# Vim swap files
.*.swp
212 changes: 212 additions & 0 deletions city_scrapers/spiders/chi_ohare_noise.py
@@ -0,0 +1,212 @@
import re
from datetime import datetime, timedelta

from city_scrapers_core.constants import (
CANCELLED,
COMMISSION,
COMMITTEE,
NOT_CLASSIFIED,
PASSED,
TENTATIVE,
)
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from scrapy import Request


class ChiOhareNoiseSpider(CityScrapersSpider):
name = "chi_ohare_noise"
agency = "Chicago O'Hare Noise Compatibility Commission"
timezone = "America/Chicago"

class ChiOhareNoiseSubSpider1:
Collaborator: Thanks for picking this one up! Looks like it was one of the more difficult ones. I think we'll want to keep it limited to one class here, though, for readability, and split out methods for each start URL (something like _parse_events_start, _parse_docs_start). I could be missing some complexity here, though; is there a reason for not doing that?
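A sketch of the suggested single-class layout; the method names come from the comment above, and the bodies are placeholders for the logic currently in the two nested classes:

```python
from city_scrapers_core.spiders import CityScrapersSpider
from scrapy import Request


class ChiOhareNoiseSpider(CityScrapersSpider):
    name = "chi_ohare_noise"
    agency = "Chicago O'Hare Noise Compatibility Commission"
    timezone = "America/Chicago"

    def start_requests(self):
        # One callback per start URL instead of one nested class per URL.
        yield Request(
            "https://www.oharenoise.org/meetings-all/year.listevents/",
            callback=self._parse_events_start,
        )
        yield Request(
            "https://www.oharenoise.org/about-oncc/agendas-and-minutes",
            callback=self._parse_docs_start,
        )

    def _parse_events_start(self, response):
        ...  # logic currently in ChiOhareNoiseSubSpider1.parse

    def _parse_docs_start(self, response):
        ...  # logic currently in ChiOhareNoiseSubSpider2.parse
```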

def parse(self, response):
"""
`parse` should always `yield` Meeting items.

Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
needs.
"""
for item in response.css(".ev_td_li"):
surl = self._parse_url(item)
yield Request(url=response.urljoin(surl), callback=self._parse_details)
Collaborator: Might be simpler to use the response.follow shortcut method here, like you did below.
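A minimal sketch of that shortcut; response.follow resolves relative URLs on its own, so the explicit urljoin isn't needed:

```python
for item in response.css(".ev_td_li"):
    yield response.follow(self._parse_url(item), callback=self._parse_details)
```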


next_page = response.xpath("//div[@class='previousmonth']/a/@href").get()
if next_page is not None:
Collaborator: We might want to limit this to only calling the next page once so that we don't hammer the site with a ton of requests every time (see the sketch after this method).

yield response.follow(response.urljoin(next_page), callback=self.parse)
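Per the pagination comment above, one way to cap the crawl is to thread a flag through cb_kwargs so the previous-month link is followed only once; a sketch, assuming a single extra month of events is enough:

```python
def parse(self, response, follow_next=True):
    for item in response.css(".ev_td_li"):
        yield response.follow(self._parse_url(item), callback=self._parse_details)
    next_page = response.xpath("//div[@class='previousmonth']/a/@href").get()
    if follow_next and next_page is not None:
        # Follow pagination once, then stop on the next call.
        yield response.follow(
            next_page, callback=self.parse, cb_kwargs={"follow_next": False}
        )
```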

def _parse_details(self, response):
stime = self._parse_start(response)
meeting = Meeting(
title=self._parse_title(response).replace("CANCELLED ", "").strip("- "),
description=self._parse_description(response),
start=stime,
end=stime + timedelta(hours=1),
Collaborator: We can leave this as None if we aren't parsing it, and potentially pass a string to the time_notes kwarg with any details about scheduling (see the sketch after this constructor).

all_day=False,
location=self._parse_location(response),
links=self._parse_links(response),
source=response.url,
)
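The suggested change to the constructor, sketched; the time_notes wording is illustrative, not taken from the site:

```python
meeting = Meeting(
    title=self._parse_title(response).replace("CANCELLED ", "").strip("- "),
    description=self._parse_description(response),
    start=stime,
    end=None,  # not parsed from the page
    time_notes="End times are not listed on the site.",  # illustrative
    all_day=False,
    location=self._parse_location(response),
    links=self._parse_links(response),
    source=response.url,
)
```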

meeting = self._get_status(meeting)
Collaborator: We'll want to pass the text kwarg with the raw title here so that we can check whether or not it contains "cancelled", which is handled for you in get_status.

Also, we should make sure that get_status is assigning meeting['status'] and not overwriting the meeting variable entirely. That goes for _parse_classification as well.
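A sketch of those assignments, assuming the single-class refactor above (so the inherited _get_status(item, text="") helper is in scope and returns a status constant) and a _parse_classification reworked to return just the constant:

```python
raw_title = self._parse_title(response)
meeting["status"] = self._get_status(meeting, text=raw_title)
meeting["classification"] = self._parse_classification(raw_title)
```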

meeting = self._parse_classification(meeting)
return meeting

def _parse_title(self, response):
"""Parse or generate meeting title."""
return response.xpath(
"//div[@class='jev_evdt_header']/div/h2/text()"
).extract()[0]

def _parse_url(self, response):
"""Parse or generate meeting title."""
Collaborator: The docstring can be deleted here if it doesn't apply.

return [i.strip() for i in response.xpath("p/a/@href").extract()][0]

def _parse_description(self, response):
"""Parse or generate meeting description."""
return response.xpath("//div[@class='jev_evdt_desc']/p/text()").get() or ""

def _parse_classification(self, meeting):
"""Parse or generate classification from allowed options."""
if "committee" in meeting["title"].lower():
meeting["classification"] = COMMITTEE
elif "commission" in meeting["title"].lower():
meeting["classification"] = COMMISSION
else:
meeting["classification"] = NOT_CLASSIFIED
Collaborator: Since the overall organization is a commission, it's safe to default to COMMISSION instead of NOT_CLASSIFIED.
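Suggested default, per the comment above:

```python
if "committee" in meeting["title"].lower():
    meeting["classification"] = COMMITTEE
else:
    # The parent body is a commission, so COMMISSION is a safe default.
    meeting["classification"] = COMMISSION
```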

return meeting

def _parse_start(self, response):
"""Parse start datetime as a naive datetime object."""
return datetime.strptime(
" ".join(
[
i.strip()
for i in response.xpath(
"//div[@class='jev_evdt_header']/div/p/text()"
).extract()
]
),
"%A, %B %d, %Y %I:%M%p",
)

def _parse_all_day(self, response):
"""Parse or generate all-day status. Defaults to False."""
return False

def _parse_location(self, response):
"""Parse or generate location."""
addr = re.split(
"-|,",
response.xpath("//div[@class='jev_evdt_location']/text()")
.extract()[0]
.strip(),
maxsplit=1,
)
return {
# Using reverse indexing for the cases
# where there is no building name or no location
"address": addr[-1].strip(),
"name": addr[0].strip(),
}

def _parse_links(self, response):
"""Parse or generate links."""
return [
Collaborator: This might not always be a Zoom link, so we'll want to have checks for that here (see the sketch after this method).

{"href": item, "title": "Zoom Link"}
for item in response.xpath(
"//div[@class='jev_evdt_desc']/p/a/@href"
).extract()[:1]
]
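A sketch of that check; the "zoom.us" substring test and the fallback title are assumptions about how the links are formatted:

```python
def _parse_links(self, response):
    """Parse or generate links."""
    links = []
    for href in response.xpath("//div[@class='jev_evdt_desc']/p/a/@href").extract():
        title = "Zoom Link" if "zoom.us" in href else "Meeting Link"
        links.append({"href": response.urljoin(href), "title": title})
    return links
```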

def _parse_source(self, response):
"""Parse or generate source."""
return response.url

def _get_status(self, meeting):
Collaborator: Missed this when looking at the implementation above, but this functionality is all contained in the CityScrapersSpider get_status method if you pass the title text to the text kwarg so it can check for cancellation info.
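For reference, the inherited helper behaves roughly like this paraphrase (not the actual city_scrapers_core source; the real word list and status logic may differ):

```python
def _get_status(self, item, text=""):
    # Combine the title, description, and any extra text, then look for
    # cancellation language before falling back to the start time.
    combined = " ".join([item["title"], item.get("description", ""), text]).lower()
    if any(word in combined for word in ("cancel", "postpone", "rescheduled")):
        return CANCELLED
    if item["start"] < datetime.now():
        return PASSED
    return TENTATIVE
```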

if "cancelled" in meeting["title"].lower():
meeting["status"] = CANCELLED
elif datetime.now() > meeting["end"]:
meeting["status"] = PASSED
else:
meeting["status"] = TENTATIVE

return meeting

class ChiOhareNoiseSubSpider2:
def parse(self, response):
"""
`parse` should always `yield` Meeting items.

Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
needs.
"""
for item in response.css("tr.cat-list-row0, tr.cat-list-row1"):
meeting = Meeting(
title=self._parse_title(item),
start=self._parse_start(item),
links=self._parse_links(item, response),
source=self._parse_source(response),
)

meeting = self._get_status(meeting)
meeting = self._parse_classification(meeting)
yield meeting
next_page = response.xpath("//li[@class='pagination-next']/a/@href").get()
if next_page is not None:
Collaborator: Same comment as above; we can limit this to only getting the next page once if there's a _parse_page method that is called here instead of running all of the logic in parse (sketch below).

yield response.follow(response.urljoin(next_page), callback=self.parse)
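A sketch of the _parse_page split the comment describes (the method name comes from the comment; stopping after one extra page is an assumption):

```python
def parse(self, response):
    yield from self._parse_page(response)
    # Follow pagination a single time, from the first page only.
    next_page = response.xpath("//li[@class='pagination-next']/a/@href").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self._parse_page)

def _parse_page(self, response):
    for item in response.css("tr.cat-list-row0, tr.cat-list-row1"):
        ...  # build and yield the Meeting as above
```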

def _parse_title(self, item):
"""Parse or generate meeting title."""
return item.css(".djc_category span").xpath("text()").extract()[0]

def _parse_start(self, item):
"""Parse start datetime as a naive datetime object."""
return datetime.strptime(
item.css(".djc_producer span").xpath("text()").extract()[0], "%B %d, %Y"
)

def _parse_links(self, item, response):
"""Parse or generate links."""
links = item.xpath('td[@class="djc_price"]/div/ul/li/a')
return [
{
"href": response.url
Collaborator: Is there a reason for recreating the query parameters here? It looks like response.urljoin(link.attrib['href']) could work (see the sketch after this method).

+ "?"
+ link.xpath("@href").extract()[0].split("?")[1],
"title": link.xpath("span/text()").extract()[0],
}
for link in links
]
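The simpler equivalent the comment suggests, assuming each href is a URL whose query string urljoin preserves (attrib is Scrapy's attribute mapping on selectors):

```python
def _parse_links(self, item, response):
    """Parse or generate links."""
    return [
        {
            "href": response.urljoin(link.attrib["href"]),
            "title": link.xpath("span/text()").get(),
        }
        for link in item.xpath('td[@class="djc_price"]/div/ul/li/a')
    ]
```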

def _parse_source(self, response):
"""Parse or generate source."""
return response.url

def _get_status(self, meeting):
if "cancelled" in meeting["title"].lower():
meeting["status"] = CANCELLED
elif datetime.now() > meeting["start"]:
meeting["status"] = PASSED
else:
meeting["status"] = TENTATIVE

return meeting

def _parse_classification(self, meeting):
"""Parse or generate classification from allowed options."""
if "committee" in meeting["title"].lower():
meeting["classification"] = COMMITTEE
elif "commission" in meeting["title"].lower():
meeting["classification"] = COMMISSION
else:
meeting["classification"] = NOT_CLASSIFIED
return meeting

def start_requests(self):
urls = [
"https://www.oharenoise.org/meetings-all/year.listevents/",
"https://www.oharenoise.org/about-oncc/agendas-and-minutes",
]
yield Request(urls[0], callback=self.ChiOhareNoiseSubSpider1().parse)
yield Request(urls[1], callback=self.ChiOhareNoiseSubSpider2().parse)