-
-
Notifications
You must be signed in to change notification settings - Fork 311
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
0673 spider chi ohare noise #972
base: main
Are you sure you want to change the base?
Changes from all commits
3352066
15b6cf6
fcca613
16973d3
f7147ad
c6d69e0
ec1a12e
58fb97b
964e2ab
e6d8234
dc8f7fb
f3e3066
4eb0409
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,3 +133,6 @@ src/ | |
logs/*.log | ||
|
||
# output files: local gitignore added to city_scrapers/local_outputs/ | ||
|
||
# Vim swap files | ||
.*.swp |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
import re | ||
from datetime import datetime, timedelta | ||
|
||
from city_scrapers_core.constants import ( | ||
CANCELLED, | ||
COMMISSION, | ||
COMMITTEE, | ||
NOT_CLASSIFIED, | ||
PASSED, | ||
TENTATIVE, | ||
) | ||
from city_scrapers_core.items import Meeting | ||
from city_scrapers_core.spiders import CityScrapersSpider | ||
from scrapy import Request | ||
|
||
|
||
class ChiOhareNoiseSpider(CityScrapersSpider): | ||
name = "chi_ohare_noise" | ||
agency = "Chicago O'Hare Noise Compatibility Commission" | ||
timezone = "America/Chicago" | ||
|
||
class ChiOhareNoiseSubSpider1: | ||
def parse(self, response): | ||
""" | ||
`parse` should always `yield` Meeting items. | ||
|
||
Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping | ||
needs. | ||
""" | ||
for item in response.css(".ev_td_li"): | ||
surl = self._parse_url(item) | ||
yield Request(url=response.urljoin(surl), callback=self._parse_details) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be simpler to use the |
||
|
||
next_page = response.xpath("//div[@class='previousmonth']/a/@href").get() | ||
if next_page is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might want to limit this to only calling the next page once so that we don't hammer the site with a ton of requests every time. |
||
yield response.follow(response.urljoin(next_page), callback=self.parse) | ||
|
||
def _parse_details(self, response): | ||
stime = self._parse_start(response) | ||
meeting = Meeting( | ||
title=self._parse_title(response).replace("CANCELLED ", "").strip("- "), | ||
description=self._parse_description(response), | ||
start=stime, | ||
end=stime + timedelta(hours=1), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can leave this as |
||
all_day=False, | ||
location=self._parse_location(response), | ||
links=self._parse_links(response), | ||
source=response.url, | ||
) | ||
|
||
meeting = self._get_status(meeting) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll want to pass the Also, we should make sure that |
||
meeting = self._parse_classification(meeting) | ||
return meeting | ||
|
||
def _parse_title(self, response): | ||
"""Parse or generate meeting title.""" | ||
return response.xpath( | ||
"//div[@class='jev_evdt_header']/div/h2/text()" | ||
).extract()[0] | ||
|
||
def _parse_url(self, response): | ||
"""Parse or generate meeting title.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The docstring can be deleted here if it doesn't apply |
||
return [i.strip() for i in response.xpath("p/a/@href").extract()][0] | ||
|
||
def _parse_description(self, response): | ||
"""Parse or generate meeting description.""" | ||
return response.xpath("//div[@class='jev_evdt_desc']/p/text()").get() or "" | ||
|
||
def _parse_classification(self, meeting): | ||
"""Parse or generate classification from allowed options.""" | ||
if "committee" in meeting["title"].lower(): | ||
meeting["classification"] = COMMITTEE | ||
elif "commission" in meeting["title"].lower(): | ||
meeting["classification"] = COMMISSION | ||
else: | ||
meeting["classification"] = NOT_CLASSIFIED | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the overall organization is a commission it's safe to default to |
||
return meeting | ||
|
||
def _parse_start(self, response): | ||
"""Parse start datetime as a naive datetime object.""" | ||
return datetime.strptime( | ||
" ".join( | ||
[ | ||
i.strip() | ||
for i in response.xpath( | ||
"//div[@class='jev_evdt_header']/div/p/text()" | ||
).extract() | ||
] | ||
), | ||
"%A, %B %d, %Y %I:%M%p", | ||
) | ||
|
||
def _parse_all_day(self, response): | ||
"""Parse or generate all-day status. Defaults to False.""" | ||
return False | ||
|
||
def _parse_location(self, response): | ||
"""Parse or generate location.""" | ||
addr = re.split( | ||
"-|,", | ||
response.xpath("//div[@class='jev_evdt_location']/text()") | ||
.extract()[0] | ||
.strip(), | ||
maxsplit=1, | ||
) | ||
return { | ||
# Using reverse indexing for the cases | ||
# where there is no building name or no location | ||
"address": addr[-1].strip(), | ||
"name": addr[0].strip(), | ||
} | ||
|
||
def _parse_links(self, response): | ||
"""Parse or generate links.""" | ||
return [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might not always be a Zoom link, so we'll want to have checks for that here |
||
{"href": item, "title": "Zoom Link"} | ||
for item in response.xpath( | ||
"//div[@class='jev_evdt_desc']/p/a/@href" | ||
).extract()[:1] | ||
] | ||
|
||
def _parse_source(self, response): | ||
"""Parse or generate source.""" | ||
return response.url | ||
|
||
def _get_status(self, meeting): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missed this when looking at the implementation above, but this functionality is all contained in the |
||
if "cancelled" in meeting["title"].lower(): | ||
meeting["status"] = CANCELLED | ||
elif datetime.now() > meeting["end"]: | ||
meeting["status"] = PASSED | ||
else: | ||
meeting["status"] = TENTATIVE | ||
|
||
return meeting | ||
|
||
class ChiOhareNoiseSubSpider2: | ||
def parse(self, response): | ||
""" | ||
`parse` should always `yield` Meeting items. | ||
|
||
Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping | ||
needs. | ||
""" | ||
for item in response.css("tr.cat-list-row0, tr.cat-list-row1"): | ||
meeting = Meeting( | ||
title=self._parse_title(item), | ||
start=self._parse_start(item), | ||
links=self._parse_links(item, response), | ||
source=self._parse_source(response), | ||
) | ||
|
||
meeting = self._get_status(meeting) | ||
meeting = self._parse_classification(meeting) | ||
yield meeting | ||
next_page = response.xpath("//li[@class='pagination-next']/a/@href").get() | ||
if next_page is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above, we can limit this to only getting the next page once if there's a |
||
yield response.follow(response.urljoin(next_page), callback=self.parse) | ||
|
||
def _parse_title(self, item): | ||
"""Parse or generate meeting title.""" | ||
return item.css(".djc_category span").xpath("text()").extract()[0] | ||
|
||
def _parse_start(self, item): | ||
"""Parse start datetime as a naive datetime object.""" | ||
return datetime.strptime( | ||
item.css(".djc_producer span").xpath("text()").extract()[0], "%B %d, %Y" | ||
) | ||
|
||
def _parse_links(self, item, response): | ||
"""Parse or generate links.""" | ||
links = item.xpath('td[@class="djc_price"]/div/ul/li/a') | ||
return [ | ||
{ | ||
"href": response.url | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason for recreating the query parameters here? It looks like |
||
+ "?" | ||
+ link.xpath("@href").extract()[0].split("?")[1], | ||
"title": link.xpath("span/text()").extract()[0], | ||
} | ||
for link in links | ||
] | ||
|
||
def _parse_source(self, response): | ||
"""Parse or generate source.""" | ||
return response.url | ||
|
||
def _get_status(self, meeting): | ||
if "cancelled" in meeting["title"].lower(): | ||
meeting["status"] = CANCELLED | ||
elif datetime.now() > meeting["start"]: | ||
meeting["status"] = PASSED | ||
else: | ||
meeting["status"] = TENTATIVE | ||
|
||
return meeting | ||
|
||
def _parse_classification(self, meeting): | ||
"""Parse or generate classification from allowed options.""" | ||
if "committee" in meeting["title"].lower(): | ||
meeting["classification"] = COMMITTEE | ||
elif "commission" in meeting["title"].lower(): | ||
meeting["classification"] = COMMISSION | ||
else: | ||
meeting["classification"] = NOT_CLASSIFIED | ||
return meeting | ||
|
||
def start_requests(self): | ||
urls = [ | ||
"https://www.oharenoise.org/meetings-all/year.listevents/", | ||
"https://www.oharenoise.org/about-oncc/agendas-and-minutes", | ||
] | ||
yield Request(urls[0], callback=self.ChiOhareNoiseSubSpider1().parse) | ||
yield Request(urls[1], callback=self.ChiOhareNoiseSubSpider2().parse) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for picking this one up! Looks like it was one of the more difficult ones. I think we'll want to keep it limited to one class here though for readability and split out methods for each (something like
_parse_events_start
,_parse_docs_start
).I could be missing some complexity here though, is there a reason for not doing that?