Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🏗 Build Spider: Los Angeles Board of Education #10

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions city_scrapers/spiders/losca_Board_of_ed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from city_scrapers_core.constants import BOARD
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse


class LoscaBoardOfEdSpider(CityScrapersSpider):
# Spider identifier; it is also the prefix of every generated meeting id.
name = "losca_Board_of_ed"
agency = "Los Angeles Unified School District Board of Education"
# The agency meets in Los Angeles, so naive datetimes scraped from the feed
# are Pacific time (the previous "America/Chicago" value was Central time).
timezone = "America/Los_Angeles"
# The human-facing page is https://www.lausd.org/boe, but the site also
# publishes an RSS feed of the same events; scrape the feed instead.
start_urls = [
    "https://www.lausd.org/site/RSS.aspx?DomainID=1057&ModuleInstanceID=73805&PageID=18628&PMIID=0"  # noqa
]
cruznunez marked this conversation as resolved.
Show resolved Hide resolved

def parse(self, response):
    """
    Yield one Meeting for every <item> element in the RSS response.

    All meetings share the same hard-coded venue. Title, start, end and
    links come from the per-item helper methods; status and id are then
    computed from the assembled meeting.
    """
    venue = {
        "name": "LAUSD Headquarters",
        "address": "333 South Beaudry Avenue, Board Room, Los Angeles, CA 90017",
    }
    for entry in response.css("item"):
        # Dict literals evaluate in order, so the helpers run in the same
        # sequence as they would as keyword arguments.
        fields = {
            "title": self._parse_title(entry),
            "description": "",
            "classification": BOARD,
            "start": self._parse_start(entry),
            "end": self._parse_end(entry),
            "all_day": False,
            "time_notes": "",
            "location": venue,
            "links": self._parse_links(entry),
            "source": response.url,
        }
        event = Meeting(**fields)

        # Both values are derived from the populated meeting itself.
        event["status"] = self._get_status(event)
        event["id"] = self._get_id(event)

        yield event

def _parse_title(self, item):
"""
Parse meeting title. RSS feed titles always start with timestamp.
Ex: '9/19/2024 10:00 AM - 1:00 PM Children... Early Education Committee'
Remove timestamp from string and return title.
"""
raw = item.css("title::text").get()
no_stamp = raw.split()[6:-1]
title = " ".join(no_stamp)
return title
cruznunez marked this conversation as resolved.
Show resolved Hide resolved

def _parse_start(self, item):
    """
    Return the meeting start as a naive datetime.

    The start is read from the first three tokens of the title (date,
    time, AM/PM), e.g. '9/19/2024 10:00 AM'. The feed's pubdate is not
    used: per the original author's note it is reported in GMT, whereas
    the title already carries the local time.
    """
    tokens = item.css("title::text").get().split()
    start_text = " ".join(tokens[:3])
    return parse(start_text)
cruznunez marked this conversation as resolved.
Show resolved Hide resolved

def _parse_end(self, item):
    """
    Return the meeting end as a naive datetime.

    The title looks like '9/19/2024 10:00 AM - 1:00 PM <name>': the date
    is the first token and the end time is the fifth and sixth tokens.
    """
    words = item.css("title::text").get().split()
    day = words[0]
    end_clock = " ".join(words[4:6])
    return parse(f"{day} {end_clock}")
cruznunez marked this conversation as resolved.
Show resolved Hide resolved

def _parse_links(self, item):
"""
Parse links. item.get() returns
'...</title><link>https://www.lausd.org...EventDateID=73502<pubdate>...'
This string does not have a closing <link> tag even though the source
response does. This causes item.css('link') to return an empty tag.
We must parse link another way. Chose to use split.
"""
split = item.get().split("<link>")[1]
link = split.split("<pubdate>")[0]
links = [{"title": "Meeting Details", "href": link}]

return links
cruznunez marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions tests/files/losca_Board_of_ed.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

96 changes: 96 additions & 0 deletions tests/test_losca_Board_of_ed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from datetime import datetime
from os.path import dirname, join

import pytest
from city_scrapers_core.constants import BOARD
from city_scrapers_core.utils import file_response
from freezegun import freeze_time

from city_scrapers.spiders.losca_Board_of_ed import LoscaBoardOfEdSpider

# Load the saved RSS fixture as if it had been fetched from the live feed URL.
test_response = file_response(
    join(dirname(__file__), "files", "losca_Board_of_ed.html"),
    url="https://www.lausd.org/site/RSS.aspx?DomainID=1057&ModuleInstanceID=73805&PageID=18628&PMIID=0",  # noqa
)
spider = LoscaBoardOfEdSpider()

# Freeze the clock while parsing so the status assertions below do not
# depend on the day the tests are run. Using the freezer as a context
# manager guarantees the clock is released even if parsing raises.
freezer = freeze_time("2024-09-19")
with freezer:
    parsed_items = list(spider.parse(test_response))


def test_count():
    """The fixture feed should produce exactly twelve meetings."""
    total = len(parsed_items)
    assert total == 12


def test_title():
    """The first two meetings carry titles without the timestamp prefix."""
    expected_titles = (
        "Greening Schools & Climate Resilience",
        "Curriculum and Instruction",
    )
    for position, expected in enumerate(expected_titles):
        assert parsed_items[position]["title"] == expected


def test_description():
    """The spider leaves the description blank."""
    first = parsed_items[0]
    assert first["description"] == ""


def test_start():
    """The first meeting starts on 24 Sep 2024 at 1:00 PM (naive)."""
    expected = datetime(2024, 9, 24, 13, 0)
    assert parsed_items[0]["start"] == expected


def test_end():
    """The first meeting ends on 24 Sep 2024 at 4:00 PM (naive)."""
    expected = datetime(2024, 9, 24, 16, 0)
    assert parsed_items[0]["end"] == expected


def test_time_notes():
    """The spider leaves the time notes blank."""
    first = parsed_items[0]
    assert first["time_notes"] == ""


def test_id():
    """The id combines spider name, start timestamp and a slug of the title."""
    expected_id = "losca_Board_of_ed/202409241300/x/greening_schools_climate_resilience"
    assert parsed_items[0]["id"] == expected_id


def test_status():
    """With the clock frozen at 2024-09-19 the first meeting is tentative."""
    first = parsed_items[0]
    assert first["status"] == "tentative"


def test_location():
    """Meetings are reported at the hard-coded LAUSD headquarters venue."""
    expected_venue = {
        "name": "LAUSD Headquarters",
        "address": "333 South Beaudry Avenue, Board Room, Los Angeles, CA 90017",
    }
    assert parsed_items[0]["location"] == expected_venue


def test_source():
    """The source of a meeting is the URL of the RSS feed response."""
    feed_url = "https://www.lausd.org/site/RSS.aspx?DomainID=1057&ModuleInstanceID=73805&PageID=18628&PMIID=0"  # noqa
    assert parsed_items[0]["source"] == feed_url


def test_links():
    """
    The first two meetings each expose a single 'Meeting Details' link.

    NOTE(review): the expected hrefs contain the literal entity '&amp;',
    mirroring what the spider currently emits -- confirm that is intended.
    """
    expected_hrefs = (
        "https://www.lausd.org/site/Default.aspx?PageID=18628&amp;PageType=17&amp;DomainID=1057&amp;ModuleInstanceID=73805&amp;EventDateID=73502",  # noqa
        "https://www.lausd.org/site/Default.aspx?PageID=18628&amp;PageType=17&amp;DomainID=1057&amp;ModuleInstanceID=73805&amp;EventDateID=71879",  # noqa
    )
    for position, href in enumerate(expected_hrefs):
        assert parsed_items[position]["links"] == [
            {"title": "Meeting Details", "href": href}
        ]


def test_classification():
    """Meetings are classified as board meetings."""
    first = parsed_items[0]
    assert first["classification"] == BOARD


@pytest.mark.parametrize("item", parsed_items)
def test_all_day(item):
    """No parsed meeting is flagged as an all-day event."""
    all_day_flag = item["all_day"]
    assert all_day_flag is False