# This is a basic workflow to help you get started with Actions
name: Scrape
# Controls when the workflow will run
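# The 04:15 UTC cron is the daily incremental scrape; the Saturday 00:00 UTC
# cron is the one matched by the "weekly big scrape" step below.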
on:
  schedule:
    - cron: '15 4 * * *'
    - cron: '0 0 * * 6'
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      specific_scraper:
        type: choice
        description: Which scraper to run?
        options:
          - bills
          - events
          - people
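      # Number of days back to scrape; forwarded to pupa as a scraper argument
      # (window=<days>) by the "run scraper with window" step.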
      window:
        description: How many days to scrape?
        type: string
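
# Keep scrape runs from overlapping: a newly triggered run in this group waits
# for the run already in progress to finish.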
concurrency:
  group: chicago-scraper

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # The "scrape" job runs the pupa scrapers and then post-processes the database
  scrape:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v3
      - name: install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgdal-dev
          pip install --upgrade pip
          pip install -r requirements.txt
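      # Default run (daily cron, or manual dispatch without a window): incremental
      # scrape of the requested scraper, or all scrapers when specific_scraper is
      # empty; --rpm=0 disables pupa's requests-per-minute throttling.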
      - name: run scraper without window
        if: ${{ !inputs.window && github.event.schedule != '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: pupa update chicago ${{ inputs.specific_scraper }} --rpm=0
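      # Saturday run: rescrape people, plus a 180-day window of bills and events
      # (window=... is passed through to the individual scrapers).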
      - name: weekly big scrape
        if: ${{ !inputs.window && github.event.schedule == '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |-
          pupa update chicago people --rpm=0
          pupa update chicago bills window=180 --rpm=0
          pupa update chicago events window=180 --rpm=0
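      # Manual run: only fires when the workflow_dispatch "window" input is set.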
      - name: run scraper with window
        if: ${{ inputs.window }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: pupa update chicago ${{ inputs.specific_scraper }} window=${{ inputs.window }} --rpm=0
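      # Post-processing: run the SQL scripts and make targets in this repo against
      # the scrape database (the step names describe their presumed effect; the
      # scripts themselves aren't shown here).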
      - name: update vote count
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql $(echo $DATABASE_URL) -f scripts/vote_counts.sql
      - name: reconstruct agendas
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql $(echo $DATABASE_URL) -f scripts/reconstruct_agendas.sql
      - name: merge terms
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql $(echo $DATABASE_URL) -f scripts/merge_memberships.sql
      - name: add topics
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: make add_topics
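      # keepalive-workflow pushes an empty commit if the repository has had no
      # recent activity, so GitHub doesn't disable the cron schedule after 60
      # days of inactivity.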
      - name: keepalive
        uses: gautamkrishnar/keepalive-workflow@v1
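
  # After the scrape, run Django management commands on the Heroku app to
  # refresh the search index, recompute person statistics, and clear the cache.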
  index-and-stats:
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - name: update search index
        uses: michcio1234/[email protected]
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_email: ${{ secrets.HEROKU_ACCOUNT }}
          heroku_app_name: ${{ secrets.HEROKU_APP }}
          command: python manage.py update_index --batch-size=50 --age=1
      - name: update stats
        uses: michcio1234/[email protected]
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_email: ${{ secrets.HEROKU_ACCOUNT }}
          heroku_app_name: ${{ secrets.HEROKU_APP }}
          command: python manage.py populate_person_statistics
      - name: clear cache
        uses: michcio1234/[email protected]
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_email: ${{ secrets.HEROKU_ACCOUNT }}
          heroku_app_name: ${{ secrets.HEROKU_APP }}
          command: python manage.py clear_cache
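
  # Export the Postgres data to SQLite and attach it to a GitHub release so the
  # scrape results can be downloaded without database access.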
  export:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - uses: actions/checkout@v3
      - name: install dependencies
        run: |
pip install "db-to-sqlite[postgresql] @ https://github.com/sgraaf/db-to-sqlite/archive/refs/heads/main.zip"
pip install "sqlalchemy<2.0"
      - name: export
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |
          db-to-sqlite $(echo $DATABASE_URL) chicago_council.db --table-name-pattern opencivicdata_*
          cat scripts/rename.sql | sqlite3 chicago_council.db | sqlite3 chicago_council.db
          zip chicago_council.db.zip chicago_council.db
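      # Upload (or replace) the zipped database as an asset on a fixed, existing
      # GitHub release, identified by the hard-coded release_id below.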
      - name: Push data
        uses: WebFreak001/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # automatically provided by github actions
        with:
          upload_url: https://uploads.github.com/repos/datamade/chicago-council-scrapers/releases/80533645/assets{?name,label}
          release_id: 80533645 # same as above (the id can just be taken out of the upload_url; it's used to find old releases)
          asset_path: ./chicago_council.db.zip # path to archive to upload
          asset_name: chicago_council.db.zip # name to upload the release as, use $$ to insert date (YYYYMMDD) and 6 letter commit hash
          asset_content_type: application/zip # required by GitHub API
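
  # The "summaries" job below is currently disabled (commented out); it appears
  # to OCR bill attachments and generate summaries via the OpenAI API.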
  # summaries:
  #   # The type of runner that the job will run on
  #   runs-on: ubuntu-latest
  #   needs: scrape
  #   defaults:
  #     run:
  #       working-directory: ./bill_summarize
  #   steps:
  #     - uses: actions/checkout@v3
  #     - name: install dependencies
  #       run: |
  #         sudo add-apt-repository --yes ppa:alex-p/tesseract-ocr5
  #         sudo apt-get update
  #         sudo apt-get install -y tesseract-ocr poppler-utils
  #         pip install --upgrade pip
  #         pip install -r requirements.txt
  #     - name: run pipeline
  #       env:
  #         DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
  #         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  #       run: |
  #         make updated_summaries_db
  #         make summaries.csv
  #     - name: Commit changes
  #       uses: EndBug/add-and-commit@v7
  #       with:
  #         add: bill_summarize/summaries.csv
  #         message: 'update summary'