This repository has been archived by the owner on May 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 176
219 lines (193 loc) · 7.08 KB
/
run_scrapers.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Runs one scraper per enabled canton, commits the refreshed per-canton CSV,
# validates it, then rebuilds the merged CH files and the README dates.
name: Run scrapers

on:
  schedule:
    # Every 20 minutes, around the clock: the hour field is `*`.
    # NOTE(review): the previous comment claimed a 05:00-21:00 UTC window; that
    # would need an hour field of `5-21`. Confirm which behavior is intended.
    - cron: '*/20 * * * *'
  # Allows starting the workflow manually.
  workflow_dispatch:

jobs:
  # One job per matrix entry: scrape, commit the changed CSV, validate, notify.
  run_scraper:
    runs-on: ubuntu-20.04
    continue-on-error: false
    timeout-minutes: 10
    strategy:
      # A failing canton must not cancel the jobs of the other cantons.
      fail-fast: false
      matrix:
        canton:
          # - AG
          # - AI  # currently no data available
          - AR
          - BE
          - BL
          - BS
          # - FR
          - GE
          - GL
          # - GR
          - JU
          - LU
          - NE
          - NW
          # - OW
          - SG
          - SH
          # - SO
          # - SZ
          - TG
          # - TI
          # - UR  # no more data available
          - VD
          - VS
          - ZG
          - ZH
          - FL
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.7
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is always passed as a string, never a float.
          python-version: '3.7'
      - run: npm ci
      - name: Remove broken apt repos
        run: |
          for apt_file in $(grep -lr microsoft /etc/apt/sources.list.d/); do sudo rm "$apt_file"; done
      - name: Install dependencies
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        # `-y` keeps apt-get from aborting on a confirmation prompt, since the
        # runner has no interactive terminal to answer it.
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          sudo apt update || true # do not fail if update does not work
          sudo apt-get install -y sqlite3
          sudo apt-get install -y poppler-utils
          if [ "$SCRAPER_KEY" = "GE" ] ; then
            sudo apt-get install -y chromium-browser
          fi
      - name: Scrape new data
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
          # The matrix above defines no `overwrite` entry, so this expression
          # currently expands to an empty string.
          SCRAPER_OVERWRITE: ${{ matrix.overwrite }}
        run: |
          ./scrapers/run_scraper.sh
      - name: Check if there are changes in the repo
        id: changes
        # Exposes `changed` (0 or 1) as a step output for the steps below.
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      - name: Set commit message
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        # The FL file name has no `Kanton_` part, hence the special case.
        run: |
          if [ "$SCRAPER_KEY" = "FL" ] ; then
            echo "commit_msg=Update COVID19_Fallzahlen_${SCRAPER_KEY}_total.csv from scraper" >> "$GITHUB_ENV"
          else
            echo "commit_msg=Update COVID19_Fallzahlen_Kanton_${SCRAPER_KEY}_total.csv from scraper" >> "$GITHUB_ENV"
          fi
      # Sleep here to prevent two workers from committing at the same time;
      # only sleep if we will try to commit.
      - name: Sleep randomly
        if: steps.changes.outputs.changed == 1
        # Waits 1-30 seconds; `$(( ))` replaces the deprecated `$[ ]` form.
        run: sleep "$(( (RANDOM % 30) + 1 ))s"
      # Only try to commit if there are actually changes.
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1
        # NOTE(review): this ref was masked as "[email protected]" in the
        # reviewed copy; v2.9 is assumed - confirm.
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          # NOTE(review): the address was masked as "[email protected]" in
          # the reviewed copy; confirm the intended committer address.
          email: scraper@open.zh.ch
          commit-message: ${{ env.commit_msg }}
          rebase: 'true'
      - name: Validate scraper output
        # A validation failure is reported to Slack but must not fail the job.
        continue-on-error: true
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        # The status is set to "failed" first and only overwritten with
        # "success" when the validation script returns without error.
        run: |
          echo "validate_status=failed" >> "$GITHUB_ENV"
          ./scrapers/validate_scraper_output.sh
          echo "validate_status=success" >> "$GITHUB_ENV"
      - name: Get current unix timestamp
        if: always()
        id: date
        run: echo "ts=$(date +'%s')" >> "$GITHUB_OUTPUT"
      - name: Notify slack validation error
        if: ${{ env.validate_status == 'failed' }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: pullreminders/slack-action@master
        with:
          args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"warning\", \"title\": \"CSV validation for ${{ matrix.canton }} failed\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: CSV validation after scraping failed\", \"footer\": \"<https://github.com/openZH/covid_19|openZH/covid_19>\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
      - name: Notify slack failure
        if: ${{ failure() || cancelled() }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: pullreminders/slack-action@master
        with:
          args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run Scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: Scraper failed\", \"footer\": \"<https://github.com/openZH/covid_19|openZH/covid_19>\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'

  # Rebuilds the merged CH files once all scraper jobs have finished.
  merge_csvs:
    needs: run_scraper
    # Run even when individual scraper jobs failed.
    if: always()
    runs-on: ubuntu-20.04
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v3
        with:
          # Without an explicit ref the checkout uses the commit that triggered
          # the workflow, which predates the CSV commits run_scraper pushed to
          # master during this run.
          ref: master
      - name: Setup Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: '2.6'
      # Create the merged v2 file.
      - name: Print header line and then merge all files without header line
        run: ruby scripts/merge_canton_csvs.rb > COVID19_Fallzahlen_CH_total_v2.csv
      # Derive the v1 files from the v2 files.
      - name: Create v1 files based on new structure
        run: |
          ./scripts/transform_all_new2old.sh
          ./scripts/new2oldcsv.py COVID19_Fallzahlen_CH_total_v2.csv > COVID19_Fallzahlen_CH_total.csv
      - name: Check if there are changes in the repo
        id: changes
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      # Commit to repo with updated file; only if there are actually changes.
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1
        # NOTE(review): this ref was masked as "[email protected]" in the
        # reviewed copy; v2.9 is assumed - confirm.
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          # NOTE(review): the address was masked as "[email protected]" in
          # the reviewed copy; confirm the intended committer address.
          email: scraper@open.zh.ch
          commit-message: Update COVID19_Fallzahlen_CH_total.csv
          rebase: 'true'

  # Refreshes the per-canton update dates in the README after the merge job.
  update_readme:
    needs: merge_csvs
    # Run even when the merge job failed.
    if: always()
    runs-on: ubuntu-20.04
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v3
        with:
          # Use the current master so the commits pushed earlier in this run
          # are visible; the default ref is the commit that triggered the run.
          ref: master
      - name: Update README with latest update dates from cantons
        run: ./scripts/update_dates_in_readme.sh
      - name: Check if there are changes in the repo
        id: changes
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      # Commit to repo with updated file; only if there are actually changes.
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1
        # NOTE(review): this ref was masked as "[email protected]" in the
        # reviewed copy; v2.9 is assumed - confirm.
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          # NOTE(review): the address was masked as "[email protected]" in
          # the reviewed copy; confirm the intended committer address.
          email: scraper@open.zh.ch
          commit-message: Update dates in README
          rebase: 'true'