forked from lucyparsons/OpenOversight
-
Notifications
You must be signed in to change notification settings - Fork 11
/
spd_2021_salary_data.py
executable file
·91 lines (77 loc) · 2.82 KB
/
spd_2021_salary_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
import logging
from io import StringIO
from pathlib import Path
import click
import common
import pandas as pd
import requests
log = logging.getLogger(__name__)
URL = "https://data.seattle.gov/api/views/2khk-5ukd/rows.csv?accessType=DOWNLOAD"
def _lower(df: pd.DataFrame) -> list[pd.DataFrame]:
return [df["last"].str.lower(), df["first"].str.lower()]
def match_salary_data(
ids: pd.DataFrame,
url: str,
convert_id: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame]:
response = requests.get(url)
buffer = StringIO(response.text)
# Filter down to police department related only, get only columns needed
df = pd.read_csv(buffer)
df = df[
df["Department"].isin(["Police Department", "Commnty Sfty and Comm Ctr Dept"])
]
df = df[["Last Name", "First Name", "Hourly Rate "]]
df.columns = ["last", "first", "hourly"]
ids.columns = ["id", "last", "first"]
# Remove Jr, IV, II, III, etc.
df.loc[:, "last"] = df["last"].replace(
r" ?((Jr)|(II)|(III)|(IV))\.?", "", regex=True
)
# Merge with prod data based on the first and last names
# Do a case-agnostic comparison here
merged = df.merge(ids, how="left", left_on=_lower(df), right_on=_lower(ids))
if convert_id:
merged = merged.astype({"id": pd.Int64Dtype()})
# Estimate yearly pay based on hourly rate
# ASSUMED: 40 hour work week, 50 weeks a year
merged["salary"] = merged["hourly"] * 40 * 50
# Split off the links that don't have an OpenOversight badge associated with them
_has_id = merged["id"].notna()
missing = merged[~_has_id]
merged = merged[_has_id]
# Reduce columns even more
merged = merged[["salary", "id"]]
# Rename columns for OO
merged.columns = ["salary", "officer_id"]
# Add an empty id column
merged["id"] = None
# Set the year to 2020
merged["year"] = 2021
# Convert the salary from "$###,###.##" to a float
merged.loc[:, "salary"] = (
merged["salary"].replace(r"[\$,]", "", regex=True).astype(float)
)
# Set overtime pay to -1 since the year is not over
merged["overtime_pay"] = -1
return merged, missing
def main(id_path: Path, output: Path, url: str = URL):
log.info("Starting import")
ids = pd.read_csv(
id_path,
usecols=["id", "first name", "last name"],
)
merged, missing = match_salary_data(ids, url)
common.write_files_with_missing(merged, missing, output)
@click.command()
@click.argument("id_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output", type=click.Path(path_type=Path))
def cli(id_path: Path, output: Path):
logging.basicConfig(
format="[%(asctime)s - %(name)s - %(lineno)3d][%(levelname)s] %(message)s",
level=logging.INFO,
)
main(id_path, output)
if __name__ == "__main__":
cli()