-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_creation.Rmd
303 lines (256 loc) · 12.6 KB
/
data_creation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
---
title: "Read & Format SU York data"
author: "Kim Engie"
date: "1/18/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(lubridate)
library(vroom)
```
### Read in csvs, and make a data dictionary of release types
```{r}
basedir = "/Users/kimengie/Documents/Data/York/jail"
cat(file=stderr(), "\nreading data...\n")
inmates <- vroom(file.path(basedir, "yorkopenlattice_withbookingnumbers.csv"),
col_names = c("first", "last", "status", "statusdate", "id", "pouch"))
bookings <- vroom(file.path(basedir, "YCPInmateHistory2017.csv"))
releasetype = tribble(~ReleaseType, ~Description,
"BA","BY BAIL",
"CO", "BY COURT'S ORDER (COURT OF RECORD AND MINOR JUDICIARY)",
"DC", "SENTENCED TO THE DEPARTMENT OF CORRECTIONS",
"DS", "BY DEATH",
"EM", "BY EXPIRATION OF MAXIMUM",
"FC", "BY PAYMENT OF FINES AND COSTS",
"HA", "BY HOLDING AUTHORITY",
"OT", "OTHER",
"PC", "PAROLED BY THE COURT OF RECORD",
"PS", "PAROLED BY STATE BOARD OF PROBATION/PAROLE",
"PT", "PERMANENT TRANSFER TO OTHER INSTITUTION",
"VA", "BY VARIOUS FEDERAL/STATE/POLICE AUTHORITIES)")
```
More csvs - from sftp folder
```{r}
# File list
allfiles <- list.files(path=basedir, pattern="yorkcounty_\\d+.csv", full.names=TRUE, recursive=FALSE)
for (i in allfiles){
name = str_split(i, "/") %>% tail() %>% pluck(1,9) %>% str_remove(".csv")
assign(name, vroom(i, col_types = cols(.default = "c")))
}
# test = "/Users/kimengie/Documents/Data/York/jail//yorkcounty_202201031600.csv"
# str_split(test, "/") %>% tail() %>% pluck(1,9) %>% str_remove(".csv") %>% str_replace("\"", "")
```
Stack the recent csvs as they're all the same format, which is the same format as the bookings table
```{r}
sftp_bookings <- rbind(yorkcounty_202111021124,
yorkcounty_202111230945,
yorkcounty_202111231000,
yorkcounty_202111231015,
yorkcounty_202111231030,
yorkcounty_202111231045,
yorkcounty_202111231100,
yorkcounty_202112101115,
yorkcounty_202112101130,
yorkcounty_202112101145,
yorkcounty_202112141030,
yorkcounty_202112141045,
yorkcounty_202112141100,
yorkcounty_202112141115,
yorkcounty_202112141130,
yorkcounty_202112141145,
yorkcounty_202112141200,
yorkcounty_202112141215,
yorkcounty_202112141230,
yorkcounty_202112291145,
yorkcounty_202112291200,
yorkcounty_202112291215,
yorkcounty_202112291230,
yorkcounty_202112291245,
yorkcounty_202112291300,
yorkcounty_202112291315,
yorkcounty_202112291330,
yorkcounty_202201031600,
yorkcounty_202201101000,
yorkcounty_202201101015,
yorkcounty_202201101030,
yorkcounty_202201101045,
yorkcounty_202201101100,
yorkcounty_202201101115,
yorkcounty_202201101130,
yorkcounty_202201101145,
yorkcounty_202201101200,
yorkcounty_202201101215,
yorkcounty_202201101230,
yorkcounty_202201101245,
yorkcounty_202201101300,
yorkcounty_202201101315,
yorkcounty_202201101330,
yorkcounty_202201101345,
yorkcounty_202201101400,
yorkcounty_202201101415,
yorkcounty_202201141200,
yorkcounty_202201191045) %>%
select(-c(index, TimeServedDays,ReasonCode)) %>% # empty or irrelevant columns
mutate(personID = sapply(strsplit(GeneralIdentifier, ":", fixed=TRUE), head, 1),
personID = str_trim(personID, side = "both")) %>%
mutate(CommitDateTime = parse_date_time(CommitDateTime, '%m/%d/%Y %I:%M:%S %p',
tz = "America/New_York"),
StartDateTime = parse_date_time(StartDateTime, '%m/%d/%Y %I:%M:%S %p',
tz = "America/New_York"),
ProjectedReleaseDateTime = parse_date_time(ProjectedReleaseDateTime,
'%m/%d/%Y %I:%M:%S %p',
tz = "America/New_York"),
DateTimeRelease = parse_date_time(DateTimeRelease,
'%m/%d/%Y %I:%M:%S %p',
tz = "America/New_York"))
```
Add personID to historical bookings table.
```{r}
bookings_clean <- bookings %>%
select(-c(index, TimeServedDays,ReasonCode)) %>% # empty or irrelevant columns
mutate(personID = sapply(strsplit(GeneralIdentifier, ":", fixed=TRUE), head, 1),
personID = str_trim(personID, side = "both"))
```
```{r}
allbookings <- rbind(bookings_clean, sftp_bookings)
allbookings <- distinct(allbookings, across())
```
### Combine MH status & bookings
"The MHSR rating is not assigned to a booking, but rather the patient themselves. So there is no direct relation between a rating and a booking. If a patient comes in on January 5th and gets an A rating, is released on January 8th, they will maintain that rating the next time they are booked in. Unfortunately, due to this, there is no direct correlation between a booking number and a rating, there is only a rating assigned to the patient and the date it was assigned. This is all independent of the booking itself."
* assume the dates of patient rating and booking start/end are always accurate and a person's rating never gets lost
* Interweave & order the dates of bookings & MH ratings per individual.
* If a rating date occurs during a booking (must be between commit & release, inclusive), match it.
* assign each booking after it a rating based on the "statusdate" that is either within it, or the latest prior.
```{r}
# combine all dates in 1 tibble
ratingdates <- inmates %>% arrange(statusdate) %>%
select(pouch, statusdate) %>%
mutate(pouch = as.character(pouch),
statusdate = as.character(statusdate))
booking_ranges <- allbookings %>%
arrange(CommitDateTime) %>%
left_join(ratingdates, by = c("personID"="pouch")) %>%
# This would keep only what's in the group-by, an add a list-col
group_by(personID, GeneralIdentifier, CommitDateTime, DateTimeRelease, ReleaseType, ProjectedReleaseDateTime) %>%
summarize(statusdates = list(statusdate)) %>%
ungroup()
#keeps whole table, adds a list-col
# rowwise() %>%
# mutate(statusdates = list(as.Date(statusdate))) %>%
# ungroup()
# This would keep the whole table but add a list col that's a TIBBLE
# nest(statusdates = c(statusdate))
```
#### Helper function, to match the date somepone was assigned a MHSR rating, to the booking it applies to.
Cannot use `case_when` for the `matchdate`. When `in_booking` is empty: "case_when returns a vector with a length equal to the length of the vector you pass in. So nothing in, nothing out."
Cannot use dplyr if_else, it does not evaluate `character(0)` or `logical(0)` well for T/F
```{r}
rating_to_booking <- function(allratings, start=CommitDateTime, end=DateTimeRelease) {
end <- if_else(!is.na(end), end, Sys.time()) #if the relase date is null, compare to today's date.
if (!is.na(allratings)) {
# Check if there's a statusdate during a booking. If so, return it.
check_in_booking <- between(as.Date(allratings),start,end)
allratings <- as.character(allratings)
in_booking <- case_when(
all(check_in_booking==FALSE) ~ NA_character_,
TRUE ~ allratings[check_in_booking] %>% as.character()
)
# Use the `in_booking` statusdate within the jail booking if exists
# Or if all statusdates are after the booking ends, there are none
# Or, get most recent statusdate before the start
matchdate <- ifelse(length(in_booking) > 0,
in_booking,
ifelse(all(as.Date(allratings) < as.Date(end)),
allratings[max(rev(order(allratings)))] %>% as.character(), # the max date
NA_character_ )
)
return (matchdate)
}
return (NA_character_) # if no rating, return NA
}
```
<!-- TESTING GROUNDS -->
<!-- ```{r} -->
<!-- # ratingdates = test_ratingdates$statusdate -->
<!-- test = -->
<!-- booking_ranges2$statusdates[[1]] -->
<!-- start = booking_ranges2$CommitDateTime -->
<!-- end = booking_ranges2$DateTimeRelease -->
<!-- # if_else(between(testdate, as.Date(start), as.Date(end)), testdate, NA_Date_) -->
<!-- # all(test_ratingdates$statusdate > as.Date(end)) -->
<!-- # between(test_ratingdates$statusdate, as.Date(start), as.Date(end)) -->
<!-- rating_to_booking(allratings = test, -->
<!-- start = start, -->
<!-- end = end) -->
<!-- ``` -->
Match MHSR ratings to bookings
```{r}
booking_ranges2 <- booking_ranges %>%
rowwise() %>%
mutate(statusdate_match = pmap(list(statusdates[[1]], CommitDateTime, DateTimeRelease),
rating_to_booking)) %>%
unnest(cols = statusdate_match)
# booking_ranges2$statusdate_match
```
```{r}
alljail_data <- booking_ranges2 %>%
# join all person names (some won't have statuses)
left_join(inmates %>% select(pouch, first, last) %>% mutate(pouch = as.character(pouch)),
by = c("personID"="pouch"),
na_matches = "never") %>%
distinct(.) %>% # clear out duplicates, not sure why created
# join only those statuses that match
left_join(inmates %>% select(pouch, statusdate, status, id) %>%
mutate(pouch = as.character(pouch),
statusdate = as.character(statusdate)),
by = c("personID"="pouch", "statusdate_match"="statusdate"),
na_matches = "never") %>%
distinct(.) %>% # clear out duplicates, not sure why created
relocate(statusdates, .after = last_col())
```
### Finish cleaning up bookings table.
* Parse out in their own columns: personID, year booked, timeserved (days)
* Using the *CommitDateTime* and not the StartDateTime to mark the start of the "time served" range.
* If not yet released, calciulate time served until the current date.
```{r}
alljail_data2 <- alljail_data %>%
mutate(yr_booked_num = year(CommitDateTime),
yr_booked = as.character(yr_booked_num), #need a text col for listing the "Total" in dropdowns
ol_timeserveddays = round(if_else(!is.na(DateTimeRelease),
difftime(DateTimeRelease, CommitDateTime, units = "days"),
difftime(Sys.Date(), CommitDateTime, units = "days")),
0),
statusdates = sapply(statusdates, toString)
) %>%
left_join(releasetype, by="ReleaseType") %>%
relocate(Description, .after = ReleaseType) %>%
relocate(c(yr_booked_num, yr_booked, ol_timeserveddays), .after = CommitDateTime) %>%
relocate(statusdates, .after = last_col())
```
### CALCULATE RECIDIVISM - with 365 day lookback
Data. https://stackoverflow.com/questions/66254603/using-datetime-lookback-periods-to-record-whether-event-occured-in-r
```{r}
# Helper function
previous_booking <- function(person, start, end, data=alljail_data2){
# for each commit, look back 365 days for another commit in that time.
commits <- data %>% filter(personID==person) %>%
pull(CommitDateTime)
if_else(commits >= start & commits < end, 1, 0) %>%
sum() -> flag_count
ifelse(flag_count > 0, 1, 0)
}
alljail_withrecidivism <- alljail_data2 %>%
select(personID, CommitDateTime, yr_booked, GeneralIdentifier, status, everything()) %>%
distinct(personID, CommitDateTime, GeneralIdentifier, status, .keep_all = TRUE) %>%
arrange(personID, CommitDateTime) %>%
mutate(
lookbacktime = (CommitDateTime - years(1)),
lookbackflag = pmap_dbl(list(personID, lookbacktime, CommitDateTime), previous_booking)
)
```
save csv into the `data` folder.
```{r}
write_csv(alljail_withrecidivism, "data/york_jaildata_combined.csv" )
```