From d1f660b148462dc1c81a057adb8882d52cd35cd8 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Mon, 9 Sep 2024 13:40:43 +0100 Subject: [PATCH 01/16] initial commit of using censored_pmf --- EpiAware/docs/Project.toml | 3 + .../explainers/censored-obs.jl | 321 ++++++++++++++++++ 2 files changed, 324 insertions(+) create mode 100644 EpiAware/docs/src/getting-started/explainers/censored-obs.jl diff --git a/EpiAware/docs/Project.toml b/EpiAware/docs/Project.toml index f3fd8f2b7..c17e5e021 100644 --- a/EpiAware/docs/Project.toml +++ b/EpiAware/docs/Project.toml @@ -8,11 +8,14 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" EpiAware = "b2eeebe4-5992-4301-9193-7ebc9f62c855" +FFMPEG = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781" PlutoStaticHTML = "359b1769-a58e-495b-9770-312e911026ad" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999" Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" +TuringBenchmarking = "0db1332d-5c25-4deb-809f-459bc696f94f" diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl new file mode 100644 index 000000000..2d50e1736 --- /dev/null +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -0,0 +1,321 @@ +### A Pluto.jl notebook ### +# v0.19.46 + +using Markdown +using InteractiveUtils + +# ╔═╡ a2624404-48b1-4faa-abbe-6d78b8e04f2b +let + docs_dir = dirname(dirname(dirname(@__DIR__))) + pkg_dir = dirname(docs_dir) + + using Pkg: Pkg + Pkg.activate(docs_dir) + Pkg.develop(; path = pkg_dir) + Pkg.add(["DataFramesMeta", "StatsBase", "TuringBenchmarking"]) + Pkg.instantiate() +end + +# ╔═╡ 5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 +begin + using EpiAware.EpiAwareUtils: censored_pmf + using Random, Distributions, StatsBase #utilities for random events + using DataFramesMeta #Data wrangling + using StatsPlots #plotting + using Turing, TuringBenchmarking #PPL +end + +# ╔═╡ 8de5c5e0-6e95-11ef-1693-bfd465c8d919 +md" +# Fitting distributions using `censored_pmf` and Turing PPL + +## Introduction + +### What are we going to do in this Vignette + +In this vignette, we'll demonstrate how to use `EpiAwareUtils.censored_pmf` in conjunction with the Turing PPL for Bayesian inference of epidemiological delay distributions. We'll cover the following key points: + +1. Simulating censored delay distribution data +2. Fitting a naive model using Turing +3. Evaluating the naive model's performance +4. Fitting an improved model using `censored_pmf` functionality +5. Comparing the `censored_pmf` model's performance to the naive model + +### What might I need to know before starting + +This note builds on the concepts introduced in the R/stan package [`primarycensoreddist`](https://github.com/epinowcast/primarycensoreddist), especially the [Getting Started with primarycensoreddist](https://primarycensoreddist.epinowcast.org/articles/fitting-dists-with-stan.html) vignette and assumes familiarity with using Turing tools as covered in the [Turing documentation](https://turinglang.org/). + +This note is generated using the `EpiAware` package locally via `Pkg.develop`, in the `EpiAware/docs` environment. It is also possible to install `EpiAware` using + +```julia +Pkg.add(url=\"https://github.com/CDCgov/Rt-without-renewal\", subdir=\"EpiAware\") +``` + +" + +# ╔═╡ 30dd9af4-b64f-42b1-8439-a890752f68e3 +md" +The other dependencies are as follows: +" + +# ╔═╡ c5704f67-208d-4c2e-8513-c07c6b94ca99 +md" +## Simulating censored and truncated delay distribution data + +We'll start by simulating some censored and truncated delay distribution data. +" + +# ╔═╡ aed124c7-b4ba-4c97-a01f-ff553f376c86 +Random.seed!(123) # For reproducibility + +# ╔═╡ 105b9594-36ce-4ae8-87a8-5c81867b1ce3 +# Define the true distribution parameters +n = 1000 + +# ╔═╡ 8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 +meanlog = 1.5 + +# ╔═╡ 84bb3999-9f2b-4eaa-9c2d-776a86677eaf +sdlog = 0.75 + +# ╔═╡ 2bf6677e-ebe9-4aa8-aa91-f631e99669bb +true_dist = LogNormal(meanlog, sdlog) + +# ╔═╡ aea8b28e-fffe-4aa6-b51e-8199a7c7975c +# Generate varying pwindow, swindow, and obs_time lengths +pwindows = rand(1:1, n) + +# ╔═╡ d231bd0c-165f-4973-a46f-f66991813ea7 +swindows = rand(1:1, n) + +# ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 +obs_times = fill(10.0,n) + +# ╔═╡ a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed +#Sample secondary time relative to beginning of primary censor window respecting the right-truncation +samples = map(pwindows, swindows, obs_times) do pw, sw, ot + P = rand() * pw # Primary event time + T = rand(truncated(true_dist; upper= ot - P)) +end + +# ╔═╡ 0b5e96eb-9312-472e-8a88-d4509a4f25d0 +# Generate samples +delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, sw, ot + DataFrame( + pwindow = pw, + swindow = sw, + obs_time = ot, + observed_delay = T ÷ sw .|> Int, + observed_delay_upper = (T ÷ sw) + sw |> Int, + ) +end |> # Aggregate to unique combinations and count occurrences + df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, :observed_delay_upper) |> + gd -> @combine(gd, :n = length(:pwindow)) + +# ╔═╡ a7bff47d-b61f-499e-8631-206661c2bdc0 +empirical_cdf = ecdf(samples) + +# ╔═╡ 16bcb80a-970f-4633-aca2-261fa04172f7 +empirical_cdf_obs = ecdf(delay_counts.observed_delay, weights=delay_counts.n) + +# ╔═╡ 60711c3c-266e-42b5-acc6-6624db294f24 +x_seq = range(minimum(samples), maximum(samples), 100) + +# ╔═╡ c6fe3c52-af87-4a84-b280-bc9a8532e269 +#plot +let + plot(; title = "Comparison of Observed vs Theoretical CDF", + ylabel = "Cumulative Probability", + xlabel = "Delay", + xticks = 0:obs_times[1], + xlims = (-0.1, obs_times[1] + 0.5) + ) + plot!(x_seq, x_seq .|> x->empirical_cdf(x), + lab = "Observed secondary times", + c = :blue, + lw = 3, + ) + plot!(x_seq, x_seq .|> x->empirical_cdf_obs(x), + lab = "Observed censored secondary times", + c = :green, + lw = 3, + ) + plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), + lab = "Theoretical", + c = :black, + lw = 3, + ) + vline!([mean(samples)], ls = :dash, c= :blue, lw = 3, lab = "") + vline!([mean(true_dist)], ls = :dash, c= :black, lw = 3, lab = "") +end + +# ╔═╡ f66d4b2e-ed66-423e-9cba-62bff712862b +md" +We've aggregated the data to unique combinations of `pwindow`, `swindow`, and `obs_time` and counted the number of occurrences of each `observed_delay` for each combination. This is the data we will use to fit our model. +" + +# ╔═╡ 010ebe37-782b-4a35-bf5c-dca6dc0fee45 +md" +## Fitting a naive model using Turing + +We'll start by fitting a naive model using Turing. +" + +# ╔═╡ d9d14c48-8700-42b5-89b4-7fc51d0f577c +@model function naive_model(N, y, n) + mu ~ Normal(1., 1.) + sigma ~ truncated(Normal(0.5, 1.0); lower= 0.0) + d = LogNormal(mu, sigma) + + for i in eachindex(y) + Turing.@addlogprob! n[i] * logpdf(d, y[i]) + end +end + +# ╔═╡ 8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b +md" +Now lets instantiate this model with data +" + +# ╔═╡ 028ade5c-17bd-4dfc-8433-23aaff02c181 +naive_mdl = naive_model( + size(delay_counts,1), + delay_counts.observed_delay .+ 1e-6, # Add a small constant to avoid log(0) + delay_counts.n) + +# ╔═╡ 04b4eefb-f0f9-4887-8db0-7cbb7f3b169b +md" +and now let's fit the compiled model. +" + +# ╔═╡ 21655344-d12b-4e47-a9a9-d06bd909f6ea +naive_fit = sample(naive_mdl, NUTS(), MCMCThreads(), 500, 4) + +# ╔═╡ 3b89fe00-6aaf-4764-8b29-e71479f1e641 +summarize(naive_fit) + +# ╔═╡ 43eac8dd-8f1d-440e-b1e8-85db9e740651 +md" +We see that the model has converged and the diagnostics look good. However, just from the model posterior summary we see that we might not be very happy with the fit. `mu` is smaller than the target $(meanlog) and `sigma` is larger than the target $(sdlog). + +" + +# ╔═╡ b2efafab-8849-4a7a-bb64-ac9ce126ca75 +md" +## Fitting an improved model using primarycensoreddist + +We'll now fit an improved model using the `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. This accounts for the primary and secondary censoring windows as well as the truncation. + +" + +# ╔═╡ ef40112b-f23e-4d4b-8a7d-3793b786f472 +@model function primarycensoreddist_model(N, y, y_upper, n, pwindow, D) + try + mu ~ Normal(1., 1.) + sigma ~ truncated(Normal(0.5, 0.5); lower= 0.1,) + d = LogNormal(mu, sigma) + log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log + + for i in eachindex(y) + Turing.@addlogprob! n[i] * log_pmf[y[i] + 1] #0 obs is first element of array + end + return log_pmf + catch + Turing.@addlogprob! -Inf + end +end + +# ╔═╡ b823d824-419d-41e9-9ac9-2c45ef190acf +md" +Lets instantiate this model with data +" + +# ╔═╡ 93bca93a-5484-47fa-8424-7315eef15e37 +primarycensoreddist_mdl = primarycensoreddist_model( + size(delay_counts,1), + delay_counts.observed_delay, # Add a small constant to avoid log(0) + delay_counts.observed_delay_upper, # Add a small constant to avoid log(0) + delay_counts.n, + delay_counts.pwindow[1], + delay_counts.obs_time[1] +) + +# ╔═╡ 8f1d32fd-f54b-4f69-8c93-8f0786366cef +# ╠═╡ disabled = true +#=╠═╡ +benchmark_model( + primarycensoreddist_mdl; + # Check correctness of computations + check=true, + # Automatic differentiation backends to check and benchmark + adbackends=[:forwarddiff, :reversediff, :reversediff_compiled] + ) + ╠═╡ =# + +# ╔═╡ 44132e2e-5a1a-49ad-9e57-cec24f981f52 +map_estimate = [maximum_a_posteriori(primarycensoreddist_mdl) for _ in 1:10] |> + opts -> (opts, findmax([o.lp for o in opts])[2]) |> + opts_i -> opts_i[1][opts_i[2]] + +# ╔═╡ a34c19e8-ba9e-4276-a17e-c853bb3341cf +# ╠═╡ disabled = true +#=╠═╡ +primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), MCMCThreads(), 500, 4) + ╠═╡ =# + +# ╔═╡ 1210443f-480f-4e9f-b195-d557e9e1fc31 +summarize(primarycensoreddist_fit) + +# ╔═╡ 46711233-f680-4962-9e3e-60c747db4d2c +censored_pmf(true_dist; D = obs_times[1] ) + +# ╔═╡ 604458a6-7b6f-4b5c-b2e7-09be1908c0f9 +# ╠═╡ disabled = true +#=╠═╡ +primarycensoreddist_fit = sample(primarycensoreddist_mdl, MH(), 100_000; initial_params=map_estimate.values.array) |> + chn -> chn[50_000:end, :, :] + ╠═╡ =# + +# ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c +primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), 1000; initial_params=map_estimate.values.array) + +# ╔═╡ Cell order: +# ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 +# ╠═a2624404-48b1-4faa-abbe-6d78b8e04f2b +# ╟─30dd9af4-b64f-42b1-8439-a890752f68e3 +# ╠═5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 +# ╟─c5704f67-208d-4c2e-8513-c07c6b94ca99 +# ╠═aed124c7-b4ba-4c97-a01f-ff553f376c86 +# ╠═105b9594-36ce-4ae8-87a8-5c81867b1ce3 +# ╠═8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 +# ╠═84bb3999-9f2b-4eaa-9c2d-776a86677eaf +# ╠═2bf6677e-ebe9-4aa8-aa91-f631e99669bb +# ╠═aea8b28e-fffe-4aa6-b51e-8199a7c7975c +# ╠═d231bd0c-165f-4973-a46f-f66991813ea7 +# ╠═7522f05b-1750-4983-8947-ef70f4298d06 +# ╠═a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed +# ╠═0b5e96eb-9312-472e-8a88-d4509a4f25d0 +# ╠═a7bff47d-b61f-499e-8631-206661c2bdc0 +# ╠═16bcb80a-970f-4633-aca2-261fa04172f7 +# ╠═60711c3c-266e-42b5-acc6-6624db294f24 +# ╠═c6fe3c52-af87-4a84-b280-bc9a8532e269 +# ╟─f66d4b2e-ed66-423e-9cba-62bff712862b +# ╟─010ebe37-782b-4a35-bf5c-dca6dc0fee45 +# ╠═d9d14c48-8700-42b5-89b4-7fc51d0f577c +# ╟─8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b +# ╠═028ade5c-17bd-4dfc-8433-23aaff02c181 +# ╟─04b4eefb-f0f9-4887-8db0-7cbb7f3b169b +# ╠═21655344-d12b-4e47-a9a9-d06bd909f6ea +# ╠═3b89fe00-6aaf-4764-8b29-e71479f1e641 +# ╟─43eac8dd-8f1d-440e-b1e8-85db9e740651 +# ╠═b2efafab-8849-4a7a-bb64-ac9ce126ca75 +# ╠═ef40112b-f23e-4d4b-8a7d-3793b786f472 +# ╟─b823d824-419d-41e9-9ac9-2c45ef190acf +# ╠═93bca93a-5484-47fa-8424-7315eef15e37 +# ╠═8f1d32fd-f54b-4f69-8c93-8f0786366cef +# ╠═44132e2e-5a1a-49ad-9e57-cec24f981f52 +# ╠═604458a6-7b6f-4b5c-b2e7-09be1908c0f9 +# ╠═a34c19e8-ba9e-4276-a17e-c853bb3341cf +# ╠═7ae6c61d-0e33-4af8-b8d2-e31223a15a7c +# ╠═1210443f-480f-4e9f-b195-d557e9e1fc31 +# ╠═46711233-f680-4962-9e3e-60c747db4d2c From 86236f2a8cef51462c1d30a30fd71b8ed060883d Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Tue, 10 Sep 2024 10:22:05 +0100 Subject: [PATCH 02/16] formatting --- .../explainers/censored-obs.jl | 154 +++++++++--------- 1 file changed, 78 insertions(+), 76 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index 2d50e1736..655464a1b 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -12,17 +12,17 @@ let using Pkg: Pkg Pkg.activate(docs_dir) Pkg.develop(; path = pkg_dir) - Pkg.add(["DataFramesMeta", "StatsBase", "TuringBenchmarking"]) + Pkg.add(["DataFramesMeta", "StatsBase", "TuringBenchmarking"]) Pkg.instantiate() end # ╔═╡ 5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 begin - using EpiAware.EpiAwareUtils: censored_pmf - using Random, Distributions, StatsBase #utilities for random events - using DataFramesMeta #Data wrangling - using StatsPlots #plotting - using Turing, TuringBenchmarking #PPL + using EpiAware.EpiAwareUtils: censored_pmf + using Random, Distributions, StatsBase #utilities for random events + using DataFramesMeta #Data wrangling + using StatsPlots #plotting + using Turing, TuringBenchmarking #PPL end # ╔═╡ 8de5c5e0-6e95-11ef-1693-bfd465c8d919 @@ -45,7 +45,7 @@ In this vignette, we'll demonstrate how to use `EpiAwareUtils.censored_pmf` in c This note builds on the concepts introduced in the R/stan package [`primarycensoreddist`](https://github.com/epinowcast/primarycensoreddist), especially the [Getting Started with primarycensoreddist](https://primarycensoreddist.epinowcast.org/articles/fitting-dists-with-stan.html) vignette and assumes familiarity with using Turing tools as covered in the [Turing documentation](https://turinglang.org/). -This note is generated using the `EpiAware` package locally via `Pkg.develop`, in the `EpiAware/docs` environment. It is also possible to install `EpiAware` using +This note is generated using the `EpiAware` package locally via `Pkg.develop`, in the `EpiAware/docs` environment. It is also possible to install `EpiAware` using ```julia Pkg.add(url=\"https://github.com/CDCgov/Rt-without-renewal\", subdir=\"EpiAware\") @@ -89,34 +89,35 @@ pwindows = rand(1:1, n) swindows = rand(1:1, n) # ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 -obs_times = fill(10.0,n) +obs_times = fill(10.0, n) # ╔═╡ a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed #Sample secondary time relative to beginning of primary censor window respecting the right-truncation samples = map(pwindows, swindows, obs_times) do pw, sw, ot - P = rand() * pw # Primary event time - T = rand(truncated(true_dist; upper= ot - P)) + P = rand() * pw # Primary event time + T = rand(truncated(true_dist; upper = ot - P)) end # ╔═╡ 0b5e96eb-9312-472e-8a88-d4509a4f25d0 # Generate samples delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, sw, ot - DataFrame( - pwindow = pw, - swindow = sw, - obs_time = ot, - observed_delay = T ÷ sw .|> Int, - observed_delay_upper = (T ÷ sw) + sw |> Int, - ) + DataFrame( + pwindow = pw, + swindow = sw, + obs_time = ot, + observed_delay = T ÷ sw .|> Int, + observed_delay_upper = (T ÷ sw) + sw |> Int + ) end |> # Aggregate to unique combinations and count occurrences - df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, :observed_delay_upper) |> - gd -> @combine(gd, :n = length(:pwindow)) + df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, + :observed_delay_upper) |> + gd -> @combine(gd, :n=length(:pwindow)) # ╔═╡ a7bff47d-b61f-499e-8631-206661c2bdc0 empirical_cdf = ecdf(samples) # ╔═╡ 16bcb80a-970f-4633-aca2-261fa04172f7 -empirical_cdf_obs = ecdf(delay_counts.observed_delay, weights=delay_counts.n) +empirical_cdf_obs = ecdf(delay_counts.observed_delay, weights = delay_counts.n) # ╔═╡ 60711c3c-266e-42b5-acc6-6624db294f24 x_seq = range(minimum(samples), maximum(samples), 100) @@ -124,29 +125,29 @@ x_seq = range(minimum(samples), maximum(samples), 100) # ╔═╡ c6fe3c52-af87-4a84-b280-bc9a8532e269 #plot let - plot(; title = "Comparison of Observed vs Theoretical CDF", - ylabel = "Cumulative Probability", - xlabel = "Delay", - xticks = 0:obs_times[1], - xlims = (-0.1, obs_times[1] + 0.5) - ) - plot!(x_seq, x_seq .|> x->empirical_cdf(x), - lab = "Observed secondary times", - c = :blue, - lw = 3, - ) - plot!(x_seq, x_seq .|> x->empirical_cdf_obs(x), - lab = "Observed censored secondary times", - c = :green, - lw = 3, - ) - plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), - lab = "Theoretical", - c = :black, - lw = 3, - ) - vline!([mean(samples)], ls = :dash, c= :blue, lw = 3, lab = "") - vline!([mean(true_dist)], ls = :dash, c= :black, lw = 3, lab = "") + plot(; title = "Comparison of Observed vs Theoretical CDF", + ylabel = "Cumulative Probability", + xlabel = "Delay", + xticks = 0:obs_times[1], + xlims = (-0.1, obs_times[1] + 0.5) + ) + plot!(x_seq, x_seq .|> x -> empirical_cdf(x), + lab = "Observed secondary times", + c = :blue, + lw = 3 + ) + plot!(x_seq, x_seq .|> x -> empirical_cdf_obs(x), + lab = "Observed censored secondary times", + c = :green, + lw = 3 + ) + plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), + lab = "Theoretical", + c = :black, + lw = 3 + ) + vline!([mean(samples)], ls = :dash, c = :blue, lw = 3, lab = "") + vline!([mean(true_dist)], ls = :dash, c = :black, lw = 3, lab = "") end # ╔═╡ f66d4b2e-ed66-423e-9cba-62bff712862b @@ -163,13 +164,13 @@ We'll start by fitting a naive model using Turing. # ╔═╡ d9d14c48-8700-42b5-89b4-7fc51d0f577c @model function naive_model(N, y, n) - mu ~ Normal(1., 1.) - sigma ~ truncated(Normal(0.5, 1.0); lower= 0.0) - d = LogNormal(mu, sigma) - - for i in eachindex(y) - Turing.@addlogprob! n[i] * logpdf(d, y[i]) - end + mu ~ Normal(1.0, 1.0) + sigma ~ truncated(Normal(0.5, 1.0); lower = 0.0) + d = LogNormal(mu, sigma) + + for i in eachindex(y) + Turing.@addlogprob! n[i] * logpdf(d, y[i]) + end end # ╔═╡ 8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b @@ -179,9 +180,9 @@ Now lets instantiate this model with data # ╔═╡ 028ade5c-17bd-4dfc-8433-23aaff02c181 naive_mdl = naive_model( - size(delay_counts,1), - delay_counts.observed_delay .+ 1e-6, # Add a small constant to avoid log(0) - delay_counts.n) + size(delay_counts, 1), + delay_counts.observed_delay .+ 1e-6, # Add a small constant to avoid log(0) + delay_counts.n) # ╔═╡ 04b4eefb-f0f9-4887-8db0-7cbb7f3b169b md" @@ -210,19 +211,19 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA # ╔═╡ ef40112b-f23e-4d4b-8a7d-3793b786f472 @model function primarycensoreddist_model(N, y, y_upper, n, pwindow, D) - try - mu ~ Normal(1., 1.) - sigma ~ truncated(Normal(0.5, 0.5); lower= 0.1,) - d = LogNormal(mu, sigma) - log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log - - for i in eachindex(y) - Turing.@addlogprob! n[i] * log_pmf[y[i] + 1] #0 obs is first element of array - end - return log_pmf - catch - Turing.@addlogprob! -Inf - end + try + mu ~ Normal(1.0, 1.0) + sigma ~ truncated(Normal(0.5, 0.5); lower = 0.1) + d = LogNormal(mu, sigma) + log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log + + for i in eachindex(y) + Turing.@addlogprob! n[i] * log_pmf[y[i] + 1] #0 obs is first element of array + end + return log_pmf + catch + Turing.@addlogprob! -Inf + end end # ╔═╡ b823d824-419d-41e9-9ac9-2c45ef190acf @@ -232,12 +233,12 @@ Lets instantiate this model with data # ╔═╡ 93bca93a-5484-47fa-8424-7315eef15e37 primarycensoreddist_mdl = primarycensoreddist_model( - size(delay_counts,1), - delay_counts.observed_delay, # Add a small constant to avoid log(0) - delay_counts.observed_delay_upper, # Add a small constant to avoid log(0) - delay_counts.n, - delay_counts.pwindow[1], - delay_counts.obs_time[1] + size(delay_counts, 1), + delay_counts.observed_delay, # Add a small constant to avoid log(0) + delay_counts.observed_delay_upper, # Add a small constant to avoid log(0) + delay_counts.n, + delay_counts.pwindow[1], + delay_counts.obs_time[1] ) # ╔═╡ 8f1d32fd-f54b-4f69-8c93-8f0786366cef @@ -254,8 +255,8 @@ benchmark_model( # ╔═╡ 44132e2e-5a1a-49ad-9e57-cec24f981f52 map_estimate = [maximum_a_posteriori(primarycensoreddist_mdl) for _ in 1:10] |> - opts -> (opts, findmax([o.lp for o in opts])[2]) |> - opts_i -> opts_i[1][opts_i[2]] + opts -> (opts, findmax([o.lp for o in opts])[2]) |> + opts_i -> opts_i[1][opts_i[2]] # ╔═╡ a34c19e8-ba9e-4276-a17e-c853bb3341cf # ╠═╡ disabled = true @@ -267,7 +268,7 @@ primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), MCMCThreads(), summarize(primarycensoreddist_fit) # ╔═╡ 46711233-f680-4962-9e3e-60c747db4d2c -censored_pmf(true_dist; D = obs_times[1] ) +censored_pmf(true_dist; D = obs_times[1]) # ╔═╡ 604458a6-7b6f-4b5c-b2e7-09be1908c0f9 # ╠═╡ disabled = true @@ -277,7 +278,8 @@ primarycensoreddist_fit = sample(primarycensoreddist_mdl, MH(), 100_000; initial ╠═╡ =# # ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c -primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), 1000; initial_params=map_estimate.values.array) +primarycensoreddist_fit = sample( + primarycensoreddist_mdl, NUTS(), 1000; initial_params = map_estimate.values.array) # ╔═╡ Cell order: # ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 From 284a43a43b02c6645b3d6a1c88fbd9e77168d894 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Tue, 10 Sep 2024 10:38:15 +0100 Subject: [PATCH 03/16] observed delays as multiples of window --- .../explainers/censored-obs.jl | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index 655464a1b..7ae42711c 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -105,8 +105,8 @@ delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, pwindow = pw, swindow = sw, obs_time = ot, - observed_delay = T ÷ sw .|> Int, - observed_delay_upper = (T ÷ sw) + sw |> Int + observed_delay = (T ÷ sw) * sw , + observed_delay_upper = (T ÷ sw) * (sw + 1) ) end |> # Aggregate to unique combinations and count occurrences df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, @@ -258,29 +258,29 @@ map_estimate = [maximum_a_posteriori(primarycensoreddist_mdl) for _ in 1:10] |> opts -> (opts, findmax([o.lp for o in opts])[2]) |> opts_i -> opts_i[1][opts_i[2]] -# ╔═╡ a34c19e8-ba9e-4276-a17e-c853bb3341cf +# ╔═╡ 604458a6-7b6f-4b5c-b2e7-09be1908c0f9 # ╠═╡ disabled = true #=╠═╡ -primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), MCMCThreads(), 500, 4) +primarycensoreddist_fit = sample(primarycensoreddist_mdl, MH(), 100_000; initial_params=map_estimate.values.array) |> + chn -> chn[50_000:end, :, :] ╠═╡ =# -# ╔═╡ 1210443f-480f-4e9f-b195-d557e9e1fc31 -summarize(primarycensoreddist_fit) - -# ╔═╡ 46711233-f680-4962-9e3e-60c747db4d2c -censored_pmf(true_dist; D = obs_times[1]) - -# ╔═╡ 604458a6-7b6f-4b5c-b2e7-09be1908c0f9 +# ╔═╡ a34c19e8-ba9e-4276-a17e-c853bb3341cf # ╠═╡ disabled = true #=╠═╡ -primarycensoreddist_fit = sample(primarycensoreddist_mdl, MH(), 100_000; initial_params=map_estimate.values.array) |> - chn -> chn[50_000:end, :, :] +primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), MCMCThreads(), 500, 4) ╠═╡ =# # ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c primarycensoreddist_fit = sample( primarycensoreddist_mdl, NUTS(), 1000; initial_params = map_estimate.values.array) +# ╔═╡ 1210443f-480f-4e9f-b195-d557e9e1fc31 +summarize(primarycensoreddist_fit) + +# ╔═╡ 46711233-f680-4962-9e3e-60c747db4d2c +censored_pmf(true_dist; D = obs_times[1]) + # ╔═╡ Cell order: # ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 # ╠═a2624404-48b1-4faa-abbe-6d78b8e04f2b From c48a66b0bf71e58278a8bf4e5467d6ddd680db0f Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Tue, 10 Sep 2024 10:41:19 +0100 Subject: [PATCH 04/16] fix array element selection --- EpiAware/docs/src/getting-started/explainers/censored-obs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index 7ae42711c..d26cec600 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -218,7 +218,7 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log for i in eachindex(y) - Turing.@addlogprob! n[i] * log_pmf[y[i] + 1] #0 obs is first element of array + Turing.@addlogprob! n[i] * log_pmf[ceil(Int, y[i])] #0 obs is first element of array end return log_pmf catch From ffd6d08fe0c8c94c8d9f7bfe8547df94f1b20dac Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Tue, 10 Sep 2024 10:45:26 +0100 Subject: [PATCH 05/16] refix array selection --- .../src/getting-started/explainers/censored-obs.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index d26cec600..81d392cc1 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -105,12 +105,13 @@ delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, pwindow = pw, swindow = sw, obs_time = ot, - observed_delay = (T ÷ sw) * sw , - observed_delay_upper = (T ÷ sw) * (sw + 1) + observed_delay = (T ÷ sw) * sw, + observed_delay_upper = (T ÷ sw) * (sw + 1), + observed_delay_step = Int(T ÷ sw) + 1, ) end |> # Aggregate to unique combinations and count occurrences df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, - :observed_delay_upper) |> + :observed_delay_upper, :observed_delay_step) |> gd -> @combine(gd, :n=length(:pwindow)) # ╔═╡ a7bff47d-b61f-499e-8631-206661c2bdc0 @@ -210,7 +211,7 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA " # ╔═╡ ef40112b-f23e-4d4b-8a7d-3793b786f472 -@model function primarycensoreddist_model(N, y, y_upper, n, pwindow, D) +@model function primarycensoreddist_model(N, y, n, pwindow, D) try mu ~ Normal(1.0, 1.0) sigma ~ truncated(Normal(0.5, 0.5); lower = 0.1) @@ -218,7 +219,7 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log for i in eachindex(y) - Turing.@addlogprob! n[i] * log_pmf[ceil(Int, y[i])] #0 obs is first element of array + Turing.@addlogprob! n[i] * log_pmf[y[i]] end return log_pmf catch @@ -234,8 +235,7 @@ Lets instantiate this model with data # ╔═╡ 93bca93a-5484-47fa-8424-7315eef15e37 primarycensoreddist_mdl = primarycensoreddist_model( size(delay_counts, 1), - delay_counts.observed_delay, # Add a small constant to avoid log(0) - delay_counts.observed_delay_upper, # Add a small constant to avoid log(0) + delay_counts.observed_delay_step, delay_counts.n, delay_counts.pwindow[1], delay_counts.obs_time[1] From d40bdaece720359c6a9b556e3987f629ddf11424 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Tue, 10 Sep 2024 10:45:44 +0100 Subject: [PATCH 06/16] reformat --- EpiAware/docs/src/getting-started/explainers/censored-obs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index 81d392cc1..e547bc9b0 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -107,7 +107,7 @@ delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, obs_time = ot, observed_delay = (T ÷ sw) * sw, observed_delay_upper = (T ÷ sw) * (sw + 1), - observed_delay_step = Int(T ÷ sw) + 1, + observed_delay_step = Int(T ÷ sw) + 1 ) end |> # Aggregate to unique combinations and count occurrences df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, From 182858b5f82a7c8d920a7a75861706a1a5a4bac6 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Wed, 25 Sep 2024 19:40:11 +0100 Subject: [PATCH 07/16] refactor censored_pmf --- EpiAware/src/EpiAwareUtils/censored_pmf.jl | 51 +++++++++++++--------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/EpiAware/src/EpiAwareUtils/censored_pmf.jl b/EpiAware/src/EpiAwareUtils/censored_pmf.jl index 38e274f64..4acedfffa 100644 --- a/EpiAware/src/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/src/EpiAwareUtils/censored_pmf.jl @@ -63,12 +63,33 @@ function censored_pmf(dist::Distribution, ts .|> (t -> cdf(dist, t)) |> diff |> p -> p ./ sum(p) end + +""" +Internal function to check censored_pmf arguments and return the time steps of the rightmost limits of the censor intervals. +""" +function _check_and_give_ts(dist::Distribution, Δd, D, upper) + @assert minimum(dist)>=0.0 "Distribution must be non-negative." + @assert Δd>0.0 "Δd must be positive." + + if isnothing(D) + x_99 = invlogcdf(dist, log(upper)) + D = round(Int64, x_99 / Δd) * Δd + end + + @assert D>=Δd "D can't be shorter than Δd." + + ts = Δd:Δd:D |> collect + + @assert ts[end]==D "D must be a multiple of Δd." + + return ts +end + @doc raw" Create a discrete probability mass function (PMF) from a given distribution, -assuming -a uniform distribution over primary event times with censoring intervals of width `Δd` for +assuming a uniform distribution over primary event times with censoring intervals of width `Δd` for both primary and secondary events. The CDF for the time from the left edge of the interval -containing the primary event to the secondary event is created by direct numerical integration +containing the primary event to the secondary event is created by direct numerical integration (quadrature) of the convolution of the CDF of `dist` with the uniform density on `[0,Δd)`, the discrete PMF for double censored delays is then found using simple differencing on the CDF. @@ -77,9 +98,10 @@ for double censored delays is then found using simple differencing on the CDF. - `dist`: The distribution from which to create the PMF. - `Δd`: The step size for discretizing the domain. Default is 1.0. - `D`: The upper bound of the domain. Must be greater than `Δd`. Default `D = nothing` -indicates that the distribution should be truncated at its 99th percentile rounded +indicates that the distribution should be truncated at its `upper`th percentile rounded to nearest multiple of `Δd`. + # Returns - A vector representing the PMF. @@ -114,22 +136,11 @@ censored_pmf(dist; D = 10) |> 0.0 ``` " -function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing) - @assert minimum(dist)>=0.0 "Distribution must be non-negative." - @assert Δd>0.0 "Δd must be positive." - - if isnothing(D) - x_99 = invlogcdf(dist, log(0.99)) - D = round(Int64, x_99 / Δd) * Δd - end - - @assert D>=Δd "D can't be shorter than Δd." - - ts = 0.0:Δd:D |> collect - - @assert ts[end]==D "D must be a multiple of Δd." +function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.99) + ts = _check_and_give_ts(dist, Δd, D, upper) - ∫F(dist, t, Δd) = quadgk(u -> cdf(dist, t - u) / Δd, 0.0, Δd)[1] + ∫F(dist, t, Δd) = quadgk(u -> exp(logcdf(dist, t - u) - log(Δd)), 0.0, Δd)[1] + _q = ts .|> t -> ∫F(dist, t, Δd) - ts .|> (t -> ∫F(dist, t, Δd)) |> diff |> p -> p ./ sum(p) + return [0.; _q] |> diff |> p -> p ./ sum(p) end From 191baf4ee93fc9a9d08169da2ecc25bd23952b5b Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Wed, 25 Sep 2024 20:09:06 +0100 Subject: [PATCH 08/16] Add Makie + Pairplots --- EpiAware/docs/Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/EpiAware/docs/Project.toml b/EpiAware/docs/Project.toml index c17e5e021..01ecd2ebe 100644 --- a/EpiAware/docs/Project.toml +++ b/EpiAware/docs/Project.toml @@ -2,6 +2,7 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" AdvancedHMC = "0bf59076-c3b1-5ca4-86bd-e02cd72cde3d" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -10,6 +11,7 @@ DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" EpiAware = "b2eeebe4-5992-4301-9193-7ebc9f62c855" FFMPEG = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da" Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781" PlutoStaticHTML = "359b1769-a58e-495b-9770-312e911026ad" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" From a1aa16faf4a954addd40a7ba6400ab314a33c226 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Wed, 25 Sep 2024 20:09:09 +0100 Subject: [PATCH 09/16] Update censored-obs.jl --- .../explainers/censored-obs.jl | 123 ++++++++---------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index e547bc9b0..b8a27c674 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -12,7 +12,7 @@ let using Pkg: Pkg Pkg.activate(docs_dir) Pkg.develop(; path = pkg_dir) - Pkg.add(["DataFramesMeta", "StatsBase", "TuringBenchmarking"]) + Pkg.add(["DataFramesMeta", "StatsBase", "PairPlots", "CairoMakie"]) Pkg.instantiate() end @@ -21,8 +21,8 @@ begin using EpiAware.EpiAwareUtils: censored_pmf using Random, Distributions, StatsBase #utilities for random events using DataFramesMeta #Data wrangling - using StatsPlots #plotting - using Turing, TuringBenchmarking #PPL + using StatsPlots, CairoMakie, PairPlots #plotting + using Turing #PPL end # ╔═╡ 8de5c5e0-6e95-11ef-1693-bfd465c8d919 @@ -83,31 +83,38 @@ true_dist = LogNormal(meanlog, sdlog) # ╔═╡ aea8b28e-fffe-4aa6-b51e-8199a7c7975c # Generate varying pwindow, swindow, and obs_time lengths -pwindows = rand(1:1, n) +pwindow = 1. # ╔═╡ d231bd0c-165f-4973-a46f-f66991813ea7 -swindows = rand(1:1, n) +swindow = 1. # ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 -obs_times = fill(10.0, n) +obs_time = 10. + +# ╔═╡ 60212cf7-cc3d-42d3-8260-1f067ede3c6f +nsamples_max = 2000 # ╔═╡ a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed #Sample secondary time relative to beginning of primary censor window respecting the right-truncation -samples = map(pwindows, swindows, obs_times) do pw, sw, ot - P = rand() * pw # Primary event time - T = rand(truncated(true_dist; upper = ot - P)) -end +samples = map(1:nsamples_max) do _ + P = rand() * pwindow # Primary event time + T = rand(true_dist) +end |> +s -> filter(x -> x <= obs_time, s) + +# ╔═╡ a627a544-4c41-4c35-83ec-be7ed7c0a737 +nsamples = length(samples) # ╔═╡ 0b5e96eb-9312-472e-8a88-d4509a4f25d0 # Generate samples -delay_counts = mapreduce(vcat, samples, pwindows, swindows, obs_times) do T, pw, sw, ot +delay_counts = mapreduce(vcat, samples) do T DataFrame( - pwindow = pw, - swindow = sw, - obs_time = ot, - observed_delay = (T ÷ sw) * sw, - observed_delay_upper = (T ÷ sw) * (sw + 1), - observed_delay_step = Int(T ÷ sw) + 1 + pwindow = pwindow, + swindow = swindow, + obs_time = obs_time, + observed_delay = (T ÷ swindow) * swindow, + observed_delay_upper = (T ÷ swindow) * (swindow) + swindow, + observed_delay_step = Int(T ÷ swindow) + 1 ) end |> # Aggregate to unique combinations and count occurrences df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, @@ -126,29 +133,29 @@ x_seq = range(minimum(samples), maximum(samples), 100) # ╔═╡ c6fe3c52-af87-4a84-b280-bc9a8532e269 #plot let - plot(; title = "Comparison of Observed vs Theoretical CDF", + StatsPlots.plot(; title = "Comparison of Observed vs Theoretical CDF", ylabel = "Cumulative Probability", xlabel = "Delay", - xticks = 0:obs_times[1], - xlims = (-0.1, obs_times[1] + 0.5) + xticks = 0:obs_time, + xlims = (-0.1, obs_time + 0.5) ) - plot!(x_seq, x_seq .|> x -> empirical_cdf(x), + StatsPlots.plot!(x_seq, x_seq .|> x -> empirical_cdf(x), lab = "Observed secondary times", c = :blue, lw = 3 ) - plot!(x_seq, x_seq .|> x -> empirical_cdf_obs(x), + StatsPlots.plot!(x_seq, x_seq .|> x -> empirical_cdf_obs(x), lab = "Observed censored secondary times", c = :green, lw = 3 ) - plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), + StatsPlots.plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), lab = "Theoretical", c = :black, lw = 3 ) - vline!([mean(samples)], ls = :dash, c = :blue, lw = 3, lab = "") - vline!([mean(true_dist)], ls = :dash, c = :black, lw = 3, lab = "") + StatsPlots.vline!([mean(samples)], ls = :dash, c = :blue, lw = 3, lab = "") + StatsPlots.vline!([mean(true_dist)], ls = :dash, c = :black, lw = 3, lab = "") end # ╔═╡ f66d4b2e-ed66-423e-9cba-62bff712862b @@ -196,6 +203,14 @@ naive_fit = sample(naive_mdl, NUTS(), MCMCThreads(), 500, 4) # ╔═╡ 3b89fe00-6aaf-4764-8b29-e71479f1e641 summarize(naive_fit) +# ╔═╡ 8e09d931-fca7-4ac2-81f7-2bc36b0174f3 +let +f = pairplot(naive_fit) +CairoMakie.vlines!(f[1,1], [meanlog]) +CairoMakie.vlines!(f[2,2], [sdlog]) +f +end + # ╔═╡ 43eac8dd-8f1d-440e-b1e8-85db9e740651 md" We see that the model has converged and the diagnostics look good. However, just from the model posterior summary we see that we might not be very happy with the fit. `mu` is smaller than the target $(meanlog) and `sigma` is larger than the target $(sdlog). @@ -214,7 +229,7 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA @model function primarycensoreddist_model(N, y, n, pwindow, D) try mu ~ Normal(1.0, 1.0) - sigma ~ truncated(Normal(0.5, 0.5); lower = 0.1) + sigma ~ truncated(Normal(0.5, 0.5); lower = 0.) d = LogNormal(mu, sigma) log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log @@ -237,49 +252,24 @@ primarycensoreddist_mdl = primarycensoreddist_model( size(delay_counts, 1), delay_counts.observed_delay_step, delay_counts.n, - delay_counts.pwindow[1], - delay_counts.obs_time[1] + pwindow, + obs_time, ) -# ╔═╡ 8f1d32fd-f54b-4f69-8c93-8f0786366cef -# ╠═╡ disabled = true -#=╠═╡ -benchmark_model( - primarycensoreddist_mdl; - # Check correctness of computations - check=true, - # Automatic differentiation backends to check and benchmark - adbackends=[:forwarddiff, :reversediff, :reversediff_compiled] - ) - ╠═╡ =# - -# ╔═╡ 44132e2e-5a1a-49ad-9e57-cec24f981f52 -map_estimate = [maximum_a_posteriori(primarycensoreddist_mdl) for _ in 1:10] |> - opts -> (opts, findmax([o.lp for o in opts])[2]) |> - opts_i -> opts_i[1][opts_i[2]] - -# ╔═╡ 604458a6-7b6f-4b5c-b2e7-09be1908c0f9 -# ╠═╡ disabled = true -#=╠═╡ -primarycensoreddist_fit = sample(primarycensoreddist_mdl, MH(), 100_000; initial_params=map_estimate.values.array) |> - chn -> chn[50_000:end, :, :] - ╠═╡ =# - -# ╔═╡ a34c19e8-ba9e-4276-a17e-c853bb3341cf -# ╠═╡ disabled = true -#=╠═╡ -primarycensoreddist_fit = sample(primarycensoreddist_mdl, NUTS(), MCMCThreads(), 500, 4) - ╠═╡ =# - # ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c primarycensoreddist_fit = sample( - primarycensoreddist_mdl, NUTS(), 1000; initial_params = map_estimate.values.array) + primarycensoreddist_mdl, NUTS(), MCMCThreads(), 1000, 4) # ╔═╡ 1210443f-480f-4e9f-b195-d557e9e1fc31 summarize(primarycensoreddist_fit) -# ╔═╡ 46711233-f680-4962-9e3e-60c747db4d2c -censored_pmf(true_dist; D = obs_times[1]) +# ╔═╡ b2376beb-dd7b-442d-9ff5-ac864e75366b +let +f = pairplot(primarycensoreddist_fit) +CairoMakie.vlines!(f[1,1], [meanlog], linewidth = 3) +CairoMakie.vlines!(f[2,2], [sdlog], linewidth = 3) +f +end # ╔═╡ Cell order: # ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 @@ -295,7 +285,9 @@ censored_pmf(true_dist; D = obs_times[1]) # ╠═aea8b28e-fffe-4aa6-b51e-8199a7c7975c # ╠═d231bd0c-165f-4973-a46f-f66991813ea7 # ╠═7522f05b-1750-4983-8947-ef70f4298d06 +# ╠═60212cf7-cc3d-42d3-8260-1f067ede3c6f # ╠═a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed +# ╠═a627a544-4c41-4c35-83ec-be7ed7c0a737 # ╠═0b5e96eb-9312-472e-8a88-d4509a4f25d0 # ╠═a7bff47d-b61f-499e-8631-206661c2bdc0 # ╠═16bcb80a-970f-4633-aca2-261fa04172f7 @@ -309,15 +301,12 @@ censored_pmf(true_dist; D = obs_times[1]) # ╟─04b4eefb-f0f9-4887-8db0-7cbb7f3b169b # ╠═21655344-d12b-4e47-a9a9-d06bd909f6ea # ╠═3b89fe00-6aaf-4764-8b29-e71479f1e641 +# ╠═8e09d931-fca7-4ac2-81f7-2bc36b0174f3 # ╟─43eac8dd-8f1d-440e-b1e8-85db9e740651 -# ╠═b2efafab-8849-4a7a-bb64-ac9ce126ca75 +# ╟─b2efafab-8849-4a7a-bb64-ac9ce126ca75 # ╠═ef40112b-f23e-4d4b-8a7d-3793b786f472 # ╟─b823d824-419d-41e9-9ac9-2c45ef190acf # ╠═93bca93a-5484-47fa-8424-7315eef15e37 -# ╠═8f1d32fd-f54b-4f69-8c93-8f0786366cef -# ╠═44132e2e-5a1a-49ad-9e57-cec24f981f52 -# ╠═604458a6-7b6f-4b5c-b2e7-09be1908c0f9 -# ╠═a34c19e8-ba9e-4276-a17e-c853bb3341cf # ╠═7ae6c61d-0e33-4af8-b8d2-e31223a15a7c # ╠═1210443f-480f-4e9f-b195-d557e9e1fc31 -# ╠═46711233-f680-4962-9e3e-60c747db4d2c +# ╠═b2376beb-dd7b-442d-9ff5-ac864e75366b From 03a400116be5dd4516dacaa4c20fcdb0590a0f1d Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Wed, 25 Sep 2024 20:11:32 +0100 Subject: [PATCH 10/16] reformat --- .../explainers/censored-obs.jl | 30 +++++++++---------- EpiAware/src/EpiAwareUtils/censored_pmf.jl | 3 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index b8a27c674..eb49eae24 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -83,13 +83,13 @@ true_dist = LogNormal(meanlog, sdlog) # ╔═╡ aea8b28e-fffe-4aa6-b51e-8199a7c7975c # Generate varying pwindow, swindow, and obs_time lengths -pwindow = 1. +pwindow = 1.0 # ╔═╡ d231bd0c-165f-4973-a46f-f66991813ea7 -swindow = 1. +swindow = 1.0 # ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 -obs_time = 10. +obs_time = 10.0 # ╔═╡ 60212cf7-cc3d-42d3-8260-1f067ede3c6f nsamples_max = 2000 @@ -99,8 +99,8 @@ nsamples_max = 2000 samples = map(1:nsamples_max) do _ P = rand() * pwindow # Primary event time T = rand(true_dist) -end |> -s -> filter(x -> x <= obs_time, s) +end |> + s -> filter(x -> x <= obs_time, s) # ╔═╡ a627a544-4c41-4c35-83ec-be7ed7c0a737 nsamples = length(samples) @@ -205,10 +205,10 @@ summarize(naive_fit) # ╔═╡ 8e09d931-fca7-4ac2-81f7-2bc36b0174f3 let -f = pairplot(naive_fit) -CairoMakie.vlines!(f[1,1], [meanlog]) -CairoMakie.vlines!(f[2,2], [sdlog]) -f + f = pairplot(naive_fit) + CairoMakie.vlines!(f[1, 1], [meanlog]) + CairoMakie.vlines!(f[2, 2], [sdlog]) + f end # ╔═╡ 43eac8dd-8f1d-440e-b1e8-85db9e740651 @@ -229,7 +229,7 @@ We'll now fit an improved model using the `censored_pmf` function from the `EpiA @model function primarycensoreddist_model(N, y, n, pwindow, D) try mu ~ Normal(1.0, 1.0) - sigma ~ truncated(Normal(0.5, 0.5); lower = 0.) + sigma ~ truncated(Normal(0.5, 0.5); lower = 0.0) d = LogNormal(mu, sigma) log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log @@ -253,7 +253,7 @@ primarycensoreddist_mdl = primarycensoreddist_model( delay_counts.observed_delay_step, delay_counts.n, pwindow, - obs_time, + obs_time ) # ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c @@ -265,10 +265,10 @@ summarize(primarycensoreddist_fit) # ╔═╡ b2376beb-dd7b-442d-9ff5-ac864e75366b let -f = pairplot(primarycensoreddist_fit) -CairoMakie.vlines!(f[1,1], [meanlog], linewidth = 3) -CairoMakie.vlines!(f[2,2], [sdlog], linewidth = 3) -f + f = pairplot(primarycensoreddist_fit) + CairoMakie.vlines!(f[1, 1], [meanlog], linewidth = 3) + CairoMakie.vlines!(f[2, 2], [sdlog], linewidth = 3) + f end # ╔═╡ Cell order: diff --git a/EpiAware/src/EpiAwareUtils/censored_pmf.jl b/EpiAware/src/EpiAwareUtils/censored_pmf.jl index 4acedfffa..c0841d11c 100644 --- a/EpiAware/src/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/src/EpiAwareUtils/censored_pmf.jl @@ -63,7 +63,6 @@ function censored_pmf(dist::Distribution, ts .|> (t -> cdf(dist, t)) |> diff |> p -> p ./ sum(p) end - """ Internal function to check censored_pmf arguments and return the time steps of the rightmost limits of the censor intervals. """ @@ -142,5 +141,5 @@ function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.99) ∫F(dist, t, Δd) = quadgk(u -> exp(logcdf(dist, t - u) - log(Δd)), 0.0, Δd)[1] _q = ts .|> t -> ∫F(dist, t, Δd) - return [0.; _q] |> diff |> p -> p ./ sum(p) + return [0.0; _q] |> diff |> p -> p ./ sum(p) end From f8d6c5be493e51ec8eace6f3f37c4d8c5d9e5ce7 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 12:16:04 +0100 Subject: [PATCH 11/16] split up censoring utility functions --- EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl | 2 +- EpiAware/src/EpiAwareUtils/censored_pmf.jl | 84 +++++++++++++++++++-- EpiAware/test/EpiAwareUtils/censored_pmf.jl | 9 +++ 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl b/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl index 532747869..bcabad2d0 100644 --- a/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl +++ b/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl @@ -17,7 +17,7 @@ using Distributions, DocStringExtensions, QuadGK, Statistics, Turing export HalfNormal, DirectSample, SafePoisson, SafeNegativeBinomial #Export functions -export scan, spread_draws, censored_pmf, get_param_array, prefix_submodel +export scan, spread_draws, censored_cdf, censored_pmf, get_param_array, prefix_submodel, ∫F # Export accumulate tools export get_state, accumulate_scan diff --git a/EpiAware/src/EpiAwareUtils/censored_pmf.jl b/EpiAware/src/EpiAwareUtils/censored_pmf.jl index c0841d11c..2e04c8cfc 100644 --- a/EpiAware/src/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/src/EpiAwareUtils/censored_pmf.jl @@ -84,13 +84,85 @@ function _check_and_give_ts(dist::Distribution, Δd, D, upper) return ts end +""" +Calculate the CDF of the random variable `X + U` where `X` has cumulative distriubtion function `F` +and `U` is a uniform random variable on `[0, Δd)`. + +This is used in solving for censored CDFs and PMFs using numerical quadrature. +""" +function ∫F(dist, t, Δd) + quadgk(u -> exp(logcdf(dist, t - u) - log(Δd)), 0.0, min(Δd, t))[1] +end + +@doc raw" +Create a discrete probability cumulative distribution function (CDF) from a given distribution, +assuming a uniform distribution over primary event times with censoring intervals of width `Δd` for +both primary and secondary events. + +NB: `censored_cdf` returns the _non-truncated_ CDF, i.e. the CDF without conditioning on the +secondary event occuring either before or after some time. + + +# Arguments +- `dist`: The distribution from which to create the PMF. +- `Δd`: The step size for discretizing the domain. Default is 1.0. +- `D`: The upper bound of the domain. Must be greater than `Δd`. Default `D = nothing` +indicates that the distribution should be truncated at its `upper`th percentile rounded +to nearest multiple of `Δd`. + + +# Returns +- A vector representing the CDF with 0.0 appended at the beginning. + +# Raises +- `AssertionError` if the minimum value of `dist` is negative. +- `AssertionError` if `Δd` is not positive. +- `AssertionError` if `D` is shorter than `Δd`. +- `AssertionError` if `D` is not a multiple of `Δd`. + +# Examples + +```jldoctest filter +using Distributions +using EpiAware.EpiAwareUtils + +dist = Exponential(1.0) + +censored_cdf(dist; D = 10) |> + p -> round.(p, digits=3) + +# output +10-element Vector{Float64}: + 0.0 + 0.368 + 0.768 + 0.915 + 0.969 + 0.989 + 0.996 + 0.999 + 1.0 + 1.0 + 1.0 +``` +" +function censored_cdf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.999) + ts = _check_and_give_ts(dist, Δd, D, upper) + cens_F = ts .|> t -> ∫F(dist, t, Δd) + return [0.0; cens_F] +end + @doc raw" Create a discrete probability mass function (PMF) from a given distribution, assuming a uniform distribution over primary event times with censoring intervals of width `Δd` for both primary and secondary events. The CDF for the time from the left edge of the interval containing the primary event to the secondary event is created by direct numerical integration (quadrature) -of the convolution of the CDF of `dist` with the uniform density on `[0,Δd)`, the discrete PMF -for double censored delays is then found using simple differencing on the CDF. +of the convolution of the CDF of `dist` with the uniform density on `[0,Δd)`, using the `censored_cdf` +function. The discrete PMF for double censored delays is then found using simple differencing +on the CDF. + +NB: `censored_pmf` returns a _right-truncated_ PMF, i.e. the PMF conditioned on the secondary event +occurring before or on the final secondary censoring window. # Arguments @@ -136,10 +208,6 @@ censored_pmf(dist; D = 10) |> ``` " function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.99) - ts = _check_and_give_ts(dist, Δd, D, upper) - - ∫F(dist, t, Δd) = quadgk(u -> exp(logcdf(dist, t - u) - log(Δd)), 0.0, Δd)[1] - _q = ts .|> t -> ∫F(dist, t, Δd) - - return [0.0; _q] |> diff |> p -> p ./ sum(p) + cens_cdf = censored_cdf(dist; Δd, D, upper) + return cens_cdf |> diff |> p -> p ./ sum(p) end diff --git a/EpiAware/test/EpiAwareUtils/censored_pmf.jl b/EpiAware/test/EpiAwareUtils/censored_pmf.jl index dc034fb09..2cdb6134b 100644 --- a/EpiAware/test/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/test/EpiAwareUtils/censored_pmf.jl @@ -55,4 +55,13 @@ for s in 1:length(pmf)]] @test sum(expected_pmf_uncond) > 0.99 end + + @testset "Check CDF function" begin + dist = Exponential(1.0) + expected_pmf_uncond = [exp(-1) + [(1 - exp(-1)) * (exp(1) - 1) * exp(-s) for s in 1:9]] + expected_cdf = [0.0; cumsum(expected_pmf_uncond)] + calc_cdf = censored_cdf(dist; Δd = 1.0, D = 10.0) + @test expected_cdf≈calc_cdf atol=1e-15 + end end From 306dc9f9596050459e051f4c5ba11fbdc4ee2784 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 12:16:09 +0100 Subject: [PATCH 12/16] Update censored-obs.jl --- .../explainers/censored-obs.jl | 236 +++++++++++------- 1 file changed, 150 insertions(+), 86 deletions(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index eb49eae24..9b030d0a6 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -12,45 +12,45 @@ let using Pkg: Pkg Pkg.activate(docs_dir) Pkg.develop(; path = pkg_dir) - Pkg.add(["DataFramesMeta", "StatsBase", "PairPlots", "CairoMakie"]) Pkg.instantiate() end # ╔═╡ 5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 begin - using EpiAware.EpiAwareUtils: censored_pmf + using EpiAware.EpiAwareUtils: censored_pmf, censored_cdf, ∫F using Random, Distributions, StatsBase #utilities for random events using DataFramesMeta #Data wrangling - using StatsPlots, CairoMakie, PairPlots #plotting + using CairoMakie, PairPlots #plotting using Turing #PPL end # ╔═╡ 8de5c5e0-6e95-11ef-1693-bfd465c8d919 md" -# Fitting distributions using `censored_pmf` and Turing PPL +# Fitting distributions using `EpiAware` and Turing PPL ## Introduction ### What are we going to do in this Vignette -In this vignette, we'll demonstrate how to use `EpiAwareUtils.censored_pmf` in conjunction with the Turing PPL for Bayesian inference of epidemiological delay distributions. We'll cover the following key points: +In this vignette, we'll demonstrate how to use the CDF function for censored delay distributions `EpiAwareUtils.∫F`, which underlies `EpiAwareUtils.censored_pmf` in conjunction with the Turing PPL for Bayesian inference of epidemiological delay distributions. We'll cover the following key points: 1. Simulating censored delay distribution data 2. Fitting a naive model using Turing 3. Evaluating the naive model's performance -4. Fitting an improved model using `censored_pmf` functionality -5. Comparing the `censored_pmf` model's performance to the naive model +4. Fitting an improved model using censored delay functionality from `EpiAware`. +5. Comparing the censored delay model's performance to the naive model ### What might I need to know before starting -This note builds on the concepts introduced in the R/stan package [`primarycensoreddist`](https://github.com/epinowcast/primarycensoreddist), especially the [Getting Started with primarycensoreddist](https://primarycensoreddist.epinowcast.org/articles/fitting-dists-with-stan.html) vignette and assumes familiarity with using Turing tools as covered in the [Turing documentation](https://turinglang.org/). +This note builds on the concepts introduced in the R/stan package [`primarycensoreddist`](https://github.com/epinowcast/primarycensoreddist), especially the [Fitting distributions using primarycensorseddist and cmdstan](https://primarycensoreddist.epinowcast.org/articles/fitting-dists-with-stan.html) vignette and assumes familiarity with using Turing tools as covered in the [Turing documentation](https://turinglang.org/). This note is generated using the `EpiAware` package locally via `Pkg.develop`, in the `EpiAware/docs` environment. It is also possible to install `EpiAware` using ```julia Pkg.add(url=\"https://github.com/CDCgov/Rt-without-renewal\", subdir=\"EpiAware\") ``` - +### Packages used in this vignette +As well as `EpiAware` and `Turing` we will use `Makie` ecosystem packages for plotting and `DataFramesMeta` for data manipulation. " # ╔═╡ 30dd9af4-b64f-42b1-8439-a890752f68e3 @@ -62,15 +62,17 @@ The other dependencies are as follows: md" ## Simulating censored and truncated delay distribution data -We'll start by simulating some censored and truncated delay distribution data. +We'll start by simulating some censored and truncated delay distribution data. We’ll define a `rpcens` function for generating data. " # ╔═╡ aed124c7-b4ba-4c97-a01f-ff553f376c86 Random.seed!(123) # For reproducibility +# ╔═╡ ec5ed3e9-6ea9-4cfe-afd2-82aabbbe8130 +md"Define the true distribution parameters" + # ╔═╡ 105b9594-36ce-4ae8-87a8-5c81867b1ce3 -# Define the true distribution parameters -n = 1000 +n = 2000 # ╔═╡ 8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 meanlog = 1.5 @@ -81,46 +83,79 @@ sdlog = 0.75 # ╔═╡ 2bf6677e-ebe9-4aa8-aa91-f631e99669bb true_dist = LogNormal(meanlog, sdlog) +# ╔═╡ f4083aea-8106-401a-b60f-383d0b94102a +md"Generate varying pwindow, swindow, and obs_time lengths +" + # ╔═╡ aea8b28e-fffe-4aa6-b51e-8199a7c7975c -# Generate varying pwindow, swindow, and obs_time lengths -pwindow = 1.0 +pwindows = rand(1:2, n) -# ╔═╡ d231bd0c-165f-4973-a46f-f66991813ea7 -swindow = 1.0 +# ╔═╡ 4d3a853d-0b8d-402a-8309-e9f6da2b7a8c +swindows = rand(1:2, n) # ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 -obs_time = 10.0 +obs_times = rand(8:10, n) -# ╔═╡ 60212cf7-cc3d-42d3-8260-1f067ede3c6f -nsamples_max = 2000 +# ╔═╡ 5eac2f60-8cec-4460-9d10-6bade7f0f406 +md" +We recreate the primary censored sampling function from `primarycensoreddist`, c.f. documentation [here](https://primarycensoreddist.epinowcast.org/reference/rprimarycensoreddist.html). +" + +# ╔═╡ 9443b893-9e22-4267-9a1f-319a3adb8c0d +""" + function rpcens(dist; pwindow = 1, swindow = 1, D = Inf, max_tries = 1000) + +Does a truncated censored sample from `dist` with a uniform primary time on `[0, pwindow]`. +""" +function rpcens(dist; pwindow = 1, swindow = 1, D = Inf, max_tries = 1000) + T = zero(eltype(dist)) + invalid_sample = true + attempts = 1 + while (invalid_sample && attempts <= max_tries) + X = rand(dist) + U = rand() * pwindow + T = X + U + attempts += 1 + if X + U < D + invalid_sample = false + end + end + + @assert !invalid_sample "censored value not found in $max_tries attempts" + + return (T ÷ swindow) * swindow +end # ╔═╡ a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed #Sample secondary time relative to beginning of primary censor window respecting the right-truncation -samples = map(1:nsamples_max) do _ - P = rand() * pwindow # Primary event time - T = rand(true_dist) -end |> - s -> filter(x -> x <= obs_time, s) +samples = map(pwindows, swindows, obs_times) do pw, sw, ot + rpcens(true_dist; pwindow = pw, swindow = sw, D = ot) +end -# ╔═╡ a627a544-4c41-4c35-83ec-be7ed7c0a737 -nsamples = length(samples) +# ╔═╡ 2a9da9e5-0925-4ae0-8b70-8db90903cb0b +md" +Aggregate to unique combinations and count occurrences +" # ╔═╡ 0b5e96eb-9312-472e-8a88-d4509a4f25d0 -# Generate samples -delay_counts = mapreduce(vcat, samples) do T +delay_counts = mapreduce(vcat, pwindows, swindows, obs_times, samples) do pw, sw, ot, s DataFrame( - pwindow = pwindow, - swindow = swindow, - obs_time = obs_time, - observed_delay = (T ÷ swindow) * swindow, - observed_delay_upper = (T ÷ swindow) * (swindow) + swindow, - observed_delay_step = Int(T ÷ swindow) + 1 + pwindow = pw, + swindow = sw, + obs_time = ot, + observed_delay = s, + observed_delay_upper = s + sw ) -end |> # Aggregate to unique combinations and count occurrences +end |> df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, - :observed_delay_upper, :observed_delay_step) |> + :observed_delay_upper) |> gd -> @combine(gd, :n=length(:pwindow)) +# ╔═╡ c0cce80f-dec7-4a55-aefd-339ef863f854 +md" +Compare the samples with and without secondary censoring to the true distribution and calculate empirical CDF +" + # ╔═╡ a7bff47d-b61f-499e-8631-206661c2bdc0 empirical_cdf = ecdf(samples) @@ -130,32 +165,28 @@ empirical_cdf_obs = ecdf(delay_counts.observed_delay, weights = delay_counts.n) # ╔═╡ 60711c3c-266e-42b5-acc6-6624db294f24 x_seq = range(minimum(samples), maximum(samples), 100) -# ╔═╡ c6fe3c52-af87-4a84-b280-bc9a8532e269 -#plot +# ╔═╡ 1f1bcee4-8e0d-46fb-9a6f-41998bf54957 +theoretical_cdf = x_seq |> x -> cdf(true_dist, x) + +# ╔═╡ 59bb2a18-eaf4-438a-9359-341efadfe897 let - StatsPlots.plot(; title = "Comparison of Observed vs Theoretical CDF", + f = Figure() + ax = Axis(f[1, 1], + title = "Comparison of Observed vs Theoretical CDF", ylabel = "Cumulative Probability", - xlabel = "Delay", - xticks = 0:obs_time, - xlims = (-0.1, obs_time + 0.5) + xlabel = "Delay" ) - StatsPlots.plot!(x_seq, x_seq .|> x -> empirical_cdf(x), - lab = "Observed secondary times", - c = :blue, - lw = 3 - ) - StatsPlots.plot!(x_seq, x_seq .|> x -> empirical_cdf_obs(x), - lab = "Observed censored secondary times", - c = :green, - lw = 3 - ) - StatsPlots.plot!(x_seq, x_seq .|> x -> cdf(true_dist, x), - lab = "Theoretical", - c = :black, - lw = 3 - ) - StatsPlots.vline!([mean(samples)], ls = :dash, c = :blue, lw = 3, lab = "") - StatsPlots.vline!([mean(true_dist)], ls = :dash, c = :black, lw = 3, lab = "") + lines!( + ax, x_seq, empirical_cdf_obs, label = "Empirical CDF", color = :blue, linewidth = 2) + lines!(ax, x_seq, theoretical_cdf, label = "Theoretical CDF", + color = :black, linewidth = 2) + vlines!(ax, [mean(samples)], color = :blue, linestyle = :dash, + label = "Empirical mean", linewidth = 2) + vlines!(ax, [mean(true_dist)], linestyle = :dash, + label = "Theoretical mean", color = :black, linewidth = 2) + axislegend(position = :rb) + + f end # ╔═╡ f66d4b2e-ed66-423e-9cba-62bff712862b @@ -167,7 +198,7 @@ We've aggregated the data to unique combinations of `pwindow`, `swindow`, and `o md" ## Fitting a naive model using Turing -We'll start by fitting a naive model using Turing. +We'll start by fitting a naive model using NUTS from `Turing`. We define the model in the `Turing` PPL. " # ╔═╡ d9d14c48-8700-42b5-89b4-7fc51d0f577c @@ -206,8 +237,8 @@ summarize(naive_fit) # ╔═╡ 8e09d931-fca7-4ac2-81f7-2bc36b0174f3 let f = pairplot(naive_fit) - CairoMakie.vlines!(f[1, 1], [meanlog]) - CairoMakie.vlines!(f[2, 2], [sdlog]) + vlines!(f[1, 1], [meanlog], linewidth = 4) + vlines!(f[2, 2], [sdlog], linewidth = 4) f end @@ -219,26 +250,42 @@ We see that the model has converged and the diagnostics look good. However, just # ╔═╡ b2efafab-8849-4a7a-bb64-ac9ce126ca75 md" -## Fitting an improved model using primarycensoreddist +## Fitting an improved model using censoring utilities + +We'll now fit an improved model using the `∫F` function from `EpiAware.EpiAwareUtils` for calculating the CDF of the _total delay_ from the beginning of the primary window to the secondary event time. This includes both the delay distribution we are making inference on and the time between the start of the primary censor window and the primary event. +The `∫F` function underlies `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. -We'll now fit an improved model using the `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. This accounts for the primary and secondary censoring windows as well as the truncation. +Using the `∫F` function we can write a log-pmf function `primary_censored_dist_lpmf` that accounts for: +- The primary and secondary censoring windows, which can vary in length. +- The effect of right truncation in biasing our observations. +This is the analog function to the function of the same name in `primarycensoreddist`: it calculates the log-probability of the secondary event occurring in the secondary censoring window conditional on the primary event occurring in the primary censoring window by calculating the increase in the CDF over the secondary window and rescaling by the probability of the secondary event occuring within the maximum observation time `D`. +" + +# ╔═╡ 348fc3b4-073b-4997-ae50-58ede5d6d0c9 +function primary_censored_dist_lpmf(dist, y, pwindow, y_upper, D) + if y == 0.0 + return log(∫F(dist, y_upper, pwindow)) - log(∫F(dist, D, pwindow)) + else + return log(∫F(dist, y_upper, pwindow) - ∫F(dist, y, pwindow)) - + log(∫F(dist, D, pwindow)) + end +end + +# ╔═╡ cefb5d56-fecd-4de7-bd0e-156be91c705c +md" +We make a new `Turing` model that now uses `primary_censored_dist_lpmf` rather than the naive uncensored and untruncated `logpdf`. " # ╔═╡ ef40112b-f23e-4d4b-8a7d-3793b786f472 -@model function primarycensoreddist_model(N, y, n, pwindow, D) - try - mu ~ Normal(1.0, 1.0) - sigma ~ truncated(Normal(0.5, 0.5); lower = 0.0) - d = LogNormal(mu, sigma) - log_pmf = censored_pmf(d; Δd = pwindow, D = D) .|> log - - for i in eachindex(y) - Turing.@addlogprob! n[i] * log_pmf[y[i]] - end - return log_pmf - catch - Turing.@addlogprob! -Inf +@model function primarycensoreddist_model(y, y_upper, n, pws, Ds) + mu ~ Normal(1.0, 1.0) + sigma ~ truncated(Normal(0.5, 0.5); lower = 0.0) + dist = LogNormal(mu, sigma) + + for i in eachindex(y) + Turing.@addlogprob! n[i] * primary_censored_dist_lpmf( + dist, y[i], pws[i], y_upper[i], Ds[i]) end end @@ -249,13 +296,16 @@ Lets instantiate this model with data # ╔═╡ 93bca93a-5484-47fa-8424-7315eef15e37 primarycensoreddist_mdl = primarycensoreddist_model( - size(delay_counts, 1), - delay_counts.observed_delay_step, + delay_counts.observed_delay, + delay_counts.observed_delay_upper, delay_counts.n, - pwindow, - obs_time + delay_counts.pwindow, + delay_counts.obs_time ) +# ╔═╡ d5144247-eb57-48bf-8e32-fd71167ecbc8 +md"Now let’s fit the compiled model." + # ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c primarycensoreddist_fit = sample( primarycensoreddist_mdl, NUTS(), MCMCThreads(), 1000, 4) @@ -271,6 +321,11 @@ let f end +# ╔═╡ 673b47ec-b333-45e8-9557-9e65ad425c35 +md" +We see that the model has converged and the diagnostics look good. We also see that the posterior means are very near the true parameters and the 90% credible intervals include the true parameters. +" + # ╔═╡ Cell order: # ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 # ╠═a2624404-48b1-4faa-abbe-6d78b8e04f2b @@ -278,23 +333,28 @@ end # ╠═5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 # ╟─c5704f67-208d-4c2e-8513-c07c6b94ca99 # ╠═aed124c7-b4ba-4c97-a01f-ff553f376c86 +# ╟─ec5ed3e9-6ea9-4cfe-afd2-82aabbbe8130 # ╠═105b9594-36ce-4ae8-87a8-5c81867b1ce3 # ╠═8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 # ╠═84bb3999-9f2b-4eaa-9c2d-776a86677eaf # ╠═2bf6677e-ebe9-4aa8-aa91-f631e99669bb +# ╟─f4083aea-8106-401a-b60f-383d0b94102a # ╠═aea8b28e-fffe-4aa6-b51e-8199a7c7975c -# ╠═d231bd0c-165f-4973-a46f-f66991813ea7 +# ╠═4d3a853d-0b8d-402a-8309-e9f6da2b7a8c # ╠═7522f05b-1750-4983-8947-ef70f4298d06 -# ╠═60212cf7-cc3d-42d3-8260-1f067ede3c6f +# ╟─5eac2f60-8cec-4460-9d10-6bade7f0f406 +# ╠═9443b893-9e22-4267-9a1f-319a3adb8c0d # ╠═a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed -# ╠═a627a544-4c41-4c35-83ec-be7ed7c0a737 +# ╟─2a9da9e5-0925-4ae0-8b70-8db90903cb0b # ╠═0b5e96eb-9312-472e-8a88-d4509a4f25d0 +# ╟─c0cce80f-dec7-4a55-aefd-339ef863f854 # ╠═a7bff47d-b61f-499e-8631-206661c2bdc0 # ╠═16bcb80a-970f-4633-aca2-261fa04172f7 # ╠═60711c3c-266e-42b5-acc6-6624db294f24 -# ╠═c6fe3c52-af87-4a84-b280-bc9a8532e269 +# ╠═1f1bcee4-8e0d-46fb-9a6f-41998bf54957 +# ╠═59bb2a18-eaf4-438a-9359-341efadfe897 # ╟─f66d4b2e-ed66-423e-9cba-62bff712862b -# ╟─010ebe37-782b-4a35-bf5c-dca6dc0fee45 +# ╠═010ebe37-782b-4a35-bf5c-dca6dc0fee45 # ╠═d9d14c48-8700-42b5-89b4-7fc51d0f577c # ╟─8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b # ╠═028ade5c-17bd-4dfc-8433-23aaff02c181 @@ -304,9 +364,13 @@ end # ╠═8e09d931-fca7-4ac2-81f7-2bc36b0174f3 # ╟─43eac8dd-8f1d-440e-b1e8-85db9e740651 # ╟─b2efafab-8849-4a7a-bb64-ac9ce126ca75 +# ╠═348fc3b4-073b-4997-ae50-58ede5d6d0c9 +# ╟─cefb5d56-fecd-4de7-bd0e-156be91c705c # ╠═ef40112b-f23e-4d4b-8a7d-3793b786f472 # ╟─b823d824-419d-41e9-9ac9-2c45ef190acf # ╠═93bca93a-5484-47fa-8424-7315eef15e37 +# ╟─d5144247-eb57-48bf-8e32-fd71167ecbc8 # ╠═7ae6c61d-0e33-4af8-b8d2-e31223a15a7c # ╠═1210443f-480f-4e9f-b195-d557e9e1fc31 # ╠═b2376beb-dd7b-442d-9ff5-ac864e75366b +# ╟─673b47ec-b333-45e8-9557-9e65ad425c35 From 7c0c0c4639ea3d9dec4e9331d3565555e67d2fc1 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 12:17:58 +0100 Subject: [PATCH 13/16] pre-commit fix --- EpiAware/docs/src/getting-started/explainers/censored-obs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl index 9b030d0a6..528447a4e 100644 --- a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl +++ b/EpiAware/docs/src/getting-started/explainers/censored-obs.jl @@ -253,7 +253,7 @@ md" ## Fitting an improved model using censoring utilities We'll now fit an improved model using the `∫F` function from `EpiAware.EpiAwareUtils` for calculating the CDF of the _total delay_ from the beginning of the primary window to the secondary event time. This includes both the delay distribution we are making inference on and the time between the start of the primary censor window and the primary event. -The `∫F` function underlies `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. +The `∫F` function underlies `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. Using the `∫F` function we can write a log-pmf function `primary_censored_dist_lpmf` that accounts for: - The primary and secondary censoring windows, which can vary in length. From 67c2ac6694392e62e54123c996e9e41f882c033d Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 12:26:00 +0100 Subject: [PATCH 14/16] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 34c9b96bc..75fef25d7 100644 --- a/.gitignore +++ b/.gitignore @@ -389,3 +389,4 @@ EpiAware/docs/src/examples/*.md # benchmark ignore /.benchmarkci /benchmark/*.json +EpiAware/docs/src/getting-started/tutorials/censored-obs.md From 32fa2d745f2de62b5f1744fa94fbd2c35fa0fa55 Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 12:26:20 +0100 Subject: [PATCH 15/16] move note to tutorials and add to doc pages --- EpiAware/docs/pages.jl | 1 + .../getting-started/{explainers => tutorials}/censored-obs.jl | 0 2 files changed, 1 insertion(+) rename EpiAware/docs/src/getting-started/{explainers => tutorials}/censored-obs.jl (100%) diff --git a/EpiAware/docs/pages.jl b/EpiAware/docs/pages.jl index 65e235428..34db2263d 100644 --- a/EpiAware/docs/pages.jl +++ b/EpiAware/docs/pages.jl @@ -18,6 +18,7 @@ getting_started_pages = Any[ "Nowcasting" => "getting-started/tutorials/nowcasting.md", "Multiple observation models" => "getting-started/tutorials/multiple-observation-models.md", "Multiple infection processes" => "getting-started/tutorials/multiple-infection-processes.md", + "Fitting distributions with censored data" => "getting-started/tutorials/censored-obs.md", "Partial pooling" => "getting-started/tutorials/partial-pooling.md" ] ] diff --git a/EpiAware/docs/src/getting-started/explainers/censored-obs.jl b/EpiAware/docs/src/getting-started/tutorials/censored-obs.jl similarity index 100% rename from EpiAware/docs/src/getting-started/explainers/censored-obs.jl rename to EpiAware/docs/src/getting-started/tutorials/censored-obs.jl From 67767bd341e316817896fd2e2ebffc9dee70fbec Mon Sep 17 00:00:00 2001 From: Samuel Brand Date: Thu, 26 Sep 2024 13:16:05 +0100 Subject: [PATCH 16/16] fix doc test --- EpiAware/src/EpiAwareUtils/censored_pmf.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/EpiAware/src/EpiAwareUtils/censored_pmf.jl b/EpiAware/src/EpiAwareUtils/censored_pmf.jl index 2e04c8cfc..e12d44279 100644 --- a/EpiAware/src/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/src/EpiAwareUtils/censored_pmf.jl @@ -132,18 +132,18 @@ censored_cdf(dist; D = 10) |> p -> round.(p, digits=3) # output -10-element Vector{Float64}: +11-element Vector{Float64}: 0.0 0.368 - 0.768 - 0.915 + 0.767 + 0.914 0.969 - 0.989 + 0.988 0.996 + 0.998 0.999 1.0 1.0 - 1.0 ``` " function censored_cdf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.999)