diff --git a/.gitignore b/.gitignore index 34c9b96bc..75fef25d7 100644 --- a/.gitignore +++ b/.gitignore @@ -389,3 +389,4 @@ EpiAware/docs/src/examples/*.md # benchmark ignore /.benchmarkci /benchmark/*.json +EpiAware/docs/src/getting-started/tutorials/censored-obs.md diff --git a/EpiAware/docs/Project.toml b/EpiAware/docs/Project.toml index 819b6412f..d920e2488 100644 --- a/EpiAware/docs/Project.toml +++ b/EpiAware/docs/Project.toml @@ -9,12 +9,16 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" EpiAware = "b2eeebe4-5992-4301-9193-7ebc9f62c855" +FFMPEG = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da" Pluto = "c3e4b0f8-55cb-11ea-2926-15256bba5781" PlutoStaticHTML = "359b1769-a58e-495b-9770-312e911026ad" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e" Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999" Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" +TuringBenchmarking = "0db1332d-5c25-4deb-809f-459bc696f94f" diff --git a/EpiAware/docs/pages.jl b/EpiAware/docs/pages.jl index 65e235428..34db2263d 100644 --- a/EpiAware/docs/pages.jl +++ b/EpiAware/docs/pages.jl @@ -18,6 +18,7 @@ getting_started_pages = Any[ "Nowcasting" => "getting-started/tutorials/nowcasting.md", "Multiple observation models" => "getting-started/tutorials/multiple-observation-models.md", "Multiple infection processes" => "getting-started/tutorials/multiple-infection-processes.md", + "Fitting distributions with censored data" => "getting-started/tutorials/censored-obs.md", "Partial pooling" => "getting-started/tutorials/partial-pooling.md" ] ] diff --git a/EpiAware/docs/src/getting-started/tutorials/censored-obs.jl b/EpiAware/docs/src/getting-started/tutorials/censored-obs.jl new file mode 100644 index 000000000..528447a4e --- /dev/null +++ b/EpiAware/docs/src/getting-started/tutorials/censored-obs.jl @@ -0,0 +1,376 @@ +### A Pluto.jl notebook ### +# v0.19.46 + +using Markdown +using InteractiveUtils + +# ╔═╡ a2624404-48b1-4faa-abbe-6d78b8e04f2b +let + docs_dir = dirname(dirname(dirname(@__DIR__))) + pkg_dir = dirname(docs_dir) + + using Pkg: Pkg + Pkg.activate(docs_dir) + Pkg.develop(; path = pkg_dir) + Pkg.instantiate() +end + +# ╔═╡ 5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 +begin + using EpiAware.EpiAwareUtils: censored_pmf, censored_cdf, ∫F + using Random, Distributions, StatsBase #utilities for random events + using DataFramesMeta #Data wrangling + using CairoMakie, PairPlots #plotting + using Turing #PPL +end + +# ╔═╡ 8de5c5e0-6e95-11ef-1693-bfd465c8d919 +md" +# Fitting distributions using `EpiAware` and Turing PPL + +## Introduction + +### What are we going to do in this Vignette + +In this vignette, we'll demonstrate how to use the CDF function for censored delay distributions `EpiAwareUtils.∫F`, which underlies `EpiAwareUtils.censored_pmf` in conjunction with the Turing PPL for Bayesian inference of epidemiological delay distributions. We'll cover the following key points: + +1. Simulating censored delay distribution data +2. Fitting a naive model using Turing +3. Evaluating the naive model's performance +4. Fitting an improved model using censored delay functionality from `EpiAware`. +5. Comparing the censored delay model's performance to the naive model + +### What might I need to know before starting + +This note builds on the concepts introduced in the R/stan package [`primarycensoreddist`](https://github.com/epinowcast/primarycensoreddist), especially the [Fitting distributions using primarycensorseddist and cmdstan](https://primarycensoreddist.epinowcast.org/articles/fitting-dists-with-stan.html) vignette and assumes familiarity with using Turing tools as covered in the [Turing documentation](https://turinglang.org/). + +This note is generated using the `EpiAware` package locally via `Pkg.develop`, in the `EpiAware/docs` environment. It is also possible to install `EpiAware` using + +```julia +Pkg.add(url=\"https://github.com/CDCgov/Rt-without-renewal\", subdir=\"EpiAware\") +``` +### Packages used in this vignette +As well as `EpiAware` and `Turing` we will use `Makie` ecosystem packages for plotting and `DataFramesMeta` for data manipulation. +" + +# ╔═╡ 30dd9af4-b64f-42b1-8439-a890752f68e3 +md" +The other dependencies are as follows: +" + +# ╔═╡ c5704f67-208d-4c2e-8513-c07c6b94ca99 +md" +## Simulating censored and truncated delay distribution data + +We'll start by simulating some censored and truncated delay distribution data. We’ll define a `rpcens` function for generating data. +" + +# ╔═╡ aed124c7-b4ba-4c97-a01f-ff553f376c86 +Random.seed!(123) # For reproducibility + +# ╔═╡ ec5ed3e9-6ea9-4cfe-afd2-82aabbbe8130 +md"Define the true distribution parameters" + +# ╔═╡ 105b9594-36ce-4ae8-87a8-5c81867b1ce3 +n = 2000 + +# ╔═╡ 8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 +meanlog = 1.5 + +# ╔═╡ 84bb3999-9f2b-4eaa-9c2d-776a86677eaf +sdlog = 0.75 + +# ╔═╡ 2bf6677e-ebe9-4aa8-aa91-f631e99669bb +true_dist = LogNormal(meanlog, sdlog) + +# ╔═╡ f4083aea-8106-401a-b60f-383d0b94102a +md"Generate varying pwindow, swindow, and obs_time lengths +" + +# ╔═╡ aea8b28e-fffe-4aa6-b51e-8199a7c7975c +pwindows = rand(1:2, n) + +# ╔═╡ 4d3a853d-0b8d-402a-8309-e9f6da2b7a8c +swindows = rand(1:2, n) + +# ╔═╡ 7522f05b-1750-4983-8947-ef70f4298d06 +obs_times = rand(8:10, n) + +# ╔═╡ 5eac2f60-8cec-4460-9d10-6bade7f0f406 +md" +We recreate the primary censored sampling function from `primarycensoreddist`, c.f. documentation [here](https://primarycensoreddist.epinowcast.org/reference/rprimarycensoreddist.html). +" + +# ╔═╡ 9443b893-9e22-4267-9a1f-319a3adb8c0d +""" + function rpcens(dist; pwindow = 1, swindow = 1, D = Inf, max_tries = 1000) + +Does a truncated censored sample from `dist` with a uniform primary time on `[0, pwindow]`. +""" +function rpcens(dist; pwindow = 1, swindow = 1, D = Inf, max_tries = 1000) + T = zero(eltype(dist)) + invalid_sample = true + attempts = 1 + while (invalid_sample && attempts <= max_tries) + X = rand(dist) + U = rand() * pwindow + T = X + U + attempts += 1 + if X + U < D + invalid_sample = false + end + end + + @assert !invalid_sample "censored value not found in $max_tries attempts" + + return (T ÷ swindow) * swindow +end + +# ╔═╡ a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed +#Sample secondary time relative to beginning of primary censor window respecting the right-truncation +samples = map(pwindows, swindows, obs_times) do pw, sw, ot + rpcens(true_dist; pwindow = pw, swindow = sw, D = ot) +end + +# ╔═╡ 2a9da9e5-0925-4ae0-8b70-8db90903cb0b +md" +Aggregate to unique combinations and count occurrences +" + +# ╔═╡ 0b5e96eb-9312-472e-8a88-d4509a4f25d0 +delay_counts = mapreduce(vcat, pwindows, swindows, obs_times, samples) do pw, sw, ot, s + DataFrame( + pwindow = pw, + swindow = sw, + obs_time = ot, + observed_delay = s, + observed_delay_upper = s + sw + ) +end |> + df -> @groupby(df, :pwindow, :swindow, :obs_time, :observed_delay, + :observed_delay_upper) |> + gd -> @combine(gd, :n=length(:pwindow)) + +# ╔═╡ c0cce80f-dec7-4a55-aefd-339ef863f854 +md" +Compare the samples with and without secondary censoring to the true distribution and calculate empirical CDF +" + +# ╔═╡ a7bff47d-b61f-499e-8631-206661c2bdc0 +empirical_cdf = ecdf(samples) + +# ╔═╡ 16bcb80a-970f-4633-aca2-261fa04172f7 +empirical_cdf_obs = ecdf(delay_counts.observed_delay, weights = delay_counts.n) + +# ╔═╡ 60711c3c-266e-42b5-acc6-6624db294f24 +x_seq = range(minimum(samples), maximum(samples), 100) + +# ╔═╡ 1f1bcee4-8e0d-46fb-9a6f-41998bf54957 +theoretical_cdf = x_seq |> x -> cdf(true_dist, x) + +# ╔═╡ 59bb2a18-eaf4-438a-9359-341efadfe897 +let + f = Figure() + ax = Axis(f[1, 1], + title = "Comparison of Observed vs Theoretical CDF", + ylabel = "Cumulative Probability", + xlabel = "Delay" + ) + lines!( + ax, x_seq, empirical_cdf_obs, label = "Empirical CDF", color = :blue, linewidth = 2) + lines!(ax, x_seq, theoretical_cdf, label = "Theoretical CDF", + color = :black, linewidth = 2) + vlines!(ax, [mean(samples)], color = :blue, linestyle = :dash, + label = "Empirical mean", linewidth = 2) + vlines!(ax, [mean(true_dist)], linestyle = :dash, + label = "Theoretical mean", color = :black, linewidth = 2) + axislegend(position = :rb) + + f +end + +# ╔═╡ f66d4b2e-ed66-423e-9cba-62bff712862b +md" +We've aggregated the data to unique combinations of `pwindow`, `swindow`, and `obs_time` and counted the number of occurrences of each `observed_delay` for each combination. This is the data we will use to fit our model. +" + +# ╔═╡ 010ebe37-782b-4a35-bf5c-dca6dc0fee45 +md" +## Fitting a naive model using Turing + +We'll start by fitting a naive model using NUTS from `Turing`. We define the model in the `Turing` PPL. +" + +# ╔═╡ d9d14c48-8700-42b5-89b4-7fc51d0f577c +@model function naive_model(N, y, n) + mu ~ Normal(1.0, 1.0) + sigma ~ truncated(Normal(0.5, 1.0); lower = 0.0) + d = LogNormal(mu, sigma) + + for i in eachindex(y) + Turing.@addlogprob! n[i] * logpdf(d, y[i]) + end +end + +# ╔═╡ 8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b +md" +Now lets instantiate this model with data +" + +# ╔═╡ 028ade5c-17bd-4dfc-8433-23aaff02c181 +naive_mdl = naive_model( + size(delay_counts, 1), + delay_counts.observed_delay .+ 1e-6, # Add a small constant to avoid log(0) + delay_counts.n) + +# ╔═╡ 04b4eefb-f0f9-4887-8db0-7cbb7f3b169b +md" +and now let's fit the compiled model. +" + +# ╔═╡ 21655344-d12b-4e47-a9a9-d06bd909f6ea +naive_fit = sample(naive_mdl, NUTS(), MCMCThreads(), 500, 4) + +# ╔═╡ 3b89fe00-6aaf-4764-8b29-e71479f1e641 +summarize(naive_fit) + +# ╔═╡ 8e09d931-fca7-4ac2-81f7-2bc36b0174f3 +let + f = pairplot(naive_fit) + vlines!(f[1, 1], [meanlog], linewidth = 4) + vlines!(f[2, 2], [sdlog], linewidth = 4) + f +end + +# ╔═╡ 43eac8dd-8f1d-440e-b1e8-85db9e740651 +md" +We see that the model has converged and the diagnostics look good. However, just from the model posterior summary we see that we might not be very happy with the fit. `mu` is smaller than the target $(meanlog) and `sigma` is larger than the target $(sdlog). + +" + +# ╔═╡ b2efafab-8849-4a7a-bb64-ac9ce126ca75 +md" +## Fitting an improved model using censoring utilities + +We'll now fit an improved model using the `∫F` function from `EpiAware.EpiAwareUtils` for calculating the CDF of the _total delay_ from the beginning of the primary window to the secondary event time. This includes both the delay distribution we are making inference on and the time between the start of the primary censor window and the primary event. +The `∫F` function underlies `censored_pmf` function from the `EpiAware.EpiAwareUtils` submodule. + +Using the `∫F` function we can write a log-pmf function `primary_censored_dist_lpmf` that accounts for: +- The primary and secondary censoring windows, which can vary in length. +- The effect of right truncation in biasing our observations. + +This is the analog function to the function of the same name in `primarycensoreddist`: it calculates the log-probability of the secondary event occurring in the secondary censoring window conditional on the primary event occurring in the primary censoring window by calculating the increase in the CDF over the secondary window and rescaling by the probability of the secondary event occuring within the maximum observation time `D`. +" + +# ╔═╡ 348fc3b4-073b-4997-ae50-58ede5d6d0c9 +function primary_censored_dist_lpmf(dist, y, pwindow, y_upper, D) + if y == 0.0 + return log(∫F(dist, y_upper, pwindow)) - log(∫F(dist, D, pwindow)) + else + return log(∫F(dist, y_upper, pwindow) - ∫F(dist, y, pwindow)) - + log(∫F(dist, D, pwindow)) + end +end + +# ╔═╡ cefb5d56-fecd-4de7-bd0e-156be91c705c +md" +We make a new `Turing` model that now uses `primary_censored_dist_lpmf` rather than the naive uncensored and untruncated `logpdf`. +" + +# ╔═╡ ef40112b-f23e-4d4b-8a7d-3793b786f472 +@model function primarycensoreddist_model(y, y_upper, n, pws, Ds) + mu ~ Normal(1.0, 1.0) + sigma ~ truncated(Normal(0.5, 0.5); lower = 0.0) + dist = LogNormal(mu, sigma) + + for i in eachindex(y) + Turing.@addlogprob! n[i] * primary_censored_dist_lpmf( + dist, y[i], pws[i], y_upper[i], Ds[i]) + end +end + +# ╔═╡ b823d824-419d-41e9-9ac9-2c45ef190acf +md" +Lets instantiate this model with data +" + +# ╔═╡ 93bca93a-5484-47fa-8424-7315eef15e37 +primarycensoreddist_mdl = primarycensoreddist_model( + delay_counts.observed_delay, + delay_counts.observed_delay_upper, + delay_counts.n, + delay_counts.pwindow, + delay_counts.obs_time +) + +# ╔═╡ d5144247-eb57-48bf-8e32-fd71167ecbc8 +md"Now let’s fit the compiled model." + +# ╔═╡ 7ae6c61d-0e33-4af8-b8d2-e31223a15a7c +primarycensoreddist_fit = sample( + primarycensoreddist_mdl, NUTS(), MCMCThreads(), 1000, 4) + +# ╔═╡ 1210443f-480f-4e9f-b195-d557e9e1fc31 +summarize(primarycensoreddist_fit) + +# ╔═╡ b2376beb-dd7b-442d-9ff5-ac864e75366b +let + f = pairplot(primarycensoreddist_fit) + CairoMakie.vlines!(f[1, 1], [meanlog], linewidth = 3) + CairoMakie.vlines!(f[2, 2], [sdlog], linewidth = 3) + f +end + +# ╔═╡ 673b47ec-b333-45e8-9557-9e65ad425c35 +md" +We see that the model has converged and the diagnostics look good. We also see that the posterior means are very near the true parameters and the 90% credible intervals include the true parameters. +" + +# ╔═╡ Cell order: +# ╟─8de5c5e0-6e95-11ef-1693-bfd465c8d919 +# ╠═a2624404-48b1-4faa-abbe-6d78b8e04f2b +# ╟─30dd9af4-b64f-42b1-8439-a890752f68e3 +# ╠═5baa8d2e-bcf8-4e3b-b007-175ad3e2ca95 +# ╟─c5704f67-208d-4c2e-8513-c07c6b94ca99 +# ╠═aed124c7-b4ba-4c97-a01f-ff553f376c86 +# ╟─ec5ed3e9-6ea9-4cfe-afd2-82aabbbe8130 +# ╠═105b9594-36ce-4ae8-87a8-5c81867b1ce3 +# ╠═8aa9f9c1-d3c4-49f3-be18-a400fc71e8f7 +# ╠═84bb3999-9f2b-4eaa-9c2d-776a86677eaf +# ╠═2bf6677e-ebe9-4aa8-aa91-f631e99669bb +# ╟─f4083aea-8106-401a-b60f-383d0b94102a +# ╠═aea8b28e-fffe-4aa6-b51e-8199a7c7975c +# ╠═4d3a853d-0b8d-402a-8309-e9f6da2b7a8c +# ╠═7522f05b-1750-4983-8947-ef70f4298d06 +# ╟─5eac2f60-8cec-4460-9d10-6bade7f0f406 +# ╠═9443b893-9e22-4267-9a1f-319a3adb8c0d +# ╠═a4f5e9b6-ff3a-48fa-aa51-0abccb9c7bed +# ╟─2a9da9e5-0925-4ae0-8b70-8db90903cb0b +# ╠═0b5e96eb-9312-472e-8a88-d4509a4f25d0 +# ╟─c0cce80f-dec7-4a55-aefd-339ef863f854 +# ╠═a7bff47d-b61f-499e-8631-206661c2bdc0 +# ╠═16bcb80a-970f-4633-aca2-261fa04172f7 +# ╠═60711c3c-266e-42b5-acc6-6624db294f24 +# ╠═1f1bcee4-8e0d-46fb-9a6f-41998bf54957 +# ╠═59bb2a18-eaf4-438a-9359-341efadfe897 +# ╟─f66d4b2e-ed66-423e-9cba-62bff712862b +# ╠═010ebe37-782b-4a35-bf5c-dca6dc0fee45 +# ╠═d9d14c48-8700-42b5-89b4-7fc51d0f577c +# ╟─8a7cd9ec-5640-4f5f-84c3-ae3f465ca68b +# ╠═028ade5c-17bd-4dfc-8433-23aaff02c181 +# ╟─04b4eefb-f0f9-4887-8db0-7cbb7f3b169b +# ╠═21655344-d12b-4e47-a9a9-d06bd909f6ea +# ╠═3b89fe00-6aaf-4764-8b29-e71479f1e641 +# ╠═8e09d931-fca7-4ac2-81f7-2bc36b0174f3 +# ╟─43eac8dd-8f1d-440e-b1e8-85db9e740651 +# ╟─b2efafab-8849-4a7a-bb64-ac9ce126ca75 +# ╠═348fc3b4-073b-4997-ae50-58ede5d6d0c9 +# ╟─cefb5d56-fecd-4de7-bd0e-156be91c705c +# ╠═ef40112b-f23e-4d4b-8a7d-3793b786f472 +# ╟─b823d824-419d-41e9-9ac9-2c45ef190acf +# ╠═93bca93a-5484-47fa-8424-7315eef15e37 +# ╟─d5144247-eb57-48bf-8e32-fd71167ecbc8 +# ╠═7ae6c61d-0e33-4af8-b8d2-e31223a15a7c +# ╠═1210443f-480f-4e9f-b195-d557e9e1fc31 +# ╠═b2376beb-dd7b-442d-9ff5-ac864e75366b +# ╟─673b47ec-b333-45e8-9557-9e65ad425c35 diff --git a/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl b/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl index 532747869..bcabad2d0 100644 --- a/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl +++ b/EpiAware/src/EpiAwareUtils/EpiAwareUtils.jl @@ -17,7 +17,7 @@ using Distributions, DocStringExtensions, QuadGK, Statistics, Turing export HalfNormal, DirectSample, SafePoisson, SafeNegativeBinomial #Export functions -export scan, spread_draws, censored_pmf, get_param_array, prefix_submodel +export scan, spread_draws, censored_cdf, censored_pmf, get_param_array, prefix_submodel, ∫F # Export accumulate tools export get_state, accumulate_scan diff --git a/EpiAware/src/EpiAwareUtils/censored_pmf.jl b/EpiAware/src/EpiAwareUtils/censored_pmf.jl index 38e274f64..e12d44279 100644 --- a/EpiAware/src/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/src/EpiAwareUtils/censored_pmf.jl @@ -63,23 +63,116 @@ function censored_pmf(dist::Distribution, ts .|> (t -> cdf(dist, t)) |> diff |> p -> p ./ sum(p) end +""" +Internal function to check censored_pmf arguments and return the time steps of the rightmost limits of the censor intervals. +""" +function _check_and_give_ts(dist::Distribution, Δd, D, upper) + @assert minimum(dist)>=0.0 "Distribution must be non-negative." + @assert Δd>0.0 "Δd must be positive." + + if isnothing(D) + x_99 = invlogcdf(dist, log(upper)) + D = round(Int64, x_99 / Δd) * Δd + end + + @assert D>=Δd "D can't be shorter than Δd." + + ts = Δd:Δd:D |> collect + + @assert ts[end]==D "D must be a multiple of Δd." + + return ts +end + +""" +Calculate the CDF of the random variable `X + U` where `X` has cumulative distriubtion function `F` +and `U` is a uniform random variable on `[0, Δd)`. + +This is used in solving for censored CDFs and PMFs using numerical quadrature. +""" +function ∫F(dist, t, Δd) + quadgk(u -> exp(logcdf(dist, t - u) - log(Δd)), 0.0, min(Δd, t))[1] +end + +@doc raw" +Create a discrete probability cumulative distribution function (CDF) from a given distribution, +assuming a uniform distribution over primary event times with censoring intervals of width `Δd` for +both primary and secondary events. + +NB: `censored_cdf` returns the _non-truncated_ CDF, i.e. the CDF without conditioning on the +secondary event occuring either before or after some time. + + +# Arguments +- `dist`: The distribution from which to create the PMF. +- `Δd`: The step size for discretizing the domain. Default is 1.0. +- `D`: The upper bound of the domain. Must be greater than `Δd`. Default `D = nothing` +indicates that the distribution should be truncated at its `upper`th percentile rounded +to nearest multiple of `Δd`. + + +# Returns +- A vector representing the CDF with 0.0 appended at the beginning. + +# Raises +- `AssertionError` if the minimum value of `dist` is negative. +- `AssertionError` if `Δd` is not positive. +- `AssertionError` if `D` is shorter than `Δd`. +- `AssertionError` if `D` is not a multiple of `Δd`. + +# Examples + +```jldoctest filter +using Distributions +using EpiAware.EpiAwareUtils + +dist = Exponential(1.0) + +censored_cdf(dist; D = 10) |> + p -> round.(p, digits=3) + +# output +11-element Vector{Float64}: + 0.0 + 0.368 + 0.767 + 0.914 + 0.969 + 0.988 + 0.996 + 0.998 + 0.999 + 1.0 + 1.0 +``` +" +function censored_cdf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.999) + ts = _check_and_give_ts(dist, Δd, D, upper) + cens_F = ts .|> t -> ∫F(dist, t, Δd) + return [0.0; cens_F] +end + @doc raw" Create a discrete probability mass function (PMF) from a given distribution, -assuming -a uniform distribution over primary event times with censoring intervals of width `Δd` for +assuming a uniform distribution over primary event times with censoring intervals of width `Δd` for both primary and secondary events. The CDF for the time from the left edge of the interval -containing the primary event to the secondary event is created by direct numerical integration -of the convolution of the CDF of `dist` with the uniform density on `[0,Δd)`, the discrete PMF -for double censored delays is then found using simple differencing on the CDF. +containing the primary event to the secondary event is created by direct numerical integration (quadrature) +of the convolution of the CDF of `dist` with the uniform density on `[0,Δd)`, using the `censored_cdf` +function. The discrete PMF for double censored delays is then found using simple differencing +on the CDF. + +NB: `censored_pmf` returns a _right-truncated_ PMF, i.e. the PMF conditioned on the secondary event +occurring before or on the final secondary censoring window. # Arguments - `dist`: The distribution from which to create the PMF. - `Δd`: The step size for discretizing the domain. Default is 1.0. - `D`: The upper bound of the domain. Must be greater than `Δd`. Default `D = nothing` -indicates that the distribution should be truncated at its 99th percentile rounded +indicates that the distribution should be truncated at its `upper`th percentile rounded to nearest multiple of `Δd`. + # Returns - A vector representing the PMF. @@ -114,22 +207,7 @@ censored_pmf(dist; D = 10) |> 0.0 ``` " -function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing) - @assert minimum(dist)>=0.0 "Distribution must be non-negative." - @assert Δd>0.0 "Δd must be positive." - - if isnothing(D) - x_99 = invlogcdf(dist, log(0.99)) - D = round(Int64, x_99 / Δd) * Δd - end - - @assert D>=Δd "D can't be shorter than Δd." - - ts = 0.0:Δd:D |> collect - - @assert ts[end]==D "D must be a multiple of Δd." - - ∫F(dist, t, Δd) = quadgk(u -> cdf(dist, t - u) / Δd, 0.0, Δd)[1] - - ts .|> (t -> ∫F(dist, t, Δd)) |> diff |> p -> p ./ sum(p) +function censored_pmf(dist::Distribution; Δd = 1.0, D = nothing, upper = 0.99) + cens_cdf = censored_cdf(dist; Δd, D, upper) + return cens_cdf |> diff |> p -> p ./ sum(p) end diff --git a/EpiAware/test/EpiAwareUtils/censored_pmf.jl b/EpiAware/test/EpiAwareUtils/censored_pmf.jl index dc034fb09..2cdb6134b 100644 --- a/EpiAware/test/EpiAwareUtils/censored_pmf.jl +++ b/EpiAware/test/EpiAwareUtils/censored_pmf.jl @@ -55,4 +55,13 @@ for s in 1:length(pmf)]] @test sum(expected_pmf_uncond) > 0.99 end + + @testset "Check CDF function" begin + dist = Exponential(1.0) + expected_pmf_uncond = [exp(-1) + [(1 - exp(-1)) * (exp(1) - 1) * exp(-s) for s in 1:9]] + expected_cdf = [0.0; cumsum(expected_pmf_uncond)] + calc_cdf = censored_cdf(dist; Δd = 1.0, D = 10.0) + @test expected_cdf≈calc_cdf atol=1e-15 + end end