Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap weights #485

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
5 changes: 3 additions & 2 deletions src/estimagic/estimation/msm_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_moments_cov(
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
bootstrap_kwargs (dict): Additional keyword arguments that govern the
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
"batch_evaluator", "cluster_by" and "error_handling". For details see the
bootstrap function.
"batch_evaluator", "weights", "cluster_by" and "error_handling".
For details see the bootstrap function.

Returns:
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
Expand All @@ -39,6 +39,7 @@ def get_moments_cov(
"n_draws",
"seed",
"batch_evaluator",
"weights",
alanlujan91 marked this conversation as resolved.
Show resolved Hide resolved
"cluster_by",
"error_handling",
"existing_result",
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def bootstrap(
existing_result=None,
outcome_kwargs=None,
n_draws=1_000,
weights=None,
cluster_by=None,
seed=None,
n_cores=1,
Expand All @@ -41,6 +42,7 @@ def bootstrap(
n_draws (int): Number of bootstrap samples to draw.
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
is used.
weights (str): Column name of variable with weights or None.
cluster_by (str): Column name of variable to cluster by or None.
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
numpy.random.default_rng is used seeded with seed. If seed is already a
Expand All @@ -59,7 +61,7 @@ def bootstrap(

"""
if callable(outcome):
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weights=weights, cluster_by=cluster_by)

if outcome_kwargs is not None:
outcome = functools.partial(outcome, **outcome_kwargs)
Expand All @@ -82,6 +84,7 @@ def bootstrap(
new_outcomes = get_bootstrap_outcomes(
data=data,
outcome=outcome,
weights=weights,
cluster_by=cluster_by,
rng=rng,
n_draws=n_draws - n_existing,
Expand Down
10 changes: 9 additions & 1 deletion src/estimagic/inference/bootstrap_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@


def check_inputs(
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
data=None,
weights=None,
cluster_by=None,
ci_method="percentile",
ci_level=0.95,
skipdata=False,
):
"""Check validity of inputs.

Args:
data (pd.DataFrame): Dataset.
weights (str): Column name of variable with weights.
cluster_by (str): Column name of variable to cluster by.
ci_method (str): Method of choice for computing confidence intervals.
The default is "percentile".
Expand All @@ -21,6 +27,8 @@ def check_inputs(
if not skipdata:
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
elif (weights is not None) and (weights not in data.columns.tolist()):
raise ValueError("Input 'weights' must be None or a column name of 'data'.")
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
raise ValueError(
"Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
def get_bootstrap_outcomes(
data,
outcome,
weights=None,
cluster_by=None,
rng=None,
n_draws=1000,
Expand All @@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
data (pandas.DataFrame): original dataset.
outcome (callable): function of the dataset calculating statistic of interest.
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
weights (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
rng (numpy.random.Generator): A random number generator.
n_draws (int): number of bootstrap draws.
Expand All @@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
estimates (list): List of pytrees of estimated bootstrap outcomes.

"""
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weights=weights, cluster_by=cluster_by)
batch_evaluator = process_batch_evaluator(batch_evaluator)

indices = get_bootstrap_indices(
data=data,
rng=rng,
weights=weights,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
65 changes: 54 additions & 11 deletions src/estimagic/inference/bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pandas as pd


def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_indices(
data,
rng,
weights=None,
cluster_by=None,
n_draws=1000,
):
"""Draw positional indices for the construction of bootstrap samples.

Storing the positional indices instead of the full bootstrap samples saves a lot
Expand All @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weights (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -19,17 +26,45 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):

"""
n_obs = len(data)
if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))

if weights is None:

if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)

else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)
if cluster_by is None:
bootstrap_indices = list(
rng.choice(
n_obs,
size=(n_draws, n_obs),
replace=True,
p=data[weights] / data[weights].sum(),
)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be more readable if you compute the vector of probabilities outside of these if conditions. If weights is None, the vector of probabilities could be None. In this case, you could also write the same code for both cases:

probs = _calculate_bootstrap_indices_probs(data, weight_by, cluster_by) 

bootstrap_indices = list(
    rng.choice(
        n_obs,
        size=(n_draws, n_obs),
        replace=True,
        p=probs,
    )
)

This will be slower than np.integers(n_obs, ...) but I think this is a trade-off we would want to make here.

else:
clusters = data.groupby(cluster_by)[weights].sum().reset_index()

drawn_clusters = rng.choice(
clusters[cluster_by],
size=(n_draws, len(clusters)),
replace=True,
p=clusters[weights] / clusters[weights].sum(),
)

bootstrap_indices = _convert_cluster_ids_to_indices(
data[cluster_by], drawn_clusters
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has to be updated. Especially the line clusters[clusters_by] is very confusing. I would think that clusters is equal to something like data[cluster_by].

Similarly as above, it would be nice to pre-compute the probabilities.


return bootstrap_indices

Expand All @@ -48,7 +83,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
return bootstrap_indices


def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_samples(
data,
rng,
weights=None,
cluster_by=None,
n_draws=1000,
):
"""Draw bootstrap samples.

If you have memory issues you should use get_bootstrap_indices instead and construct
Expand All @@ -57,6 +98,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weights (str): weights for the observations.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -67,6 +109,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
indices = get_bootstrap_indices(
data=data,
rng=rng,
weights=weights,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
9 changes: 9 additions & 0 deletions tests/inference/test_bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ def test_check_inputs_data():
assert str(error.value) == expected_msg


def test_check_inputs_weights(setup):
weights = "this is not a column name of df"
expected = "Input 'weights' must be None or a column name of 'data'."

with pytest.raises(ValueError) as error:
check_inputs(data=setup["df"], weights=weights)
assert str(error.value) == expected


def test_check_inputs_cluster_by(setup):
cluster_by = "this is not a column name of df"
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
15 changes: 15 additions & 0 deletions tests/inference/test_bootstrap_samples.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You only check that the get_bootstrap_indices function works when called with the new weights arguments. But I think you can do more:

  1. I would rename "wts" to "weights", just to be explicit
  2. If the weights are uniform (as in your example), the bootstrap indices with and without the weights argument should be the same. Could you test that? You will need to create two random number generators with the same state.
  3. Additionally, it would be very helpful to see if the weights work. An extreme case to check this would be a weighting vector where one entry is 1, and the other is 0. Then, you could (probabilistically) check that the index corresponding to the 0 weight is never chosen.

Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def data():
df = pd.DataFrame()
df["id"] = np.arange(900)
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
df["wts"] = np.ones(900)
return df


Expand All @@ -32,6 +33,20 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(data, weights="wts", n_draws=2, rng=rng)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(
data, weights="wts", cluster_by="hh", n_draws=2, rng=rng
)
assert set(res[0]) != set(res[1])


def test_clustering_leaves_households_intact(data):
rng = get_rng(seed=12345)
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
Expand Down
Loading