Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of LitPop #720

Merged
merged 10 commits into from
May 24, 2023
130 changes: 77 additions & 53 deletions climada/entity/exposures/litpop/litpop.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,18 @@ def set_custom_shape(self, *args, **kwargs):
self.__dict__ = LitPop.from_shape(*args, **kwargs).__dict__

@classmethod
def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_unit='USD',
reference_year=DEF_REF_YEAR, gpw_version=GPW_VERSION, data_dir=SYSTEM_DIR):
def from_shape(
cls,
shape,
total_value,
res_arcsec=30,
exponents=(1,1),
value_unit='USD',
region_id=None,
reference_year=DEF_REF_YEAR,
gpw_version=GPW_VERSION,
data_dir=SYSTEM_DIR,
):
"""init LitPop exposure object for a custom shape.
Requires user input regarding the total value to be disaggregated.

Expand All @@ -488,6 +498,9 @@ def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_un
Defining power with which lit (nightlights) and pop (gpw) go into LitPop.
value_unit : str
Unit of exposure values. The default is USD.
region_id : int, optional
The numeric ISO 3166 region associated with the shape. If ``None``, it will
be determined automatically (at a slight computational cost).
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
reference_year : int, optional
Reference year for data sources. Default: CONFIG.exposures.def_ref_year
gpw_version : int, optional
Expand All @@ -512,10 +525,15 @@ def from_shape(cls, shape, total_value, res_arcsec=30, exponents=(1,1), value_un
'GeoSeries. Loop over elements of series outside method.')

tag = Tag()
litpop_gdf, _ = _get_litpop_single_polygon(shape, reference_year,
res_arcsec, data_dir,
gpw_version, exponents,
)
litpop_gdf, _ = _get_litpop_single_polygon(
shape,
reference_year,
res_arcsec,
data_dir,
gpw_version,
exponents,
region_id=region_id,
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
)

# disaggregate total value proportional to LitPop values:
if isinstance(total_value, (float, int)):
Expand Down Expand Up @@ -673,8 +691,9 @@ def _get_litpop_single_polygon(polygon, reference_year, res_arcsec, data_dir,
Defining power with which lit (nightlights) and pop (gpw) go into LitPop. To get
nightlights^3 without population count: (3, 0). To use population count alone: (0, 1).
region_id : int, optional
if provided, region_id of gdf is set to value.
The default is None, this implies that region_id is not set.
if provided, region_id of gdf is set to value. The default is ``None``, in which
case the region is determined automatically for every location of the resulting
GeoDataFrame.
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
verbose : bool, optional
Enable verbose logging about the used GPW version and reference year as well as about the
boundary case where no grid points from the GPW grid are contained in the specified
Expand Down Expand Up @@ -755,18 +774,19 @@ def _get_litpop_single_polygon(polygon, reference_year, res_arcsec, data_dir,
meta_out['width'],
meta_out['height'])
# init GeoDataFrame from data and coordinates:
gdf = geopandas.GeoDataFrame({'value': litpop_array.flatten()}, crs=meta_out['crs'],
geometry=geopandas.points_from_xy(
np.round_(lon.flatten(), decimals = 8, out = None),
np.round_(lat.flatten(), decimals = 8, out = None)
)
)
gdf['latitude'] = np.round_(lat.flatten(), decimals = 8, out = None)
gdf['longitude'] = np.round_(lon.flatten(), decimals = 8, out = None)
latitude = np.round(lat.flatten(), decimals=8)
longitude = np.round(lon.flatten(), decimals=8)
gdf = geopandas.GeoDataFrame(
{"value": litpop_array.flatten(), "latitude": latitude, "longitude": longitude},
crs=meta_out['crs'],
geometry=geopandas.points_from_xy(longitude, latitude),
)
if region_id is not None: # set region_id
gdf['region_id'] = region_id
else:
gdf['region_id'] = u_coord.get_country_code(gdf.latitude, gdf.longitude)
gdf['region_id'] = u_coord.get_country_code(
gdf.latitude, gdf.longitude, gridded=True
)
# remove entries outside polygon with `dropna` and return GeoDataFrame:
return gdf.dropna(), meta_out

Expand Down Expand Up @@ -953,8 +973,10 @@ def reproject_input_data(data_array_list, meta_list,
meta_out['width'] = data_out_list[-1].shape[1]
return data_out_list, meta_out

def gridpoints_core_calc(data_arrays, offsets=None, exponents=None,
total_val_rescale=None):

def gridpoints_core_calc(
data_arrays, offsets=None, exponents=None, total_val_rescale=None
):
"""
Combines N dense numerical arrays by point-wise multipilcation and
optionally rescales to new total value:
Expand Down Expand Up @@ -992,47 +1014,49 @@ def gridpoints_core_calc(data_arrays, offsets=None, exponents=None,

Returns
-------
result_array : np.array of same shape as arrays in data_arrays
result : np.array of same shape as arrays in data_arrays
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really nitpicking, but result is a terrible variable name ^^.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but result_array was even worse. Do you have something better? The name can also be omitted here:

Suggested change
result : np.array of same shape as arrays in data_arrays
np.array of same shape as arrays in data_arrays

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the variable name still would be there. Maybe convoluted_array since this is essentially doing a convolution? Or rescaled_product ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about leaving out the result variable and storing everything in the data variable? I think that the new variable definition actually is not needed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also fine. Although data is not a better name than result.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then I'll do that. My goal is not to refactor this function 😅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 620d346

Results from calculation described above.
"""
# convert input data to arrays
# check integrity of data_array input (length and type):
try:
data_arrays = [np.array(data) for data in data_arrays]
if len(list({data.shape for data in data_arrays})) > 1:
raise ValueError("Elements in data_arrays don't agree in shape.")
except AttributeError as err:
raise TypeError("data_arrays or contained elements have wrong type.") from err

# if None, defaults for offsets and exponents are set:
if offsets is None:
offsets = np.zeros(len(data_arrays))
if exponents is None:
exponents = np.ones(len(data_arrays))
if np.min(offsets) < 0:
raise ValueError("offset values < 0 are not allowed, because negative offset "
"values produce undesired negative disaggregation values.")
if np.min(exponents) < 0:
raise ValueError("exponents < 0 are not allowed, because negative exponents "
"produce 'inf' values where the initial value is 0.")
# Convert input data to arrays
# NOTE: Setting dtype will cause an error if shapes are not compatible
data = np.asarray(data_arrays, dtype=float)

def prepare_input(input_data, default_value, name):
"""Prepare the offset and exposure inputs

If they are ``None``, use the default values. If not, cast them to an array and
ensure that the array has only one dimension. Finally, check if the values are
positive.
"""
if input_data is None:
return default_value
peanutfun marked this conversation as resolved.
Show resolved Hide resolved

ret = np.asarray(input_data, dtype=float)
peanutfun marked this conversation as resolved.
Show resolved Hide resolved
if ret.ndim > 1:
raise ValueError(f"Provide {name} as 1D array or list of scalars")
if np.any(ret < 0):
raise ValueError(
f"{name} values < 0 are not allowed, because negative {name} "
"values produce undesired negative disaggregation values."
)
return ret

offsets = prepare_input(offsets, 0, "offsets")
exponents = prepare_input(exponents, 1, "exponents")

# Steps 1-3: arrays are multiplied after application of offets and exponents:
# (arrays are converted to float to prevent ValueError:
# "Integers to negative integer powers are not allowed.")
result_array = np.power(np.array(data_arrays[0]+offsets[0], dtype=float),
exponents[0])
for i in np.arange(1, len(data_arrays)):
result_array = np.multiply(result_array,
np.power(np.array(data_arrays[i]+offsets[i], dtype=float),
exponents[i])
)

# Steps 4+5: if total value for rescaling is provided, result_array is normalized and
data = data.T # Transpose so that broadcasting over last dimension works
data = (data + offsets) ** exponents
result = data.prod(axis=-1).T # Transpose back

# Steps 4+5: if total value for rescaling is provided, result is normalized and
# scaled with this total value (total_val_rescale):
if isinstance(total_val_rescale, (float, int)):
return np.divide(result_array, result_array.sum()) * total_val_rescale
return result / result.sum() * total_val_rescale
if total_val_rescale is not None:
raise TypeError("total_val_rescale must be int or float.")
return result_array
return result


# The following functions are only required if calc_admin1 is True,
# not for core LitPop. They are maintained here mainly for backward compatibility
Expand Down
6 changes: 2 additions & 4 deletions climada/entity/exposures/test/test_litpop.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,11 @@ def test_gridpoints_core_calc_input_errors(self):
lp.gridpoints_core_calc(data_arrays_demo(4))

# wrong format:
with self.assertRaises(TypeError):
with self.assertRaises(ValueError):
lp.gridpoints_core_calc(data, exponents=['a', 'b'])
data.append('hello i am a string')
with self.assertRaises(ValueError):
lp.gridpoints_core_calc(data)
with self.assertRaises(TypeError):
lp.gridpoints_core_calc(777)

def test_gridpoints_core_calc_default_1(self):
"""test function gridpoints_core_calc, i.e. core data combination
Expand Down Expand Up @@ -374,4 +372,4 @@ def test_get_value_unit_pass(self):
if __name__ == "__main__":
TESTS = unittest.TestLoader().loadTestsFromTestCase(TestLitPop)
# TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncertainty))
unittest.TextTestRunner(verbosity=2).run(TESTS)
unittest.TextTestRunner(verbosity=2).run(TESTS)