
Commit

add memory calculation
ravinkohli committed May 3, 2022
1 parent 37105d4 commit c2a98c9
Showing 4 changed files with 29 additions and 7 deletions.
2 changes: 2 additions & 0 deletions autoPyTorch/data/tabular_validator.py
@@ -104,6 +104,8 @@ def _compress_dataset(
y=y,
is_classification=self.is_classification,
random_state=self.seed,
categorical_columns=self.feature_validator.categorical_columns,
n_categories_per_cat_column=self.feature_validator.num_categories_per_col,
**self.dataset_compression # type: ignore [arg-type]
)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
30 changes: 24 additions & 6 deletions autoPyTorch/data/utils.py
@@ -458,9 +458,7 @@ def _subsample_by_indices(
y = y[indices_to_keep]
return X, y


-def megabytes(arr: DatasetCompressionInputType) -> float:

+def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float:
if isinstance(arr, np.ndarray):
memory_in_bytes = arr.nbytes
elif issparse(arr):
@@ -471,14 +469,33 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}")

-return float(memory_in_bytes / (2**20))
+return memory_in_bytes

def get_approximate_mem_usage_in_mb(
arr: DatasetCompressionInputType,
categorical_columns: List,
n_categories_per_cat_column: Optional[List[int]] = None
) -> float:

multiplier = np.zeros(1, dtype=arr.dtype).itemsize
width = arr.shape[1] - len(categorical_columns)
# add the number of categories of each categorical column to the width, to approximate the memory needed after one-hot encoding
if len(categorical_columns) > 0:
if n_categories_per_cat_column is None:
raise ValueError("Value number of categories per categorical is required when the data has categorical columns")

width += sum(n_categories_per_cat_column)

return float(multiplier * arr.shape[0] * width / (2**20))
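
For intuition, here is a minimal standalone sketch of the same estimate (toy shapes and cardinalities, assumed for illustration and not part of the commit): the width counts every numerical column once and every categorical column once per category, so the result approximates the footprint after one-hot encoding rather than the raw buffer size that get_raw_memory_usage reports.

import numpy as np

# toy data: 100,000 rows, 10 numerical columns and 2 categorical columns
X = np.zeros((100_000, 12), dtype=np.float64)
categorical_columns = [10, 11]           # column indices of the categorical features
n_categories_per_cat_column = [5, 30]    # assumed cardinalities of the two categorical columns

raw_mb = X.nbytes / (2**20)              # raw buffer size in MB (the bytes returned by get_raw_memory_usage, converted)
itemsize = np.zeros(1, dtype=X.dtype).itemsize
width = X.shape[1] - len(categorical_columns) + sum(n_categories_per_cat_column)
approx_mb = itemsize * X.shape[0] * width / (2**20)

print(f"raw: {raw_mb:.1f} MB, approximate after one-hot encoding: {approx_mb:.1f} MB")  # ~9.2 MB vs ~34.3 MB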


def reduce_dataset_size_if_too_large(
X: DatasetCompressionInputType,
memory_allocation: Union[int, float],
is_classification: bool,
random_state: Union[int, np.random.RandomState],
categorical_columns: List,
n_categories_per_cat_column: Optional[List[int]] = None,
y: Optional[SupportedTargetTypes] = None,
methods: List[str] = ['precision', 'subsample'],
) -> DatasetCompressionInputType:
@@ -524,7 +541,7 @@ def reduce_dataset_size_if_too_large(
"""

for method in methods:
-if megabytes(X) <= memory_allocation:
+if get_approximate_mem_usage_in_mb(X, categorical_columns, n_categories_per_cat_column) <= memory_allocation:
break

if method == 'precision':
@@ -540,7 +557,8 @@ def reduce_dataset_size_if_too_large(
# into the allocated memory, we subsample it so that it does

n_samples_before = X.shape[0]
-sample_percentage = memory_allocation / megabytes(X)
+sample_percentage = memory_allocation / get_approximate_mem_usage_in_mb(
+X, categorical_columns, n_categories_per_cat_column)

# NOTE: type ignore
#
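When the estimate still exceeds the allocation after the precision step, the subsample fraction is the ratio of the allowed memory to the estimated footprint. Continuing the toy numbers from the sketch above (assumed values, not from the commit):

memory_allocation = 10.0                             # MB granted to the dataset (assumed)
approx_mb = 34.3                                     # estimate from the sketch above
sample_percentage = memory_allocation / approx_mb    # ~0.29, i.e. keep roughly 29% of the rows
n_samples_after = int(100_000 * sample_percentage)   # 29154 rows would be kept in this toy case
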
@@ -68,6 +68,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
self.backbone = self.build_backbone(
input_shape=input_shape,
)
self.logger.debug(f"After building backbone Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}")

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
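The new debug line reads psutil's system-wide counters; a minimal standalone equivalent (illustrative only, using print instead of the pipeline's logger):

import psutil

vm = psutil.virtual_memory()   # system-wide virtual memory statistics, all values in bytes
print(f"available: {vm.available / 1024 / 1024:.0f} MB, total: {vm.total / 1024 / 1024:.0f} MB")
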
@@ -1,5 +1,5 @@
import copy
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, List
import logging.handlers
import time
import psutil
