Allow Users to Specify Data Type (is_multilabel) in Config File #341

Draft · wants to merge 5 commits into base: master
83 changes: 72 additions & 11 deletions libmultilabel/linear/linear.py
@@ -27,11 +27,13 @@ def __init__(
weights: np.matrix,
bias: float,
thresholds: float | np.ndarray,
is_multilabel: bool,
):
self.name = name
self.weights = weights
self.bias = bias
self.thresholds = thresholds
self.is_multilabel = is_multilabel

def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
"""Calculates the decision values associated with x.
@@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
return (x * self.weights).A + self.thresholds


def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
"""Trains a linear model for multiabel data using a one-vs-rest strategy.
def train_1vsrest(
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear classification model with one-vs-rest strategy.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
yi = y[:, i].toarray().reshape(-1)
weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()

return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="1vsrest",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
@@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat


def train_thresholding(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multi-label data using a one-vs-rest strategy
and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
@@ -160,6 +179,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -189,7 +209,13 @@
weights[:, i] = w.ravel()
thresholds[i] = t

return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="thresholding",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
is_multilabel=is_multilabel,
)


def _micromacro_one_label(
@@ -361,7 +387,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:


def train_cost_sensitive(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
@@ -373,6 +403,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -394,7 +425,13 @@
w = _cost_sensitive_one_label(2 * yi - 1, x, options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray:
@@ -454,7 +491,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np.


def train_cost_sensitive_micro(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
@@ -466,6 +507,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -510,23 +552,36 @@ def train_cost_sensitive_micro(
w = _do_train(2 * yi - 1, x, final_options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive_micro",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def train_binary_and_multiclass(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for binary and multi-class data.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Returns:
A model which can be used in predict_values.
"""
if is_multilabel:
raise ValueError("binary_and_multiclass doesn't support multilabel data.")
x, options, bias = _prepare_options(x, options)
num_instances, num_labels = y.shape
nonzero_instance_ids, nonzero_label_ids = y.nonzero()
@@ -556,7 +611,13 @@ def train_binary_and_multiclass(
# For labels not appeared in training, assign thresholds to -inf so they won't be predicted.
thresholds = np.full(num_labels, -np.inf)
thresholds[train_labels] = 0
return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="binary_and_multiclass",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
is_multilabel=is_multilabel,
)


def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
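
Note on the API change above: every train_* function in linear.py now takes is_multilabel as a third positional argument, and the returned FlatModel carries the flag. A minimal usage sketch, assuming this branch is installed; the toy matrices below are made up for illustration and are not part of the diff:

import numpy as np
import scipy.sparse as sparse
import libmultilabel.linear as linear

# Toy multilabel data: 4 instances, 3 features, 2 labels.
x_train = sparse.csr_matrix(np.array([[1.0, 0.0, 0.5],
                                      [0.0, 1.0, 0.2],
                                      [1.0, 1.0, 0.0],
                                      [0.0, 0.0, 1.0]]))
y_train = sparse.csr_matrix(np.array([[1, 0],
                                      [0, 1],
                                      [1, 1],
                                      [0, 1]]))

# is_multilabel is the new third argument; options and verbose keep their old meaning.
model = linear.train_1vsrest(y_train, x_train, True, options="", verbose=False)
decision_values = linear.predict_values(model, x_train)  # ndarray of shape (4, 2)
assert model.is_multilabel
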
6 changes: 4 additions & 2 deletions libmultilabel/linear/tree.py
@@ -48,6 +48,7 @@ def __init__(
self.root = root
self.flat_model = flat_model
self.weight_map = weight_map
self.is_multilabel = True

def predict_values(
self,
@@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
node (Node): Node to be trained.
"""
if node.isLeaf():
node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
node.model = linear.train_1vsrest(y[:, node.label_map], x, True, options, False)
else:
# meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
# getnnz returns an ndarray of shape number of instances.
# This must be reshaped into number of instances * 1 to be interpreted as a column.
meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
meta_y = sparse.csr_matrix(np.hstack(meta_y))
node.model = linear.train_1vsrest(meta_y, x, options, False)
node.model = linear.train_1vsrest(meta_y, x, True, options, False)

node.model.weights = sparse.csc_matrix(node.model.weights)

@@ -250,6 +251,7 @@ def visit(node):
weights=sparse.hstack(weights, "csr"),
bias=bias,
thresholds=0,
is_multilabel=True,
)

# w.shape[1] is the number of labels/metalabels of each node
22 changes: 19 additions & 3 deletions linear_trainer.py
@@ -5,13 +5,15 @@
from tqdm import tqdm

import libmultilabel.linear as linear
from libmultilabel.common_utils import dump_log
from libmultilabel.common_utils import dump_log, is_multiclass_dataset
from libmultilabel.linear.utils import LINEAR_TECHNIQUES


def linear_test(config, model, datasets, label_mapping):
metrics = linear.get_metrics(
config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
config.monitor_metrics,
datasets["test"]["y"].shape[1],
multiclass=not model.is_multilabel,
)
num_instance = datasets["test"]["x"].shape[0]
k = config.save_k_predictions
@@ -37,7 +39,20 @@ def linear_test(config, model, datasets, label_mapping):


def linear_train(datasets, config):
# detect task type
is_multilabel = config.get("is_multilabel", "auto")
if is_multilabel == "auto":
is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
elif not isinstance(is_multilabel, bool):
raise ValueError(
f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
)

# train
if config.linear_technique == "tree":
if not is_multilabel:
raise ValueError("Tree model should only be used with multilabel datasets.")

model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
@@ -49,7 +64,8 @@ def linear_train(datasets, config):
model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
config.liblinear_options,
is_multilabel=is_multilabel,
options=config.liblinear_options,
)
return model

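
For reviewers, a standalone sketch of the resolution logic added to linear_train above. The detection under "auto" is a simplified stand-in for libmultilabel.common_utils.is_multiclass_dataset, which is the function the PR actually calls:

import scipy.sparse as sparse


def resolve_is_multilabel(config_value, y_train: sparse.csr_matrix) -> bool:
    """Mirror of the branch added to linear_train: "auto" falls back to detection."""
    if config_value == "auto":
        # Simplified detection: multilabel iff some instance has more than one positive label.
        return bool((y_train.getnnz(axis=1) > 1).any())
    if not isinstance(config_value, bool):
        raise ValueError(
            f'"is_multilabel" is expected to be either "auto", True, or False. But got "{config_value}".'
        )
    return config_value

# As in linear_train, the tree technique is then rejected for non-multilabel data:
# if config.linear_technique == "tree" and not is_multilabel:
#     raise ValueError("Tree model should only be used with multilabel datasets.")
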
10 changes: 9 additions & 1 deletion torch_trainer.py
@@ -65,7 +65,15 @@ def __init__(
else:
self.datasets = datasets

self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
# detect task type
is_multilabel = self.config.get("is_multilabel", "auto")
if is_multilabel == "auto":
self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
elif not isinstance(is_multilabel, bool):
raise ValueError(
f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
)

self._setup_model(
classes=classes,
word_dict=word_dict,
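
On the neural path the same three values are accepted, but the result is ultimately consumed as config.multiclass. The hunk above only sets config.multiclass in the "auto" branch and validates the type otherwise, so the explicit-boolean mapping in the sketch below (multiclass = not is_multilabel) is an assumption about the intended behavior; the dict-based config is likewise a placeholder for the trainer's real config object:

def resolve_multiclass(config: dict, detected_multiclass: bool) -> bool:
    """Sketch: derive the multiclass flag used by get_metrics from is_multilabel."""
    value = config.get("is_multilabel", "auto")
    if value == "auto":
        return detected_multiclass  # keep the existing auto-detection result
    if not isinstance(value, bool):
        raise ValueError(
            f'"is_multilabel" is expected to be either "auto", True, or False. But got "{value}".'
        )
    return not value  # assumed mapping: an explicit flag overrides detection


print(resolve_multiclass({"is_multilabel": "auto"}, detected_multiclass=True))  # True
print(resolve_multiclass({"is_multilabel": True}, detected_multiclass=True))    # False
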