diff --git a/doc/progress.rst b/doc/progress.rst
index 3fc493914..6fed41326 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -10,6 +10,7 @@ next
 ~~~~~~
 
 * MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file.
+* ADD #716: add documentation for remaining attributes of classes and functions.
 
 0.14.1
 ~~~~~~
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index 82d202e9c..4c7a8912d 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -2101,6 +2101,21 @@ def instantiate_model_from_hpo_class(
         return base_estimator
 
     def _extract_trace_data(self, model, rep_no, fold_no):
+        """Extracts data from a machine learning model's cross-validation results
+        and creates an ARFF (Attribute-Relation File Format) trace.
+
+        Parameters
+        ----------
+        model : Any
+            A fitted hyperparameter optimization model.
+        rep_no : int
+            The repetition number.
+        fold_no : int
+            The fold number.
+        Returns
+        -------
+        A list of ARFF trace content.
+        """
         arff_tracecontent = []
         for itt_no in range(0, len(model.cv_results_["mean_test_score"])):
             # we use the string values for True and False, as it is defined in
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index b9752e77c..4831eb6a7 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -523,6 +523,19 @@ def get_subflow(self, structure):
 
 
 def _copy_server_fields(source_flow, target_flow):
+    """Recursively copies the fields added by the server
+    from the `source_flow` to the `target_flow`.
+
+    Parameters
+    ----------
+    source_flow : OpenMLFlow
+        The flow to copy the fields from.
+    target_flow : OpenMLFlow
+        The flow to copy the fields to.
+    Returns
+    -------
+    None
+    """
     fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"]
     for field in fields_added_by_the_server:
         setattr(target_flow, field, getattr(source_flow, field))
@@ -533,5 +546,19 @@ def _copy_server_fields(source_flow, target_flow):
 
 
 def _add_if_nonempty(dic, key, value):
+    """Adds a key-value pair to a dictionary if the value is not None.
+
+    Parameters
+    ----------
+    dic: dict
+        The dictionary to add the key-value pair to.
+    key: hashable
+        The key to add to the dictionary.
+    value: Any
+        The value to add to the dictionary.
+    Returns
+    -------
+    None
+    """
     if value is not None:
         dic[key] = value
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index c4faded0a..45eea42dc 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -337,6 +337,20 @@ def get_flow_id(
 
 
 def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]:
+    """Retrieve information about flows from the OpenML API
+    and parse it to a dictionary or a Pandas DataFrame.
+
+    Parameters
+    ----------
+    api_call: str
+        The API call for listing the flows.
+    output_format: str in {"dict", "dataframe"}
+        The output format.
+
+    Returns
+    -------
+    The flows information in the specified output format.
+ """ xml_string = openml._api_calls._perform_api_call(api_call, "get") flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ee582dbb7..5e31ed370 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -128,6 +128,19 @@ def run_model_on_task( flow = extension.model_to_flow(model) def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: + """Retrieve an OpenMLTask object from either an integer or string ID, + or directly from an OpenMLTask object. + + Parameters + ---------- + task : Union[int, str, OpenMLTask] + The task ID or the OpenMLTask object. + + Returns + ------- + OpenMLTask + The OpenMLTask object. + """ if isinstance(task, (int, str)): return get_task(int(task)) else: @@ -451,6 +464,32 @@ def _run_task_get_arffcontent( "OrderedDict[str, OrderedDict]", "OrderedDict[str, OrderedDict]", ]: + """Runs the hyperparameter optimization on the given task + and returns the arfftrace content. + + Parameters + ---------- + model : Any + The model that is to be evalauted. + task : OpenMLTask + The OpenMLTask to evaluate. + extension : Extension + The OpenML extension object. + add_local_measures : bool + Whether to compute additional local evaluation measures. + dataset_format : str + The format in which to download the dataset. + n_jobs : int + Number of jobs to run in parallel. + If None, use 1 core by default. If -1, use all available cores. + + Returns + ------- + Tuple[List[List], Optional[OpenMLRunTrace], + OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] + A tuple containing the arfftrace content, + the OpenML run trace, the global and local evaluation measures. + """ arff_datacontent = [] # type: List[List] traces = [] # type: List[OpenMLRunTrace] # stores fold-based evaluation measures. In case of a sample based task, @@ -636,6 +675,36 @@ def _run_task_get_arffcontent_parallel_helper( Optional[OpenMLRunTrace], "OrderedDict[str, float]", ]: + """Helper function that runs a single model on a single task fold sample. + + Parameters + ---------- + extension : Extension + An OpenML extension instance. + fold_no : int + The fold number to be run. + model : Any + The model that is to be evaluated. + rep_no : int + Repetition number to be run. + sample_no : int + Sample number to be run. + task : OpenMLTask + The task object from OpenML. + dataset_format : str + The dataset format to be used. + configuration : Dict + Hyperparameters to configure the model. + + Returns + ------- + Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame], + Optional[OpenMLRunTrace], OrderedDict[str, float]] + A tuple containing the predictions, probability estimates (if applicable), + actual target values, actual target value probabilities (if applicable), + the trace object of the OpenML run (if applicable), + and a dictionary of local measures for this particular fold. 
+ """ # Sets up the OpenML instantiated in the child process to match that of the parent's # if configuration=None, loads the default config._setup(configuration) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index f6b038a55..1b2057c9f 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -4,7 +4,7 @@ from dataclasses import dataclass import json import os -from typing import List, Tuple, Optional # noqa F401 +from typing import List, Tuple, Optional, Dict, Union # noqa F401 import arff import xmltodict @@ -19,6 +19,82 @@ ] +@dataclass +class OpenMLTraceIteration: + """ + OpenML Trace Iteration: parsed output from Run Trace call + Exactly one of `setup_string` or `parameters` must be provided. + + Parameters + ---------- + repeat : int + repeat number (in case of no repeats: 0) + + fold : int + fold number (in case of no folds: 0) + + iteration : int + iteration number of optimization procedure + + setup_string : str, optional + json string representing the parameters + If not provided, ``parameters`` should be set. + + evaluation : double + The evaluation that was awarded to this trace iteration. + Measure is defined by the task + + selected : bool + Whether this was the best of all iterations, and hence + selected for making predictions. Per fold/repeat there + should be only one iteration selected + + parameters : OrderedDict, optional + Dictionary specifying parameter names and their values. + If not provided, ``setup_string`` should be set. + """ + + repeat: int + fold: int + iteration: int + + evaluation: float + selected: bool + + setup_string: Optional[str] = None + parameters: Optional[OrderedDict] = None + + def __post_init__(self): + # TODO: refactor into one argument of type + if self.setup_string and self.parameters: + raise ValueError( + "Can only be instantiated with either `setup_string` or `parameters` argument." + ) + elif not (self.setup_string or self.parameters): + raise ValueError( + "Either `setup_string` or `parameters` needs to be passed as argument." + ) + if self.parameters is not None and not isinstance(self.parameters, OrderedDict): + raise TypeError( + "argument parameters is not an instance of OrderedDict, but %s" + % str(type(self.parameters)) + ) + + def get_parameters(self): + result = {} + # parameters have prefix 'parameter_' + + if self.setup_string: + for param in self.setup_string: + key = param[len(PREFIX) :] + value = self.setup_string[param] + result[key] = json.loads(value) + else: + for param, value in self.parameters.items(): + result[param[len(PREFIX) :]] = value + return result + + class OpenMLRunTrace(object): """OpenML Run Trace: parsed output from Run Trace call @@ -33,7 +109,20 @@ class OpenMLRunTrace(object): """ - def __init__(self, run_id, trace_iterations): + def __init__( + self, + run_id: Union[int, None], + trace_iterations: Dict[Tuple[int, int, int], OpenMLTraceIteration], + ): + """Object to hold the trace content of a run. + + Parameters + ---------- + run_id : int + Id for which the trace content is to be stored. + trace_iterations : List[List] + The trace content obtained by running a flow on a task. + """ self.run_id = run_id self.trace_iterations = trace_iterations @@ -228,6 +317,24 @@ def trace_from_arff(cls, arff_obj): @classmethod def _trace_from_arff_struct(cls, attributes, content, error_message): + """Generate a trace dictionary from ARFF structure. + + Parameters + ---------- + cls : type + The trace object to be created. + attributes : List[Tuple[str, str]] + Attribute descriptions. 
+        content : List[List[Union[int, float, str]]]
+            List of instances.
+        error_message : str
+            Error message to raise if `setup_string` is in `attributes`.
+
+        Returns
+        -------
+        OrderedDict
+            A dictionary representing the trace.
+        """
         trace = OrderedDict()
 
         attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)}
@@ -345,6 +452,26 @@ def trace_from_xml(cls, xml):
 
     @classmethod
     def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace":
+        """Merge multiple traces into a single trace.
+
+        Parameters
+        ----------
+        cls : type
+            Type of the trace object to be created.
+        traces : List[OpenMLRunTrace]
+            List of traces to merge.
+
+        Returns
+        -------
+        OpenMLRunTrace
+            A trace object representing the merged traces.
+
+        Raises
+        ------
+        ValueError
+            If the parameters in the iterations of the traces being merged are not equal.
+            If a key (repeat, fold, iteration) is encountered twice while merging the traces.
+        """
         merged_trace = (
             OrderedDict()
         )  # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration]  # noqa E501
@@ -384,79 +511,3 @@ def __repr__(self):
     def __iter__(self):
         for val in self.trace_iterations.values():
             yield val
-
-
-@dataclass
-class OpenMLTraceIteration:
-    """
-    OpenML Trace Iteration: parsed output from Run Trace call
-    Exactly one of `setup_string` or `parameters` must be provided.
-
-    Parameters
-    ----------
-    repeat : int
-        repeat number (in case of no repeats: 0)
-
-    fold : int
-        fold number (in case of no folds: 0)
-
-    iteration : int
-        iteration number of optimization procedure
-
-    setup_string : str, optional
-        json string representing the parameters
-        If not provided, ``parameters`` should be set.
-
-    evaluation : double
-        The evaluation that was awarded to this trace iteration.
-        Measure is defined by the task
-
-    selected : bool
-        Whether this was the best of all iterations, and hence
-        selected for making predictions. Per fold/repeat there
-        should be only one iteration selected
-
-    parameters : OrderedDict, optional
-        Dictionary specifying parameter names and their values.
-        If not provided, ``setup_string`` should be set.
-    """
-
-    repeat: int
-    fold: int
-    iteration: int
-
-    evaluation: float
-    selected: bool
-
-    setup_string: Optional[str] = None
-    parameters: Optional[OrderedDict] = None
-
-    def __post_init__(self):
-        # TODO: refactor into one argument of type
-        if self.setup_string and self.parameters:
-            raise ValueError(
-                "Can only be instantiated with either `setup_string` or `parameters` argument."
-            )
-        elif not (self.setup_string or self.parameters):
-            raise ValueError(
-                "Either `setup_string` or `parameters` needs to be passed as argument."
-            )
-        if self.parameters is not None and not isinstance(self.parameters, OrderedDict):
-            raise TypeError(
-                "argument parameters is not an instance of OrderedDict, but %s"
-                % str(type(self.parameters))
-            )
-
-    def get_parameters(self):
-        result = {}
-        # parameters have prefix 'parameter_'
-
-        if self.setup_string:
-            for param in self.setup_string:
-                key = param[len(PREFIX) :]
-                value = self.setup_string[param]
-                result[key] = json.loads(value)
-        else:
-            for param, value in self.parameters.items():
-                result[param[len(PREFIX) :]] = value
-        return result
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 52969fb8c..bc6d21aaa 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -60,8 +60,24 @@ def setup_exists(flow) -> int:
     return setup_id if setup_id > 0 else False
 
 
-def _get_cached_setup(setup_id):
-    """Load a run from the cache."""
+def _get_cached_setup(setup_id: int):
+    """Load a setup from the cache.
+
+    Parameters
+    ----------
+    setup_id : int
+        ID of the setup to be loaded.
+
+    Returns
+    -------
+    OpenMLSetup
+        The loaded setup object.
+
+    Raises
+    ------
+    OpenMLCacheException
+        If the setup file for the given setup ID is not cached.
+    """
     cache_dir = config.get_cache_directory()
     setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id))
     try:
@@ -271,9 +287,24 @@ def initialize_model(setup_id: int) -> Any:
     return model
 
 
-def _to_dict(flow_id, openml_parameter_settings):
+def _to_dict(flow_id: int, openml_parameter_settings) -> OrderedDict:
+    """Convert a flow ID and a list of OpenML parameter settings to
+    a dictionary representation that can be serialized to XML.
+
+    Parameters
+    ----------
+    flow_id : int
+        ID of the flow.
+    openml_parameter_settings : List[OpenMLParameter]
+        A list of OpenML parameter settings.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary representation of the flow ID and parameter settings.
+    """
     # for convenience, this function (ab)uses the run object.
-    xml = OrderedDict()
+    xml: OrderedDict = OrderedDict()
     xml["oml:run"] = OrderedDict()
     xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml"
     xml["oml:run"]["oml:flow_id"] = flow_id
@@ -319,6 +350,9 @@ def _create_setup_from_xml(result_dict, output_format="object"):
 
 
 def _create_setup_parameter_from_xml(result_dict, output_format="object"):
+    """
+    Create an OpenMLParameter object or a dictionary from an API xml result.
+    """
     if output_format == "object":
         return OpenMLParameter(
             input_id=int(result_dict["oml:id"]),
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 7b72a31eb..05d100ccd 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -107,6 +107,20 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
         tags.append(current_tag)
 
     def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]:
+        """Extracts a list of nested IDs from a result dictionary.
+
+        Parameters
+        ----------
+        key : str
+            The key in the result dictionary under which the nested IDs are stored.
+        subkey : str
+            The subkey under which the individual nested IDs are listed.
+
+        Returns
+        -------
+        Optional[List]
+            A list of nested OpenML IDs, or None if the key is not present in the dictionary.
+        """
         if result_dict.get(key) is not None:
             return [int(oml_id) for oml_id in result_dict[key][subkey]]
         return None
@@ -591,6 +605,23 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]:
 
 
 def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]:
+    """Retrieves the list of OpenML studies and
+    returns it in a dictionary or a Pandas DataFrame.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call for retrieving the list of OpenML studies.
+    output_format : str in {"object", "dataframe"}
+        Format of the output, either 'object' for a dictionary
+        or 'dataframe' for a Pandas DataFrame.
+
+    Returns
+    -------
+    Union[Dict, pd.DataFrame]
+        A dictionary or Pandas DataFrame of OpenML studies,
+        depending on the value of 'output_format'.
+    """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
     study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
 
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 00a8e822d..41d8d0197 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -230,6 +230,28 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs):
 
 
 def __list_tasks(api_call, output_format="dict"):
+    """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call specifying which tasks to return.
+    output_format : str in {"dict", "dataframe"}
+        Output format for the returned object.
+
+    Returns
+    -------
+    Union[Dict, pd.DataFrame]
+        A dictionary or a Pandas DataFrame with information about OpenML tasks.
+
+    Raises
+    ------
+    ValueError
+        If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
+        or has an incorrect value for '@xmlns:oml'.
+    KeyError
+        If an invalid key is found in the XML for a task.
+    """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
     tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
     # Minimalistic check if the XML is useful
diff --git a/openml/tasks/split.py b/openml/tasks/split.py
index e47c6040a..8112ba41b 100644
--- a/openml/tasks/split.py
+++ b/openml/tasks/split.py
@@ -136,9 +136,48 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit":
         return cls(name, "", repetitions)
 
     def from_dataset(self, X, Y, folds, repeats):
+        """Generates a new OpenML split from input data and cross-validation settings.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix
+            The input feature matrix.
+        Y : array-like
+            The target variable values.
+        folds : int
+            Number of cross-validation folds to generate.
+        repeats : int
+            Number of times to repeat the cross-validation process.
+
+        Raises
+        ------
+        NotImplementedError
+            This method is not implemented yet.
+        """
         raise NotImplementedError()
 
     def get(self, repeat=0, fold=0, sample=0):
+        """Returns the specified data split from the OpenMLSplit object.
+
+        Parameters
+        ----------
+        repeat : int
+            Index of the repeat to retrieve.
+        fold : int
+            Index of the fold to retrieve.
+        sample : int
+            Index of the sample to retrieve.
+
+        Returns
+        -------
+        numpy.ndarray
+            The data split for the specified repeat, fold, and sample.
+
+        Raises
+        ------
+        ValueError
+            If the specified repeat, fold, or sample is not known.
+        """
         if repeat not in self.split:
             raise ValueError("Repeat %s not known" % str(repeat))
         if fold not in self.split[repeat]:
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index 36e0ada1c..f205bd926 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -36,14 +36,24 @@ class OpenMLTask(OpenMLBase):
 
     Parameters
     ----------
-    task_type_id : TaskType
-        Refers to the type of task.
-    task_type : str
-        Refers to the task.
+    task_id: Union[int, None]
+        Refers to the unique identifier of the OpenML task.
+    task_type_id: TaskType
+        Refers to the type of the OpenML task.
+    task_type: str
+        Refers to the name of the OpenML task type.
     data_set_id: int
         Refers to the data.
     estimation_procedure_id: int
         Refers to the type of estimates used.
+    estimation_procedure_type: str, default=None
+        Refers to the type of estimation procedure used for the OpenML task.
+    estimation_parameters: Dict[str, str], default=None
+        Estimation parameters used for the OpenML task.
+    evaluation_measure: str, default=None
+        Refers to the evaluation measure.
+    data_splits_url: str, default=None
+        Refers to the URL of the data splits used for the OpenML task.
     """
 
     def __init__(
@@ -206,8 +216,26 @@ class OpenMLSupervisedTask(OpenMLTask, ABC):
 
     Parameters
    ----------
+    task_type_id : TaskType
+        ID of the task type.
+    task_type : str
+        Name of the task type.
+    data_set_id : int
+        ID of the OpenML dataset associated with the task.
     target_name : str
         Name of the target feature (the class variable).
+    estimation_procedure_id : int, default=None
+        ID of the estimation procedure for the task.
+    estimation_procedure_type : str, default=None
+        Type of the estimation procedure for the task.
+    estimation_parameters : dict, default=None
+        Estimation parameters for the task.
+    evaluation_measure : str, default=None
+        Name of the evaluation measure for the task.
+    data_splits_url : str, default=None
+        URL of the data splits for the task.
+    task_id: Union[int, None]
+        Refers to the unique identifier of the task.
     """
 
     def __init__(
@@ -309,8 +337,30 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
 
     Parameters
     ----------
-    class_labels : List of str (optional)
-    cost_matrix: array (optional)
+    task_type_id : TaskType
+        ID of the Classification task type.
+    task_type : str
+        Name of the Classification task type.
+    data_set_id : int
+        ID of the OpenML dataset associated with the Classification task.
+    target_name : str
+        Name of the target variable.
+    estimation_procedure_id : int, default=None
+        ID of the estimation procedure for the Classification task.
+    estimation_procedure_type : str, default=None
+        Type of the estimation procedure.
+    estimation_parameters : dict, default=None
+        Estimation parameters for the Classification task.
+    evaluation_measure : str, default=None
+        Name of the evaluation measure.
+    data_splits_url : str, default=None
+        URL of the data splits for the Classification task.
+    task_id : Union[int, None]
+        ID of the Classification task (if it already exists on OpenML).
+    class_labels : List of str, default=None
+        A list of class labels (for classification tasks).
+    cost_matrix : array, default=None
+        A cost matrix (for classification tasks).
     """
 
     def __init__(
@@ -348,7 +398,31 @@ def __init__(
 
 
 class OpenMLRegressionTask(OpenMLSupervisedTask):
-    """OpenML Regression object."""
+    """OpenML Regression object.
+
+    Parameters
+    ----------
+    task_type_id : TaskType
+        Task type ID of the OpenML Regression task.
+    task_type : str
+        Task type of the OpenML Regression task.
+    data_set_id : int
+        ID of the OpenML dataset.
+    target_name : str
+        Name of the target feature used in the Regression task.
+    estimation_procedure_id : int, default=None
+        ID of the OpenML estimation procedure.
+    estimation_procedure_type : str, default=None
+        Type of the OpenML estimation procedure.
+    estimation_parameters : dict, default=None
+        Parameters used by the OpenML estimation procedure.
+    data_splits_url : str, default=None
+        URL of the OpenML data splits for the Regression task.
+    task_id : Union[int, None]
+        ID of the OpenML Regression task.
+    evaluation_measure : str, default=None
+        Evaluation measure used in the Regression task.
+ """ def __init__( self, @@ -382,7 +456,25 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- - target_name : str (optional) + task_type_id : TaskType + Task type ID of the OpenML clustering task. + task_type : str + Task type of the OpenML clustering task. + data_set_id : int + ID of the OpenML dataset used in clustering the task. + estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + task_id : Union[int, None] + ID of the OpenML clustering task. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure used in the clustering task. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. + data_splits_url : str, default=None + URL of the OpenML data splits for the clustering task. + evaluation_measure : str, default=None + Evaluation measure used in the clustering task. + target_name : str, default=None Name of the target feature (class) that is not part of the feature set for the clustering task. """ @@ -459,7 +551,35 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object.""" + """OpenML Learning Curve object. + + Parameters + ---------- + task_type_id : TaskType + ID of the Learning Curve task. + task_type : str + Name of the Learning Curve task. + data_set_id : int + ID of the dataset that this task is associated with. + target_name : str + Name of the target feature in the dataset. + estimation_procedure_id : int, default=None + ID of the estimation procedure to use for evaluating models. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Additional parameters for the estimation procedure. + data_splits_url : str, default=None + URL of the file containing the data splits for Learning Curve task. + task_id : Union[int, None] + ID of the Learning Curve task. + evaluation_measure : str, default=None + Name of the evaluation measure to use for evaluating models. + class_labels : list of str, default=None + Class labels for Learning Curve tasks. + cost_matrix : numpy array, default=None + Cost matrix for Learning Curve tasks. + """ def __init__( self,