openml · LennartPurucker · Oct 31, 2023 · Aug 16, 2023 · Aug 16, 2023 · Aug 16, 2023
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
@@ -2101,6 +2101,20 @@ def instantiate_model_from_hpo_class(
         return base_estimator
 
     def _extract_trace_data(self, model, rep_no, fold_no):
+        """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace.
+
+        Parameters
+        ----------
+        model : Any
+            A fitted hyperparameter optimization model.
+        rep_no : int
+            The repetition number.
+        fold_no : int
+            The fold number.
+        Returns
+        -------
+        A list of ARFF trace content.
+        """
         arff_tracecontent = []
         for itt_no in range(0, len(model.cv_results_["mean_test_score"])):
             # we use the string values for True and False, as it is defined in

diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -523,6 +523,18 @@ def get_subflow(self, structure):
 
 
 def _copy_server_fields(source_flow, target_flow):
+    """ Recursively copies the fields added by the server from the `source_flow` to the `target_flow`.
+
+    Parameters
+    ----------
+    source_flow : OpenMLFlow
+        The flow to copy the server-added fields from.
+    target_flow : OpenMLFlow
+        The flow to copy the server-added fields to.
+    Returns
+    -------
+    None
+    """
     fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"]
     for field in fields_added_by_the_server:
         setattr(target_flow, field, getattr(source_flow, field))
@@ -533,5 +545,19 @@ def _copy_server_fields(source_flow, target_flow):
 
 
 def _add_if_nonempty(dic, key, value):
+    """ Adds a key-value pair to a dictionary if the value is not None.
+
+    Parameters
+    ----------
+    dic: dict
+        The dictionary to add the key-value pair to.
+    key: hashable
+        The key under which the value is stored.
+    value: Any
+        The value to store; nothing is added if it is None.
+    Returns
+    -------
+    None
+    """
     if value is not None:
         dic[key] = value
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -337,6 +337,18 @@ def get_flow_id(
 
 
 def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]:
+    """
+    Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame.
+    Parameters
+    ----------
+    api_call: str
+        The API call used to retrieve the information about flows.
+    output_format: str in {"dict", "dataframe"}
+        The output format.
+    Returns
+    -------
+        The flows information in the specified output format.
+    """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
     flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))
 

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -128,6 +128,16 @@ def run_model_on_task(
     flow = extension.model_to_flow(model)
 
     def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask:
+        """ Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object.
+        Parameters
+        ----------
+        task : Union[int, str, OpenMLTask]
+            The task ID or the OpenMLTask object.
+        Returns
+        -------
+        OpenMLTask
+            The OpenMLTask object.
+        """
         if isinstance(task, (int, str)):
             return get_task(int(task))
         else:
@@ -451,6 +461,27 @@ def _run_task_get_arffcontent(
     "OrderedDict[str, OrderedDict]",
     "OrderedDict[str, OrderedDict]",
 ]:
+    """ Runs the model on the given task and returns the ARFF data content.
+    Parameters
+    ----------
+    model : Any
+        The model that is to be evaluated.
+    task : OpenMLTask
+        The OpenMLTask to evaluate.
+    extension : Extension
+        The OpenML extension object.
+    add_local_measures : bool
+        Whether to compute additional local evaluation measures.
+    dataset_format : str
+        The format in which to download the dataset.
+    n_jobs : int
+        Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores.
+
+    Returns
+    -------
+    Tuple[List[List], Optional[OpenMLRunTrace], OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]]
+        A tuple containing the ARFF data content, the OpenML run trace, the global and local evaluation measures.
+    """
     arff_datacontent = []  # type: List[List]
     traces = []  # type: List[OpenMLRunTrace]
     # stores fold-based evaluation measures. In case of a sample based task,
@@ -636,6 +667,35 @@ def _run_task_get_arffcontent_parallel_helper(
     Optional[OpenMLRunTrace],
     "OrderedDict[str, float]",
 ]:
+    """ Helper function that runs a single model on a single task fold sample.
+
+    Parameters
+    ----------
+    extension : Extension
+        An OpenML extension instance.
+    fold_no : int
+        The fold number to be run.
+    model : Any
+        The model that is to be evaluated.
+    rep_no : int
+        Repetition number to be run.
+    sample_no : int
+        Sample number to be run.
+    task : OpenMLTask
+        The task object from OpenML.
+    dataset_format : str
+        The dataset format to be used.
+    configuration : Dict
+        Hyperparameters to configure the model.
+
+    Returns
+    -------
+    Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame],
+           Optional[OpenMLRunTrace], OrderedDict[str, float]]
+        A tuple containing the predictions, probability estimates (if applicable), actual target
+        values, actual target value probabilities (if applicable), the trace object of the OpenML
+        run (if applicable), and a dictionary of local measures for this particular fold.
+    """
     # Sets up the OpenML instantiated in the child process to match that of the parent's
     # if configuration=None, loads the default
     config._setup(configuration)

diff --git a/openml/runs/trace.py b/openml/runs/trace.py
@@ -33,7 +33,17 @@ class OpenMLRunTrace(object):
 
     """
 
-    def __init__(self, run_id, trace_iterations):
+    def __init__(self, run_id: int, trace_iterations: List[List]):
+        """
+        Object to hold the trace content of a run.
+
+        Parameters
+        ----------
+        run_id : int
+            Id for which the trace content is to be stored.
+        trace_iterations : List[List]
+            The trace content obtained by running a flow on a task.
+        """
         self.run_id = run_id
         self.trace_iterations = trace_iterations
 
@@ -228,6 +238,24 @@ def trace_from_arff(cls, arff_obj):
 
     @classmethod
     def _trace_from_arff_struct(cls, attributes, content, error_message):
+        """ Generate a trace dictionary from ARFF structure.
+
+        Parameters
+        ----------
+        cls : type
+            The trace object to be created.
+        attributes : List[Tuple[str, str]]
+            Attribute descriptions.
+        content : List[List[Union[int, float, str]]]
+            List of instances.
+        error_message : str
+            Error message to raise if `setup_string` is in `attributes`.
+
+        Returns
+        -------
+        OrderedDict
+            A dictionary representing the trace.
+        """
         trace = OrderedDict()
         attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)}
 
@@ -345,6 +373,26 @@ def trace_from_xml(cls, xml):
 
     @classmethod
     def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace":
+        """Merge multiple traces into a single trace.
+
+        Parameters
+        ----------
+        cls : type
+            Type of the trace object to be created.
+        traces : List[OpenMLRunTrace]
+            List of traces to merge.
+
+        Returns
+        -------
+        OpenMLRunTrace
+            A trace object representing the merged traces.
+
+        Raises
+        ------
+        ValueError
+            If the parameters in the iterations of the traces being merged are not equal.
+            If a key (repeat, fold, iteration) is encountered twice while merging the traces.
+        """
         merged_trace = (
             OrderedDict()
         )  # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration]  # noqa E501

diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -60,8 +60,24 @@ def setup_exists(flow) -> int:
     return setup_id if setup_id > 0 else False
 
 
-def _get_cached_setup(setup_id):
-    """Load a run from the cache."""
+def _get_cached_setup(setup_id: int):
+    """Load a setup from the cache.
+
+    Parameters
+    ----------
+    setup_id : int
+        ID of the setup to be loaded.
+
+    Returns
+    -------
+    OpenMLSetup
+        The loaded setup object.
+
+    Raises
+    ------
+    OpenMLCacheException
+        If the setup file for the given setup ID is not cached.
+    """
     cache_dir = config.get_cache_directory()
     setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id))
     try:
@@ -271,7 +287,21 @@ def initialize_model(setup_id: int) -> Any:
     return model
 
 
-def _to_dict(flow_id, openml_parameter_settings):
+def _to_dict(flow_id: int, openml_parameter_settings):
+    """ Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML.
+
+    Parameters
+    ----------
+    flow_id : int
+        ID of the flow.
+    openml_parameter_settings : List[OpenMLParameter]
+        A list of OpenML parameter settings.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary representation of the flow ID and parameter settings.
+    """
     # for convenience, this function (ab)uses the run object.
     xml = OrderedDict()
     xml["oml:run"] = OrderedDict()
@@ -319,6 +349,9 @@ def _create_setup_from_xml(result_dict, output_format="object"):
 
 
 def _create_setup_parameter_from_xml(result_dict, output_format="object"):
+    """
+    Create an OpenMLParameter object or a dictionary from an API XML result.
+    """
     if output_format == "object":
         return OpenMLParameter(
             input_id=int(result_dict["oml:id"]),

diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -107,6 +107,20 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
             tags.append(current_tag)
 
     def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]:
+        """ Extracts a list of nested IDs from a result dictionary.
+
+        Parameters
+        ----------
+        key : str
+            Key in the result dictionary under which the nested IDs are stored.
+        subkey : str
+            Key inside `result_dict[key]` that holds the list of nested OpenML IDs.
+
+        Returns
+        -------
+        Optional[List]
+            A list of nested OpenML IDs, or None if the key is not present in the dictionary.
+        """
         if result_dict.get(key) is not None:
             return [int(oml_id) for oml_id in result_dict[key][subkey]]
         return None
@@ -591,6 +605,20 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]:
 
 
 def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]:
+    """ Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call for retrieving the list of OpenML studies.
+    output_format : str in {"object", "dataframe"}
+        Format of the output, either 'object' for a dictionary or 'dataframe' for a Pandas DataFrame.
+
+    Returns
+    -------
+    Union[Dict, pd.DataFrame]
+        A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'.
+    """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
     study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
 

diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -230,6 +230,28 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs):
 
 
 def __list_tasks(api_call, output_format="dict"):
+    """ Returns a dictionary or a Pandas DataFrame with information about OpenML tasks.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call specifying which tasks to return.
+    output_format : str in {"dict", "dataframe"}
+        Output format for the returned object.
+
+    Returns
+    -------
+    Union[Dict, pd.DataFrame]
+        A dictionary or a Pandas DataFrame with information about OpenML tasks.
+
+    Raises
+    ------
+    ValueError
+        If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', or has an incorrect value for
+        '@xmlns:oml'.
+    KeyError
+        If an invalid key is found in the XML for a task.
+    """
     xml_string = openml._api_calls._perform_api_call(api_call, "get")
     tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
     # Minimalistic check if the XML is useful