New documentation layout
Signed-off-by: Joaquin Anton <[email protected]>
jantonguirao committed Feb 10, 2021
1 parent 156cf09 commit 0ca079f
Showing 6 changed files with 144 additions and 95 deletions.
25 changes: 19 additions & 6 deletions dali/operators/decoder/image_decoder.cc
@@ -227,11 +227,9 @@ anchors and shapes.
Inputs must be supplied as tensors in the following order:
* ``data`` that contains the input data.
* ``anchor`` that contains normalized or absolute coordinates, depending on the
``normalized_anchor`` value, for the starting point of the slice (x0, x1, x2, and so on),
* ``shape`` that contains normalized or absolute coordinates, depending on the
``normalized_shape`` value, for the dimensions of the slice (s0, s1, s2, and so on).
#. ``data``
#. ``anchor``
#. ``shape``
The anchor and shape coordinates must be within the interval [0.0, 1.0] for normalized
coordinates or within the image shape for the absolute coordinates. The ``anchor`` and ``shape``
@@ -257,6 +255,21 @@ Please note that GPU acceleration for JPEG 2000 decoding is only available for C
.NumInput(3)
.NumOutput(1)
.AddParent("ImageDecoderAttr")
.AddParent("SliceAttr");
.AddParent("SliceAttr")
.InputDox(0, "data", "TensorList", R"code(Batch that contains the input data.)code")
.InputDox(1, "anchor", "1D TensorList of float or int",
R"code(Input that contains normalized or absolute coordinates for the starting
point of the slice (x0, x1, x2, …).
Integer coordinates are interpreted as absolute coordinates, while float coordinates can be
interpreted as absolute or relative coordinates, depending on the value of
``normalized_anchor``.)code")
.InputDox(2, "shape", "1D TensorList of float or int",
R"code(Input that contains normalized or absolute coordinates for the dimensions
of the slice (s0, s1, s2, …).
Integer coordinates are interpreted as absolute coordinates, while float coordinates can be
interpreted as absolute or relative coordinates, depending on the value of
``normalized_shape``.)code");

} // namespace dali
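
Below is a minimal usage sketch, not part of this commit, of the three positional inputs documented above. It assumes the ``fn.image_decoder_slice`` wrapper that this schema backs and a hypothetical ``file_root`` path; the coordinates here are relative, in [0.0, 1.0]:

import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali.pipeline import Pipeline

pipe = Pipeline(batch_size=4, num_threads=2, device_id=0)
with pipe:
    data, _ = fn.file_reader(file_root="images/")   # hypothetical dataset location
    anchor = fn.uniform(range=(0.0, 0.2), shape=2)  # slice start (x0, x1)
    shape = fn.uniform(range=(0.4, 0.6), shape=2)   # slice size (s0, s1)
    images = fn.image_decoder_slice(data, anchor, shape,
                                    device="mixed", output_type=types.RGB)
    pipe.set_outputs(images)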
11 changes: 5 additions & 6 deletions dali/python/nvidia/dali/fn.py
@@ -61,7 +61,7 @@ def _to_snake_case(pascal):
out = _handle_special_case(out)
return out

def _wrap_op_fn(op_class, wrapper_name):
def _wrap_op_fn(op_class, wrapper_name, wrapper_doc):
def op_wrapper(*inputs, **kwargs):
import nvidia.dali.ops
init_args, call_args = nvidia.dali.ops._separate_kwargs(kwargs)
@@ -77,19 +77,18 @@ def op_wrapper(*inputs, **kwargs):

op_wrapper.__name__ = wrapper_name
op_wrapper.__qualname__ = wrapper_name
op_wrapper.__doc__ = op_class.__doc__
if op_class.__call__.__doc__ is not None:
op_wrapper.__doc__ += "\n\n" + op_class.__call__.__doc__
op_wrapper.__doc__ = wrapper_doc
return op_wrapper

def _wrap_op(op_class, submodule, parent_module=None):
def _wrap_op(op_class, submodule, parent_module, wrapper_doc):
"""Wrap the DALI Operator with fn API and insert the function into appropriate module.
Args:
op_class: Op class to wrap
submodule: Additional submodule (scope)
parent_module (str): If set to None, the wrapper is placed in nvidia.dali.fn module,
otherwise in a specified parent module.
wrapper_doc (str): Documentation of the wrapper function
"""
schema = _b.TryGetSchema(op_class.__name__)
make_hidden = schema.IsDocHidden() if schema else False
@@ -100,7 +99,7 @@ def _wrap_op(op_class, submodule, parent_module=None):
fn_module = sys.modules[parent_module]
module = _internal.get_submodule(fn_module, submodule)
if not hasattr(module, wrapper_name):
wrap_func = _wrap_op_fn(op_class, wrapper_name)
wrap_func = _wrap_op_fn(op_class, wrapper_name, wrapper_doc)
setattr(module, wrapper_name, wrap_func)
if submodule:
wrap_func.__module__ = module.__name__
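
As a hedged illustration of the pattern above (not DALI code): the fn API exposes each operator class as a plain function, and after this change the wrapper's docstring is supplied by the caller instead of being copied from the class:

def make_fn_wrapper(op_class, name, doc):
    def wrapper(*inputs, **kwargs):
        # Instantiate the operator with construction-time kwargs, then call it.
        return op_class(**kwargs)(*inputs)
    wrapper.__name__ = name
    wrapper.__qualname__ = name
    wrapper.__doc__ = doc  # the doc now comes from a docstring generator
    return wrapper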
72 changes: 60 additions & 12 deletions dali/python/nvidia/dali/ops.py
@@ -48,6 +48,45 @@ def _numpydoc_formatter(name, type, doc, optional = False):
type += ", optional"
return "`{}` : {}{}{}".format(name, type, indent, doc.replace("\n", indent))

def _get_inputs_doc(schema):
# Inputs section
if schema.MaxNumInput() == 0:
return ""
ret = """
Args
----
"""
if schema.HasInputDox():
for i in range(schema.MaxNumInput()):
optional = i >= schema.MinNumInput()
input_type_str = schema.GetInputType(i) + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = schema.GetInputDox(i)
input_name = schema.GetInputName(i)
ret += _numpydoc_formatter(input_name, input_type_str, dox, optional) + "\n"
else:
for i in range(schema.MinNumInput()):
input_type_str = "TensorList" + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = "Input to the operator."
input_name = f"input{i}" if schema.MaxNumInput() > 1 else "data"
ret += _numpydoc_formatter(input_name, input_type_str, dox, False) + "\n"

extra_opt_args = schema.MaxNumInput() - schema.MinNumInput()
if extra_opt_args == 1:
i = schema.MinNumInput()
input_type_str = "TensorList" + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = "Input to the operator."
input_name = f"input{i}" if schema.MaxNumInput() > 1 else "data"
ret += _numpydoc_formatter(input_name, input_type_str, dox, True) + "\n"
elif extra_opt_args > 1:
input_type_str = "TensorList"
input_name = f"input[{schema.MinNumInput()}..{schema.MaxNumInput()}]"
dox = f"This function accepts up to {extra_opt_args} optional positional inputs"
ret += _numpydoc_formatter(input_name, input_type_str, dox, True) + "\n"

ret += "\n"
return ret


def _get_kwargs(schema):
"""
Get the keywords arguments from the schema.
@@ -75,7 +114,7 @@ def _get_kwargs(schema):
def _schema_name(cls):
return getattr(cls, 'schema_name', cls.__name__)

def _docstring_generator(cls):
def _docstring_generator_main(cls):
"""
Generate docstring for the class obtaining it from schema based on cls.__name__
This lists all the Keyword args that can be used when creating operator
@@ -126,14 +165,20 @@ def _docstring_generator(cls):
for dev in op_dev:
ret += " * " + dev + "\n"
ret += "\n"
return ret

def _docstring_generator(cls):
op_name = _schema_name(cls)
schema = _b.GetSchema(op_name)
ret = _docstring_generator_main(cls)
ret += """
Keyword args
------------
"""
ret += _get_kwargs(schema)
return ret


def _supported_layouts_str(supported_layouts):
if len(supported_layouts) == 0:
return ""
@@ -152,16 +197,7 @@ def _docstring_prefix_from_inputs(op_name):
# __call__ docstring
ret += "\nOperator call to be used in graph definition.\n"
# Args section
ret += """
Args
----
"""
for i in range(schema.MaxNumInput()):
optional = i >= schema.MinNumInput()
input_type_str = schema.GetInputType(i) + _supported_layouts_str(schema.GetSupportedLayouts(i))
ret += _numpydoc_formatter(schema.GetInputName(i), input_type_str, schema.GetInputDox(i), optional)
ret += "\n"
ret += "\n"
ret += _get_inputs_doc(schema)
return ret

def _docstring_prefix_auto(op_name):
@@ -215,6 +251,18 @@ def _docstring_generator_call(op_name):
ret += tensor_kwargs
return ret

def _docstring_generator_fn(cls):
op_name = _schema_name(cls)
schema = _b.GetSchema(op_name)
ret = _docstring_generator_main(cls)
ret += _get_inputs_doc(schema)
ret += """
Keyword args
------------
"""
ret += _get_kwargs(schema)
return ret
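
Putting the pieces together, the fn-style docstring assembled by ``_docstring_generator_fn`` has roughly the following shape; the operator and field names below are hypothetical:

# Decodes images and extracts a slice...        <- _docstring_generator_main(cls)
#
# Args                                          <- _get_inputs_doc(schema)
# ----
# `data` : TensorList
#     Batch that contains the input data.
# `anchor` : 1D TensorList of float or int, optional
#     Start of the slice.
#
# Keyword args                                  <- _get_kwargs(schema)
# ------------
# `normalized_anchor` : bool, optional
#     Whether the anchor is interpreted as relative coordinates.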

class _OpCounter(object):
#pylint: disable=too-few-public-methods
_lock = threading.Lock()
@@ -621,7 +669,7 @@ def _process_op_name(op_schema_name, make_hidden=False):
return op_full_name, submodule, op_name

def _wrap_op(op_class, submodule = [], parent_module=None):
return _functional._wrap_op(op_class, submodule, parent_module)
return _functional._wrap_op(op_class, submodule, parent_module, _docstring_generator_fn(op_class))

def _load_ops():
global _cpu_ops
128 changes: 58 additions & 70 deletions docs/advanced_topics.rst
@@ -1,13 +1,12 @@
Advanced Topics
===============

Thread Affinity
---------------
Performance Tuning
==================

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

Thread Affinity
---------------

This functionality allows you to pin DALI threads to the specified CPU. Thread affinity avoids
the overhead of worker threads jumping from core to core and improves performance with CPU-heavy
@@ -40,10 +39,6 @@ and thread 4 to the CPU ID that is returned by nvmlDeviceGetCpuAffinity.
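
A hedged sketch of the setup this section describes, using the ``DALI_AFFINITY_MASK`` environment variable; the CPU IDs are illustrative:

import os
os.environ["DALI_AFFINITY_MASK"] = "3,5,6,10"  # one CPU ID per worker thread
from nvidia.dali.pipeline import Pipeline
pipe = Pipeline(batch_size=32, num_threads=4, device_id=0,
                set_affinity=True)  # pin worker thread i to the i-th ID in the mask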
Memory Consumption
------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

DALI uses the following memory types:

- Host
@@ -89,10 +84,6 @@ growth factor for the host and the GPU buffers.
Operator Buffer Presizing
-------------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

When you can precisely forecast the memory consumption during a DALI run, this functionality helps
you fine-tune the processing pipeline. One of the benefits is that the overhead of some
reallocations can be avoided.
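
For example, a hedged sketch of presizing a decoder's output buffers through the ``bytes_per_sample_hint`` operator argument; the hint value is illustrative, not measured, and ``encoded`` stands for any encoded-image input:

import nvidia.dali.fn as fn
# Reserve ~3 MB per sample up front so early iterations avoid reallocations.
images = fn.image_decoder(encoded, device="mixed",
                          bytes_per_sample_hint=3_000_000)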
@@ -124,10 +115,6 @@ the allocation is contiguous. This value should be provided to ``bytes_per_sample
Prefetching Queue Depth
-----------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

The DALI pipeline allows the buffering of one or more batches of data, which is important when
the processing time varies from batch to batch.
The default prefetch depth is 2. You can change this value by using the ``prefetch_queue_depth``
@@ -137,59 +124,8 @@ we recommend that you prefetch more data ahead of time.
.. note::
Increasing queue depth also increases memory consumption.
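
A hedged sketch of adjusting the depth at pipeline construction; the dict form for separate CPU and GPU queues is shown as a comment:

from nvidia.dali.pipeline import Pipeline
pipe = Pipeline(batch_size=32, num_threads=4, device_id=0,
                prefetch_queue_depth=3)  # default is 2
# Asymmetric queues, if the stages behave differently:
# pipe = Pipeline(..., prefetch_queue_depth={"cpu_size": 2, "gpu_size": 4})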

Running DALI pipeline
---------------------

A DALI pipeline can be run in one of the following ways:

- | Simple run method, which runs the computations and returns the results.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.BASIC` API type.
- | :meth:`nvidia.dali.pipeline.Pipeline.schedule_run`,
  :meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, and
  :meth:`nvidia.dali.pipeline.Pipeline.release_outputs`, which allow fine-grained control
  over the output buffers' lifetime.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.SCHEDULED` API type.
- | Built-in iterators for MXNet, PyTorch, and TensorFlow.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.ITERATOR` API type.

The first API, the :meth:`nvidia.dali.pipeline.Pipeline.run()` method, completes the
following tasks:

#. Launches the DALI pipeline.
#. Executes the prefetch iterations if necessary.
#. Waits until the first batch is ready.
#. Returns the resulting buffers.

Buffers are marked as in-use until the next call to
:meth:`nvidia.dali.pipeline.Pipeline.run`. This process can be wasteful because the data is usually
copied to the DL framework's native storage objects and DALI pipeline outputs could be returned to
DALI for reuse.

The second API, which consists of :meth:`nvidia.dali.pipeline.Pipeline.schedule_run()`,
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()`, and :meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`
allows you to explicitly manage the lifetime of the output buffers. The
:meth:`nvidia.dali.pipeline.Pipeline.schedule_run()` method instructs DALI to prepare the next
batch of data, and, if necessary, to prefetch. If the execution mode is set to asynchronous,
this call returns immediately, without waiting for the results. This way, another task can be
simultaneously executed. The data batch can be requested from DALI by calling
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, which returns the result buffer. If the data
batch is not yet ready, DALI will wait for it. The data is ready as soon as the
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()` call completes. When the DALI buffers are
no longer needed, because data was copied or has already been consumed, call
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()` to return the DALI buffers for reuse
in subsequent iterations.

Built-in iterators use the second API to provide convenient wrappers for immediate use in
Deep Learning Frameworks. The data is returned in the framework's native buffers. The iterator's
implementation copies the data internally from DALI buffers and recycles the data by calling
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`.

We recommend that you do not mix the APIs. They follow different logic for output buffer
lifetime management, and the details of the process are subject to change without notice.
Mixing the APIs might result in undefined behavior, such as a deadlock or an attempt to access
an invalid buffer.

Sharding
--------
========

Sharding allows DALI to partition the dataset into nonoverlapping pieces on which each DALI pipeline
instance can work. This functionality addresses the issue of having a global and a shared state
@@ -254,8 +190,60 @@ When this occurs, use the first formula.
To address these challenges, use the ``reader_name`` parameter and allow the iterator
to handle the details.
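
A hedged sketch of a two-shard reader combined with ``reader_name``; the reader name and paths are hypothetical:

import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

# Inside the pipeline definition: this instance processes shard 0 of 2.
data, label = fn.file_reader(file_root="data/", shard_id=0, num_shards=2,
                             name="Reader")
# When building the framework iterator, refer to the reader by name so the
# iterator can derive shard sizes and epoch boundaries on its own.
it = DALIGenericIterator([pipe], ["data", "label"], reader_name="Reader")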


Pipeline run methods
====================

A DALI pipeline can be run in one of the following ways:

- | Simple run method, which runs the computations and returns the results.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.BASIC` API type.
- | :meth:`nvidia.dali.pipeline.Pipeline.schedule_run`,
  :meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, and
  :meth:`nvidia.dali.pipeline.Pipeline.release_outputs`, which allow fine-grained control
  over the output buffers' lifetime.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.SCHEDULED` API type.
- | Built-in iterators for MXNet, PyTorch, and TensorFlow.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.ITERATOR` API type.

The first API, the :meth:`nvidia.dali.pipeline.Pipeline.run()` method, completes the
following tasks:

#. Launches the DALI pipeline.
#. Executes the prefetch iterations if necessary.
#. Waits until the first batch is ready.
#. Returns the resulting buffers.

Buffers are marked as in-use until the next call to
:meth:`nvidia.dali.pipeline.Pipeline.run`. This process can be wasteful because the data is usually
copied to the DL framework's native storage objects and DALI pipeline outputs could be returned to
DALI for reuse.

The second API, which consists of :meth:`nvidia.dali.pipeline.Pipeline.schedule_run()`,
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()`, and :meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`
allows you to explicitly manage the lifetime of the output buffers. The
:meth:`nvidia.dali.pipeline.Pipeline.schedule_run()` method instructs DALI to prepare the next
batch of data, and, if necessary, to prefetch. If the execution mode is set to asynchronous,
this call returns immediately, without waiting for the results. This way, another task can be
simultaneously executed. The data batch can be requested from DALI by calling
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, which returns the result buffer. If the data
batch is not yet ready, DALI will wait for it. The data is ready as soon as the
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()` call completes. When the DALI buffers are
no longer needed, because data was copied or has already been consumed, call
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()` to return the DALI buffers for reuse
in subsequent iterations.

Built-in iterators use the second API to provide convenient wrappers for immediate use in
Deep Learning Frameworks. The data is returned in the framework's native buffers. The iterator's
implementation copies the data internally from DALI buffers and recycles the data by calling
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`.

We recommend that you do not mix the APIs. They follow different logic for output buffer
lifetime management, and the details of the process are subject to change without notice.
Mixing the APIs might result in undefined behavior, such as a deadlock or an attempt to access
an invalid buffer.
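
A hedged sketch of the scheduled API described above, assuming ``pipe`` has already been defined:

pipe.build()
pipe.schedule_run()              # start computing the next batch
outputs = pipe.share_outputs()   # wait for the batch and borrow the buffers
# ... copy whatever is needed out of `outputs` here ...
pipe.release_outputs()           # hand the buffers back to DALI for reuse
pipe.schedule_run()              # overlap the next batch with downstream work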

C++ API
-------
=======

.. note::
**This feature is not officially supported and may change without notice**
1 change: 1 addition & 0 deletions docs/index.rst
@@ -63,6 +63,7 @@ This library is open sourced and it is available in the `NVIDIA GitHub repositor
:caption: Reference

Release Notes <https://docs.nvidia.com/deeplearning/dali/release-notes/index.html>
GitHub <https://github.com/NVIDIA/DALI>

Indices and tables
==================
2 changes: 1 addition & 1 deletion docs/supported_ops_legacy.rst
@@ -1,4 +1,4 @@
(Legacy) Operator Objects
Operator Objects (Legacy)
=========================

In older versions of DALI, an object-oriented API was used to define operations instead of
