New documentation layout
Signed-off-by: Joaquin Anton <[email protected]>
jantonguirao committed Feb 10, 2021
1 parent 156cf09 commit 0ca079f
Showing 6 changed files with 144 additions and 95 deletions.
25 changes: 19 additions & 6 deletions dali/operators/decoder/image_decoder.cc
@@ -227,11 +227,9 @@ anchors and shapes.
Inputs must be supplied as tensors in the following order:
* ``data`` that contains the input data.
* ``anchor`` that contains normalized or absolute coordinates, depending on the
``normalized_anchor`` value, for the starting point of the slice (x0, x1, x2, and so on),
* ``shape`` that contains normalized or absolute coordinates, depending on the
``normalized_shape`` value, for the dimensions of the slice (s0, s1, s2, and so on).
#. ``data``
#. ``anchor``
#. ``shape``
The anchor and shape coordinates must be within the interval [0.0, 1.0] for normalized
coordinates or within the image shape for the absolute coordinates. The ``anchor`` and ``shape``
@@ -257,6 +255,21 @@ Please note that GPU acceleration for JPEG 2000 decoding is only available for C
.NumInput(3)
.NumOutput(1)
.AddParent("ImageDecoderAttr")
.AddParent("SliceAttr");
.AddParent("SliceAttr")
.InputDox(0, "data", "TensorList", R"code(Batch that contains the input data.)code")
.InputDox(1, "anchor", "1D TensorList of float or int",
R"code(Input that contains normalized or absolute coordinates for the starting
point of the slice (x0, x1, x2, …).
Integer coordinates are interpreted as absolute coordinates, while float coordinates can be
interpreted as absolute or relative coordinates, depending on the value of
``normalized_anchor``.)code")
.InputDox(2, "shape", "1D TensorList of float or int",
R"code(Input that contains normalized or absolute coordinates for the dimensions
of the slice (s0, s1, s2, …).
Integer coordinates are interpreted as absolute coordinates, while float coordinates can be
interpreted as absolute or relative coordinates, depending on the value of
``normalized_shape``.)code");

} // namespace dali
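
Below is a minimal usage sketch, not part of this commit, of the three positional inputs documented above. It assumes the ``fn.image_decoder_slice`` wrapper that this schema backs and a hypothetical ``file_root`` path; the coordinates here are relative, in [0.0, 1.0]:

import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali.pipeline import Pipeline

pipe = Pipeline(batch_size=4, num_threads=2, device_id=0)
with pipe:
    data, _ = fn.file_reader(file_root="images/")   # hypothetical dataset location
    anchor = fn.uniform(range=(0.0, 0.2), shape=2)  # slice start (x0, x1)
    shape = fn.uniform(range=(0.4, 0.6), shape=2)   # slice size (s0, s1)
    images = fn.image_decoder_slice(data, anchor, shape,
                                    device="mixed", output_type=types.RGB)
    pipe.set_outputs(images)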
11 changes: 5 additions & 6 deletions dali/python/nvidia/dali/fn.py
@@ -61,7 +61,7 @@ def _to_snake_case(pascal):
out = _handle_special_case(out)
return out

def _wrap_op_fn(op_class, wrapper_name):
def _wrap_op_fn(op_class, wrapper_name, wrapper_doc):
def op_wrapper(*inputs, **kwargs):
import nvidia.dali.ops
init_args, call_args = nvidia.dali.ops._separate_kwargs(kwargs)
@@ -77,19 +77,18 @@ def op_wrapper(*inputs, **kwargs):

op_wrapper.__name__ = wrapper_name
op_wrapper.__qualname__ = wrapper_name
op_wrapper.__doc__ = op_class.__doc__
if op_class.__call__.__doc__ is not None:
op_wrapper.__doc__ += "\n\n" + op_class.__call__.__doc__
op_wrapper.__doc__ = wrapper_doc
return op_wrapper

def _wrap_op(op_class, submodule, parent_module=None):
def _wrap_op(op_class, submodule, parent_module, wrapper_doc):
"""Wrap the DALI Operator with fn API and insert the function into appropriate module.
Args:
op_class: Op class to wrap
submodule: Additional submodule (scope)
parent_module (str): If set to None, the wrapper is placed in nvidia.dali.fn module,
otherwise in a specified parent module.
wrapper_doc (str): Documentation of the wrapper function
"""
schema = _b.TryGetSchema(op_class.__name__)
make_hidden = schema.IsDocHidden() if schema else False
@@ -100,7 +99,7 @@ def _wrap_op(op_class, submodule, parent_module=None):
fn_module = sys.modules[parent_module]
module = _internal.get_submodule(fn_module, submodule)
if not hasattr(module, wrapper_name):
wrap_func = _wrap_op_fn(op_class, wrapper_name)
wrap_func = _wrap_op_fn(op_class, wrapper_name, wrapper_doc)
setattr(module, wrapper_name, wrap_func)
if submodule:
wrap_func.__module__ = module.__name__
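
As a hedged illustration of the pattern above (not DALI code): the fn API exposes each operator class as a plain function, and after this change the wrapper's docstring is supplied by the caller instead of being copied from the class:

def make_fn_wrapper(op_class, name, doc):
    def wrapper(*inputs, **kwargs):
        # Instantiate the operator with construction-time kwargs, then call it.
        return op_class(**kwargs)(*inputs)
    wrapper.__name__ = name
    wrapper.__qualname__ = name
    wrapper.__doc__ = doc  # the doc now comes from a docstring generator
    return wrapper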
72 changes: 60 additions & 12 deletions dali/python/nvidia/dali/ops.py
@@ -48,6 +48,45 @@ def _numpydoc_formatter(name, type, doc, optional = False):
type += ", optional"
return "`{}` : {}{}{}".format(name, type, indent, doc.replace("\n", indent))

def _get_inputs_doc(schema):
# Inputs section
if schema.MaxNumInput() == 0:
return ""
ret = """
Args
----
"""
if schema.HasInputDox():
for i in range(schema.MaxNumInput()):
optional = i >= schema.MinNumInput()
input_type_str = schema.GetInputType(i) + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = schema.GetInputDox(i)
input_name = schema.GetInputName(i)
ret += _numpydoc_formatter(input_name, input_type_str, dox, optional) + "\n"
else:
for i in range(schema.MinNumInput()):
input_type_str = "TensorList" + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = "Input to the operator."
input_name = f"input{i}" if schema.MaxNumInput() > 1 else "data"
ret += _numpydoc_formatter(input_name, input_type_str, dox, False) + "\n"

extra_opt_args = schema.MaxNumInput() - schema.MinNumInput()
if extra_opt_args == 1:
i = schema.MinNumInput()
input_type_str = "TensorList" + _supported_layouts_str(schema.GetSupportedLayouts(i))
dox = "Input to the operator."
input_name = f"input{i}" if schema.MaxNumInput() > 1 else "data"
ret += _numpydoc_formatter(input_name, input_type_str, dox, True) + "\n"
elif extra_opt_args > 1:
input_type_str = "TensorList"
input_name = f"input[{schema.MinNumInput()}..{schema.MaxNumInput()}]"
dox = f"This function accepts up to {extra_opt_args} optional positional inputs"
ret += _numpydoc_formatter(input_name, input_type_str, dox, True) + "\n"

ret += "\n"
return ret


def _get_kwargs(schema):
"""
Get the keywords arguments from the schema.
@@ -75,7 +114,7 @@ def _get_kwargs(schema):
def _schema_name(cls):
return getattr(cls, 'schema_name', cls.__name__)

def _docstring_generator(cls):
def _docstring_generator_main(cls):
"""
Generate docstring for the class obtaining it from schema based on cls.__name__
This lists all the Keyword args that can be used when creating operator
@@ -126,14 +165,20 @@ def _docstring_generator(cls):
for dev in op_dev:
ret += " * " + dev + "\n"
ret += "\n"
return ret

def _docstring_generator(cls):
op_name = _schema_name(cls)
schema = _b.GetSchema(op_name)
ret = _docstring_generator_main(cls)
ret += """
Keyword args
------------
"""
ret += _get_kwargs(schema)
return ret


def _supported_layouts_str(supported_layouts):
if len(supported_layouts) == 0:
return ""
@@ -152,16 +197,7 @@ def _docstring_prefix_from_inputs(op_name):
# __call__ docstring
ret += "\nOperator call to be used in graph definition.\n"
# Args section
ret += """
Args
----
"""
for i in range(schema.MaxNumInput()):
optional = i >= schema.MinNumInput()
input_type_str = schema.GetInputType(i) + _supported_layouts_str(schema.GetSupportedLayouts(i))
ret += _numpydoc_formatter(schema.GetInputName(i), input_type_str, schema.GetInputDox(i), optional)
ret += "\n"
ret += "\n"
ret += _get_inputs_doc(schema)
return ret

def _docstring_prefix_auto(op_name):
@@ -215,6 +251,18 @@ def _docstring_generator_call(op_name):
ret += tensor_kwargs
return ret

def _docstring_generator_fn(cls):
op_name = _schema_name(cls)
schema = _b.GetSchema(op_name)
ret = _docstring_generator_main(cls)
ret += _get_inputs_doc(schema)
ret += """
Keyword args
------------
"""
ret += _get_kwargs(schema)
return ret
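
Putting the pieces together, the fn-style docstring assembled by ``_docstring_generator_fn`` has roughly the following shape; the operator and field names below are hypothetical:

# Decodes images and extracts a slice...        <- _docstring_generator_main(cls)
#
# Args                                          <- _get_inputs_doc(schema)
# ----
# `data` : TensorList
#     Batch that contains the input data.
# `anchor` : 1D TensorList of float or int, optional
#     Start of the slice.
#
# Keyword args                                  <- _get_kwargs(schema)
# ------------
# `normalized_anchor` : bool, optional
#     Whether the anchor is interpreted as relative coordinates.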

class _OpCounter(object):
#pylint: disable=too-few-public-methods
_lock = threading.Lock()
@@ -621,7 +669,7 @@ def _process_op_name(op_schema_name, make_hidden=False):
return op_full_name, submodule, op_name

def _wrap_op(op_class, submodule = [], parent_module=None):
return _functional._wrap_op(op_class, submodule, parent_module)
return _functional._wrap_op(op_class, submodule, parent_module, _docstring_generator_fn(op_class))

def _load_ops():
global _cpu_ops
128 changes: 58 additions & 70 deletions docs/advanced_topics.rst
@@ -1,13 +1,12 @@
Advanced Topics
===============

Thread Affinity
---------------
Performance Tuning
==================

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

Thread Affinity
---------------

This functionality allows you to pin DALI threads to the specified CPU. Thread affinity avoids
the overhead of worker threads jumping from core to core and improves performance with CPU-heavy
@@ -40,10 +39,6 @@ and thread 4 to the CPU ID that is returned by nvmlDeviceGetCpuAffinity.
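
A hedged sketch of the setup this section describes, using the ``DALI_AFFINITY_MASK`` environment variable; the CPU IDs are illustrative:

import os
os.environ["DALI_AFFINITY_MASK"] = "3,5,6,10"  # one CPU ID per worker thread
from nvidia.dali.pipeline import Pipeline
pipe = Pipeline(batch_size=32, num_threads=4, device_id=0,
                set_affinity=True)  # pin worker thread i to the i-th ID in the mask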
Memory Consumption
------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

DALI uses the following memory types:

- Host
@@ -89,10 +84,6 @@ growth factor for the host and the GPU buffers.
Operator Buffer Presizing
-------------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

When you can precisely forecast the memory consumption during a DALI run, this functionality helps
you fine-tune the processing pipeline. One of the benefits is that the overhead of some
reallocations can be avoided.
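
For example, a hedged sketch of presizing a decoder's output buffers through the ``bytes_per_sample_hint`` operator argument; the hint value is illustrative, not measured, and ``encoded`` stands for any encoded-image input:

import nvidia.dali.fn as fn
# Reserve ~3 MB per sample up front so early iterations avoid reallocations.
images = fn.image_decoder(encoded, device="mixed",
                          bytes_per_sample_hint=3_000_000)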
@@ -124,10 +115,6 @@ the allocation is contiguous. This value should be provided to ``bytes_per_sample
Prefetching Queue Depth
-----------------------

.. note::
For typical use cases, the default DALI configuration performs well out of the box, and you do
not need to review this section.

The DALI pipeline allows the buffering of one or more batches of data, which is important when
the processing time varies from batch to batch.
The default prefetch depth is 2. You can change this value by using the ``prefetch_queue_depth``
@@ -137,59 +124,8 @@ we recommend that you prefetch more data ahead of time.
.. note::
Increasing queue depth also increases memory consumption.
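
A hedged sketch of adjusting the depth at pipeline construction; the dict form for separate CPU and GPU queues is shown as a comment:

from nvidia.dali.pipeline import Pipeline
pipe = Pipeline(batch_size=32, num_threads=4, device_id=0,
                prefetch_queue_depth=3)  # default is 2
# Asymmetric queues, if the stages behave differently:
# pipe = Pipeline(..., prefetch_queue_depth={"cpu_size": 2, "gpu_size": 4})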

Running DALI pipeline
---------------------

A DALI pipeline can be run in one of the following ways:

- | Simple run method, which runs the computations and returns the results.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.BASIC` API type.
- | :meth:`nvidia.dali.pipeline.Pipeline.schedule_run`,
  :meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, and
  :meth:`nvidia.dali.pipeline.Pipeline.release_outputs`, which allow fine-grained control
  over the output buffers' lifetime.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.SCHEDULED` API type.
- | Built-in iterators for MXNet, PyTorch, and TensorFlow.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.ITERATOR` API type.

The first API, the :meth:`nvidia.dali.pipeline.Pipeline.run()` method, completes the
following tasks:

#. Launches the DALI pipeline.
#. Executes the prefetch iterations if necessary.
#. Waits until the first batch is ready.
#. Returns the resulting buffers.

Buffers are marked as in-use until the next call to
:meth:`nvidia.dali.pipeline.Pipeline.run`. This process can be wasteful because the data is usually
copied to the DL framework's native storage objects and DALI pipeline outputs could be returned to
DALI for reuse.

The second API, which consists of :meth:`nvidia.dali.pipeline.Pipeline.schedule_run()`,
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()`, and :meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`
allows you to explicitly manage the lifetime of the output buffers. The
:meth:`nvidia.dali.pipeline.Pipeline.schedule_run()` method instructs DALI to prepare the next
batch of data, and, if necessary, to prefetch. If the execution mode is set to asynchronous,
this call returns immediately, without waiting for the results. This way, another task can be
simultaneously executed. The data batch can be requested from DALI by calling
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, which returns the result buffer. If the data
batch is not yet ready, DALI will wait for it. The data is ready as soon as the
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()` call completes. When the DALI buffers are
no longer needed, because data was copied or has already been consumed, call
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()` to return the DALI buffers for reuse
in subsequent iterations.

Built-in iterators use the second API to provide convenient wrappers for immediate use in
Deep Learning Frameworks. The data is returned in the framework's native buffers. The iterator's
implementation copies the data internally from DALI buffers and recycles the data by calling
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`.

We recommend that you do not mix the APIs. They follow different logic for output buffer
lifetime management, and the details of the process are subject to change without notice.
Mixing the APIs might result in undefined behavior, such as a deadlock or an attempt to access
an invalid buffer.

Sharding
--------
========

Sharding allows DALI to partition the dataset into nonoverlapping pieces on which each DALI pipeline
instance can work. This functionality addresses the issue of having a global and a shared state
@@ -254,8 +190,60 @@ When this occurs, use the first formula.
To address these challenges, use the ``reader_name`` parameter and allow the iterator
to handle the details.
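
A hedged sketch of a two-shard reader combined with ``reader_name``; the reader name and paths are hypothetical:

import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator

# Inside the pipeline definition: this instance processes shard 0 of 2.
data, label = fn.file_reader(file_root="data/", shard_id=0, num_shards=2,
                             name="Reader")
# When building the framework iterator, refer to the reader by name so the
# iterator can derive shard sizes and epoch boundaries on its own.
it = DALIGenericIterator([pipe], ["data", "label"], reader_name="Reader")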


Pipeline run methods
====================

A DALI pipeline can be run in one of the following ways:

- | Simple run method, which runs the computations and returns the results.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.BASIC` API type.
- | :meth:`nvidia.dali.pipeline.Pipeline.schedule_run`,
  :meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, and
  :meth:`nvidia.dali.pipeline.Pipeline.release_outputs`, which allow fine-grained control
  over the output buffers' lifetime.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.SCHEDULED` API type.
- | Built-in iterators for MXNet, PyTorch, and TensorFlow.
| This option corresponds to the :meth:`nvidia.dali.types.PipelineAPIType.ITERATOR` API type.

The first API, the :meth:`nvidia.dali.pipeline.Pipeline.run()` method, completes the
following tasks:

#. Launches the DALI pipeline.
#. Executes the prefetch iterations if necessary.
#. Waits until the first batch is ready.
#. Returns the resulting buffers.

Buffers are marked as in-use until the next call to
:meth:`nvidia.dali.pipeline.Pipeline.run`. This process can be wasteful because the data is usually
copied to the DL framework's native storage objects and DALI pipeline outputs could be returned to
DALI for reuse.

The second API, which consists of :meth:`nvidia.dali.pipeline.Pipeline.schedule_run()`,
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()`, and :meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`
allows you to explicitly manage the lifetime of the output buffers. The
:meth:`nvidia.dali.pipeline.Pipeline.schedule_run()` method instructs DALI to prepare the next
batch of data, and, if necessary, to prefetch. If the execution mode is set to asynchronous,
this call returns immediately, without waiting for the results. This way, another task can be
simultaneously executed. The data batch can be requested from DALI by calling
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs`, which returns the result buffer. If the data
batch is not yet ready, DALI will wait for it. The data is ready as soon as the
:meth:`nvidia.dali.pipeline.Pipeline.share_outputs()` call completes. When the DALI buffers are
no longer needed, because data was copied or has already been consumed, call
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()` to return the DALI buffers for reuse
in subsequent iterations.

Built-in iterators use the second API to provide convenient wrappers for immediate use in
Deep Learning Frameworks. The data is returned in the framework's native buffers. The iterator's
implementation copies the data internally from DALI buffers and recycles the data by calling
:meth:`nvidia.dali.pipeline.Pipeline.release_outputs()`.

We recommend that you do not mix the APIs. They follow different logic for output buffer
lifetime management, and the details of the process are subject to change without notice.
Mixing the APIs might result in undefined behavior, such as a deadlock or an attempt to access
an invalid buffer.
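
A hedged sketch of the scheduled API described above, assuming ``pipe`` has already been defined:

pipe.build()
pipe.schedule_run()              # start computing the next batch
outputs = pipe.share_outputs()   # wait for the batch and borrow the buffers
# ... copy whatever is needed out of `outputs` here ...
pipe.release_outputs()           # hand the buffers back to DALI for reuse
pipe.schedule_run()              # overlap the next batch with downstream work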

C++ API
-------
=======

.. note::
**This feature is not officially supported and may change without notice**
1 change: 1 addition & 0 deletions docs/index.rst
@@ -63,6 +63,7 @@ This library is open sourced and it is available in the `NVIDIA GitHub repositor
:caption: Reference

Release Notes <https://docs.nvidia.com/deeplearning/dali/release-notes/index.html>
GitHub <https://github.com/NVIDIA/DALI>

Indices and tables
==================
2 changes: 1 addition & 1 deletion docs/supported_ops_legacy.rst
@@ -1,4 +1,4 @@
(Legacy) Operator Objects
Operator Objects (Legacy)
=========================

In older versions of DALI, an object-oriented API was used to define operations instead of
