diff --git a/daft/io/_iceberg.py b/daft/io/_iceberg.py
index 0d3b1af102..c936779d3d 100644
--- a/daft/io/_iceberg.py
+++ b/daft/io/_iceberg.py
@@ -70,6 +70,30 @@ def read_iceberg(
     pyiceberg_table: "PyIcebergTable",
     io_config: Optional["IOConfig"] = None,
 ) -> DataFrame:
+    """Create a DataFrame from an Iceberg table.
+
+    Example:
+        >>> import pyiceberg
+        >>>
+        >>> pyiceberg_table = pyiceberg.Table(...)
+        >>> df = daft.read_iceberg(pyiceberg_table)
+        >>>
+        >>> # Filters on this DataFrame can now be pushed into
+        >>> # the read operation from Iceberg
+        >>> df = df.where(df["foo"] > 5)
+        >>> df.show()
+
+    .. NOTE::
+        This function requires the use of `PyIceberg <https://py.iceberg.apache.org/>`_, which is Apache Iceberg's
+        official project for Python.
+
+    Args:
+        pyiceberg_table: Iceberg table created using the PyIceberg library
+        io_config: A custom IOConfig to use when accessing Iceberg object storage data. Defaults to None.
+
+    Returns:
+        DataFrame: a DataFrame with the schema converted from the specified Iceberg table
+    """
     from daft.iceberg.iceberg_scan import IcebergScanOperator
 
     io_config = (
diff --git a/docs/source/api_docs/creation.rst b/docs/source/api_docs/creation.rst
index e6e821a3a7..033095db97 100644
--- a/docs/source/api_docs/creation.rst
+++ b/docs/source/api_docs/creation.rst
@@ -20,71 +20,83 @@ Python Objects
     from_pylist
     from_pydict
 
-Arrow
-~~~~~
+Files
+-----
+
+.. _df-io-files:
+
+Parquet
+~~~~~~~
+
+.. _daft-read-parquet:
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
+    read_parquet
+
+CSV
+~~~
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    from_arrow
+    read_csv
 
-Pandas
-~~~~~~
+JSON
+~~~~
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    from_pandas
+    read_json
 
-File Paths
-~~~~~~~~~~
+Data Catalogs
+-------------
+
+Apache Iceberg
+^^^^^^^^^^^^^^
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    from_glob_path
+    read_iceberg
 
-Files
------
-
-.. _df-io-files:
+Arrow
+~~~~~
 
-Parquet
-~~~~~~~
+.. autosummary::
+    :nosignatures:
+    :toctree: doc_gen/io_functions
 
-.. _daft-read-parquet:
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    read_parquet
+    from_arrow
 
-CSV
-~~~
+Pandas
+~~~~~~
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    read_csv
+    from_pandas
 
-JSON
-~~~~
+File Paths
+~~~~~~~~~~
 
 .. autosummary::
     :nosignatures:
     :toctree: doc_gen/io_functions
 
-    read_json
+    from_glob_path
 
 Integrations
 ------------
diff --git a/docs/source/user_guide/basic_concepts/read-and-write.rst b/docs/source/user_guide/basic_concepts/read-and-write.rst
index 5828cb847a..e4528cd5d8 100644
--- a/docs/source/user_guide/basic_concepts/read-and-write.rst
+++ b/docs/source/user_guide/basic_concepts/read-and-write.rst
@@ -34,6 +34,11 @@ Daft supports file paths to a single file, a directory of files, and wildcards.
 
 To learn more about each of these constructors, as well as the options that they support, consult the API documentation on :ref:`creating DataFrames from files <df-io-files>`.
 
+From Data Catalogs
+^^^^^^^^^^^^^^^^^^
+
+If you use catalogs such as Apache Iceberg or Hive, you may wish to consult our user guide on integrations with Data Catalogs: :doc:`Daft integration with Data Catalogs <../integrations/data_catalogs>`.
+
 From File Paths
 ^^^^^^^^^^^^^^^
 
diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst
index ffc722ab1b..d8e719f518 100644
--- a/docs/source/user_guide/index.rst
+++ b/docs/source/user_guide/index.rst
@@ -9,6 +9,7 @@ Daft User Guide
     basic_concepts
     daft_in_depth
     poweruser
+    integrations
     tutorials
 
 Welcome to **Daft**!
@@ -61,6 +62,11 @@ Core Daft concepts all Daft users will find useful to understand deeply.
 
 Become a true Daft Poweruser! This section explores advanced topics to help you configure Daft for specific application environments, improve reliability and optimize for performance.
 
+:doc:`Integrations <integrations>`
+**********************************
+
+Learn how to use Daft's integrations with other technologies such as Ray Datasets or Apache Iceberg.
+
 :doc:`Tutorials <tutorials>`
 ****************************
 
diff --git a/docs/source/user_guide/integrations.rst b/docs/source/user_guide/integrations.rst
new file mode 100644
index 0000000000..4a53a31a07
--- /dev/null
+++ b/docs/source/user_guide/integrations.rst
@@ -0,0 +1,6 @@
+Integrations
+============
+
+.. toctree::
+
+    integrations/data_catalogs
diff --git a/docs/source/user_guide/integrations/data_catalogs.rst b/docs/source/user_guide/integrations/data_catalogs.rst
new file mode 100644
index 0000000000..3dee9f8c06
--- /dev/null
+++ b/docs/source/user_guide/integrations/data_catalogs.rst
@@ -0,0 +1,43 @@
+Data Catalogs
+=============
+
+**Data Catalogs** are services that provide access to **Tables** of data. **Tables** are powerful abstractions for large datasets in storage, providing many benefits over naively storing data as just a bunch of CSV/Parquet files.
+
+There are many different **Table Formats** employed by Data Catalogs. These table formats differ in implementation and capabilities, but often provide advantages such as:
+
+1. **Schema:** what data do these files contain?
+2. **Partitioning Specification:** how is the data organized?
+3. **Statistics/Metadata:** how many rows does each file contain, and what are the min/max values of each file's columns?
+4. **ACID compliance:** updates to the table are atomic
+
+.. NOTE::
+    The names of Table Formats and their Data Catalogs are often used interchangeably.
+
+    For example, "Apache Iceberg" often refers to both the Data Catalog and its Table Format.
+
+    You can retrieve an **Apache Iceberg Table** from an **Apache Iceberg REST Data Catalog**.
+
+    However, some Data Catalogs allow for many different underlying Table Formats. For example, you can request either an **Apache Iceberg Table** or a **Hive Table** from an **AWS Glue Data Catalog**.
+
+Why use Data Catalogs?
+----------------------
+
+Daft can effectively leverage the statistics and metadata provided by these Data Catalogs' Tables to dramatically speed up queries.
+
+This is accomplished by techniques such as:
+
+1. **Partition pruning:** ignore files whose partition values don't match the query's filter predicates
+2. **Schema retrieval:** convert the schema provided by the Data Catalog into a Daft schema instead of sampling a schema from the data
+3. **Metadata execution:** utilize metadata such as row counts to read the bare minimum amount of data necessary from storage
+
+Data Catalog Integrations
+-------------------------
+
+Apache Iceberg
+^^^^^^^^^^^^^^
+
+Apache Iceberg is an open-source table format, originally developed at Netflix, for large-scale analytical datasets.
+
+To read from the Apache Iceberg table format, use the :func:`daft.read_iceberg` function.
+
+We integrate closely with `PyIceberg <https://py.iceberg.apache.org/>`_ (the official Python implementation of Apache Iceberg) and support reading PyIceberg's Table objects into Daft DataFrames.
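
Usage sketch (illustrative, not part of the patch): a minimal end-to-end example of the API documented above. It assumes a PyIceberg catalog is configured in your environment (e.g. via a .pyiceberg.yaml file); the catalog name "default" and the table identifier "default.my_table" are placeholder names, while load_catalog/load_table are PyIceberg's standard table-loading calls and daft.read_iceberg is the function added in this diff.

    from pyiceberg.catalog import load_catalog

    import daft

    # Load a table through PyIceberg ("default" and "default.my_table" are
    # placeholder names for illustration)
    catalog = load_catalog("default")
    pyiceberg_table = catalog.load_table("default.my_table")

    # Create a Daft DataFrame from the Iceberg table; since Daft evaluates
    # lazily, filters on this DataFrame can be pushed down into the Iceberg
    # read (e.g. partition pruning) rather than applied after a full scan
    df = daft.read_iceberg(pyiceberg_table)
    df = df.where(df["foo"] > 5)
    df.show()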