[SPARK-40012][PYTHON][DOCS] Make pyspark.sql.dataframe examples self-contained (Part 1) #37444

Closed
wants to merge 8 commits
238 changes: 202 additions & 36 deletions python/pyspark/sql/dataframe.py
@@ -477,9 +477,22 @@ def schema(self) -> StructType:

Examples
--------
>>> df = spark.createDataFrame(
... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+

Retrieve the schema of the current DataFrame.

>>> df.schema
StructType([StructField('age', LongType(), True),
            StructField('name', StringType(), True)])

Member: Let's remove the space at the end.

Member: Please avoid using tabs.

"""
if self._schema is None:
try:
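
A minimal sketch (not part of the diff), assuming a running `spark` session: createDataFrame infers LongType for Python ints, so reproducing an IntegerType schema requires passing the schema explicitly.

>>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType
>>> schema = StructType([StructField("age", IntegerType(), True),
...                      StructField("name", StringType(), True)])
>>> spark.createDataFrame([(14, "Tom"), (23, "Alice")], schema).schema
StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])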
@@ -731,29 +744,51 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+

Show only top 2 rows.

>>> df.show(2)
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
+---+-----+
only showing top 2 rows

Show :class:`DataFrame` where the maximum number of characters is 3.

Member: :class:`DataFrame`


>>> df.show(truncate=3)
+---+----+
|age|name|
+---+----+
| 14| Tom|
| 23| Ali|
| 16| Bob|
+---+----+

Show DataFrame vertically.

>>> df.show(vertical=True)
-RECORD 0-----
 age  | 14
 name | Tom
-RECORD 1-----
 age  | 23
 name | Alice
-RECORD 2-----
 age  | 16
 name | Bob
"""

if not isinstance(n, int) or isinstance(n, bool):
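
A short sketch of the remaining `show` parameters, assuming the `df` from the example above: `truncate` accepts a bool (the default caps cells at 20 characters) or an int giving the maximum width, and cells are left-aligned when truncation is off.

>>> df.show(truncate=False)
+---+-----+
|age|name |
+---+-----+
|14 |Tom  |
|23 |Alice|
|16 |Bob  |
+---+-----+
>>> df.show(n=1, vertical=True)
-RECORD 0-----
 age  | 14
 name | Tom
only showing top 1 row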
@@ -992,8 +1027,21 @@ def count(self) -> int:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+

Return the number of rows in the :class:`DataFrame`.

>>> df.count()
3
"""
return int(self._jdf.count())
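
A sketch contrasting this action with the grouped variant, assuming the `df` above: `DataFrame.count()` returns an int, while `groupBy(...).count()` stays lazy and returns a new DataFrame with a `count` column.

>>> df.count()
3
>>> df.groupBy("name").count()
DataFrame[name: string, count: bigint]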

@@ -1088,8 +1136,21 @@ def take(self, num: int) -> List[Row]:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+

Return the first 2 rows of the :class:`DataFrame`.

>>> df.take(2)
[Row(age=14, name='Tom'), Row(age=23, name='Alice')]
"""
return self.limit(num).collect()
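
As the implementation above shows, `take(num)` is just `limit(num).collect()`; a sketch assuming the `df` from the example, where `limit` alone stays lazy:

>>> df.limit(2)
DataFrame[age: bigint, name: string]
>>> df.limit(2).collect()
[Row(age=14, name='Tom'), Row(age=23, name='Alice')]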

@@ -1115,8 +1176,18 @@ def tail(self, num: int) -> List[Row]:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.tail(2)
[Row(age=23, name='Alice'), Row(age=16, name='Bob')]
"""
with SCCallSiteSync(self._sc):
sock_info = self._jdf.tailToPython(num)
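
A sketch of the contrast with `take`, assuming the `df` above; like `collect`, `tail` ships rows to the driver, so `num` should stay small.

>>> df.take(1)
[Row(age=14, name='Tom')]
>>> df.tail(1)
[Row(age=16, name='Bob')]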
@@ -1511,6 +1582,19 @@ def distinct(self) -> "DataFrame":

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (23, "Alice")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 23|Alice|
+---+-----+

Return the number of distinct rows in the :class:`DataFrame`.

>>> df.distinct().count()
2
"""
@@ -1722,8 +1806,17 @@ def dtypes(self) -> List[Tuple[str, str]]:

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"),
... (23, "Alice")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
+---+-----+
>>> df.dtypes
[('age', 'bigint'), ('name', 'string')]
"""
return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
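
As the one-liner above shows, `dtypes` is derived from `schema`; a sketch assuming the `df` from the example:

>>> [(f.name, f.dataType.simpleString()) for f in df.schema.fields]
[('age', 'bigint'), ('name', 'string')]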

@@ -3358,7 +3451,20 @@ def fillna(

Examples
--------
Fill all null values with 50 when the data type of the column is an integer.

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
| 10| 80|Alice|
| 5| null| Bob|
|null| null| Tom|
|null| null| null|
+----+------+-----+
>>> df.na.fill(50).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
| 10|    80|Alice|
|  5|    50|  Bob|
| 50|    50|  Tom|
| 50|    50| null|
+---+------+-----+

Fill all null values with ``False`` when the data type of the column is a boolean.

>>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
... (None, "Mallory", True)], ["age", "name", "spy"])
>>> df.show()
+----+-------+----+
| age| name| spy|
+----+-------+----+
| 10| Alice|null|
| 5| Bob|null|
|null|Mallory|true|
+----+-------+----+
>>> df.na.fill(False).show()
+----+-------+-----+
| age|   name|  spy|
+----+-------+-----+
|  10|  Alice|false|
|   5|    Bob|false|
|null|Mallory| true|
+----+-------+-----+

Fill all null values in the 'age' column with 50 and in the 'name' column with "unknown".

>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
| 10| 80|Alice|
| 5| null| Bob|
|null| null| Tom|
|null| null| null|
+----+------+-----+
>>> df.na.fill({'age': 50, 'name': 'unknown'}).show()
+---+------+-------+
|age|height|   name|
+---+------+-------+
| 10|    80|  Alice|
|  5|  null|    Bob|
| 50|  null|    Tom|
| 50|  null|unknown|
+---+------+-------+
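
A sketch of the optional `subset` argument, assuming the age/height/name `df` above; it restricts which columns are filled.

>>> df.na.fill(50, subset=["age"]).show()
+---+------+-----+
|age|height| name|
+---+------+-----+
| 10|    80|Alice|
|  5|  null|  Bob|
| 50|  null|  Tom|
| 50|  null| null|
+---+------+-----+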
@@ -3489,7 +3620,18 @@ def replace( # type: ignore[misc]

Examples
--------
>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
>>> df.show()
+----+------+-----+
| age|height| name|
+----+------+-----+
| 10| 80|Alice|
| 5| null| Bob|
|null| null| Tom|
|null| null| null|
+----+------+-----+
>>> df.na.replace(10, 20).show()
+----+------+-----+
| age|height| name|
+----+------+-----+
|  20|    80|Alice|
|   5|  null|  Bob|
|null|  null|  Tom|
|null|  null| null|
+----+------+-----+

Replace all instances of Alice with null.

>>> df.na.replace('Alice', None).show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|null|
|   5|  null| Bob|
|null|  null| Tom|
|null|  null|null|
+----+------+----+

Replace all instances of Alice with 'A' and Bob with 'B' in the name column.

>>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|   A|
|   5|  null|   B|
|null|  null| Tom|
|null|  null|null|
+----+------+----+
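
A sketch of the dict form of `replace`, assuming the `df` above: a `{old: new}` mapping can stand in for the two parallel lists.

>>> df.na.replace({'Alice': 'A', 'Bob': 'B'}, subset=['name']).show()
+----+------+----+
| age|height|name|
+----+------+----+
|  10|    80|   A|
|   5|  null|   B|
|null|  null| Tom|
|null|  null|null|
+----+------+----+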
@@ -4064,11 +4200,31 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc]

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.drop('age').collect()
[Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]

>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
Contributor: I think we don't need to create a new DataFrame here, since drop() doesn't remove the column in-place.

e.g.

>>> df.drop('age').show()
+-----+
| name|
+-----+
|  Tom|
|Alice|
|  Bob|
+-----+

>>> df.drop(df.age).show()
+-----+
| name|
+-----+
|  Tom|
|Alice|
|  Bob|
+-----+

>>> df.drop(df.age).collect()
[Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]

>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
Contributor (Author): I am not sure what these 3 inner joins do exactly. I don't see an instantiation of df2 anywhere. What should I do with these 3 examples?

Member: I think it's showing a common example that joins and then drops the join key.

[Row(age=5, height=85, name='Bob')]
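
A possible sketch answering the question above, with a hypothetical `df2` so the join example is self-contained; `df2` and its values are assumptions, not from the PR, and row order from `collect()` may vary.

>>> df2 = spark.createDataFrame([(80, "Tom"), (85, "Bob")], ["height", "name"])
>>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
[Row(age=14, height=80, name='Tom'), Row(age=16, height=85, name='Bob')]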
@@ -4100,7 +4256,7 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
Parameters
----------
cols : str
new column names
new column names. The length of the list needs to be the same as the number
of columns in the initial :class:`DataFrame`.
Member: @Transurgeon mind running the ./dev/lint-python script and fixing the line length, etc.?

Contributor (Author): Yes, will do, sorry about that.

Contributor: Seems like it exceeds the 100-character limit, which violates the flake8 rule.

starting flake8 test...
flake8 checks failed:
./python/pyspark/sql/dataframe.py:4250:101: E501 line too long (128 > 100 characters)
        """
        Returns a best-effort snapshot of the files that compose this :class:`DataFrame`.
        This method simply asks each constituent BaseRelation for its respective files and
        takes the union of all results. Depending on the source relations, this may not find
        all input files. Duplicates are removed.

        new column names. The length of the list needs to be the same as the number of columns in the initial :class:`DataFrame`

        .. versionadded:: 3.1.0

        Returns
        -------
        list
            List of file paths.

        Examples
        --------
        >>> df = spark.read.load("examples/src/main/resources/people.json", format="json")
        >>> len(df.inputFiles())
        1
        """

                                                                                        ^
1     E501 line too long (128 > 100 characters)

Contributor: You can run dev/lint-python to check if the static analysis passes.

Contributor (Author): Done.


Returns
-------
@@ -4109,8 +4265,18 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":

Examples
--------
>>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
... (16, "Bob")], ["age", "name"])
>>> df.show()
+---+-----+
|age| name|
+---+-----+
| 14|  Tom|
| 23|Alice|
| 16|  Bob|
+---+-----+
>>> df.toDF('f1', 'f2').collect()
[Row(f1=14, f2='Tom'), Row(f1=23, f2='Alice'), Row(f1=16, f2='Bob')]
"""
jdf = self._jdf.toDF(self._jseq(cols))
return DataFrame(jdf, self.sparkSession)
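
A sketch of the positional nature of `toDF`, assuming the `df` above: every column must be renamed at once, whereas `withColumnRenamed` targets a single column.

>>> df.toDF('f1', 'f2').columns
['f1', 'f2']
>>> df.withColumnRenamed('age', 'f1').columns
['f1', 'name']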