From 380149fdb2c09eff631b14172546deb0b873c5e4 Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Tue, 9 Aug 2022 00:09:51 -0400
Subject: [PATCH 1/7] adding self-contained examples for pyspark dataframe

---
 python/pyspark/sql/dataframe.py | 116 +++++++++++++++++++++++++++-----
 1 file changed, 99 insertions(+), 17 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 41ac701a332ac..436c143cd7e1b 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -798,8 +798,18 @@ def count(self) -> int:
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
+
         >>> df.count()
-        2
+        3
         """
         return int(self._jdf.count())
 
@@ -1179,6 +1189,16 @@ def distinct(self) -> "DataFrame":
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 23|Alice|
+        +---+-----+
+
         >>> df.distinct().count()
         2
         """
@@ -1375,8 +1395,17 @@ def dtypes(self) -> List[Tuple[str, str]]:
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        +---+-----+
+
         >>> df.dtypes
-        [('age', 'int'), ('name', 'string')]
+        [('age', 'bigint'), ('name', 'string')]
         """
         return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
 
@@ -2743,7 +2772,17 @@ def fillna(
 
         Examples
         --------
-        >>> df4.na.fill(50).show()
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df.show()
+        +----+------+-----+
+        | age|height| name|
+        +----+------+-----+
+        |  10|    80|Alice|
+        |   5|  null|  Bob|
+        |null|  null|  Tom|
+        |null|  null| null|
+        +----+------+-----+
+        >>> df.na.fill(50).show()
         +---+------+-----+
         |age|height| name|
         +---+------+-----+
@@ -2753,7 +2792,16 @@ def fillna(
         | 50|    50| null|
         +---+------+-----+
 
-        >>> df5.na.fill(False).show()
+        >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), (None, "Mallory", True)], ["age", "name", "spy"])
+        >>> df.show()
+        +----+-------+----+
+        | age|   name| spy|
+        +----+-------+----+
+        |  10|  Alice|null|
+        |   5|    Bob|null|
+        |null|Mallory|true|
+        +----+-------+----+
+        >>> df.na.fill(False).show()
         +----+-------+-----+
         | age|   name|  spy|
         +----+-------+-----+
@@ -2762,7 +2810,17 @@ def fillna(
         |null|Mallory| true|
         +----+-------+-----+
 
-        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df.show()
+        +----+------+-----+
+        | age|height| name|
+        +----+------+-----+
+        |  10|    80|Alice|
+        |   5|  null|  Bob|
+        |null|  null|  Tom|
+        |null|  null| null|
+        +----+------+-----+
+        >>> df.na.fill({'age': 50, 'name': 'unknown'}).show()
         +---+------+-------+
         |age|height|   name|
         +---+------+-------+
@@ -2869,7 +2927,17 @@ def replace(  # type: ignore[misc]
 
         Examples
         --------
-        >>> df4.na.replace(10, 20).show()
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df.show()
+        +----+------+-----+
+        | age|height| name|
+        +----+------+-----+
+        |  10|    80|Alice|
+        |   5|  null|  Bob|
+        |null|  null|  Tom|
+        |null|  null| null|
+        +----+------+-----+
+        >>> df.na.replace(10, 20).show()
         +----+------+-----+
         | age|height| name|
         +----+------+-----+
@@ -2879,17 +2947,19 @@ def replace(  # type: ignore[misc]
         |null|  null| null|
         +----+------+-----+
 
-        >>> df4.na.replace('Alice', None).show()
-        +----+------+----+
-        | age|height|name|
-        +----+------+----+
-        |  10|    80|null|
-        |   5|  null| Bob|
-        |null|  null| Tom|
-        |null|  null|null|
-        +----+------+----+
+        Replace all instances of Alice to null
 
-        >>> df4.na.replace({'Alice': None}).show()
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df.show()
+        +----+------+-----+
+        | age|height| name|
+        +----+------+-----+
+        |  10|    80|Alice|
+        |   5|  null|  Bob|
+        |null|  null|  Tom|
+        |null|  null| null|
+        +----+------+-----+
+        >>> df.na.replace('Alice', None).show()
         +----+------+----+
         | age|height|name|
         +----+------+----+
@@ -2899,7 +2969,19 @@ def replace(  # type: ignore[misc]
         |null|  null|null|
         +----+------+----+
 
-        >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+        Replace all instances of Alice to 'A' and Bob to 'B' under the name column
+        
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df.show()
+        +----+------+-----+
+        | age|height| name|
+        +----+------+-----+
+        |  10|    80|Alice|
+        |   5|  null|  Bob|
+        |null|  null|  Tom|
+        |null|  null| null|
+        +----+------+-----+
+        >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
         +----+------+----+
         | age|height|name|
         +----+------+----+

From 1fed3caae18779bddba60c502445cc68772a8151 Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Tue, 9 Aug 2022 11:54:23 -0400
Subject: [PATCH 2/7] adding additional self-contained examples for dataframe
 API

---
 python/pyspark/sql/dataframe.py | 103 ++++++++++++++++++++++++++------
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 436c143cd7e1b..7b96717d5a88d 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -363,9 +363,17 @@ def schema(self) -> StructType:
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
         >>> df.schema
-        StructType([StructField('age', IntegerType(), True),
-                    StructField('name', StringType(), True)])
+        StructType([StructField('age', LongType(), True), StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
@@ -571,29 +579,41 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
 
         Examples
         --------
-        >>> df
-        DataFrame[age: int, name: string]
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
         +---+-----+
-        |  2|Alice|
-        |  5|  Bob|
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
+        >>> df.show(2)
+        +---+-----+
+        |age| name|
         +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        +---+-----+
+        only showing top 2 rows
         >>> df.show(truncate=3)
         +---+----+
         |age|name|
         +---+----+
-        |  2| Ali|
-        |  5| Bob|
+        | 14| Tom|
+        | 23| Ali|
+        | 16| Bob|
         +---+----+
         >>> df.show(vertical=True)
         -RECORD 0-----
-         age  | 2
-         name | Alice
+        age  | 14
+        name | Tom
         -RECORD 1-----
-         age  | 5
-         name | Bob
+        age  | 23
+        name | Alice
+        -RECORD 2-----
+        age  | 16
+        name | Bob
         """
 
         if not isinstance(n, int) or isinstance(n, bool):
@@ -872,8 +892,17 @@ def take(self, num: int) -> List[Row]:
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
         >>> df.take(2)
-        [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
+        [Row(age=14, name='Tom'), Row(age=23, name='Alice')]
         """
         return self.limit(num).collect()
 
@@ -888,8 +917,17 @@ def tail(self, num: int) -> List[Row]:
 
         Examples
         --------
-        >>> df.tail(1)
-        [Row(age=5, name='Bob')]
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
+        >>> df.tail(2)
+        [Row(age=23, name='Alice'), Row(age=16, name='Bob')]
         """
         with SCCallSiteSync(self._sc):
             sock_info = self._jdf.tailToPython(num)
@@ -3438,11 +3476,29 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
         >>> df.drop('age').collect()
-        [Row(name='Alice'), Row(name='Bob')]
+        [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]
 
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
         >>> df.drop(df.age).collect()
-        [Row(name='Alice'), Row(name='Bob')]
+        [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]
 
         >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
         [Row(age=5, height=85, name='Bob')]
@@ -3475,12 +3531,21 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
         Parameters
         ----------
         cols : str
-            new column names
+            new column names. The length of the list needs to be the same as the number of columns in the initial DataFrame
 
         Examples
         --------
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 14|  Tom|
+        | 23|Alice|
+        | 16|  Bob|
+        +---+-----+
         >>> df.toDF('f1', 'f2').collect()
-        [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')]
+        [Row(f1=14, f2='Tom'), Row(f1=23, f2='Alice'), Row(f1=16, f2='Bob')]
         """
         jdf = self._jdf.toDF(self._jseq(cols))
         return DataFrame(jdf, self.sparkSession)

From 24c4769fe89c0108007500f360b33d9d9a8c519f Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Tue, 23 Aug 2022 15:47:21 -0400
Subject: [PATCH 3/7] splitting instantiation of df to two lines to avoid overflow
 of example space

---
 python/pyspark/sql/dataframe.py | 57 ++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7b96717d5a88d..6651074df3421 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -363,7 +363,8 @@ def schema(self) -> StructType:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -579,7 +580,8 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -818,7 +820,8 @@ def count(self) -> int:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -827,7 +830,6 @@ def count(self) -> int:
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
-
         >>> df.count()
         3
         """
@@ -892,7 +894,8 @@ def take(self, num: int) -> List[Row]:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -917,7 +920,8 @@ def tail(self, num: int) -> List[Row]:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1227,7 +1231,8 @@ def distinct(self) -> "DataFrame":
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (23, "Alice")], ["age", "name"]) 
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1236,7 +1241,6 @@ def distinct(self) -> "DataFrame":
         | 23|Alice|
         | 23|Alice|
         +---+-----+
-
         >>> df.distinct().count()
         2
         """
@@ -1433,7 +1437,8 @@ def dtypes(self) -> List[Tuple[str, str]]:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"),
+        ... (23, "Alice")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1441,7 +1446,6 @@ def dtypes(self) -> List[Tuple[str, str]]:
         | 14|  Tom|
         | 23|Alice|
         +---+-----+
-
         >>> df.dtypes
         [('age', 'bigint'), ('name', 'string')]
         """
@@ -2810,7 +2814,10 @@ def fillna(
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        Fill all null values with 50 when the data type of the column is an integer
+
+	>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
+        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -2830,7 +2837,10 @@ def fillna(
         | 50|    50| null|
         +---+------+-----+
 
-        >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), (None, "Mallory", True)], ["age", "name", "spy"])
+	Fill all null values with False when the data type of the column is a boolean
+
+        >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
+        ... (None, "Mallory", True)], ["age", "name", "spy"])
         >>> df.show()
         +----+-------+----+
         | age|   name| spy|
@@ -2848,7 +2858,10 @@ def fillna(
         |null|Mallory| true|
         +----+-------+-----+
 
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+	Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column
+	
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
+        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -2965,7 +2978,8 @@ def replace(  # type: ignore[misc]
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
+        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -2987,7 +3001,8 @@ def replace(  # type: ignore[misc]
 
         Replace all instances of Alice to null
 
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), 
+        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -3009,7 +3024,8 @@ def replace(  # type: ignore[misc]
 
         Replace all instances of Alice to 'A' and Bob to 'B' under the name column
         
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
+        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -3476,7 +3492,8 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -3488,7 +3505,8 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
         >>> df.drop('age').collect()
         [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]
 
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -3535,7 +3553,8 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
+        ... (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|

From 60605d37b1a097da27c85c40d4ea3d74cc78b357 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <89562186+Transurgeon@users.noreply.github.com>
Date: Thu, 25 Aug 2022 18:28:06 -0400
Subject: [PATCH 4/7] Apply suggestions from code review

Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
Co-authored-by: Qian.Sun <qian.sun2020@gmail.com>
---
 python/pyspark/sql/dataframe.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 6651074df3421..f37e16a2de91f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -363,8 +363,8 @@ def schema(self) -> StructType:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"),(16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -895,7 +895,7 @@ def take(self, num: int) -> List[Row]:
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        ...     (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -921,7 +921,7 @@ def tail(self, num: int) -> List[Row]:
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        ...     (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1232,7 +1232,7 @@ def distinct(self) -> "DataFrame":
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (23, "Alice")], ["age", "name"]) 
+        ...     (23, "Alice")], ["age", "name"]) 
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1438,7 +1438,7 @@ def dtypes(self) -> List[Tuple[str, str]]:
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"),
-        ... (23, "Alice")], ["age", "name"])
+        ...     (23, "Alice")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -2837,10 +2837,10 @@ def fillna(
         | 50|    50| null|
         +---+------+-----+
 
-	Fill all null values with False when the data type of the column is a boolean
+	Fill all null values with ``False`` when the data type of the column is a boolean
 
         >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
-        ... (None, "Mallory", True)], ["age", "name", "spy"])
+        ...     (None, "Mallory", True)], ["age", "name", "spy"])
         >>> df.show()
         +----+-------+----+
         | age|   name| spy|
@@ -2861,7 +2861,7 @@ def fillna(
 	Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column
 	
         >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
-        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        ...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -2979,7 +2979,7 @@ def replace(  # type: ignore[misc]
         Examples
         --------
         >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
-        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        ...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
         +----+------+-----+
         | age|height| name|
@@ -3549,7 +3549,7 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
         Parameters
         ----------
         cols : str
-            new column names. The length of the list needs to be the same as the number of columns in the initial DataFrame
+            new column names. The length of the list needs to be the same as the number of columns in the initial :class:`DataFrame`
 
         Examples
         --------

From 1ce9604685efd05f91ce8cbc06e96a503c507e07 Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Thu, 25 Aug 2022 18:59:09 -0400
Subject: [PATCH 5/7] adding descriptions and removing duplicate dataframe
 initialisations

---
 python/pyspark/sql/dataframe.py | 56 ++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f37e16a2de91f..ca0958ebaf542 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -361,6 +361,10 @@ def schema(self) -> StructType:
 
         .. versionadded:: 1.3.0
 
+	Returns
+        -------
+        :class:`StructType`
+
         Examples
         --------
         >>> df = spark.createDataFrame(
@@ -373,8 +377,12 @@ def schema(self) -> StructType:
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
+
+	Retrieve the schema of the current DataFrame.
+
         >>> df.schema
-        StructType([StructField('age', LongType(), True), StructField('name', StringType(), True)])
+        StructType([StructField('age', IntegerType(), True), 
+		    StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
@@ -581,7 +589,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        ... 	(16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -590,6 +598,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
+
+	Show only top 2 rows.
+
         >>> df.show(2)
         +---+-----+
         |age| name|
@@ -598,6 +609,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 23|Alice|
         +---+-----+
         only showing top 2 rows
+
+	Show DataFrame where the maximum number of characters is 3.
+
         >>> df.show(truncate=3)
         +---+----+
         |age|name|
@@ -606,6 +620,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 23| Ali|
         | 16| Bob|
         +---+----+
+
+	Show DataFrame vertically.
+
         >>> df.show(vertical=True)
         -RECORD 0-----
         age  | 14
@@ -830,6 +847,9 @@ def count(self) -> int:
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
+
+	Return the number of rows in the :class:`DataFrame`.
+
         >>> df.count()
         3
         """
@@ -904,6 +924,9 @@ def take(self, num: int) -> List[Row]:
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
+
+	Return the first 2 rows of the :class:`DataFrame`.
+
         >>> df.take(2)
         [Row(age=14, name='Tom'), Row(age=23, name='Alice')]
         """
@@ -1241,6 +1264,9 @@ def distinct(self) -> "DataFrame":
         | 23|Alice|
         | 23|Alice|
         +---+-----+
+
+	Return the number of distinct rows in the :class:`DataFrame`
+
         >>> df.distinct().count()
         2
         """
@@ -2817,9 +2843,9 @@ def fillna(
         Fill all null values with 50 when the data type of the column is an integer
 
 	>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
-        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
+        ... 	(None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
-        +----+------+-----+
+        +----+------+-----+ 
         | age|height| name|
         +----+------+-----+
         |  10|    80|Alice|
@@ -3001,17 +3027,6 @@ def replace(  # type: ignore[misc]
 
         Replace all instances of Alice to null
 
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), 
-        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
-        >>> df.show()
-        +----+------+-----+
-        | age|height| name|
-        +----+------+-----+
-        |  10|    80|Alice|
-        |   5|  null|  Bob|
-        |null|  null|  Tom|
-        |null|  null| null|
-        +----+------+-----+
         >>> df.na.replace('Alice', None).show()
         +----+------+----+
         | age|height|name|
@@ -3024,17 +3039,6 @@ def replace(  # type: ignore[misc]
 
         Replace all instances of Alice to 'A' and Bob to 'B' under the name column
         
-        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
-        ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
-        >>> df.show()
-        +----+------+-----+
-        | age|height| name|
-        +----+------+-----+
-        |  10|    80|Alice|
-        |   5|  null|  Bob|
-        |null|  null|  Tom|
-        |null|  null| null|
-        +----+------+-----+
         >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
         +----+------+----+
         | age|height|name|

From dfa5726a18f31af66fb48acec7f5927c0144b917 Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Thu, 25 Aug 2022 21:18:28 -0400
Subject: [PATCH 6/7] running python lint and reformat

---
 python/pyspark/sql/dataframe.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 19718cf96fb92..07d065193968d 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -488,11 +488,11 @@ def schema(self) -> StructType:
         | 16|  Bob|
         +---+-----+
 
-	Retrieve the schema of the current DataFrame.
+        Retrieve the schema of the current DataFrame.
 
         >>> df.schema
-        StructType([StructField('age', IntegerType(), True), 
-		    StructField('name', StringType(), True)])
+        StructType([StructField('age', IntegerType(), True),
+                    StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
@@ -755,7 +755,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 16|  Bob|
         +---+-----+
 
-	Show only top 2 rows.
+        Show only top 2 rows.
 
         >>> df.show(2)
         +---+-----+
@@ -766,7 +766,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         +---+-----+
         only showing top 2 rows
 
-	Show DataFrame where the maximum number of characters is 3.
+        Show DataFrame where the maximum number of characters is 3.
 
         >>> df.show(truncate=3)
         +---+----+
@@ -777,7 +777,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 16| Bob|
         +---+----+
 
-	Show DataFrame vertically.
+        Show DataFrame vertically.
 
         >>> df.show(vertical=True)
         -RECORD 0-----
@@ -1038,7 +1038,7 @@ def count(self) -> int:
         | 16|  Bob|
         +---+-----+
 
-	Return the number of rows in the :class:`DataFrame`.
+        Return the number of rows in the :class:`DataFrame`.
 
         >>> df.count()
         3
@@ -1147,7 +1147,7 @@ def take(self, num: int) -> List[Row]:
         | 16|  Bob|
         +---+-----+
 
-	Return the first 2 rows of the :class:`DataFrame`.
+        Return the first 2 rows of the :class:`DataFrame`.
 
         >>> df.take(2)
         [Row(age=14, name='Tom'), Row(age=23, name='Alice')]
@@ -1583,7 +1583,7 @@ def distinct(self) -> "DataFrame":
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ...     (23, "Alice")], ["age", "name"]) 
+        ...     (23, "Alice")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1593,7 +1593,7 @@ def distinct(self) -> "DataFrame":
         | 23|Alice|
         +---+-----+
 
-	Return the number of distinct rows in the :class:`DataFrame`
+        Return the number of distinct rows in the :class:`DataFrame`
 
         >>> df.distinct().count()
         2
@@ -3453,10 +3453,10 @@ def fillna(
         --------
         Fill all null values with 50 when the data type of the column is an integer
 
-	>>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
+        >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
         ... 	(None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
-        +----+------+-----+ 
+        +----+------+-----+
         | age|height| name|
         +----+------+-----+
         |  10|    80|Alice|
@@ -3474,7 +3474,7 @@ def fillna(
         | 50|    50| null|
         +---+------+-----+
 
-	Fill all null values with ``False`` when the data type of the column is a boolean
+        Fill all null values with ``False`` when the data type of the column is a boolean
 
         >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
         ...     (None, "Mallory", True)], ["age", "name", "spy"])
@@ -3495,8 +3495,8 @@ def fillna(
         |null|Mallory| true|
         +----+-------+-----+
 
-	Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column
-	
+        Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column
+
         >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
         ...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
@@ -3654,7 +3654,7 @@ def replace(  # type: ignore[misc]
         +----+------+----+
 
         Replace all instances of Alice to 'A' and Bob to 'B' under the name column
-        
+
         >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
         +----+------+----+
         | age|height|name|

From aadf3d0ae291e3dbf636e7e01b322def3198bdea Mon Sep 17 00:00:00 2001
From: William Zijie <peterzijie@gmail.com>
Date: Thu, 25 Aug 2022 22:22:22 -0400
Subject: [PATCH 7/7] making some additional small detail changes

---
 python/pyspark/sql/dataframe.py | 70 +++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 07d065193968d..33c1a6340fbc4 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -745,7 +745,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... 	(16, "Bob")], ["age", "name"])
+        ...     (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -766,7 +766,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         +---+-----+
         only showing top 2 rows
 
-        Show DataFrame where the maximum number of characters is 3.
+        Show :class:`DataFrame` where the maximum number of characters is 3.
 
         >>> df.show(truncate=3)
         +---+----+
@@ -777,7 +777,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool =
         | 16| Bob|
         +---+----+
 
-        Show DataFrame vertically.
+        Show :class:`DataFrame` vertically.
 
         >>> df.show(vertical=True)
         -RECORD 0-----
@@ -1028,7 +1028,7 @@ def count(self) -> int:
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        ...     (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1806,8 +1806,8 @@ def dtypes(self) -> List[Tuple[str, str]]:
 
         Examples
         --------
-        >>> df = spark.createDataFrame([(14, "Tom"),
-        ...     (23, "Alice")], ["age", "name"])
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -1815,6 +1815,9 @@ def dtypes(self) -> List[Tuple[str, str]]:
         | 14|  Tom|
         | 23|Alice|
         +---+-----+
+
+        Return the name of each column along with its data type
+
         >>> df.dtypes
         [('age', 'bigint'), ('name', 'string')]
         """
@@ -3451,8 +3454,6 @@ def fillna(
 
         Examples
         --------
-        Fill all null values with 50 when the data type of the column is an integer
-
         >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
         ... 	(None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
@@ -3464,6 +3465,9 @@ def fillna(
         |null|  null|  Tom|
         |null|  null| null|
         +----+------+-----+
+
+        Fill all null values with 50 when the data type of the column is an Integer
+
         >>> df.na.fill(50).show()
         +---+------+-----+
         |age|height| name|
@@ -3474,8 +3478,6 @@ def fillna(
         | 50|    50| null|
         +---+------+-----+
 
-        Fill all null values with ``False`` when the data type of the column is a boolean
-
         >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None),
         ...     (None, "Mallory", True)], ["age", "name", "spy"])
         >>> df.show()
@@ -3486,6 +3488,9 @@ def fillna(
         |   5|    Bob|null|
         |null|Mallory|true|
         +----+-------+----+
+
+        Fill all null values with ``False`` when the data type of the column is a boolean
+
         >>> df.na.fill(False).show()
         +----+-------+-----+
         | age|   name|  spy|
@@ -3495,8 +3500,6 @@ def fillna(
         |null|Mallory| true|
         +----+-------+-----+
 
-        Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column
-
         >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"),
         ...     (None, None, "Tom"), (None, None, None)], ["age", "height", "name"])
         >>> df.show()
@@ -3508,6 +3511,9 @@ def fillna(
         |null|  null|  Tom|
         |null|  null| null|
         +----+------+-----+
+
+        Fill null values with 50 in the 'age' column and with "unknown" in the 'name' column
+
         >>> df.na.fill({'age': 50, 'name': 'unknown'}).show()
         +---+------+-------+
         |age|height|   name|
@@ -3631,6 +3637,9 @@ def replace(  # type: ignore[misc]
         |null|  null|  Tom|
         |null|  null| null|
         +----+------+-----+
+
+        Replace all instances of the value 10 with the value 20
+
         >>> df.na.replace(10, 20).show()
         +----+------+-----+
         | age|height| name|
@@ -4210,22 +4219,22 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
         | 23|Alice|
         | 16|  Bob|
         +---+-----+
-        >>> df.drop('age').collect()
-        [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]
-
-        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
-        >>> df.show()
-        +---+-----+
-        |age| name|
-        +---+-----+
-        | 14|  Tom|
-        | 23|Alice|
-        | 16|  Bob|
-        +---+-----+
-        >>> df.drop(df.age).collect()
-        [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]
-
+        >>> df.drop('age').show()
+        +-----+
+        | name|
+        +-----+
+        |  Tom|
+        |Alice|
+        |  Bob|
+        +-----+
+        >>> df.drop(df.age).show()
+        +-----+
+        | name|
+        +-----+
+        |  Tom|
+        |Alice|
+        |  Bob|
+        +-----+
         >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
         [Row(age=5, height=85, name='Bob')]
 
@@ -4256,7 +4265,8 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
         Parameters
         ----------
         cols : str
-            new column names. The length of the list needs to be the same as the number of columns in the initial :class:`DataFrame`
+            new column names. The length of the list needs to be the same as the number
+            of columns in the initial :class:`DataFrame`
 
         Returns
         -------
@@ -4266,7 +4276,7 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame":
         Examples
         --------
         >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),
-        ... (16, "Bob")], ["age", "name"])
+        ...     (16, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|