From 380149fdb2c09eff631b14172546deb0b873c5e4 Mon Sep 17 00:00:00 2001 From: William Zijie Date: Tue, 9 Aug 2022 00:09:51 -0400 Subject: [PATCH 1/7] adding self-contained examples for pyspark dataframe --- python/pyspark/sql/dataframe.py | 116 +++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 17 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 41ac701a332ac..436c143cd7e1b 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -798,8 +798,18 @@ def count(self) -> int: Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ + >>> df.count() - 2 + 3 """ return int(self._jdf.count()) @@ -1179,6 +1189,16 @@ def distinct(self) -> "DataFrame": Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 23|Alice| + +---+-----+ + >>> df.distinct().count() 2 """ @@ -1375,8 +1395,17 @@ def dtypes(self) -> List[Tuple[str, str]]: Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + +---+-----+ + >>> df.dtypes - [('age', 'int'), ('name', 'string')] + [('age', 'bigint'), ('name', 'string')] """ return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @@ -2743,7 +2772,17 @@ def fillna( Examples -------- - >>> df4.na.fill(50).show() + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df.show() + +----+------+-----+ + | age|height| name| + +----+------+-----+ + | 10| 80|Alice| + | 5| null| Bob| + |null| null| Tom| + |null| null| null| + +----+------+-----+ + >>> df.na.fill(50).show() +---+------+-----+ |age|height| name| +---+------+-----+ @@ -2753,7 +2792,16 @@ def fillna( | 50| 50| null| +---+------+-----+ - >>> df5.na.fill(False).show() + >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), (None, "Mallory", True)], ["age", "name", "spy"]) + >>> df.show() + +----+-------+----+ + | age| name| spy| + +----+-------+----+ + | 10| Alice|null| + | 5| Bob|null| + |null|Mallory|true| + +----+-------+----+ + >>> df.na.fill(False).show() +----+-------+-----+ | age| name| spy| +----+-------+-----+ @@ -2762,7 +2810,17 @@ def fillna( |null|Mallory| true| +----+-------+-----+ - >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show() + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df.show() + +----+------+-----+ + | age|height| name| + +----+------+-----+ + | 10| 80|Alice| + | 5| null| Bob| + |null| null| Tom| + |null| null| null| + +----+------+-----+ + >>> df.na.fill({'age': 50, 'name': 'unknown'}).show() +---+------+-------+ |age|height| name| +---+------+-------+ @@ -2869,7 +2927,17 @@ def replace( # type: ignore[misc] Examples -------- - >>> df4.na.replace(10, 20).show() + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df.show() + +----+------+-----+ + | age|height| name| + +----+------+-----+ + | 10| 80|Alice| + | 5| null| Bob| + |null| null| Tom| + |null| null| null| + +----+------+-----+ + >>> df.na.replace(10, 20).show() +----+------+-----+ | age|height| name| +----+------+-----+ @@ -2879,17 +2947,19 @@ def replace( # type: ignore[misc] |null| null| null| +----+------+-----+ - >>> df4.na.replace('Alice', None).show() - +----+------+----+ - | age|height|name| - +----+------+----+ - | 10| 80|null| - | 5| null| Bob| - |null| null| Tom| - |null| null|null| - +----+------+----+ + Replace all instances of Alice to null - >>> df4.na.replace({'Alice': None}).show() + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df.show() + +----+------+-----+ + | age|height| name| + +----+------+-----+ + | 10| 80|Alice| + | 5| null| Bob| + |null| null| Tom| + |null| null| null| + +----+------+-----+ + >>> df.na.replace('Alice', None).show() +----+------+----+ | age|height|name| +----+------+----+ @@ -2899,7 +2969,19 @@ def replace( # type: ignore[misc] |null| null|null| +----+------+----+ - >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() + Replace all instances of Alice to 'A' and Bob to 'B' under the name column + + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df.show() + +----+------+-----+ + | age|height| name| + +----+------+-----+ + | 10| 80|Alice| + | 5| null| Bob| + |null| null| Tom| + |null| null| null| + +----+------+-----+ + >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| +----+------+----+ From 1fed3caae18779bddba60c502445cc68772a8151 Mon Sep 17 00:00:00 2001 From: William Zijie Date: Tue, 9 Aug 2022 11:54:23 -0400 Subject: [PATCH 2/7] adding additional self-contained examples for dataframe API --- python/pyspark/sql/dataframe.py | 103 ++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 436c143cd7e1b..7b96717d5a88d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -363,9 +363,17 @@ def schema(self) -> StructType: Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ >>> df.schema - StructType([StructField('age', IntegerType(), True), - StructField('name', StringType(), True)]) + StructType([StructField('age', LongType(), True), StructField('name', StringType(), True)]) """ if self._schema is None: try: @@ -571,29 +579,41 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = Examples -------- - >>> df - DataFrame[age: int, name: string] + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| +---+-----+ - | 2|Alice| - | 5| Bob| + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ + >>> df.show(2) + +---+-----+ + |age| name| +---+-----+ + | 14| Tom| + | 23|Alice| + +---+-----+ + only showing top 2 rows >>> df.show(truncate=3) +---+----+ |age|name| +---+----+ - | 2| Ali| - | 5| Bob| + | 14| Tom| + | 23| Ali| + | 16| Bob| +---+----+ >>> df.show(vertical=True) -RECORD 0----- - age | 2 - name | Alice + age | 14 + name | Tom -RECORD 1----- - age | 5 - name | Bob + age | 23 + name | Alice + -RECORD 2----- + age | 16 + name | Bob """ if not isinstance(n, int) or isinstance(n, bool): @@ -872,8 +892,17 @@ def take(self, num: int) -> List[Row]: Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ >>> df.take(2) - [Row(age=2, name='Alice'), Row(age=5, name='Bob')] + [Row(age=14, name='Tom'), Row(age=23, name='Alice')] """ return self.limit(num).collect() @@ -888,8 +917,17 @@ def tail(self, num: int) -> List[Row]: Examples -------- - >>> df.tail(1) - [Row(age=5, name='Bob')] + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ + >>> df.tail(2) + [Row(age=23, name='Alice'), Row(age=16, name='Bob')] """ with SCCallSiteSync(self._sc): sock_info = self._jdf.tailToPython(num) @@ -3438,11 +3476,29 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ >>> df.drop('age').collect() - [Row(name='Alice'), Row(name='Bob')] + [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')] + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ >>> df.drop(df.age).collect() - [Row(name='Alice'), Row(name='Bob')] + [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')] >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() [Row(age=5, height=85, name='Bob')] @@ -3475,12 +3531,21 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame": Parameters ---------- cols : str - new column names + new column names. The length of the list needs to be the same as the number of columns in the initial DataFrame Examples -------- + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + | 16| Bob| + +---+-----+ >>> df.toDF('f1', 'f2').collect() - [Row(f1=2, f2='Alice'), Row(f1=5, f2='Bob')] + [Row(f1=14, f2='Tom'), Row(f1=23, f2='Alice'), Row(f1=16, f2='Bob')] """ jdf = self._jdf.toDF(self._jseq(cols)) return DataFrame(jdf, self.sparkSession) From 24c4769fe89c0108007500f360b33d9d9a8c519f Mon Sep 17 00:00:00 2001 From: William Zijie Date: Tue, 23 Aug 2022 15:47:21 -0400 Subject: [PATCH 3/7] splitting instantion of df to two lines to avoid overflow of example space --- python/pyspark/sql/dataframe.py | 57 ++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 7b96717d5a88d..6651074df3421 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -363,7 +363,8 @@ def schema(self) -> StructType: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -579,7 +580,8 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -818,7 +820,8 @@ def count(self) -> int: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -827,7 +830,6 @@ def count(self) -> int: | 23|Alice| | 16| Bob| +---+-----+ - >>> df.count() 3 """ @@ -892,7 +894,8 @@ def take(self, num: int) -> List[Row]: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -917,7 +920,8 @@ def tail(self, num: int) -> List[Row]: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1227,7 +1231,8 @@ def distinct(self) -> "DataFrame": Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (23, "Alice")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1236,7 +1241,6 @@ def distinct(self) -> "DataFrame": | 23|Alice| | 23|Alice| +---+-----+ - >>> df.distinct().count() 2 """ @@ -1433,7 +1437,8 @@ def dtypes(self) -> List[Tuple[str, str]]: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), + ... (23, "Alice")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1441,7 +1446,6 @@ def dtypes(self) -> List[Tuple[str, str]]: | 14| Tom| | 23|Alice| +---+-----+ - >>> df.dtypes [('age', 'bigint'), ('name', 'string')] """ @@ -2810,7 +2814,10 @@ def fillna( Examples -------- - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + Fill all null values with 50 when the data type of the column is an integer + + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -2830,7 +2837,10 @@ def fillna( | 50| 50| null| +---+------+-----+ - >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), (None, "Mallory", True)], ["age", "name", "spy"]) + Fill all null values with False when the data type of the column is a boolean + + >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), + ... (None, "Mallory", True)], ["age", "name", "spy"]) >>> df.show() +----+-------+----+ | age| name| spy| @@ -2848,7 +2858,10 @@ def fillna( |null|Mallory| true| +----+-------+-----+ - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column + + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -2965,7 +2978,8 @@ def replace( # type: ignore[misc] Examples -------- - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -2987,7 +3001,8 @@ def replace( # type: ignore[misc] Replace all instances of Alice to null - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -3009,7 +3024,8 @@ def replace( # type: ignore[misc] Replace all instances of Alice to 'A' and Bob to 'B' under the name column - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -3476,7 +3492,8 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -3488,7 +3505,8 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] >>> df.drop('age').collect() [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')] - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -3535,7 +3553,8 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame": Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| From 60605d37b1a097da27c85c40d4ea3d74cc78b357 Mon Sep 17 00:00:00 2001 From: William Zijie Zhang <89562186+Transurgeon@users.noreply.github.com> Date: Thu, 25 Aug 2022 18:28:06 -0400 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Hyukjin Kwon Co-authored-by: Qian.Sun --- python/pyspark/sql/dataframe.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 6651074df3421..f37e16a2de91f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -363,8 +363,8 @@ def schema(self) -> StructType: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + >>> df = spark.createDataFrame( + ... [(14, "Tom"), (23, "Alice"),(16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -895,7 +895,7 @@ def take(self, num: int) -> List[Row]: Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -921,7 +921,7 @@ def tail(self, num: int) -> List[Row]: Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1232,7 +1232,7 @@ def distinct(self) -> "DataFrame": Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (23, "Alice")], ["age", "name"]) + ... (23, "Alice")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1438,7 +1438,7 @@ def dtypes(self) -> List[Tuple[str, str]]: Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), - ... (23, "Alice")], ["age", "name"]) + ... (23, "Alice")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -2837,10 +2837,10 @@ def fillna( | 50| 50| null| +---+------+-----+ - Fill all null values with False when the data type of the column is a boolean + Fill all null values with ``False`` when the data type of the column is a boolean >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), - ... (None, "Mallory", True)], ["age", "name", "spy"]) + ... (None, "Mallory", True)], ["age", "name", "spy"]) >>> df.show() +----+-------+----+ | age| name| spy| @@ -2861,7 +2861,7 @@ def fillna( Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), - ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -2979,7 +2979,7 @@ def replace( # type: ignore[misc] Examples -------- >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), - ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() +----+------+-----+ | age|height| name| @@ -3549,7 +3549,7 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame": Parameters ---------- cols : str - new column names. The length of the list needs to be the same as the number of columns in the initial DataFrame + new column names. The length of the list needs to be the same as the number of columns in the initial :class:`DataFrame` Examples -------- From 1ce9604685efd05f91ce8cbc06e96a503c507e07 Mon Sep 17 00:00:00 2001 From: William Zijie Date: Thu, 25 Aug 2022 18:59:09 -0400 Subject: [PATCH 5/7] adding descriptions and removing duplicate dataframe initialisations --- python/pyspark/sql/dataframe.py | 56 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index f37e16a2de91f..ca0958ebaf542 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -361,6 +361,10 @@ def schema(self) -> StructType: .. versionadded:: 1.3.0 + Returns + ------- + :class:`StructType` + Examples -------- >>> df = spark.createDataFrame( @@ -373,8 +377,12 @@ def schema(self) -> StructType: | 23|Alice| | 16| Bob| +---+-----+ + + Retrieve the schema of the current DataFrame. + >>> df.schema - StructType([StructField('age', LongType(), True), StructField('name', StringType(), True)]) + StructType([StructField('age', IntegerType(), True), + StructField('name', StringType(), True)]) """ if self._schema is None: try: @@ -581,7 +589,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -590,6 +598,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 23|Alice| | 16| Bob| +---+-----+ + + Show only top 2 rows. + >>> df.show(2) +---+-----+ |age| name| @@ -598,6 +609,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 23|Alice| +---+-----+ only showing top 2 rows + + Show DataFrame where the maximum number of characters is 3. + >>> df.show(truncate=3) +---+----+ |age|name| @@ -606,6 +620,9 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 23| Ali| | 16| Bob| +---+----+ + + Show DataFrame vertically. + >>> df.show(vertical=True) -RECORD 0----- age | 14 @@ -830,6 +847,9 @@ def count(self) -> int: | 23|Alice| | 16| Bob| +---+-----+ + + Return the number of rows in the :class:`DataFrame`. + >>> df.count() 3 """ @@ -904,6 +924,9 @@ def take(self, num: int) -> List[Row]: | 23|Alice| | 16| Bob| +---+-----+ + + Return the first 2 rows of the :class:`DataFrame`. + >>> df.take(2) [Row(age=14, name='Tom'), Row(age=23, name='Alice')] """ @@ -1241,6 +1264,9 @@ def distinct(self) -> "DataFrame": | 23|Alice| | 23|Alice| +---+-----+ + + Return the number of distinct rows in the :class:`DataFrame` + >>> df.distinct().count() 2 """ @@ -2817,9 +2843,9 @@ def fillna( Fill all null values with 50 when the data type of the column is an integer >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), - ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) + ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() - +----+------+-----+ + +----+------+-----+ | age|height| name| +----+------+-----+ | 10| 80|Alice| @@ -3001,17 +3027,6 @@ def replace( # type: ignore[misc] Replace all instances of Alice to null - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), - ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) - >>> df.show() - +----+------+-----+ - | age|height| name| - +----+------+-----+ - | 10| 80|Alice| - | 5| null| Bob| - |null| null| Tom| - |null| null| null| - +----+------+-----+ >>> df.na.replace('Alice', None).show() +----+------+----+ | age|height|name| @@ -3024,17 +3039,6 @@ def replace( # type: ignore[misc] Replace all instances of Alice to 'A' and Bob to 'B' under the name column - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), - ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) - >>> df.show() - +----+------+-----+ - | age|height| name| - +----+------+-----+ - | 10| 80|Alice| - | 5| null| Bob| - |null| null| Tom| - |null| null| null| - +----+------+-----+ >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| From dfa5726a18f31af66fb48acec7f5927c0144b917 Mon Sep 17 00:00:00 2001 From: William Zijie Date: Thu, 25 Aug 2022 21:18:28 -0400 Subject: [PATCH 6/7] running python lint and reformat --- python/pyspark/sql/dataframe.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 19718cf96fb92..07d065193968d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -488,11 +488,11 @@ def schema(self) -> StructType: | 16| Bob| +---+-----+ - Retrieve the schema of the current DataFrame. + Retrieve the schema of the current DataFrame. >>> df.schema - StructType([StructField('age', IntegerType(), True), - StructField('name', StringType(), True)]) + StructType([StructField('age', IntegerType(), True), + StructField('name', StringType(), True)]) """ if self._schema is None: try: @@ -755,7 +755,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 16| Bob| +---+-----+ - Show only top 2 rows. + Show only top 2 rows. >>> df.show(2) +---+-----+ @@ -766,7 +766,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = +---+-----+ only showing top 2 rows - Show DataFrame where the maximum number of characters is 3. + Show DataFrame where the maximum number of characters is 3. >>> df.show(truncate=3) +---+----+ @@ -777,7 +777,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 16| Bob| +---+----+ - Show DataFrame vertically. + Show DataFrame vertically. >>> df.show(vertical=True) -RECORD 0----- @@ -1038,7 +1038,7 @@ def count(self) -> int: | 16| Bob| +---+-----+ - Return the number of rows in the :class:`DataFrame`. + Return the number of rows in the :class:`DataFrame`. >>> df.count() 3 @@ -1147,7 +1147,7 @@ def take(self, num: int) -> List[Row]: | 16| Bob| +---+-----+ - Return the first 2 rows of the :class:`DataFrame`. + Return the first 2 rows of the :class:`DataFrame`. >>> df.take(2) [Row(age=14, name='Tom'), Row(age=23, name='Alice')] @@ -1583,7 +1583,7 @@ def distinct(self) -> "DataFrame": Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (23, "Alice")], ["age", "name"]) + ... (23, "Alice")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1593,7 +1593,7 @@ def distinct(self) -> "DataFrame": | 23|Alice| +---+-----+ - Return the number of distinct rows in the :class:`DataFrame` + Return the number of distinct rows in the :class:`DataFrame` >>> df.distinct().count() 2 @@ -3453,10 +3453,10 @@ def fillna( -------- Fill all null values with 50 when the data type of the column is an integer - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() - +----+------+-----+ + +----+------+-----+ | age|height| name| +----+------+-----+ | 10| 80|Alice| @@ -3474,7 +3474,7 @@ def fillna( | 50| 50| null| +---+------+-----+ - Fill all null values with ``False`` when the data type of the column is a boolean + Fill all null values with ``False`` when the data type of the column is a boolean >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), ... (None, "Mallory", True)], ["age", "name", "spy"]) @@ -3495,8 +3495,8 @@ def fillna( |null|Mallory| true| +----+-------+-----+ - Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column - + Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column + >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() @@ -3654,7 +3654,7 @@ def replace( # type: ignore[misc] +----+------+----+ Replace all instances of Alice to 'A' and Bob to 'B' under the name column - + >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() +----+------+----+ | age|height|name| From aadf3d0ae291e3dbf636e7e01b322def3198bdea Mon Sep 17 00:00:00 2001 From: William Zijie Date: Thu, 25 Aug 2022 22:22:22 -0400 Subject: [PATCH 7/7] making some additional small detail changes --- python/pyspark/sql/dataframe.py | 70 +++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 07d065193968d..33c1a6340fbc4 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -745,7 +745,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -766,7 +766,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = +---+-----+ only showing top 2 rows - Show DataFrame where the maximum number of characters is 3. + Show :class:`DataFrame` where the maximum number of characters is 3. >>> df.show(truncate=3) +---+----+ @@ -777,7 +777,7 @@ def show(self, n: int = 20, truncate: Union[bool, int] = True, vertical: bool = | 16| Bob| +---+----+ - Show DataFrame vertically. + Show :class:`DataFrame` vertically. >>> df.show(vertical=True) -RECORD 0----- @@ -1028,7 +1028,7 @@ def count(self) -> int: Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1806,8 +1806,8 @@ def dtypes(self) -> List[Tuple[str, str]]: Examples -------- - >>> df = spark.createDataFrame([(14, "Tom"), - ... (23, "Alice")], ["age", "name"]) + >>> df = spark.createDataFrame( + ... [(14, "Tom"), (23, "Alice"),(16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name| @@ -1815,6 +1815,9 @@ def dtypes(self) -> List[Tuple[str, str]]: | 14| Tom| | 23|Alice| +---+-----+ + + Return the name of each column along with their respective data types + >>> df.dtypes [('age', 'bigint'), ('name', 'string')] """ @@ -3451,8 +3454,6 @@ def fillna( Examples -------- - Fill all null values with 50 when the data type of the column is an integer - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() @@ -3464,6 +3465,9 @@ def fillna( |null| null| Tom| |null| null| null| +----+------+-----+ + + Fill all null values with 50 when the data type of the column is an Integer + >>> df.na.fill(50).show() +---+------+-----+ |age|height| name| @@ -3474,8 +3478,6 @@ def fillna( | 50| 50| null| +---+------+-----+ - Fill all null values with ``False`` when the data type of the column is a boolean - >>> df = spark.createDataFrame([(10, "Alice", None), (5, "Bob", None), ... (None, "Mallory", True)], ["age", "name", "spy"]) >>> df.show() @@ -3486,6 +3488,9 @@ def fillna( | 5| Bob|null| |null|Mallory|true| +----+-------+----+ + + Fill all null values with ``False`` when the data type of the column is a boolean + >>> df.na.fill(False).show() +----+-------+-----+ | age| name| spy| @@ -3495,8 +3500,6 @@ def fillna( |null|Mallory| true| +----+-------+-----+ - Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column - >>> df = spark.createDataFrame([(10, 80, "Alice"), (5, None, "Bob"), ... (None, None, "Tom"), (None, None, None)], ["age", "height", "name"]) >>> df.show() @@ -3508,6 +3511,9 @@ def fillna( |null| null| Tom| |null| null| null| +----+------+-----+ + + Fill all null values in the 'age' column to 50 and "unknown" in the 'name' column + >>> df.na.fill({'age': 50, 'name': 'unknown'}).show() +---+------+-------+ |age|height| name| @@ -3631,6 +3637,9 @@ def replace( # type: ignore[misc] |null| null| Tom| |null| null| null| +----+------+-----+ + + Replace all instances of the value 10 to the value 20 + >>> df.na.replace(10, 20).show() +----+------+-----+ | age|height| name| @@ -4210,22 +4219,22 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] | 23|Alice| | 16| Bob| +---+-----+ - >>> df.drop('age').collect() - [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')] - - >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) - >>> df.show() - +---+-----+ - |age| name| - +---+-----+ - | 14| Tom| - | 23|Alice| - | 16| Bob| - +---+-----+ - >>> df.drop(df.age).collect() - [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')] - + >>> df.drop('age').show() + +-----+ + | name| + +-----+ + | Tom| + |Alice| + | Bob| + +-----+ + >>> df.drop(df.age).show() + +-----+ + | name| + +-----+ + | Tom| + |Alice| + | Bob| + +-----+ >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() [Row(age=5, height=85, name='Bob')] @@ -4256,7 +4265,8 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame": Parameters ---------- cols : str - new column names. The length of the list needs to be the same as the number of columns in the initial :class:`DataFrame` + new column names. The length of the list needs to be the same as the number + of columns in the initial :class:`DataFrame` Returns ------- @@ -4266,7 +4276,7 @@ def toDF(self, *cols: "ColumnOrName") -> "DataFrame": Examples -------- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), - ... (16, "Bob")], ["age", "name"]) + ... (16, "Bob")], ["age", "name"]) >>> df.show() +---+-----+ |age| name|