Skip to content

Commit

Permalink
Run doctests. (#9815)
Browse files Browse the repository at this point in the history
This PR adds doctests and resolves #9513. Running the doctests uncovered several issues, all of which have now been resolved:

- [x] #9821
- [x] #9822
- [x] #9823
- [x] #9824
- [x] #9825
- [x] #9826
- [x] #9827
- [x] #9828 (worked around by deleting the affected doctests)
- [x] #9829

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #9815
  • Loading branch information
bdice authored Jan 15, 2022
1 parent 8c8d6ef commit e24fa8f
Show file tree
Hide file tree
Showing 17 changed files with 402 additions and 165 deletions.
63 changes: 63 additions & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,66 @@

__version__ = get_versions()["version"]
del get_versions

# Explicit public API of the package: the names bound by
# ``from cudf import *``. Entries are kept in ASCII-sorted order
# (capitalized names first, then lowercase functions and submodules).
__all__ = [
"BaseIndex",
"CategoricalDtype",
"CategoricalIndex",
"DataFrame",
"DateOffset",
"DatetimeIndex",
"Decimal32Dtype",
"Decimal64Dtype",
"Float32Index",
"Float64Index",
"GenericIndex",
"Grouper",
"Index",
"Int16Index",
"Int32Index",
"Int64Index",
"Int8Index",
"IntervalDtype",
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NA",
"RangeIndex",
"Scalar",
"Series",
"StringIndex",
"StructDtype",
"TimedeltaIndex",
"UInt16Index",
"UInt32Index",
"UInt64Index",
"UInt8Index",
"api",
"concat",
"cut",
"date_range",
"factorize",
"from_dataframe",
"from_dlpack",
"from_pandas",
"get_dummies",
"interval_range",
"isclose",
"melt",
"merge",
"merge_sorted",
"pivot",
"read_avro",
"read_csv",
"read_feather",
"read_hdf",
"read_json",
"read_orc",
"read_parquet",
"read_text",
"set_allocator",
"testing",
"to_datetime",
"to_numeric",
"unstack",
]
4 changes: 3 additions & 1 deletion python/cudf/cudf/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf.api import types
from cudf.api import extensions, types

__all__ = ["extensions", "types"]
6 changes: 6 additions & 0 deletions python/cudf/cudf/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@
register_index_accessor,
register_series_accessor,
)

# Explicit public API of this module: the names bound by a star-import.
__all__ = [
"register_dataframe_accessor",
"register_index_accessor",
"register_series_accessor",
]
14 changes: 8 additions & 6 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def fillna(self, value, downcast=None):
>>> import cudf
>>> index = cudf.Index([1, 2, None, 4])
>>> index
Int64Index([1, 2, null, 4], dtype='int64')
Int64Index([1, 2, <NA>, 4], dtype='int64')
>>> index.fillna(3)
Int64Index([1, 2, 3, 4], dtype='int64')
"""
Expand Down Expand Up @@ -553,7 +553,7 @@ def to_pandas(self):
>>> type(idx.to_pandas())
<class 'pandas.core.indexes.numeric.Int64Index'>
>>> type(idx)
<class 'cudf.core.index.GenericIndex'>
<class 'cudf.core.index.Int64Index'>
"""
return pd.Index(self._values.to_pandas(), name=self.name)

Expand Down Expand Up @@ -942,6 +942,7 @@ def is_interval(self):
Examples
--------
>>> import cudf
>>> import pandas as pd
>>> idx = cudf.from_pandas(
... pd.Index([pd.Interval(left=0, right=5),
... pd.Interval(left=5, right=10)])
Expand Down Expand Up @@ -1105,15 +1106,16 @@ def join(
Examples
--------
>>> import cudf
>>> lhs = cudf.DataFrame(
... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b']
... ).index
>>> lhs = cudf.DataFrame({
... "a": [2, 3, 1],
... "b": [3, 4, 2],
... }).set_index(['a', 'b']).index
>>> lhs
MultiIndex([(2, 3),
(3, 4),
(1, 2)],
names=['a', 'b'])
>>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index
>>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index
>>> rhs
Int64Index([1, 4, 3], dtype='int64', name='a')
>>> lhs.join(rhs, how='inner')
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ class CategoricalAccessor(ColumnMethods):
--------
>>> s = cudf.Series([1,2,3], dtype='category')
>>> s
>>> s
0 1
1 2
2 3
Expand Down
122 changes: 76 additions & 46 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,12 +463,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
... [(t0+ timedelta(seconds=x)) for x in range(n)])
... })
>>> df
id datetimes
0 0 2018-10-07T12:00:00.000
1 1 2018-10-07T12:00:01.000
2 2 2018-10-07T12:00:02.000
3 3 2018-10-07T12:00:03.000
4 4 2018-10-07T12:00:04.000
id datetimes
0 0 2018-10-07 12:00:00
1 1 2018-10-07 12:00:01
2 2 2018-10-07 12:00:02
3 3 2018-10-07 12:00:03
4 4 2018-10-07 12:00:04
Build DataFrame via list of rows as tuples:
Expand Down Expand Up @@ -984,23 +984,34 @@ def __getitem__(self, arg):
Examples
--------
>>> df = DataFrame([('a', list(range(20))),
... ('b', list(range(20))),
... ('c', list(range(20)))])
>>> df[:4] # get first 4 rows of all columns
>>> df = cudf.DataFrame({
... 'a': list(range(10)),
... 'b': list(range(10)),
... 'c': list(range(10)),
... })
Get first 4 rows of all columns.
>>> df[:4]
a b c
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
>>> df[-5:] # get last 5 rows of all columns
a b c
15 15 15 15
16 16 16 16
17 17 17 17
18 18 18 18
19 19 19 19
>>> df[['a', 'c']] # get columns a and c
Get last 5 rows of all columns.
>>> df[-5:]
a b c
5 5 5 5
6 6 6 6
7 7 7 7
8 8 8 8
9 9 9 9
Get columns a and c.
>>> df[['a', 'c']]
a c
0 0 0
1 1 1
Expand All @@ -1012,8 +1023,17 @@ def __getitem__(self, arg):
7 7 7
8 8 8
9 9 9
>>> df[[True, False, True, False]] # mask the entire dataframe,
# returning the rows specified in the boolean mask
Return the rows specified in the boolean mask.
>>> df[[True, False, True, False, True,
... False, True, False, True, False]]
a b c
0 0 0 0
2 2 2 2
4 4 4 4
6 6 6 6
8 8 8 8
"""
if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple):
return self._get_columns_by_label(arg, downcast=True)
Expand Down Expand Up @@ -1261,10 +1281,12 @@ def memory_usage(self, index=True, deep=False):
object 40000
bool 5000
dtype: int64
Use a Categorical for efficient storage of an object-dtype column with
many repeated values.
>>> df['object'].astype('category').memory_usage(deep=True)
5048
5008
"""
if deep:
warnings.warn(
Expand Down Expand Up @@ -2225,11 +2247,11 @@ def reindex(
3 3 13.0
4 4 14.0
>>> df_new
key val sum
0 0 10.0 NaN
3 3 13.0 NaN
4 4 14.0 NaN
5 -1 NaN NaN
key val sum
0 0 10.0 <NA>
3 3 13.0 <NA>
4 4 14.0 <NA>
5 <NA> <NA> <NA>
"""

if labels is None and index is None and columns is None:
Expand Down Expand Up @@ -3701,10 +3723,10 @@ def query(self, expr, local_dict=None):
Examples
--------
>>> import cudf
>>> a = ('a', [1, 2, 2])
>>> b = ('b', [3, 4, 5])
>>> df = cudf.DataFrame([a, b])
>>> df = cudf.DataFrame({
... "a": [1, 2, 2],
... "b": [3, 4, 5],
... })
>>> expr = "(a == 2 and b == 4) or (b == 3)"
>>> df.query(expr)
a b
Expand All @@ -3720,8 +3742,8 @@ def query(self, expr, local_dict=None):
>>> df['datetimes'] = data
>>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d')
>>> df.query('datetimes==@search_date')
datetimes
1 2018-10-08T00:00:00.000
datetimes
1 2018-10-08
Using local_dict:
Expand All @@ -3732,9 +3754,9 @@ def query(self, expr, local_dict=None):
>>> df['datetimes'] = data
>>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d')
>>> df.query('datetimes==@search_date',
... local_dict={'search_date':search_date2})
datetimes
1 2018-10-08T00:00:00.000
... local_dict={'search_date': search_date2})
datetimes
1 2018-10-08
"""
# can't use `annotate` decorator here as we inspect the calling
# environment.
Expand Down Expand Up @@ -4189,18 +4211,23 @@ def info(
dtypes: float64(1), int64(1), object(1)
memory usage: 130.0+ bytes
Pipe output of DataFrame.info to buffer instead of sys.stdout,
get buffer content and writes to a text file:
Pipe output of DataFrame.info to a buffer instead of sys.stdout and
print buffer contents:
>>> import io
>>> buffer = io.StringIO()
>>> df.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w",
... encoding="utf-8") as f:
... f.write(s)
...
369
>>> print(buffer.getvalue())
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 int_col 5 non-null int64
1 text_col 5 non-null object
2 float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 130.0+ bytes
The `memory_usage` parameter enables deep introspection mode, which is
especially useful for big DataFrames and for fine-tuning memory optimization:
Expand Down Expand Up @@ -5761,7 +5788,7 @@ def stack(self, level=-1, dropna=True):
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a':[0,1,3], 'b':[1,2,4]})
>>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]})
>>> df.stack()
0 a 0
b 1
Expand Down Expand Up @@ -6084,8 +6111,11 @@ def explode(self, column, ignore_index=False):
Examples
--------
>>> import cudf
>>> cudf.DataFrame(
{"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]})
>>> df = cudf.DataFrame({
... "a": [[1, 2, 3], [], None, [4, 5]],
... "b": [11, 22, 33, 44],
... })
>>> df
a b
0 [1, 2, 3] 11
1 [] 22
Expand Down
11 changes: 6 additions & 5 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,9 +1256,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
--------
>>> import cudf
>>> import pandas as pd
>>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon',
... 'Parrot', 'Parrot'],
... 'Max Speed': [380., 370., 24., 26.]})
>>> df = cudf.DataFrame({
... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
... 'Max Speed': [380., 370., 24., 26.],
... })
>>> df
Animal Max Speed
0 Falcon 380.0
Expand All @@ -1272,10 +1273,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
Parrot 25.0
>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
... ['Captive', 'Wild', 'Captive', 'Wild']]
... ['Captive', 'Wild', 'Captive', 'Wild']]
>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
>>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]},
index=index)
... index=index)
>>> df
Max Speed
Animal Type
Expand Down
Loading

0 comments on commit e24fa8f

Please sign in to comment.