"""
Helper verbs
"""
from .operators import DataOperator
from .one_table_verbs import select, group_by
# Public names of this module, i.e. what ``from <module> import *`` exports.
# The verbs are listed in families: each family has an ``_all``, ``_at``
# and ``_if`` variant.
__all__ = ['call', 'tally', 'count', 'add_tally', 'add_count',
'arrange_all', 'arrange_at', 'arrange_if',
'create_all', 'create_at', 'create_if',
'group_by_all', 'group_by_at', 'group_by_if',
'mutate_all', 'mutate_at', 'mutate_if',
'query_all', 'query_at', 'query_if',
'rename_all', 'rename_at', 'rename_if',
'select_all', 'select_at', 'select_if',
'summarize_all', 'summarize_at', 'summarize_if',
# Aliases
'summarise_all', 'summarise_at', 'summarise_if',
'transmute_all', 'transmute_at', 'transmute_if',
]
# "No upper bound" sentinel. It is the default ``n_functions`` limit of the
# ``_all``, ``_if`` and ``_at`` base classes; since it is infinity, the check
# ``len(functions) > n_functions`` can never be true while it is in effect.
MANY = float('inf')
class call(DataOperator):
    """
    Call external function or dataframe method

    This is a special verb; it turns regular functions and
    dataframe instance methods into verb instances that you
    can pipe to. It reduces the times one needs to break out
    of the piping workflow.

    Parameters
    ----------
    func : callable or str
        A function that accepts a dataframe as the first argument.
        Dataframe methods are specified using strings and
        preferably they should start with a period,
        e.g. ``'.reset_index'``
    *args : tuple
        Second, third, fourth, ... arguments to ``func``
    **kwargs : dict
        Keyword arguments to ``func``

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'A': {0: 'a', 1: 'b', 2: 'c'},
    ...     'B': {0: 1, 1: 3, 2: 5},
    ...     'C': {0: 2, 1: 4, 2: np.nan}
    ... })
    >>> df
       A  B    C
    0  a  1  2.0
    1  b  3  4.0
    2  c  5  NaN

    Using an external function

    >>> df >> call(pd.melt)
      variable value
    0        A     a
    1        A     b
    2        A     c
    3        B     1
    4        B     3
    5        B     5
    6        C     2
    7        C     4
    8        C   NaN

    An external function with arguments

    >>> df >> call(pd.melt, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    A method on the dataframe

    >>> df >> call('.dropna', axis=1)
       A  B
    0  a  1
    1  b  3
    2  c  5

    >>> (df
    ...  >> call(pd.melt)
    ...  >> query('variable != "B"')
    ...  >> call('.reset_index', drop=True)
    ...  )
      variable value
    0        A     a
    1        A     b
    2        A     c
    3        C     2
    4        C     4
    5        C   NaN
    """

    def __init__(self, func, *args, **kwargs):
        # The verb only records what to call and with which extra
        # arguments; nothing is invoked at construction time.
        self.func = func
        self.args = args
        self.kwargs = kwargs
class tally(DataOperator):
    """
    Tally observations by group

    ``tally`` is a convenient wrapper for summarise that will
    either call ``n`` or ``sum(n)`` depending on whether you're
    tallying for the first time, or re-tallying.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending
        order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import tally, group_by, summarize
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    Without groups it is one large group

    >>> df >> tally()
       n
    0  6

    Sum of the weights

    >>> df >> tally('w')
       n
    0  9

    With groups

    >>> df >> group_by('y') >> tally()
       y  n
    0  a  3
    1  b  3

    With groups and weights

    >>> df >> group_by('y') >> tally('w')
       y  n
    0  a  3
    1  b  6

    Applying the weights to a column

    >>> df >> group_by('y') >> tally('x*w')
       y   n
    0  a   9
    1  b  24

    You can do that with :class:`~plydata.verbs.summarize`

    >>> df >> group_by('y') >> summarize(n='sum(x*w)')
       y   n
    0  a   9
    1  b  24
    """

    def __init__(self, weights=None, sort=False):
        # Called first, before any state is set.
        # NOTE(review): presumably records the caller's environment so
        # that string weights such as 'x*w' can be evaluated later --
        # confirm in DataOperator.
        self.set_env_from_verb_init()
        self.weights = weights
        self.sort = sort
class count(group_by):
    """
    Count observations by group

    ``count`` is a convenient wrapper for summarise that will
    either call ``n`` or ``sum(n)`` depending on whether you're
    tallying for the first time, or re-tallying. Similar to
    :class:`tally`, but it does the :class:`~plydata.verbs.group_by`
    for you.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : str, list
        Columns to group by.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending
        order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import count, group_by, summarize
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    Without groups it is one large group

    >>> df >> count()
       n
    0  6

    Sum of the weights

    >>> df >> count(weights='w')
       n
    0  9

    With groups

    >>> df >> count('y')
       y  n
    0  a  3
    1  b  3

    With groups and weights

    >>> df >> count('y', weights='w')
       y  n
    0  a  3
    1  b  6

    Applying the weights to a column

    >>> df >> count('y', weights='x*w')
       y   n
    0  a   9
    1  b  24

    You can do that with :class:`~plydata.verbs.summarize`

    >>> df >> group_by('y') >> summarize(n='sum(x*w)')
       y   n
    0  a   9
    1  b  24
    """

    def __init__(self, *args, weights=None, sort=False):
        # Must run before the parent initialiser is invoked; keep this
        # order. NOTE(review): presumably captures the caller's
        # environment -- confirm in DataOperator.
        self.set_env_from_verb_init()
        # The positional arguments are the grouping columns.
        super().__init__(*args)
        # NOTE(review): ``add_`` is a ``group_by`` attribute; presumably
        # it makes the new groups add to any existing ones -- confirm in
        # one_table_verbs.group_by.
        self.add_ = True
        self.weights = weights
        self.sort = sort
class add_tally(tally):
    """
    Add column with tally of items in each group

    Similar to :class:`tally`, but it adds a column and does
    not collapse the groups.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending
        order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    Without groups it is one large group

    >>> df >> add_tally()
       x  y  w  n
    0  1  a  1  6
    1  2  b  2  6
    2  3  a  1  6
    3  4  b  2  6
    4  5  a  1  6
    5  6  b  2  6

    Sum of the weights

    >>> df >> add_tally('w')
       x  y  w  n
    0  1  a  1  9
    1  2  b  2  9
    2  3  a  1  9
    3  4  b  2  9
    4  5  a  1  9
    5  6  b  2  9

    With groups

    >>> df >> group_by('y') >> add_tally()
    groups: ['y']
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  3
    2  3  a  1  3
    3  4  b  2  3
    4  5  a  1  3
    5  6  b  2  3

    With groups and weights

    >>> df >> group_by('y') >> add_tally('w')
    groups: ['y']
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  6
    2  3  a  1  3
    3  4  b  2  6
    4  5  a  1  3
    5  6  b  2  6

    Applying the weights to a column

    >>> df >> group_by('y') >> add_tally('x*w')
    groups: ['y']
       x  y  w   n
    0  1  a  1   9
    1  2  b  2  24
    2  3  a  1   9
    3  4  b  2  24
    4  5  a  1   9
    5  6  b  2  24

    Add tally is equivalent to using :func:`sum` or ``n()``
    in :class:`~plydata.verbs.define`.

    >>> df >> group_by('y') >> define(n='sum(x*w)')
    groups: ['y']
       x  y  w   n
    0  1  a  1   9
    1  2  b  2  24
    2  3  a  1   9
    3  4  b  2  24
    4  5  a  1   9
    5  6  b  2  24

    >>> df >> group_by('y') >> define(n='n()')
    groups: ['y']
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  3
    2  3  a  1  3
    3  4  b  2  3
    4  5  a  1  3
    5  6  b  2  3

    Which is the same result as
    :py:`df >> group_by('y') >> add_tally()` above.

    See Also
    --------
    :class:`add_count`
    """
class add_count(count):
    """
    Add column with number of items in each group

    Similar to :class:`count`, but it adds a column and does
    not collapse the groups. It is also a shortcut of
    :class:`add_tally` that does the grouping.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : str, list
        Columns to group by.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending
        order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    Without groups it is one large group

    >>> df >> add_count()
       x  y  w  n
    0  1  a  1  6
    1  2  b  2  6
    2  3  a  1  6
    3  4  b  2  6
    4  5  a  1  6
    5  6  b  2  6

    Sum of the weights

    >>> df >> add_count(weights='w')
       x  y  w  n
    0  1  a  1  9
    1  2  b  2  9
    2  3  a  1  9
    3  4  b  2  9
    4  5  a  1  9
    5  6  b  2  9

    With groups

    >>> df >> add_count('y')
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  3
    2  3  a  1  3
    3  4  b  2  3
    4  5  a  1  3
    5  6  b  2  3

    >>> df >> group_by('y') >> add_count()
    groups: ['y']
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  3
    2  3  a  1  3
    3  4  b  2  3
    4  5  a  1  3
    5  6  b  2  3

    With groups and weights

    >>> df >> add_count('y', weights='w')
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  6
    2  3  a  1  3
    3  4  b  2  6
    4  5  a  1  3
    5  6  b  2  6

    Applying the weights to a column

    >>> df >> add_count('y', weights='x*w')
       x  y  w   n
    0  1  a  1   9
    1  2  b  2  24
    2  3  a  1   9
    3  4  b  2  24
    4  5  a  1   9
    5  6  b  2  24

    You can do that with :class:`add_tally`

    >>> df >> group_by('y') >> add_tally('x*w') >> ungroup()
       x  y  w   n
    0  1  a  1   9
    1  2  b  2  24
    2  3  a  1   9
    3  4  b  2  24
    4  5  a  1   9
    5  6  b  2  24

    See Also
    --------
    :class:`add_tally`
    """
class _all(DataOperator):
    """
    Base class for *_all verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        If omitted, a single identity function is used.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Raises
    ------
    ValueError
        If more functions are given than the class allows
        (``n_functions``).
    """
    selector = '_all'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, functions=None, *args, **kwargs):
        # Normalise ``functions`` to either a tuple or a dict. A dict is
        # kept as given; a lone string or callable is wrapped; any other
        # iterable is materialised into a tuple.
        if functions is None:
            functions = (lambda x: x, )
        elif isinstance(functions, str) or callable(functions):
            functions = (functions,)
        elif not isinstance(functions, dict):
            functions = tuple(functions)

        n = len(functions)
        if n > self.n_functions:
            raise ValueError(
                "{} expected {} function(s) got {}".format(
                    self.__class__.__name__, self.n_functions, n
                )
            )

        self.set_env_from_verb_init()
        self.functions = functions
        self.args = args
        self.kwargs = kwargs
class _if(DataOperator):
    """
    Base class for *_if verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        If omitted, a single identity function is used.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Raises
    ------
    ValueError
        If more functions are given than the class allows
        (``n_functions``).
    """
    selector = '_if'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, predicate, functions=None, *args, **kwargs):
        # Normalise ``functions`` to either a tuple or a dict. A dict is
        # kept as given; a lone string or callable is wrapped; any other
        # iterable is materialised into a tuple.
        if functions is None:
            functions = (lambda x: x, )
        elif isinstance(functions, str) or callable(functions):
            functions = (functions,)
        elif not isinstance(functions, dict):
            functions = tuple(functions)

        n = len(functions)
        if n > self.n_functions:
            raise ValueError(
                "{} expected {} function(s) got {}".format(
                    self.__class__.__name__, self.n_functions, n
                )
            )

        self.set_env_from_verb_init()
        # The predicate is stored as given (callable or shortcut string);
        # it is not resolved or validated here.
        self.predicate = predicate
        self.functions = functions
        self.args = args
        self.kwargs = kwargs
class _at(select):
    """
    Base class for *_at verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or list or str or dict
        Names of columns in dataframe. If a tuple or list, they should
        be names of columns; a single string is treated as one column
        name. If a :class:`dict`, the keys must be among:

        - startswith : str or tuple, optional
            All column names that start with this string will be included.
        - endswith : str or tuple, optional
            All column names that end with this string will be included.
        - contains : str or tuple, optional
            All column names that contain this string will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled regex
            pattern will be included. A tuple can be used to match
            multiple regexes.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        If omitted, a single identity function is used.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Raises
    ------
    TypeError
        If ``names`` is not a tuple, list, str or dict.
    ValueError
        If more functions are given than the class allows
        (``n_functions``).
    """
    selector = '_at'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, names, functions=None, *args, **kwargs):
        # Sort out the arguments to select: explicit names become its
        # positional arguments, a dict becomes its keyword arguments.
        if isinstance(names, (tuple, list)):
            args_select = names
            kwargs_select = {}
        elif isinstance(names, str):
            args_select = (names,)
            kwargs_select = {}
        elif isinstance(names, dict):
            args_select = tuple()
            kwargs_select = names
        else:
            raise TypeError(
                "Unexpected type for the names specification.")

        # Normalise ``functions`` to either a tuple or a dict. A dict is
        # kept as given; a lone string or callable is wrapped; any other
        # iterable is materialised into a tuple.
        if functions is None:
            functions = (lambda x: x, )
        elif isinstance(functions, str) or callable(functions):
            functions = (functions,)
        elif not isinstance(functions, dict):
            functions = tuple(functions)

        n = len(functions)
        if n > self.n_functions:
            raise ValueError(
                "{} expected {} function(s) got {}".format(
                    self.__class__.__name__, self.n_functions, n
                )
            )

        # Keep this call ahead of the parent initialiser.
        self.set_env_from_verb_init()
        super().__init__(*args_select, **kwargs_select)
        self.functions = functions
        self.args = args
        self.kwargs = kwargs
class arrange_all(_all):
    """
    Arrange by all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns before they are sorted:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        Note that the functions do not change the data, they only
        affect the sorting.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arranging in ascending order.

    >>> df >> arrange_all()
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    Arranging in descending order.

    >>> df >> arrange_all(pd.Series.rank, ascending=False)
      alpha beta theta  x  y   z
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    5     b    q     e  6  1  12
    2     a    b     e  3  4  11
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9

    Notes
    -----
    Do not use functions that change the order of the values in the
    array. Such functions are most likely the wrong candidates,
    they corrupt the data. Use function(s) that return values that
    can be sorted.
    """

    def __init__(self, functions=None, *args, **kwargs):
        # Invoked here, in the outermost initialiser, before delegating
        # to the base class (which invokes it again); keep this order.
        self.set_env_from_verb_init()
        super().__init__(functions, *args, **kwargs)
class arrange_if(_if):
    """
    Arrange by all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns before they are sorted:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        Note that the functions do not change the data, they only
        affect the sorting.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arranging by the columns with strings in ascending order.

    >>> df >> arrange_if('is_string')
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    Arranging by the columns with strings in descending order.

    >>> df >> arrange_if('is_string', pd.Series.rank, ascending=False)
      alpha beta theta  x  y   z
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    5     b    q     e  6  1  12
    2     a    b     e  3  4  11
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9

    It is easier to sort by only the numeric columns in descending
    order.

    >>> df >> arrange_if('is_numeric', np.negative)
      alpha beta theta  x  y   z
    5     b    q     e  6  1  12
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    2     a    b     e  3  4  11
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7

    Notes
    -----
    Do not use functions that change the order of the values in the
    array. Such functions are most likely the wrong candidates,
    they corrupt the data. Use function(s) that return values that
    can be sorted.
    """
class arrange_at(_at):
    """
    Arrange by specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be names
        of columns. If a :class:`dict`, the keys must be among:

        - startswith : str or tuple, optional
            All column names that start with this string will be included.
        - endswith : str or tuple, optional
            All column names that end with this string will be included.
        - contains : str or tuple, optional
            All column names that contain this string will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled regex
            pattern will be included. A tuple can be used to match
            multiple regexes.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns before they are sorted:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

        Note that the functions do not change the data, they only
        affect the sorting.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arrange by explicitly naming the columns to arrange by.
    This is not much different from :class:`~plydata.verbs.arrange`.

    >>> df >> arrange_at(('alpha', 'z'))
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Arrange by dynamically selecting the columns to arrange
    by. Here the selection is *beta* and *theta*.

    >>> df >> arrange_at(dict(contains='eta'))
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    In descending order.

    >>> (df
    ...  >> arrange_at(
    ...      dict(contains='eta'),
    ...      pd.Series.rank, ascending=False)
    ... )
      alpha beta theta  x  y   z
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    5     b    q     e  6  1  12
    2     a    b     e  3  4  11
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9

    Notes
    -----
    Do not use functions that change the order of the values in the
    array. Such functions are most likely the wrong candidates,
    they corrupt the data. Use function(s) that return values that
    can be sorted.
    """

    def __init__(self, names, functions=None, *args, **kwargs):
        # Invoked here, in the outermost initialiser, before delegating
        # to the base class (which invokes it again); keep this order.
        self.set_env_from_verb_init()
        super().__init__(names, functions, *args, **kwargs)
class create_all(_all):
    """
    Create a new dataframe with all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Create a new dataframe by doubling the values of the input frame.

    >>> def double(s):
    ...     return s + s
    >>> df >> create_all(double)
      alpha beta theta   x   y   z
    0    aa   bb    cc   2  12  14
    1    aa   aa    dd   4  10  18
    2    aa   bb    ee   6   8  22
    3    bb   rr    cc   8   6  16
    4    bb   uu    dd  10   4  20
    5    bb   qq    ee  12   2  24

    Convert from centimetres to inches.

    >>> def inch(col, decimals=0):
    ...     return np.round(col/2.54, decimals)
    >>> def feet(col, decimals=0):
    ...     return np.round(col/30.48, decimals)
    >>> df >> select('x', 'y', 'z') >> create_all((inch, feet), decimals=2)
       x_inch  y_inch  z_inch  x_feet  y_feet  z_feet
    0    0.39    2.36    2.76    0.03    0.20    0.23
    1    0.79    1.97    3.54    0.07    0.16    0.30
    2    1.18    1.57    4.33    0.10    0.13    0.36
    3    1.57    1.18    3.15    0.13    0.10    0.26
    4    1.97    0.79    3.94    0.16    0.07    0.33
    5    2.36    0.39    4.72    0.20    0.03    0.39

    Group columns are always included, but they do not count towards
    the matched columns.

    >>> (df
    ...  >> select('x', 'y', 'z')
    ...  >> group_by('x')
    ...  >> create_all((inch, feet), decimals=2))
    groups: ['x']
       x  y_inch  z_inch  y_feet  z_feet
    0  1    2.36    2.76    0.20    0.23
    1  2    1.97    3.54    0.16    0.30
    2  3    1.57    4.33    0.13    0.36
    3  4    1.18    3.15    0.10    0.26
    4  5    0.79    3.94    0.07    0.33
    5  6    0.39    4.72    0.03    0.39
    """
class create_if(_if):
    """
    Create a new dataframe with columns selected by a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Create a new dataframe by doubling selected column values of the
    input frame. ``'is_integer'`` is a shortcut to
    :py:`pdtypes.is_integer_dtype`.

    >>> def double(s):
    ...     return s + s
    >>> df >> create_if('is_integer', double)
        x   y   z
    0   2  12  14
    1   4  10  18
    2   6   8  22
    3   8   6  16
    4  10   4  20
    5  12   2  24

    Convert from centimetres to inches.

    >>> def inch(col, decimals=0):
    ...     return np.round(col/2.54, decimals)
    >>> def feet(col, decimals=0):
    ...     return np.round(col/30.48, decimals)
    >>> df >> create_if('is_integer', (inch, feet), decimals=2)
       x_inch  y_inch  z_inch  x_feet  y_feet  z_feet
    0    0.39    2.36    2.76    0.03    0.20    0.23
    1    0.79    1.97    3.54    0.07    0.16    0.30
    2    1.18    1.57    4.33    0.10    0.13    0.36
    3    1.57    1.18    3.15    0.13    0.10    0.26
    4    1.97    0.79    3.94    0.16    0.07    0.33
    5    2.36    0.39    4.72    0.20    0.03    0.39

    Group columns are always included, but they do not count towards
    the matched columns.

    >>> (df
    ...  >> group_by('x')
    ...  >> create_if('is_integer', (inch, feet), decimals=2))
    groups: ['x']
       x  y_inch  z_inch  y_feet  z_feet
    0  1    2.36    2.76    0.20    0.23
    1  2    1.97    3.54    0.16    0.30
    2  3    1.57    4.33    0.13    0.36
    3  4    1.18    3.15    0.10    0.26
    4  5    0.79    3.94    0.07    0.33
    5  6    0.39    4.72    0.03    0.39

    Selecting columns that match a predicate.

    >>> df >> create_if('is_integer')
       x  y   z
    0  1  6   7
    1  2  5   9
    2  3  4  11
    3  4  3   8
    4  5  2  10
    5  6  1  12
    """
[docs]class create_at(_at):
"""
Create dataframe with specific columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
names : tuple or dict
Names of columns in dataframe. If a tuple, they should be names
of columns. If a :class:`dict`, they keys must be in.
- startswith : str or tuple, optional
All column names that start with this string will be included.
- endswith : str or tuple, optional
All column names that end with this string will be included.
- contains : str or tuple, optional
All column names that contain with this string will be included.
- matches : str or regex or tuple, optional
All column names that match the string or a compiled regex pattern
will be included. A tuple can be used to match multiple regexs.
- drop : bool, optional
If ``True``, the selection is inverted. The unspecified/unmatched
columns are returned instead. Default is ``False``.
functions : callable or tuple or dict or str
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Create a new dataframe by doubling selected column values of the input
frame.
>>> def double(s):
... return s + s
>>> df >> create_at(('x', 'y', 'z'), double)
x y z
0 2 12 14
1 4 10 18
2 6 8 22
3 8 6 16
4 10 4 20
5 12 2 24
Convert from centimeters to inches and feet.
>>> def inch(col, decimals=0):
... return np.round(col/2.54, decimals)
>>> def feet(col, decimals=0):
... return np.round(col/30.48, decimals)
>>> df >> create_at(('x', 'y', 'z'), (inch, feet), decimals=2)
x_inch y_inch z_inch x_feet y_feet z_feet
0 0.39 2.36 2.76 0.03 0.20 0.23
1 0.79 1.97 3.54 0.07 0.16 0.30
2 1.18 1.57 4.33 0.10 0.13 0.36
3 1.57 1.18 3.15 0.13 0.10 0.26
4 1.97 0.79 3.94 0.16 0.07 0.33
5 2.36 0.39 4.72 0.20 0.03 0.39
Group columns are always included and if listed in the
selection, the functions act on them.
>>> (df
... >> group_by('x')
... >> create_at(('x', 'y', 'z'), (inch, feet), decimals=2))
groups: ['x']
x x_inch y_inch z_inch x_feet y_feet z_feet
0 1 0.39 2.36 2.76 0.03 0.20 0.23
1 2 0.79 1.97 3.54 0.07 0.16 0.30
2 3 1.18 1.57 4.33 0.10 0.13 0.36
3 4 1.57 1.18 3.15 0.13 0.10 0.26
4 5 1.97 0.79 3.94 0.16 0.07 0.33
5 6 2.36 0.39 4.72 0.20 0.03 0.39
Group columns that are not listed are not acted upon by the
functions.
>>> (df
... >> group_by('x')
... >> create_at(dict(matches=r'x|y|z'), (inch, feet), decimals=2))
groups: ['x']
x y_inch z_inch y_feet z_feet
0 1 2.36 2.76 0.20 0.23
1 2 1.97 3.54 0.16 0.30
2 3 1.57 4.33 0.13 0.36
3 4 1.18 3.15 0.10 0.26
4 5 0.79 3.94 0.07 0.33
5 6 0.39 4.72 0.03 0.39
"""
[docs]class group_by_all(_all):
"""
Group by all columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
functions : callable or tuple or dict or str
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Grouping by all the columns
>>> df >> group_by_all()
groups: ['alpha', 'beta', 'theta', 'x', 'y', 'z']
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
2 a b e 3 4 11
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
Grouping by all columns created by a function.
Same output as above, but now all the columns are
categorical
>>> result = df >> group_by_all(pd.Categorical)
>>> result
groups: ['alpha', 'beta', 'theta', 'x', 'y', 'z']
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
2 a b e 3 4 11
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
>>> result['x']
0 1
1 2
2 3
3 4
4 5
5 6
Name: x, dtype: category
Categories (6, int64): [1, 2, 3, 4, 5, 6]
If you apply more than one function or provide a postfix,
the original columns are retained.
>>> (df
... >> select('x', 'y', 'z')
... >> group_by_all(dict(cat=pd.Categorical)))
groups: ['x_cat', 'y_cat', 'z_cat']
x y z x_cat y_cat z_cat
0 1 6 7 1 6 7
1 2 5 9 2 5 9
2 3 4 11 3 4 11
3 4 3 8 4 3 8
4 5 2 10 5 2 10
5 6 1 12 6 1 12
"""
def __init__(self, functions=None, *args, **kwargs):
    """
    Initialise the verb with ``functions`` being optional

    ``functions`` defaults to ``None``; it is forwarded, together
    with ``args`` and ``kwargs``, unchanged to the parent class
    initialiser.
    """
    # NOTE(review): presumably records the caller's environment so
    # that string expressions can be evaluated later -- the helper is
    # defined outside this section, confirm there.
    self.set_env_from_verb_init()
    super().__init__(functions, *args, **kwargs)
[docs]class group_by_if(_if):
"""
Group by selected columns that are true for a predicate
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
predicate : function
A predicate function to be applied to the columns of the
dataframe. Good candidates for predicate functions are
those that check the type of the column. Such functions
are available in :mod:`pandas.api.types`, for example
:func:`pandas.api.types.is_numeric_dtype`.
For convenience, you can reference the ``is_*_dtype``
functions with shorter strings::
'is_bool' # pandas.api.types.is_bool_dtype
'is_categorical' # pandas.api.types.is_categorical_dtype
'is_complex' # pandas.api.types.is_complex_dtype
'is_datetime64_any' # pandas.api.types.is_datetime64_any_dtype
'is_datetime64' # pandas.api.types.is_datetime64_dtype
'is_datetime64_ns' # pandas.api.types.is_datetime64_ns_dtype
'is_datetime64tz' # pandas.api.types.is_datetime64tz_dtype
'is_float' # pandas.api.types.is_float_dtype
'is_int64' # pandas.api.types.is_int64_dtype
'is_integer' # pandas.api.types.is_integer_dtype
'is_interval' # pandas.api.types.is_interval_dtype
'is_numeric' # pandas.api.types.is_numeric_dtype
'is_object' # pandas.api.types.is_object_dtype
'is_period' # pandas.api.types.is_period_dtype
'is_signed_integer' # pandas.api.types.is_signed_integer_dtype
'is_string' # pandas.api.types.is_string_dtype
'is_timedelta64' # pandas.api.types.is_timedelta64_dtype
'is_timedelta64_ns' # pandas.api.types.is_timedelta64_ns_dtype
'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype
No other string values are allowed.
functions : callable or tuple or dict or str, optional
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Group by all string type columns. ``'is_string'`` is a
shortcut to :func:`pandas.api.types.is_string_dtype`.
>>> df >> group_by_if('is_string')
groups: ['alpha', 'beta', 'theta']
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
2 a b e 3 4 11
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
Applying a function to create the group columns
>>> def double(s):
... return s + s
>>> df >> group_by_if('is_string', double)
groups: ['alpha', 'beta', 'theta']
alpha beta theta x y z
0 aa bb cc 1 6 7
1 aa aa dd 2 5 9
2 aa bb ee 3 4 11
3 bb rr cc 4 3 8
4 bb uu dd 5 2 10
5 bb qq ee 6 1 12
Apply more than one function, increases the number of
columns
>>> def m10(x): return x-10 # minus
>>> def p10(x): return x+10 # plus
>>> df >> group_by_if('is_numeric', (m10, p10))
groups: ['x_m10', 'y_m10', 'z_m10', 'x_p10', 'y_p10', 'z_p10']
alpha beta theta x y z x_m10 y_m10 z_m10 x_p10 y_p10 z_p10
0 a b c 1 6 7 -9 -4 -3 11 16 17
1 a a d 2 5 9 -8 -5 -1 12 15 19
2 a b e 3 4 11 -7 -6 1 13 14 21
3 b r c 4 3 8 -6 -7 -2 14 13 18
4 b u d 5 2 10 -5 -8 0 15 12 20
5 b q e 6 1 12 -4 -9 2 16 11 22
"""
[docs]class group_by_at(_at):
"""
Group by selected columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
names : tuple or dict
Names of columns in dataframe. If a tuple, they should be names
of columns. If a :class:`dict`, the keys must be among the following:
- startswith : str or tuple, optional
All column names that start with this string will be included.
- endswith : str or tuple, optional
All column names that end with this string will be included.
- contains : str or tuple, optional
All column names that contain this string will be included.
- matches : str or regex or tuple, optional
All column names that match the string or a compiled regex pattern
will be included. A tuple can be used to match multiple regexs.
- drop : bool, optional
If ``True``, the selection is inverted. The unspecified/unmatched
columns are returned instead. Default is ``False``.
functions : callable or tuple or dict or str, optional
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
In the simplest form it is not too different from
:class:`~plydata.verbs.group_by`.
>>> df >> group_by_at(('x', 'y'))
groups: ['x', 'y']
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
2 a b e 3 4 11
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
The power comes from the ability to do dynamic column selection.
For example, regex match column names and apply function to get the
group columns.
>>> def double(s): return s + s
>>> df >> group_by_at(dict(matches=r'\\w+eta$'), double)
groups: ['beta', 'theta']
alpha beta theta x y z
0 a bb cc 1 6 7
1 a aa dd 2 5 9
2 a bb ee 3 4 11
3 b rr cc 4 3 8
4 b uu dd 5 2 10
5 b qq ee 6 1 12
"""
def __init__(self, names, functions=None, *args, **kwargs):
    """
    Initialise the verb with ``functions`` being optional

    ``names`` is required; ``functions`` defaults to ``None``.
    Both are forwarded, together with ``args`` and ``kwargs``,
    unchanged to the parent class initialiser.
    """
    # NOTE(review): presumably records the caller's environment so
    # that string expressions can be evaluated later -- the helper is
    # defined outside this section, confirm there.
    self.set_env_from_verb_init()
    super().__init__(names, functions, *args, **kwargs)
[docs]class mutate_all(_all):
"""
Modify all columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
functions : callable or tuple or dict or str
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
A single function with an argument
>>> df >> select('x', 'y', 'z') >> mutate_all(np.add, 10)
x y z
0 11 16 17
1 12 15 19
2 13 14 21
3 14 13 18
4 15 12 20
5 16 11 22
Two functions that accept the same argument
>>> (df
... >> select('x', 'z')
... >> mutate_all((np.add, np.subtract), 10)
... )
x z x_add z_add x_subtract z_subtract
0 1 7 11 17 -9 -3
1 2 9 12 19 -8 -1
2 3 11 13 21 -7 1
3 4 8 14 18 -6 -2
4 5 10 15 20 -5 0
5 6 12 16 22 -4 2
Convert *x*, *y* and *z* from centimeters to inches and
round to 2 decimal places.
>>> (df
... >> select('x', 'y', 'z')
... >> mutate_all(dict(inch=lambda col: np.round(col/2.54, 2)))
... )
x y z x_inch y_inch z_inch
0 1 6 7 0.39 2.36 2.76
1 2 5 9 0.79 1.97 3.54
2 3 4 11 1.18 1.57 4.33
3 4 3 8 1.57 1.18 3.15
4 5 2 10 1.97 0.79 3.94
5 6 1 12 2.36 0.39 4.72
Groupwise standardization of multiple variables.
>>> def scale(col): return (col - np.mean(col))/np.std(col)
>>> (df
... >> group_by('alpha')
... >> select('x', 'y', 'z')
... >> mutate_all(scale))
groups: ['alpha']
alpha x y z
0 a -1.224745 1.224745 -1.224745
1 a 0.000000 0.000000 0.000000
2 a 1.224745 -1.224745 1.224745
3 b -1.224745 1.224745 -1.224745
4 b 0.000000 0.000000 0.000000
5 b 1.224745 -1.224745 1.224745
"""
[docs]class mutate_if(_if):
"""
Modify selected columns that are true for a predicate
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
predicate : function
A predicate function to be applied to the columns of the
dataframe. Good candidates for predicate functions are
those that check the type of the column. Such functions
are available in :mod:`pandas.api.types`, for example
:func:`pandas.api.types.is_numeric_dtype`.
For convenience, you can reference the ``is_*_dtype``
functions with shorter strings::
'is_bool' # pandas.api.types.is_bool_dtype
'is_categorical' # pandas.api.types.is_categorical_dtype
'is_complex' # pandas.api.types.is_complex_dtype
'is_datetime64_any' # pandas.api.types.is_datetime64_any_dtype
'is_datetime64' # pandas.api.types.is_datetime64_dtype
'is_datetime64_ns' # pandas.api.types.is_datetime64_ns_dtype
'is_datetime64tz' # pandas.api.types.is_datetime64tz_dtype
'is_float' # pandas.api.types.is_float_dtype
'is_int64' # pandas.api.types.is_int64_dtype
'is_integer' # pandas.api.types.is_integer_dtype
'is_interval' # pandas.api.types.is_interval_dtype
'is_numeric' # pandas.api.types.is_numeric_dtype
'is_object' # pandas.api.types.is_object_dtype
'is_period' # pandas.api.types.is_period_dtype
'is_signed_integer' # pandas.api.types.is_signed_integer_dtype
'is_string' # pandas.api.types.is_string_dtype
'is_timedelta64' # pandas.api.types.is_timedelta64_dtype
'is_timedelta64_ns' # pandas.api.types.is_timedelta64_ns_dtype
'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype
No other string values are allowed.
functions : callable or tuple or dict or str
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import pandas.api.types as pdtypes
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
A single function with an argument
>>> df >> mutate_if(pdtypes.is_numeric_dtype, np.add, 10)
alpha beta theta x y z
0 a b c 11 16 17
1 a a d 12 15 19
2 a b e 13 14 21
3 b r c 14 13 18
4 b u d 15 12 20
5 b q e 16 11 22
Two functions that accept the same argument, using our
crude column selector.
>>> def is_x_or_z(col): return col.name in ('x', 'z')
>>> df >> mutate_if(is_x_or_z, (np.add, np.subtract), 10)
alpha beta theta x y z x_add z_add x_subtract z_subtract
0 a b c 1 6 7 11 17 -9 -3
1 a a d 2 5 9 12 19 -8 -1
2 a b e 3 4 11 13 21 -7 1
3 b r c 4 3 8 14 18 -6 -2
4 b u d 5 2 10 15 20 -5 0
5 b q e 6 1 12 16 22 -4 2
Convert *x*, *y* and *z* from centimeters to inches and
round to 2 decimal places.
>>> (df
... >> mutate_if('is_numeric',
... dict(inch=lambda col: np.round(col/2.54, 2))))
alpha beta theta x y z x_inch y_inch z_inch
0 a b c 1 6 7 0.39 2.36 2.76
1 a a d 2 5 9 0.79 1.97 3.54
2 a b e 3 4 11 1.18 1.57 4.33
3 b r c 4 3 8 1.57 1.18 3.15
4 b u d 5 2 10 1.97 0.79 3.94
5 b q e 6 1 12 2.36 0.39 4.72
Groupwise standardization of multiple variables.
>>> def scale(col): return (col - np.mean(col))/np.std(col)
>>> (df
... >> group_by('alpha')
... >> mutate_if('is_numeric', scale))
groups: ['alpha']
alpha beta theta x y z
0 a b c -1.224745 1.224745 -1.224745
1 a a d 0.000000 0.000000 0.000000
2 a b e 1.224745 -1.224745 1.224745
3 b r c -1.224745 1.224745 -1.224745
4 b u d 0.000000 0.000000 0.000000
5 b q e 1.224745 -1.224745 1.224745
Using a boolean array to select the columns.
>>> df >> mutate_if(
... [False, False, False, True, True, True],
... np.negative)
alpha beta theta x y z
0 a b c -1 -6 -7
1 a a d -2 -5 -9
2 a b e -3 -4 -11
3 b r c -4 -3 -8
4 b u d -5 -2 -10
5 b q e -6 -1 -12
"""
[docs]class mutate_at(_at):
"""
Change selected columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
names : tuple or dict
Names of columns in dataframe. If a tuple, they should be names
of columns. If a :class:`dict`, the keys must be among the following:
- startswith : str or tuple, optional
All column names that start with this string will be included.
- endswith : str or tuple, optional
All column names that end with this string will be included.
- contains : str or tuple, optional
All column names that contain this string will be included.
- matches : str or regex or tuple, optional
All column names that match the string or a compiled regex pattern
will be included. A tuple can be used to match multiple regexs.
- drop : bool, optional
If ``True``, the selection is inverted. The unspecified/unmatched
columns are returned instead. Default is ``False``.
functions : callable or tuple or dict or str
Functions to alter the columns:
- function (any callable) - Function is applied to the
column and the result columns replace the original
columns.
- :class:`tuple` of functions - Each function is applied to
all of the columns and the name (``__name__``) of the
function is postfixed to resulting column names.
- :class:`dict` of the form ``{'name': function}`` - Allows
you to apply one or more functions and also control the
postfix to the name.
- :class:`str` - String can be used for more complex
statements, but the resulting names will be terrible.
args : tuple
Arguments to the functions. The arguments are passed to *all*
functions.
kwargs : dict
Keyword arguments to the functions. The keyword arguments are
passed to *all* functions.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
A single function with an argument
>>> df >> mutate_at(('x', 'y', 'z'), np.add, 10)
alpha beta theta x y z
0 a b c 11 16 17
1 a a d 12 15 19
2 a b e 13 14 21
3 b r c 14 13 18
4 b u d 15 12 20
5 b q e 16 11 22
Two functions that accept the same argument
>>> df >> mutate_at(('x', 'z'), (np.add, np.subtract), 10)
alpha beta theta x y z x_add z_add x_subtract z_subtract
0 a b c 1 6 7 11 17 -9 -3
1 a a d 2 5 9 12 19 -8 -1
2 a b e 3 4 11 13 21 -7 1
3 b r c 4 3 8 14 18 -6 -2
4 b u d 5 2 10 15 20 -5 0
5 b q e 6 1 12 16 22 -4 2
Convert *x*, *y* and *z* from centimeters to inches and
round to 2 decimal places.
>>> (df
... >> mutate_at(('x', 'y', 'z'),
... dict(inch=lambda col: np.round(col/2.54, 2)))
... )
alpha beta theta x y z x_inch y_inch z_inch
0 a b c 1 6 7 0.39 2.36 2.76
1 a a d 2 5 9 0.79 1.97 3.54
2 a b e 3 4 11 1.18 1.57 4.33
3 b r c 4 3 8 1.57 1.18 3.15
4 b u d 5 2 10 1.97 0.79 3.94
5 b q e 6 1 12 2.36 0.39 4.72
Groupwise standardization of multiple variables.
>>> def scale(col): return (col - np.mean(col))/np.std(col)
>>> (df
... >> group_by('alpha')
... >> mutate_at(('x', 'y', 'z'), scale))
groups: ['alpha']
alpha beta theta x y z
0 a b c -1.224745 1.224745 -1.224745
1 a a d 0.000000 0.000000 0.000000
2 a b e 1.224745 -1.224745 1.224745
3 b r c -1.224745 1.224745 -1.224745
4 b u d 0.000000 0.000000 0.000000
5 b q e 1.224745 -1.224745 1.224745
"""
[docs]class query_all(_all):
"""
Query all columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
all_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns, the
*intersection* (``&``) of the results is used to select the output rows.
any_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns, the
*union* (``|``) of the results is used to select the output rows.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Select all rows where any of the entries along the columns
is a 4.
>>> df >> query_all(any_vars='({_} == 4)')
alpha beta theta x y z
2 a b e 3 4 11
3 b r c 4 3 8
The opposite, select all rows where none of the entries along
the columns is a 4.
>>> df >> query_all(all_vars='({_} != 4)')
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
4 b u d 5 2 10
5 b q e 6 1 12
For something more complicated, group-wise selection.
Select groups where any of the columns has a large (> 28) sum.
First by using :class:`summarize_all`, we see that there is
one such group. Then using :class:`query_all` selects it.
>>> (df
... >> group_by('alpha')
... >> select('x', 'y', 'z')
... >> summarize_all('sum'))
alpha x y z
0 a 6 15 27
1 b 15 6 30
>>> (df
... >> group_by('alpha')
... >> select('x', 'y', 'z')
... >> query_all(any_vars='(sum({_}) > 28)'))
groups: ['alpha']
alpha x y z
3 b 4 3 8
4 b 5 2 10
5 b 6 1 12
Note that ``sum({_}) > 28`` is a column operation, it returns
a single number for the whole column. Therefore the whole column
is either selected or not selected. Column operations are what
enable group-wise selection.
"""
vars_predicate = None
def __init__(self, *, all_vars=None, any_vars=None):
    """
    Store the predicate statement and how its results are combined

    Exactly one of ``all_vars`` and ``any_vars`` must be given
    (i.e. be truthy). The given statement is stored in
    ``vars_predicate`` and the boolean attributes ``all_vars`` and
    ``any_vars`` record which of the two it was.

    Raises
    ------
    ValueError
        If both or neither of ``all_vars`` and ``any_vars``
        are given.
    """
    self.set_env_from_verb_init()

    # Guard clauses: reject the two invalid combinations up front
    if all_vars and any_vars:
        raise ValueError(
            "Only one of `all_vars` or `any_vars` should "
            "be given."
        )
    if not all_vars and not any_vars:
        raise ValueError(
            "One of `all_vars` or `any_vars` should be given.")

    # Exactly one of the two statements is truthy from here on
    use_all = bool(all_vars)
    self.vars_predicate = all_vars if use_all else any_vars
    self.all_vars = use_all
    self.any_vars = not use_all
[docs]class query_if(_if):
"""
Query all columns that match a predicate
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
predicate : function
A predicate function to be applied to the columns of the
dataframe. Good candidates for predicate functions are
those that check the type of the column. Such functions
are available in :mod:`pandas.api.types`, for example
:func:`pandas.api.types.is_numeric_dtype`.
For convenience, you can reference the ``is_*_dtype``
functions with shorter strings::
'is_bool' # pandas.api.types.is_bool_dtype
'is_categorical' # pandas.api.types.is_categorical_dtype
'is_complex' # pandas.api.types.is_complex_dtype
'is_datetime64_any' # pandas.api.types.is_datetime64_any_dtype
'is_datetime64' # pandas.api.types.is_datetime64_dtype
'is_datetime64_ns' # pandas.api.types.is_datetime64_ns_dtype
'is_datetime64tz' # pandas.api.types.is_datetime64tz_dtype
'is_float' # pandas.api.types.is_float_dtype
'is_int64' # pandas.api.types.is_int64_dtype
'is_integer' # pandas.api.types.is_integer_dtype
'is_interval' # pandas.api.types.is_interval_dtype
'is_numeric' # pandas.api.types.is_numeric_dtype
'is_object' # pandas.api.types.is_object_dtype
'is_period' # pandas.api.types.is_period_dtype
'is_signed_integer' # pandas.api.types.is_signed_integer_dtype
'is_string' # pandas.api.types.is_string_dtype
'is_timedelta64' # pandas.api.types.is_timedelta64_dtype
'is_timedelta64_ns' # pandas.api.types.is_timedelta64_ns_dtype
'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype
No other string values are allowed.
all_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns selected by the
*predicate*, the *intersection* (``&``) of the results is used to select the output rows.
any_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns selected by the
*predicate*, the *union* (``|``) of the results is used to select the output rows.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Select all rows where any of the entries along the integer columns
is a 4.
>>> df >> query_if('is_integer', any_vars='({_} == 4)')
alpha beta theta x y z
2 a b e 3 4 11
3 b r c 4 3 8
The opposite, select all rows where none of the entries along
the integer columns is a 4.
>>> df >> query_if('is_integer', all_vars='({_} != 4)')
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
4 b u d 5 2 10
5 b q e 6 1 12
For something more complicated, group-wise selection.
Select groups where any of the columns has a large (> 28) sum.
First by using :class:`summarize_if`, we see that there is
one such group. Then using :class:`query_if` selects it.
>>> (df
... >> group_by('alpha')
... >> summarize_if('is_integer', 'sum'))
alpha x y z
0 a 6 15 27
1 b 15 6 30
>>> (df
... >> group_by('alpha')
... >> query_if('is_integer', any_vars='(sum({_}) > 28)'))
groups: ['alpha']
alpha beta theta x y z
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
Note that ``sum({_}) > 28`` is a column operation, it returns
a single number for the whole column. Therefore the whole column
is either selected or not selected. Column operations are what
enable group-wise selection.
"""
vars_predicate = None
def __init__(self, predicate, *, all_vars=None, any_vars=None):
    """
    Store the column predicate and the row predicate statement

    ``predicate`` is stored as given. Exactly one of ``all_vars``
    and ``any_vars`` must be given (i.e. be truthy); that statement
    is stored in ``vars_predicate`` and the boolean attributes
    ``all_vars`` and ``any_vars`` record which of the two it was.

    Raises
    ------
    ValueError
        If both or neither of ``all_vars`` and ``any_vars``
        are given.
    """
    self.set_env_from_verb_init()
    self.predicate = predicate

    # Reduce both statements to flags once, then validate the pair
    use_all, use_any = bool(all_vars), bool(any_vars)
    if use_all and use_any:
        raise ValueError(
            "Only one of `all_vars` or `any_vars` should "
            "be given."
        )
    if not (use_all or use_any):
        raise ValueError(
            "One of `all_vars` or `any_vars` should be given.")

    self.vars_predicate = all_vars if use_all else any_vars
    self.all_vars, self.any_vars = use_all, use_any
[docs]class query_at(_at):
"""
Query specific columns
Parameters
----------
data : dataframe, optional
Useful when not using the ``>>`` operator.
names : tuple or dict
Names of columns in dataframe. If a tuple, they should be names
of columns. If a :class:`dict`, the keys must be among the following:
- startswith : str or tuple, optional
All column names that start with this string will be included.
- endswith : str or tuple, optional
All column names that end with this string will be included.
- contains : str or tuple, optional
All column names that contain this string will be included.
- matches : str or regex or tuple, optional
All column names that match the string or a compiled regex pattern
will be included. A tuple can be used to match multiple regexs.
- drop : bool, optional
If ``True``, the selection is inverted. The unspecified/unmatched
columns are returned instead. Default is ``False``.
all_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns selected by the
*names* specification, the *intersection* (``&``) of the results is
used to select the output rows.
any_vars : str, optional
A predicate statement to evaluate. It should conform to python
syntax and should return an array of boolean values (one for every
item in the column) or a single boolean (for the whole column).
You should use ``{_}`` to refer to the column names.
After the statement is evaluated for all columns selected by the
*names* specification, the *union* (``|``) of the results is used
to select the output rows.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from plydata import *
>>> df = pd.DataFrame({
... 'alpha': list('aaabbb'),
... 'beta': list('babruq'),
... 'theta': list('cdecde'),
... 'x': [1, 2, 3, 4, 5, 6],
... 'y': [6, 5, 4, 3, 2, 1],
... 'z': [7, 9, 11, 8, 10, 12]
... })
Select all rows where any of the entries along the integer columns
is a 4.
>>> df >> query_at(('x', 'y', 'z'), any_vars='({_} == 4)')
alpha beta theta x y z
2 a b e 3 4 11
3 b r c 4 3 8
The opposite, select all rows where none of the entries along
the integer columns is a 4.
>>> df >> query_at(('x', 'y', 'z'), all_vars='({_} != 4)')
alpha beta theta x y z
0 a b c 1 6 7
1 a a d 2 5 9
4 b u d 5 2 10
5 b q e 6 1 12
For something more complicated, group-wise selection.
Select groups where any of the columns has a large (> 28) sum.
First by using :class:`summarize_at`, we see that there is
one such group. Then using :class:`query_at` selects it.
>>> (df
... >> group_by('alpha')
... >> summarize_at(('x', 'y', 'z'), 'sum'))
alpha x y z
0 a 6 15 27
1 b 15 6 30
>>> (df
... >> group_by('alpha')
... >> query_at(('x', 'y', 'z'), any_vars='(sum({_}) > 28)'))
groups: ['alpha']
alpha beta theta x y z
3 b r c 4 3 8
4 b u d 5 2 10
5 b q e 6 1 12
Note that ``sum({_}) > 28`` is a column operation, it returns
a single number for the whole column. Therefore the whole column
is either selected or not selected. Column operations are what
enable group-wise selection.
"""
def __init__(self, names, *, all_vars=None, any_vars=None):
    # Exactly one of the two predicate statements must be supplied;
    # reject both the "both given" and the "none given" cases up front.
    use_all = bool(all_vars)
    use_any = bool(any_vars)

    if use_all and use_any:
        raise ValueError(
            "Only one of `all_vars` or `any_vars` should be given."
        )
    if not (use_all or use_any):
        raise ValueError(
            "One of `all_vars` or `any_vars` should be given.")

    # Remember the statement and which of the two modes it came from.
    self.vars_predicate = all_vars if use_all else any_vars
    self.all_vars = use_all
    self.any_vars = use_any

    self.set_env_from_verb_init()
    # No functions are passed on to the parent, only the column names.
    super().__init__(names, tuple())
class rename_all(_all):
    """
    Rename all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable
        Function to rename the column(s). It is applied to the
        column names, e.g. ``str.upper``.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Rename all columns uppercase

    >>> df >> rename_all(str.upper)
      ALPHA BETA THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are not renamed

    >>> df >> group_by('beta') >> rename_all(str.upper)
    groups: ['beta']
      ALPHA beta THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class rename_if(_if):
    """
    Rename all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable
        Function to rename the column(s). It is applied to the
        column names, e.g. ``str.upper``.
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })
    >>> def is_sorted(col):
    ...     a = col.values
    ...     return all(a[:-1] <= a[1:])

    Rename all sorted columns to uppercase.

    >>> df >> rename_if(is_sorted, str.upper)
      ALPHA beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are not renamed.

    >>> df >> group_by('alpha') >> rename_if(is_sorted, str.upper)
    groups: ['alpha']
      alpha beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class rename_at(_at):
    """
    Rename specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be names
        of columns. If a :class:`dict`, the keys must be among:

        - startswith : str or tuple, optional
            All column names that start with this string will be included.
        - endswith : str or tuple, optional
            All column names that end with this string will be included.
        - contains : str or tuple, optional
            All column names that contain this string will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled regex
            pattern will be included. A tuple can be used to match
            multiple regexes.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Rename columns that contain the string ``eta`` to upper case.

    >>> df >> rename_at(dict(contains='eta'), str.upper)
      alpha BETA THETA  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    The group columns are not renamed.

    >>> (df
    ...  >> group_by('beta')
    ...  >> rename_at(('alpha', 'beta', 'x'), str.upper))
    groups: ['beta']
      ALPHA beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class select_all(_all):
    """
    Select all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all columns and convert names to uppercase

    >>> df >> select_all(str.upper)
      ALPHA BETA THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are selected but they are not renamed.

    >>> df >> group_by('beta') >> select_all(str.upper)
    groups: ['beta']
      beta ALPHA THETA  X  Y   Z
    0    b     a     c  1  6   7
    1    a     a     d  2  5   9
    2    b     a     e  3  4  11
    3    r     b     c  4  3   8
    4    u     b     d  5  2  10
    5    q     b     e  6  1  12
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class select_if(_if):
    """
    Select all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all sorted columns and convert names to upper case

    >>> def is_sorted(col):
    ...     a = col.values
    ...     return all(a[:-1] <= a[1:])
    >>> df >> select_if(is_sorted, str.upper)
      ALPHA  X
    0     a  1
    1     a  2
    2     a  3
    3     b  4
    4     b  5
    5     b  6

    Group columns are always selected.

    >>> df >> group_by('beta') >> select_if(is_sorted, str.upper)
    groups: ['beta']
      beta ALPHA  X
    0    b     a  1
    1    a     a  2
    2    b     a  3
    3    r     b  4
    4    u     b  5
    5    q     b  6
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class select_at(_at):
    """
    Select specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be names
        of columns. If a :class:`dict`, the keys must be among:

        - startswith : str or tuple, optional
            All column names that start with this string will be included.
        - endswith : str or tuple, optional
            All column names that end with this string will be included.
        - contains : str or tuple, optional
            All column names that contain this string will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled regex
            pattern will be included. A tuple can be used to match
            multiple regexes.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select the listed columns and rename them to upper case.

    >>> df >> select_at(('alpha', 'x'), str.upper)
      ALPHA  X
    0     a  1
    1     a  2
    2     a  3
    3     b  4
    4     b  5
    5     b  6

    Select columns that contain the string ``eta`` and rename
    them to upper case.

    >>> df >> select_at(dict(contains='eta'), str.upper)
      BETA THETA
    0    b     c
    1    a     d
    2    b     e
    3    r     c
    4    u     d
    5    q     e

    Group columns are always selected.

    >>> df >> group_by('beta') >> select_at(('alpha', 'x'), str.upper)
    groups: ['beta']
      beta ALPHA  X
    0    b     a  1
    1    a     a  2
    2    b     a  3
    3    r     b  4
    4    u     b  5
    5    q     b  6
    """
    # NOTE(review): presumably caps the number of accepted functions at
    # one (cf. the module-level ``MANY``) -- confirm against the base class.
    n_functions = 1
class summarize_all(_all):
    """
    Summarise all non-grouping columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - You can use this to access the aggregation
          functions provided in :class:`summarize`::

              # Those that accept a single argument.
              'min'
              'max'
              'sum'
              'cumsum'
              'mean'
              'median'
              'std'
              'first'
              'last'
              'n_distinct'
              'n_unique'

    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    A single summarizing function

    >>> df >> select('x', 'z') >> summarize_all('mean')
         x    z
    0  3.5  9.5

    More than one summarizing function (as a tuple).

    >>> df >> select('x', 'z') >> summarize_all(('mean', np.std))
      x_mean z_mean     x_std     z_std
    0    3.5    9.5  1.707825  1.707825

    You can use a dictionary to change postscripts of the
    column names.

    >>> (df
    ...  >> select('x', 'z')
    ...  >> summarize_all(dict(MEAN='mean', STD=np.std)))
      x_MEAN z_MEAN     x_STD     z_STD
    0    3.5    9.5  1.707825  1.707825

    Group by

    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'z')
    ...  >> summarize_all(('mean', np.std)))
      alpha x_mean z_mean     x_std     z_std
    0     a    2.0    9.0  0.816497  1.632993
    1     b    5.0   10.0  0.816497  1.632993

    Passing additional arguments

    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'z')
    ...  >> summarize_all(np.std, ddof=1))
      alpha    x    z
    0     a  1.0  2.0
    1     b  1.0  2.0

    The arguments are passed to all functions, so in the majority of
    these cases it might only be possible to summarise with one
    function.

    The group columns are never summarised.

    >>> (df
    ...  >> select('x', 'y', 'z')
    ...  >> define(parity='x%2')
    ...  >> group_by('parity')
    ...  >> summarize_all('mean'))
      parity    x    y         z
    0      1  3.0  4.0  9.333333
    1      0  4.0  3.0  9.666667
    """
class summarize_if(_if):
    """
    Summarise all columns that are true for a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available in :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions used to summarise the columns that match the
        predicate. A single callable is applied to each column and
        the result keeps the name of the column, while for a
        :class:`tuple` of functions the name (``__name__``) of each
        function is postfixed to the resulting column names. The
        accepted forms are the same as for :class:`summarize_all`.

    Examples
    --------
    >>> import pandas as pd
    >>> import pandas.api.types as pdtypes
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Summarizing all numeric columns

    >>> df >> summarize_if(pdtypes.is_numeric_dtype, (np.min, np.max))
      x_amin y_amin z_amin x_amax y_amax z_amax
    0      1      1      7      6      6     12

    Group by

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_if(pdtypes.is_numeric_dtype, (np.min, np.max))
    ... )
      alpha x_amin y_amin z_amin x_amax y_amax z_amax
    0     a      1      4      7      3      6     11
    1     b      4      1      8      6      3     12

    Using ``'is_string'`` as a shortcut to :py:`pdtypes.is_string_dtype`
    for the predicate, together with a custom summarizing function.

    >>> def first(col): return list(col)[0]
    >>> df >> group_by('alpha') >> summarize_if('is_string', first)
      alpha beta theta
    0     a    b     c
    1     b    r     c

    Note, if any of the group columns match the predicate, they
    are selected.
    """
class summarize_at(_at):
    """
    Summarize select columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be names
        of columns. If a :class:`dict`, the keys must be among:

        - startswith : str or tuple, optional
            All column names that start with this string will be included.
        - endswith : str or tuple, optional
            All column names that end with this string will be included.
        - contains : str or tuple, optional
            All column names that contain this string will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled regex
            pattern will be included. A tuple can be used to match
            multiple regexes.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied to
          all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - You can use this to access the aggregation
          functions provided in :class:`summarize`::

              # Those that accept a single argument.
              'min'
              'max'
              'sum'
              'cumsum'
              'mean'
              'median'
              'std'
              'first'
              'last'
              'n_distinct'
              'n_unique'

    args : tuple
        Arguments to the functions. The arguments are passed to *all*
        functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments are
        passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    One variable

    >>> df >> summarize_at('x', ('mean', np.std))
      x_mean     x_std
    0    3.5  1.707825

    Many variables

    >>> df >> summarize_at(('x', 'y', 'z'), ('mean', np.std))
      x_mean y_mean z_mean     x_std     y_std     z_std
    0    3.5    3.5    9.5  1.707825  1.707825  1.707825

    Group by and many variables

    >>> (df
    ...  >> group_by('theta')
    ...  >> summarize_at(('x', 'y', 'z'), ('mean', np.std))
    ... )
      theta x_mean y_mean z_mean x_std y_std z_std
    0     c    2.5    4.5    7.5   1.5   1.5   0.5
    1     d    3.5    3.5    9.5   1.5   1.5   0.5
    2     e    4.5    2.5   11.5   1.5   1.5   0.5

    Using `select` parameters

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_at(
    ...      dict(endswith='ta'),
    ...      dict(unique_count=lambda col: len(pd.unique(col)))
    ...  )
    ... )
      alpha beta_unique_count theta_unique_count
    0     a                 2                  3
    1     b                 3                  3

    For this data, we can achieve the same using
    :class:`~plydata.verbs.summarize`.

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize(
    ...      beta_unique_count='len(pd.unique(beta))',
    ...      theta_unique_count='len(pd.unique(theta))'
    ...  )
    ... )
      alpha beta_unique_count theta_unique_count
    0     a                 2                  3
    1     b                 3                  3
    """
# Aliases
# Alternate spellings: each ``summarise_*`` name is bound to the very
# same class object as its ``summarize_*`` counterpart.
summarise_all, summarise_at, summarise_if = (
    summarize_all, summarize_at, summarize_if)
# The ``transmute_*`` names are bound to the ``create_*`` classes.
transmute_all, transmute_at, transmute_if = (
    create_all, create_at, create_if)