# Source code for plydata.helper_verbs

"""
Helper verbs
"""
from .operators import DataOperator
from .one_table_verbs import select, group_by


# Public API of this module: the special verbs, the tally/count family,
# and the scoped (*_all / *_at / *_if) variants of the one-table verbs.
__all__ = ['call', 'tally', 'count', 'add_tally', 'add_count',
           'arrange_all', 'arrange_at', 'arrange_if',
           'create_all', 'create_at', 'create_if',
           'group_by_all', 'group_by_at', 'group_by_if',
           'mutate_all', 'mutate_at', 'mutate_if',
           'query_all', 'query_at', 'query_if',
           'rename_all', 'rename_at', 'rename_if',
           'select_all', 'select_at', 'select_if',
           'summarize_all', 'summarize_at', 'summarize_if',
           # Aliases
           'summarise_all', 'summarise_at', 'summarise_if',
           'transmute_all', 'transmute_at', 'transmute_if',
           ]

# Sentinel meaning "no limit" on how many functions a scoped verb
# accepts; compared with ``>`` against the actual count.
MANY = float('inf')


class call(DataOperator):
    """
    Call an external function or a dataframe method in a pipeline

    A special verb that lifts ordinary functions and dataframe
    instance methods into pipeable verb instances, so one does not
    have to break out of the ``>>`` workflow.

    Parameters
    ----------
    func : callable or str
        A function whose first argument is a dataframe. A dataframe
        method is given as a string, preferrably starting with a
        period, e.g ``'.reset_index'``.
    *args : tuple
        Second, third, fourth, ... arguments to ``func``
    **kwargs : dict
        Keyword arguments to ``func``

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'A': {0: 'a', 1: 'b', 2: 'c'},
    ...     'B': {0: 1, 1: 3, 2: 5},
    ...     'C': {0: 2, 1: 4, 2: np.nan}
    ... })

    An external function with arguments

    >>> df >> call(pd.melt, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    A method on the dataframe

    >>> df >> call('.dropna', axis=1)
       A  B
    0  a  1
    1  b  3
    2  c  5
    """

    def __init__(self, func, *args, **kwargs):
        # Only record the call specification; evaluation happens
        # later in the data-operator machinery.
        self.func = func
        self.args = args
        self.kwargs = kwargs
class tally(DataOperator):
    """
    Tally observations by group

    ``tally`` is a convenient wrapper for summarise that will
    either call ``n`` or ``sum(n)`` depending on whether you're
    tallying for the first time, or re-tallying.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import tally, group_by, summarize
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    With groups and weights

    >>> df >> group_by('y') >> tally('w')
       y  n
    0  a  3
    1  b  6

    Applying the weights to a column

    >>> df >> group_by('y') >> tally('x*w')
       y   n
    0  a   9
    1  b  24
    """

    def __init__(self, weights=None, sort=False):
        # Capture the caller's environment so that expressions in
        # ``weights`` (e.g. 'x*w') can be evaluated later.
        self.set_env_from_verb_init()
        self.weights = weights
        self.sort = sort
class count(group_by):
    """
    Count observations by group

    Similar to :class:`tally`, but it does the
    :class:`~plydata.verbs.group_by` for you.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : str, list
        Columns to group by.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import count
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    With groups and weights

    >>> df >> count('y', weights='w')
       y  n
    0  a  3
    1  b  6

    Applying the weights to a column

    >>> df >> count('y', weights='x*w')
       y   n
    0  a   9
    1  b  24
    """

    def __init__(self, *args, weights=None, sort=False):
        # Capture the caller's environment before delegating the
        # group specification to group_by.
        self.set_env_from_verb_init()
        super().__init__(*args)
        # NOTE(review): consumed by the group_by machinery (not
        # visible in this module) — presumably adds to any existing
        # groups rather than replacing them; confirm against verbs.
        self.add_ = True
        self.weights = weights
        self.sort = sort
class add_tally(tally):
    """
    Add column with tally of items in each group

    Similar to :class:`tally`, but it adds a column and does
    not collapse the groups.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    With groups and weights

    >>> df >> group_by('y') >> add_tally('w')
    groups: ['y']
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  6
    2  3  a  1  3
    3  4  b  2  6
    4  5  a  1  3
    5  6  b  2  6

    This is equivalent to using :func:`sum` or ``n()`` in
    :class:`~plydata.verbs.define`, e.g.
    :py:`df >> group_by('y') >> define(n='n()')`.

    See Also
    --------
    :class:`add_count`
    """
class add_count(count):
    """
    Add column with number of items in each group

    Similar to :class:`count`, but it adds a column and does not
    collapse the groups. It is also a shortcut of
    :class:`add_tally` that does the grouping.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : str, list
        Columns to group by.
    weights : str or array-like, optional
        Weight of each row in the group.
    sort : bool, optional
        If ``True``, sort the resulting data in descending order.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': ['a', 'b', 'a', 'b', 'a', 'b'],
    ...     'w': [1, 2, 1, 2, 1, 2]})

    With groups and weights

    >>> df >> add_count('y', weights='w')
       x  y  w  n
    0  1  a  1  3
    1  2  b  2  6
    2  3  a  1  3
    3  4  b  2  6
    4  5  a  1  3
    5  6  b  2  6

    The same result can be obtained with
    :py:`df >> group_by('y') >> add_tally('w') >> ungroup()`.

    See Also
    --------
    :class:`add_tally`
    """
def _normalize_functions(functions, n_functions, verb_name):
    """
    Normalize the ``functions`` specification of a scoped verb.

    Accepts ``None`` (identity), a single callable or string, a
    dict of ``{'name': function}``, or an iterable of functions,
    and returns either a dict (passed through unchanged) or a
    tuple of functions.

    Parameters
    ----------
    functions : callable or tuple or dict or str or None
        User supplied functions specification.
    n_functions : int or float
        Maximum number of functions the verb accepts
        (``MANY`` for no limit).
    verb_name : str
        Name of the verb class, used in the error message.

    Raises
    ------
    ValueError
        If more than ``n_functions`` functions are given.
    """
    if functions is None:
        # Identity: columns pass through unchanged
        functions = (lambda x: x, )
    elif isinstance(functions, str) or callable(functions):
        functions = (functions,)
    elif isinstance(functions, dict):
        # dict is kept as-is; keys become the name postfixes
        pass
    else:
        functions = tuple(functions)

    n = len(functions)
    if n > n_functions:
        raise ValueError(
            "{} expected {} function(s) got {}".format(
                verb_name, n_functions, n
            )
        )
    return functions


class _all(DataOperator):
    """
    Base class for *_all verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.
    """
    selector = '_all'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, functions=None, *args, **kwargs):
        functions = _normalize_functions(
            functions, self.n_functions, self.__class__.__name__)
        # Capture the caller's environment for later evaluation of
        # string expressions.
        self.set_env_from_verb_init()
        self.functions = functions
        self.args = args
        self.kwargs = kwargs


class _if(DataOperator):
    """
    Base class for *_if verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such function
        are avaible at :mod:`pandas.api.dtypes`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings of the form ``'is_<kind>'``
        which map onto ``pandas.api.types.is_<kind>_dtype``,
        e.g.::

            'is_bool'      # pandas.api.types.is_bool_dtype
            'is_numeric'   # pandas.api.types.is_numeric_dtype
            'is_string'    # pandas.api.types.is_string_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.
    """
    selector = '_if'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, predicate, functions=None, *args, **kwargs):
        functions = _normalize_functions(
            functions, self.n_functions, self.__class__.__name__)
        self.set_env_from_verb_init()
        self.predicate = predicate
        self.functions = functions
        self.args = args
        self.kwargs = kwargs


class _at(select):
    """
    Base class for *_at verbs

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, they keys must be
        in.

        - startswith : str or tuple, optional
            All column names that start with this string will be
            included.
        - endswith : str or tuple, optional
            All column names that end with this string will be
            included.
        - contains : str or tuple, optional
            All column names that contain with this string will
            be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.

    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.
    """
    selector = '_at'
    n_functions = MANY  # Maximum number of functions

    def __init__(self, names, functions=None, *args, **kwargs):
        # Sort out the arguments to select
        if isinstance(names, (tuple, list)):
            args_select = names
            kwargs_select = {}
        elif isinstance(names, str):
            args_select = (names,)
            kwargs_select = {}
        elif isinstance(names, dict):
            args_select = tuple()
            kwargs_select = names
        else:
            raise TypeError(
                "Unexpected type for the names specification.")

        functions = _normalize_functions(
            functions, self.n_functions, self.__class__.__name__)
        self.set_env_from_verb_init()
        super().__init__(*args_select, **kwargs_select)
        self.functions = functions
        self.args = args
        self.kwargs = kwargs
class arrange_all(_all):
    """
    Arrange by all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns before they are sorted.
        See :class:`_all` for the accepted forms. Note that, the
        functions do not change the data, they only affect the
        sorting.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arranging in ascending order.

    >>> df >> arrange_all()
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    Arranging in descending order.

    >>> df >> arrange_all(pd.Series.rank, ascending=False)
      alpha beta theta  x  y   z
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    5     b    q     e  6  1  12
    2     a    b     e  3  4  11
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9

    Notes
    -----
    Do not use functions that change the order of the values in
    the array. Such functions are most likely the wrong
    candidates, they corrupt the data. Use function(s) that
    return values that can be sorted.
    """

    def __init__(self, functions=None, *args, **kwargs):
        # Re-capture the environment at this stack depth before
        # delegating to the base class.
        self.set_env_from_verb_init()
        super().__init__(functions, *args, **kwargs)
class arrange_if(_if):
    """
    Arrange by all column that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe, typically a dtype check such as
        :func:`pandas.api.types.is_numeric_dtype`. For
        convenience, the ``is_*_dtype`` functions may be given as
        shorter strings of the form ``'is_<kind>'`` (e.g.
        ``'is_string'``, ``'is_numeric'``) which map onto
        ``pandas.api.types.is_<kind>_dtype``. No other string
        values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns before they are sorted.
        See :class:`_if` for the accepted forms. Note that, the
        functions do not change the data, they only affect the
        sorting.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arranging by the columns with strings in ascending order.

    >>> df >> arrange_if('is_string')
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    It is easier to sort by only the numeric columns in
    descending order.

    >>> df >> arrange_if('is_numeric', np.negative)
      alpha beta theta  x  y   z
    5     b    q     e  6  1  12
    4     b    u     d  5  2  10
    3     b    r     c  4  3   8
    2     a    b     e  3  4  11
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7

    Notes
    -----
    Do not use functions that change the order of the values in
    the array. Such functions are most likely the wrong
    candidates, they corrupt the data. Use function(s) that
    return values that can be sorted.
    """
class arrange_at(_at):
    """
    Arrange by specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys select
        columns dynamically with ``startswith``, ``endswith``,
        ``contains``, ``matches`` and ``drop`` — see :class:`_at`
        for details.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns before they are sorted.
        See :class:`_at` for the accepted forms. Note that, the
        functions do not change the data, they only affect the
        sorting.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Arrange by explictily naming the columns to arrange by.
    This is not much different from
    :class:`~plydata.verbs.arrange`.

    >>> df >> arrange_at(('alpha', 'z'))
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Arrange by dynamically selecting the columns to arrange by.
    Here the selection is *beta* and *theta*.

    >>> df >> arrange_at(dict(contains='eta'))
      alpha beta theta  x  y   z
    1     a    a     d  2  5   9
    0     a    b     c  1  6   7
    2     a    b     e  3  4  11
    5     b    q     e  6  1  12
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10

    Notes
    -----
    Do not use functions that change the order of the values in
    the array. Such functions are most likely the wrong
    candidates, they corrupt the data. Use function(s) that
    return values that can be sorted.
    """

    def __init__(self, names, functions=None, *args, **kwargs):
        # Re-capture the environment at this stack depth before
        # delegating to the base class.
        self.set_env_from_verb_init()
        super().__init__(names, functions, *args, **kwargs)
class create_all(_all):
    """
    Create a new dataframe with all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns. See :class:`_all` for the
        accepted forms.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Convert from centimetes to inches.

    >>> def inch(col, decimals=0):
    ...     return np.round(col/2.54, decimals)
    >>> def feet(col, decimals=0):
    ...     return np.round(col/30.48, decimals)
    >>> df >> create_all((inch, feet), decimals=2)
       x_inch  y_inch  z_inch  x_feet  y_feet  z_feet
    0    0.39    2.36    2.76    0.03    0.20    0.23
    1    0.79    1.97    3.54    0.07    0.16    0.30
    2    1.18    1.57    4.33    0.10    0.13    0.36
    3    1.57    1.18    3.15    0.13    0.10    0.26
    4    1.97    0.79    3.94    0.16    0.07    0.33
    5    2.36    0.39    4.72    0.20    0.03    0.39

    Group columns are always included, but they do not count
    towards the matched columns.

    >>> (df
    ...  >> group_by('x')
    ...  >> create_all((inch, feet), decimals=2))
    groups: ['x']
       x  y_inch  z_inch  y_feet  z_feet
    0  1    2.36    2.76    0.20    0.23
    1  2    1.97    3.54    0.16    0.30
    2  3    1.57    4.33    0.13    0.36
    3  4    1.18    3.15    0.10    0.26
    4  5    0.79    3.94    0.07    0.33
    5  6    0.39    4.72    0.03    0.39
    """
class create_if(_if):
    """
    Create a new dataframe with columns selected by a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe, typically a dtype check such as
        :func:`pandas.api.types.is_numeric_dtype`. For
        convenience, the ``is_*_dtype`` functions may be given as
        shorter strings of the form ``'is_<kind>'`` (e.g.
        ``'is_integer'``, ``'is_numeric'``) which map onto
        ``pandas.api.types.is_<kind>_dtype``. No other string
        values are allowed.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns. See :class:`_if` for the
        accepted forms.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Create a new dataframe by doubling selected column values of
    the input frame. ``'is_integer'`` is a shortcut to
    :py:`pdtypes.is_integer_dtype`.

    >>> def double(s):
    ...     return s + s
    >>> df >> create_if('is_integer', double)
        x   y   z
    0   2  12  14
    1   4  10  18
    2   6   8  22
    3   8   6  16
    4  10   4  20
    5  12   2  24

    Selecting columns that match a predicate.

    >>> df >> create_if('is_integer')
       x  y   z
    0  1  6   7
    1  2  5   9
    2  3  4  11
    3  4  3   8
    4  5  2  10
    5  6  1  12
    """
class create_at(_at):
    """
    Create dataframe with specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys select
        columns dynamically with ``startswith``, ``endswith``,
        ``contains``, ``matches`` and ``drop`` — see :class:`_at`
        for details.
    functions : callable or tuple or dict or str
        Functions to alter the columns. See :class:`_at` for the
        accepted forms.
    args : tuple
        Arguments to the functions. The arguments are pass to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Convert from centimetes to inches.

    >>> def inch(col, decimals=0):
    ...     return np.round(col/2.54, decimals)
    >>> def feet(col, decimals=0):
    ...     return np.round(col/30.48, decimals)
    >>> df >> create_at(('x', 'y', 'z'), (inch, feet), decimals=2)
       x_inch  y_inch  z_inch  x_feet  y_feet  z_feet
    0    0.39    2.36    2.76    0.03    0.20    0.23
    1    0.79    1.97    3.54    0.07    0.16    0.30
    2    1.18    1.57    4.33    0.10    0.13    0.36
    3    1.57    1.18    3.15    0.13    0.10    0.26
    4    1.97    0.79    3.94    0.16    0.07    0.33
    5    2.36    0.39    4.72    0.20    0.03    0.39

    Group columns are always included and if listed in the
    selection, the functions act on them. Group columns that are
    not listed are not acted upon by the functions.

    >>> (df
    ...  >> group_by('x')
    ...  >> create_at(dict(matches=r'x|y|z'), (inch, feet), decimals=2))
    groups: ['x']
       x  y_inch  z_inch  y_feet  z_feet
    0  1    2.36    2.76    0.20    0.23
    1  2    1.97    3.54    0.16    0.30
    2  3    1.57    4.33    0.13    0.36
    3  4    1.18    3.15    0.10    0.26
    4  5    0.79    3.94    0.07    0.33
    5  6    0.39    4.72    0.03    0.39
    """
class group_by_all(_all):
    """
    Groupby all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Grouping by all the columns

    >>> df >> group_by_all()
    groups: ['alpha', 'beta', 'theta', 'x', 'y', 'z']
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Grouping by all columns created by a function.
    Same output as above, but now all the columns are categorical

    >>> result = df >> group_by_all(pd.Categorical)
    >>> result
    groups: ['alpha', 'beta', 'theta', 'x', 'y', 'z']
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    >>> result['x']
    0    1
    1    2
    2    3
    3    4
    4    5
    5    6
    Name: x, dtype: category
    Categories (6, int64): [1, 2, 3, 4, 5, 6]

    If apply more than one function or provide a postfix,
    the original columns are retained.

    >>> (df
    ...  >> select('x', 'y', 'z')
    ...  >> group_by_all(dict(cat=pd.Categorical)))
    groups: ['x_cat', 'y_cat', 'z_cat']
       x  y   z x_cat y_cat z_cat
    0  1  6   7     1     6     7
    1  2  5   9     2     5     9
    2  3  4  11     3     4    11
    3  4  3   8     4     3     8
    4  5  2  10     5     2    10
    5  6  1  12     6     1    12
    """

    def __init__(self, functions=None, *args, **kwargs):
        # Capture the caller's environment so that string expressions
        # can later be evaluated in the user's scope.
        # NOTE(review): presumably inspects the call stack — keep this
        # call directly inside __init__; confirm against DataOperator.
        self.set_env_from_verb_init()
        super().__init__(functions, *args, **kwargs)
class group_by_if(_if):
    """
    Group by selected columns that are true for a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Group by all string type columns. ``'is_string'`` is a
    shortcut to :func:`pandas.api.types.is_string_dtype`.

    >>> df >> group_by_if('is_string')
    groups: ['alpha', 'beta', 'theta']
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Applying a function to create the group columns

    >>> def double(s):
    ...     return s + s
    >>> df >> group_by_if('is_string', double)
    groups: ['alpha', 'beta', 'theta']
      alpha beta theta  x  y   z
    0    aa   bb    cc  1  6   7
    1    aa   aa    dd  2  5   9
    2    aa   bb    ee  3  4  11
    3    bb   rr    cc  4  3   8
    4    bb   uu    dd  5  2  10
    5    bb   qq    ee  6  1  12

    Apply more than one function, increases the number of columns

    >>> def m10(x): return x-10  # minus
    >>> def p10(x): return x+10  # plus
    >>> df >> group_by_if('is_numeric', (m10, p10))
    groups: ['x_m10', 'y_m10', 'z_m10', 'x_p10', 'y_p10', 'z_p10']
      alpha beta theta  x  y   z  x_m10  y_m10  z_m10  x_p10  y_p10  z_p10
    0     a    b     c  1  6   7     -9     -4     -3     11     16     17
    1     a    a     d  2  5   9     -8     -5     -1     12     15     19
    2     a    b     e  3  4  11     -7     -6      1     13     14     21
    3     b    r     c  4  3   8     -6     -7     -2     14     13     18
    4     b    u     d  5  2  10     -5     -8      0     15     12     20
    5     b    q     e  6  1  12     -4     -9      2     16     11     22
    """
class group_by_at(_at):
    """
    Group by selected columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be one of:

        - startswith : str or tuple, optional
            All column names that start with this string
            will be included.
        - endswith : str or tuple, optional
            All column names that end with this string
            will be included.
        - contains : str or tuple, optional
            All column names that contain this string
            will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str, optional
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    In the simplest form it is not too different from
    :class:`~plydata.verbs.group_by`.

    >>> df >> group_by_at(('x', 'y'))
    groups: ['x', 'y']
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    The power comes from the ability to do dynamic column selection.
    For example, regex match column names and apply function to get
    the group columns.

    >>> def double(s): return s + s
    >>> df >> group_by_at(dict(matches=r'\\w+eta$'), double)
    groups: ['beta', 'theta']
      alpha beta theta  x  y   z
    0     a   bb    cc  1  6   7
    1     a   aa    dd  2  5   9
    2     a   bb    ee  3  4  11
    3     b   rr    cc  4  3   8
    4     b   uu    dd  5  2  10
    5     b   qq    ee  6  1  12
    """

    def __init__(self, names, functions=None, *args, **kwargs):
        # Capture the caller's environment so that string expressions
        # can later be evaluated in the user's scope.
        # NOTE(review): presumably inspects the call stack — keep this
        # call directly inside __init__; confirm against DataOperator.
        self.set_env_from_verb_init()
        super().__init__(names, functions, *args, **kwargs)
class mutate_all(_all):
    """
    Modify all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    A single function with an argument

    >>> df >> select('x', 'y', 'z') >> mutate_all(np.add, 10)
        x   y   z
    0  11  16  17
    1  12  15  19
    2  13  14  21
    3  14  13  18
    4  15  12  20
    5  16  11  22

    Two functions that accept the same argument

    >>> (df
    ...  >> select('x', 'z')
    ...  >> mutate_all((np.add, np.subtract), 10)
    ...  )
       x   z  x_add  z_add  x_subtract  z_subtract
    0  1   7     11     17          -9          -3
    1  2   9     12     19          -8          -1
    2  3  11     13     21          -7           1
    3  4   8     14     18          -6          -2
    4  5  10     15     20          -5           0
    5  6  12     16     22          -4           2

    Convert *x*, *y* and *z* from centimeters to inches and round
    to 2 decimal places.

    >>> (df
    ...  >> select('x', 'y', 'z')
    ...  >> mutate_all(dict(inch=lambda col: np.round(col/2.54, 2)))
    ...  )
       x  y   z  x_inch  y_inch  z_inch
    0  1  6   7    0.39    2.36    2.76
    1  2  5   9    0.79    1.97    3.54
    2  3  4  11    1.18    1.57    4.33
    3  4  3   8    1.57    1.18    3.15
    4  5  2  10    1.97    0.79    3.94
    5  6  1  12    2.36    0.39    4.72

    Groupwise standardization of multiple variables.

    >>> def scale(col): return (col - np.mean(col))/np.std(col)
    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'y', 'z')
    ...  >> mutate_all(scale))
    groups: ['alpha']
      alpha         x         y         z
    0     a -1.224745  1.224745 -1.224745
    1     a  0.000000  0.000000  0.000000
    2     a  1.224745 -1.224745  1.224745
    3     b -1.224745  1.224745 -1.224745
    4     b  0.000000  0.000000  0.000000
    5     b  1.224745 -1.224745  1.224745
    """
class mutate_if(_if):
    """
    Modify selected columns that are true for a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import pandas.api.types as pdtypes
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    A single function with an argument

    >>> df >> mutate_if(pdtypes.is_numeric_dtype, np.add, 10)
      alpha beta theta   x   y   z
    0     a    b     c  11  16  17
    1     a    a     d  12  15  19
    2     a    b     e  13  14  21
    3     b    r     c  14  13  18
    4     b    u     d  15  12  20
    5     b    q     e  16  11  22

    Two functions that accept the same argument and using our
    crude column selector.

    >>> def is_x_or_z(col): return col.name in ('x', 'z')
    >>> df >> mutate_if(is_x_or_z, (np.add, np.subtract), 10)
      alpha beta theta  x  y   z  x_add  z_add  x_subtract  z_subtract
    0     a    b     c  1  6   7     11     17          -9          -3
    1     a    a     d  2  5   9     12     19          -8          -1
    2     a    b     e  3  4  11     13     21          -7           1
    3     b    r     c  4  3   8     14     18          -6          -2
    4     b    u     d  5  2  10     15     20          -5           0
    5     b    q     e  6  1  12     16     22          -4           2

    Convert *x*, *y* and *z* from centimeters to inches and round
    to 2 decimal places.

    >>> (df
    ...  >> mutate_if('is_numeric',
    ...               dict(inch=lambda col: np.round(col/2.54, 2))))
      alpha beta theta  x  y   z  x_inch  y_inch  z_inch
    0     a    b     c  1  6   7    0.39    2.36    2.76
    1     a    a     d  2  5   9    0.79    1.97    3.54
    2     a    b     e  3  4  11    1.18    1.57    4.33
    3     b    r     c  4  3   8    1.57    1.18    3.15
    4     b    u     d  5  2  10    1.97    0.79    3.94
    5     b    q     e  6  1  12    2.36    0.39    4.72

    Groupwise standardization of multiple variables.

    >>> def scale(col): return (col - np.mean(col))/np.std(col)
    >>> (df
    ...  >> group_by('alpha')
    ...  >> mutate_if('is_numeric', scale))
    groups: ['alpha']
      alpha beta theta         x         y         z
    0     a    b     c -1.224745  1.224745 -1.224745
    1     a    a     d  0.000000  0.000000  0.000000
    2     a    b     e  1.224745 -1.224745  1.224745
    3     b    r     c -1.224745  1.224745 -1.224745
    4     b    u     d  0.000000  0.000000  0.000000
    5     b    q     e  1.224745 -1.224745  1.224745

    Using a boolean array to select the columns.

    >>> df >> mutate_if(
    ...     [False, False, False, True, True, True],
    ...     np.negative)
      alpha beta theta  x  y   z
    0     a    b     c -1 -6  -7
    1     a    a     d -2 -5  -9
    2     a    b     e -3 -4 -11
    3     b    r     c -4 -3  -8
    4     b    u     d -5 -2 -10
    5     b    q     e -6 -1 -12
    """
class mutate_at(_at):
    """
    Change selected columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be one of:

        - startswith : str or tuple, optional
            All column names that start with this string
            will be included.
        - endswith : str or tuple, optional
            All column names that end with this string
            will be included.
        - contains : str or tuple, optional
            All column names that contain this string
            will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of the
          function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` - Allows
          you to apply one or more functions and also control the
          postfix to the name.
        - :class:`str` - String can be used for more complex
          statements, but the resulting names will be terrible.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    A single function with an argument

    >>> df >> mutate_at(('x', 'y', 'z'), np.add, 10)
      alpha beta theta   x   y   z
    0     a    b     c  11  16  17
    1     a    a     d  12  15  19
    2     a    b     e  13  14  21
    3     b    r     c  14  13  18
    4     b    u     d  15  12  20
    5     b    q     e  16  11  22

    Two functions that accept the same argument

    >>> df >> mutate_at(('x', 'z'), (np.add, np.subtract), 10)
      alpha beta theta  x  y   z  x_add  z_add  x_subtract  z_subtract
    0     a    b     c  1  6   7     11     17          -9          -3
    1     a    a     d  2  5   9     12     19          -8          -1
    2     a    b     e  3  4  11     13     21          -7           1
    3     b    r     c  4  3   8     14     18          -6          -2
    4     b    u     d  5  2  10     15     20          -5           0
    5     b    q     e  6  1  12     16     22          -4           2

    Convert *x*, *y* and *z* from centimeters to inches and round
    to 2 decimal places.

    >>> (df
    ...  >> mutate_at(('x', 'y', 'z'),
    ...               dict(inch=lambda col: np.round(col/2.54, 2)))
    ...  )
      alpha beta theta  x  y   z  x_inch  y_inch  z_inch
    0     a    b     c  1  6   7    0.39    2.36    2.76
    1     a    a     d  2  5   9    0.79    1.97    3.54
    2     a    b     e  3  4  11    1.18    1.57    4.33
    3     b    r     c  4  3   8    1.57    1.18    3.15
    4     b    u     d  5  2  10    1.97    0.79    3.94
    5     b    q     e  6  1  12    2.36    0.39    4.72

    Groupwise standardization of multiple variables.

    >>> def scale(col): return (col - np.mean(col))/np.std(col)
    >>> (df
    ...  >> group_by('alpha')
    ...  >> mutate_at(('x', 'y', 'z'), scale))
    groups: ['alpha']
      alpha beta theta         x         y         z
    0     a    b     c -1.224745  1.224745 -1.224745
    1     a    a     d  0.000000  0.000000  0.000000
    2     a    b     e  1.224745 -1.224745  1.224745
    3     b    r     c -1.224745  1.224745 -1.224745
    4     b    u     d  0.000000  0.000000  0.000000
    5     b    q     e  1.224745 -1.224745  1.224745
    """
class query_all(_all):
    """
    Query all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    all_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns, the *union* (``|``), is used to select the output
        rows.
    any_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns, the *intersection* (``&``), is used to select the
        output rows.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all rows where any of the entries along the columns
    is a 4.

    >>> df >> query_all(any_vars='({_} == 4)')
      alpha beta theta  x  y   z
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8

    The opposite, select all rows where none of the entries along
    the columns is a 4.

    >>> df >> query_all(all_vars='({_} != 4)')
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    For something more complicated, group-wise selection.

    Select groups where any of the columns a large (> 28) sum.
    First by using :class:`summarize_all`, we see that there is
    one such group. Then using :class:`query_all` selects it.

    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'y', 'z')
    ...  >> summarize_all('sum'))
      alpha   x   y   z
    0     a   6  15  27
    1     b  15   6  30
    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'y', 'z')
    ...  >> query_all(any_vars='(sum({_}) > 28)'))
    groups: ['alpha']
      alpha  x  y   z
    3     b  4  3   8
    4     b  5  2  10
    5     b  6  1  12

    Note that ``sum({_}) > 28`` is a column operation, it returns
    a single number for the whole column. Therefore the whole
    column is either selected or not selected. Column operations
    are what enable group-wise selection.
    """
    # The predicate statement (the value of all_vars/any_vars);
    # None until __init__ runs.
    vars_predicate = None

    def __init__(self, *, all_vars=None, any_vars=None):
        # Capture the caller's environment so that the predicate
        # statement can later be evaluated in the user's scope.
        # NOTE(review): presumably inspects the call stack — keep this
        # call directly inside __init__.
        self.set_env_from_verb_init()
        # Exactly one of all_vars/any_vars must be provided; the two
        # boolean flags record which combination rule to apply.
        if all_vars and any_vars:
            raise ValueError(
                "Only one of `all_vars` or `any_vars` should "
                "be given."
            )
        elif all_vars:
            self.vars_predicate = all_vars
            self.all_vars = True
            self.any_vars = False
        elif any_vars:
            self.vars_predicate = any_vars
            self.any_vars = True
            self.all_vars = False
        else:
            raise ValueError(
                "One of `all_vars` or `any_vars` should be given.")
class query_if(_if):
    """
    Query all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    all_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns selected by the *predicate*, the *union* (``|``),
        is used to select the output rows.
    any_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns selected by the predicate, *intersection* (``&``),
        is used to select the output rows.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all rows where any of the entries along the integer
    columns is a 4.

    >>> df >> query_if('is_integer', any_vars='({_} == 4)')
      alpha beta theta  x  y   z
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8

    The opposite, select all rows where none of the entries along
    the integer columns is a 4.

    >>> df >> query_if('is_integer', all_vars='({_} != 4)')
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    For something more complicated, group-wise selection.

    Select groups where any of the columns a large (> 28) sum.
    First by using :class:`summarize_if`, we see that there is
    one such group. Then using :class:`query_if` selects it.

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_if('is_integer', 'sum'))
      alpha   x   y   z
    0     a   6  15  27
    1     b  15   6  30
    >>> (df
    ...  >> group_by('alpha')
    ...  >> query_if('is_integer', any_vars='(sum({_}) > 28)'))
    groups: ['alpha']
      alpha beta theta  x  y   z
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Note that ``sum({_}) > 28`` is a column operation, it returns
    a single number for the whole column. Therefore the whole
    column is either selected or not selected. Column operations
    are what enable group-wise selection.
    """
    # The predicate statement (the value of all_vars/any_vars);
    # None until __init__ runs.
    vars_predicate = None

    def __init__(self, predicate, *, all_vars=None, any_vars=None):
        # Capture the caller's environment so that the predicate
        # statement can later be evaluated in the user's scope.
        # NOTE(review): presumably inspects the call stack — keep this
        # call directly inside __init__.
        self.set_env_from_verb_init()
        self.predicate = predicate
        # Exactly one of all_vars/any_vars must be provided; the two
        # boolean flags record which combination rule to apply.
        if all_vars and any_vars:
            raise ValueError(
                "Only one of `all_vars` or `any_vars` should "
                "be given."
            )
        elif all_vars:
            self.vars_predicate = all_vars
            self.all_vars = True
            self.any_vars = False
        elif any_vars:
            self.vars_predicate = any_vars
            self.any_vars = True
            self.all_vars = False
        else:
            raise ValueError(
                "One of `all_vars` or `any_vars` should be given.")
class query_at(_at):
    """
    Query specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be one of:

        - startswith : str or tuple, optional
            All column names that start with this string
            will be included.
        - endswith : str or tuple, optional
            All column names that end with this string
            will be included.
        - contains : str or tuple, optional
            All column names that contain this string
            will be included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    all_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns selected by the *names* specification, the *union*
        (``|``), is used to select the output rows.
    any_vars : str, optional
        A predicate statement to evaluate. It should conform to
        python syntax and should return an array of boolean values
        (one for every item in the column) or a single boolean
        (for the whole column). You should use ``{_}`` to refer to
        the column names. After the statement is evaluated for all
        columns selected by the *names* specification,
        *intersection* (``&``), is used to select the output rows.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all rows where any of the entries along the integer
    columns is a 4.

    >>> df >> query_at(('x', 'y', 'z'), any_vars='({_} == 4)')
      alpha beta theta  x  y   z
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8

    The opposite, select all rows where none of the entries along
    the integer columns is a 4.

    >>> df >> query_at(('x', 'y', 'z'), all_vars='({_} != 4)')
      alpha beta theta  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    For something more complicated, group-wise selection.

    Select groups where any of the columns a large (> 28) sum.
    First by using :class:`summarize_at`, we see that there is
    one such group. Then using :class:`query_at` selects it.

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_at(('x', 'y', 'z'), 'sum'))
      alpha   x   y   z
    0     a   6  15  27
    1     b  15   6  30
    >>> (df
    ...  >> group_by('alpha')
    ...  >> query_at(('x', 'y', 'z'), any_vars='(sum({_}) > 28)'))
    groups: ['alpha']
      alpha beta theta  x  y   z
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Note that ``sum({_}) > 28`` is a column operation, it returns
    a single number for the whole column. Therefore the whole
    column is either selected or not selected. Column operations
    are what enable group-wise selection.
    """

    def __init__(self, names, *, all_vars=None, any_vars=None):
        # Capture the caller's environment first, consistent with
        # query_all and query_if (the call happens in the same stack
        # frame either way, so the captured scope is unchanged).
        # NOTE(review): presumably inspects the call stack — keep this
        # call directly inside __init__.
        self.set_env_from_verb_init()
        # Exactly one of all_vars/any_vars must be provided; the two
        # boolean flags record which combination rule to apply.
        if all_vars and any_vars:
            raise ValueError(
                "Only one of `all_vars` or `any_vars` should "
                "be given."
            )
        elif all_vars:
            self.vars_predicate = all_vars
            self.all_vars = True
            self.any_vars = False
        elif any_vars:
            self.vars_predicate = any_vars
            self.any_vars = True
            self.all_vars = False
        else:
            raise ValueError(
                "One of `all_vars` or `any_vars` should be given.")
        # No column-altering functions for a query; pass an empty tuple.
        super().__init__(names, tuple())
class rename_all(_all):
    """
    Rename all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable
        A function with which to rename the columns. It is passed
        a column name and should return the new name, e.g.
        ``str.upper``.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Rename all columns uppercase

    >>> df >> rename_all(str.upper)
      ALPHA BETA THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are not renamed

    >>> df >> group_by('beta') >> rename_all(str.upper)
    groups: ['beta']
      ALPHA beta THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # Renaming accepts exactly one function (enforced by the base class).
    n_functions = 1
class rename_if(_if):
    """
    Rename all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable
        Useful when not using the ``>>`` operator.
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })
    >>> def is_sorted(col):
    ...     a = col.values
    ...     return all(a[:-1] <= a[1:])

    Rename all sorted columns to uppercase.

    >>> df >> rename_if(is_sorted, str.upper)
      ALPHA beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are not renamed.

    >>> df >> group_by('alpha') >> rename_if(is_sorted, str.upper)
    groups: ['alpha']
      alpha beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # Renaming takes exactly one function (old name -> new name)
    n_functions = 1
class rename_at(_at):
    """
    Rename specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be from:

        - startswith : str or tuple, optional
            All column names that start with this string will be
            included.
        - endswith : str or tuple, optional
            All column names that end with this string will be
            included.
        - contains : str or tuple, optional
            All column names that contain this string will be
            included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Rename columns that contain the string ``eta`` to upper case.

    >>> df >> rename_at(dict(contains='eta'), str.upper)
      alpha BETA THETA  x  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    The group columns are not renamed.

    >>> (df
    ...  >> group_by('beta')
    ...  >> rename_at(('alpha', 'beta', 'x'), str.upper))
    groups: ['beta']
      ALPHA beta theta  X  y   z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12
    """
    # Renaming takes exactly one function (old name -> new name)
    n_functions = 1
class select_all(_all):
    """
    Select all columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all columns and convert names to uppercase

    >>> df >> select_all(str.upper)
      ALPHA BETA THETA  X  Y   Z
    0     a    b     c  1  6   7
    1     a    a     d  2  5   9
    2     a    b     e  3  4  11
    3     b    r     c  4  3   8
    4     b    u     d  5  2  10
    5     b    q     e  6  1  12

    Group columns are selected but they are not renamed.

    >>> df >> group_by('beta') >> select_all(str.upper)
    groups: ['beta']
      beta ALPHA THETA  X  Y   Z
    0    b     a     c  1  6   7
    1    a     a     d  2  5   9
    2    b     a     e  3  4  11
    3    r     b     c  4  3   8
    4    u     b     d  5  2  10
    5    q     b     e  6  1  12
    """
    # Selection renames with exactly one function
    n_functions = 1
class select_if(_if):
    """
    Select all columns that match a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select all sorted columns and convert names to upper case

    >>> def is_sorted(col):
    ...     a = col.values
    ...     return all(a[:-1] <= a[1:])
    >>> df >> select_if(is_sorted, str.upper)
      ALPHA  X
    0     a  1
    1     a  2
    2     a  3
    3     b  4
    4     b  5
    5     b  6

    Group columns are always selected.

    >>> df >> group_by('beta') >> select_if(is_sorted, str.upper)
    groups: ['beta']
      beta ALPHA  X
    0    b     a  1
    1    a     a  2
    2    b     a  3
    3    r     b  4
    4    u     b  5
    5    q     b  6
    """
    # Selection renames with exactly one function
    n_functions = 1
class select_at(_at):
    """
    Select specific columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be from:

        - startswith : str or tuple, optional
            All column names that start with this string will be
            included.
        - endswith : str or tuple, optional
            All column names that end with this string will be
            included.
        - contains : str or tuple, optional
            All column names that contain this string will be
            included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    function : callable
        Function to rename the column(s).
    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Select the listed columns and rename them to upper case.

    >>> df >> select_at(('alpha', 'x'), str.upper)
      ALPHA  X
    0     a  1
    1     a  2
    2     a  3
    3     b  4
    4     b  5
    5     b  6

    Select columns that contain the string ``eta`` and rename
    them to upper case.

    >>> df >> select_at(dict(contains='eta'), str.upper)
      BETA THETA
    0    b     c
    1    a     d
    2    b     e
    3    r     c
    4    u     d
    5    q     e

    Group columns are always selected.

    >>> df >> group_by('beta') >> select_at(('alpha', 'x'), str.upper)
    groups: ['beta']
      beta ALPHA  X
    0    b     a  1
    1    a     a  2
    2    b     a  3
    3    r     b  4
    4    u     b  5
    5    q     b  6
    """
    # Selection renames with exactly one function
    n_functions = 1
class summarize_all(_all):
    """
    Summarise all non-grouping columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of
          the function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - You can use this to access the
          aggregation functions provided in :class:`summarize`::

              # Those that accept a single argument.
              'min'
              'max'
              'sum'
              'cumsum'
              'mean'
              'median'
              'std'
              'first'
              'last'
              'n_distinct'
              'n_unique'

    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    A single summarizing function

    >>> df >> select('x', 'z') >> summarize_all('mean')
         x    z
    0  3.5  9.5

    More than one summarizing function (as a tuple).

    >>> df >> select('x', 'z') >> summarize_all(('mean', np.std))
       x_mean  z_mean     x_std     z_std
    0     3.5     9.5  1.707825  1.707825

    You can use a dictionary to change postscripts of the
    column names.

    >>> (df
    ...  >> select('x', 'z')
    ...  >> summarize_all(dict(MEAN='mean', STD=np.std)))
       x_MEAN  z_MEAN     x_STD     z_STD
    0     3.5     9.5  1.707825  1.707825

    Group by

    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'z')
    ...  >> summarize_all(('mean', np.std)))
      alpha  x_mean  z_mean     x_std     z_std
    0     a     2.0     9.0  0.816497  1.632993
    1     b     5.0    10.0  0.816497  1.632993

    Passing additional arguments

    >>> (df
    ...  >> group_by('alpha')
    ...  >> select('x', 'z')
    ...  >> summarize_all(np.std, ddof=1))
      alpha    x    z
    0     a  1.0  2.0
    1     b  1.0  2.0

    The arguments are passed to all functions, so in majority of
    these cases it might only be possible to summarise with one
    function.

    The group columns are never summarised.

    >>> (df
    ...  >> select('x', 'y', 'z')
    ...  >> define(parity='x%2')
    ...  >> group_by('parity')
    ...  >> summarize_all('mean'))
       parity    x    y         z
    0       1  3.0  4.0  9.333333
    1       0  4.0  3.0  9.666667
    """
class summarize_if(_if):
    """
    Summarise all columns that are true for a predicate

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    predicate : function or str
        A predicate function to be applied to the columns of the
        dataframe. Good candidates for predicate functions are
        those that check the type of the column. Such functions
        are available at :mod:`pandas.api.types`, for example
        :func:`pandas.api.types.is_numeric_dtype`.

        For convenience, you can reference the ``is_*_dtype``
        functions with shorter strings::

            'is_bool'             # pandas.api.types.is_bool_dtype
            'is_categorical'      # pandas.api.types.is_categorical_dtype
            'is_complex'          # pandas.api.types.is_complex_dtype
            'is_datetime64_any'   # pandas.api.types.is_datetime64_any_dtype
            'is_datetime64'       # pandas.api.types.is_datetime64_dtype
            'is_datetime64_ns'    # pandas.api.types.is_datetime64_ns_dtype
            'is_datetime64tz'     # pandas.api.types.is_datetime64tz_dtype
            'is_float'            # pandas.api.types.is_float_dtype
            'is_int64'            # pandas.api.types.is_int64_dtype
            'is_integer'          # pandas.api.types.is_integer_dtype
            'is_interval'         # pandas.api.types.is_interval_dtype
            'is_numeric'          # pandas.api.types.is_numeric_dtype
            'is_object'           # pandas.api.types.is_object_dtype
            'is_period'           # pandas.api.types.is_period_dtype
            'is_signed_integer'   # pandas.api.types.is_signed_integer_dtype
            'is_string'           # pandas.api.types.is_string_dtype
            'is_timedelta64'      # pandas.api.types.is_timedelta64_dtype
            'is_timedelta64_ns'   # pandas.api.types.is_timedelta64_ns_dtype
            'is_unsigned_integer' # pandas.api.types.is_unsigned_integer_dtype

        No other string values are allowed.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of
          the function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - You can use this to access the
          aggregation functions provided in :class:`summarize`.

    Examples
    --------
    >>> import pandas as pd
    >>> import pandas.api.types as pdtypes
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    Summarizing all numeric columns

    >>> df >> summarize_if(pdtypes.is_numeric_dtype, (np.min, np.max))
       x_amin  y_amin  z_amin  x_amax  y_amax  z_amax
    0       1       1       7       6       6      12

    Group by

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_if(pdtypes.is_numeric_dtype, (np.min, np.max))
    ... )
      alpha  x_amin  y_amin  z_amin  x_amax  y_amax  z_amax
    0     a       1       4       7       3       6      11
    1     b       4       1       8       6       3      12

    Using ``'is_string'`` as a shortcut to
    :func:`pandas.api.types.is_string_dtype` for the predicate
    and a custom summarizing function.

    >>> def first(col): return list(col)[0]
    >>> df >> group_by('alpha') >> summarize_if('is_string', first)
      alpha beta theta
    0     a    b     c
    1     b    r     c

    Note, if any of the group columns match the predicate, they
    are selected.
    """
class summarize_at(_at):
    """
    Summarize select columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple or dict
        Names of columns in dataframe. If a tuple, they should be
        names of columns. If a :class:`dict`, the keys must be from:

        - startswith : str or tuple, optional
            All column names that start with this string will be
            included.
        - endswith : str or tuple, optional
            All column names that end with this string will be
            included.
        - contains : str or tuple, optional
            All column names that contain this string will be
            included.
        - matches : str or regex or tuple, optional
            All column names that match the string or a compiled
            regex pattern will be included. A tuple can be used
            to match multiple regexs.
        - drop : bool, optional
            If ``True``, the selection is inverted. The
            unspecified/unmatched columns are returned instead.
            Default is ``False``.
    functions : callable or tuple or dict or str
        Functions to alter the columns:

        - function (any callable) - Function is applied to the
          column and the result columns replace the original
          columns.
        - :class:`tuple` of functions - Each function is applied
          to all of the columns and the name (``__name__``) of
          the function is postfixed to resulting column names.
        - :class:`dict` of the form ``{'name': function}`` -
          Allows you to apply one or more functions and also
          control the postfix to the name.
        - :class:`str` - You can use this to access the
          aggregation functions provided in :class:`summarize`::

              # Those that accept a single argument.
              'min'
              'max'
              'sum'
              'cumsum'
              'mean'
              'median'
              'std'
              'first'
              'last'
              'n_distinct'
              'n_unique'

    args : tuple
        Arguments to the functions. The arguments are passed to
        *all* functions.
    kwargs : dict
        Keyword arguments to the functions. The keyword arguments
        are passed to *all* functions.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'alpha': list('aaabbb'),
    ...     'beta': list('babruq'),
    ...     'theta': list('cdecde'),
    ...     'x': [1, 2, 3, 4, 5, 6],
    ...     'y': [6, 5, 4, 3, 2, 1],
    ...     'z': [7, 9, 11, 8, 10, 12]
    ... })

    One variable

    >>> df >> summarize_at('x', ('mean', np.std))
       x_mean     x_std
    0     3.5  1.707825

    Many variables

    >>> df >> summarize_at(('x', 'y', 'z'), ('mean', np.std))
       x_mean  y_mean  z_mean     x_std     y_std     z_std
    0     3.5     3.5     9.5  1.707825  1.707825  1.707825

    Group by and many variables

    >>> (df
    ...  >> group_by('theta')
    ...  >> summarize_at(('x', 'y', 'z'), ('mean', np.std))
    ... )
      theta  x_mean  y_mean  z_mean  x_std  y_std  z_std
    0     c     2.5     4.5     7.5    1.5    1.5    0.5
    1     d     3.5     3.5     9.5    1.5    1.5    0.5
    2     e     4.5     2.5    11.5    1.5    1.5    0.5

    Using `select` parameters

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize_at(
    ...      dict(endswith='ta'),
    ...      dict(unique_count=lambda col: len(pd.unique(col)))
    ...  )
    ... )
      alpha  beta_unique_count  theta_unique_count
    0     a                  2                   3
    1     b                  3                   3

    For this data, we can achieve the same using
    :class:`~plydata.verbs.summarize`.

    >>> (df
    ...  >> group_by('alpha')
    ...  >> summarize(
    ...      beta_unique_count='len(pd.unique(beta))',
    ...      theta_unique_count='len(pd.unique(theta))'
    ...  )
    ... )
      alpha  beta_unique_count  theta_unique_count
    0     a                  2                   3
    1     b                  3                   3
    """
# Aliases summarise_all = summarize_all summarise_at = summarize_at summarise_if = summarize_if transmute_all = create_all transmute_at = create_at transmute_if = create_if