# Source code for plydata.one_table_verbs

"""
One table verb initializations
"""
import itertools

from .operators import DataOperator
from .expressions import Expression

# Public API of this module: the one-table verbs followed by their
# alternative spellings (aliases) bound at the bottom of the file.
__all__ = ['define', 'create', 'sample_n', 'sample_frac', 'select',
           'rename', 'distinct', 'unique', 'arrange', 'group_by',
           'ungroup', 'group_indices', 'summarize',
           'query', 'do', 'head', 'tail', 'pull', 'slice_rows',
           # Aliases
           'summarise', 'mutate', 'transmute',
           ]


class define(DataOperator):
    """
    Add column to DataFrame

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable
        name. The expression should be of type :class:`str` or
        an *iterable* with the same number of elements as the
        dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3]})
    >>> df >> define(x_sq='x**2')
       x  x_sq
    0  1     1
    1  2     4
    2  3     9
    >>> df >> define(('x*2', 'x*2'), ('x*3', 'x*3'), x_cubed='x**3')
       x  x*2  x*3  x_cubed
    0  1    2    3        1
    1  2    4    6        8
    2  3    6    9       27
    >>> df >> define('x*4')
       x  x*4
    0  1    4
    1  2    8
    2  3   12

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`define` will modify the original dataframe.
    """

    def __init__(self, *args, **kwargs):
        """
        Collect the column definitions as :class:`Expression` objects

        Positional arguments are processed first (in the order given),
        followed by the keyword arguments. The result is stored in
        ``self.expressions``.
        """
        self.set_env_from_verb_init()

        # (column name, statement) pairs in evaluation order
        pairs = []
        for arg in args:
            if isinstance(arg, str):
                # A bare string is both the column name and the expression
                pairs.append((arg, arg))
            else:
                # Anything else must unpack into (name, expression)
                col, expr = arg
                pairs.append((col, expr))
        pairs.extend(kwargs.items())

        self.expressions = [Expression(stmt, col) for col, stmt in pairs]


class create(define):
    """
    Create DataFrame with columns

    Similar to :class:`define`, but it drops the existing columns.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable
        name. The expression should be of type :class:`str` or
        an *iterable* with the same number of elements as the
        dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3]})
    >>> df >> create(x_sq='x**2')
       x_sq
    0     1
    1     4
    2     9
    >>> df >> create(('x*2', 'x*2'), ('x*3', 'x*3'), x_cubed='x**3')
       x*2  x*3  x_cubed
    0    2    3        1
    1    4    6        8
    2    6    9       27
    >>> df >> create('x*4')
       x*4
    0    4
    1    8
    2   12
    """


class sample_n(DataOperator):
    """
    Sample n rows from dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    n : int, optional
        Number of items from axis to return. Default is 1.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index. Index
        values in weights not found in sampled object will be ignored and
        index values in sampled object not in weights will be assigned
        weights of zero.
        If called on a DataFrame, will accept the name of a column
        when axis = 0.
        Unless weights are a Series, weights must be same length as axis
        being sampled.
        If weights do not sum to 1, they will be normalized to sum to 1.
        Missing values in the weights column will be treated as zero.
        inf and -inf values not allowed.
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy RandomState
        object.
    axis : int or string, optional
        Axis to sample. Accepts axis number or name. Default is stat axis
        for given data type (0 for Series and DataFrames, 1 for Panels).

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> rs = np.random.RandomState(1234567890)
    >>> df = pd.DataFrame({'x': range(20)})
    >>> df >> sample_n(5, random_state=rs)
         x
    5    5
    19  19
    14  14
    8    8
    17  17
    """

    def __init__(self, n=1, replace=False, weights=None,
                 random_state=None, axis=None):
        """
        Store the sampling options in ``self.kwargs``
        """
        self.kwargs = {
            'n': n,
            'replace': replace,
            'weights': weights,
            'random_state': random_state,
            'axis': axis,
        }


class sample_frac(DataOperator):
    """
    Sample a fraction of rows from dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    frac : float, optional
        Fraction of axis items to return. Cannot be used with `n`.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index. Index
        values in weights not found in sampled object will be ignored and
        index values in sampled object not in weights will be assigned
        weights of zero.
        If called on a DataFrame, will accept the name of a column
        when axis = 0.
        Unless weights are a Series, weights must be same length as axis
        being sampled.
        If weights do not sum to 1, they will be normalized to sum to 1.
        Missing values in the weights column will be treated as zero.
        inf and -inf values not allowed.
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy RandomState
        object.
    axis : int or string, optional
        Axis to sample. Accepts axis number or name. Default is stat axis
        for given data type (0 for Series and DataFrames, 1 for Panels).

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> rs = np.random.RandomState(1234567890)
    >>> df = pd.DataFrame({'x': range(20)})
    >>> df >> sample_frac(0.25, random_state=rs)
         x
    5    5
    19  19
    14  14
    8    8
    17  17
    """

    def __init__(self, frac=None, replace=False, weights=None,
                 random_state=None, axis=None):
        """
        Store the sampling options in ``self.kwargs``
        """
        self.kwargs = {
            'frac': frac,
            'replace': replace,
            'weights': weights,
            'random_state': random_state,
            'axis': axis,
        }


class select(DataOperator):
    """
    Select columns by name

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple, optional
        Names of columns in dataframe. Normally, they are strings,
        but they can include slices e.g :py:`slice('col2', 'col5')`.
        You can also exclude columns by prepending a ``-`` e.g
        :py:`select('-col1')`, will include all columns minus
        *col1*.
    startswith : str or tuple, optional
        All column names that start with this string will be included.
    endswith : str or tuple, optional
        All column names that end with this string will be included.
    contains : str or tuple, optional
        All column names that contain this string will be included.
    matches : str or regex or tuple, optional
        All column names that match the string or a compiled regex pattern
        will be included. A tuple can be used to match multiple regexs.
    drop : bool, optional
        If ``True``, the selection is inverted. The unspecified/unmatched
        columns are returned instead. Default is ``False``.

    Examples
    --------
    >>> import pandas as pd
    >>> x = [1, 2, 3]
    >>> df = pd.DataFrame({'bell': x, 'whistle': x, 'nail': x, 'tail': x})
    >>> df >> select('bell', 'nail')
       bell  nail
    0     1     1
    1     2     2
    2     3     3
    >>> df >> select('bell', 'nail', drop=True)
       whistle  tail
    0        1     1
    1        2     2
    2        3     3
    >>> df >> select('whistle',  endswith='ail')
       whistle  nail  tail
    0        1     1     1
    1        2     2     2
    2        3     3     3
    >>> df >> select('bell',  matches=r'\\w+tle$')
       bell  whistle
    0     1        1
    1     2        2
    2     3        3

    You can select column slices too. Like :meth:`~pandas.DataFrame.loc`,
    the stop column is included.

    >>> df = pd.DataFrame({'a': x, 'b': x, 'c': x, 'd': x,
    ...                    'e': x, 'f': x, 'g': x, 'h': x})
    >>> df
       a  b  c  d  e  f  g  h
    0  1  1  1  1  1  1  1  1
    1  2  2  2  2  2  2  2  2
    2  3  3  3  3  3  3  3  3
    >>> df >> select('a', slice('c', 'e'), 'g')
       a  c  d  e  g
    0  1  1  1  1  1
    1  2  2  2  2  2
    2  3  3  3  3  3

    You can exclude columns by prepending ``-``

    >>> df >> select('-a', '-c', '-e')
       b  d  f  g  h
    0  1  1  1  1  1
    1  2  2  2  2  2
    2  3  3  3  3  3

    Remove and place column at the end

    >>> df >> select('-a', '-c', '-e', 'a')
       b  d  f  g  h  a
    0  1  1  1  1  1  1
    1  2  2  2  2  2  2
    2  3  3  3  3  3  3

    Notes
    -----
    To exclude columns by prepending a minus, the first column
    passed to :class:`select` must be prepended with minus.
    :py:`select('-a', 'c')` will exclude column ``a``, while
    :py:`select('c', '-a')` will not exclude column ``a``.
    """

    def __init__(self, *names, startswith=None, endswith=None,
                 contains=None, matches=None, drop=False):
        """
        Store the selection specification on the verb

        The four pattern options are normalised to tuples so that a
        single value and a collection of values are handled uniformly.
        """
        def as_tuple(obj):
            # None -> (), tuple -> unchanged, list -> tuple,
            # any other object -> 1-tuple holding that object
            if obj is None:
                return ()
            if isinstance(obj, tuple):
                return obj
            if isinstance(obj, list):
                return tuple(obj)
            return (obj,)

        self.names = names
        self.startswith = as_tuple(startswith)
        self.endswith = as_tuple(endswith)
        self.contains = as_tuple(contains)
        self.matches = as_tuple(matches)
        self.drop = drop

    @staticmethod
    def from_columns(*columns):
        """
        Create a select verb from the columns specification

        Parameters
        ----------
        *columns : list-like | select | str | slice
            Column names to be gathered and whose contents will
            make values.

        Returns
        -------
        out : select
            Select verb representation of the columns.

        Raises
        ------
        TypeError
            If a single argument is given and it is not a select
            verb, slice, list, tuple or string.
        """
        # Imported here rather than at module level; presumably to
        # avoid a circular import with helper_verbs (not verified).
        from .helper_verbs import select_all, select_at, select_if

        if not columns:
            return select_all()

        if len(columns) > 1:
            return select(*columns)

        obj = columns[0]
        if isinstance(obj, (select, select_all, select_at, select_if)):
            # Already a select verb, use it as is
            return obj
        if isinstance(obj, (list, tuple)):
            return select(*obj)
        if isinstance(obj, (slice, str)):
            return select(obj)
        raise TypeError(
            "Unrecognised type {}".format(type(obj))
        )


class rename(DataOperator):
    """
    Rename columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : tuple, optional
        A single positional argument that holds
        ``{'new_name': 'old_name'}`` pairs. This is useful if the
        *old_name* is not a valid python variable name.
    kwargs : dict, optional
        ``{new_name: 'old_name'}`` pairs. If all the columns to be
        renamed are valid python variable names, then they
        can be specified as keyword arguments.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> x = np.array([1, 2, 3])
    >>> df = pd.DataFrame({'bell': x, 'whistle': x,
    ...                    'nail': x, 'tail': x})
    >>> df >> rename(gong='bell', pin='nail')
       gong  whistle  pin  tail
    0     1        1    1     1
    1     2        2    2     2
    2     3        3    3     3
    >>> df >> rename({'flap': 'tail'}, pin='nail')
       bell  whistle  pin  flap
    0     1        1    1     1
    1     2        2    2     2
    2     3        3    3     3

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`rename` will modify the original dataframe.
    """
    # {old_name: new_name} mapping, filled in by __init__
    lookup = None

    def __init__(self, *args, **kwargs):
        """
        Build the ``{old_name: new_name}`` lookup

        The user supplies ``{new_name: old_name}`` pairs, so both the
        positional mapping and the keyword arguments are inverted.
        Only the first positional argument is read. Keyword arguments
        are applied last and take precedence for a repeated old name.
        """
        mapping = args[0] if args else {}
        self.lookup = {old: new for new, old in mapping.items()}
        self.lookup.update({old: new for new, old in kwargs.items()})


class distinct(DataOperator):
    """
    Select distinct/unique rows

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    columns : list-like, optional
        Column names to use when determining uniqueness.
    keep : {'first', 'last', False}, optional
        - ``first`` : Keep the first occurrence.
        - ``last`` : Keep the last occurrence.
        - False : Do not keep any of the duplicates.

        Default is ``'first'``.
    kwargs : dict, optional
        ``{name: expression}`` computed columns. If specified,
        these are taken together with the columns when determining
        unique rows.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 1, 2, 3, 4, 4, 5],
    ...                    'y': [1, 2, 3, 4, 5, 5, 6]})
    >>> df >> distinct()
       x  y
    0  1  1
    1  1  2
    2  2  3
    3  3  4
    4  4  5
    6  5  6
    >>> df >> distinct(['x'])
       x  y
    0  1  1
    2  2  3
    3  3  4
    4  4  5
    6  5  6
    >>> df >> distinct(['x'], 'last')
       x  y
    1  1  2
    2  2  3
    3  3  4
    5  4  5
    6  5  6
    >>> df >> distinct(z='x%2')
       x  y  z
    0  1  1  1
    2  2  3  0
    >>> df >> distinct(['x'], z='x%2')
       x  y  z
    0  1  1  1
    2  2  3  0
    3  3  4  1
    4  4  5  0
    6  5  6  1
    >>> df >> define(z='x%2') >> distinct(['x', 'z'])
       x  y  z
    0  1  1  1
    2  2  3  0
    3  3  4  1
    4  4  5  0
    6  5  6  1
    """
    columns = None
    keep = 'first'

    def __init__(self, *args, **kwargs):
        """
        Parse ``(columns, keep)`` and the computed columns

        Accepted positional forms are ``()``, ``(keep,)``,
        ``(columns,)`` and ``(columns, keep)``. A single positional
        argument that is a :class:`str` or :class:`bool` is
        interpreted as ``keep``.

        Raises
        ------
        TypeError
            If more than two positional arguments are given.
        """
        self.set_env_from_verb_init()
        n_args = len(args)
        if n_args == 1:
            # A lone str/bool can only be the ``keep`` option
            if isinstance(args[0], (str, bool)):
                self.keep = args[0]
            else:
                self.columns = args[0]
        elif n_args == 2:
            self.columns, self.keep = args
        elif n_args > 2:
            raise TypeError("Too many positional arguments.")

        # The computed columns take part in the uniqueness test
        if kwargs:
            if self.columns is None:
                columns = []
            elif isinstance(self.columns, str):
                # A single column name; list() would split it
                # into characters
                columns = [self.columns]
            else:
                # Always copy, so that a list passed in by the caller
                # is not modified by the extend below
                columns = list(self.columns)
            columns.extend(kwargs)
            self.columns = columns

        self.expressions = [Expression(stmt, col)
                            for col, stmt in kwargs.items()]


class arrange(DataOperator):
    """
    Sort rows by column variables

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : tuple
        Columns/expressions to sort by.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0],
    ...                    'y': [1, 2, 3, 4, 5, 6]})
    >>> df >> arrange('x')
       x  y
    5  0  6
    0  1  1
    2  2  3
    3  2  4
    4  4  5
    1  5  2
    >>> df >> arrange('x', '-y')
       x  y
    5  0  6
    0  1  1
    3  2  4
    2  2  3
    4  4  5
    1  5  2
    >>> df >> arrange('np.sin(y)')
       x  y
    4  4  5
    3  2  4
    5  0  6
    2  2  3
    0  1  1
    1  5  2
    """
    expressions = None

    def __init__(self, *args):
        """
        Wrap every sort key in an :class:`Expression`

        Each expression is given a positional placeholder column
        name: ``col_0``, ``col_1``, ...
        """
        self.set_env_from_verb_init()
        # enumerate gives every argument a name; zipping against a
        # fixed-size name generator would silently drop any sort
        # keys beyond the size of that generator.
        self.expressions = [
            Expression(stmt, 'col_{}'.format(i))
            for i, stmt in enumerate(args)
        ]


class group_by(define):
    """
    Group dataframe by one or more columns/variables

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable
        name. The expression should be of type :class:`str` or
        an *iterable* with the same number of elements as the
        dataframe.
    add_ : bool, optional
        If True, add to existing groups. Default is to create
        new groups.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  5  2
    2  2  3
    3  2  4
    4  4  5
    5  0  6
    6  4  5

    Like :class:`define`, :class:`group_by` creates any
    missing columns.

    >>> df >> group_by('y-1', xplus1='x+1')
    groups: ['y-1', 'xplus1']
       x  y  y-1  xplus1
    0  1  1    0       2
    1  5  2    1       6
    2  2  3    2       3
    3  2  4    3       3
    4  4  5    4       5
    5  0  6    5       1
    6  4  5    4       5

    Columns that are grouped on remain in the dataframe after any
    verb operations that do not use the group information. For
    example:

    >>> df >> group_by('y-1', xplus1='x+1') >> select('y')
    groups: ['y-1', 'xplus1']
       y-1  xplus1  y
    0    0       2  1
    1    1       6  2
    2    2       3  3
    3    3       3  4
    4    4       5  5
    5    5       1  6
    6    4       5  5

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`group_by` will modify the original dataframe.
    """
    # Names of the grouping columns, filled in by __init__
    groups = None

    def __init__(self, *args, add_=False, **kwargs):
        """
        Create the group expressions and record the group names

        The expressions are built by :class:`define`; the grouping
        columns are the columns of those expressions, in order.
        """
        # NOTE(review): define.__init__ makes this same call again.
        # It is kept here because how the environment is captured is
        # not visible from this file - confirm before removing.
        self.set_env_from_verb_init()
        super().__init__(*args, **kwargs)
        self.add_ = add_
        self.groups = [expr.column for expr in self.expressions]


class ungroup(DataOperator):
    """
    Remove the grouping variables for dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3],
    ...                    'y': [1, 2, 3]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  2  2
    2  3  3
    >>> df >> group_by('x') >> ungroup()
       x  y
    0  1  1
    1  2  2
    2  3  3
    """


class group_indices(group_by):
    """
    Generate a unique id for each group

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable
        name. The expression should be of type :class:`str` or
        an *iterable* with the same number of elements as the
        dataframe. As this verb returns an array, the tuples have
        no added benefit over strings.
    kwargs : dict, optional
        ``{name: expression}`` pairs. As this verb returns an
        array, keyword arguments have no added benefit over
        :class:`str` positional arguments.

    Returns
    -------
    out : numpy.array
        Ids for each group

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  5  2
    2  2  3
    3  2  4
    4  4  5
    5  0  6
    6  4  5
    >>> df >> group_by('x') >> group_indices()
    array([1, 4, 2, 2, 3, 0, 3])

    You can pass the group column(s) as parameters to
    :class:`group_indices`

    >>> df >> group_indices('x*2')
    array([1, 4, 2, 2, 3, 0, 3])
    """


class summarize(define):
    """
    Summarise multiple values to a single value

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable
        name. The expression should be of type :class:`str` or
        an *iterable* with the same number of elements as the
        dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5],
    ...                    'z': [1, 3, 3, 4, 5, 5, 5]})

    Can take only positional, only keyword arguments or both.

    >>> df >> summarize('np.sum(x)', max='np.max(x)')
       np.sum(x)  max
    0         18    5

    When summarizing after a :class:`group_by` operation
    the group columns are retained.

    >>> df >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
       y  z  mean_x
    0  1  1     1.0
    1  2  3     5.0
    2  3  3     2.0
    3  4  4     2.0
    4  5  5     4.0
    5  6  5     0.0

    .. rubric:: Aggregate Functions

    When summarizing the following functions can be used, they take
    an array and return a *single* number.

    - ``min(x)`` - Alias of :func:`numpy.amin` (a.k.a ``numpy.min``).
    - ``max(x)`` - Alias of :func:`numpy.amax` (a.k.a ``numpy.max``).
    - ``sum(x)`` - Alias of :func:`numpy.sum`.
    - ``cumsum(x)`` - Alias of :func:`numpy.cumsum`.
    - ``mean(x)`` - Alias of :func:`numpy.mean`.
    - ``median(x)`` - Alias of :func:`numpy.median`.
    - ``std(x)`` - Alias of :func:`numpy.std`.
    - ``first(x)`` - First element of ``x``.
    - ``last(x)`` - Last element of ``x``.
    - ``nth(x, n)`` - *nth* value of ``x`` or ``numpy.nan``.
    - ``n_distinct(x)`` - Number of distinct elements in ``x``.
    - ``n_unique(x)`` - Alias of ``n_distinct``.
    - ``n()`` - Number of elements in current group.

    The aliases of the Numpy functions save you from typing 3 or 5 key
    strokes and you get better column names. i.e ``min(x)`` instead of
    ``np.min(x)`` or ``numpy.min(x)`` if you have Numpy imported.

    >>> df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
    ...                    'y': [0, 0, 1, 1, 2, 3]})
    >>> df >> summarize('min(x)', 'max(x)', 'mean(x)', 'sum(x)',
    ...                 'first(x)', 'last(x)', 'nth(x, 3)')
       min(x)  max(x)  mean(x)  sum(x)  first(x)  last(x)  nth(x, 3)
    0       0       5      2.5      15         0        5          3

    Summarizing groups with aggregate functions

    >>> df >> group_by('y') >> summarize('mean(x)')
       y  mean(x)
    0  0      0.5
    1  1      2.5
    2  2      4.0
    3  3      5.0

    >>> df >> group_by('y') >> summarize(y_count='n()')
       y  y_count
    0  0        2
    1  1        2
    2  2        1
    3  3        1

    You can use ``n()`` even when there are no groups.

    >>> df >> summarize('n()')
       n()
    0    6
    """


class query(DataOperator):
    """
    Return rows with matching conditions

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    expr : str
        The query string to evaluate.  You can refer to variables
        in the environment by prefixing them with an '@' character
        like ``@a + b``. Allowed functions are `sin`, `cos`, `exp`,
        `log`, `expm1`, `log1p`, `sqrt`, `sinh`, `cosh`, `tanh`,
        `arcsin`, `arccos`, `arctan`, `arccosh`, `arcsinh`,
        `arctanh`, `abs` and `arctan2`.
    kwargs : dict
        See the documentation for :func:`pandas.eval` for complete
        details on the keyword arguments accepted by
        :meth:`pandas.DataFrame.query`.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
    ...                    'y': [0, 0, 1, 1, 2, 3]})
    >>> df >> query('x % 2 == 0')
       x  y
    0  0  0
    2  2  1
    4  4  2

    >>> df >> query('x % 2 == 0 & y > 0')
       x  y
    2  2  1
    4  4  2

    By default, Bitwise operators ``&`` and ``|`` have the same
    precedence as the booleans ``and`` and ``or``.

    >>> df >> query('x % 2 == 0 and y > 0')
       x  y
    2  2  1
    4  4  2

    ``query`` works within groups

    >>> df >> query('x == x.min()')
       x  y
    0  0  0

    >>> df >> group_by('y') >> query('x == x.min()')
    groups: ['y']
       x  y
    0  0  0
    2  2  1
    4  4  2
    5  5  3

    For more information see :meth:`pandas.DataFrame.query`. To query
    rows and columns with ``NaN`` values, use :class:`dropna`

    Notes
    -----
    :class:`~plydata.one_table_verbs.query` is the equivalent of
    dplyr's `filter` verb but with slightly different python syntax
    for the expressions.
    """
    # The query string, set by __init__
    expression = None

    def __init__(self, expr, **kwargs):
        """
        Store the query string and the extra keyword options
        """
        self.set_env_from_verb_init()
        self.expression = expr
        self.kwargs = kwargs


class do(DataOperator):
    """
    Do arbitrary operations on a dataframe

    Considering the *split-apply-combine* data manipulation
    strategy, :class:`do` gives a window into which to place
    the complex *apply* actions, and also control over the form of
    results when they are combined.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    func : function, optional
        A single function to apply to each group. *The
        function should accept a dataframe and return a
        dataframe*.
    kwargs : dict, optional
        ``{name: function}`` pairs. *The function should
        accept a dataframe and return an array*. The function
        computes a column called ``name``.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 2, 2, 3],
    ...                    'y': [2, 3, 4, 3],
    ...                    'z': list('aabb')})

    Define a function that uses numpy to do a least squares fit.
    It takes input from a dataframe and output is a dataframe.
    ``gdf`` is a dataframe that contains only rows from the current
    group.

    >>> def least_squares(gdf):
    ...     X = np.vstack([gdf.x, np.ones(len(gdf))]).T
    ...     (m, c), _, _, _ = np.linalg.lstsq(X, gdf.y, None)
    ...     return pd.DataFrame({'intercept': c, 'slope': [m]})

    Define functions that take x and y values and compute the
    intercept and slope.

    >>> def slope(x, y):
    ...     return np.diff(y)[0] / np.diff(x)[0]
    ...
    >>> def intercept(x, y):
    ...     return y.values[0] - slope(x, y) * x.values[0]

    Demonstrating do

    >>> df >> group_by('z') >> do(least_squares)
    groups: ['z']
       z  intercept  slope
    0  a        1.0    1.0
    1  b        6.0   -1.0

    We can get the same result, by passing separate functions
    that calculate the columns independently.

    >>> df >> group_by('z') >> do(
    ...     intercept=lambda gdf: intercept(gdf.x, gdf.y),
    ...     slope=lambda gdf: slope(gdf.x, gdf.y))
    groups: ['z']
       z  intercept  slope
    0  a        1.0    1.0
    1  b        6.0   -1.0

    The functions need not return numerical values. Pandas columns can
    hold any type of object. You could store result objects from more
    complicated models. Each model would be linked to a group. Notice
    that the group columns (``z`` in the above cases) are included in
    the result.

    Notes
    -----
    You cannot have both a position argument and keyword
    arguments.
    """
    # True when a single dataframe -> dataframe function was given
    single_function = False

    def __init__(self, func=None, **kwargs):
        """
        Store the function(s) to apply as :class:`Expression` objects

        Raises
        ------
        ValueError
            If both ``func`` and keyword arguments are given.
        TypeError
            If ``func`` is given but is not callable.
        """
        if func is not None:
            if kwargs:
                raise ValueError(
                    "Unexpected positional and keyword arguments.")
            if not callable(func):
                raise TypeError(
                    "func should be a callable object")

            # Branch on ``is not None`` rather than on truthiness, so
            # that a callable object that evaluates as False is not
            # silently ignored.
            self.single_function = True
            self.expressions = [Expression(func, None)]
        else:
            self.expressions = [
                Expression(stmt, col) for col, stmt in kwargs.items()
            ]


class head(DataOperator):
    """
    Select the top n rows

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    n : int, optional
        Number of rows to return. If the ``data`` is grouped,
        then number of rows per group. Default is 5.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ...     'y': list('aaaabbcddd') })
    >>> df >> head(2)
       x  y
    0  1  a
    1  2  a

    Grouped dataframe

    >>> df >> group_by('y') >> head(2)
    groups: ['y']
       x  y
    0  1  a
    1  2  a
    2  5  b
    3  6  b
    4  7  c
    5  8  d
    6  9  d
    """

    def __init__(self, n=5):
        """
        Store the number of rows to return
        """
        self.n = n


class tail(DataOperator):
    """
    Select the bottom n rows

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    n : int, optional
        Number of rows to return. If the ``data`` is grouped,
        then number of rows per group. Default is 5.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ...     'y': list('aaaabbcddd') })
    >>> df >> tail(2)
        x  y
    8   9  d
    9  10  d

    Grouped dataframe

    >>> df >> group_by('y') >> tail(2)
    groups: ['y']
        x  y
    0   3  a
    1   4  a
    2   5  b
    3   6  b
    4   7  c
    5   9  d
    6  10  d
    """

    def __init__(self, n=5):
        """
        Store the number of rows to return
        """
        self.n = n


class pull(DataOperator):
    """
    Pull a single column from the dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    column : name
        Column name or index id.
    use_index : bool
        Whether to pull column by name or by its integer
        index. Default is False.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3],
    ...     'y': [4, 5, 6],
    ...     'z': [7, 8, 9]
    ... })
    >>> df
       x  y  z
    0  1  4  7
    1  2  5  8
    2  3  6  9
    >>> df >> pull('y')
    array([4, 5, 6])
    >>> df >> pull(0, True)
    array([1, 2, 3])
    >>> df >> pull(-1, True)
    array([7, 8, 9])

    Notes
    -----
    Always returns a numpy array.

    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`pull` will not make a copy of the original column.
    """

    def __init__(self, column, use_index=False):
        """
        Store which column to pull and how to look it up
        """
        self.column = column
        self.use_index = use_index


class slice_rows(DataOperator):
    """
    Select rows

    A wrapper around :class:`slice` to use when piping.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : tuple
        (start, stop, step) as expected by the builtin :class:`slice`
        type.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': range(10), 'y': range(100, 110)})
    >>> df >> slice_rows(5)
       x    y
    0  0  100
    1  1  101
    2  2  102
    3  3  103
    4  4  104

    >>> df >> slice_rows(3, 7)
       x    y
    3  3  103
    4  4  104
    5  5  105
    6  6  106

    >>> df >> slice_rows(None, None, 3)
       x    y
    0  0  100
    3  3  103
    6  6  106
    9  9  109

    The above examples are equivalent to::

        df[slice(5)]
        df[slice(3, 7)]
        df[slice(None, None, 3)]

    respectively.

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`slice_rows` will not make a copy of the original dataframe.
    """

    def __init__(self, *args):
        """
        Build the :class:`slice` object from the positional arguments
        """
        self.slice = slice(*args)


# Aliases
# Alternative names bound to the same verb classes; each alias is the
# very same class object as the verb on the right-hand side.
mutate = define
transmute = create
unique = distinct
summarise = summarize