Source code for plydata.one_table_verbs

"""
One table verb initializations
"""
import itertools

from .operators import DataOperator
from .expressions import Expression

__all__ = ['define', 'create', 'sample_n', 'sample_frac', 'select',
           'rename', 'distinct', 'unique', 'arrange', 'group_by',
           'ungroup', 'group_indices', 'summarize',
           'query', 'do', 'head', 'tail', 'pull', 'slice_rows',
           # Aliases
           'summarise', 'mutate', 'transmute',
           ]


class define(DataOperator):
    """
    Add column to DataFrame

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable name.
        The expression should be of type :class:`str` or an
        *iterable* with the same number of elements as the dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3]})
    >>> df >> define(x_sq='x**2')
       x  x_sq
    0  1     1
    1  2     4
    2  3     9
    >>> df >> define(('x*2', 'x*2'), ('x*3', 'x*3'), x_cubed='x**3')
       x  x*2  x*3  x_cubed
    0  1    2    3        1
    1  2    4    6        8
    2  3    6    9       27
    >>> df >> define('x*4')
       x  x*4
    0  1    4
    1  2    8
    2  3   12

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`define` will modify the original dataframe.
    """

    def __init__(self, *args, **kwargs):
        self.set_env_from_verb_init()
        cols = []
        exprs = []
        for arg in args:
            if isinstance(arg, str):
                col = expr = arg
            else:
                col, expr = arg
            cols.append(col)
            exprs.append(expr)

        _cols = itertools.chain(cols, kwargs.keys())
        _exprs = itertools.chain(exprs, kwargs.values())
        self.expressions = [
            Expression(stmt, col)
            for stmt, col in zip(_exprs, _cols)
        ]


class create(define):
    """
    Create DataFrame with columns

    Similar to :class:`define`, but it drops the existing columns.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable name.
        The expression should be of type :class:`str` or an
        *iterable* with the same number of elements as the dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3]})
    >>> df >> create(x_sq='x**2')
       x_sq
    0     1
    1     4
    2     9
    >>> df >> create(('x*2', 'x*2'), ('x*3', 'x*3'), x_cubed='x**3')
       x*2  x*3  x_cubed
    0    2    3        1
    1    4    6        8
    2    6    9       27
    >>> df >> create('x*4')
       x*4
    0    4
    1    8
    2   12
    """


class sample_n(DataOperator):
    """
    Sample n rows from dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    n : int, optional
        Number of items from axis to return.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index.
        Index values in weights not found in sampled object will be
        ignored and index values in sampled object not in weights
        will be assigned weights of zero. If called on a DataFrame,
        will accept the name of a column when axis = 0. Unless
        weights are a Series, weights must be same length as axis
        being sampled. If weights do not sum to 1, they will be
        normalized to sum to 1. Missing values in the weights column
        will be treated as zero. inf and -inf values not allowed.
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy
        RandomState object.
    axis : int or string, optional
        Axis to sample. Accepts axis number or name. Default is
        stat axis for given data type (0 for Series and DataFrames,
        1 for Panels).

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> rs = np.random.RandomState(1234567890)
    >>> df = pd.DataFrame({'x': range(20)})
    >>> df >> sample_n(5, random_state=rs)
         x
    5    5
    19  19
    14  14
    8    8
    17  17
    """

    def __init__(self, n=1, replace=False, weights=None,
                 random_state=None, axis=None):
        self.kwargs = dict(
            n=n, replace=replace, weights=weights,
            random_state=random_state, axis=axis)


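# Illustrative sketch (not part of the original source): the keyword
# arguments stored above are presumably forwarded to
# :meth:`pandas.DataFrame.sample`, so ``weights`` can name a column whose
# values bias the sampling. The dataframe and column name 'w' below are
# made up for illustration.
#
#   >>> import pandas as pd
#   >>> import numpy as np
#   >>> rs = np.random.RandomState(123)
#   >>> df = pd.DataFrame({'x': range(4), 'w': [0, 0, 1, 1]})
#   >>> df >> sample_n(2, weights='w', random_state=rs)  # doctest: +SKIP
#
# Only rows with a non-zero weight (here the last two) can be selected.

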
class sample_frac(DataOperator):
    """
    Sample a fraction of rows from dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    frac : float, optional
        Fraction of axis items to return. Cannot be used with `n`.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index.
        Index values in weights not found in sampled object will be
        ignored and index values in sampled object not in weights
        will be assigned weights of zero. If called on a DataFrame,
        will accept the name of a column when axis = 0. Unless
        weights are a Series, weights must be same length as axis
        being sampled. If weights do not sum to 1, they will be
        normalized to sum to 1. Missing values in the weights column
        will be treated as zero. inf and -inf values not allowed.
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy
        RandomState object.
    axis : int or string, optional
        Axis to sample. Accepts axis number or name. Default is
        stat axis for given data type (0 for Series and DataFrames,
        1 for Panels).

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> rs = np.random.RandomState(1234567890)
    >>> df = pd.DataFrame({'x': range(20)})
    >>> df >> sample_frac(0.25, random_state=rs)
         x
    5    5
    19  19
    14  14
    8    8
    17  17
    """

    def __init__(self, frac=None, replace=False, weights=None,
                 random_state=None, axis=None):
        self.kwargs = dict(
            frac=frac, replace=replace, weights=weights,
            random_state=random_state, axis=axis)


class select(DataOperator):
    """
    Select columns by name

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    names : tuple, optional
        Names of columns in the dataframe. Normally they are strings,
        but they can also include a slice,
        e.g. :py:`slice('col2', 'col5')`. You can also exclude columns
        by prepending a ``-``, e.g. :py:`select('-col1')` will include
        all columns except *col1*.
    startswith : str or tuple, optional
        All column names that start with this string will be included.
    endswith : str or tuple, optional
        All column names that end with this string will be included.
    contains : str or tuple, optional
        All column names that contain this string will be included.
    matches : str or regex or tuple, optional
        All column names that match the string or a compiled regex
        pattern will be included. A tuple can be used to match
        multiple regexes.
    drop : bool, optional
        If ``True``, the selection is inverted. The
        unspecified/unmatched columns are returned instead.
        Default is ``False``.

    Examples
    --------
    >>> import pandas as pd
    >>> x = [1, 2, 3]
    >>> df = pd.DataFrame({'bell': x, 'whistle': x, 'nail': x, 'tail': x})
    >>> df >> select('bell', 'nail')
       bell  nail
    0     1     1
    1     2     2
    2     3     3
    >>> df >> select('bell', 'nail', drop=True)
       whistle  tail
    0        1     1
    1        2     2
    2        3     3
    >>> df >> select('whistle', endswith='ail')
       whistle  nail  tail
    0        1     1     1
    1        2     2     2
    2        3     3     3
    >>> df >> select('bell', matches=r'\\w+tle$')
       bell  whistle
    0     1        1
    1     2        2
    2     3        3

    You can select column slices too. Like
    :meth:`~pandas.DataFrame.loc`, the stop column is included.

    >>> df = pd.DataFrame({'a': x, 'b': x, 'c': x, 'd': x,
    ...                    'e': x, 'f': x, 'g': x, 'h': x})
    >>> df
       a  b  c  d  e  f  g  h
    0  1  1  1  1  1  1  1  1
    1  2  2  2  2  2  2  2  2
    2  3  3  3  3  3  3  3  3
    >>> df >> select('a', slice('c', 'e'), 'g')
       a  c  d  e  g
    0  1  1  1  1  1
    1  2  2  2  2  2
    2  3  3  3  3  3

    You can exclude columns by prepending ``-``

    >>> df >> select('-a', '-c', '-e')
       b  d  f  g  h
    0  1  1  1  1  1
    1  2  2  2  2  2
    2  3  3  3  3  3

    Remove and place column at the end

    >>> df >> select('-a', '-c', '-e', 'a')
       b  d  f  g  h  a
    0  1  1  1  1  1  1
    1  2  2  2  2  2  2
    2  3  3  3  3  3  3

    Notes
    -----
    To exclude columns by prepending a minus, the first column passed
    to :class:`select` must be prepended with a minus.
    :py:`select('-a', 'c')` will exclude column ``a``, while
    :py:`select('c', '-a')` will not exclude column ``a``.
    """

    def __init__(self, *names, startswith=None, endswith=None,
                 contains=None, matches=None, drop=False):
        def as_tuple(obj):
            if obj is None:
                return tuple()
            elif isinstance(obj, tuple):
                return obj
            elif isinstance(obj, list):
                return tuple(obj)
            else:
                return (obj,)

        self.names = names
        self.startswith = as_tuple(startswith)
        self.endswith = as_tuple(endswith)
        self.contains = as_tuple(contains)
        self.matches = as_tuple(matches)
        self.drop = drop

    @staticmethod
    def from_columns(*columns):
        """
        Create a select verb from the columns specification

        Parameters
        ----------
        *columns : list-like | select | str | slice
            Column names to be gathered and whose contents will
            make values.

        Returns
        -------
        out : select
            Select verb representation of the columns.
        """
        from .helper_verbs import select_all, select_at, select_if
        n = len(columns)

        if n == 0:
            return select_all()
        elif n == 1:
            obj = columns[0]
            if isinstance(obj, (select, select_all, select_at, select_if)):
                return obj
            elif isinstance(obj, slice):
                return select(obj)
            elif isinstance(obj, (list, tuple)):
                return select(*obj)
            elif isinstance(obj, str):
                return select(obj)
            else:
                raise TypeError(
                    "Unrecognised type {}".format(type(obj))
                )
        else:
            return select(*columns)


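# Illustrative sketch (not part of the original source) of how
# ``select.from_columns`` normalizes the different column
# specifications it accepts, following the branches in the code above:
#
#   >>> select.from_columns('x', 'y')         # same as select('x', 'y')
#   >>> select.from_columns(['x', 'y'])       # a list/tuple is unpacked
#   >>> select.from_columns(slice('a', 'c'))  # same as select(slice('a', 'c'))
#   >>> select.from_columns()                 # no columns -> select_all()

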
class rename(DataOperator):
    """
    Rename columns

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : tuple, optional
        A single positional argument that holds
        ``{'new_name': 'old_name'}`` pairs. This is useful if the
        *old_name* is not a valid python variable name.
    kwargs : dict, optional
        ``{new_name: 'old_name'}`` pairs. If all the columns to be
        renamed are valid python variable names, then they can be
        specified as keyword arguments.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> x = np.array([1, 2, 3])
    >>> df = pd.DataFrame({'bell': x, 'whistle': x,
    ...                    'nail': x, 'tail': x})
    >>> df >> rename(gong='bell', pin='nail')
       gong  whistle  pin  tail
    0     1        1    1     1
    1     2        2    2     2
    2     3        3    3     3
    >>> df >> rename({'flap': 'tail'}, pin='nail')
       bell  whistle  pin  flap
    0     1        1    1     1
    1     2        2    2     2
    2     3        3    3     3

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`rename` will modify the original dataframe.
    """
    lookup = None

    def __init__(self, *args, **kwargs):
        lookup = args[0] if len(args) else {}
        self.lookup = {v: k for k, v in lookup.items()}
        self.lookup.update({v: k for k, v in kwargs.items()})


class distinct(DataOperator):
    """
    Select distinct/unique rows

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    columns : list-like, optional
        Column names to use when determining uniqueness.
    keep : {'first', 'last', False}, optional
        - ``first`` : Keep the first occurrence.
        - ``last`` : Keep the last occurrence.
        - False : Do not keep any of the duplicates.

        Default is ``'first'``.
    kwargs : dict, optional
        ``{name: expression}`` computed columns. If specified,
        these are taken together with the columns when determining
        unique rows.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 1, 2, 3, 4, 4, 5],
    ...                    'y': [1, 2, 3, 4, 5, 5, 6]})
    >>> df >> distinct()
       x  y
    0  1  1
    1  1  2
    2  2  3
    3  3  4
    4  4  5
    6  5  6
    >>> df >> distinct(['x'])
       x  y
    0  1  1
    2  2  3
    3  3  4
    4  4  5
    6  5  6
    >>> df >> distinct(['x'], 'last')
       x  y
    1  1  2
    2  2  3
    3  3  4
    5  4  5
    6  5  6
    >>> df >> distinct(z='x%2')
       x  y  z
    0  1  1  1
    2  2  3  0
    >>> df >> distinct(['x'], z='x%2')
       x  y  z
    0  1  1  1
    2  2  3  0
    3  3  4  1
    4  4  5  0
    6  5  6  1
    >>> df >> define(z='x%2') >> distinct(['x', 'z'])
       x  y  z
    0  1  1  1
    2  2  3  0
    3  3  4  1
    4  4  5  0
    6  5  6  1
    """
    columns = None
    keep = 'first'

    def __init__(self, *args, **kwargs):
        self.set_env_from_verb_init()
        if len(args) == 1:
            if isinstance(args[0], (str, bool)):
                self.keep = args[0]
            else:
                self.columns = args[0]
        elif len(args) == 2:
            self.columns, self.keep = args
        elif len(args) > 2:
            raise Exception("Too many positional arguments.")

        # define
        if kwargs:
            if self.columns is None:
                self.columns = []
            elif not isinstance(self.columns, list):
                self.columns = list(self.columns)

            _cols = list(kwargs.keys())
            _exprs = list(kwargs.values())
            self.columns.extend(_cols)
        else:
            _cols = []
            _exprs = []

        self.expressions = [
            Expression(stmt, col)
            for stmt, col in zip(_exprs, _cols)
        ]


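# Illustrative sketch (not part of the original source): passing
# ``keep=False`` drops *every* row whose key is duplicated, mirroring
# :meth:`pandas.DataFrame.drop_duplicates`. The dataframe below is made
# up for illustration.
#
#   >>> df = pd.DataFrame({'x': [1, 1, 2, 3]})
#   >>> df >> distinct(['x'], False)   # only the rows with x == 2 or x == 3 remain

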
class arrange(DataOperator):
    """
    Sort rows by column variables

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : tuple
        Columns/expressions to sort by.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0],
    ...                    'y': [1, 2, 3, 4, 5, 6]})
    >>> df >> arrange('x')
       x  y
    5  0  6
    0  1  1
    2  2  3
    3  2  4
    4  4  5
    1  5  2
    >>> df >> arrange('x', '-y')
       x  y
    5  0  6
    0  1  1
    3  2  4
    2  2  3
    4  4  5
    1  5  2
    >>> df >> arrange('np.sin(y)')
       x  y
    4  4  5
    3  2  4
    5  0  6
    2  2  3
    0  1  1
    1  5  2
    """
    expressions = None

    def __init__(self, *args):
        self.set_env_from_verb_init()
        name_gen = ('col_{}'.format(x) for x in range(100))
        self.expressions = [
            Expression(stmt, col)
            for stmt, col in zip(args, name_gen)
        ]


class group_by(define):
    """
    Group dataframe by one or more columns/variables

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable name.
        The expression should be of type :class:`str` or an
        *iterable* with the same number of elements as the dataframe.
    add_ : bool, optional
        If True, add to existing groups. Default is to create
        new groups.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  5  2
    2  2  3
    3  2  4
    4  4  5
    5  0  6
    6  4  5

    Like :meth:`define`, :meth:`group_by` creates any missing columns.

    >>> df >> group_by('y-1', xplus1='x+1')
    groups: ['y-1', 'xplus1']
       x  y  y-1  xplus1
    0  1  1    0       2
    1  5  2    1       6
    2  2  3    2       3
    3  2  4    3       3
    4  4  5    4       5
    5  0  6    5       1
    6  4  5    4       5

    Columns that are grouped on remain in the dataframe after any
    verb operations that do not use the group information. For
    example:

    >>> df >> group_by('y-1', xplus1='x+1') >> select('y')
    groups: ['y-1', 'xplus1']
       y-1  xplus1  y
    0    0       2  1
    1    1       6  2
    2    2       3  3
    3    3       3  4
    4    4       5  5
    5    5       1  6
    6    4       5  5

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`group_by` will modify the original dataframe.
    """
    groups = None

    def __init__(self, *args, add_=False, **kwargs):
        self.set_env_from_verb_init()
        super().__init__(*args, **kwargs)
        self.add_ = add_
        self.groups = [expr.column for expr in self.expressions]


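# Illustrative sketch (not part of the original source): ``add_=True``
# should append to the existing groups instead of replacing them, so the
# second ``group_by`` below presumably leaves the data grouped by both
# 'x' and 'y'.
#
#   >>> df >> group_by('x') >> group_by('y', add_=True)   # groups: ['x', 'y']

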
class ungroup(DataOperator):
    """
    Remove the grouping variables for dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 2, 3],
    ...                    'y': [1, 2, 3]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  2  2
    2  3  3
    >>> df >> group_by('x') >> ungroup()
       x  y
    0  1  1
    1  2  2
    2  3  3
    """


class group_indices(group_by):
    """
    Generate a unique id for each group

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable name.
        The expression should be of type :class:`str` or an
        *iterable* with the same number of elements as the dataframe.
        As this verb returns an array, the tuples have no added
        benefit over strings.
    kwargs : dict, optional
        ``{name: expression}`` pairs. As this verb returns an array,
        keyword arguments have no added benefit over :class:`str`
        positional arguments.

    Returns
    -------
    out : numpy.array
        Ids for each group

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5]})
    >>> df >> group_by('x')
    groups: ['x']
       x  y
    0  1  1
    1  5  2
    2  2  3
    3  2  4
    4  4  5
    5  0  6
    6  4  5
    >>> df >> group_by('x') >> group_indices()
    array([1, 4, 2, 2, 3, 0, 3])

    You can pass the group column(s) as parameters to
    :class:`group_indices`

    >>> df >> group_indices('x*2')
    array([1, 4, 2, 2, 3, 0, 3])
    """


class summarize(define):
    """
    Summarise multiple values to a single value

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    args : strs, tuples, optional
        Expressions or ``(name, expression)`` pairs. This should
        be used when the *name* is not a valid python variable name.
        The expression should be of type :class:`str` or an
        *iterable* with the same number of elements as the dataframe.
    kwargs : dict, optional
        ``{name: expression}`` pairs.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
    ...                    'y': [1, 2, 3, 4, 5, 6, 5],
    ...                    'z': [1, 3, 3, 4, 5, 5, 5]})

    Can take only positional, only keyword arguments or both.

    >>> df >> summarize('np.sum(x)', max='np.max(x)')
       np.sum(x)  max
    0         18    5

    When summarizing after a :class:`group_by` operation
    the group columns are retained.

    >>> df >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
       y  z  mean_x
    0  1  1     1.0
    1  2  3     5.0
    2  3  3     2.0
    3  4  4     2.0
    4  5  5     4.0
    5  6  5     0.0

    .. rubric:: Aggregate Functions

    When summarizing, the following functions can be used; they take
    an array and return a *single* number.

    - ``min(x)`` - Alias of :func:`numpy.amin` (a.k.a ``numpy.min``).
    - ``max(x)`` - Alias of :func:`numpy.amax` (a.k.a ``numpy.max``).
    - ``sum(x)`` - Alias of :func:`numpy.sum`.
    - ``cumsum(x)`` - Alias of :func:`numpy.cumsum`.
    - ``mean(x)`` - Alias of :func:`numpy.mean`.
    - ``median(x)`` - Alias of :func:`numpy.median`.
    - ``std(x)`` - Alias of :func:`numpy.std`.
    - ``first(x)`` - First element of ``x``.
    - ``last(x)`` - Last element of ``x``.
    - ``nth(x, n)`` - *nth* value of ``x`` or ``numpy.nan``.
    - ``n_distinct(x)`` - Number of distinct elements in ``x``.
    - ``n_unique(x)`` - Alias of ``n_distinct``.
    - ``n()`` - Number of elements in current group.

    The aliases of the Numpy functions save you from typing 3 or 5
    key strokes and you get better column names, i.e. ``min(x)``
    instead of ``np.min(x)`` or ``numpy.min(x)`` if you have Numpy
    imported.

    >>> df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
    ...                    'y': [0, 0, 1, 1, 2, 3]})
    >>> df >> summarize('min(x)', 'max(x)', 'mean(x)', 'sum(x)',
    ...                 'first(x)', 'last(x)', 'nth(x, 3)')
       min(x)  max(x)  mean(x)  sum(x)  first(x)  last(x)  nth(x, 3)
    0       0       5      2.5      15         0        5          3

    Summarizing groups with aggregate functions

    >>> df >> group_by('y') >> summarize('mean(x)')
       y  mean(x)
    0  0      0.5
    1  1      2.5
    2  2      4.0
    3  3      5.0

    >>> df >> group_by('y') >> summarize(y_count='n()')
       y  y_count
    0  0        2
    1  1        2
    2  2        1
    3  3        1

    You can use ``n()`` even when there are no groups.

    >>> df >> summarize('n()')
       n()
    0    6
    """


class query(DataOperator):
    """
    Return rows with matching conditions

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    expr : str
        The query string to evaluate. You can refer to variables in
        the environment by prefixing them with an '@' character like
        ``@a + b``. Allowed functions are `sin`, `cos`, `exp`, `log`,
        `expm1`, `log1p`, `sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`,
        `arccos`, `arctan`, `arccosh`, `arcsinh`, `arctanh`, `abs`
        and `arctan2`.
    kwargs : dict
        See the documentation for :func:`pandas.eval` for complete
        details on the keyword arguments accepted by
        :meth:`pandas.DataFrame.query`.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5],
    ...                    'y': [0, 0, 1, 1, 2, 3]})
    >>> df >> query('x % 2 == 0')
       x  y
    0  0  0
    2  2  1
    4  4  2
    >>> df >> query('x % 2 == 0 & y > 0')
       x  y
    2  2  1
    4  4  2

    By default, the bitwise operators ``&`` and ``|`` have the same
    precedence as the boolean operators ``and`` and ``or``.

    >>> df >> query('x % 2 == 0 and y > 0')
       x  y
    2  2  1
    4  4  2

    ``query`` works within groups

    >>> df >> query('x == x.min()')
       x  y
    0  0  0

    >>> df >> group_by('y') >> query('x == x.min()')
    groups: ['y']
       x  y
    0  0  0
    2  2  1
    4  4  2
    5  5  3

    For more information see :meth:`pandas.DataFrame.query`. To query
    rows and columns with ``NaN`` values, use :class:`dropna`.

    Notes
    -----
    :class:`~plydata.one_table_verbs.query` is the equivalent of
    dplyr's `filter` verb but with slightly different python syntax
    for the expressions.
    """
    expression = None

    def __init__(self, expr, **kwargs):
        self.set_env_from_verb_init()
        self.expression = expr
        self.kwargs = kwargs


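# Illustrative sketch (not part of the original source): as described in
# the docstring above, values from the calling environment can be
# referenced in the expression with an '@' prefix, as in
# :meth:`pandas.DataFrame.query`. The variable name ``threshold`` is made
# up for illustration.
#
#   >>> threshold = 2
#   >>> df >> query('y >= @threshold')

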
class do(DataOperator):
    """
    Do arbitrary operations on a dataframe

    Considering the *split-apply-combine* data manipulation strategy,
    :class:`do` gives a window into which to place the complex *apply*
    actions, and also control over the form of results when they are
    combined.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    func : function, optional
        A single function to apply to each group. *The function
        should accept a dataframe and return a dataframe*.
    kwargs : dict, optional
        ``{name: function}`` pairs. *The function should accept
        a dataframe and return an array*. The function computes
        a column called ``name``.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 2, 2, 3],
    ...                    'y': [2, 3, 4, 3],
    ...                    'z': list('aabb')})

    Define a function that uses numpy to do a least squares fit.
    It takes a dataframe as input and returns a dataframe. ``gdf``
    is a dataframe that contains only rows from the current group.

    >>> def least_squares(gdf):
    ...     X = np.vstack([gdf.x, np.ones(len(gdf))]).T
    ...     (m, c), _, _, _ = np.linalg.lstsq(X, gdf.y, None)
    ...     return pd.DataFrame({'intercept': c, 'slope': [m]})

    Define functions that take x and y values and compute the
    intercept and slope.

    >>> def slope(x, y):
    ...     return np.diff(y)[0] / np.diff(x)[0]
    ...
    >>> def intercept(x, y):
    ...     return y.values[0] - slope(x, y) * x.values[0]

    Demonstrating do

    >>> df >> group_by('z') >> do(least_squares)
    groups: ['z']
       z  intercept  slope
    0  a        1.0    1.0
    1  b        6.0   -1.0

    We can get the same result, by passing separate functions
    that calculate the columns independently.

    >>> df >> group_by('z') >> do(
    ...     intercept=lambda gdf: intercept(gdf.x, gdf.y),
    ...     slope=lambda gdf: slope(gdf.x, gdf.y))
    groups: ['z']
       z  intercept  slope
    0  a        1.0    1.0
    1  b        6.0   -1.0

    The functions need not return numerical values. Pandas columns
    can hold any type of object. You could store result objects from
    more complicated models. Each model would be linked to a group.
    Notice that the group columns (``z`` in the above cases) are
    included in the result.

    Notes
    -----
    You cannot have both a positional argument and keyword arguments.
    """
    single_function = False

    def __init__(self, func=None, **kwargs):
        if func is not None:
            if kwargs:
                raise ValueError(
                    "Unexpected positional and keyword arguments.")
            if not callable(func):
                raise TypeError(
                    "func should be a callable object")

        if func:
            self.single_function = True
            self.expressions = [Expression(func, None)]
        else:
            stmts_cols = zip(kwargs.values(), kwargs.keys())
            self.expressions = [
                Expression(stmt, col)
                for stmt, col in stmts_cols
            ]


class tail(DataOperator):
    """
    Select the bottom n rows

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    n : int, optional
        Number of rows to return. If the ``data`` is grouped,
        then number of rows per group. Default is 5.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ...     'y': list('aaaabbcddd')})
    >>> df >> tail(2)
        x  y
    8   9  d
    9  10  d

    Grouped dataframe

    >>> df >> group_by('y') >> tail(2)
    groups: ['y']
        x  y
    0   3  a
    1   4  a
    2   5  b
    3   6  b
    4   7  c
    5   9  d
    6  10  d
    """

    def __init__(self, n=5):
        self.n = n


class pull(DataOperator):
    """
    Pull a single column from the dataframe

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    column : name
        Column name or index id.
    use_index : bool
        Whether to pull column by name or by its integer index.
        Default is False.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'x': [1, 2, 3],
    ...     'y': [4, 5, 6],
    ...     'z': [7, 8, 9]
    ... })
    >>> df
       x  y  z
    0  1  4  7
    1  2  5  8
    2  3  6  9
    >>> df >> pull('y')
    array([4, 5, 6])
    >>> df >> pull(0, True)
    array([1, 2, 3])
    >>> df >> pull(-1, True)
    array([7, 8, 9])

    Notes
    -----
    Always returns a numpy array.

    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`pull` will not make a copy of the original column.
    """

    def __init__(self, column, use_index=False):
        self.column = column
        self.use_index = use_index


class slice_rows(DataOperator):
    """
    Select rows

    A wrapper around :class:`slice` to use when piping.

    Parameters
    ----------
    data : dataframe, optional
        Useful when not using the ``>>`` operator.
    *args : tuple
        (start, stop, step) as expected by the builtin :class:`slice`
        type.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': range(10), 'y': range(100, 110)})
    >>> df >> slice_rows(5)
       x    y
    0  0  100
    1  1  101
    2  2  102
    3  3  103
    4  4  104
    >>> df >> slice_rows(3, 7)
       x    y
    3  3  103
    4  4  104
    5  5  105
    6  6  106
    >>> df >> slice_rows(None, None, 3)
       x    y
    0  0  100
    3  3  103
    6  6  106
    9  9  109

    The above examples are equivalent to::

        df[slice(5)]
        df[slice(3, 7)]
        df[slice(None, None, 3)]

    respectively.

    Notes
    -----
    If :obj:`plydata.options.modify_input_data` is ``True``,
    :class:`slice_rows` will not make a copy of the original dataframe.
    """

    def __init__(self, *args):
        self.slice = slice(*args)


# Aliases
mutate = define
transmute = create
unique = distinct
summarise = summarize
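

# Illustrative sketch (not part of the original source): the aliases are
# plain name bindings, so the dplyr-style spellings behave exactly like
# the verbs they point to.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'x': [1, 2, 3]})
#   >>> df >> mutate(x2='x*2')      # identical to define(x2='x*2')
#   >>> df >> transmute(x2='x*2')   # identical to create(x2='x*2')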