# Source code for plydata.expressions

from collections import OrderedDict
import keyword
import re

import pandas as pd
import pandas.api.types as pdtypes
import numpy as np

# Names exported by ``from plydata.expressions import *``
__all__ = ['case_when', 'if_else']

# Python's reserved words. A statement that is exactly one of
# these is looked up directly in the data namespace instead of
# being evaluated as an expression (see BaseExpression.evaluate).
KEYWORDS = set(keyword.kwlist)

# A pattern that matches the function 'n()'
# anywhere in an expression. The leading word boundary
# keeps names that merely end in 'n', e.g. 'len()', from matching.
n_func_pattern = re.compile(r'\bn\(\)')


# Internal expression classes

class BaseExpression:
    """
    An expression that will be evaluated

    Parameters
    ----------
    stmt : str or function
        Statement that will be evaluated. Some verbs
        allow only one or the other.
    column : str
        Column in which the result of the statement
        will be placed.
    """
    stmt = None
    column = None

    # Whether the statement uses the special function n()
    _has_n_func = False

    def __init__(self, stmt, column):
        self.stmt = stmt
        self.column = column

        # Only string statements can reference n(). Remember whether
        # this one does, so that evaluate() knows to expose it.
        if isinstance(stmt, str) and n_func_pattern.search(stmt):
            self._has_n_func = True

    def __repr__(self):
        fmt = '{}({!r}, {!r})'.format
        return fmt(self.__class__.__name__, self.stmt, self.column)

    def nice_value(self, value, idx):
        """
        Return the part of a value that belongs at given positions

        Parameters
        ----------
        value : object
            An evaluated value. Array-like values (numpy arrays
            and pandas series) are subset, anything else is
            returned unchanged.
        idx : array-like of int
            Integer *positions* to select.

        Returns
        -------
        out : object
            The elements of ``value`` at ``idx`` if it is
            array-like, otherwise ``value`` itself.
        """
        if isinstance(value, pd.Series):
            # idx holds positions, not labels. Plain ``value[idx]``
            # is a label lookup and fails (or selects the wrong
            # rows) when the series does not have a default index.
            return value.iloc[idx]
        if isinstance(value, np.ndarray):
            return value[idx]
        return value

    def evaluate(self, data, env):
        """
        Evaluate statement

        Parameters
        ----------
        data : pandas.DataFrame
            Data in whose namespace the statement will be
            evaluated. Typically, this is a group dataframe.
        env : object
            Environment whose ``eval`` method is used to
            evaluate string statements.

        Returns
        -------
        out : object
            Result of the evaluation.
        """
        def n():
            """
            Return number of rows in groups

            This function is part of the public API
            """
            return len(data)

        if isinstance(self.stmt, str):
            # Add function n() that computes the
            # size of the group data to the inner namespace.
            if self._has_n_func:
                namespace = dict(data, n=n)
            else:
                namespace = data
            # Avoid obvious keywords e.g if a column
            # is named class
            if self.stmt not in KEYWORDS:
                value = env.eval(
                    self.stmt,
                    source_name='Expression.evaluate',
                    inner_namespace=namespace)
            else:
                value = namespace[self.stmt]
        elif callable(self.stmt):
            # Functions receive the data itself
            value = self.stmt(data)
        else:
            # Anything else is a literal value
            value = self.stmt
        return value


class CaseWhenExpression(BaseExpression):
    """
    An expression that evaluates predicate-value pairs

    Parameters
    ----------
    preds_values : ordered-dict
        The predicate expressions and value expressions.
        Ordered by most specific to most general.
    column : str
        Column in which the result of the statement
        will be placed.
    """
    def __init__(self, preds_values, column):
        # Both the predicates and values of a case_when are
        # treated as expressions. And they will be evaluated.
        self.pv_expressions = [
            (Expression(pred, None), Expression(value, column))
            for pred, value in preds_values.items()]
        self.preds_values = preds_values
        self.column = column

    def __repr__(self):
        # Defined as __repr__ (str() falls back to it) because the
        # inherited __repr__ reads ``stmt``, which this class never
        # sets, and would print ``CaseWhenExpression(None, ...)``.
        fmt = 'CaseWhenExpression({!r}, {!r})'.format
        return fmt(self.preds_values, self.column)

    def evaluate(self, data, env):
        """
        Evaluate the predicates and values

        Parameters
        ----------
        data : pandas.DataFrame
            Data in whose namespace the predicates and values
            are evaluated.
        env : object
            Environment passed on to the predicate and value
            expressions.

        Returns
        -------
        out : numpy.ndarray
            For every row, the value of the first predicate that
            is true for that row. Rows that no predicate matches
            are left as ``None``.

        Raises
        ------
        TypeError
            If a predicate does not evaluate to a boolean array
            or a boolean value.
        """
        # For each predicate-value, we keep track of the positions
        # that have been copied to the result, so that the later
        # more general values do not overwrite the previous ones.
        n = len(data)
        result = np.repeat(None, n)
        copied = np.repeat(False, n)
        for pred_expr, value_expr in self.pv_expressions:
            # Work with a plain array so that the mask arithmetic
            # below is purely positional, whatever the predicate
            # evaluated to (series, array or scalar).
            bool_idx = np.asarray(pred_expr.evaluate(data, env))
            if not pdtypes.is_bool_dtype(bool_idx):
                raise TypeError(
                    "The predicate keys must return a boolean array, "
                    "or a boolean value.")
            value = value_expr.evaluate(data, env)
            # Rows matched now that no earlier predicate claimed
            mask = bool_idx & ~copied
            copied |= bool_idx
            idx = np.where(mask)[0]
            result[idx] = self.nice_value(value, idx)
        # Rebuild from a list so that numpy infers a specific dtype
        # instead of keeping the object dtype of the work array.
        return np.array(list(result))


class IfElseExpression(BaseExpression):
    """
    An expression that chooses between two values per row

    Parameters
    ----------
    ifelse : if_else
        Object whose ``predicate``, ``true_value`` and
        ``false_value`` attributes hold the statements
        to evaluate.
    column : str
        Column in which the result of the statement
        will be placed.
    """
    def __init__(self, ifelse, column):
        self.stmt = ifelse
        self.column = column
        # The predicate and both values are expressions in
        # their own right and are evaluated as such.
        self.predicate_expr = Expression(ifelse.predicate, None)
        self.true_value_expr = Expression(ifelse.true_value, None)
        self.false_value_expr = Expression(ifelse.false_value, None)

    def evaluate(self, data, env):
        """
        Evaluate the predicates and values

        Parameters
        ----------
        data : pandas.DataFrame
            Data in whose namespace the predicate and values
            are evaluated.
        env : object
            Environment passed on to the predicate and value
            expressions.

        Returns
        -------
        out : numpy.ndarray
            For every row, the true value where the predicate
            holds and the false value where it does not.
        """
        n = len(data)
        bool_idx = np.asarray(self.predicate_expr.evaluate(data, env))
        if bool_idx.ndim == 0:
            # A scalar predicate applies to every row. Without
            # this, np.where sees a single element and only
            # position 0 would ever be filled.
            bool_idx = np.repeat(bool(bool_idx), n)
        true_value = self.true_value_expr.evaluate(data, env)
        false_value = self.false_value_expr.evaluate(data, env)
        true_idx = np.where(bool_idx)[0]
        # logical_not rather than ``~``: bitwise inversion of
        # anything but a boolean array is not a logical negation.
        false_idx = np.where(np.logical_not(bool_idx))[0]
        result = np.repeat(None, n)
        result[true_idx] = self.nice_value(true_value, true_idx)
        result[false_idx] = self.nice_value(false_value, false_idx)
        # Rebuild from a list so that numpy infers a specific dtype
        # instead of keeping the object dtype of the work array.
        return np.array(list(result))


def Expression(*args, **kwargs):
    """
    Return an appropriate Expression given the arguments

    The first positional argument decides the class. If it has
    an ``_Expression`` attribute, that class is instantiated,
    otherwise a ``BaseExpression`` is created.

    Parameters
    ----------
    args : tuple
        Positional arguments passed to the Expression class
    kwargs : dict
        Keyword arguments passed to the Expression class
    """
    # dispatch
    statement = args[0]
    if hasattr(statement, '_Expression'):
        return statement._Expression(*args, **kwargs)
    # Keyword arguments must be unpacked with ``**``; a single
    # ``*`` would pass the keyword *names* as positional values.
    return BaseExpression(*args, **kwargs)


# User API expressions

class case_when(OrderedDict):
    """
    Vectorized case

    Parameters
    ----------
    args : mapping, iterable
        (predicate, value) pairs, ordered from most specific to
        most general.
    kwargs : collections.OrderedDict
        {predicate: value} pairs, ordered from most specific to
        most general.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import define
    >>> from plydata.expressions import case_when
    >>> df = pd.DataFrame({'x': range(10)})

    Here we use an iterable of tuples with key-value pairs
    for the predicate and value.

    >>> df >> define(divisible=case_when([
    ...     ('x%2 == 0', 2),
    ...     ('x%3 == 0', 3),
    ...     (True, -1)
    ... ]))
       x  divisible
    0  0          2
    1  1         -1
    2  2          2
    3  3          3
    4  4          2
    5  5         -1
    6  6          2
    7  7         -1
    8  8          2
    9  9          3

    When the most general predicate comes first, it obscures
    the rest. *Every row is matched by at most one predicate
    function*

    >>> df >> define(divisible=case_when([
    ...     (True, -1),
    ...     ('x%2 == 0', 2),
    ...     ('x%3 == 0', 3)
    ... ]))
       x  divisible
    0  0         -1
    1  1         -1
    2  2         -1
    3  3         -1
    4  4         -1
    5  5         -1
    6  6         -1
    7  7         -1
    8  8         -1
    9  9         -1

    String values must be quoted

    >>> df >> define(divisible=case_when([
    ...     ('x%2 == 0', '"by-2"'),
    ...     ('x%3 == 0', '"by-3"'),
    ...     (True, '"neither-by-2or3"')
    ... ]))
       x        divisible
    0  0             by-2
    1  1  neither-by-2or3
    2  2             by-2
    3  3             by-3
    4  4             by-2
    5  5  neither-by-2or3
    6  6             by-2
    7  7  neither-by-2or3
    8  8             by-2
    9  9             by-3

    The values can be expressions

    >>> df >> define(divisible=case_when([
    ...     ('x%2 == 0', 'x+200'),
    ...     ('x%3 == 0', 'x+300'),
    ...     (True, -1)
    ... ]))
       x  divisible
    0  0        200
    1  1         -1
    2  2        202
    3  3        303
    4  4        204
    5  5         -1
    6  6        206
    7  7         -1
    8  8        208
    9  9        309

    .. rubric:: Combining Predicates

    When combining predicate statements, you can use the
    bitwise operators, ``|``, ``&``, ``^`` and ``~``. The different
    statements must be enclosed in parenthesis, -- ``()``.

    >>> df >> define(y=case_when([
    ...     ('(x < 5) & (x % 2 == 0)', '"less-than-5-and-even"'),
    ...     ('(x < 5) & (x % 2 != 0)', '"less-than-5-and-odd"'),
    ...     ('(x > 5) & (x % 2 == 0)', '"greater-than-5-and-even"'),
    ...     ('(x > 5) & (x % 2 != 0)', '"greater-than-5-and-odd"'),
    ...     (True, '"Just 5"')
    ... ]))
       x                        y
    0  0     less-than-5-and-even
    1  1      less-than-5-and-odd
    2  2     less-than-5-and-even
    3  3      less-than-5-and-odd
    4  4     less-than-5-and-even
    5  5                   Just 5
    6  6  greater-than-5-and-even
    7  7   greater-than-5-and-odd
    8  8  greater-than-5-and-even
    9  9   greater-than-5-and-odd

    Notes
    -----
    As :class:`dict` classes are ordered, in python 3.6 and
    above you can get away with::

        df >> define(divisible=case_when({
            'x%2 == 0': 'x+200',
            'x%3 == 0': 'x+300',
            True: -1
        }))

    However, be careful it may not always be the case.
    """
    # The expression class that handles user expression
    # of this type.
    _Expression = CaseWhenExpression
class if_else:
    """
    Vectorized if

    Parameters
    ----------
    predicate : bool, str, function
        Predicate
    true_value : object
        Value when predicate is True.
    false_value : object
        Value when predicate is False.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import define
    >>> from plydata.expressions import if_else
    >>> df = pd.DataFrame({'x': range(10)})

    y takes on a value that depends on a predicate
    expression. The values can be scalar.

    >>> df >> define(y=if_else('x%2==0', 2, -1))
       x  y
    0  0  2
    1  1 -1
    2  2  2
    3  3 -1
    4  4  2
    5  5 -1
    6  6  2
    7  7 -1
    8  8  2
    9  9 -1

    If they are strings, they should be quoted.

    >>> df >> define(y=if_else('x%2==0', '"even"', '"odd"'))
       x     y
    0  0  even
    1  1   odd
    2  2  even
    3  3   odd
    4  4  even
    5  5   odd
    6  6  even
    7  7   odd
    8  8  even
    9  9   odd

    The values are treated as expressions.

    >>> df >> define(y=if_else('x%2==0', 'x*2', 'x/2'))
       x     y
    0  0   0.0
    1  1   0.5
    2  2   4.0
    3  3   1.5
    4  4   8.0
    5  5   2.5
    6  6  12.0
    7  7   3.5
    8  8  16.0
    9  9   4.5

    .. rubric:: Combining Predicates

    When combining predicate statements, you can use the
    bitwise operators, ``|``, ``&``, ``^`` and ``~``. The different
    statements must be enclosed in parenthesis, ``()``.

    >>> df >> define(y=if_else(
    ...     '(x < 5) & (x % 2 == 0)',
    ...     '"less-than-5-and-even"',
    ...     '"odd-or-greater-than-5"'))
       x                      y
    0  0   less-than-5-and-even
    1  1  odd-or-greater-than-5
    2  2   less-than-5-and-even
    3  3  odd-or-greater-than-5
    4  4   less-than-5-and-even
    5  5  odd-or-greater-than-5
    6  6  odd-or-greater-than-5
    7  7  odd-or-greater-than-5
    8  8  odd-or-greater-than-5
    9  9  odd-or-greater-than-5
    """
    # The expression class that handles user expression
    # of this type.
    _Expression = IfElseExpression

    def __init__(self, predicate, true_value, false_value):
        self.predicate = predicate
        self.true_value = true_value
        self.false_value = false_value

    def __repr__(self):
        fmt = 'if_else({!r}, {!r}, {!r})'.format
        return fmt(self.predicate, self.true_value, self.false_value)