# Source code for plydata.utils

import re
from contextlib import contextmanager

import numpy as np
import pandas as pd

from .eval import EvalEnvironment
from .options import options

# Matches the literal text ``True`` or ``False``. Used by ``convert_str``
# (through ``Series.str.match``) to detect string columns that hold
# boolean values. NOTE(review): the pattern itself is not anchored, so
# how strictly a string must match depends on the matching function used.
BOOL_PATTERN = re.compile(r'True|False')


def hasattrs(obj, names):
    """
    Return True if obj has every attribute listed in names

    Parameters
    ----------
    obj : object
        Object to inspect.
    names : iterable of str
        Attribute names to look for.

    Returns
    -------
    out : bool
        ``True`` when all the names are attributes of ``obj``
        (also when ``names`` is empty), ``False`` otherwise.
    """
    for attr in names:
        if not hasattr(obj, attr):
            # Stop at the first missing attribute
            return False
    return True


@contextmanager
def temporary_key(d, key, value):
    """
    Context manager that removes key from dictionary on closing

    The dictionary will hold the key for the duration of
    the context. If ``key`` is already in ``d`` when the context
    is entered, its value is overwritten and it is not restored
    on exit; the key is simply removed.

    Parameters
    ----------
    d : dict-like
        Dictionary in which to insert a temporary key.
    key : hashable
        Location at which to insert ``value``.
    value : object
        Value to insert in ``d`` at location ``key``.

    Yields
    ------
    d : dict-like
        The same dictionary that was passed in.
    """
    d[key] = value
    try:
        yield d
    finally:
        # The body of the context may have already removed the key.
        # In that case there is nothing left to clean up, and raising
        # here would hide any exception coming from the body.
        try:
            del d[key]
        except KeyError:
            pass


@contextmanager
def temporary_attr(obj, name, value):
    """
    Context manager that removes an attribute from an object on closing

    The object will hold the attribute for the duration of
    the context.

    Parameters
    ----------
    obj : object
        Object onto which to add a temporary attribute.
    name : str
        Name of attribute to add to ``obj``.
    value : object
        Value of the attribute.

    Yields
    ------
    obj : object
        The same object that was passed in.
    """
    setattr(obj, name, value)
    try:
        yield obj
    finally:
        # The body of the context may have already deleted the
        # attribute. In that case there is nothing left to clean up,
        # and raising here would hide any exception from the body.
        try:
            delattr(obj, name)
        except AttributeError:
            pass


def get_empty_env():
    """
    Create an evaluation environment with no namespaces

    Intended for use in tests and in documentation.

    Returns
    -------
    out : EvalEnvironment
        Environment constructed with ``namespaces={}``.
    """
    empty_env = EvalEnvironment(namespaces={})
    return empty_env


def Q(name):
    """
    Quote a variable name

    A way to 'quote' variable names, especially ones that do not otherwise
    meet Python's variable name rules.

    Parameters
    ----------
    name : str
        Name of variable

    Returns
    -------
    value : object
        Value of variable

    Raises
    ------
    NameError
        If ``name`` is not found in the captured environment.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import define
    >>> df = pd.DataFrame({'class': [10, 20, 30]})

    Since ``class`` is a reserved python keyword it cannot be a variable
    name, and therefore cannot be used in an expression without quoting it.

    >>> df >> define(y='class+1')
    Traceback (most recent call last):
      File "<string>", line 1
        class+1
            ^
    SyntaxError: invalid syntax

    >>> df >> define(y='Q("class")+1')
       class   y
    0     10  11
    1     20  21
    2     30  31

    Note that it is ``'Q("some name")'`` and not ``'Q(some name)'``.
    As in the above example, you do not need to ``import`` ``Q`` before
    you can use it.
    """
    # NOTE(review): the depth argument is relative to this function, so
    # this call must stay directly in the body of Q; presumably depth 1
    # selects the frame that called Q -- confirm against EvalEnvironment.
    env = EvalEnvironment.capture(1)
    try:
        return env.namespace[name]
    except KeyError:
        # The KeyError from the namespace lookup is an implementation
        # detail; suppress it so only the NameError is reported.
        raise NameError("No data named {!r} found".format(name)) from None


def n():
    """
    Size of a group

    It can be used in verbs like
    :class:`~plydata.one_table_verbs.summarize`,
    :class:`~plydata.one_table_verbs.define` and
    :class:`~plydata.one_table_verbs.create`.
    This is a special function that is internally created for each
    group dataframe.
    """
    # For documentation purposes. This module-level definition has
    # no body and returns None.


class custom_dict(dict):
    """
    Dict subclass used as a datastore when testing for conflicts

    A distinct type is needed because a regular dict creates
    conflicts with verbs whose first parameter can be a dict.
    It adds no behaviour of its own to ``dict``.
    """


@contextmanager
def regular_index(*dfs):
    """
    Change & restore the indices of dataframes

    Dataframes with duplicate index values can be hard to work with.
    When split and recombined, you cannot restore the row order.
    This can be the case even if the index is unique but
    irregular/unordered. This contextmanager resets the index of
    every dataframe passed to it whose index is not a
    :class:`pandas.RangeIndex`, and on exit it restores the
    original index.

    A regular index is of the form::

        RangeIndex(start=0, stop=n, step=1)

    The dataframes are modified in place. On exit, an original index
    is only put back if the dataframe still has as many rows as the
    original index has entries.

    Parameters
    ----------
    dfs : tuple
        Dataframes

    Yields
    ------
    dfs : tuple
        Dataframe

    Examples
    --------
    Create dataframes with different indices

    >>> df1 = pd.DataFrame([4, 3, 2, 1])
    >>> df2 = pd.DataFrame([3, 2, 1], index=[3, 0, 0])
    >>> df3 = pd.DataFrame([11, 12, 13], index=[11, 12, 13])

    Within the contextmanager all frames have nice range indices

    >>> with regular_index(df1, df2, df3):
    ...     print(df1.index)
    ...     print(df2.index)
    ...     print(df3.index)
    RangeIndex(start=0, stop=4, step=1)
    RangeIndex(start=0, stop=3, step=1)
    RangeIndex(start=0, stop=3, step=1)

    Indices restored

    >>> df1.index
    RangeIndex(start=0, stop=4, step=1)
    >>> df2.index
    Int64Index([3, 0, 0], dtype='int64')
    >>> df3.index
    Int64Index([11, 12, 13], dtype='int64')
    """
    # Remember (frame, index) only for the frames that will be touched.
    # All the pairs are collected before any index is reset.
    to_restore = [
        (frame, frame.index)
        for frame in dfs
        if not isinstance(frame.index, pd.RangeIndex)
    ]

    for frame, _ in to_restore:
        frame.reset_index(drop=True, inplace=True)

    try:
        yield dfs
    finally:
        for frame, saved_index in to_restore:
            # An index of a different length cannot be assigned back
            if len(frame.index) == len(saved_index):
                frame.index = saved_index


def unique(lst):
    """
    Return unique elements

    :class:`pandas.unique` and :class:`numpy.unique` cast
    mixed type lists to the same type. They are faster, but
    sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items. The items must be hashable.

    Returns
    -------
    out : list
        Unique items in the order that they appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype='<U5')
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string!, and numpy does not
    even maintain the order.
    """
    already_seen = set()
    result = []

    for item in lst:
        if item not in already_seen:
            # First occurrence, keep it
            already_seen.add(item)
            result.append(item)

    return result


def identity(*args):
    """
    Return whatever is passed in

    A single argument is returned as is, two or more arguments
    are returned as a tuple.

    Examples
    --------
    >>> x = 1
    >>> y = 2
    >>> identity(x)
    1
    >>> identity(x, y)
    (1, 2)
    >>> identity(*(x, y))
    (1, 2)
    """
    if len(args) > 1:
        return args
    return args[0]


def clean_indices(df, sep='_', inplace=False):
    """
    Clean up any multi/fancy indices

    1. columns multiindices are flattened
    2. The row index is turned into column(s) and the row index
       is set to the regular form (0..n)

    Parameters
    ----------
    df : dataframe
        Dataframe
    sep : str
        Separator for the new column names
    inplace : bool
        If ``True``, modify ``df`` and return ``None``. If ``False``
        (the default), work on a copy and return it.

    Returns
    -------
    out : dataframe or None
        Dataframe, or ``None`` when ``inplace=True``.

    Examples
    --------
    >>> import pandas as pd
    >>> ridx = pd.MultiIndex.from_tuples(
    ...     [(1, 'red'), (1, 'blue'),
    ...      (2, 'red'), (2, 'blue')],
    ...     names=('number', 'color')
    ... )
    >>> cidx = pd.MultiIndex.from_product(
    ...     [['part1', 'part2'], ['numeric', 'char']],
    ...     names=('parts','types')
    ... )
    >>> df = pd.DataFrame({
    ...     'w': [1, 2, 3, 4],
    ...     'x': list('aabb'),
    ...     'y': [5, 6, 7, 8],
    ...     'z': list('ccdd')
    ...     }, index=ridx
    ... )
    >>> df.columns = cidx
    >>> df
    parts          part1        part2
    types        numeric char numeric char
    number color
    1      red         1    a       5    c
           blue        2    a       6    c
    2      red         3    b       7    d
           blue        4    b       8    d
    >>> clean_indices(df)
       number color  part1_numeric part1_char  part2_numeric part2_char
    0       1   red              1          a              5          c
    1       1  blue              2          a              6          c
    2       2   red              3          b              7          d
    3       2  blue              4          b              8          d

    When the inner levels are unique, the names are not joined

    >>> cidx2 = pd.MultiIndex.from_tuples(
    ...     [('part1', 'numeric1'), ('part1', 'char1'),
    ...      ('part2', 'numeric2'), ('part2', 'char2')],
    ...     names=('parts','types')
    ... )
    >>> df.columns = cidx2
    >>> df
    parts           part1          part2
    types        numeric1 char1 numeric2 char2
    number color
    1      red          1     a        5     c
           blue         2     a        6     c
    2      red          3     b        7     d
           blue         4     b        8     d
    >>> clean_indices(df)
       number color  numeric1 char1  numeric2 char2
    0       1   red         1     a         5     c
    1       1  blue         2     a         6     c
    2       2   red         3     b         7     d
    3       2  blue         4     b         8     d
    """
    frame = df if inplace else df.copy()

    # Flatten the column labels first
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = collapse_multiindex(frame.columns, sep)

    # Move the row index into the columns
    frame.reset_index(inplace=True)

    # Drop the names left on either axis
    frame.columns.name = None
    frame.index.name = None

    if inplace:
        return None
    return frame


def collapse_multiindex(midx, sep='_'):
    """
    Collapse a MultiIndex into a minimal Index

    The fewest innermost levels that give every entry a unique
    label are kept, and the values of those levels are joined
    into a single name.

    Parameters
    ----------
    midx : pandas.MultiIndex
        MultiIndex to be collapsed
    sep : str
        Separator placed between the level values when more
        than one level is needed to make a name.

    Returns
    -------
    out : pandas.Index
        Flat Index

    Raises
    ------
    ValueError
        If the entries are not unique even when all the levels
        are used.

    Examples
    --------
    >>> m1 = pd.MultiIndex.from_product([list('a'), list('12')])
    >>> m1
    MultiIndex([('a', '1'),
                ('a', '2')],
               )
    >>> collapse_multiindex(m1)
    Index(['1', '2'], dtype='object')
    >>> m2 = pd.MultiIndex.from_product([list('ab'), list('12')])
    >>> m2
    MultiIndex([('a', '1'),
                ('a', '2'),
                ('b', '1'),
                ('b', '2')],
               )
    >>> collapse_multiindex(m2)
    Index(['a_1', 'a_2', 'b_1', 'b_2'], dtype='object')
    >>> m3 = pd.MultiIndex.from_tuples(
    ...     [('a', '1'), ('a', '2'),
    ...      ('b', '1'), ('b', '1')]
    ... )
    >>> m3
    MultiIndex([('a', '1'),
                ('a', '2'),
                ('b', '1'),
                ('b', '1')],
               )
    >>> collapse_multiindex(m3)
    Traceback (most recent call last):
        ...
    ValueError: Cannot create unique column names.
    """
    # Find the smallest number of trailing (innermost) levels that
    # uniquely identify every entry, e.g.
    # - [(a, 1), (a, 2)] -> [(1,), (2,)]
    # - [(a, 1), (a, 2), (b, 1), (b, 2)] ->
    #       [(a, 1), (a, 2), (b, 1), (b, 2)]
    # - [(z, a, 1), (z, a, 2), (z, b, 1), (z, b, 2)] ->
    #       [(a, 1), (a, 2), (b, 1), (b, 2)]
    for depth in range(1, midx.nlevels + 1):
        id_tokens = [entry[-depth:] for entry in midx]
        if len(set(id_tokens)) == len(id_tokens):
            break
    else:
        raise ValueError("Cannot create unique column names.")

    columns = []
    for toks in id_tokens:
        if len(toks) == 1:
            # A lone token is used as is, so that names which are
            # not strings (e.g. integers) are preserved
            columns.append(toks[0])
        else:
            columns.append(sep.join(str(t) for t in toks))

    return pd.Index(columns)


def convert_str(data, columns=None):
    """
    Try converting string/object columns in data to more specific dtype

    Columns whose values are all numeric strings, or that can be
    cast to float, are converted with :func:`pandas.to_numeric`.
    Otherwise, columns whose values match ``BOOL_PATTERN`` have the
    strings ``'True'`` and ``'False'`` replaced with booleans.
    Other columns are left as they are.

    This function modifies the input data.

    Parameters
    ----------
    data : dataframe
        Data
    columns : list-like or None
        Names of columns to check and maybe convert.
        If ``None``, all the columns that have a ``str``
        accessor are checked.

    Returns
    -------
    data : dataframe
        Data
    """
    if columns is None:
        columns = [
            label
            for label, series in data.items()
            if hasattr(series, 'str')
        ]

    def all_numeric_strings(series):
        return series.str.isnumeric().all()

    def castable_to_float(series):
        try:
            series.astype(float)
        except ValueError:
            return False
        return True

    def all_boolean_strings(series):
        return series.str.match(BOOL_PATTERN).all()

    bool_lookup = {
        'True': True,
        'False': False
    }

    for label in columns:
        series = data[label]

        if all_numeric_strings(series) or castable_to_float(series):
            data[label] = pd.to_numeric(series)
            continue

        if all_boolean_strings(series):
            data[label] = series.replace(bool_lookup)

    return data


def verify_arg(value, name, options):
    """
    Verify Argument

    Check that the value of an argument is one of the allowed
    options.

    Parameters
    ----------
    value : int | str
        Value of argument
    name : str
        Name of argument
    options : list-like | set
        Allowed values of argument

    Raises
    ------
    ValueError
        If value is not in the allowed options.

    Examples
    --------
    >>> verify_arg('dog', 'pet', ('fish', 'dog', 'cat'))
    >>> verify_arg('snail', 'pet', ('fish', 'dog', 'cat'))
    Traceback (most recent call last):
        ...
    ValueError: Got pet='snail'. Should be one of ('fish', 'dog', 'cat')
    """
    if value in options:
        return

    message = "Got {}={!r}. Should be one of {!r}".format(
        name, value, options
    )
    raise ValueError(message)


def mean_if_many(x):
    """
    Compute mean of x if x does not have exactly 1 element

    If x has one element, return that element.
    By not computing the mean of a single element;

        - singular integer values remain integers
        - a single string value passes through so this can be used as
          an aggregate function (aggfunc) when pivoting. This avoids an
          unnecessary error.

    Parameters
    ----------
    x : list-like
        Values whose mean to compute

    Returns
    -------
    out : object
        Mean of x or the only value in x

    Examples
    --------
    >>> mean_if_many([4])
    4
    >>> mean_if_many([4, 4])
    4.0
    >>> mean_if_many([4, 5, 6, 7])
    5.5
    >>> mean_if_many(['string_1'])
    'string_1'
    >>> mean_if_many(['string_1', 'string_2'])
    Traceback (most recent call last):
        ...
    TypeError: cannot perform reduce with flexible type
    """
    if len(x) == 1:
        # Hand back the lone value untouched, whatever its type
        return next(iter(x))
    return np.mean(x)


def last2(x, y):
    """
    Find last value of y when sorted by x

    Parameters
    ----------
    x : list-like
        Values to sort by
    y : list-like
        Values from which the result is picked

    Returns
    -------
    obj : object
        Last value of y when sorted by x

    Examples
    --------
    >>> x = [1, 2, 3, 99, 5, 6]
    >>> y = [1, 2, 3, 4, 5, 6]
    >>> last2(x, y)
    4
    >>> last2(x, y[::-1])
    3

    See Also
    --------
    :class:`~plydata.cat_tools.reorder2`
    """
    # An array is required so that y can be indexed with
    # the array of positions returned by argsort
    y = np.asarray(y)
    return y[np.argsort(x)][-1]


def first2(x, y):
    """
    Find first value of y when sorted by x

    Parameters
    ----------
    x : list-like
        Values to sort by
    y : list-like
        Values from which the result is picked

    Returns
    -------
    obj : object
        First value of y when sorted by x

    Examples
    --------
    >>> x = [1, 2, 3, -99, 5, 6]
    >>> y = [1, 2, 3, 4, 5, 6]
    >>> first2(x, y)
    4
    >>> first2(x, y[::-1])
    3

    See Also
    --------
    :class:`~plydata.cat_tools.reorder2`
    """
    # An array is required so that y can be indexed with
    # the array of positions returned by argsort
    y = np.asarray(y)
    return y[np.argsort(x)][0]


def ply(data, *verbs):
    """
    Pipe data through the verbs

    This function allows you to use plydata without
    abusing the ``>>`` operator.

    Parameters
    ----------
    data : dataframe
        Data
    verbs : tuple
        Verbs to which the data should be piped, applied in
        the order given.

    Returns
    -------
    out : object
        Result of piping a copy of ``data`` through all the verbs.

    Examples
    --------

    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [0, 1, 2, 3],
    ...     'y': ['zero', 'one', 'two', 'three']}
    ... )

    Using ply

    >>> ply(
    ...    df,
    ...    define(z='2*x', w='y+"-"+y'),
    ...    group_by(parity='x % 2'),
    ...    define(u='sum(z)')
    ... )
    groups: ['parity']
       x      y  z            w  parity  u
    0  0   zero  0    zero-zero       0  4
    1  1    one  2      one-one       1  8
    2  2    two  4      two-two       0  4
    3  3  three  6  three-three       1  8

    Is equivalent to

    >>> (df
    ...  >> define(z='2*x', w='y+"-"+y')
    ...  >> group_by(parity='x % 2')
    ...  >> define(u='sum(z)'))
    groups: ['parity']
       x      y  z            w  parity  u
    0  0   zero  0    zero-zero       0  4
    1  1    one  2      one-one       1  8
    2  2    two  4      two-two       0  4
    3  3  three  6  three-three       1  8
    """
    # Work on a copy so that the data passed in by the caller is
    # not touched while the modify_input_data option is switched on.
    # NOTE(review): presumably that option lets the verbs operate
    # directly on the copy instead of copying at every step --
    # confirm against the options module.
    data = data.copy()
    with options(modify_input_data=True):
        for verb in verbs:
            data >>= verb
    return data