import re
from contextlib import contextmanager
import numpy as np
import pandas as pd
from .eval import EvalEnvironment
from .options import options
# Compiled regular expression for the literal text 'True' or 'False'
BOOL_PATTERN = re.compile(r'True|False')
def hasattrs(obj, names):
    """
    Return True if obj has all the named attributes

    Parameters
    ----------
    obj : object
        Object to inspect.
    names : iterable of str
        Attribute names to look for.

    Returns
    -------
    out : bool
        ``True`` when every name in ``names`` is an attribute of
        ``obj`` (including when ``names`` is empty), else ``False``.
    """
    for attr in names:
        # Stop at the first missing attribute
        if not hasattr(obj, attr):
            return False
    return True
@contextmanager
def temporary_key(d, key, value):
    """
    Context manager that keeps a key in a dictionary while it is open

    On entry ``value`` is stored at ``d[key]``. On exit ``d[key]``
    is deleted, whether or not the body raised an exception. Any
    value that was stored at ``key`` before entry is not restored.

    Parameters
    ----------
    d : dict-like
        Dictionary in which to insert a temporary key.
    key : hashable
        Location at which to insert ``value``.
    value : object
        Value to insert in ``d`` at location ``key``.

    Yields
    ------
    d : dict-like
        The same dictionary that was passed in.
    """
    d[key] = value
    try:
        yield d
    finally:
        # Runs on a normal exit and when the body raises
        del d[key]
@contextmanager
def temporary_attr(obj, name, value):
    """
    Context manager that keeps an attribute on an object while it is open

    On entry ``value`` is set as attribute ``name`` of ``obj``. On
    exit the attribute is deleted, whether or not the body raised
    an exception. Any value the attribute had before entry is not
    restored.

    Parameters
    ----------
    obj : object
        Object onto which to add a temporary attribute.
    name : str
        Name of attribute to add to ``obj``.
    value : object
        Value of the attribute ``name``.

    Yields
    ------
    obj : object
        The same object that was passed in.
    """
    setattr(obj, name, value)
    try:
        yield obj
    finally:
        # Runs on a normal exit and when the body raises
        delattr(obj, name)
def get_empty_env():
    """
    Return an empty environment

    This is for testing or documentation purposes

    Returns
    -------
    out : EvalEnvironment
        Environment created with an empty ``namespaces`` mapping.
    """
    empty_env = EvalEnvironment(namespaces={})
    return empty_env
def Q(name):
    """
    Quote a variable name

    A way to 'quote' variable names, especially ones that do not otherwise
    meet Python's variable name rules.

    Parameters
    ----------
    name : str
        Name of variable

    Returns
    -------
    value : object
        Value of variable

    Raises
    ------
    NameError
        If ``name`` is not found in the captured environment.

    Examples
    --------
    >>> import pandas as pd
    >>> from plydata import define
    >>> df = pd.DataFrame({'class': [10, 20, 30]})

    Since ``class`` is a reserved python keyword it cannot be a variable
    name, and therefore cannot be used in an expression without quoting it.

    >>> df >> define(y='class+1')
    Traceback (most recent call last):
      File "<string>", line 1
        class+1
            ^
    SyntaxError: invalid syntax

    >>> df >> define(y='Q("class")+1')
       class   y
    0     10  11
    1     20  21
    2     30  31

    Note that it is ``'Q("some name")'`` and not ``'Q(some name)'``.
    As in the above example, you do not need to ``import`` ``Q`` before
    you can use it.
    """
    # NOTE(review): the depth argument presumably selects the frame that
    # called Q, so this call has to stay directly in the body of Q -
    # confirm against EvalEnvironment.capture
    caller_env = EvalEnvironment.capture(1)
    try:
        value = caller_env.namespace[name]
    except KeyError:
        raise NameError("No data named {!r} found".format(name))
    return value
def n():
    """
    Size of a group

    It can be used in verbs like
    :class:`~plydata.one_table_verbs.summarize`,
    :class:`~plydata.one_table_verbs.define` and
    :class:`~plydata.one_table_verbs.create`.

    This is a special function that is internally created for each
    group dataframe.
    """
    # For documentation purposes; this placeholder computes nothing
    return None
class custom_dict(dict):
    """
    Dict datastore for conflict testing purposes

    A plain ``dict`` subclass that adds nothing of its own. Using a
    regular dict creates conflicts with verbs whose first parameter
    can be a dict.
    """
@contextmanager
def regular_index(*dfs):
    """
    Change & restore the indices of dataframes

    Dataframes with duplicate index values can be hard to work with.
    When split and recombined, you cannot restore the row order.
    This can be the case even if the index is unique but
    irregular/unordered. This contextmanager resets, in place, the
    index of every dataframe passed to it whose index is not a
    :class:`pandas.RangeIndex`. On exit it puts the original index
    back, provided the dataframe still has the same number of rows.

    A regular index is of the form::

        RangeIndex(start=0, stop=n, step=1)

    Parameters
    ----------
    dfs : tuple
        Dataframes

    Yields
    ------
    dfs : tuple
        The same dataframes that were passed in.

    Examples
    --------
    Create dataframes with different indices

    >>> df1 = pd.DataFrame([4, 3, 2, 1])
    >>> df2 = pd.DataFrame([3, 2, 1], index=[3, 0, 0])
    >>> df3 = pd.DataFrame([11, 12, 13], index=[11, 12, 13])

    Within the contextmanager all frames have nice range indices

    >>> with regular_index(df1, df2, df3):
    ...     print(df1.index)
    ...     print(df2.index)
    ...     print(df3.index)
    RangeIndex(start=0, stop=4, step=1)
    RangeIndex(start=0, stop=3, step=1)
    RangeIndex(start=0, stop=3, step=1)

    Indices restored

    >>> df1.index
    RangeIndex(start=0, stop=4, step=1)
    >>> df2.index
    Int64Index([3, 0, 0], dtype='int64')
    >>> df3.index
    Int64Index([11, 12, 13], dtype='int64')
    """
    # Remember every frame that needs a reset together with the index
    # it had on entry
    to_restore = [
        (df, df.index)
        for df in dfs
        if not isinstance(df.index, pd.RangeIndex)
    ]

    for df, _ in to_restore:
        df.reset_index(drop=True, inplace=True)

    try:
        yield dfs
    finally:
        for df, saved_index in to_restore:
            # The saved index only fits if the number of rows did
            # not change inside the context
            if len(df.index) == len(saved_index):
                df.index = saved_index
def unique(lst):
    """
    Return unique elements

    :class:`pandas.unique` and :class:`numpy.unique` cast
    mixed type lists to the same type. They are faster, but
    sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of hashable items

    Returns
    -------
    out : list
        Unique items in the order that they first appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype='<U5')
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string!, and numpy does not
    even maintain the order.
    """
    seen = set()
    out = []
    for item in lst:
        if item in seen:
            continue
        seen.add(item)
        out.append(item)
    return out
def identity(*args):
    """
    Return whatever is passed in

    A single argument is returned as is, two or more arguments
    are returned as a tuple.

    Examples
    --------
    >>> x = 1
    >>> y = 2
    >>> identity(x)
    1
    >>> identity(x, y)
    (1, 2)
    >>> identity(*(x, y))
    (1, 2)
    """
    if len(args) <= 1:
        return args[0]
    return args
def clean_indices(df, sep='_', inplace=False):
    """
    Clean up any multi/fancy indices

    1. columns multiindices are flattened
    2. The row index (including fancy multivariable ones) is turned
       into columns and the row index is set to regular form (0..n)

    Parameters
    ----------
    df : dataframe
        Dataframe
    sep : str
        Separator for the new column names
    inplace : bool
        If ``True``, modify ``df`` itself and return ``None``.
        Otherwise work on a copy and return it.

    Returns
    -------
    out : dataframe or None
        Cleaned dataframe, or ``None`` when ``inplace`` is ``True``.

    Examples
    --------
    >>> import pandas as pd
    >>> ridx = pd.MultiIndex.from_tuples(
    ...     [(1, 'red'), (1, 'blue'),
    ...      (2, 'red'), (2, 'blue')],
    ...     names=('number', 'color')
    ... )
    >>> cidx = pd.MultiIndex.from_product(
    ...     [['part1', 'part2'], ['numeric', 'char']],
    ...     names=('parts','types')
    ... )
    >>> df = pd.DataFrame({
    ...     'w': [1, 2, 3, 4],
    ...     'x': list('aabb'),
    ...     'y': [5, 6, 7, 8],
    ...     'z': list('ccdd')
    ...    }, index=ridx
    ... )
    >>> df.columns = cidx
    >>> df
    parts          part1        part2
    types        numeric char numeric char
    number color
    1      red         1    a       5    c
           blue        2    a       6    c
    2      red         3    b       7    d
           blue        4    b       8    d
    >>> clean_indices(df)
       number color  part1_numeric part1_char  part2_numeric part2_char
    0       1   red              1          a              5          c
    1       1  blue              2          a              6          c
    2       2   red              3          b              7          d
    3       2  blue              4          b              8          d

    When the inner levels are unique, the names are not joined

    >>> cidx2 = pd.MultiIndex.from_tuples(
    ...     [('part1', 'numeric1'), ('part1', 'char1'),
    ...      ('part2', 'numeric2'), ('part2', 'char2')],
    ...     names=('parts','types')
    ... )
    >>> df.columns = cidx2
    >>> df
    parts           part1          part2
    types        numeric1 char1 numeric2 char2
    number color
    1      red          1     a        5     c
           blue         2     a        6     c
    2      red          3     b        7     d
           blue         4     b        8     d
    >>> clean_indices(df)
       number color  numeric1 char1  numeric2 char2
    0       1   red         1     a         5     c
    1       1  blue         2     a         6     c
    2       2   red         3     b         7     d
    3       2  blue         4     b         8     d
    """
    target = df if inplace else df.copy()

    has_nested_columns = isinstance(target.columns, pd.MultiIndex)
    if has_nested_columns:
        target.columns = collapse_multiindex(target.columns, sep)

    # Move the row index into columns, then drop any leftover
    # axis names
    target.reset_index(inplace=True)
    target.columns.name = None
    target.index.name = None

    if inplace:
        return None
    return target
def collapse_multiindex(midx, sep='_'):
    """
    Collapse a MultiIndex into a minimal Index

    Each entry is named using the fewest inner-most levels that
    make all the names unique. When more than one level is needed,
    the level values are converted to strings and joined with
    ``sep``.

    Parameters
    ----------
    midx : pandas.MultiIndex
        MultiIndex to be collapsed
    sep : str
        Separator placed between the level values of a joined name

    Returns
    -------
    out : pandas.Index
        Flat Index

    Raises
    ------
    ValueError
        If the entries are not unique even when all the levels
        are used.

    Examples
    --------
    >>> m1 = pd.MultiIndex.from_product([list('a'), list('12')])
    >>> m1
    MultiIndex([('a', '1'),
                ('a', '2')],
               )
    >>> collapse_multiindex(m1)
    Index(['1', '2'], dtype='object')
    >>> m2 = pd.MultiIndex.from_product([list('ab'), list('12')])
    >>> m2
    MultiIndex([('a', '1'),
                ('a', '2'),
                ('b', '1'),
                ('b', '2')],
               )
    >>> collapse_multiindex(m2)
    Index(['a_1', 'a_2', 'b_1', 'b_2'], dtype='object')
    >>> m3 = pd.MultiIndex.from_tuples(
    ...     [('a', '1'), ('a', '2'),
    ...      ('b', '1'), ('b', '1')]
    ... )
    >>> m3
    MultiIndex([('a', '1'),
                ('a', '2'),
                ('b', '1'),
                ('b', '1')],
               )
    >>> collapse_multiindex(m3)
    Traceback (most recent call last):
        ...
    ValueError: Cannot create unique column names.
    """
    # Find the shortest tail of every entry that tells the entries
    # apart, starting with only the inner-most level.
    # - [(a, 1), (a, 2)] -> [(1,), (2,)]
    # - [(a, 1), (a, 2), (b, 1), (b, 2)] ->
    #       [(a, 1), (a, 2), (b, 1), (b, 2)]
    # - [(z, a, 1), (z, a, 2), (z, b, 1), (z, b, 2)] ->
    #       [(a, 1), (a, 2), (b, 1), (b, 2)]
    for depth in range(1, midx.nlevels + 1):
        tails = [labels[-depth:] for labels in midx]
        if len(tails) == len(set(tails)):
            break
    else:
        raise ValueError("Cannot create unique column names.")

    flat_names = []
    for tail in tails:
        if len(tail) == 1:
            # A lone value is kept as it is (not turned into a
            # string), which preserves e.g. integer column names
            flat_names.append(tail[0])
        else:
            flat_names.append(sep.join(map(str, tail)))
    return pd.Index(flat_names)
def convert_str(data, columns=None):
    """
    Try converting string/object columns in data to more specific dtype

    This function modifies the input data.

    A column is converted to numbers when all its values pass the
    numeric checks, and to booleans when every non-missing value
    is exactly ``'True'`` or ``'False'``. Any other column is left
    untouched.

    Parameters
    ----------
    data : dataframe
        Data
    columns : list-like or None
        Names of columns to check and maybe convert.
        If ``None``, all the string columns are converted.

    Returns
    -------
    data : dataframe
        Data
    """
    if columns is None:
        # Only columns that expose the ``.str`` accessor are candidates
        columns = [
            name
            for name, col in data.items()
            if hasattr(col, 'str')
        ]

    bool_values = {'True': True, 'False': False}

    def is_numeric(col):
        return col.str.isnumeric().all()

    def is_float(col):
        try:
            col.astype(float)
        except ValueError:
            return False
        else:
            return True

    def is_bool(col):
        # Compare whole values. A prefix match would accept strings
        # such as 'Trueish' or 'Falsehood', which cannot be converted
        # and would leave a column that mixes booleans and strings.
        # Missing values are ignored by the check.
        return col.dropna().isin(list(bool_values)).all()

    for name in columns:
        col = data[name]
        if is_numeric(col) or is_float(col):
            data[name] = pd.to_numeric(col)
        elif is_bool(col):
            # Every non-missing value is a key of bool_values, so only
            # missing values come out of the mapping as missing
            data[name] = col.map(bool_values)
    return data
def verify_arg(value, name, options):
    """
    Verify Argument

    Parameters
    ----------
    value : int | str
        Value of argument
    name : str
        Name of argument
    options : list-like | set
        Allowed values of argument

    Raises
    ------
    ValueError
        If value is not in the allowed options.

    Examples
    --------
    >>> verify_arg('dog', 'pet', ('fish', 'dog', 'cat'))
    >>> verify_arg('snail', 'pet', ('fish', 'dog', 'cat'))
    Traceback (most recent call last):
        ...
    ValueError: Got pet='snail'. Should be one of ('fish', 'dog', 'cat')
    """
    if value in options:
        return

    msg = "Got {}={!r}. Should be one of {!r}".format(
        name, value, options
    )
    raise ValueError(msg)
def mean_if_many(x):
    """
    Compute mean of x unless x has exactly one element

    If x has one element, return that element.
    By only computing the mean when x does not hold a single value:

    - singular integer values remain integers
    - a single string value passes through so this can be used as
      an aggregate function (aggfunc) when pivoting. This avoids an
      unnecessary error.

    Parameters
    ----------
    x : list-like
        Values whose mean to compute

    Returns
    -------
    out : object
        Mean of x or the only value in x

    Examples
    --------
    >>> mean_if_many([4])
    4
    >>> mean_if_many([4, 4])
    4.0
    >>> mean_if_many([4, 5, 6, 7])
    5.5
    >>> mean_if_many(['string_1'])
    'string_1'
    >>> mean_if_many(['string_1', 'string_2'])
    Traceback (most recent call last):
        ...
    TypeError: cannot perform reduce with flexible type
    """
    if len(x) != 1:
        return np.mean(x)
    # The lone element, as yielded by iterating over x
    return next(iter(x))
def last2(x, y):
    """
    Find last value of y when sorted by x

    Parameters
    ----------
    x : list-like
        Values to sort by
    y : list-like
        Values to pick from

    Returns
    -------
    obj : object
        Last value of y when sorted by x

    Examples
    --------
    >>> x = [1, 2, 3, 99, 5, 6]
    >>> y = [1, 2, 3, 4, 5, 6]
    >>> last2(x, y)
    4
    >>> last2(x, y[::-1])
    3

    See Also
    --------
    :class:`~plydata.cat_tools.reorder2`
    """
    values = np.asarray(y)
    order = np.argsort(x)
    # Position of the entry that comes last when sorting by x
    return values[order[-1]]
def first2(x, y):
    """
    Find first value of y when sorted by x

    Parameters
    ----------
    x : list-like
        Values to sort by
    y : list-like
        Values to pick from

    Returns
    -------
    obj : object
        First value of y when sorted by x

    Examples
    --------
    >>> x = [1, 2, 3, -99, 5, 6]
    >>> y = [1, 2, 3, 4, 5, 6]
    >>> first2(x, y)
    4
    >>> first2(x, y[::-1])
    3

    See Also
    --------
    :class:`~plydata.cat_tools.reorder2`
    """
    values = np.asarray(y)
    order = np.argsort(x)
    # Position of the entry that comes first when sorting by x
    return values[order[0]]
def ply(data, *verbs):
    """
    Pipe data through the verbs

    This function allows you to use plydata without
    abusing the ``>>`` operator.

    Parameters
    ----------
    data : dataframe
        Data
    verbs : tuple
        Verbs through which the data should be piped, in order

    Returns
    -------
    out : dataframe
        Result of piping a copy of ``data`` through all the verbs.

    Examples
    --------
    >>> from plydata import *
    >>> df = pd.DataFrame({
    ...     'x': [0, 1, 2, 3],
    ...     'y': ['zero', 'one', 'two', 'three']}
    ... )

    Using ply

    >>> ply(
    ...     df,
    ...     define(z='2*x', w='y+"-"+y'),
    ...     group_by(parity='x % 2'),
    ...     define(u='sum(z)')
    ... )
    groups: ['parity']
       x      y  z            w  parity  u
    0  0   zero  0    zero-zero       0  4
    1  1    one  2      one-one       1  8
    2  2    two  4      two-two       0  4
    3  3  three  6  three-three       1  8

    Is equivalent to

    >>> (df
    ...  >> define(z='2*x', w='y+"-"+y')
    ...  >> group_by(parity='x % 2')
    ...  >> define(u='sum(z)'))
    groups: ['parity']
       x      y  z            w  parity  u
    0  0   zero  0    zero-zero       0  4
    1  1    one  2      one-one       1  8
    2  2    two  4      two-two       0  4
    3  3  three  6  three-three       1  8
    """
    # The verbs only ever see this copy, never the caller's dataframe
    piped = data.copy()
    # NOTE(review): modify_input_data=True presumably lets each verb
    # work directly on the dataframe it receives - confirm in options
    with options(modify_input_data=True):
        for step in verbs:
            piped >>= step
    return piped