"""
Functions for categoricals
"""
from itertools import chain, product
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from pandas.core.algorithms import value_counts
from .utils import last2
# Names exported by a star-import of this module: the ``cat_*``
# functions, listed alphabetically.
__all__ = [
'cat_anon',
'cat_collapse',
'cat_concat',
'cat_drop',
'cat_expand',
'cat_explicit_na',
'cat_infreq',
'cat_inorder',
'cat_inseq',
'cat_lump',
'cat_lump_lowfreq',
'cat_lump_min',
'cat_lump_n',
'cat_lump_prop',
'cat_move',
'cat_other',
'cat_recode',
'cat_relabel',
'cat_relevel',
'cat_rename',
'cat_reorder',
'cat_reorder2',
'cat_rev',
'cat_shift',
'cat_shuffle',
'cat_unify',
'cat_zip',
]
def cat_infreq(c, ordered=None):
    """
    Reorder categorical by frequency of the values

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> x = ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    >>> cat_infreq(x)
    ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    Categories (4, object): ['c', 'b', 'd', 'a']
    >>> cat_infreq(x, ordered=True)
    ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    Categories (4, object): ['c' < 'b' < 'd' < 'a']

    When two or more values occur the same number of times, if the
    input is a categorical, the order of its categories is preserved.
    If it is not a categorical, the order depends on that of the
    values. Above 'd' comes before 'a', and below 'a' comes before 'd'.

    >>> c = pd.Categorical(
    ...     x, categories=['a', 'c', 'b', 'd']
    ... )
    >>> cat_infreq(c)
    ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    Categories (4, object): ['c', 'b', 'a', 'd']
    >>> cat_infreq(c.set_ordered(True))
    ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    Categories (4, object): ['c' < 'b' < 'a' < 'd']

    Notes
    -----
    Missing values (``NaN`` or ``None``) are ignored when creating
    the categories.
    """
    kwargs = {} if ordered is None else {'ordered': ordered}

    # Counting through a Series works for every list-like input and
    # leaves missing values out of the counts.
    values = c if isinstance(c, pd.Series) else pd.Series(c)
    counts = values.value_counts()

    if isinstance(values.dtype, pd.CategoricalDtype):
        # Read the categories off the dtype so that both a Categorical
        # and a Series of categorical dtype are handled.
        original_cat_order = values.dtype.categories
    else:
        # Order of first appearance; missing values cannot be categories.
        original_cat_order = values.dropna().unique()

    # Put the counts in the original order so that the stable sort
    # breaks ties by that order.
    counts = counts.reindex(index=original_cat_order)
    cats = (_stable_series_sort(counts, ascending=False)
            .index
            .to_list())
    return pd.Categorical(c, categories=cats, **kwargs)
def cat_inorder(c, ordered=None):
    """
    Reorder categorical by appearance

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> import numpy as np
    >>> x = [4, 1, 3, 4, 4, 7, 3]
    >>> cat_inorder(x)
    [4, 1, 3, 4, 4, 7, 3]
    Categories (4, int64): [4, 1, 3, 7]
    >>> arr = np.array(x)
    >>> cat_inorder(arr)
    [4, 1, 3, 4, 4, 7, 3]
    Categories (4, int64): [4, 1, 3, 7]
    >>> c = ['b', 'f', 'c', None, 'c', 'a', 'b', 'e']
    >>> cat_inorder(c)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b', 'f', 'c', 'a', 'e']
    >>> s = pd.Series(c)
    >>> cat_inorder(s)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b', 'f', 'c', 'a', 'e']
    >>> cat = pd.Categorical(c)
    >>> cat_inorder(cat)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b', 'f', 'c', 'a', 'e']
    >>> cat_inorder(cat, ordered=True)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b' < 'f' < 'c' < 'a' < 'e']

    By default, ordered categories remain ordered.

    >>> ocat = pd.Categorical(cat, ordered=True)
    >>> ocat
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['a' < 'b' < 'c' < 'e' < 'f']
    >>> cat_inorder(ocat)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b' < 'f' < 'c' < 'a' < 'e']
    >>> cat_inorder(ocat, ordered=False)
    ['b', 'f', 'c', NaN, 'c', 'a', 'b', 'e']
    Categories (5, object): ['b', 'f', 'c', 'a', 'e']

    Notes
    -----
    ``NaN`` or ``None`` are ignored when creating the categories.
    """
    kwargs = {} if ordered is None else {'ordered': ordered}

    if isinstance(c, (pd.Series, pd.Categorical)):
        cats = c[~pd.isnull(c)].unique()
        if hasattr(cats, 'to_list'):
            cats = cats.to_list()
    elif hasattr(c, 'dtype'):
        # Array-like (e.g. numpy array), boolean indexing is available
        cats = pd.unique(c[~pd.isnull(c)])
    else:
        # Plain python sequence. Materialise it once so that tuples
        # (and other iterables) are handled like lists, then remove
        # the duplicates with a dict, which keeps the order of first
        # appearance.
        c = list(c)
        cats = list(dict.fromkeys(
            x for x, missing in zip(c, pd.isnull(c))
            if not missing
        ))

    return pd.Categorical(c, categories=cats, **kwargs)
def cat_inseq(c, ordered=None):
    """
    Reorder categorical by numerical order

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered. If ``None``, the
        orderedness of the categorical made from ``c`` is kept.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    ValueError
        If there are values but none of the categories can be
        coerced to a number.

    Examples
    --------
    >>> x = pd.Categorical([5, 1, 3, 2, 4])
    >>> cat_inseq(x)
    [5, 1, 3, 2, 4]
    Categories (5, int64): [1, 2, 3, 4, 5]
    >>> x = pd.Categorical([5, 1, '3', 2, 4])
    >>> cat_inseq(x)
    [5, 1, 3, 2, 4]
    Categories (5, int64): [1, 2, 3, 4, 5]

    Values that cannot be coerced to numerical turn in ``NaN``,
    and categories cannot be ``NaN``.

    >>> x = pd.Categorical([5, 1, 'three', 2, 4])
    >>> cat_inseq(x)
    [5, 1, NaN, 2, 4]
    Categories (4, int64): [1, 2, 4, 5]

    Coerces values to numerical

    >>> x = [5, 1, '3', 2, 4]
    >>> cat_inseq(x, ordered=True)
    [5, 1, 3, 2, 4]
    Categories (5, int64): [1 < 2 < 3 < 4 < 5]
    >>> x = [5, 1, '3', 2, '4.5']
    >>> cat_inseq(x)
    [5.0, 1.0, 3.0, 2.0, 4.5]
    Categories (5, float64): [1.0, 2.0, 3.0, 4.5, 5.0]

    Atleast one of the values must be coercible to the integer

    >>> x = ['five', 'one', 'three', 'two', 'four']
    >>> cat_inseq(x)
    Traceback (most recent call last):
        ...
    ValueError: Atleast one existing category must be a number.
    >>> x = ['five', 'one', '3', 'two', 'four']
    >>> cat_inseq(x)
    [NaN, NaN, 3, NaN, NaN]
    Categories (1, int64): [3]
    """
    c = as_categorical(c)

    # Convert one category at a time to avoid turning integers into
    # floats when some of the categories become NaN.
    converted = [pd.to_numeric(x, errors='coerce') for x in c.categories]
    # Distinct numbers in increasing order. Categories that convert to
    # the same number (e.g. 3 and '3') end up as a single category.
    numerical_cats = sorted({x for x in converted if not pd.isnull(x)})

    if len(numerical_cats) == 0 and len(c) > 0:
        raise ValueError(
            "Atleast one existing category must be a number."
        )

    # Translate the codes explicitly instead of renaming by position.
    # A positional rename attaches the wrong number to the values when
    # a non-numeric category comes before a numeric one.
    # recode[k] is the new code of old category k, and the extra last
    # entry is what the missing value code (-1) picks up.
    position = {value: i for i, value in enumerate(numerical_cats)}
    recode = np.array(
        [position.get(x, -1) for x in converted] + [-1],
        dtype=np.intp
    )
    new_codes = recode[c.codes]

    if ordered is None:
        ordered = c.ordered

    return pd.Categorical.from_codes(
        new_codes,
        categories=numerical_cats,
        ordered=ordered
    )
def cat_reorder(c, x, fun=np.median, ascending=True):
    """
    Reorder categorical by sorting along another variable

    It is the order of the categories that changes. Values in x
    are grouped by categories and summarised to determine the
    new order.

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    x : list-like
        Values by which ``c`` will be ordered. They are matched
        to the values of ``c`` by position.
    fun : callable
        Summarising function to ``x`` for each category in ``c``.
        Default is the *median*.
    ascending : bool
        If ``True``, the ``c`` is ordered in ascending order of ``x``.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    ValueError
        If ``c`` and ``x`` do not have the same length.

    Examples
    --------
    >>> c = list('abbccc')
    >>> x = [11, 2, 2, 3, 33, 3]
    >>> cat_reorder(c, x)
    ['a', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['b', 'c', 'a']
    >>> cat_reorder(c, x, fun=max)
    ['a', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['b', 'a', 'c']
    >>> cat_reorder(c, x, fun=max, ascending=False)
    ['a', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['c', 'a', 'b']
    >>> c_ordered = pd.Categorical(c, ordered=True)
    >>> cat_reorder(c_ordered, x)
    ['a', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['b' < 'c' < 'a']
    >>> cat_reorder(c + ['d'], x)
    Traceback (most recent call last):
        ...
    ValueError: Lengths are not equal. len(c) is 7 and len(x) is 6.
    """
    if len(c) != len(x):
        raise ValueError(
            "Lengths are not equal. len(c) is {} and len(x) is {}.".format(
                len(c), len(x)
            )
        )

    # A Series used as the group key is aligned on its index labels,
    # which pairs the wrong values (or none at all) when that index
    # is not 0..n-1. Grouping by the underlying array is positional.
    keys = c.values if isinstance(c, pd.Series) else c

    summary = (pd.Series(x)
               .groupby(keys)
               .apply(fun)
               .sort_values(ascending=ascending)
               )
    cats = summary.index.to_list()
    return pd.Categorical(c, categories=cats)
def cat_reorder2(c, x, y, *args, fun=last2, ascending=False, **kwargs):
    """
    Reorder categorical by sorting along two other variables

    It is the order of the categories that changes. Values in x
    and y are grouped by categories and summarised to determine the
    new order.

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    x : list-like
        First set of values passed to ``fun``. They are matched to
        the values of ``c`` by position.
    y : list-like
        Second set of values passed to ``fun``. They are matched to
        the values of ``c`` by position.
    *args : tuple
        Position arguments passed to function fun.
    fun : callable
        Summarising function called as ``fun(x, y, *args, **kwargs)``
        for each category in ``c``. It should return a single
        sortable value. Default is ``last2`` (imported from the
        ``utils`` module of this package).
    ascending : bool
        If ``True``, the ``c`` is ordered in ascending order of
        the summaries.
    **kwargs : dict
        Keyword arguments passed to ``fun``.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    ValueError
        If ``c``, ``x`` and ``y`` do not have the same length.

    Examples
    --------

    Order stocks by the price in the latest year. This type of ordering
    can be used to order line plots so that the ends match the order of
    the legend.

    >>> stocks = list('AAABBBCCC')
    >>> year = [1980, 1990, 2000] * 3
    >>> price = [12.34, 12.90, 13.55, 10.92, 14.73, 11.08, 9.02, 12.44, 15.65]
    >>> cat_reorder2(stocks, year, price)
    ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C']
    Categories (3, object): ['C', 'A', 'B']
    """
    if len(c) != len(x) or len(x) != len(y):
        raise ValueError(
            "Lengths are not equal. len(c) is {}, len(x) is {} and "
            "len(y) is {}.".format(len(c), len(x), len(y))
        )

    def _positional(values):
        # Drop the index of a Series so that c, x and y are matched
        # by position and not aligned by index labels.
        return values.values if isinstance(values, pd.Series) else values

    # Wrap two argument function fun with a function that
    # takes a dataframe, put x and y into a dataframe, then
    # use dataframe.groupby
    def _fun(cat_df):
        return fun(cat_df['x'], cat_df['y'], *args, **kwargs)

    summary = (pd.DataFrame({'x': _positional(x), 'y': _positional(y)})
               .groupby(_positional(c))
               .apply(_fun)
               .sort_values(ascending=ascending)
               )
    cats = summary.index.to_list()
    return pd.Categorical(c, categories=cats)
def cat_move(c, *args, to=0):
    """
    Reorder categories explicitly

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    *args : tuple
        Categories to reorder. Any categories not mentioned
        will be left in existing order.
    to : int or inf
        Position where to place the categories. ``inf``, puts
        them at the end (highest value).

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = ['a', 'b', 'c', 'd', 'e']
    >>> cat_move(c, 'e', 'b')
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['e', 'b', 'a', 'c', 'd']
    >>> cat_move(c, 'c', to=np.inf)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['a', 'b', 'd', 'e', 'c']
    >>> cat_move(pd.Categorical(c, ordered=True), 'a', 'c', 'e', to=1)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['b' < 'a' < 'c' < 'e' < 'd']
    """
    c = as_categorical(c)
    if np.isinf(to):
        to = len(c.categories)

    args = list(args)
    unmoved_cats = c.categories.drop(args).to_list()
    # The position is counted among the categories that stay put
    cats = unmoved_cats[0:to] + args + unmoved_cats[to:]
    # Use the returned categorical, pandas 2 has no inplace option
    return c.reorder_categories(cats)
def cat_rev(c):
    """
    Reverse order of categories

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = ['a', 'b', 'c']
    >>> cat_rev(c)
    ['a', 'b', 'c']
    Categories (3, object): ['c', 'b', 'a']
    >>> cat_rev(pd.Categorical(c))
    ['a', 'b', 'c']
    Categories (3, object): ['c', 'b', 'a']
    """
    c = as_categorical(c)
    # Use the returned categorical, pandas 2 has no inplace option
    return c.reorder_categories(c.categories[::-1])
def cat_shift(c, n=1):
    """
    Shift and wrap-around categories to the left or right

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    n : int
        Number of times to shift. If positive, shift to
        the left, if negative shift to the right.
        Default is 1.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = ['a', 'b', 'c', 'd', 'e']
    >>> cat_shift(c)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['b', 'c', 'd', 'e', 'a']
    >>> cat_shift(c, 2)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['c', 'd', 'e', 'a', 'b']
    >>> cat_shift(c, -2)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['d', 'e', 'a', 'b', 'c']
    >>> cat_shift(pd.Categorical(c, ordered=True), -3)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['c' < 'd' < 'e' < 'a' < 'b']
    """
    c = as_categorical(c)
    cats = c.categories.to_list()
    if not cats:
        # Nothing to shift, and the modulo below would divide by zero
        return c

    # The modulo turns any shift, including the negative ones and
    # those larger than the number of categories, into an equivalent
    # left shift of 0 to len(cats)-1 places.
    n = n % len(cats)
    # Use the returned categorical, pandas 2 has no inplace option
    return c.reorder_categories(cats[n:] + cats[:n])
def cat_shuffle(c, random_state=None):
    """
    Shuffle (randomly reorder) the categories

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    random_state : int or ~numpy.random.RandomState, optional
        Seed or Random number generator to use. If ``None``, then
        numpy global generator :class:`numpy.random` is used.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    TypeError
        If ``random_state`` is not ``None``, an integer or a
        :class:`~numpy.random.RandomState`.

    Examples
    --------
    >>> np.random.seed(123)
    >>> c = ['a', 'b', 'c', 'd', 'e']
    >>> cat_shuffle(c)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['b', 'd', 'e', 'a', 'c']
    >>> cat_shuffle(pd.Categorical(c, ordered=True), 321)
    ['a', 'b', 'c', 'd', 'e']
    Categories (5, object): ['d' < 'b' < 'a' < 'c' < 'e']
    """
    c = as_categorical(c)

    if random_state is None:
        random_state = np.random
    elif isinstance(random_state, (int, np.integer)):
        # numpy integers are as good a seed as python integers
        random_state = np.random.RandomState(random_state)
    elif not isinstance(random_state, np.random.RandomState):
        raise TypeError(
            "Unknown type `{}` of random_state".format(type(random_state))
        )

    cats = c.categories.to_list()
    random_state.shuffle(cats)
    # Use the returned categorical, pandas 2 has no inplace option
    return c.reorder_categories(cats)
# Change the value of categories
def cat_anon(c, prefix='', random_state=None):
    """
    Anonymise categories

    Neither the value nor the order of the categories is preserved.

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    prefix : str
        Text placed in front of the number that makes up each
        anonymous category name. Default is no prefix.
    random_state : int or ~numpy.random.RandomState, optional
        Seed or Random number generator to use. If ``None``, then
        numpy global generator :class:`numpy.random` is used.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    TypeError
        If ``random_state`` is not ``None``, an integer or a
        :class:`~numpy.random.RandomState`.

    Examples
    --------
    >>> np.random.seed(123)
    >>> c = ['a', 'b', 'b', 'c', 'c', 'c']
    >>> cat_anon(c)
    ['0', '1', '1', '2', '2', '2']
    Categories (3, object): ['1', '0', '2']
    >>> cat_anon(c, 'c-', 321)
    ['c-1', 'c-2', 'c-2', 'c-0', 'c-0', 'c-0']
    Categories (3, object): ['c-0', 'c-2', 'c-1']
    >>> cat_anon(pd.Categorical(c, ordered=True), 'c-', 321)
    ['c-1', 'c-2', 'c-2', 'c-0', 'c-0', 'c-0']
    Categories (3, object): ['c-0' < 'c-2' < 'c-1']
    """
    c = as_categorical(c)

    if random_state is None:
        random_state = np.random
    elif isinstance(random_state, (int, np.integer)):
        # numpy integers are as good a seed as python integers
        random_state = np.random.RandomState(random_state)
    elif not isinstance(random_state, np.random.RandomState):
        raise TypeError(
            "Unknown type `{}` of random_state".format(type(random_state))
        )

    # Shuffle two times,
    #   1. to prevent predicable sequence to category mapping
    #   2. to prevent reversing of the new categories to the old ones
    # The methods return new categoricals, pandas 2 has no inplace
    # option.
    fmt = '{}{}'.format
    cats = [fmt(prefix, i) for i in range(len(c.categories))]
    random_state.shuffle(cats)
    c = c.rename_categories(cats)

    cats = c.categories.to_list()
    random_state.shuffle(cats)
    return c.reorder_categories(cats)
def cat_collapse(c, mapping, group_other=False):
    """
    Collapse categories into manually defined groups

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    mapping : dict
        New categories and the old categories contained in them.
        It is not modified.
    group_other : False
        If ``True``, a category is created to contain all other
        categories that have not been explicitly collapsed.
        The name of the other categories is ``other``, it may be
        postfixed by the first available integer starting from
        2 if ``mapping`` already has a new category with that name.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = ['a', 'b', 'c', 'd', 'e', 'f']
    >>> mapping = {'first_2': ['a', 'b'], 'second_2': ['c', 'd']}
    >>> cat_collapse(c, mapping)
    ['first_2', 'first_2', 'second_2', 'second_2', 'e', 'f']
    Categories (4, object): ['first_2', 'second_2', 'e', 'f']
    >>> cat_collapse(c, mapping, group_other=True)
    ['first_2', 'first_2', 'second_2', 'second_2', 'other', 'other']
    Categories (3, object): ['first_2', 'second_2', 'other']

    Collapsing preserves the order

    >>> cat_rev(c)
    ['a', 'b', 'c', 'd', 'e', 'f']
    Categories (6, object): ['f', 'e', 'd', 'c', 'b', 'a']
    >>> cat_collapse(cat_rev(c), mapping)
    ['first_2', 'first_2', 'second_2', 'second_2', 'e', 'f']
    Categories (4, object): ['f', 'e', 'second_2', 'first_2']
    >>> mapping = {'other': ['a', 'b'], 'another': ['c', 'd']}
    >>> cat_collapse(c, mapping, group_other=True)
    ['other', 'other', 'another', 'another', 'other2', 'other2']
    Categories (3, object): ['other', 'another', 'other2']
    """
    def make_other_name():
        """
        Generate unique name for the other category
        """
        if 'other' not in mapping:
            return 'other'

        # Among len(mapping)+1 candidate names, at least one
        # is not a key of the mapping.
        for i in range(2, len(mapping)+2):
            other = 'other' + str(i)
            if other not in mapping:
                return other

    c = as_categorical(c)
    if group_other:
        # Copy, so that the caller's dictionary is left untouched
        mapping = mapping.copy()
        other = make_other_name()
        mapped_categories = list(chain.from_iterable(mapping.values()))
        unmapped_categories = c.categories.difference(mapped_categories)
        mapping[other] = list(unmapped_categories)

    inverted_mapping = {
        cat: new_cat
        for new_cat, old_cats in mapping.items()
        for cat in old_cats
    }

    # Convert old categories to new values in order and remove
    # any duplicates. A dict keeps the order of first appearance,
    # so the order of the categories is preserved.
    new_cats = list(dict.fromkeys(
        inverted_mapping.get(x, x)
        for x in c.categories
    ))

    return pd.Categorical(
        [inverted_mapping.get(x, x) for x in c],
        categories=new_cats,
        ordered=c.ordered
    )
def cat_other(c, keep=None, drop=None, other_category='other'):
    """
    Replace categories with 'other'

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    keep : list-like
        Categories to preserve, every other category is replaced.
        Give either ``keep`` or ``drop``, never both.
    drop : list-like
        Categories to replace, every other category is preserved.
        Give either ``keep`` or ``drop``, never both.
    other_category : object
        Value that stands in for the replaced categories. It
        becomes the last category.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    ValueError
        If neither or both of ``keep`` and ``drop`` are given.

    Examples
    --------
    >>> c = ['a', 'b', 'a', 'c', 'b', 'b', 'b', 'd', 'c']
    >>> cat_other(c, keep=['a', 'b'])
    ['a', 'b', 'a', 'other', 'b', 'b', 'b', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']
    >>> cat_other(c, drop=['a', 'b'])
    ['other', 'other', 'other', 'c', 'other', 'other', 'other', 'd', 'c']
    Categories (3, object): ['c', 'd', 'other']
    >>> cat_other(pd.Categorical(c, ordered=True), drop=['a', 'b'])
    ['other', 'other', 'other', 'c', 'other', 'other', 'other', 'd', 'c']
    Categories (3, object): ['c' < 'd' < 'other']
    """
    have_keep = keep is not None
    have_drop = drop is not None

    if not (have_keep or have_drop):
        raise ValueError(
            "Missing columns to `keep` or those to `drop`."
        )
    if have_keep and have_drop:
        raise ValueError(
            "Only one of `keep` or `drop` should be given."
        )

    c = as_categorical(c)
    cats = c.categories

    # Whichever argument was given, end up with the list-like of
    # categories to keep. A lone value is treated as a list of one.
    if have_keep:
        if not pdtypes.is_list_like(keep):
            keep = [keep]
    else:
        if not pdtypes.is_list_like(drop):
            drop = [drop]
        keep = cats.difference(drop)

    # What every category turns into
    recode = {}
    for cat in cats.difference(keep):
        recode[cat] = other_category
    for cat in keep:
        recode[cat] = cat

    kept_cats = cats.intersection(keep).to_list()
    return pd.Categorical(
        [recode.get(value, value) for value in c],
        categories=kept_cats + [other_category],
        ordered=c.ordered
    )
def _lump(lump_it, c, other_category):
"""
Return a categorical of lumped
Helper for cat_lump_* functions
Parameters
----------
lump_it : sequence[(obj, bool)]
Sequence of (category, lump_category)
c : cateorical
Original categorical.
other_category : object (default: 'other')
Value used for the 'other' values. It is placed at
the end of the categories.
Returns
-------
out : categorical
Values
"""
lookup = {
cat: other_category if lump else cat
for cat, lump in lump_it
}
new_cats = (
c.categories
.intersection(lookup.values())
.insert(len(c), other_category)
)
c = pd.Categorical(
[lookup[value] for value in c],
categories=new_cats,
ordered=c.ordered
)
return c
def cat_lump(
    c,
    n=None,
    prop=None,
    w=None,
    other_category='other',
    ties_method='min'
):
    """
    Lump together least or most common categories

    This is a general method that calls one of
    :func:`~plydata.cat_tools.cat_lump_n`
    :func:`~plydata.cat_tools.cat_lump_prop` or
    :func:`~plydata.cat_tools.cat_lump_lowfreq`
    depending on the parameters. ``n`` is looked at first, then
    ``prop``, and if both are ``None`` the low frequency categories
    are lumped.

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    n : int (optional)
        Number of most/least common values to preserve (not lumped
        together). Positive ``n`` preserves the most common,
        negative ``n`` preserves the least common.

        Lumping happens on condition that the lumped category "other"
        will have the smallest number of items.
        You should only specify one of ``n`` or ``prop``
    prop : float (optional)
        Proportion above/below which the values of a category will be
        preserved (not lumped together). Positive ``prop`` preserves
        categories whose proportion of values is *more* than ``prop``.
        Negative ``prop`` preserves categories whose proportion of
        values is *less* than ``prop``.

        Lumping happens on condition that the lumped category "other"
        will have the smallest number of items.
        You should only specify one of ``n`` or ``prop``
    w : list[int|float] (optional)
        Weights for the frequency of each value. It should be the same
        length as ``c``. It is only used together with ``n`` or
        ``prop``.
    other_category : object (default: 'other')
        Value used for the 'other' values. It is placed at
        the end of the categories.
    ties_method : {'min', 'max', 'average', 'first', 'dense'} (default: min)
        How to treat categories that occur the same number of times
        (i.e. ties). It is only used together with ``n``:

        * min: lowest rank in the group
        * max: highest rank in the group
        * average: average rank of the group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups.

    Examples
    --------
    >>> cat_lump(list('abbccc'))
    ['other', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['b', 'c', 'other']

    When the least categories put together are not less than the next
    smallest group.

    >>> cat_lump(list('abcddd'))
    ['a', 'b', 'c', 'd', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump(list('abcdddd'))
    ['other', 'other', 'other', 'd', 'd', 'd', 'd']
    Categories (2, object): ['d', 'other']
    >>> c = pd.Categorical(list('abccdd'))
    >>> cat_lump(c, n=1)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']
    >>> cat_lump(c, n=2)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']

    ``n`` Least common categories

    >>> cat_lump(c, n=-2)
    ['a', 'b', 'other', 'other', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']

    There are fewer than ``n`` categories that are the most/least common.

    >>> cat_lump(c, n=3)
    ['a', 'b', 'c', 'c', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump(c, n=-3)
    ['a', 'b', 'c', 'c', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    By proportions, categories that make up *more* than ``prop`` fraction
    of the items.

    >>> cat_lump(c, prop=1/3.01)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']
    >>> cat_lump(c, prop=-1/3.01)
    ['a', 'b', 'other', 'other', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']
    >>> cat_lump(c, prop=1/2)
    ['other', 'other', 'other', 'other', 'other', 'other']
    Categories (1, object): ['other']

    Order of categoricals is maintained

    >>> c = pd.Categorical(
    ...     list('abccdd'),
    ...     categories=list('adcb'),
    ...     ordered=True
    ... )
    >>> cat_lump(c, n=2)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['d' < 'c' < 'other']

    **Weighted lumping**

    >>> c = list('abcd')
    >>> weights = [3, 2, 1, 1]
    >>> cat_lump(c, n=2)  # No lumping
    ['a', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump(c, n=2, w=weights)
    ['a', 'b', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']
    """
    # By rank
    if n is not None:
        return cat_lump_n(
            c,
            n,
            w=w,
            other_category=other_category,
            ties_method=ties_method
        )

    # By proportion
    if prop is not None:
        return cat_lump_prop(
            c,
            prop,
            w=w,
            other_category=other_category
        )

    # Neither was given, lump the low frequency categories
    return cat_lump_lowfreq(c, other_category=other_category)
def cat_lump_n(
    c,
    n,
    w=None,
    other_category='other',
    ties_method='min'
):
    """
    Lump together most/least common n categories

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    n : int
        Number of most/least common values to preserve (not lumped
        together). Positive ``n`` preserves the most common,
        negative ``n`` preserves the least common.

        Lumping happens on condition that the lumped category "other"
        will have the smallest number of items.
        You should only specify one of ``n`` or ``prop``
    w : list[int|float] (optional)
        Weights for the frequency of each value. It should be the same
        length as ``c``.
    other_category : object (default: 'other')
        Value used for the 'other' values. It is placed at
        the end of the categories.
    ties_method : {'min', 'max', 'average', 'first', 'dense'} (default: min)
        How to treat categories that occur the same number of times
        (i.e. ties):

        * min: lowest rank in the group
        * max: highest rank in the group
        * average: average rank of the group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = pd.Categorical(list('abccdd'))
    >>> cat_lump_n(c, 1)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']
    >>> cat_lump_n(c, 2)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']

    ``n`` Least common categories

    >>> cat_lump_n(c, -2)
    ['a', 'b', 'other', 'other', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']

    There are fewer than ``n`` categories that are the most/least common.

    >>> cat_lump_n(c, 3)
    ['a', 'b', 'c', 'c', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump_n(c, -3)
    ['a', 'b', 'c', 'c', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    Order of categoricals is maintained

    >>> c = pd.Categorical(
    ...     list('abccdd'),
    ...     categories=list('adcb'),
    ...     ordered=True
    ... )
    >>> cat_lump_n(c, 2)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['d' < 'c' < 'other']

    **Weighted lumping**

    >>> c = list('abcd')
    >>> weights = [3, 2, 1, 1]
    >>> cat_lump_n(c, n=2)  # No lumping
    ['a', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump_n(c, n=2, w=weights)
    ['a', 'b', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']
    """
    c = as_categorical(c)
    if len(c) == 0:
        return c

    # Frequency (or total weight) of each category, largest first
    if w is None:
        freq = c.value_counts()
    else:
        freq = pd.Series(w).groupby(c).apply(np.sum)
    freq = freq.sort_values(ascending=False)

    # Rank 1 goes to the rarest category when the least common are
    # preserved and to the most frequent one otherwise.
    preserve_least = n < 0
    if preserve_least:
        limit = -n
        ranks = freq.rank(method=ties_method)
    else:
        limit = n
        ranks = (-freq).rank(method=ties_method)

    beyond_limit = ranks > limit
    if not beyond_limit.any():
        # Every category is within the limit, there is nothing to lump
        return c

    return _lump(zip(ranks.index, beyond_limit), c, other_category)
def cat_lump_prop(
    c,
    prop,
    w=None,
    other_category='other',
):
    """
    Lump together least or most common categories by proportion

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    prop : float
        Proportion that decides which categories are preserved (not
        lumped together). With a positive ``prop``, the categories
        that make up *more* than ``prop`` of the values are preserved.
        With a negative ``prop``, the categories that make up *more*
        than ``-prop`` of the values are the ones lumped together, and
        the rest are preserved.

        You should only specify one of ``n`` or ``prop``
    w : list[int|float] (optional)
        Weights for the frequency of each value. It should be the same
        length as ``c``.
    other_category : object (default: 'other')
        Value used for the 'other' values. It is placed at
        the end of the categories.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    By proportions, categories that make up *more* than ``prop`` fraction
    of the items.

    >>> c = pd.Categorical(list('abccdd'))
    >>> cat_lump_prop(c, 1/3.01)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']
    >>> cat_lump_prop(c, -1/3.01)
    ['a', 'b', 'other', 'other', 'other', 'other']
    Categories (3, object): ['a', 'b', 'other']
    >>> cat_lump_prop(c, 1/2)
    ['other', 'other', 'other', 'other', 'other', 'other']
    Categories (1, object): ['other']
    """
    c = as_categorical(c)
    if len(c) == 0:
        return c

    # Frequency (or total weight) of each category and what
    # they are a fraction of
    if w is None:
        freq = c.value_counts().sort_values(ascending=False)
        denominator = len(c)
    else:
        freq = (
            pd.Series(w)
            .groupby(c)
            .apply(np.sum)
            .sort_values(ascending=False)
        )
        denominator = freq.sum()
    fractions = freq / denominator

    # For each category, whether it gets lumped
    if prop < 0:
        # Lump the most common
        lump_mask = fractions > -prop
    else:
        # Lump the least common
        lump_mask = fractions <= prop

    if not lump_mask.any():
        # No category is on the lumping side of the target
        return c

    return _lump(zip(fractions.index, lump_mask), c, other_category)
def cat_lump_lowfreq(
    c,
    other_category='other',
):
    """
    Lump together least categories

    Ensures that the "other" category is still the smallest.

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    other_category : object (default: 'other')
        Value used for the 'other' values. It is placed at
        the end of the categories.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> cat_lump_lowfreq(list('abbccc'))
    ['other', 'b', 'b', 'c', 'c', 'c']
    Categories (3, object): ['b', 'c', 'other']

    When the least categories put together are not less than the next
    smallest group.

    >>> cat_lump_lowfreq(list('abcddd'))
    ['a', 'b', 'c', 'd', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump_lowfreq(list('abcdddd'))
    ['other', 'other', 'other', 'd', 'd', 'd', 'd']
    Categories (2, object): ['d', 'other']

    When all the categories occur the same number of times, none
    of them is lumped.

    >>> cat_lump_lowfreq(list('aabb'))
    ['a', 'a', 'b', 'b']
    Categories (2, object): ['a', 'b']
    """
    c = as_categorical(c)
    if len(c) == 0:
        return c

    # Largest counts first, so the distinct counts are also in
    # decreasing order
    counts = c.value_counts().sort_values(ascending=False)
    unique_counts = counts.unique()
    if len(unique_counts) < 2:
        # A single category, or all categories tied. There is no
        # "next smallest" group to compare with.
        return c

    smallest = unique_counts[-1]
    next_smallest = unique_counts[-2]
    smallest_counts = counts[counts == smallest]
    smallest_total = smallest_counts.sum()

    # Lump only if "other" will be smaller than any preserved category
    if not smallest_total < next_smallest:
        return c

    # For each category findout whether to lump it or keep it
    # Create a generator of the form ((cat, lump), ...)
    smallest_cats = set(smallest_counts.index)
    lump_it = (
        (cat, cat in smallest_cats)
        for cat in counts.index
    )
    return _lump(lump_it, c, other_category)
def cat_lump_min(
    c,
    min,
    w=None,
    other_category='other',
):
    """
    Lump categories, preserving those that appear min number of times

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    min : int
        Minimum number of times a category must be represented to be
        preserved.
    w : list[int|float] (optional)
        Weights for the frequency of each value. It should be the same
        length as ``c``.
    other_category : object (default: 'other')
        Value used for the 'other' values. It is placed at
        the end of the categories.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = list('abccdd')
    >>> cat_lump_min(c, min=1)
    ['a', 'b', 'c', 'c', 'd', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']
    >>> cat_lump_min(c, min=2)
    ['other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']

    **Weighted Lumping**

    >>> weights = [2, 2, .5, .5, 1, 1]
    >>> cat_lump_min(c, min=2, w=weights)
    ['a', 'b', 'other', 'other', 'd', 'd']
    Categories (4, object): ['a', 'b', 'd', 'other']

    Unlike :func:`~plydata.cat_tools.cat_lump`, :func:`cat_lump_min`
    can lump together and create a category larger than the preserved
    categories.

    >>> c = list('abxyzccdd')
    >>> cat_lump_min(c, min=2)
    ['other', 'other', 'other', 'other', 'other', 'c', 'c', 'd', 'd']
    Categories (3, object): ['c', 'd', 'other']
    """
    c = as_categorical(c)
    if len(c) == 0:
        return c

    # Frequency (or total weight) of each category
    if w is None:
        counts = c.value_counts()
    else:
        counts = (
            pd.Series(w)
            .groupby(c)
            .apply(np.sum)
        )

    if (counts >= min).all():
        return c

    lookup = {
        cat: cat if freq >= min else other_category
        for cat, freq in counts.items()
    }

    # Categories that survive the lumping, in their original order
    kept_cats = c.categories.intersection(list(lookup.values()))
    # The insert position must be counted among the kept categories.
    # Counting the values (len(c)) is out of bounds whenever there
    # are more values than kept categories.
    new_cats = kept_cats.insert(len(kept_cats), other_category)

    # Missing values are not in the lookup, pass them through as is
    return pd.Categorical(
        [lookup.get(value, value) for value in c],
        categories=new_cats,
        ordered=c.ordered
    )
def cat_rename(c, mapping=None, **kwargs):
    """
    Change/rename categories manually

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    mapping : dict (optional)
        Mapping of the form ``{old_name: new_name}`` for how to rename
        the categories. Setting a value to ``None`` removes the category.
        This arguments is useful if the old names are not valid
        python parameters. Otherwise, ``kwargs`` can be used.
        The mapping is not modified.
    **kwargs : dict
        Mapping to rename categories. Setting a value to ``None`` removes
        the category.

    Returns
    -------
    out : categorical
        Values

    Raises
    ------
    ValueError
        If both ``mapping`` and ``kwargs`` are given.
    IndexError
        If a category to be renamed does not exist.

    Examples
    --------
    >>> c = list('abcd')
    >>> cat_rename(c, a='A')
    ['A', 'b', 'c', 'd']
    Categories (4, object): ['A', 'b', 'c', 'd']
    >>> c = pd.Categorical(
    ...     list('abcd'),
    ...     categories=list('bacd'),
    ...     ordered=True
    ... )
    >>> cat_rename(c, b='B', d='D')
    ['a', 'B', 'c', 'D']
    Categories (4, object): ['B' < 'a' < 'c' < 'D']

    Remove categories by setting them to ``None``.

    >>> cat_rename(c, b='B', d=None)
    ['a', 'B', 'c']
    Categories (3, object): ['B' < 'a' < 'c']
    """
    c = as_categorical(c)
    if mapping is not None and len(kwargs):
        raise ValueError("Use only one of `mapping` or the ``kwargs``.")

    # Entries get deleted below, work on a copy so that the
    # dictionary of the caller is left as it was.
    lookup = dict(mapping or kwargs)
    if not lookup:
        return c

    # Remove categories set to None
    remove = [
        old
        for old, new in lookup.items()
        if new is None
    ]
    if remove:
        for cat in remove:
            del lookup[cat]
        c = c.remove_categories(remove).dropna()

    # Separately change values and the categories (using a list)
    # old to the new names. Then reconcile the two lists.
    # A list of python objects can take a new name of any type, a
    # typed numpy array cannot, e.g. when renaming 0 to 'low'.
    # The categorical methods return new categoricals, pandas 2 has
    # no inplace option.
    categories = c.categories.to_list()
    c = c.add_categories(
        pd.Index(list(lookup.values())).difference(c.categories)
    )
    for old, new in lookup.items():
        if old not in c.categories:
            raise IndexError("Unknown category '{}'.".format(old))
        c[c == old] = new
        categories = [new if cat == old else cat for cat in categories]

    # Renaming two categories to the same name collapses them.
    # A dict removes the duplicates and keeps the order.
    new_categories = list(dict.fromkeys(categories))
    c = c.remove_unused_categories()
    return c.set_categories(new_categories)
def cat_relabel(c, func=None, *args, **kwargs):
    """
    Change/rename categories and collapse as necessary

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    func : callable
        Function to create the new name. The first argument to
        the function will be a category to be renamed.
    *args : tuple
        Positional arguments passed to ``func``.
    **kwargs : dict
        Keyword arguments passed to ``func``.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> c = list('abcde')
    >>> cat_relabel(c, str.upper)
    ['A', 'B', 'C', 'D', 'E']
    Categories (5, object): ['A', 'B', 'C', 'D', 'E']
    >>> c = pd.Categorical([0, 1, 2, 1, 1, 0])
    >>> def func(x):
    ...     if x == 0:
    ...         return 'low'
    ...     elif x == 1:
    ...         return 'mid'
    ...     elif x == 2:
    ...         return 'high'
    >>> cat_relabel(c, func)
    ['low', 'mid', 'high', 'mid', 'mid', 'low']
    Categories (3, object): ['low', 'mid', 'high']

    When the function yields the same output for 2 or more
    different categories, those categories are collapsed.

    >>> def first(x):
    ...     return x[0]
    >>> c = pd.Categorical(['aA', 'bB', 'aC', 'dD'],
    ...     categories=['bB', 'aA', 'dD', 'aC'],
    ...     ordered=True
    ... )
    >>> cat_relabel(c, first)
    ['a', 'b', 'a', 'd']
    Categories (3, object): ['b' < 'a' < 'd']
    """
    c = as_categorical(c)
    new_categories = [func(x, *args, **kwargs) for x in c.categories]
    # A dict removes the duplicates and keeps the order
    new_categories_uniq = list(dict.fromkeys(new_categories))

    if len(new_categories_uniq) < len(new_categories):
        # Collapse
        lookup = dict(zip(c.categories, new_categories))
        # Missing values are not in the lookup, pass them through
        return pd.Categorical(
            [lookup.get(value, value) for value in c],
            categories=new_categories_uniq,
            ordered=c.ordered
        )

    # One new name per category, a positional rename is enough.
    # (The categories cannot be assigned to in pandas 2)
    return c.rename_categories(new_categories)
def cat_expand(c, *args):
    """
    Add additional categories to a categorical

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    *args : tuple
        Categories to add. Those that are already categories of
        ``c`` are ignored.

    Returns
    -------
    out : categorical
        Categorical with the extra categories appended after the
        existing ones.

    Examples
    --------
    >>> cat_expand(list('abc'), 'd', 'e')
    ['a', 'b', 'c']
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']
    >>> c = pd.Categorical(list('abcd'), ordered=True)
    >>> cat_expand(c, 'e', 'f')
    ['a', 'b', 'c', 'd']
    Categories (6, object): ['a' < 'b' < 'c' < 'd' < 'e' < 'f']
    """
    c = as_categorical(c)
    # ``add_categories`` refuses categories that already exist, so
    # only the genuinely new ones are passed on.
    new_categories = pd.Index(args).difference(c.categories)
    # The result is returned by pandas; the ``inplace`` option is not
    # available in recent versions.
    return c.add_categories(new_categories)
def cat_explicit_na(c, na_category='(missing)'):
    """
    Give missing values an explicit category

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    na_category : object (default: '(missing)')
        Category for missing values. If it is not yet a category
        of ``c`` it is added as the last category.

    Returns
    -------
    out : categorical
        Categorical in which the missing values have been replaced
        by ``na_category``. If there are no missing values, the
        categories are left as they are.

    Examples
    --------
    >>> c = pd.Categorical(
    ...     ['a', 'b', None, 'c', None, 'd', 'd'],
    ...     ordered=True
    ... )
    >>> c
    ['a', 'b', NaN, 'c', NaN, 'd', 'd']
    Categories (4, object): ['a' < 'b' < 'c' < 'd']
    >>> cat_explicit_na(c)
    ['a', 'b', '(missing)', 'c', '(missing)', 'd', 'd']
    Categories (5, object): ['a' < 'b' < 'c' < 'd' < '(missing)']
    """
    c = as_categorical(c)
    bool_idx = pd.isnull(c)
    if bool_idx.any():
        # Adding a category that already exists is an error, so the
        # category is only created when it is really new.
        if na_category not in c.categories:
            c = c.add_categories([na_category])
        # Safe to modify: ``c`` is a copy of (or was created from)
        # the input.
        c[bool_idx] = na_category
    return c
[docs]def cat_remove_unused(c, only=None):
"""
Remove unused categories
Parameters
----------
c : list-like
Values that will make up the categorical.
only : list-like (optional)
The categories to remove *if* they are empty. If not given,
all unused categories are dropped.
Examples
--------
>>> c = pd.Categorical(list('abcdd'), categories=list('bacdefg'))
>>> c
['a', 'b', 'c', 'd', 'd']
Categories (7, object): ['b', 'a', 'c', 'd', 'e', 'f', 'g']
>>> cat_remove_unused(c)
['a', 'b', 'c', 'd', 'd']
Categories (4, object): ['b', 'a', 'c', 'd']
>>> cat_remove_unused(c, only=['a', 'e', 'g'])
['a', 'b', 'c', 'd', 'd']
Categories (5, object): ['b', 'a', 'c', 'd', 'f']
"""
if not pdtypes.is_categorical_dtype(c):
# All categories are used
c = pd.Categorical(c)
return c
else:
c = c.copy()
if only is None:
only = c.categories
used_idx = pd.unique(c.codes)
used_categories = c.categories[used_idx]
c = c.remove_categories(
c.categories
.difference(used_categories)
.intersection(only)
)
return c
def cat_unify(cs, categories=None):
    """
    Unify (union of all) the categories in a list of categoricals

    Parameters
    ----------
    cs : list-like
        Categoricals
    categories : list-like
        Extra categories to apply to every categorical.

    Returns
    -------
    out : list
        Categoricals that all have the same categories; the
        categories of the inputs in the order in which they are
        first seen, followed by any extra ``categories``.

    Examples
    --------
    >>> c1 = pd.Categorical(['a', 'b'], categories=list('abc'))
    >>> c2 = pd.Categorical(['d', 'e'], categories=list('edf'))
    >>> c1_new, c2_new = cat_unify([c1, c2])
    >>> c1_new
    ['a', 'b']
    Categories (6, object): ['a', 'b', 'c', 'e', 'd', 'f']
    >>> c2_new
    ['d', 'e']
    Categories (6, object): ['a', 'b', 'c', 'e', 'd', 'f']
    >>> c1_new, c2_new = cat_unify([c1, c2], categories=['z', 'y'])
    >>> c1_new
    ['a', 'b']
    Categories (8, object): ['a', 'b', 'c', 'e', 'd', 'f', 'z', 'y']
    >>> c2_new
    ['d', 'e']
    Categories (8, object): ['a', 'b', 'c', 'e', 'd', 'f', 'z', 'y']
    """
    cs = [as_categorical(c) for c in cs]
    all_cats = chain.from_iterable(c.categories.to_list() for c in cs)
    # Chaining (instead of adding lists) accepts any list-like for the
    # extra categories, e.g. tuples and arrays.
    extra_cats = [] if categories is None else categories
    # A dict keeps the first occurrence of every key, in order
    unified = list(dict.fromkeys(chain(all_cats, extra_cats)))
    return [c.set_categories(unified) for c in cs]
def cat_concat(*args):
    """
    Concatenate categoricals and combine the categories

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated

    Returns
    -------
    out : categorical
        All the values, one input after the other. Categoricals
        contribute their categories and other list-likes contribute
        their (non-missing) values to the combined categories.

    Examples
    --------
    >>> c1 = pd.Categorical(['a', 'b'], categories=['b', 'a'])
    >>> c2 = pd.Categorical(['d', 'a', 'c'])
    >>> cat_concat(c1, c2)
    ['a', 'b', 'd', 'a', 'c']
    Categories (4, object): ['b', 'a', 'c', 'd']

    Notes
    -----
    The resulting category is not ordered.
    """
    def is_categorical(x):
        return isinstance(
            getattr(x, 'dtype', None),
            pdtypes.CategoricalDtype
        )

    def is_missing(x):
        # Only scalars are tested, pd.isna is elementwise on the rest
        return pdtypes.is_scalar(x) and pd.isna(x)

    # The inputs are read twice (categories & values), so those that
    # are not categoricals are stored in lists in case they can only
    # be iterated over once.
    pieces = [c if is_categorical(c) else list(c) for c in args]

    candidates = chain.from_iterable(
        c.categories if is_categorical(c) else c
        for c in pieces
    )
    # Unique in order of first appearance. Missing values cannot be
    # categories, they are left out and stay missing in the result.
    categories = [
        x for x in dict.fromkeys(candidates)
        if not is_missing(x)
    ]
    return pd.Categorical(
        list(chain.from_iterable(pieces)),
        categories=categories
    )
def cat_zip(*args, sep=':', keep_empty=False):
    """
    Create a new categorical (zip style) combined from two or more
    categoricals

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated.
    sep : str (default: ':')
        Separator for the combined categories.
    keep_empty : bool (default: False)
        If ``True``, include all combinations of categories
        even those without observations.

    Returns
    -------
    out : categorical
        Categorical whose values are the values of the inputs
        joined (elementwise) with ``sep``.

    Examples
    --------
    >>> c1 = pd.Categorical(list('aba'))
    >>> c2 = pd.Categorical(list('122'))
    >>> cat_zip(c1, c2)
    ['a:1', 'b:2', 'a:2']
    Categories (3, object): ['a:1', 'a:2', 'b:2']
    >>> cat_zip(c1, c2, keep_empty=True)
    ['a:1', 'b:2', 'a:2']
    Categories (4, object): ['a:1', 'a:2', 'b:1', 'b:2']
    """
    values = [sep.join(items) for items in zip(*args)]
    cs = [
        c
        if isinstance(getattr(c, 'dtype', None), pdtypes.CategoricalDtype)
        else pd.Categorical(c)
        for c in args
    ]
    # All the possible combinations, with the categories of the first
    # categorical varying slowest.
    categories = [
        sep.join(items)
        for items in product(*(c.categories for c in cs))
    ]
    c = pd.Categorical(values, categories=categories)
    if not keep_empty:
        # Use the returned categorical; the ``inplace`` option is not
        # available in recent versions of pandas.
        c = c.remove_unused_categories()
    return c
# helpers
def as_categorical(c, copy=True):
    """
    Convert input to a categorical

    Parameters
    ----------
    c : categorical_like
        Sequence of objects
    copy : bool
        If `True` and c is already a categorical, return
        a copy of `c` otherwise return `c`.

    Returns
    -------
    out : categorical
        Categorical made out of `c` or copy of `c`
        if it was a categorical
    """
    # Check the dtype directly; ``is_categorical_dtype`` is deprecated
    # in recent versions of pandas.
    if not isinstance(getattr(c, 'dtype', None), pdtypes.CategoricalDtype):
        return pd.Categorical(c)
    return c.copy() if copy else c
# Temporary functions
def _stable_series_sort(ser, ascending):
"""
Stable sort for pandas series
Temporary Solution until
https://github.com/pandas-dev/pandas/issues/28697
https://github.com/pandas-dev/pandas/pull/28698
are resolved
"""
from pandas.core.sorting import nargsort
values = ser._values
indexer = nargsort(
values, kind='mergesort', ascending=ascending, na_position='last')
return pd.Series(values[indexer], index=ser.index[indexer])
# Aliases: alternative names (all listed in ``__all__``) bound to the
# same function objects as the functions they stand for.
cat_relevel = cat_move
cat_recode = cat_rename
cat_drop = cat_remove_unused