Source code for plydata.tidy.tidy_verbs

"""
Tidy verb initializations
"""
import re

import pandas.api.types as pdtypes

from plydata.operators import DataOperator
from plydata.one_table_verbs import select
from plydata.utils import verify_arg, hasattrs, mean_if_many

__all__ = [
    'gather',
    'spread',
    'separate',
    'separate_rows',
    'extract',
    'unite',
    'pivot_wider',
    'pivot_longer'
]


[docs]class gather(DataOperator): """ Collapse multiple columns into key-value pairs. You use gather() when you notice that you have columns that are not variables. Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. key : str Name of the variable column value : str Name of the value column *columns : list-like | select | str | slice Columns to be gathered and whose contents will make values. Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'name': ['mary', 'oscar', 'martha', 'john'], ... 'math': [92, 83, 85, 90], ... 'art': [75, 95, 80, 72] ... }) >>> df name math art 0 mary 92 75 1 oscar 83 95 2 martha 85 80 3 john 90 72 >>> df >> gather('subject', 'grade', ['math', 'art']) name subject grade 0 mary math 92 1 oscar math 83 2 martha math 85 3 john math 90 4 mary art 75 5 oscar art 95 6 martha art 80 7 john art 72 You can use the :class:`~plydata.select` verb to choose the columns >>> df >> gather('subject', 'grade', select('math', 'art')) name subject grade 0 mary math 92 1 oscar math 83 2 martha math 85 3 john math 90 4 mary art 75 5 oscar art 95 6 martha art 80 7 john art 72 >>> df >> gather('subject', 'grade', select('-name')) name subject grade 0 mary math 92 1 oscar math 83 2 martha math 85 3 john math 90 4 mary art 75 5 oscar art 95 6 martha art 80 7 john art 72 ``gather('subject', 'grade', '-name')`` without the ``select`` would have worked. >>> df >> gather('subject', 'grade', select(startswith='m')) name art subject grade 0 mary 75 math 92 1 oscar 95 math 83 2 martha 80 math 85 3 john 72 math 90 """ # Powers the column selection _select_verb = None def __init__(self, key, value, *columns): self.key = key self.value = value self._select_verb = select.from_columns(*columns)
[docs]class spread(DataOperator): """ Spread a key-value pair across multiple columns Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. key : str Name of the variable column value : str Name of the value column sep : str Charater(s) used to separate the column names. This is used to add a hierarchy and resolve duplicate column names. Examples -------- >>> import numpy as np >>> import pandas as pd >>> df = pd.DataFrame({ ... 'name': ['mary', 'oscar', 'martha', 'john'] * 2, ... 'subject': np.repeat(['math', 'art'], 4), ... 'grade': [92, 83, 85, 90, 75, 95, 80, 72] ... }) >>> df name subject grade 0 mary math 92 1 oscar math 83 2 martha math 85 3 john math 90 4 mary art 75 5 oscar art 95 6 martha art 80 7 john art 72 >>> df >> spread('subject', 'grade') name art math 0 john 72 90 1 martha 80 85 2 mary 75 92 3 oscar 95 83 """ def __init__(self, key, value, sep='_'): self.key = key self.value = value self.sep = sep
[docs]class separate(DataOperator): r""" Split a single column into multiple columns Parameters ---------- col : str | int Column name or position of variable to separate. into : list-like Column names. Use ``None`` to omit the variable from the output. sep : str | regex | list-like If String or regex, it is the pattern at which to separate the strings in the column. The default value separates on a string of non-alphanumeric characters. If list-like it must contain positions to split at. The length of the list should be 1 less than ``into``. remove : bool If ``True`` remove input column from output frame. convert : bool If ``True`` convert result columns to int, float or bool where appropriate. extra : 'warn' | 'drop' | 'merge' Control what happens when there are too many pieces. Only applies if ``sep`` is a string/regex. - 'warn'(the default): warn and drop extra values. - 'drop': drop any extra values without a warning. - 'merge': only splits at most :py:`len(into)` times. fill : 'warn' | 'right' | 'left' Control what happens when there are not enough pieces. Only applies if ``sep`` is a string/regex. - 'warn' (the default): warn and fill from the right - 'right': fill with missing values on the right - 'left': fill with missing values on the left Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a,1', 'b,2', 'c,3'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a,1 6 1 1 b,2 6 2 1 c,3 6 >>> df >> separate('x', into=['A', 'B']) alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 >>> df >> separate('x', into=['A', 'B'], remove=False) alpha x A B zeta 0 1 a,1 a 1 6 1 1 b,2 b 2 6 2 1 c,3 c 3 6 Using an array of positions and using ``None`` to omit a variable. >>> df >> separate('x', into=['A', None, 'C'], sep=(1, 2)) alpha A C zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 Dealing with extra pieces >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a,1', 'b,2', 'c,3,d'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a,1 6 1 1 b,2 6 2 1 c,3,d 6 >>> df >> separate('x', into=['A', 'B'], extra='merge') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3,d 6 >>> df >> separate('x', into=['A', 'B'], extra='drop') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 Dealing with fewer pieces >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a,1', 'b,2', 'c'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a,1 6 1 1 b,2 6 2 1 c 6 >>> df >> separate('x', into=['A', 'B'], fill='right') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c None 6 >>> df >> separate('x', into=['A', 'B'], fill='left') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 None c 6 Missing values >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a,1', None, 'c,3'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a,1 6 1 1 None 6 2 1 c,3 6 >>> df >> separate('x', into=['A', 'B']) alpha A B zeta 0 1 a 1 6 1 1 None None 6 2 1 c 3 6 More than one character separators. Any spaces must be included in the separator >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a -> 1', 'b -> 2', 'c -> 3'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a -> 1 6 1 1 b -> 2 6 2 1 c -> 3 6 >>> df >> separate('x', into=['A', 'B'], sep=' -> ') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 All values of ``sep`` are treated ad regular expression, but a compiled regex can is also permitted. >>> pattern = re.compile(r'\s*->\s*') >>> df >> separate('x', into=['A', 'B'], sep=pattern) alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 """ # Use _pattern when sep is a string/regex # Use _positions when sep is a list of split positions _pattern = None _positions = None # When other verbs need to use the separate verb, they can use # this name for the temporary column. _column_name = '__plydata_separate_col__' def __init__( self, col, into, sep=r'[^A-Za-z0-9]+', remove=True, convert=False, extra='warn', fill='warn' ): verify_arg(extra, 'extra', ('warn', 'drop', 'merge')) verify_arg(fill, 'fill', ('warn', 'right', 'left')) if isinstance(sep, str): self._pattern = re.compile(sep) elif hasattrs(sep, ('split', 'match')): self._pattern = sep else: n, n_ref = len(sep), len(into) - 1 if n != n_ref: raise ValueError( "length of `sep` is {}, it should be {}, 1 less " "than `into` (the number of pieces).".format( n, n_ref )) self._positions = [0] + list(sep) + [None] self.col = col self.into = into self.sep = sep self.remove = remove self.convert = convert self.extra = extra self.fill = fill
[docs]class separate_rows(DataOperator): r""" Separate values of a variable along multiple rows Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. *cols : list-like | select | str | slice Columns to be gathered and whose contents will make values. sep : str | regex The pattern at which to separate the variable. The default value separates on a string of non-alphanumeric characters. convert : bool If ``True`` convert result columns to int, float or bool Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'parent': ['martha', 'james', 'alice'], ... 'child': ['leah', 'joe,vinny,laura', 'pat,lee'], ... 'age': ['3', '12,6,4', '2,7'] ... }) >>> df parent child age 0 martha leah 3 1 james joe,vinny,laura 12,6,4 2 alice pat,lee 2,7 >>> df >> separate_rows('child', 'age') parent child age 0 martha leah 3 1 james joe 12 2 james vinny 6 3 james laura 4 4 alice pat 2 5 alice lee 7 Column selection uses :class:`plydata.one_table_verbs.select`, so you can do: >>> df >> separate_rows('-parent') parent child age 0 martha leah 3 1 james joe 12 2 james vinny 6 3 james laura 4 4 alice pat 2 5 alice lee 7 or >>> df >> separate_rows(select(matches=r'^[ac]')) parent child age 0 martha leah 3 1 james joe 12 2 james vinny 6 3 james laura 4 4 alice pat 2 5 alice lee 7 You can separate all columns by specifying any column. All columns should be separable. >>> df[['child', 'age']] >> separate_rows() child age 0 leah 3 1 joe 12 2 vinny 6 3 laura 4 4 pat 2 5 lee 7 """ def __init__(self, *cols, sep=r'[^A-Za-z0-9]+', convert=False): self.sep = sep self.convert = convert self._select_verb = select.from_columns(*cols)
[docs]class extract(DataOperator): r""" Split a column using a regular expression with capturing groups. If the groups don't match, or the input is NA, the output will be NA. Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. col : str | int Column name or position of variable to separate. into : list-like Column names. Use ``None`` to omit the variable from the output. regex : str | regex Pattern used to extract columns from ``col``. There should be only one group (defined by ``()``) for each element of ``into``. remove : bool If ``True`` remove input column from output frame. convert : bool If ``True`` convert result columns to int, float or bool where appropriate. Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'alpha': 1, ... 'x': ['a,1', 'b,2', 'c,3'], ... 'zeta': 6 ... }) >>> df alpha x zeta 0 1 a,1 6 1 1 b,2 6 2 1 c,3 6 >>> df >> extract('x', into='A') alpha A zeta 0 1 a 6 1 1 b 6 2 1 c 6 >>> df >> extract('x', into=['A', 'B'], regex=r'(\w+),(\w+)') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 c 3 6 >>> df >> extract('x', into=['A', 'B'], regex=r'(\w+),(\w+)', remove=False) alpha x A B zeta 0 1 a,1 a 1 6 1 1 b,2 b 2 6 2 1 c,3 c 3 6 Convert extracted columns to appropriate data types. >>> result = df >> extract( ... 'x', into=['A', 'B'], regex=r'(\w+),(\w+)', convert=True) >>> result['B'].dtype dtype('int64') The regex must match fully, not just the individual groups. >>> df >> extract('x', into=['A', 'B'], regex=r'(\w+),([12]+)') alpha A B zeta 0 1 a 1 6 1 1 b 2 6 2 1 NaN NaN 6 """ # When other verbs need to use the extract verb, they can use # this name for the temporary column. _column_name = '__plydata_extract_col__' def __init__( self, col, into, regex=r'([A-Za-z0-9]+)', remove=True, convert=False, ): if isinstance(regex, str): self.regex = re.compile(regex) elif hasattrs(regex, ('split', 'match')): self.regex = regex else: raise TypeError( "Unknown type `{}` used to describe a regular " "expression.".format(type(regex)) ) if not pdtypes.is_list_like(into): into = [into] self.col = col self.into = into self.regex = regex self.remove = remove self.convert = convert
[docs]class unite(DataOperator): """ Join multiple columns into one Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. col : str Name of new column *unite_cols : list-like | select | str | slice Columns to join. Uses :class:`~plydata.one_table_verbs.select`. sep : str Separator between values. Default is ``_``. remove : bool If ``True``, remove the input columns from the output dataframe. na_rm : bool If ``True``, missing values will be removed prior to uniting each value. Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'c1': [1, 2, 3, 4, None], ... 'c2': list('abcde'), ... 'c3': list('vwxyz') ... }) >>> df c1 c2 c3 0 1.0 a v 1 2.0 b w 2 3.0 c x 3 4.0 d y 4 NaN e z >>> df >> unite('c1c2', 'c1', 'c2') c1c2 c3 0 1.0_a v 1 2.0_b w 2 3.0_c x 3 4.0_d y 4 nan_e z >>> df >> unite('c1c2', 'c1', 'c2', na_rm=True) c1c2 c3 0 1.0_a v 1 2.0_b w 2 3.0_c x 3 4.0_d y 4 e z >>> df >> unite('c2c3', 'c2', 'c3', sep=',') c1 c2c3 0 1.0 a,v 1 2.0 b,w 2 3.0 c,x 3 4.0 d,y 4 NaN e,z >>> df >> unite('c2c3', 'c2', 'c3', remove=False) c1 c2c3 c2 c3 0 1.0 a_v a v 1 2.0 b_w b w 2 3.0 c_x c x 3 4.0 d_y d y 4 NaN e_z e z You can choose columns in all ways that :class:`~plydata.one_table_verbs.select` can understand and you can also pass a select verb directly. >>> df >> unite('c2c3', '-c1') c1 c2c3 0 1.0 a_v 1 2.0 b_w 2 3.0 c_x 3 4.0 d_y 4 NaN e_z >>> df >> unite('c2c3', select(matches=r'c[23]$')) c1 c2c3 0 1.0 a_v 1 2.0 b_w 2 3.0 c_x 3 4.0 d_y 4 NaN e_z """ # Powers the column selection _select_verb = None def __init__(self, col, *unite_cols, sep='_', remove=True, na_rm=False): self.col = col self.sep = sep self.remove = remove self.na_rm = na_rm self._select_verb = select.from_columns(*unite_cols)
[docs]class pivot_wider(DataOperator): """ Spread a key-value pair across multiple columns Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. names_from : str Column where to get the wide column names. values_from : str | list-like Column(s) where to get observation values that will be placed in the wide columns. id_cols : list-like Columns that uniquely identify each observation. The default is all columns in the data except those in ``names_from`` and ``values_from``. Typically used when you have additional variables that is directly related. names_prefix : str String added to the start of every variable name. This is particularly useful if ``names_from`` is a numeric vector and you want to create syntactic variable names. names_sep : str If ``names_from`` or ``values_from`` contains multiple variables, with the same name this will be used to join their values together into a single string to use as a column name. values_fill : object What to fill in if there are missing values.j values_fn : callable | dict A function to be applied to each cell if ``names_from`` & ``values_from`` or ``id_cols`` do not uniquely identify an observation. The function is used to aggregate the multiple observations. A ``dict`` can be used to apply a different function to each of the columns. The default is to compute the mean for all. Examples -------- >>> import numpy as np >>> import pandas as pd >>> df = pd.DataFrame({ ... 'name': ['mary', 'oscar', 'martha', 'john'] * 2, ... 'initials': ['M.K', 'O.S', 'M.J', 'J.T'] * 2, ... 'subject': np.repeat(['math', 'art'], 4), ... 'grade': [92, 83, 85, 90, 75, 95, 80, 72], ... 'midterm': [88, 83, 89, 93, 85, 95, 76, 79] ... }) >>> df name initials subject grade midterm 0 mary M.K math 92 88 1 oscar O.S math 83 83 2 martha M.J math 85 89 3 john J.T math 90 93 4 mary M.K art 75 85 5 oscar O.S art 95 95 6 martha M.J art 80 76 7 john J.T art 72 79 >>> df >> pivot_wider( ... names_from='subject', ... values_from=('grade', 'midterm') ... ) initials name grade_art grade_math midterm_art midterm_math 0 J.T john 72 90 79 93 1 M.J martha 80 85 76 89 2 M.K mary 75 92 85 88 3 O.S oscar 95 83 95 83 Be specific about the identifier column. >>> df >> pivot_wider( ... names_from='subject', ... values_from=('grade', 'midterm'), ... id_cols='name' ... ) name grade_art grade_math midterm_art midterm_math 0 john 72 90 79 93 1 martha 80 85 76 89 2 mary 75 92 85 88 3 oscar 95 83 95 83 When there is no ambiguity about the column names, the previous column names are not prepended onto the new names >>> df >> pivot_wider( ... names_from='subject', ... values_from='grade', ... id_cols='name' ... ) name art math 0 john 72 90 1 martha 80 85 2 mary 75 92 3 oscar 95 83 *Dealing with non-syntactic column names in the result* >>> np.random.seed(123) >>> df = pd.DataFrame({ ... 'name': ['mary', 'oscar'] * 6, ... 'face': np.repeat([1, 2, 3, 4, 5, 6], 2), ... 'rolls': np.random.randint(5, 21, 12) ... }) >>> df name face rolls 0 mary 1 19 1 oscar 1 18 2 mary 2 19 3 oscar 2 7 4 mary 3 17 5 oscar 3 7 6 mary 4 11 7 oscar 4 6 8 mary 5 8 9 oscar 5 15 10 mary 6 16 11 oscar 6 14 >>> df >> pivot_wider( ... names_from='face', ... values_from='rolls' ... ) name 1 2 3 4 5 6 0 mary 19 19 17 11 8 16 1 oscar 18 7 7 6 15 14 >>> df >> pivot_wider( ... names_from='face', ... values_from='rolls', ... names_prefix='rolled_', ... ) name rolled_1 rolled_2 rolled_3 rolled_4 rolled_5 rolled_6 0 mary 19 19 17 11 8 16 1 oscar 18 7 7 6 15 14 """ def __init__( self, names_from, values_from, id_cols=None, names_prefix='', names_sep='_', values_fill=None, values_fn=mean_if_many ): def as_list_like(x): if x is not None and not pdtypes.is_list_like(x): return [x] return x self.id_cols = as_list_like(id_cols) self.names_from = as_list_like(names_from) self.values_from = as_list_like(values_from) self.names_prefix = names_prefix self.names_sep = names_sep self.values_fill = values_fill self.values_fn = values_fn
[docs]class pivot_longer(DataOperator): r""" Lengthen dataframe by reducing the columns & turning them into into values Parameters ---------- data : dataframe, optional Useful when not using the ``>>`` operator. col : list-like | select | str | slice Columns to pivot into longer format. names_to : str | list Name of column to create. If a list, it is the names of columns to create if ``names_sep`` or ``names_pattern`` is given. values_to : str | list-like Column(s) where to get observation values that will be placed in the wide columns. names_prefix : str | dict A regular expression string used to remove matching text from the start of each variable name. If each column requires a different prefix, use a ``dict`` of the form ``{'column_name': r'regex'}``. The ``column_name`` is as given in ``names_to``. names_sep : str If ``names_to`` contains multiple values, these control how the column name is broken up. names_pattern : regex How the column name is broken up. Should be a regular expression containing matching groups. values_drop_na : bool If ``True``, drop rows that contain only ``NA``s in the ``values_to`` column. This effectively converts explicit missing values to implicit missing values, and should be used only when missing values in the ``data`` were created by its structure. convert : bool If ``True`` convert separated or extracted columns to int, float or bool where appropriate. This only applies when ``names_to`` has multiple values and ``names_sep`` or ``names_pattern`` is set. Default is ``False``. Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({ ... 'name': ['mary', 'mary', 'john', 'john'], ... 'city':['dakar', 'dakar', 'lome', 'lome'], ... 'year': [1990, 1992, 1996, 1998], ... 'data_t1_sunny': [8, 6, 4, 7], ... 'data_t2_rainy': [9, 7, 7, 6] ... }) >>> df name city year data_t1_sunny data_t2_rainy 0 mary dakar 1990 8 9 1 mary dakar 1992 6 7 2 john lome 1996 4 7 3 john lome 1998 7 6 >>> df >> pivot_longer( ... cols=select(startswith='data'), ... names_to='data_description', ... values_to='score', ... ) name city year data_description score 0 mary dakar 1990 data_t1_sunny 8 1 mary dakar 1992 data_t1_sunny 6 2 john lome 1996 data_t1_sunny 4 3 john lome 1998 data_t1_sunny 7 4 mary dakar 1990 data_t2_rainy 9 5 mary dakar 1992 data_t2_rainy 7 6 john lome 1996 data_t2_rainy 7 7 john lome 1998 data_t2_rainy 6 >>> df >> pivot_longer( ... cols=select(startswith='data'), ... names_to='data_description', ... values_to='score', ... names_prefix='data_' ... ) name city year data_description score 0 mary dakar 1990 t1_sunny 8 1 mary dakar 1992 t1_sunny 6 2 john lome 1996 t1_sunny 4 3 john lome 1998 t1_sunny 7 4 mary dakar 1990 t2_rainy 9 5 mary dakar 1992 t2_rainy 7 6 john lome 1996 t2_rainy 7 7 john lome 1998 t2_rainy 6 The column names are hiding data (data_description). We can extract it. >>> df >> pivot_longer( ... cols=select(startswith='data'), ... names_to=['take', 'season'], ... values_to='score', ... names_pattern=r'data_t(\d)_(\w+)' ... ) name city year take season score 0 mary dakar 1990 1 sunny 8 1 mary dakar 1992 1 sunny 6 2 john lome 1996 1 sunny 4 3 john lome 1998 1 sunny 7 4 mary dakar 1990 2 rainy 9 5 mary dakar 1992 2 rainy 7 6 john lome 1996 2 rainy 7 7 john lome 1998 2 rainy 6 Using a dictionary to specify different prefixes for more than than one column. >>> df >> pivot_longer( ... cols=select(startswith='data'), ... names_to=['take', 'season'], ... values_to='score', ... names_pattern=r'data_(t\d)(_\w+)', ... names_prefix={'take': 't', 'season': '_'} ... ) name city year take season score 0 mary dakar 1990 1 sunny 8 1 mary dakar 1992 1 sunny 6 2 john lome 1996 1 sunny 4 3 john lome 1998 1 sunny 7 4 mary dakar 1990 2 rainy 9 5 mary dakar 1992 2 rainy 7 6 john lome 1996 2 rainy 7 7 john lome 1998 2 rainy 6 """ _select_verb = None _separate_verb = None _extract_verb = None _prefix_patterns = None def __init__( self, cols, names_to='name', values_to='value', names_prefix=None, names_sep=None, names_pattern=None, convert=False ): if pdtypes.is_list_like(names_to): if len(names_to) > 1 and not names_sep and not names_pattern: raise ValueError( "If you supply multiple names in `names_to` you must " "also supply one of `names_sep` or `names_pattern`." ) else: names_to = [names_to] if names_sep: self._separate_verb = separate( separate._column_name, into=names_to, convert=convert ) elif names_pattern: names_pattern = re.compile(names_pattern) self._extract_verb = extract( extract._column_name, into=names_to, regex=names_pattern, convert=convert ) if isinstance(names_prefix, str): pattern = re.compile(r'^' + names_prefix) self._prefix_patterns = { name: pattern for name in names_to } elif isinstance(names_prefix, dict): self._prefix_patterns = { name: re.compile(r'^' + p) for name, p in names_prefix.items() } elif names_prefix: raise TypeError( "Got {} names_prefix should be a str or a dict".format( type(names_prefix)) ) self.cols = cols self.names_to = names_to self.values_to = values_to self.names_prefix = names_prefix self.names_sep = names_sep self.names_pattern = names_pattern self._select_verb = select.from_columns(cols)