# Source code for convoys.utils

import datetime
import numpy
import pandas

# Names exported by a star-import of this module; the other functions defined
# here are left out of that export list.
__all__ = ['get_arrays']


def get_timescale(t, unit):
    ''' Take a timedelta or a numerical type, return two things:

    1. A unit
    2. A function that converts it to numerical form

    :param t: a representative (typically the largest) duration, either a
        `datetime.timedelta`/`pandas.Timedelta` or a plain number.
    :param unit: one of "years", "days", "hours", "minutes", "seconds", or
        `None` to pick the largest unit that is not longer than `t`.
    :returns: tuple `(unit, converter)`. For numeric `t` the unit is `None`
        and the converter is a plain `float` cast; for timedeltas the
        converter maps a timedelta to a float expressed in `unit`.
    :raises Exception: if `t` is a timedelta and no unit matches (unknown
        `unit` string, or `unit` is `None` and `t` is shorter than a second).
    '''
    def get_timedelta_converter(t_factor):
        return lambda td: td.total_seconds() * t_factor

    # Only fall back to the numeric path when `t` is neither kind of
    # timedelta. A plain datetime.timedelta is not a pandas.Timedelta, so
    # the two checks must be combined rather than treated as alternatives.
    if not isinstance(t, (datetime.timedelta, pandas.Timedelta)):
        # Assume numeric type
        return None, lambda x: float(x)

    # Ordered from largest to smallest so the first match is the coarsest
    # unit that still fits inside `t`.
    for u, f in [('years', 365.25*24*60*60), ('days', 24*60*60),
                 ('hours', 60*60), ('minutes', 60), ('seconds', 1)]:
        if u == unit or (unit is None and t >= datetime.timedelta(seconds=f)):
            return u, get_timedelta_converter(1./f)

    # `%s` rather than `%f`: `t` is a timedelta here and cannot be formatted
    # as a float.
    raise Exception('Could not find unit for %s and %s' % (t, unit))


def get_groups(data, group_min_size, max_groups):
    ''' Select which groups of a dataset to keep

    Steps, in order:

    1. Count the occurrences of every group label in `data`
    2. Drop labels seen fewer than `group_min_size` times
    3. If `max_groups` is non-negative, keep only that many of the most
       frequent labels (ties keep first-seen order)
    4. Return the survivors sorted, with `None` (if present) at the end
    '''
    occurrences = {}
    for label in data:
        if label in occurrences:
            occurrences[label] += 1
        else:
            occurrences[label] = 1

    kept = []
    for label, n_rows in occurrences.items():
        if n_rows >= group_min_size:
            kept.append(label)

    if max_groups >= 0:
        # Stable sort: equally frequent labels stay in first-seen order
        kept.sort(key=lambda label: occurrences[label], reverse=True)
        kept = kept[:max_groups]

    # The leading boolean pushes None behind every real label
    kept.sort(key=lambda label: (label is None, label))
    return kept


def _sub(a, b):
    # Returns a - b. Timezone-aware datetimes are first moved into b's
    # timezone; naive timestamps and numerical types are subtracted directly.
    a_is_aware = isinstance(a, datetime.datetime) and a.tzinfo is not None
    if not a_is_aware:
        return a - b
    return a.astimezone(b.tzinfo) - b


def get_arrays(data, features=None, groups=None, created=None,
               converted=None, now=None, unit=None,
               group_min_size=0, max_groups=-1):
    ''' Converts a dataframe to a list of numpy arrays.

    Generates either feature data, or group data.

    :param data: Pandas dataframe
    :param features: string (optional), refers to a column in the dataframe
        containing features, each being a 1d-vector or list of features.
        This argument can also be a list (or tuple) of columns. If neither
        `features` nor `groups` is provided and the dataframe has no
        "group" column, then it will look for a column named "features".
    :param groups: string (optional), refers to a column in the dataframe
        containing the groups for each row. If neither `features` nor
        `groups` is provided, then it will look for a column in the
        dataframe named "group".
    :param created: string (optional), refers to a column in the dataframe
        containing timestamps of when each item was "created". If not
        provided, then it will look for a column in the dataframe named
        "created".
    :param converted: string, refers to a column in the dataframe
        containing timestamps of when each item converted. If there is no
        column containing creation values, then the converted values should
        be timedeltas denoting time until conversion. If this argument is
        not provided, then it will look for a column in the dataframe named
        "converted".
    :param now: string (optional), refers to a column in the dataframe
        containing the point in time up until which we have observed
        non-conversion. If there is no column containing creation values,
        then these values should be timedeltas. If this argument is not
        provided, then it will look for a column in the dataframe named
        "now"; if there is none, the current timestamp will be used.
    :param unit: string (optional), time unit to use when converting to
        numerical values. Has to be one of "years", "days", "hours",
        "minutes", or "seconds". If not provided, then a choice will be
        made based on the largest time interval in the inputs.
    :param group_min_size: integer (optional), only include groups that
        have at least this many observations
    :param max_groups: integer (optional), only include the `n` largest
        groups
    :returns: tuple (unit, groups, arrays)

    `unit` is the unit chosen. Will be one of "years", "days", "hours",
    "minutes", or "seconds". If the `unit` parameter is passed, this will be
    the same.

    `groups` is a list of strings containing the groups. Will be `None` if
    `groups` is not set.

    `arrays` is a tuple of numpy arrays `(G, B, T)` or `(X, B, T)`
    containing the transformed input in numerical format. `G`, `B`, `T` will
    all be 1D numpy arrays. `X` will be a 2D numpy array.
    '''
    res = []

    # First, construct either the `X` or the `G` array
    if features is None and groups is None:
        if 'group' in data.columns:
            groups = 'group'
        elif 'features' in data.columns:
            features = 'features'
        else:
            raise Exception('Neither of the `features` or `groups` parameters'
                            ' was provided, and there was no `features` or'
                            ' `group` dataframe column')
    if groups is not None:
        groups_list = get_groups(data[groups], group_min_size, max_groups)
        group2j = {group: j for j, group in enumerate(groups_list)}
        # Remove rows for rare groups
        data = data[data[groups].isin(group2j.keys())]
        G = data[groups].apply(lambda g: group2j.get(g, -1)).values
        res.append(G)
    else:
        groups_list = None
        if isinstance(features, tuple):
            features = list(features)  # Otherwise sad Panda
        X = numpy.array([numpy.array(z) for z in data[features].values])
        res.append(X)

    # Next, construct the `B` and `T` arrays
    if converted is None:
        if 'converted' in data.columns:
            converted = 'converted'
        else:
            raise Exception('The `converted` parameter was not provided'
                            ' and there was no `converted` dataframe column')
    if now is None and 'now' in data.columns:
        now = 'now'
    if created is None and 'created' in data.columns:
        created = 'created'

    # B marks the rows that have a (non-null) conversion value
    B = ~pandas.isnull(data[converted]).values
    res.append(B)

    def _calculate_T(row):
        # Converted rows: time from creation to conversion, or the raw
        # `converted` value when there is no creation column.
        if not pandas.isnull(row[converted]):
            if created is not None:
                return _sub(row[converted], row[created])
            else:
                return row[converted]
        # Non-converted rows: time observed so far.
        else:
            if created is not None:
                if now is not None:
                    return _sub(row[now], row[created])
                else:
                    # Use the row's own tzinfo so that aware and naive
                    # timestamps are never mixed in the subtraction.
                    return (datetime.datetime.now(tz=row[created].tzinfo)
                            - row[created])
            else:
                return row[now]

    T_deltas = data.apply(_calculate_T, axis=1)
    # The largest interval decides the unit when none was requested
    max_T_delta = T_deltas.max()
    unit, converter = get_timescale(max_T_delta, unit)
    T = T_deltas.apply(converter).to_numpy()
    res.append(T)
    return unit, groups_list, tuple(res)