# Source code for convoys.utils

import datetime
import numpy
import pandas

# Public API of this module: only get_arrays is exported by a star-import.
__all__ = ['get_arrays']

def get_timescale(t, unit):
    ''' Take a timedelta or a numerical type, return two things:

    1. A unit
    2. A function that converts it to numerical form

    :param t: reference value used to pick the unit, either a
        `datetime.timedelta` (which includes `pandas.Timedelta`) or a
        plain number.
    :param unit: string or None. One of "years", "days", "hours",
        "minutes" or "seconds". If None and `t` is a timedelta, the
        largest unit that is not greater than `t` is chosen.
    :returns: tuple (unit, converter). For numerical `t` the unit is
        None and the converter is a plain cast to float. For timedeltas
        the converter maps a timedelta to a float expressed in `unit`.
    :raises Exception: if `t` is a timedelta and no unit matches, i.e.
        `unit` is not one of the known names, or `unit` is None and `t`
        is shorter than one second.
    '''
    def get_timedelta_converter(t_factor):
        return lambda td: td.total_seconds() * t_factor

    # Anything that is not a timedelta is assumed to be numeric. A single
    # isinstance with a tuple accepts either timedelta flavour; chaining
    # two negated checks with `or` would misclassify a plain
    # datetime.timedelta (it is not a pandas.Timedelta) as numeric.
    if not isinstance(t, (datetime.timedelta, pandas.Timedelta)):
        return None, lambda x: float(x)

    # Ordered from the largest unit to the smallest so that automatic
    # selection picks the coarsest unit that still fits into t.
    for u, f in [('years', 365.25*24*60*60), ('days', 24*60*60),
                 ('hours', 60*60), ('minutes', 60), ('seconds', 1)]:
        if u == unit or (unit is None and t >= datetime.timedelta(seconds=f)):
            return u, get_timedelta_converter(1./f)

    # t is a timedelta here, so it must be formatted with %s (a %f
    # placeholder would raise TypeError and hide the real message).
    raise Exception('Could not find unit for %s and %s' % (t, unit))

def get_groups(data, group_min_size, max_groups):
    ''' Select the most populated groups from an iterable of group labels.

    The selection happens in three steps:

    1. Labels occurring fewer than `group_min_size` times are dropped
    2. If `max_groups` is non-negative, only that many of the most
       frequent labels are kept
    3. The survivors are returned in sorted order, with None placed last
    '''
    occurrences = {}
    for label in data:
        if label in occurrences:
            occurrences[label] += 1
        else:
            occurrences[label] = 1

    kept = [label for label in occurrences
            if occurrences[label] >= group_min_size]

    if max_groups >= 0:
        # Most frequent first, then cut off everything past the limit
        kept.sort(key=lambda label: occurrences[label], reverse=True)
        del kept[max_groups:]

    # The leading boolean pushes None to the end of the ordering
    kept.sort(key=lambda label: (label is None, label))
    return kept

def _sub(a, b):
    ''' Return a - b for timestamps (aware or naive) and plain numbers.

    When `a` is a timezone-aware datetime it is first moved into the
    timezone of `b` before subtracting.
    '''
    is_aware_datetime = (isinstance(a, datetime.datetime)
                         and a.tzinfo is not None)
    if not is_aware_datetime:
        # Naive timestamps and numerical types subtract directly
        return a - b
    return a.astimezone(b.tzinfo) - b

def get_arrays(data, features=None, groups=None, created=None,
               converted=None, now=None, unit=None,
               group_min_size=0, max_groups=-1):
    ''' Converts a dataframe to a list of numpy arrays.

    Generates either feature data, or group data.

    :param data: Pandas dataframe
    :param features: string (optional), refers to a column in the dataframe
        containing features, each being a 1d-vector or list of features.
        If neither this nor `groups` is provided and the dataframe has no
        "group" column, then it will look for a column in the dataframe
        named "features". This argument can also be a list (or tuple) of
        columns.
    :param groups: string (optional), refers to a column in the dataframe
        containing the groups for each row. If neither this nor `features`
        is provided, then it will look for a column in the dataframe
        named "group". If both are given, `groups` takes precedence.
    :param created: string (optional), refers to a column in the dataframe
        containing timestamps of when each item was "created". If not
        provided, then it will look for a column in the dataframe named
        "created".
    :param converted: string, refers to a column in the dataframe
        containing timestamps of when each item converted. If there is no
        column containing creation values, then the converted values should
        be timedeltas denoting time until conversion. If this argument is
        not provided, then it will look for a column in the dataframe named
        "converted".
    :param now: string (optional), refers to a column in the dataframe
        containing the point in time up until which we have observed
        non-conversion. If there is no column containing creation value,
        then these values should be timedeltas. If this argument is not
        provided, then it will look for a column in the dataframe named
        "now", and if there is none the current timestamp will be used.
    :param unit: string (optional), time unit to use when converting to
        numerical values. Has to be one of "years", "days", "hours",
        "minutes", or "seconds". If not provided, then a choice will be
        made based on the largest time interval in the inputs.
    :param group_min_size: integer (optional), only include groups that
        have at least this many observations
    :param max_groups: integer (optional), only include the n largest
        groups
    :returns: tuple (unit, groups, arrays)

        unit is the unit chosen. Will be one of "years", "days", "hours",
        "minutes", or "seconds". If the unit parameter is passed, this
        will be the same. It is None when the time values are numerical
        rather than timedeltas.

        groups is a list containing the selected groups. Will be None
        if groups is not set.

        arrays is a tuple of numpy arrays (G, B, T) or (X, B, T)
        containing the transformed input in numerical format. G, B, T
        will all be 1D numpy arrays. X will be a 2D numpy array. G holds
        the index of each row's group in the returned groups list.
    :raises Exception: if neither a group nor a features column can be
        determined, if no converted column can be determined, or if some
        rows have not converted and neither a now nor a created column is
        available to compute their observation time.
    '''
    res = []

    # First, construct either the X or the G array
    if features is None and groups is None:
        if 'group' in data.columns:
            groups = 'group'
        elif 'features' in data.columns:
            features = 'features'
        else:
            raise Exception('Neither of the features or group parameters'
                            ' was provided, and there was no features or'
                            ' group dataframe column')
    if groups is not None:
        groups_list = get_groups(data[groups], group_min_size, max_groups)
        group2j = {group: j for j, group in enumerate(groups_list)}
        # Remove rows for rare groups
        data = data[data[groups].isin(group2j.keys())]
        G = data[groups].apply(lambda g: group2j.get(g, -1)).values
        res.append(G)
    else:
        groups_list = None
        if isinstance(features, tuple):
            features = list(features)  # Otherwise sad Panda
        X = numpy.array([numpy.array(z) for z in data[features].values])
        res.append(X)

    # Next, construct the B and T arrays
    if converted is None:
        if 'converted' in data.columns:
            converted = 'converted'
        else:
            raise Exception('The converted parameter was not provided'
                            ' and there was no converted dataframe column')
    if now is None and 'now' in data.columns:
        now = 'now'
    if created is None and 'created' in data.columns:
        created = 'created'
    # B is True for rows that have converted (non-null conversion value)
    B = ~pandas.isnull(data[converted]).values
    res.append(B)

    # Without either column there is nothing to derive the observation
    # time of non-converted rows from; fail with a clear message instead
    # of an opaque KeyError from looking up a column named None.
    if created is None and now is None and not B.all():
        raise Exception('The now parameter was not provided and there was'
                        ' no now or created dataframe column, so the'
                        ' observation time of non-converted rows cannot'
                        ' be determined')

    def _calculate_T(row):
        # Converted rows: time from creation until conversion
        if not pandas.isnull(row[converted]):
            if created is not None:
                return _sub(row[converted], row[created])
            return row[converted]
        # Non-converted rows: time observed so far
        if created is None:
            return row[now]
        if now is not None:
            return _sub(row[now], row[created])
        # Match the timezone awareness of the creation timestamp so that
        # the subtraction is valid for both naive and aware values
        return (datetime.datetime.now(tz=row[created].tzinfo)
                - row[created])

    T_deltas = data.apply(_calculate_T, axis=1)
    max_T_delta = T_deltas.max()
    unit, converter = get_timescale(max_T_delta, unit)
    T = T_deltas.apply(converter).to_numpy()
    res.append(T)
    return unit, groups_list, tuple(res)