#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# seniority_list is an analytical tool used when seniority-based work
# groups merge. It brings modern data science to the area of labor
# integration, utilizing the powerful data analysis capabilities of Python
# scientific computing.
# Copyright (C) 2016-2017 Robert E. Davison, Ruby Data Systems Inc.
# Please direct inquiries to: rubydatasystems@fastmail.net
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
.. module:: reports
:synopsis: This module builds general statistical reports for all of the
program datasets and presents the results as spreadsheets and chart images
within the *reports* folder.
.. moduleauthor:: Bob Davison <rubydatasystems@fastmail.net>
'''
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import functions as f
import matplotlib_charting as mp
from os import path, makedirs
def stats_to_excel(ds_dict,
                   quantiles=10,
                   date_grouper='ldate',
                   fixed_col_name='eg_initQ',
                   running_col_name='eg_runQ'):
    '''Create a set of basic statistics for each calculated dataset and
    write the results as spreadsheets within the **reports** folder.

    There are 2 spreadsheets produced, one related to retirement data and
    the other related to annual data.

    The retirement information is grouped by employees retiring in future
    years, further grouped for longevity or initial job.

    The annual information is grouped by the model year, and further grouped
    by quantiles, either by initial quantile membership or by an annual
    quantile adjustment of remaining employees.

    inputs
        ds_dict (dictionary)
            output of load_datasets function, a dictionary of datasets
        quantiles (integer)
            the number of binning quantiles to measure for the initial and
            running (annually updated) quantile membership analysis
            (default is 10)
        date_grouper (string)
            column name representing a column of dates within a dataframe.
            Year membership of this column will be used for grouping.
            Input is limited to 'ldate' or 'doh'.
        fixed_col_name (string)
            label to use for quantile number column when calculating using
            the initial quantile membership for all results
        running_col_name (string)
            label to use for quantile number column when calculating using
            a continuously updated quantile membership for all results
    '''
    ret_dict = {}
    ann_dict = {}
    try:
        # get current case study name
        case_df = pd.read_pickle('dill/case_dill.pkl')
        case_name = case_df.case.value
    except OSError:
        print('unable to retrieve case name, try setting case input')
        return
    # remove the skeleton dataset from consideration for this function
    ds_list = [key for key in ds_dict.keys() if key != 'skeleton']
    attrs = ['spcnt', 'snum', 'cat_order', 'jobp', 'cpay', 'ylong']
    ret_attrs = [attr for attr in attrs if attr not in ['mlong', 'ylong']]
    # find number of employee groups in case study
    num_egs = np.unique(ds_dict[ds_list[0]].eg).size
    # find column numbers for each attribute measure
    ret_start = np.array(range(2, (num_egs * 6) + 1, num_egs))
    ret_end = ret_start + (num_egs - 1)
    ann_start = np.array(range(1, (num_egs * 6) + 1, num_egs))
    ann_end = ann_start + (num_egs - 1)
    # make an array of excel column letters ('A'-'Z' then 'AA'-'AZ')
    xlcols = list('abcdefghijklmnopqrstuvwxyz'.upper())
    next_cols = ['A' + el for el in xlcols]
    xlcols.extend(next_cols)
    cols = np.array(xlcols)
    # slice column letter array according to column numbers
    ret_startxl_cols = cols[ret_start]
    ret_endxl_cols = cols[ret_end]
    ann_startxl_cols = cols[ann_start]
    ann_endxl_cols = cols[ann_end]
    # make excel column ranges for formatting (example: 'C:E')
    ret_col_dict = {}
    for i, (start, end) in enumerate(zip(ret_startxl_cols,
                                         ret_endxl_cols)):
        ret_col_dict[i] = start + ':' + end
    ann_col_dict = {}
    for i, (start, end) in enumerate(zip(ann_startxl_cols,
                                         ann_endxl_cols)):
        ann_col_dict[i] = start + ':' + end
    # loop through datasets
    for key in ds_list:
        # copy to avoid altering ds_dict values
        df = ds_dict[key].copy()
        # create employee group percentage columns, one fixed and the other
        # recalculated each month
        f.make_eg_pcnt_column(df, fixed_col_name=fixed_col_name)
        f.make_eg_pcnt_column(df, recalc_each_month=True,
                              running_col_name=running_col_name)
        # create a column assigning the initial model job for each
        # employee for all months
        f.make_starting_val_column(df, 'jnum')
        # make grouping values
        yr = df.date.dt.year
        lyr = df.ldate.dt.year
        rq = ((df[running_col_name] * quantiles // 1) + 1).astype(int)
        iq = ((df[fixed_col_name] * quantiles // 1) + 1).astype(int)
        # employees in their retirement month only
        retp = df[df.ret_mark == 1]
        dateyr = retp[date_grouper].dt.year
        job = retp.start_jnum
        ret = retp.retdate.dt.year
        # add grouped dataframes to dictionaries
        try:
            ret_dict[date_grouper + '_' + key] = \
                retp.groupby([dateyr, ret, 'eg'])[ret_attrs].mean().unstack()
            ret_dict['job' + '_' + key] = \
                retp.groupby([job, ret, 'eg'])[ret_attrs].mean().unstack()
            ann_dict['A_' + key] = \
                df.groupby([yr, 'eg'])[attrs].mean().unstack()
            ann_dict['Long_' + key] = \
                df.groupby([lyr, yr, 'eg'])[attrs].mean().unstack()
            ann_dict['RQ_' + key] = \
                df.groupby([rq, yr, 'eg'])[attrs].mean().unstack()
            ann_dict['IQ_' + key] = \
                df.groupby([iq, yr, 'eg'])[attrs].mean().unstack()
        # was a bare "except:"; catch Exception so KeyboardInterrupt and
        # SystemExit are not silently swallowed
        except Exception:
            print(key, 'report fail')
    # make a count worksheet from the first non-skeleton dataset
    for key in ds_dict.keys():
        if key != 'skeleton':
            found_key = key
            break
    dfk = ds_dict[found_key].copy()
    p = dfk[dfk.ret_mark == 1]
    dateyr = p[date_grouper].dt.year
    retyr = p.retdate.dt.year
    ret_dict['retirement_count'] = p.groupby([dateyr, retyr, 'eg'])['mnum'] \
        .count().unstack().fillna(0).astype(int)
    ann_dict['retirement_count'] = p.groupby([retyr, 'eg'])['mnum'] \
        .count().unstack().fillna(0).astype(int)
    # write grouped retirement dataframes to excel workbook
    with pd.ExcelWriter('reports/' + case_name + '/ret_stats.xlsx',
                        engine='xlsxwriter') as writer:
        for ws_name, df_sheet in sorted(ret_dict.items()):
            df_sheet.to_excel(writer, sheet_name=ws_name)
            if ws_name not in ['retirement_count']:
                # prepare to format worksheet
                workbook = writer.book
                worksheet = writer.sheets[ws_name]
                format0 = workbook.add_format({'num_format': '#0',
                                               'align': 'center'})
                format2 = workbook.add_format({'num_format': '#0.00',
                                               'align': 'center'})
                formatpcnt = workbook.add_format({'num_format': '#0.0%'})
                # format each worksheet attribute column range
                worksheet.set_column('A:A', 10, None)
                worksheet.set_column('B:B', 7, None)
                worksheet.set_column(ret_col_dict[0], 7, formatpcnt)
                worksheet.set_column(ret_col_dict[1], 7, format0)
                worksheet.set_column(ret_col_dict[2], 7, format0)
                worksheet.set_column(ret_col_dict[3], 6, format2)
                worksheet.set_column(ret_col_dict[4], 7, format0)
                # freeze worksheet for scrolling with headers visible
                worksheet.freeze_panes('A4')
            else:
                # format retirement count worksheet
                workbook = writer.book
                worksheet = writer.sheets[ws_name]
                format0 = workbook.add_format({'num_format': '#0',
                                               'align': 'center'})
                worksheet.set_column('A:A', 10, None)
                worksheet.set_column('B:B', 7, None)
                worksheet.set_column(ret_col_dict[0], 7, format0)
                worksheet.freeze_panes('A2')
    # write grouped annual dataframes to excel workbook
    with pd.ExcelWriter('reports/' + case_name + '/annual_stats.xlsx',
                        engine='xlsxwriter') as writer:
        for ws_name, df_sheet in sorted(ann_dict.items()):
            df_sheet.to_excel(writer, sheet_name=ws_name)
            if ws_name not in ['retirement_count']:
                workbook = writer.book
                worksheet = writer.sheets[ws_name]
                format0 = workbook.add_format({'num_format': '#0',
                                               'align': 'center'})
                format1 = workbook.add_format({'num_format': '#0.0',
                                               'align': 'center'})
                format2 = workbook.add_format({'num_format': '#0.00',
                                               'align': 'center'})
                formatpcnt = workbook.add_format({'num_format': '#0.0%'})
                # the 'A_' sheets have one fewer index column than the rest
                if ws_name.startswith('A_'):
                    col_dict = ann_col_dict
                else:
                    col_dict = ret_col_dict
                worksheet.set_column('A:A', 10, None)
                worksheet.set_column(col_dict[0], 7, formatpcnt)
                worksheet.set_column(col_dict[1], 7, format0)
                worksheet.set_column(col_dict[2], 7, format0)
                worksheet.set_column(col_dict[3], 6, format2)
                worksheet.set_column(col_dict[4], 7, format0)
                worksheet.set_column(col_dict[5], 6, format1)
                worksheet.freeze_panes('A4')
            else:
                workbook = writer.book
                worksheet = writer.sheets[ws_name]
                format0 = workbook.add_format({'num_format': '#0',
                                               'align': 'center'})
                worksheet.set_column('A:A', 10, None)
                worksheet.set_column(ann_col_dict[0], 7, format0)
                worksheet.freeze_panes('A2')
def retirement_charts(ds_dict,
                      adict, cdict,
                      plot_year_group=True,
                      date_grouper='ldate',
                      plot_job_group=True,
                      plot_init_quarter=True,
                      plot_running_quarter=True,
                      quantiles=10,
                      pcnt_ylim=.75,
                      cpay_stride=500,
                      fixed_col_name='eg_initQ',
                      running_col_name='eg_runQ',
                      figsize=None,
                      chartstyle='ticks',
                      verbose_status=True,
                      tick_size=13,
                      legend_size=14,
                      label_size=14,
                      title_size=14,
                      adjust_chart_top=.85):
    '''Generates multiple charts representing general attribute statistics
    of all calculated datasets for all employee groups AT RETIREMENT ONLY.

    The user may select grouping analysis by any or all of the following:

        1. longevity or date of hire year
        2. job level
        3. initial employee group list quantile membership
        4. annual employee group list quantile membership

    Stores the output as images in multiple folders within the
    **reports/<case_name>/ret_charts** folder.

    inputs
        ds_dict (dictionary)
            output of load_datasets function, a dictionary of datasets
        adict (dictionary)
            dataset column name description dictionary
        cdict (dictionary)
            program colors dictionary
        plot_year_group (boolean)
            if True, create chart images grouped by the date_grouper input
            year
        date_grouper (string)
            column name representing a column of dates within a dataframe.
            Year membership of this column will be used for grouping when
            the plot_year_group input is True.
            Input is limited to 'ldate' or 'doh'.
        plot_job_group (boolean)
            if True, create chart images grouped by job level held by
            employees
        quantiles (integer)
            the number of binning quantiles to measure for the initial and
            running (annually updated) quantile membership analysis
            (default is 10)
        plot_init_quarter (boolean)
            if True, produce output grouped by initial list quantile
            membership, for each employee group
        plot_running_quarter (boolean)
            if True, produce output grouped by annual list quantile
            membership, for each employee group
        pcnt_ylim (float)
            output chart maximum y axis value for percentage attribute
            charts as a float, example: .75 equals max displayed chart
            value of 75%
        cpay_stride (integer)
            y axis chart tick interval (in thousands) for charts displaying
            cpay (career pay)
        fixed_col_name (string)
            label to use for quantile number column when calculating using
            the initial quantile membership for all results
        running_col_name (string)
            label to use for quantile number column when calculating using
            a continuously updated quantile membership for all results
        figsize (tuple)
            optional size of all generated chart images. Default is None.
            This input will allow creation of larger chart images than the
            default small charts, at the price of an increase
            in the time required to run the function.
        chartstyle (string)
            any valid seaborn charting style ('ticks', 'dark', 'white',
            'darkgrid', 'whitegrid'), default is 'ticks'
        verbose_status (boolean)
            if True, print status of calculations as function is running
        tick_size (integer or float)
            text size of tick labels on the output chart images
        legend_size (integer or float)
            text size of the legend on the output chart images
        label_size (integer or float)
            text size of the x and y axis labels on the output chart images
        title_size (integer or float)
            text size of the title on the output chart images
        adjust_chart_top (float)
            input to permit adjustment of the top location of the generated
            charts - used to ensure full chart title is captured by the
            save chart figure code. Default top position is 1.0, default
            value for this input is .85 which "shrinks" the charts slightly
            vertically so that the two-line chart titles are captured when
            saving the charts to file as images.
    '''
    try:
        # get current case study name
        case_df = pd.read_pickle('dill/case_dill.pkl')
        case_name = case_df.case.value
    except OSError:
        print('unable to retrieve case name, try setting case input')
        return
    attrs = ['spcnt', 'cat_order', 'jobp', 'cpay']
    pcnt_attributes = ['spcnt', 'lspcnt']
    job_attrs = ['jnum', 'jobp']
    eg_colors = cdict['eg_color_dict']
    im_prefix = 'reports/' + case_name + '/ret_charts/ret_charts_'
    # set directory names, make directories, set title prefixes
    if plot_year_group:
        date_grouper_string = adict[date_grouper]
        # set path to image directories
        long_image_dir = (im_prefix + date_grouper_string + '/')
        # make directory if it does not already exist
        if not path.exists(long_image_dir):
            makedirs(long_image_dir)
        long_title_pre = date_grouper_string + ' '
    if plot_job_group:
        job_image_dir = (im_prefix + 'init_job/')
        # make directory if it does not already exist
        if not path.exists(job_image_dir):
            makedirs(job_image_dir)
        job_title_pre = 'initial job '
    if plot_init_quarter:
        iq_image_dir = im_prefix + 'init_qntl/'
        # make directory if it does not already exist
        if not path.exists(iq_image_dir):
            makedirs(iq_image_dir)
        iq_title_pre = 'initial qntl '
    # BUG FIX: this guard previously tested plot_init_quarter, which left
    # rq_image_dir/rq_title_pre undefined when only the running quantile
    # output was requested (NameError below)
    if plot_running_quarter:
        rq_image_dir = im_prefix + 'run_qntl/'
        # make directory if it does not already exist
        if not path.exists(rq_image_dir):
            makedirs(rq_image_dir)
        rq_title_pre = 'running_qntl '
    # remove the skeleton dataset from consideration for this function
    ds_list = [key for key in ds_dict.keys() if key != 'skeleton']
    # get any computed dataset to find initial chart values
    dummy_ds = ds_dict[ds_list[0]]
    # get a sorted array of unique employee group codes
    eg_nums = np.unique(dummy_ds.eg)
    # get the first and last data model years
    dt_min = min(dummy_ds.date.dt.year)
    dt_max = max(dummy_ds.date.dt.year)
    for attr in attrs:
        if verbose_status:
            print('\nPreparing', attr.upper(), 'charts...')
        # set chart style
        with sns.axes_style(chartstyle):
            # set global figure size (if figsize input is not None)
            if figsize is None:
                fig, ax = plt.subplots()
            else:
                fig, ax = plt.subplots(figsize=figsize)
        if attr not in pcnt_attributes:
            ylim = max(dummy_ds[attr]) * .7
        if attr in job_attrs:
            jlim = int(max(dummy_ds[attr]) + 1)
        if attr == 'cpay':
            ylim = max(dummy_ds[attr])
        ldict = {}
        # initialize chart lines for each employee group
        for eg_num in eg_nums:
            ldict[eg_num], = ax.plot([], [], eg_colors[eg_num],
                                     label='eg' + str(eg_num),
                                     marker='o', markersize=4)
        # format y axis tick labels, axis labels, and y axis limit
        if attr in pcnt_attributes:
            ax.yaxis.set_major_formatter(mp.pct_format())
        ax.set_xlabel('retirement year', fontsize=label_size)
        if attr == 'cpay':
            ax.set_ylabel(adict[attr] + ' ($K)', fontsize=label_size)
        else:
            ax.set_ylabel(adict[attr], fontsize=label_size)
        if attr in pcnt_attributes:
            ax.set_yticks(np.arange(0, pcnt_ylim, .05))
            # hide every other percentage tick label to reduce clutter
            for label in ax.yaxis.get_ticklabels()[1::2]:
                label.set_visible(False)
            ax.set_ylim(ymin=-.01, ymax=pcnt_ylim)
        elif attr in ['ylong']:
            ax.set_ylim(ymin=-.75, ymax=ylim)
        elif attr in job_attrs:
            ax.set_yticks(np.arange(1, jlim))
            ax.set_ylim(ymin=.75, ymax=ylim)
        elif attr in ['snum', 'cat_order']:
            ax.set_ylim(ymin=-100, ymax=ylim)
        elif attr in ['cpay']:
            ax.set_yticks(np.arange(0, ylim, cpay_stride))
            ax.set_ylim(ymin=-100, ymax=ylim)
        # x axis ticks on even 5-year boundaries spanning the model years
        min_xtick_yr = dt_min // 10 * 10
        max_xtick_yr = dt_max // 10 * 10 + 10
        ax.set_xticks(np.arange(min_xtick_yr, max_xtick_yr, 5))
        ax.set_xlim(xmin=dt_min, xmax=dt_max)
        ax.tick_params(axis='both', labelsize=tick_size)
        ax.grid(alpha=.6)
        # invert y axis if appropriate for attribute
        if attr not in ['cpay', 'ylong']:
            ax.invert_yaxis()
        ax.legend(loc=4, fontsize=legend_size)
        plt.tight_layout()
        fig.subplots_adjust(top=adjust_chart_top)
        # loop through each calculated dataset
        for key in ds_list:
            ds = ds_dict[key]
            # create a column assigning the initial model job for each
            # employee for all months
            f.make_starting_val_column(ds, 'jnum')
            # create employee group percentage columns, one fixed and the
            # other recalculated each month
            f.make_eg_pcnt_column(ds, fixed_col_name=fixed_col_name)
            f.make_eg_pcnt_column(ds, recalc_each_month=True,
                                  running_col_name=running_col_name)
            # make grouping values
            ds['yr'] = ds.date.dt.year
            # running quantile:
            ds['rq'] = ((ds[running_col_name] *
                         quantiles // 1) + 1).astype(int)
            # initial quantile:
            ds['iq'] = ((ds[fixed_col_name] *
                         quantiles // 1) + 1).astype(int)
            # create a dataframe containing each employee in retirement
            # month
            ret_df = ds[ds.ret_mark == 1]
            yr = ret_df.date.dt.year
            rq = ret_df.rq
            iq = ret_df.iq
            dateyr = ret_df[date_grouper].dt.year
            job = ret_df.start_jnum
            retyr = ret_df.retdate.dt.year
            if plot_year_group:
                # group by date grouper year (likely longevity date), data
                # model year, and employee group
                gb = ret_df.groupby([dateyr, retyr, 'eg'])[attr]
                # find attribute averages and unstack to create employee
                # group attribute value columns
                unstkd = gb.mean().unstack()
                # get a sorted array of unique data model years
                yrs = np.unique(unstkd.index.get_level_values(level=0))
                # loop through model years
                for year in yrs:
                    # reset the chart line data for the current data model
                    # year and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[year].index.values,
                            unstkd.loc[year][eg_num].values)
                    # create the chart title
                    ax.set_title(long_title_pre + str(year) + '\n' +
                                 key + ' retirees',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(long_image_dir,
                                          attr + '_' +
                                          str(year) + ' ' +
                                          date_grouper +
                                          '_' + key + '.png'))
            if plot_job_group:
                # group by initial job (from make starting value function),
                # data model year, and employee group
                gb = ret_df.groupby([job, retyr, 'eg'])[attr]
                # find attribute averages and unstack to create employee
                # group attribute value columns
                unstkd = gb.mean().unstack()
                # get a sorted array of unique initial job levels
                jobs = np.unique(unstkd.index.get_level_values(level=0))
                # loop through job levels
                for job in jobs:
                    # reset the chart line data for the current job level
                    # and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[job].index.values,
                            unstkd.loc[job][eg_num].values)
                    # create the chart title
                    ax.set_title(job_title_pre + str(job) + '\n' +
                                 key + ' retirees',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(job_image_dir,
                                          attr + '_' + 'job' +
                                          str(job) + '_' + key + '.png'))
            if plot_init_quarter:
                # group by initial quantile, data model year, and employee
                # group
                gb = ret_df.groupby([iq, yr, 'eg'])[attr]
                unstkd = gb.mean().unstack()
                # get a sorted array of quantiles
                qrtls = np.unique(unstkd.index.get_level_values(level=0))
                # loop through initial quantile groups
                for qt in qrtls:
                    # reset the chart line data for the current initial
                    # quantile and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[qt].index.values,
                            unstkd.loc[qt][eg_num].values)
                    # create the chart title
                    ax.set_title(iq_title_pre + str(qt) + '\n' +
                                 key + ' retirees',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(iq_image_dir,
                                          attr + '_' + 'iq' +
                                          str(qt) + '_' + key + '.png'))
            if plot_running_quarter:
                # group by running quantile, data model year, and employee
                # group
                gb = ret_df.groupby([rq, yr, 'eg'])[attr]
                unstkd = gb.mean().unstack()
                # get a sorted array of quantiles
                qrtls = np.unique(unstkd.index.get_level_values(level=0))
                # loop through running quantile groups
                for qt in qrtls:
                    # reset the chart line data for the current running
                    # quantile and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[qt].index.values,
                            unstkd.loc[qt][eg_num].values)
                    # create the chart title
                    ax.set_title(rq_title_pre + str(qt) + '\n' +
                                 key + ' retirees',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(rq_image_dir,
                                          attr + '_' + 'rq' +
                                          str(qt) + '_' + key + '.png'))
            if verbose_status:
                print('  ', key, ' done!')
        # close the figure for this attribute before starting the next one
        plt.close()
    if verbose_status:
        print('\nALL JOBS COMPLETE\n\nChart images are located in the "' +
              case_name + '" folder within the "reports" folder.\n\n' +
              'Note: The charts produced from this routine are intended ' +
              'for basic overview of the calculated datasets only.\n' +
              'The built-in plotting functions provide much more detailed ' +
              'and comprehensive analysis of integrated list outcomes.\n')
def annual_charts(ds_dict,
                  adict, cdict,
                  plot_year_group=True,
                  plot_job_group=True,
                  quantiles=10,
                  plot_init_quarter=True,
                  plot_running_quarter=True,
                  pcnt_ylim=.75,
                  cpay_stride=500,
                  fixed_col_name='eg_initQ',
                  running_col_name='eg_runQ',
                  figsize=None,
                  date_grouper='ldate',
                  chartstyle='ticks',
                  verbose_status=True,
                  tick_size=13,
                  legend_size=14,
                  label_size=14,
                  title_size=14,
                  adjust_chart_top=.85):
    '''Generates multiple charts representing general annual attribute
    statistics of all calculated datasets for all employee groups FOR ALL
    ACTIVE EMPLOYEES (annual results for all employees).

    The user may select grouping analysis by any or all of the following:

        1. longevity or date of hire year
        2. job level
        3. initial employee group list quantile membership
        4. annual employee group list quantile membership

    Stores the output as images in multiple folders within the
    **reports/<case_name>/ann_charts** folder.

    inputs
        ds_dict (dictionary)
            output of load_datasets function, a dictionary of datasets
        adict (dictionary)
            dataset column name description dictionary
        cdict (dictionary)
            program colors dictionary
        plot_year_group (boolean)
            if True, create chart images grouped by the date_grouper input
            year
        date_grouper (string)
            column name representing a column of dates within a dataframe.
            Year membership of this column will be used for grouping when
            the plot_year_group input is True.
            Input is limited to 'ldate' or 'doh'.
        plot_job_group (boolean)
            if True, create chart images grouped by job level held by
            employees
        quantiles (integer)
            the number of binning quantiles to measure for the initial and
            running (annually updated) quantile membership analysis
            (default is 10)
        plot_init_quarter (boolean)
            if True, produce output grouped by initial list quantile
            membership, for each employee group
        plot_running_quarter (boolean)
            if True, produce output grouped by annual list quantile
            membership, for each employee group
        pcnt_ylim (float)
            output chart maximum y axis value for percentage attribute
            charts as a float, example: .75 equals max displayed chart
            value of 75%
        cpay_stride (integer)
            y axis chart tick interval (in thousands) for charts displaying
            cpay (career pay)
        fixed_col_name (string)
            label to use for quantile number column when calculating using
            the initial quantile membership for all results
        running_col_name (string)
            label to use for quantile number column when calculating using
            a continuously updated quantile membership for all results
        figsize (tuple)
            optional size of all generated chart images. Default is None.
            This input will allow creation of larger chart images than the
            default small charts, at the price of an increase
            in the time required to run the function.
        chartstyle (string)
            any valid seaborn charting style ('ticks', 'dark', 'white',
            'darkgrid', 'whitegrid'), default is 'ticks'
        verbose_status (boolean)
            if True, print status of calculations as function is running
        tick_size (integer or float)
            text size of tick labels on the output chart images
        legend_size (integer or float)
            text size of the legend on the output chart images
        label_size (integer or float)
            text size of the x and y axis labels on the output chart images
        title_size (integer or float)
            text size of the title on the output chart images
        adjust_chart_top (float)
            input to permit adjustment of the top location of the generated
            charts - used to ensure full chart title is captured by the
            save chart figure code. Default top position is 1.0, default
            value for this input is .85 which "shrinks" the charts slightly
            vertically so that the two-line chart titles are captured when
            saving the charts to file as images.
    '''
    try:
        # get current case study name
        case_df = pd.read_pickle('dill/case_dill.pkl')
        case_name = case_df.case.value
    except OSError:
        print('unable to retrieve case name, try setting case input')
        return
    attrs = ['spcnt', 'cat_order', 'jobp', 'cpay']
    pcnt_attributes = ['spcnt', 'lspcnt']
    job_attrs = ['jnum', 'jobp']
    eg_colors = cdict['eg_color_dict']
    im_prefix = 'reports/' + case_name + '/annual_charts/ann_charts_'
    # set directory names, make directories, set title prefixes
    if plot_year_group:
        date_grouper_string = adict[date_grouper]
        # set path to image directories
        long_image_dir = (im_prefix + date_grouper_string + '/')
        # make directory if it does not already exist
        if not path.exists(long_image_dir):
            makedirs(long_image_dir)
        long_title_pre = date_grouper_string + ' '
    if plot_job_group:
        job_image_dir = (im_prefix + 'init_job/')
        # make directory if it does not already exist
        if not path.exists(job_image_dir):
            makedirs(job_image_dir)
        job_title_pre = 'initial job '
    if plot_init_quarter:
        iq_image_dir = im_prefix + 'init_qntl/'
        # make directory if it does not already exist
        if not path.exists(iq_image_dir):
            makedirs(iq_image_dir)
        iq_title_pre = 'initial qntl '
    # BUG FIX: this guard previously tested plot_init_quarter, which left
    # rq_image_dir/rq_title_pre undefined when only the running quantile
    # output was requested (NameError below)
    if plot_running_quarter:
        rq_image_dir = im_prefix + 'run_qntl/'
        # make directory if it does not already exist
        if not path.exists(rq_image_dir):
            makedirs(rq_image_dir)
        rq_title_pre = 'running_qntl '
    # remove the skeleton dataset from consideration for this function
    ds_list = [key for key in ds_dict.keys() if key != 'skeleton']
    # get any computed dataset to find initial chart values
    dummy_ds = ds_dict[ds_list[0]]
    # get a sorted array of unique employee group codes
    eg_nums = np.unique(dummy_ds.eg)
    # get the first and last data model years
    dt_min = min(dummy_ds.date.dt.year)
    dt_max = max(dummy_ds.date.dt.year)
    for attr in attrs:
        if verbose_status:
            print('\nPreparing', attr.upper(), 'charts...')
        # set chart style
        with sns.axes_style(chartstyle):
            # set global figure size (if figsize input is not None)
            if figsize is None:
                fig, ax = plt.subplots()
            else:
                fig, ax = plt.subplots(figsize=figsize)
        if attr not in pcnt_attributes:
            ylim = max(dummy_ds[attr]) * .7
        if attr in job_attrs:
            jlim = int(max(dummy_ds[attr]) + 1)
        if attr == 'cpay':
            ylim = max(dummy_ds[attr])
        ldict = {}
        # initialize chart lines for each employee group
        for eg_num in eg_nums:
            ldict[eg_num], = ax.plot([], [], eg_colors[eg_num],
                                     label='eg' + str(eg_num),
                                     marker='o', markersize=4)
        # format y axis tick labels, axis labels, and y axis limit
        if attr in pcnt_attributes:
            ax.yaxis.set_major_formatter(mp.pct_format())
        ax.set_xlabel('year', fontsize=label_size)
        if attr == 'cpay':
            ax.set_ylabel(adict[attr] + ' ($K)', fontsize=label_size)
        else:
            ax.set_ylabel(adict[attr], fontsize=label_size)
        if attr in pcnt_attributes:
            ax.set_yticks(np.arange(0, pcnt_ylim, .05))
            # hide every other percentage tick label to reduce clutter
            for label in ax.yaxis.get_ticklabels()[1::2]:
                label.set_visible(False)
            ax.set_ylim(ymin=-.01, ymax=pcnt_ylim)
        elif attr in ['ylong']:
            ax.set_ylim(ymin=-.75, ymax=ylim)
        elif attr in job_attrs:
            ax.set_yticks(np.arange(1, jlim))
            ax.set_ylim(ymin=.75, ymax=ylim)
        elif attr in ['snum', 'cat_order']:
            ax.set_ylim(ymin=-100, ymax=ylim)
        elif attr in ['cpay']:
            ax.set_yticks(np.arange(0, ylim, cpay_stride))
            ax.set_ylim(ymin=-100, ymax=ylim)
        # x axis ticks on even 5-year boundaries spanning the model years
        min_xtick_yr = dt_min // 10 * 10
        max_xtick_yr = dt_max // 10 * 10 + 10
        ax.set_xticks(np.arange(min_xtick_yr, max_xtick_yr, 5))
        ax.set_xlim(xmin=dt_min, xmax=dt_max)
        ax.tick_params(axis='both', labelsize=tick_size)
        ax.grid(alpha=.6)
        # invert y axis if appropriate for attribute
        if attr not in ['cpay', 'ylong']:
            ax.invert_yaxis()
        ax.legend(loc=4, fontsize=legend_size)
        plt.tight_layout()
        fig.subplots_adjust(top=adjust_chart_top)
        # loop through each calculated dataset
        for key in ds_list:
            ds = ds_dict[key]
            # create a column assigning the initial model job for each
            # employee for all months
            f.make_starting_val_column(ds, 'jnum')
            # create employee group percentage columns, one fixed and the
            # other recalculated each month
            f.make_eg_pcnt_column(ds, fixed_col_name=fixed_col_name)
            f.make_eg_pcnt_column(ds, recalc_each_month=True,
                                  running_col_name=running_col_name)
            # make grouping values
            yr = ds.date.dt.year
            rq = ((ds[running_col_name] * quantiles // 1) + 1).astype(int)
            iq = ((ds[fixed_col_name] * quantiles // 1) + 1).astype(int)
            dateyr = ds[date_grouper].dt.year
            job = ds.start_jnum
            if plot_year_group:
                # group by date grouper year (likely longevity date), data
                # model year, and employee group
                gb = ds.groupby([dateyr, yr, 'eg'])[attr]
                # find attribute averages and unstack to create employee
                # group attribute value columns
                unstkd = gb.mean().unstack()
                # get a sorted array of unique data model years
                yrs = np.unique(unstkd.index.get_level_values(level=0))
                # loop through model years
                for year in yrs:
                    # reset the chart line data for the current data model
                    # year and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[year].index.values,
                            unstkd.loc[year][eg_num].values)
                    # create the chart title
                    ax.set_title(long_title_pre + str(year) + '\n' +
                                 key + ' actives',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(long_image_dir,
                                          attr + '_' +
                                          str(year) + ' ' +
                                          date_grouper +
                                          '_' + key + '.png'))
            if plot_job_group:
                # group by initial job (from make starting value function),
                # data model year, and employee group
                gb = ds.groupby([job, yr, 'eg'])[attr]
                # find attribute averages and unstack to create employee
                # group attribute value columns
                unstkd = gb.mean().unstack()
                # get a sorted array of unique initial job levels
                jobs = np.unique(unstkd.index.get_level_values(level=0))
                # loop through job levels
                for job in jobs:
                    # reset the chart line data for the current job level
                    # and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[job].index.values,
                            unstkd.loc[job][eg_num].values)
                    # create the chart title
                    ax.set_title(job_title_pre + str(job) + '\n' +
                                 key + ' actives',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(job_image_dir,
                                          attr + '_' + 'job' +
                                          str(job) + '_' + key + '.png'))
            if plot_init_quarter:
                # group by initial quantile, data model year, and employee
                # group
                gb = ds.groupby([iq, yr, 'eg'])[attr]
                unstkd = gb.mean().unstack()
                # get a sorted array of quantiles
                qrtls = np.unique(unstkd.index.get_level_values(level=0))
                # loop through initial quantile groups
                for qt in qrtls:
                    # reset the chart line data for the current initial
                    # quantile and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[qt].index.values,
                            unstkd.loc[qt][eg_num].values)
                    # create the chart title
                    ax.set_title(iq_title_pre + str(qt) + '\n' +
                                 key + ' actives',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(iq_image_dir,
                                          attr + '_' + 'iq' +
                                          str(qt) + '_' + key + '.png'))
            if plot_running_quarter:
                # group by running quantile, data model year, and employee
                # group
                gb = ds.groupby([rq, yr, 'eg'])[attr]
                unstkd = gb.mean().unstack()
                # get a sorted array of quantiles
                qrtls = np.unique(unstkd.index.get_level_values(level=0))
                # loop through running quantile groups
                for qt in qrtls:
                    # reset the chart line data for the current running
                    # quantile and employee group
                    for eg_num in eg_nums:
                        ldict[eg_num].set_data(
                            unstkd.loc[qt].index.values,
                            unstkd.loc[qt][eg_num].values)
                    # create the chart title
                    ax.set_title(rq_title_pre + str(qt) + '\n' +
                                 key + ' actives',
                                 fontsize=title_size)
                    # draw the chart
                    fig.canvas.draw()
                    # save the chart
                    fig.savefig(path.join(rq_image_dir,
                                          attr + '_' + 'rq' +
                                          str(qt) + '_' + key + '.png'))
            if verbose_status:
                print('  ', key, ' done!')
        # close the figure for this attribute before starting the next one
        plt.close()
    if verbose_status:
        print('\nALL JOBS COMPLETE\n\nChart images are located in the "' +
              case_name + '" folder within the "reports" folder.\n\n' +
              'Note: The charts produced from this routine are intended ' +
              'for basic overview of the calculated datasets only.\n' +
              'The built-in plotting functions provide much more detailed ' +
              'and comprehensive analysis of integrated list outcomes.\n')
[docs]def job_diff_to_excel(base_ds,
compare_ds,
ds_dict,
add_cpay=True,
diff_color=True,
row_color=True,
lighten_factor=.65,
neg_color='red',
pos_color='blue',
zero_color='white',
id_cols=['lname', 'ldate', 'retdate']):
'''Generates a spreadsheet which reports the differential number of
months spent at each job level between two outcome datasets. Results
are reported for every employee.
The order of the employees shown will be the order from the "compare"
dataset input.
The user may choose to apply formatting to the output spreadsheet. The
generation of the output with formatting is much slower than without,
however.
Stores the output within the **reports/<case_name>/by_employee** folder.
inputs
base_ds (dataframe)
baseline dataset
compare_ds (dataframe)
comparison dataset
add_cpay (boolean)
if True, add a "cpay_diff" column to show data model pay
differential (compare vs. base)
diff_color (boolean)
if True, use the neg_color, pos_color, and zero_color inputs
to color the spreadsheet job differential output
row_color (boolean)
color spreadsheet rows by employee group if True. Color will
be a tint (lighter color version) of the colors used to
represent the employee groups in chart output.
lighten_factor (float)
when the "row_color" input is True, this input controls the
tint of the normal employee group colors to use for the cell
background row coloring. The input is limited from 0.0 to 1.0
and a higher value will make the coloring lighter.
neg_color, pos_color, zero_color (color values)
this input will determine the font colors to use for negative,
positive, and zero job differential values within the spreadsheet
output. Inputs may by string hex values, or rgb values within
tuples or lists
id_cols (list)
list of columns to include within the spreadsheet output which
are in addition to the job level columns. This list (with the
addition of the "order" column) will also be colored according
to employee group when the "row_color" input is set to True.
'''
sdict = pd.read_pickle('dill/dict_settings.pkl')
hex_dict = f.hex_dict()
if compare_ds == 'edit':
order_col = 'idx'
else:
order_col = 'order'
def lighten(color, hex_dict, factor=.8):
    '''Return a lightened version of the "color" input.

    The "factor" input determines how much lighter the result will
    be (each channel is moved toward 255 by "factor" of the remaining
    distance).

    inputs
        color
            a hex string (e.g. "#ff0000"), a color name found within
            hex_dict, or an RGB tuple or list (0-255 channel values)
        hex_dict (dictionary)
            dictionary mapping color names to hex strings
        factor (float)
            0.0 to 1.0, higher values produce a lighter result

    Returns a hex string for string input, or a tuple/list (matching
    the input type) for RGB input.  Prints a message and returns None
    for invalid input.
    '''
    try:
        if type(color) is str:
            if color.startswith('#'):
                h = color.lstrip('#')
            else:
                # translate a named color to its hex value
                h = hex_dict[color].lstrip('#')
            rgb = tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))
            list_rgb = list(rgb)
            lighter = (((255 - np.array(list_rgb)) * factor) +
                       list_rgb).astype(int)
            hexcolor = '#%02x%02x%02x' % tuple(lighter)
            return hexcolor
        if type(color) is tuple:
            list_rgb = list(color)[:3]
        elif type(color) is list:
            list_rgb = color[:3]
        else:
            # not a str, tuple, or list - route to the error message
            # (original code raised an uncaught NameError here via an
            # undefined list_rgb, swallowed by a bare except)
            raise TypeError('unsupported color input type')
        lighter = (((255 - np.array(list_rgb)) * factor) +
                   list_rgb).astype(int)
        return type(color)(lighter)
    except (KeyError, ValueError, TypeError, IndexError):
        print('''invalid color input for "lighten" function,
        must be an RGB tuple, RGB list, or string hex value''')
def cpay_diff(base, compare):
    '''Return a one-column ("cpay_diff") dataframe holding the career
    pay differential (compare minus base), expressed in dollars, for
    each employee's final (retirement-marked) data row.
    '''
    # restrict both datasets to the rows where ret_mark equals one
    ret_base = base[base.ret_mark == 1][['cpay']].copy()
    ret_compare = compare[compare.ret_mark == 1][['cpay']].copy()
    # align baseline career pay values by index
    ret_compare['base'] = ret_base['cpay']
    # convert from (assumed) thousands to whole units - TODO confirm scale
    diff_vals = (ret_compare.cpay - ret_compare.base) * 1000
    ret_compare['cpay_diff'] = diff_vals.astype(int)
    return ret_compare[['cpay_diff']]
def job_months(df):
    '''Return a table counting the data model months spent at each job
    level (columns) for each employee (rows), as integers.
    '''
    # count rows per (empkey, jnum) pair, then pivot jnum to columns;
    # levels an employee never held become zero-filled columns
    counts = df.groupby([df.empkey, 'jnum'])['jnum'].count()
    table = counts.unstack().fillna(0)
    return table.astype(int)
def fill_job_columns(df, sdict):
    '''Add any missing job level columns to df (zero-filled) and return
    the dataframe with columns ordered by ascending job level.

    inputs
        df (dataframe)
            job count dataframe, one column per job level present in
            the source data
        sdict (dictionary)
            program settings dictionary, provides "num_of_job_levels"
    '''
    num_of_job_levels = sdict['num_of_job_levels']
    # levels 1 through num_of_job_levels + 1 (one extra level beyond
    # the configured count)
    job_levels = list(range(1, num_of_job_levels + 2))
    for level in job_levels:
        if level not in df.columns:
            df[level] = 0
    # reindex so column DATA moves with its label.  the original
    # "df.columns = sorted(df.columns)" only relabeled the columns in
    # place, mislabeling job counts whenever a missing level had been
    # appended out of ascending order above
    return df[sorted(df.columns)]
def resort_df(df, order_df):
    '''Sort df (in place) to match the month-zero employee ordering
    found within order_df, and return it.
    '''
    # index-only frame of the first-month rows, in order_df row order
    month_zero = order_df[order_df.mnum == 0][[]].copy()
    month_zero[order_col] = range(1, len(month_zero) + 1)
    # align the ordering values with df by index, sort, then discard
    # the temporary sorting column
    df['order'] = month_zero[order_col]
    df.sort_values('order', inplace=True)
    df.drop('order', axis=1, inplace=True)
    return df
def add_cols(df, order_df, col_list=id_cols):
    '''Join identifying columns (and the ordering column) from the
    month-zero rows of order_df onto df and return the result.

    NOTE(review): when col_list is left at its default, appending "eg"
    below mutates the enclosing function's id_cols list.  This appears
    to be relied upon by the later row-color styling subset - confirm
    before changing to a defensive copy.

    inputs
        df (dataframe)
            dataframe to receive the added columns
        order_df (dataframe)
            dataframe providing the identifying column data and the
            employee ordering
        col_list (list)
            columns to join onto df (in addition to the order column)
    '''
    cols = [order_col]
    if 'eg' not in col_list:
        col_list.append('eg')
    m0 = order_df[order_df.mnum == 0][col_list].copy()
    m0[order_col] = range(1, len(m0) + 1)
    cols.extend(col_list)
    m0 = m0[cols]
    m0 = m0.join(df)
    # best-effort: convert datetime columns to date-only values for
    # cleaner spreadsheet output; non-datetime columns raise from the
    # .dt accessor and are left unchanged (previously a bare except)
    for col in col_list:
        try:
            m0[col] = m0[col].dt.date
        except (AttributeError, TypeError):
            pass
    return m0
def color_vals(val):
    """Return a css font color property string for the scalar "val".

    Zero values map to zero_color, negative values to neg_color, and
    all other values to pos_color (colors are inputs of the enclosing
    function).
    """
    # check equality with zero first to mirror the original selection
    # order (non-comparable values fall through to pos_color)
    if val == 0:
        color = zero_color
    elif val < 0:
        color = neg_color
    else:
        color = pos_color
    return 'color: %s' % color
def color_rows(df, hex_dict, light_factor=.85):
    '''Build an array of css background-color strings, one entry per
    row of df, tinted according to each row's employee group code.
    '''
    # employee group base colors from the stored color dictionary
    cdict = pd.read_pickle('dill/dict_color.pkl')
    eg_color_dict = cdict['eg_color_dict']
    # replace each group color with a lightened (tinted) version
    for grp in eg_color_dict.keys():
        eg_color_dict[grp] = lighten(eg_color_dict[grp], hex_dict,
                                     factor=light_factor)
    prefix = 'background-color: '
    eg_arr = df.eg.values
    color_strs = np.empty(eg_arr.size, dtype=object)
    # fill every row position with its group's css color string
    for grp in np.unique(eg_arr):
        color_strs[np.where(eg_arr == grp)[0]] = prefix + eg_color_dict[grp]
    return color_strs
try:
    # get current case study name
    case_df = pd.read_pickle('dill/case_dill.pkl')
    # NOTE(review): ".value" is not a standard pandas Series attribute
    # (".values" is) - confirm the stored object supports this access
    case_name = case_df.case.value
except OSError:
    print('unable to retrieve case name, try setting case input')
    return
# make a folder for the report if it does not exist
folder = 'reports/' + case_name + '/by_employee/'
if not path.exists(folder):
    makedirs(folder)
# assign the output spreadsheet name and path
file_name = 'job_months_' + compare_ds + '_vs_' + base_ds + '.xlsx'
path_name = folder + file_name
# get the base and compare dataframes from the dataset dictionary
base = ds_dict[base_ds]
compare = ds_dict[compare_ds]
# make the groupby job count dataframes (months at each job level,
# one row per employee)
base_jobs = job_months(base)
compare_jobs = job_months(compare)
# add any missing job level columns
base_jobs_filled = fill_job_columns(base_jobs, sdict)
compare_jobs_filled = fill_job_columns(compare_jobs, sdict)
# sort both datasets by compare dataset order
base_sorted = resort_df(base_jobs_filled, compare)
compare_sorted = resort_df(compare_jobs_filled, compare)
# create the differential dataframe (months gained/lost at each level)
df_diff = compare_sorted - base_sorted
# add other columns to the differential dataframe (name, ldate, etc.)
final = add_cols(df_diff, compare)
# add a career pay differential column to the final dataframe
if add_cpay:
    cpay = cpay_diff(base, compare)
    final = final.join(cpay)
# build the per-row background-color array (css strings, by group)
apply_row_color = color_rows(final, hex_dict, light_factor=lighten_factor)
job_levels = list(range(1, sdict['num_of_job_levels'] + 2))
id_cols.append(order_col)
# determine the formatting to be applied: row coloring on the id
# columns and/or font coloring on the job differential columns
if diff_color and row_color:
    frame = final.style.apply(lambda s: apply_row_color,
                              subset=id_cols).applymap(color_vals,
                                                       subset=job_levels)
elif diff_color and not row_color:
    frame = final.style.applymap(color_vals, subset=job_levels)
elif not diff_color and row_color:
    frame = final.style.apply(lambda s: apply_row_color, subset=id_cols)
else:
    # no formatting requested - write the plain dataframe (much faster)
    frame = final
# write the spreadsheet file to disk
frame.to_excel(path_name, engine='openpyxl', freeze_panes=(1, 0))