import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from DataCleaner import DataCleaner
from DataTransformer import DataTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve
from Model import Model
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)
# Base random seed; reused for every train/test split and estimator below.
seed = 888


# Load the labelled orders; order_item_id becomes the row index.
df_known = pd.read_csv("BADS_WS2021_known.csv", index_col='order_item_id')


# Preview the first ten rows.
df_known.head(10)


# Column dtypes and non-null counts of the raw data.
df_known.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 1 to 100000
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_date     100000 non-null  object 
 1   delivery_date  90682 non-null   object 
 2   item_id        100000 non-null  int64  
 3   item_size      100000 non-null  object 
 4   item_color     100000 non-null  object 
 5   brand_id       100000 non-null  int64  
 6   item_price     100000 non-null  float64
 7   user_id        100000 non-null  int64  
 8   user_title     100000 non-null  object 
 9   user_dob       91275 non-null   object 
 10  user_state     100000 non-null  object 
 11  user_reg_date  100000 non-null  object 
 12  return         100000 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 10.7+ MB


# Project helper (DataTransformer module) that provides the conversion,
# feature-engineering and cleaning steps called throughout this notebook.
transformer = DataTransformer()


# Convert the raw columns to more specific dtypes.
df_known = transformer.convert_data_types(df_known)


# Re-inspect dtypes and memory usage after the conversion.
df_known.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 1 to 100000
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   order_date     100000 non-null  datetime64[ns]
 1   delivery_date  90682 non-null   datetime64[ns]
 2   item_id        100000 non-null  int32         
 3   item_size      100000 non-null  category      
 4   item_color     100000 non-null  category      
 5   brand_id       100000 non-null  int32         
 6   item_price     100000 non-null  float32       
 7   user_id        100000 non-null  int32         
 8   user_title     100000 non-null  category      
 9   user_dob       91275 non-null   datetime64[ns]
 10  user_state     100000 non-null  category      
 11  user_reg_date  100000 non-null  datetime64[ns]
 12  return         100000 non-null  int64         
dtypes: category(4), datetime64[ns](4), float32(1), int32(3), int64(1)
memory usage: 6.5 MB


# Distribution of the labelled data.
df_known.hist(figsize=(16, 16), bins=100, xlabelsize=8, ylabelsize=8); # produces one histogram per feature


# Load the orders to be predicted and apply the same dtype conversion,
# then plot the same histograms for comparison with the labelled data.
df_unknown = pd.read_csv("BADS_WS2021_unknown.csv", index_col='order_item_id')
df_unknown = transformer.convert_data_types(df_unknown)
df_unknown.hist(figsize=(16, 16), bins=100, xlabelsize=8, ylabelsize=8); # produces one histogram per feature


# Check that a user_id shared by the known and unknown data refers to the same
# person, i.e. state, title and date of birth agree in both files.
common_users = np.intersect1d(df_known.user_id, df_unknown.user_id).tolist()
user_columns = ['user_state', 'user_title', 'user_dob']

# First recorded attributes of every shared user in each data set. Casting to
# object lets categorical columns with different category sets be compared
# element-wise.
known_users = (df_known.drop_duplicates('user_id')
                       .set_index('user_id')
                       .loc[common_users, user_columns]
                       .astype(object))
unknown_users = (df_unknown.drop_duplicates('user_id')
                           .set_index('user_id')
                           .loc[common_users, user_columns]
                           .astype(object))

# NaN/NaT never compare equal, so a value that is missing on both sides is
# explicitly treated as a match (many users have no date of birth).
matches = (known_users == unknown_users) | (known_users.isna() & unknown_users.isna())

# Every shared user must match on every attribute; the previous loop reset the
# flag on each iteration, so only the last user decided the outcome.
users_identical = bool(matches.all().all())

if users_identical:
    print("The user IDs belong to the same users.")
else:
    print("The user IDs do not belong to the same users.")

The user IDs belong to the same users.


# Check that an item_id shared by the known and unknown data refers to the same
# item, using the brand as the identifying attribute.
common_items = np.intersect1d(df_known.item_id, df_unknown.item_id).tolist()

# First recorded brand of every shared item in each data set.
known_brands = (df_known.drop_duplicates('item_id')
                        .set_index('item_id')
                        .loc[common_items, 'brand_id'])
unknown_brands = (df_unknown.drop_duplicates('item_id')
                            .set_index('item_id')
                            .loc[common_items, 'brand_id'])

# Every shared item must carry the same brand. The previous loop reset the flag
# on each iteration and wrote mismatches to `users_identical`, so this check
# could never report a difference.
items_identical = bool((known_brands == unknown_brands).all())

if items_identical:
    print("The item IDs belong to the same items.")
else:
    print("The item IDs do not belong to the same items.")

The item IDs belong to the same items.


# Frequency of each item size in the labelled data.
df_known['item_size'].value_counts()

l       12347
xl      10979
m       10190
xxl      8966
40       7693
        ...  
3834        1
12+         1
105         1
4034        1
3132        1
Name: item_size, Length: 102, dtype: int64


# Frequency of each item colour in the labelled data.
df_known['item_color'].value_counts()

black          18640
blue           10625
brown           8734
grey            8708
red             7264
               ...  
copper coin        7
amethyst           5
avocado            4
creme              4
opal               2
Name: item_color, Length: 77, dtype: int64


# Number of distinct users in the labelled data.
df_known['user_id'].nunique()

19205


# Frequency of each user title in the labelled data.
df_known['user_title'].value_counts()

Mrs             95429
Mr               3915
Family            414
Company           128
not reported      114
Name: user_title, dtype: int64


# Count rows per (return, user_title) pair and pivot to one row per title with
# one column per return value.
user_title = df_known.groupby(['return', 'user_title']).size().reset_index().pivot(columns='return', index='user_title', values=0)

# Divide each row by its total so the stacked bars show shares per title
# rather than absolute counts.
user_title.div(user_title.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)

<AxesSubplot:xlabel='user_title'>


# Number of order items per user_id among the rows titled 'Company'.
df_known.loc[df_known['user_title']=='Company', 'user_id'].value_counts()

3633     24
36816    18
347      17
46613    16
10132    10
46360     7
32416     7
39520     4
37934     4
47241     3
42983     3
43704     3
33343     2
24884     2
35885     2
20045     1
31373     1
8150      1
41880     1
45426     1
29182     1
Name: user_id, dtype: int64


# Count rows per (return, user_state) pair and pivot to one row per state with
# one column per return value.
user_state = df_known.groupby(['return', 'user_state']).size().reset_index().pivot(columns='return', index='user_state', values=0)

# Divide each row by its total so the stacked bars show shares per state
# rather than absolute counts.
user_state.div(user_state.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)

<AxesSubplot:xlabel='user_state'>


# Heatmap of the null mask: shows which columns contain missing values and where.
sns.heatmap(df_known.isnull(), cbar=False)

<AxesSubplot:ylabel='order_item_id'>


# Rows that have no delivery date.
df_known.loc[df_known['delivery_date'].isnull()]


# Count those rows and how many of them were returned ('return' is summed).
items_no_ddate = df_known.loc[df_known['delivery_date'].isnull()]
print("There are {} items without delivery date".format(len(items_no_ddate)))
print("Out of those items {} were returned.".format(items_no_ddate['return'].sum()))

There are 9318 items without delivery date
Out of those items 0 were returned.


# Mean item price per value of the return label.
df_known.groupby('return')['item_price'].mean().plot(kind='bar')

<AxesSubplot:xlabel='return'>


# Rows with an item price of zero.
df_known.loc[df_known['item_price']==0]


# Sizes of the zero-priced items.
df_known.loc[df_known['item_price']==0, 'item_size'].value_counts()

unsized    395
40           1
1            0
42+          0
48           0
          ... 
29           0
28           0
27           0
26           0
xxxl         0
Name: item_size, Length: 102, dtype: int64


# Mean of the return label among zero-priced items.
df_known.loc[df_known['item_price']==0, 'return'].mean()

0.11868686868686869


# Step-by-step walkthrough of the DataTransformer feature engineering.
# NOTE(review): presumably "independent" features need no fitted state while
# "dependent" ones use what fit() learned - confirm against DataTransformer.
df_known = transformer.create_features_independent(df_known)


# Separate features and target before fitting the transformer.
X = df_known.drop('return', axis=1).copy()
y = df_known['return'].copy()

transformer.fit(X,y)

# Add the fit-dependent features, then re-attach the target column.
X = transformer.create_features_dependent(X)
df_known = pd.concat([X,y], axis=1)


# Remove the columns the transformer no longer needs.
df_known = transformer.drop_columns(df_known)


# Display the engineered frame.
df_known


# Dtypes and non-null counts of the engineered features.
df_known.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 1 to 100000
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   item_price             100000 non-null  float32
 1   user_age               91275 non-null   float64
 2   item_is_free           100000 non-null  bool   
 3   delivery_span          90682 non-null   float32
 4   order_delivered        100000 non-null  bool   
 5   order_num_items        100000 non-null  int32  
 6   item_multiple_orders   100000 non-null  bool   
 7   user_orders            100000 non-null  float32
 8   item_popularity        100000 non-null  int32  
 9   item_color_popularity  100000 non-null  int32  
 10  user_avg_return        100000 non-null  float32
 11  item_avg_return        100000 non-null  float32
 12  return                 100000 non-null  int64  
dtypes: bool(3), float32(5), float64(1), int32(3), int64(1)
memory usage: 5.6 MB


# Distribution of the engineered features.
df_known.hist(figsize=(16, 16), bins=100, xlabelsize=8, ylabelsize=8); # produces one histogram per feature


# Apply the transformer's cleaning step.
# NOTE(review): presumably fills gaps and treats outliers - confirm in DataTransformer.
df_known = transformer.clean(df_known)


# Apply the transformer's skewness correction.
df_known = transformer.unskew(df_known)


# Re-read the raw labelled data; this rebinds df_known, so the manually
# transformed frame from the previous cells is discarded from here on.
df_known = pd.read_csv("BADS_WS2021_known.csv", index_col='order_item_id')
X = df_known.drop('return', axis=1).copy()
y = df_known['return'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Fit the transformer on the training split only, then apply it to the test split.
X_train = transformer.fit_transform(X_train, y_train)

X_test = transformer.transform(X_test)

# NOTE(review): despite its name, X_all holds only the transformed test split
# plus its target, so the correlations below are computed on 20% of the data.
X_all = pd.concat([X_test,y_test], axis=1)


# Pairwise correlations between the transformed features and the target.
corr = X_all.corr()

# Create a large figure; the heatmap below draws onto its (current) axes.
fig, ax = plt.subplots(figsize=(15, 12))


sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True,
            cmap="PiYG");


# Hyperparameter search for an elastic-net logistic regression using repeated
# hold-out validation.
transformer = DataTransformer()
scaler = StandardScaler()

# NOTE(review): this pipeline is defined but not used by the search below, which
# applies the transformer and scaler manually so they are fitted once per split.
logit_pipe = Pipeline([
            ('transformer', transformer), # ('step_name', transformer) always follow this format for transformers in the pipeline
            ('scaler', StandardScaler()),
            ('logistic', LogisticRegression(penalty='elasticnet', solver='saga', random_state=seed, fit_intercept=True)) # ('step_name', fun()) add parentheses for other functions
            ])

C_values = [0.6, 0.8, 1, 1.2, 1.4]
l1_ratios = [0.3, 0.5, 0.7]
num_splits = 3


y = df_known['return'].copy()
X = df_known.drop('return', axis=1).copy()


# Every individual AUC (for the spread reported at the end) and the AUCs grouped
# by parameter combination (for model selection).
auc_list = []
auc_by_params = {}


for split in range(num_splits):

    # Split data into training and test; a different seed per repetition
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed+split)

    # Fit the preprocessing on the training part only to avoid leakage
    X_train = transformer.fit_transform(X_train, y_train)
    X_train = scaler.fit_transform(X_train)

    X_test = transformer.transform(X_test)
    X_test = scaler.transform(X_test)

    for C in C_values:
        for l1_ratio in l1_ratios:
            logit = LogisticRegression(penalty='elasticnet', solver='saga',
                                            C=C, l1_ratio=l1_ratio,
                                            random_state=seed, max_iter=200)
            logit.fit(X_train, y_train)
            y_probs = logit.predict_proba(X_test)[:,1]
            auc = roc_auc_score(y_test, y_probs)
            auc_list.append(auc)
            auc_by_params.setdefault((C, l1_ratio), []).append(auc)

# Select the combination with the highest mean AUC across all splits. Picking
# the single best (split, parameters) fit, as before, rewards a lucky split
# rather than a parameter set that performs well consistently.
best_C, best_l1_ratio = max(auc_by_params, key=lambda params: np.mean(auc_by_params[params]))
best_params_logit = {'C': best_C, 'l1_ratio': best_l1_ratio}
best_auc = np.mean(auc_by_params[(best_C, best_l1_ratio)])

print("The best parameters are:")
print(best_params_logit)
print("The best AUC score is {}".format(best_auc))
print("The standard deviation of all AUC values is {}".format(np.std(auc_list)))

The best parameters are:
{'C': 0.8, 'l1_ratio': 0.3}
The best AUC score is 0.7650149102565651
The standard deviation of all AUC values is 0.0009461665625776799


# Hyperparameter search for a random forest using repeated hold-out validation.
transformer = DataTransformer()
scaler = StandardScaler()

n_estimators_list = [10, 30, 50, 100]
criteria = ['gini', 'entropy']
max_depths = [1,2,5,10]

num_splits = 3

# Every individual AUC (for the spread reported at the end) and the AUCs grouped
# by parameter combination (for model selection).
auc_list = []
auc_by_params = {}

y = df_known['return'].copy()
X = df_known.drop('return', axis=1).copy()

for split in range(num_splits):

    # Split data into training and test; a different seed per repetition
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed+split)

    # Fit the preprocessing on the training part only to avoid leakage
    X_train = transformer.fit_transform(X_train, y_train)
    X_train = scaler.fit_transform(X_train)

    X_test = transformer.transform(X_test)
    X_test = scaler.transform(X_test)

    for n_estimators in n_estimators_list:
        for criterion in criteria:
            for max_depth in max_depths:
                rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                             random_state=seed)
                rfc.fit(X_train, y_train)
                y_probs = rfc.predict_proba(X_test)[:,1]
                auc = roc_auc_score(y_test, y_probs)
                auc_list.append(auc)
                auc_by_params.setdefault((n_estimators, criterion, max_depth), []).append(auc)

# Select the combination with the highest mean AUC across all splits. Picking
# the single best (split, parameters) fit, as before, rewards a lucky split
# rather than a parameter set that performs well consistently.
best_combination = max(auc_by_params, key=lambda params: np.mean(auc_by_params[params]))
best_params_rfc = dict(zip(('n_estimators', 'criterion', 'max_depth'), best_combination))
best_auc = np.mean(auc_by_params[best_combination])

print("The best parameters are:")
print(best_params_rfc)
print("The best AUC score is {}".format(best_auc))
print("The standard deviation of all AUC values is {}".format(np.std(auc_list)))

The best parameters are:
{'n_estimators': 30, 'criterion': 'gini', 'max_depth': 2}
The best AUC score is 0.7612823395409543
The standard deviation of all AUC values is 0.01093890516730509


def calculate_costs(X, y, y_pred):
    """Return the total misclassification cost of a set of predictions.

    Costs per item, with 1 meaning "returned":
      * false positive (predicted 1, actual 0): 0.5 * item_price
      * false negative (predicted 0, actual 1): 0.25 * item_price + 7.5

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an 'item_price' column; rows correspond to `y`.
    y : array-like
        True labels (0/1 or boolean).
    y_pred : array-like
        Predicted labels (0/1 or boolean).

    Returns
    -------
    numpy.float64
        Total cost rounded to two decimals.

    Notes
    -----
    `X`, `y` and `y_pred` are matched by position, so they must have the same
    length and row order; their index labels are not consulted.
    """
    # Cast to float arrays so that boolean labels and boolean predictions can be
    # subtracted (numpy refuses to subtract two boolean arrays).
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    differences = actual - predicted
    fn = differences == 1
    fp = differences == -1

    costs = 0
    costs += X.loc[fp, 'item_price'].sum() * 0.5
    costs += X.loc[fn, 'item_price'].sum() * 0.25
    costs += np.sum(fn) * 7.5

    # np.round works for both numpy and plain Python scalars.
    return np.round(costs, 2)


def find_optimal_threshold(X, y, y_probs):
    """Scan probability cut-offs and return the cheapest one.

    The cut-offs 0.00, 0.01, ..., 1.00 are tried in ascending order; an item is
    predicted positive when its probability is strictly above the cut-off. The
    cost of each candidate comes from `calculate_costs`.

    Returns
    -------
    tuple
        (best_threshold, lowest_cost); on ties the smallest cut-off wins.
    """
    best_threshold, lowest_cost = 0, np.inf
    candidates = np.linspace(0, 1, 101)

    for candidate in candidates:
        candidate_cost = calculate_costs(X, y, y_probs > candidate)
        # Only a strictly cheaper candidate replaces the current best.
        if not candidate_cost < lowest_cost:
            continue
        best_threshold, lowest_cost = candidate, candidate_cost

    return best_threshold, lowest_cost


# NOTE(review): this tuned model is rebound to the pipeline right below before
# it is ever used, so the hyperparameters found in the grid search do not take
# part in the threshold search. Kept as-is because the final model further down
# is also a plain L2 logistic regression.
model = Model(LogisticRegression(penalty='elasticnet', solver='saga',random_state=seed, max_iter=200,
                                 C=best_params_logit['C'], l1_ratio=best_params_logit['l1_ratio']))

model = Pipeline([
            ('c', DataTransformer()), # ('step_name', transformer) always follow this format for transformers in the pipeline
            ('scaler', StandardScaler()),
            ('logit', LogisticRegression(penalty='l2', fit_intercept=True)) # ('step_name', fun()) add parentheses for other functions
            ])

data = pd.read_csv("BADS_WS2021_known.csv", index_col='order_item_id')

y = data['return'].copy()
X = data.drop('return', axis=1).copy()

num_splits = 5
test_percentage = 0.3

best_thresholds = []
lowest_costs = []

# Repeated hold-out: for each split, fit the pipeline and find the probability
# threshold that minimises the cost on the held-out part.
for iteration in range(num_splits):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percentage, random_state=seed+iteration)
    model.fit(X_train,y_train)
    y_probs = model.predict_proba(X_test)[:,1]
    best_threshold, lowest_cost = find_optimal_threshold(X_test, y_test, y_probs)
    best_thresholds.append(best_threshold)
    # Per-item cost: divide by the actual size of the held-out set rather than
    # len(X)*test_percentage, which only approximates it (train_test_split
    # rounds the test size up).
    lowest_costs.append(lowest_cost / len(X_test))

final_threshold = np.mean(best_thresholds).round(2)
lowest_costs = np.array(lowest_costs)
final_costs = np.mean(lowest_costs).round(2)

print("The best threshold is {} with a standard deviation of {}.".format(final_threshold, np.std(best_thresholds).round(2)))
print("The average costs per item are {}€ with a standard deviation of {}€.".format(final_costs, np.std(lowest_costs).round(2)))

The best threshold is 0.56 with a standard deviation of 0.01.
The average costs per item are 8.96€ with a standard deviation of 0.04€.


# Train the final model on all labelled data and label the unknown orders
# using the cost-minimising threshold found above.
# NOTE(review): Model is a project class; presumably it wraps the same
# transformer/scaler preprocessing as the pipeline used for the threshold
# search - confirm, otherwise the threshold does not carry over.
model = Model(LogisticRegression(penalty='l2', fit_intercept=True))
model.fit(X,y)
y_preds = model.predict(df_unknown, final_threshold)


# Attach the order_item_id index so each prediction can be matched to its order item.
predictions = pd.Series(y_preds, index=df_unknown.index, name='return')


# Write the predictions to disk.
predictions.to_csv('final_predictions.csv')

	order_date	delivery_date	item_id	item_size	item_color	brand_id	item_price	user_id	user_title	user_dob	user_state	user_reg_date	return
order_item_id
1	2016-06-22	2016-06-27	643	38	navy	30	49.90	30822	Mrs	1969-04-17	Saxony	2016-06-23	0
2	2016-06-22	NaN	337	152	grey	30	19.95	30822	Mrs	1969-04-17	Saxony	2016-06-23	0
3	2016-06-22	2016-06-27	270	xxl	grey	49	79.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1
4	2016-06-22	2016-06-27	142	xxl	grey	49	99.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	0
5	2016-06-22	2016-06-27	561	xxl	grey	3	14.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1
6	2016-06-22	2016-06-27	579	xxl	grey	3	19.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1
7	2016-06-22	2016-06-27	72	41	grey	1	119.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	0
8	2016-06-22	2016-06-27	106	50	white	6	39.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1
9	2016-06-22	2016-06-27	195	xxl	blue	46	13.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1
10	2016-06-22	2016-06-27	195	xxl	grey	46	19.90	30823	Mrs	1970-04-22	Baden-Wuerttemberg	2015-03-15	1

	order_date	delivery_date	item_id	item_size	item_color	brand_id	item_price	user_id	user_title	user_dob	user_state	user_reg_date	return
order_item_id
2	2016-06-22	NaT	337	152	grey	30	19.950001	30822	Mrs	1969-04-17	Saxony	2016-06-23	0
56	2016-06-23	NaT	5	l	white	5	69.900002	30828	Mrs	1966-05-13	Lower Saxony	2016-01-21	0
64	2016-06-23	NaT	211	40	turquoise	1	69.900002	22948	Mrs	1957-03-11	Baden-Wuerttemberg	2015-02-17	0
65	2016-06-23	NaT	55	40	purple	1	89.900002	22948	Mrs	1957-03-11	Baden-Wuerttemberg	2015-02-17	0
66	2016-06-23	NaT	74	40	pink	24	69.900002	22948	Mrs	1957-03-11	Baden-Wuerttemberg	2015-02-17	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
99953	2016-09-11	NaT	1508	42	dark denim	113	59.900002	48234	Mrs	1962-10-02	Hesse	2016-09-12	0
99954	2016-09-11	NaT	1498	42	green	6	59.900002	48234	Mrs	1962-10-02	Hesse	2016-09-12	0
99955	2016-09-11	NaT	1412	40	blue	54	119.900002	48234	Mrs	1962-10-02	Hesse	2016-09-12	0
99962	2016-09-11	NaT	1550	xxl	berry	117	129.899994	48236	Mrs	1964-10-15	North Rhine-Westphalia	2016-09-12	0
99963	2016-09-11	NaT	1712	xxl	brown	42	99.900002	48236	Mrs	1964-10-15	North Rhine-Westphalia	2016-09-12	0

	order_date	delivery_date	item_id	item_size	item_color	brand_id	item_price	user_id	user_title	user_dob	user_state	user_reg_date	return
order_item_id
203	2016-06-23	2016-06-26	157	unsized	purple	32	0.0	12373	Mrs	1958-06-07	Lower Saxony	2015-02-26	0
217	2016-06-23	2016-06-26	157	unsized	purple	32	0.0	30852	Mrs	1969-03-30	Saxony	2015-06-21	1
219	2016-06-23	2016-06-27	157	unsized	purple	32	0.0	18671	Mrs	1955-11-05	North Rhine-Westphalia	2015-02-17	0
352	2016-06-23	2016-06-27	157	unsized	purple	32	0.0	12388	Mrs	1962-01-16	North Rhine-Westphalia	2015-11-01	0
353	2016-06-23	2016-06-27	157	unsized	purple	32	0.0	12388	Mrs	1962-01-16	North Rhine-Westphalia	2015-11-01	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
96409	2016-09-07	2016-10-29	1472	unsized	brown	60	0.0	47558	Mrs	1971-10-25	Schleswig-Holstein	2015-07-09	0
97232	2016-09-08	2016-10-02	2025	unsized	grey	86	0.0	34189	Mrs	1963-09-08	Bavaria	2016-07-01	0
97434	2016-09-08	2016-10-29	1472	unsized	brown	60	0.0	47745	Mrs	1951-07-13	Baden-Wuerttemberg	2015-11-23	0
98942	2016-09-10	2016-10-29	1472	unsized	brown	60	0.0	7820	Mrs	1975-11-23	Hesse	2016-08-29	0
99657	2016-09-11	2016-09-12	2025	unsized	grey	86	0.0	1764	Mrs	1957-10-05	Hesse	2015-09-01	0

Final Assignment: Customer Return Prediction¶

Business Analytics and Data Science - Winter 2020/2021 - Prof. Dr. Stefan Lessmann¶

Submission by David Schulte¶

Problem statement (as given by the instructor)¶

Solution¶

Exploratory data analysis and data preparation¶

Data type conversion¶

Visualization of the numerical features¶

Comparing known and unknown data¶

Exploring the categorical variables¶

Gaps in the data¶

Further exploration¶

Feature Engineering¶

Dropping unnecessary features¶

Visualization of new features¶

Handling of gaps and outliers¶

Handling skewness¶

Correlations of features and target¶

Modelling¶

Finding a prediction threshold that minimizes costs¶

Prediction of the unknown values¶

Conclusion¶

	item_price	user_age	item_is_free	delivery_span	order_delivered	order_num_items	item_multiple_orders	user_orders	item_popularity	item_color_popularity	user_avg_return	item_avg_return	return
order_item_id
1	49.900002	47.213699	False	5.0	True	2	False	3.0	22	314	0.000000	0.181818	0
2	19.950001	47.213699	False	NaN	False	2	False	3.0	52	8708	0.000000	0.096154	0
3	79.900002	46.200000	False	5.0	True	9	False	17.0	81	8708	0.588235	0.506173	1
4	99.900002	46.200000	False	5.0	True	9	False	17.0	228	8708	0.588235	0.535088	0
5	14.900000	46.200000	False	5.0	True	9	False	17.0	137	8708	0.588235	0.386861	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...
99996	29.900000	57.517808	False	1.0	True	2	True	4.0	221	10625	0.500000	0.502262	0
99997	29.900000	57.517808	False	1.0	True	2	True	4.0	221	8734	0.500000	0.502262	0
99998	26.900000	31.553425	False	-7925.0	True	2	True	3.0	98	18640	0.000000	0.448980	0
99999	26.900000	31.553425	False	-7925.0	True	2	True	3.0	98	18640	0.000000	0.448980	0
100000	27.900000	42.931507	False	27.0	True	1	False	7.0	28	119	0.714286	0.321429	0