Predicting who will purchase next week with deep learning
import os
import pandas as pd
import numpy as np
# ---- Data preprocessing ----
# Discard every row that has a missing value, then replace the original
# Korean export headers with short English column names in one rename.
column_names = {
    '2. 고객 ID': 'customer_id',
    '5. 고객등급': 'grade',
    '날짜': 'date',
    '세션 수': 'session',
    '순 이벤트 수': 'event',
    '순 페이지뷰 수': 'pageview',
    '세션 시간': 'session_time',
    '평균 페이지에 머문 시간': 'page_time',
    '거래수': 'quantity',
    '상품 수익': 'revenue',
}
result2 = result2.dropna().rename(columns=column_names)

# Parse the date column and add the binary target column:
# 1 when the row has a positive quantity, otherwise 0.
result2 = result2.assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    target=lambda frame: np.where(frame["quantity"] > 0, 1, 0),
)

# 2022-05-15: restrict to the "Family+" grade for a trial run.
result3 = result2.loc[result2["grade"].eq("Family+")]

# Feature / target views of the Family+ subset.
df_feat = result3.drop(columns="target")
df_target = result3.loc[:, ["target"]]

# Distinct customers and dates, in order of first appearance.
customer_list = result3["customer_id"].unique()
date_range = result3["date"].unique()

# Notebook display of the customer id array.
customer_list
array(['MB202010070066006', 'MBR000000464681', 'MBR000000394781', ...,
'MB202103270198595', 'MBR000000916188', 'MB202108270376286'],
dtype=object)
# Label window: rows dated 2021-02-01 through 2021-02-07 (both ends inclusive).
mask = (result3["date"] >= "2021-02-01") & (result3["date"] <= "2021-02-07")
filtered_result = result3.loc[mask]

# Build the training target label: purchase / no purchase in the following week.
target_label_table = filtered_result.groupby('customer_id')
# Sum only the "target" column. Summing every column would also try to add up
# the datetime "date" and string "grade" columns, which recent pandas versions
# reject instead of silently dropping.
target_label_table2 = target_label_table[["target"]].sum()
target_label_column = target_label_table2["target"]

# NOTE(review): family_customer is not defined in the visible part of this
# notebook; presumably a DataFrame with a customer_id column - confirm.
target_label = pd.merge(family_customer, target_label_column, how='left', on=['customer_id'])

# Collapse purchase counts to a 0/1 flag. Customers without any row in the
# label window come out of the left merge as NaN and are filled with 0.
target_label.loc[target_label['target'] > 0, 'target'] = 1
target_label.fillna(0.0, inplace=True)
target_label = target_label.set_index("customer_id")
# Build one month (January) of data: keep the first 30 distinct dates.
# NOTE(review): date_range comes from .unique(), i.e. order of first
# appearance, so this assumes result3 is already sorted by date - confirm.
date_range = date_range[:30]

# Full customer x date grid, so every customer has a row for every day.
pro = pd.MultiIndex.from_product([customer_list, date_range], names=["customer_id", "date"])
preprocessing = pd.DataFrame(index=pro).reset_index()

# Join the feature table onto the grid to create the panel data.
merge1 = pd.merge(preprocessing, result3, how='left', on=['customer_id', 'date'])

# Days without activity are NaN after the left join. Fill each activity
# feature with that column's minimum observed value (truncated to int).
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: under pandas Copy-on-Write the chained in-place form operates on
# a temporary object and leaves merge1 unchanged.
for feature in ("session", "event", "pageview"):
    merge1[feature] = merge1[feature].fillna(int(merge1[feature].min()))
merge1["target"] = merge1["target"].fillna(0.0)

# Keep only the columns fed to the models, indexed by (customer_id, date).
merge2 = merge1.drop(columns=["grade", "session_time", "page_time", "quantity", "revenue"])
merge2 = merge2.set_index(["customer_id", "date"])

# Notebook display only: the sorted frame is not assigned back.
merge2.sort_index()
session | event | pageview | target | ||
---|---|---|---|---|---|
customer_id | date | ||||
MB202004290002256 | 2021-01-01 | 1.0 | 0.0 | 1.0 | 0.0 |
2021-01-02 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-03 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-04 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-05 | 1.0 | 0.0 | 1.0 | 0.0 | |
... | ... | ... | ... | ... | ... |
MBR000001308166 | 2021-01-26 | 1.0 | 0.0 | 1.0 | 0.0 |
2021-01-27 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-28 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-29 | 1.0 | 0.0 | 1.0 | 0.0 | |
2021-01-30 | 1.0 | 0.0 | 1.0 | 0.0 |
1058550 rows × 4 columns
# Turn the panel data into a dense training tensor (vectorisation).
res = merge2.to_xarray()

# to_array() stacks the data variables along a new leading axis, so `image`
# is laid out as (variable, customer_id, date).
feature_cube = res.to_array(dim="variable")
image = feature_cube.to_numpy()

# The models expect (customer, day, feature). The axes have to be moved, not
# reshaped: reshape() only reinterprets the flat buffer and would mix values
# from different customers, days and features. Transposing by dimension name
# also avoids hard-coding the 30 x 4 shape.
re_image = feature_cube.transpose("customer_id", "date", "variable").to_numpy()
# Split into training and test data: the first `train_split` customers are
# used for training, the remainder for testing.
train_split = 20000
train_data = re_image[:train_split]
test_data = re_image[train_split:]

# Align the labels with the feature tensor by customer id rather than by row
# position. target_label follows the row order of the table it was merged
# from, while the tensor follows the customer_id coordinate of `res`.
# NOTE(review): assumes the first axis of re_image follows res["customer_id"]
# - confirm.
customer_order = res["customer_id"].to_numpy()
target = target_label["target"].reindex(customer_order).to_numpy()
if pd.isna(target).any():
    # A customer present in the features has no label row at all.
    raise ValueError("target_label is missing customers that are present in re_image")

train_label = target[:train_split]
test_label = target[train_split:]
SMOTE 알고리즘으로 데이터 불균형 해결
pip install scipy
Requirement already satisfied: scipy in c:\users\msi\anaconda3\envs\tfstart\lib\site-packages (1.5.2)
Requirement already satisfied: numpy>=1.14.5 in c:\users\msi\anaconda3\envs\tfstart\lib\site-packages (from scipy) (1.19.2)
Note: you may need to restart the kernel to use updated packages.
WARNING: You are using pip version 21.3.1; however, version 22.1 is available.
You should consider upgrading via the 'C:\Users\MSI\Anaconda3\envs\tfstart\python.exe -m pip install --upgrade pip' command.
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from collections import Counter
# Kept for compatibility with other cells; this configured sampler is not the
# one used for the resampling below.
sm = SMOTE(sampling_strategy=0.5, random_state=10)

# train_data is (samples, time steps, features).
dim_1, dim_2, dim_3 = np.array(train_data).shape
new_dim = dim_1 * dim_2

# SMOTE needs 2-D input, so every time step becomes its own row and each
# sample's label is repeated once per time step.
# NOTE(review): because rows are resampled per time step, the synthetic rows
# that are later regrouped into windows of dim_2 steps do not come from one
# customer's history. Resampling on (dim_1, dim_2 * dim_3) would keep
# sequences intact - confirm which is intended.
new_x_train = np.array(train_data).reshape(new_dim, dim_3)
new_y_train = np.repeat(train_label, dim_2)

# Fixed seed so the oversampled training set is reproducible between runs.
oversample = SMOTE(random_state=10)
X_Train, Y_Train = oversample.fit_resample(new_x_train, new_y_train)

# Summarize the new class distribution.
counter = Counter(Y_Train)
print('The number of samples in TRAIN: ', counter)
The number of samples in TRAIN: Counter({1.0: 577290, 0.0: 577290})
# Regroup the resampled rows into windows of dim_2 time steps.
n_sequences = X_Train.shape[0] // dim_2
x_train_SMOTE = X_Train.reshape(n_sequences, dim_2, dim_3)

# One row of labels per window; reshaped once instead of once per iteration.
labels_per_sequence = np.asarray(Y_Train).reshape(n_sequences, dim_2)

# Check: every time step inside a window must carry the same label.
has_mixed_labels = (labels_per_sequence != labels_per_sequence[:, :1]).any(axis=1)
if has_mixed_labels.any():
    print('\n\n********* STOP: THERE IS SOMETHING WRONG IN TRAIN ******\n\n')

# Take exactly one label per window so that y_train_SMOTE always has the same
# length as x_train_SMOTE. Collecting set(labels) per window would add two
# entries for a mixed window and shift every later label by one.
y_train_SMOTE = labels_per_sequence[:, 0].copy()
CNN 으로 학습하기
# CNN
import tensorflow as tf
import tensorflow_addons as tfa
C:\Users\MSI\Anaconda3\envs\tfstart\lib\site-packages\tensorflow_addons\utils\ensure_tf_install.py:53: UserWarning: Tensorflow Addons supports using Python ops for all Tensorflow versions above or equal to 2.7.0 and strictly below 2.10.0 (nightly versions are not supported).
The versions of TensorFlow you are currently using is 2.3.0 and is not supported.
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version.
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
warnings.warn(
# 1-D CNN over the (30 time steps, 4 features) windows: three
# convolution + max-pooling stages followed by a sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(32, 3, padding="same", activation='relu', input_shape=(30, 4)))
model.add(tf.keras.layers.MaxPooling1D(2))
model.add(tf.keras.layers.Conv1D(64, 3, padding="same", activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(2))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Conv1D(128, 3, padding="same", activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(2))
# NOTE(review): this Dense layer comes before Flatten, so it is applied to
# each remaining time step separately - confirm this ordering is intended.
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv1d (Conv1D) (None, 30, 32) 416
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 15, 32) 0
_________________________________________________________________
conv1d_1 (Conv1D) (None, 15, 64) 6208
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 7, 64) 0
_________________________________________________________________
dropout (Dropout) (None, 7, 64) 0
_________________________________________________________________
conv1d_2 (Conv1D) (None, 7, 128) 24704
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3, 128) 0
_________________________________________________________________
dense (Dense) (None, 3, 128) 16512
_________________________________________________________________
flatten (Flatten) (None, 384) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 385
=================================================================
Total params: 48,225
Trainable params: 48,225
Non-trainable params: 0
_________________________________________________________________
# Compile and train the CNN on the SMOTE-balanced training set.
# BinaryAccuracy thresholds the sigmoid output at 0.5 before comparing it with
# the 0/1 label. Accuracy() checks exact equality between the raw probability
# and the label, which is almost never true and reports a meaningless value.
# The metric keeps the name "accuracy" so log keys stay the same.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
    ],
)
model.fit(x_train_SMOTE, y_train_SMOTE, epochs=10)
Epoch 1/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3627 - accuracy: 0.1172 - precision_2: 0.8500 - recall_2: 0.7200 - f1_score: 0.7796: 1s - loss: 0.3654 - accuracy: 0.1173 - precision
Epoch 2/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3624 - accuracy: 0.1177 - precision_2: 0.8606 - recall_2: 0.7092 - f1_score: 0.7776
Epoch 3/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3593 - accuracy: 0.1207 - precision_2: 0.8693 - recall_2: 0.7042 - f1_score: 0.7780
Epoch 4/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3590 - accuracy: 0.1230 - precision_2: 0.8607 - recall_2: 0.7122 - f1_score: 0.7794
Epoch 5/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3580 - accuracy: 0.1206 - precision_2: 0.8577 - recall_2: 0.7177 - f1_score: 0.7815
Epoch 6/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3560 - accuracy: 0.1201 - precision_2: 0.8780 - recall_2: 0.6980 - f1_score: 0.7777
Epoch 7/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3550 - accuracy: 0.1166 - precision_2: 0.8683 - recall_2: 0.7055 - f1_score: 0.7785
Epoch 8/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3542 - accuracy: 0.1178 - precision_2: 0.8711 - recall_2: 0.7081 - f1_score: 0.7812
Epoch 9/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3530 - accuracy: 0.1247 - precision_2: 0.8691 - recall_2: 0.7093 - f1_score: 0.7811
Epoch 10/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3520 - accuracy: 0.1250 - precision_2: 0.8801 - recall_2: 0.7002 - f1_score: 0.7799
<tensorflow.python.keras.callbacks.History at 0x2aa9ed88ee0>
# evaluate() returns the loss followed by one value per compiled metric
# (accuracy, precision, recall, F1), so unpacking into only two names raises
# "too many values to unpack". Keep the first two and collect the rest.
test_loss, test_acc, *test_other_metrics = model.evaluate(test_data, test_label, verbose=2)
478/478 - 0s - loss: 0.1671 - accuracy: 0.1576 - precision_2: 0.0000e+00 - recall_2: 0.0000e+00 - f1_score: 0.0000e+00
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-59-8b4395f9a420> in <module>
----> 1 test_loss, test_acc = model.evaluate(test_data, test_label, verbose=2)
ValueError: too many values to unpack (expected 2)
VGG 모델 적용
# VGG-style 1-D network: two blocks of stacked convolutions, each followed by
# max pooling and dropout, then two dense layers and a sigmoid output unit.
model = tf.keras.Sequential()

# Convolution block 1
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=(30, 4)))
model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPool1D(pool_size=2))
model.add(tf.keras.layers.Dropout(rate=0.5))

# Convolution block 2 (the second convolution uses 'valid' padding)
model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(tf.keras.layers.Conv1D(filters=256, kernel_size=3, padding='valid', activation='relu'))
model.add(tf.keras.layers.MaxPool1D(pool_size=2))
model.add(tf.keras.layers.Dropout(rate=0.5))

# Classifier head
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=512, activation='relu'))
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Dense(units=256, activation='relu'))
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Compile and train the VGG-style model on the SMOTE-balanced training set.
# BinaryAccuracy thresholds the sigmoid output at 0.5 before comparing it with
# the 0/1 label. Accuracy() checks exact equality between the raw probability
# and the label, which is almost never true and reports a meaningless value.
# The metric keeps the name "accuracy" so log keys stay the same.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
    ],
)
model.fit(x_train_SMOTE, y_train_SMOTE, epochs=10)
Epoch 1/10
1203/1203 [==============================] - 11s 10ms/step - loss: 0.4685 - accuracy: 0.0176 - precision_3: 0.6813 - recall_3: 0.9111 - f1_score: 0.7796
Epoch 2/10
1203/1203 [==============================] - 11s 9ms/step - loss: 0.4001 - accuracy: 0.0333 - precision_3: 0.8060 - recall_3: 0.7320 - f1_score: 0.7672
Epoch 3/10
1203/1203 [==============================] - 11s 9ms/step - loss: 0.3891 - accuracy: 0.0293 - precision_3: 0.8429 - recall_3: 0.7062 - f1_score: 0.7686
Epoch 4/10
1203/1203 [==============================] - 11s 9ms/step - loss: 0.3855 - accuracy: 0.0385 - precision_3: 0.8507 - recall_3: 0.7036 - f1_score: 0.7702
Epoch 5/10
1203/1203 [==============================] - 11s 9ms/step - loss: 0.3849 - accuracy: 0.0501 - precision_3: 0.8568 - recall_3: 0.6961 - f1_score: 0.7681
Epoch 6/10
1203/1203 [==============================] - 12s 10ms/step - loss: 0.3836 - accuracy: 0.1836 - precision_3: 0.8603 - recall_3: 0.6955 - f1_score: 0.7691
Epoch 7/10
1203/1203 [==============================] - 11s 9ms/step - loss: 0.3814 - accuracy: 0.1957 - precision_3: 0.8637 - recall_3: 0.6964 - f1_score: 0.7711
Epoch 8/10
1203/1203 [==============================] - 11s 10ms/step - loss: 0.3813 - accuracy: 0.1265 - precision_3: 0.8621 - recall_3: 0.6960 - f1_score: 0.7702
Epoch 9/10
1203/1203 [==============================] - 12s 10ms/step - loss: 0.3789 - accuracy: 0.1084 - precision_3: 0.8643 - recall_3: 0.6912 - f1_score: 0.7681
Epoch 10/10
1203/1203 [==============================] - 12s 10ms/step - loss: 0.3777 - accuracy: 0.0721 - precision_3: 0.8698 - recall_3: 0.6925 - f1_score: 0.7711
<tensorflow.python.keras.callbacks.History at 0x2aa9fb87f40>
Vanilla LSTM 적용
# Two stacked LSTM layers followed by a sigmoid output unit. The first LSTM
# returns the full sequence so the second one receives one vector per step.
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(64, input_shape=(30, 4), return_sequences=True))
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
# Compile and train the LSTM model on the SMOTE-balanced training set.
# BinaryAccuracy thresholds the sigmoid output at 0.5 before comparing it with
# the 0/1 label. Accuracy() checks exact equality between the raw probability
# and the label, which is almost never true and reports a meaningless value.
# The metric keeps the name "accuracy" so log keys stay the same.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
    ],
)
model.fit(x_train_SMOTE, y_train_SMOTE, epochs=10)
Epoch 1/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4948 - accuracy: 0.0000e+00 - precision_4: 0.6536 - recall_4: 0.9990 - f1_score: 0.7902
Epoch 2/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4960 - accuracy: 0.0000e+00 - precision_4: 0.6540 - recall_4: 0.9986 - f1_score: 0.7904
Epoch 3/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4952 - accuracy: 0.0000e+00 - precision_4: 0.6547 - recall_4: 0.9992 - f1_score: 0.7911
Epoch 4/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4911 - accuracy: 0.0000e+00 - precision_4: 0.6567 - recall_4: 0.9993 - f1_score: 0.7925
Epoch 5/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4970 - accuracy: 0.0000e+00 - precision_4: 0.6584 - recall_4: 0.9892 - f1_score: 0.79062s - loss: 0.4978
Epoch 6/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4847 - accuracy: 0.0000e+00 - precision_4: 0.6637 - recall_4: 0.9944 - f1_score: 0.7960
Epoch 7/10
1203/1203 [==============================] - 19s 16ms/step - loss: 0.4739 - accuracy: 0.0000e+00 - precision_4: 0.6674 - recall_4: 0.9804 - f1_score: 0.7942
Epoch 8/10
1203/1203 [==============================] - 18s 15ms/step - loss: 0.4523 - accuracy: 0.0000e+00 - precision_4: 0.6877 - recall_4: 0.9093 - f1_score: 0.78321s - loss: 0.4497 - accuracy: 0.00
Epoch 9/10
1203/1203 [==============================] - 19s 15ms/step - loss: 0.4357 - accuracy: 0.0000e+00 - precision_4: 0.7075 - recall_4: 0.8947 - f1_score: 0.7902
Epoch 10/10
1203/1203 [==============================] - 19s 16ms/step - loss: 0.4056 - accuracy: 0.0000e+00 - precision_4: 0.7614 - recall_4: 0.7974 - f1_score: 0.7790
<tensorflow.python.keras.callbacks.History at 0x2aab0e20100>
제안 모델 CNN + LSTM
# Proposed model: one convolution stage (with batch normalisation, pooling
# and light dropout) feeding an LSTM whose per-step outputs are flattened
# into a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=(30, 4)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling1D(4))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.LSTM(16, return_sequences=True))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
# Compile and train the CNN + LSTM model on the SMOTE-balanced training set.
# BinaryAccuracy thresholds the sigmoid output at 0.5 before comparing it with
# the 0/1 label. Accuracy() checks exact equality between the raw probability
# and the label, which is almost never true and reports a meaningless value.
# The metric keeps the name "accuracy" so log keys stay the same.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
    ],
)
model.fit(x_train_SMOTE, y_train_SMOTE, epochs=10)
Epoch 1/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4750 - accuracy: 0.0000e+00 - precision_6: 0.6762 - recall_6: 0.9265 - f1_score: 0.7818
Epoch 2/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4364 - accuracy: 0.0000e+00 - precision_6: 0.7113 - recall_6: 0.8487 - f1_score: 0.7739
Epoch 3/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4219 - accuracy: 0.0000e+00 - precision_6: 0.7364 - recall_6: 0.8102 - f1_score: 0.7715
Epoch 4/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4126 - accuracy: 0.0000e+00 - precision_6: 0.7512 - recall_6: 0.7927 - f1_score: 0.7714
Epoch 5/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4055 - accuracy: 0.0000e+00 - precision_6: 0.7639 - recall_6: 0.7767 - f1_score: 0.7703
Epoch 6/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.4038 - accuracy: 0.0000e+00 - precision_6: 0.7770 - recall_6: 0.7612 - f1_score: 0.7690
Epoch 7/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3986 - accuracy: 0.0000e+00 - precision_6: 0.7887 - recall_6: 0.7475 - f1_score: 0.7675
Epoch 8/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3959 - accuracy: 0.0000e+00 - precision_6: 0.7879 - recall_6: 0.7489 - f1_score: 0.7679
Epoch 9/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3923 - accuracy: 0.0000e+00 - precision_6: 0.7945 - recall_6: 0.7443 - f1_score: 0.7686
Epoch 10/10
1203/1203 [==============================] - 4s 3ms/step - loss: 0.3922 - accuracy: 0.0000e+00 - precision_6: 0.7979 - recall_6: 0.7440 - f1_score: 0.7700
<tensorflow.python.keras.callbacks.History at 0x2aab2e47df0>