import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors


cpa = pd.read_csv('/content/drive/MyDrive/data/customer_personality_analysis.csv', sep="\t")
cpa.head()


cpa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB


print(cpa.shape)
cpa = cpa.dropna()
print(cpa.shape)

(2240, 29)
(2216, 29)


cpa['Dt_Customer'] = pd.to_datetime(cpa['Dt_Customer'])
cpa['Dt_Customer']

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

0      2012-04-09
1      2014-08-03
2      2013-08-21
3      2014-10-02
4      2014-01-19
          ...    
2235   2013-06-13
2236   2014-10-06
2237   2014-01-25
2238   2014-01-24
2239   2012-10-15
Name: Dt_Customer, Length: 2216, dtype: datetime64[ns]


max(cpa['Dt_Customer'])

Timestamp('2014-12-06 00:00:00')


cpa['Customer_For'] = cpa['Dt_Customer'].apply(lambda x : max(cpa['Dt_Customer']) - x)
cpa['Customer_For']

0      971 days
1      125 days
2      472 days
3       65 days
4      321 days
         ...   
2235   541 days
2236    61 days
2237   315 days
2238   316 days
2239   782 days
Name: Customer_For, Length: 2216, dtype: timedelta64[ns]


# 날짜를 숫자로 type 을 변경해준다
cpa["Customer_For"] = pd.to_numeric(cpa["Customer_For"], errors="coerce")
cpa["Customer_For"] 

# errors: error는 총 3개의 옵션이 존재합니다.
# errors = 'ignore' -> 만약 숫자로 변경할 수 없는 데이터라면 숫자로 변경하지 않고 원본 데이터를 그대로 반환합니다.
# errors = 'coerce' -> 만약 숫자로 변경할 수 없는 데이터라면 기존 데이터를 지우고 NaN으로 설정하여 반환합니다.
# errors = 'raise' -> 만약 숫자로 변경할 수 없는 데이터라면 에러를 일으키며 코드를 중단합니다.

0       83894400000000000
1       10800000000000000
2       40780800000000000
3        5616000000000000
4       27734400000000000
              ...        
2235    46742400000000000
2236     5270400000000000
2237    27216000000000000
2238    27302400000000000
2239    67564800000000000
Name: Customer_For, Length: 2216, dtype: int64


cpa["Marital_Status"].value_counts()

Married     857
Together    573
Single      471
Divorced    232
Widow        76
Alone         3
Absurd        2
YOLO          2
Name: Marital_Status, dtype: int64


# Marital_Status 정리하기, 파트너와 같이 사는지, 혼자사는지 여부
cpa["Living_With"]=(
    cpa["Marital_Status"]
    .replace(
      {"Married":"Partner", 
       "Together":"Partner", 
       "Absurd":"Alone", 
       "Widow":"Alone", 
       "YOLO":"Alone", 
       "Divorced":"Alone", 
       "Single":"Alone"
    })
)

# the number of children, Kidhome 과 Teenhome 을 분리하지 않고 합쳐준다.
cpa["Children"]=cpa["Kidhome"]+cpa["Teenhome"]

# 위의 데이터를 통해 가족 사이즈도 구할 수 있다.
cpa["Family_Size"] = (
    cpa["Living_With"].replace({"Alone": 1, "Partner":2})
    + cpa["Children"]
)

# 아이가 있는지, 없는지
cpa["Is_Parent"] = np.where(cpa.Children> 0, 1, 0)


cpa["Education"].value_counts()

Graduation    1116
PhD            481
Master         365
2n Cycle       200
Basic           54
Name: Education, dtype: int64


# 교육 상태 정리하기
cpa["Education"]=cpa["Education"].replace({"Basic":"Undergraduate",
                                           "2n Cycle":"Undergraduate", 
                                           "Graduation":"Graduate", 
                                           "Master":"Postgraduate", 
                                           "PhD":"Postgraduate"})


# 생년 월일을 통해 나이를 구할 수 있다.
cpa["Age"] = 2022-cpa['Year_Birth']

# 다양한 잡화 구매를 더해서 총 사용한 비용 Spent 를 구한다.
cpa["Spent"] = cpa["MntWines"]+ cpa["MntFruits"]+ cpa["MntMeatProducts"]+ cpa["MntFishProducts"]+ cpa["MntSweetProducts"]+ cpa["MntGoldProds"]


# 모두 동일한 값, 필요없는 컬럼
cpa.Z_CostContact.value_counts()
cpa.Z_Revenue.value_counts()

11    2216
Name: Z_Revenue, dtype: int64


# 컬럼명 짧게 변경
cpa=cpa.rename(columns={"MntWines": "Wines","MntFruits":"Fruits","MntMeatProducts":"Meat","MntFishProducts":"Fish","MntSweetProducts":"Sweets","MntGoldProds":"Gold"})

# 중복되거나 필요없는 컬럼 제거
cpa = cpa.drop(["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"], axis=1)


cpa.head()


cpa.describe()


cpa.hist(figsize=(20, 20), bins=50)
plt.show()


# Dropping the outliers by setting a cap on Age and income. 
cpa = cpa[(cpa["Age"]< 90)]
cpa = cpa[(cpa["Income"] < 600000)]


columns = ['Income', 'Kidhome', 'Teenhome', 'Recency', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Customer_For', 'Children',
       'Family_Size', 'Is_Parent', 'Age', 'Spent']

plt.figure(figsize=(20,20))

sns.heatmap(cpa[columns].corr(), 
            linewidths = 0.1,
            square = True,
            annot = True,
            fmt = '.2f',
            cmap='Blues')

<matplotlib.axes._subplots.AxesSubplot at 0x7f933bb854d0>


s = (cpa.dtypes == 'object')


# 데이터 중에서 object 의 데이터 형태를 가지고 있는 컬럼을 가져와라
s[s.values == True].index

Index(['Education', 'Living_With'], dtype='object')


cpa[['Education', 'Living_With']]


# 카테고리 데이터에 라벨 인코더 사용하기
# 라벨 인코더란? 카테고리형 데이터에 숫자를 매핑하여 바꿔준다. 

from sklearn.preprocessing import LabelEncoder 

LE = LabelEncoder()

# Education 카테고리 -> 숫자로 바꿔주기
cpa['Education']=cpa[['Education']].apply(LE.fit_transform)

# Living_With 카테고리 -> 숫자로 바꿔주기
cpa['Living_With']=cpa[['Living_With']].apply(LE.fit_transform)


print(LE.classes_)
print(LE.inverse_transform([0,1]))

['Alone' 'Partner']
['Alone' 'Partner']


ds = cpa.copy()

# 나중에 클러스터 별 캠페인 반응률을 살펴보기 위해 사용
# 클러스터를 구성하기 위한 피쳐에서는 제거한다.
cols_del = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2', 'Complain', 'Response']
ds = ds.drop(cols_del, axis=1)


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(ds) # mean, variance 계산 

# #Scaled data to be used for reducing the dimensionality
scaled_ds = pd.DataFrame(scaler.transform(ds), columns=ds.columns)


scaled_ds.head()


# PCA로 3차원으로 데이터를 줄인다.

from sklearn.decomposition import PCA 

pca = PCA(n_components=3)
pca_ds = pd.DataFrame(pca.fit_transform(scaled_ds), columns=['pca_x', 'pca_y', 'pca_z'])
pca_ds.head()


#A 3D Projection Of Data In The Reduced Dimension
x = pca_ds["pca_x"]
y = pca_ds["pca_y"]
z = pca_ds["pca_z"]

#3D 플랏 그리기
# https://www.python-graph-gallery.com/370-3d-scatterplot
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z, c="maroon", marker="o" )
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()


from sklearn.cluster import KMeans

distortions = [] # 군집 내 분산 WSS  

# 클러스터 개수 1 ~ 10 까지 늘려보면서
# 클러스터 내 거리합을 저장한다.

K = range(1, 10)

for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(pca_ds)
    distortions.append(kmeanModel.inertia_) # within-cluster sum-of-squares criterion


plt.figure(figsize=(8,6))
plt.plot(K, distortions) #plt.plot(x, y)
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()


from sklearn.cluster import AgglomerativeClustering

# 유사도 기준: affinity = euclidean
ac = AgglomerativeClustering(n_clusters=4) # 3으로 해도 된다. 

ac.fit(pca_ds)

# labels_값을 cluster 칼럼으로 지정 
pca_ds["Clusters"] = ac.labels_

# 원래 데이터에도 넣어준다 
cpa["Clusters"] = ac.labels_


#Plotting the clusters
cmap = colors.ListedColormap(["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"])

fig = plt.figure(figsize=(10,8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, 
           c=pca_ds["Clusters"], # 클러스터로 점들의 색상을 표현한다. 
           marker='o', cmap=cmap)
ax.set_title("The Plot Of The Clusters")
plt.show()


# 그룹이 골고루 분포되어 있을까?
pl = sns.countplot(x = cpa['Clusters'])
pl.set_title('Distribution Of The Clusters')
plt.show()


# 클러스터별 구매 금액과 연간 수입 
pl = sns.scatterplot(data=cpa, x=cpa['Spent'], y=cpa['Income'], hue=cpa['Clusters'])
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()


# 구매를 많이하는 고객층은?
sns.boxenplot(x=cpa["Clusters"], y=cpa["Spent"])

<matplotlib.axes._subplots.AxesSubplot at 0x7f933cea82d0>


# 어떤 클러스터에서 캠페인 반응이 높을까?
cpa["Total_Promos"] = cpa["AcceptedCmp1"]+ cpa["AcceptedCmp2"]+ cpa["AcceptedCmp3"]+ cpa["AcceptedCmp4"]+ cpa["AcceptedCmp5"]

plt.figure()
pl = sns.countplot(x=cpa["Total_Promos"],hue=cpa["Clusters"])
pl.set_title("Count Of Promotion Accepted")
pl.set_xlabel("Number Of Total Accepted Promotions")
plt.show()


# 할인에 잘 반응한 그룹은?
plt.figure()
pl=sns.boxenplot(y=cpa["NumDealsPurchases"],x=cpa["Clusters"])
pl.set_title("Number of Deals Purchased")
plt.show()


# 가족 구성원과 나이대는 어떻게 될까? 
fig, axs = plt.subplots(figsize=(20, 10), ncols=5, nrows=2)
features = [ "Income","Kidhome","Teenhome","Customer_For", "Age", "Children", "Family_Size", "Is_Parent", "Education","Living_With"]

for i, feature in enumerate(features): 
    row = int(i/5)
    col = i%5
    sns.boxplot(x=cpa['Clusters'], y=cpa[feature], ax=axs[row][col])


from sklearn.cluster import DBSCAN 

dbscan = DBSCAN(eps=0.30, # 일반적으로 eps는 1 이하의 값을 설정  
                min_samples=9, metric='euclidean') # 최소 이웃 개수 

dbscan.fit(pca_ds)

# labels_값을 cluster 칼럼으로 지정 
pca_ds["DBSCAN_Clusters"] = dbscan.labels_

# 원래 데이터에도 넣어준다 
cpa["DBSCAN_Clusters"] = dbscan.labels_


fig = plt.figure(figsize=(10,8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, 
           c=pca_ds["DBSCAN_Clusters"], 
           marker='o', cmap = 'viridis' )
ax.set_title("The Plot Of The Clusters")
plt.show()


#Plotting countplot of clusters
pl = sns.countplot(x=cpa["DBSCAN_Clusters"])
pl.set_title("Distribution Of The Clusters")
plt.show()


pl = sns.scatterplot(data=cpa, x=cpa["Spent"], y=cpa["Income"], hue=cpa["DBSCAN_Clusters"])
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()


kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, random_state=0) 
kmeans.fit(pca_ds)

# labels_값을 cluster 칼럼으로 지정 
pca_ds["Kmeans_Clusters"] = kmeans.labels_

# 원래 데이터에도 넣어준다 
cpa["Kmeans_Clusters"] = kmeans.labels_


#Plotting countplot of clusters
pl = sns.countplot(x=cpa["Kmeans_Clusters"])
pl.set_title("Distribution Of The Clusters")
plt.show()


pl = sns.scatterplot(data=cpa,x=cpa["Spent"],y=cpa["Income"],hue=cpa["Kmeans_Clusters"])
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()


from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components = 3, # gaussian mixture의 모델의 총 개수 
                random_state=0)

gmm.fit(pca_ds)
gmm_cluster_labels = gmm.predict(pca_ds)

# labels_값을 cluster 칼럼으로 지정 
pca_ds["GMM_Clusters"] = gmm_cluster_labels

# 원래 데이터에도 넣어준다 
cpa["GMM_Clusters"] = gmm_cluster_labels


#Plotting countplot of clusters
pl = sns.countplot(x=cpa["GMM_Clusters"])
pl.set_title("Distribution Of The Clusters")
plt.show()


pl = sns.scatterplot(data=cpa,x=cpa["Spent"],y=cpa["Income"],hue=cpa["GMM_Clusters"])
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()


# 실루엣 분석 평가 지표 값을 구하기 위한 API 추가 
from sklearn.metrics import silhouette_samples, silhouette_score

# 모든 개별 데이터의 실루엣 계수 값
score_samples = silhouette_samples(pca_ds, pca_ds["Clusters"])
pca_ds['Hierachical_silhouette_coeff'] = score_samples

score_samples = silhouette_samples(pca_ds, pca_ds["DBSCAN_Clusters"])
pca_ds['DBSCAN_silhouette_coeff'] = score_samples

score_samples = silhouette_samples(pca_ds, pca_ds["Kmeans_Clusters"])
pca_ds['Kmeans_silhouette_coeff'] = score_samples


# Hierachical 모든 데이터의 평균 실루엣 계수 값
average_score = silhouette_score(pca_ds, pca_ds["Clusters"])
print("Hierachical {0:.4f}".format(average_score))
# 군집별 평균 실루엣 계수 값 
print(pca_ds.groupby('Clusters')['Hierachical_silhouette_coeff'].mean(),'\n')

# Density - DBSCAN 모든 데이터의 평균 실루엣 계수 값
average_score = silhouette_score(pca_ds, pca_ds["DBSCAN_Clusters"])
print("Density {0:.4f}".format(average_score))
# 군집별 평균 실루엣 계수 값 
print(pca_ds.groupby('DBSCAN_Clusters')['DBSCAN_silhouette_coeff'].mean(),'\n')

# Kmeans 모든 데이터의 평균 실루엣 계수 값
average_score = silhouette_score(pca_ds, pca_ds["Kmeans_Clusters"])
print("Kmeans {0:.4f}".format(average_score))
# 군집별 평균 실루엣 계수 값  
print(pca_ds.groupby('Kmeans_Clusters')['Kmeans_silhouette_coeff'].mean(),'\n')

Hierachical 0.4152
Clusters
0    0.404362
1    0.372097
2    0.312544
3    0.602977
Name: Hierachical_silhouette_coeff, dtype: float64 

Density 0.0610
DBSCAN_Clusters
-1   -0.048433
 0    0.454229
 1    0.620734
 2    0.774115
 3    0.884097
 4    0.908008
 5    0.948117
 6    0.915586
 7    0.913140
Name: DBSCAN_silhouette_coeff, dtype: float64 

Kmeans 0.4301
Kmeans_Clusters
0    0.317607
1    0.549122
2    0.416198
3    0.431250
Name: Kmeans_silhouette_coeff, dtype: float64

	Income	Kidhome	Teenhome	Recency	Wines	Fruits	Meat	Fish	Sweets	Gold	...	AcceptedCmp1	AcceptedCmp2	Complain	Response	Customer_For	Children	Family_Size	Is_Parent	Age	Spent
count	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000	...	2216.000000	2216.000000	2216.000000	2216.000000	2.216000e+03	2216.000000	2216.000000	2216.000000	2216.000000	2216.000000
mean	52247.251354	0.441787	0.505415	49.012635	305.091606	26.356047	166.995939	37.637635	27.028881	43.965253	...	0.064079	0.013538	0.009477	0.150271	4.423735e+16	0.947202	2.592509	0.714350	53.179603	607.075361
std	25173.076661	0.536896	0.544181	28.948352	337.327920	39.793917	224.283273	54.752082	41.072046	51.815414	...	0.244950	0.115588	0.096907	0.357417	2.008532e+16	0.749062	0.905722	0.451825	11.985554	602.900476
min	1730.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000e+00	0.000000	1.000000	0.000000	26.000000	5.000000
25%	35303.000000	0.000000	0.000000	24.000000	24.000000	2.000000	16.000000	3.000000	1.000000	9.000000	...	0.000000	0.000000	0.000000	0.000000	2.937600e+16	0.000000	2.000000	0.000000	45.000000	69.000000
50%	51381.500000	0.000000	0.000000	49.000000	174.500000	8.000000	68.000000	12.000000	8.000000	24.500000	...	0.000000	0.000000	0.000000	0.000000	4.432320e+16	1.000000	3.000000	1.000000	52.000000	396.500000
75%	68522.000000	1.000000	1.000000	74.000000	505.000000	33.000000	232.250000	50.000000	33.000000	56.000000	...	0.000000	0.000000	0.000000	0.000000	5.927040e+16	1.000000	3.000000	1.000000	63.000000	1048.000000
max	666666.000000	2.000000	2.000000	99.000000	1493.000000	199.000000	1725.000000	259.000000	262.000000	321.000000	...	1.000000	1.000000	1.000000	1.000000	9.184320e+16	3.000000	5.000000	1.000000	129.000000	2525.000000

	Education	Income	Kidhome	Teenhome	Recency	Wines	Fruits	Meat	Fish	Sweets	...	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	Customer_For	Living_With	Children	Family_Size	Is_Parent	Age	Spent
0	-0.893586	0.287105	-0.822754	-0.929699	0.310353	0.977660	1.552041	1.690293	2.453472	1.483713	...	2.503607	-0.555814	0.692181	1.973583	-1.349603	-1.264598	-1.758359	-1.581139	1.018352	1.676245
1	-0.893586	-0.260882	1.040021	0.908097	-0.380813	-0.872618	-0.637461	-0.718230	-0.651004	-0.634019	...	-0.571340	-1.171160	-0.132545	-1.665144	-1.349603	1.404572	0.449070	0.632456	1.274785	-0.963297
2	-0.893586	0.913196	-0.822754	-0.929699	-0.795514	0.357935	0.570540	-0.178542	1.339513	-0.147184	...	-0.229679	1.290224	-0.544908	-0.172664	0.740959	-1.264598	-0.654644	-1.581139	0.334530	0.280110
3	-0.893586	-1.176114	1.040021	-0.929699	-0.795514	-0.872618	-0.561961	-0.655787	-0.504911	-0.585335	...	-0.913000	-0.555814	0.279818	-1.923210	0.740959	0.069987	0.449070	0.632456	-1.289547	-0.920135
4	0.571657	0.294307	1.040021	-0.929699	1.554453	-0.392257	0.419540	-0.218684	0.152508	-0.001133	...	0.111982	0.059532	-0.132545	-0.822130	0.740959	0.069987	0.449070	0.632456	-1.033114	-0.307562

	pca_x	pca_y	pca_z
0	4.994347	-0.151240	2.648869
1	-2.884454	-0.006836	-1.860486
2	2.617862	-0.720369	-0.257159
3	-2.676035	-1.542089	-0.922223
4	-0.649591	0.209823	-0.019256

티스토리

차원 축소와 군집화

데이터 정제¶

결측치 제거하기¶

날짜 데이터 정제하기¶

카테고리 데이터 정리하기¶

이상치 제거하기¶

상관 관계¶

데이터 전처리¶

카테고리 데이터 전처리하기¶

피쳐 간 규모를 맞추기 위해 Scaling 하기¶

차원 축소¶

공분산 행렬의 의미¶

주성분 분석 (PCA)¶

클러스터링¶

Elbow Method¶

Hierarchical Clustering¶

Density Based Clustering¶

Partition Based Clustering¶

가우시안 혼합 모델 (Gaussian Mixture Model)¶

최대 우도법 (Maximum Likelihood Estimation)¶

나이브 베이즈 분류기¶

실루엣 분석¶

	ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	...	NumWebVisitsMonth	Z_CostContact	Z_Revenue	Response
0	5524	1957	Graduation	Single	58138.0	0	0	04-09-2012	58	635	...	7	3	11	1
1	2174	1954	Graduation	Single	46344.0	1	1	08-03-2014	38	11	...	5	3	11	0
2	4141	1965	Graduation	Together	71613.0	0	0	21-08-2013	26	426	...	4	3	11	0
3	6182	1984	Graduation	Together	26646.0	1	0	10-02-2014	26	11	...	6	3	11	0
4	5324	1981	PhD	Married	58293.0	1	0	19-01-2014	94	173	...	5	3	11	0

	Education	Income	Kidhome	Teenhome	Recency	Wines	Fruits	Meat	Fish	Sweets	...	Response	Customer_For	Living_With	Children	Family_Size	Is_Parent	Age	Spent
0	Graduate	58138.0	0	0	58	635	88	546	172	88	...	1	83894400000000000	Alone	0	1	0	65	1617
1	Graduate	46344.0	1	1	38	11	1	6	2	1	...	0	10800000000000000	Alone	2	3	1	68	27
2	Graduate	71613.0	0	0	26	426	49	127	111	21	...	0	40780800000000000	Partner	0	2	0	57	776
3	Graduate	26646.0	1	0	26	11	4	20	10	3	...	0	5616000000000000	Partner	1	3	1	38	53
4	Postgraduate	58293.0	1	0	94	173	43	118	46	27	...	0	27734400000000000	Partner	1	3	1	41	422