# 모듈 로드

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 데이터 로드

data = pd.read_csv('advertising.csv')


data


data.head()


data.tail()


data.sample()


data.shape

(1000, 10)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       916 non-null    float64
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 78.2+ KB


data.describe()


sns.displot(x=data['Area Income'])

<seaborn.axisgrid.FacetGrid at 0x28679e15ee0>


sns.displot(x=data['Age'])

<seaborn.axisgrid.FacetGrid at 0x2867a367220>


data['Country'].nunique()

237


data['City'].nunique()

969


data['Ad Topic Line'].nunique()

1000


# Text Data인 Country, City, Ad Topic Line column의 경우 고유값이 너무 많으므로
# 본 프로젝트에서는 제거하여 사용하겠다. 

data.drop(['Country', 'City', 'Ad Topic Line'], axis=1)


data.isna().sum() / len(data)

Daily Time Spent on Site    0.000
Age                         0.084
Area Income                 0.000
Daily Internet Usage        0.000
Ad Topic Line               0.000
City                        0.000
Male                        0.000
Country                     0.000
Timestamp                   0.000
Clicked on Ad               0.000
dtype: float64


data['Age'].mean()

36.12882096069869


data['Age'].median()

35.0


# 프로젝트 진행을 위해 Age column의 결측치를 나이의 평균으로 대치(impute)한다.

data = data.fillna(round(data['Age'].mean()))


data


# 모든 결측치를 대치했음을 확인

data.isna().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64


from sklearn.model_selection import train_test_split


X = data[['Daily Time Spent on Site','Age','Area Income', 'Daily Internet Usage','Male']]
y = data['Clicked on Ad']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)


from sklearn.linear_model import LogisticRegression


model = LogisticRegression().fit(X_train, y_train)


# Coefficient 확인

model.coef_

array([[-6.64737762e-02,  2.66015818e-01, -1.15501902e-05,
        -2.44285539e-02,  2.00758165e-03]])


predictions = model.predict(X_test)


y_test

249    1
353    0
537    0
424    1
564    1
      ..
684    1
644    0
110    1
28     1
804    1
Name: Clicked on Ad, Length: 200, dtype: int64


from sklearn.metrics import accuracy_score, confusion_matrix


accuracy_score(predictions, y_test)

0.9


# confusion matrix: array([[TP, FN], [FP, TN]])

confusion_matrix(predictions, y_test)

array([[92, 12],
       [ 8, 88]], dtype=int64)


# 모듈 로드

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 데이터 로드

data = pd.read_csv('ecommerce.csv')


data


data.head()


data.tail()


data.sample()


data.shape

(500, 8)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Email                 500 non-null    object 
 1   Address               500 non-null    object 
 2   Avatar                500 non-null    object 
 3   Avg. Session Length   500 non-null    float64
 4   Time on App           500 non-null    float64
 5   Time on Website       500 non-null    float64
 6   Length of Membership  500 non-null    float64
 7   Yearly Amount Spent   500 non-null    float64
dtypes: float64(5), object(3)
memory usage: 31.4+ KB


data.describe()


sns.pairplot(data)

<seaborn.axisgrid.PairGrid at 0x1a90900e850>


# 고객의 연간 지출액 예측에 불필요한 컬럼 제거

data.drop(['Email', 'Address', 'Avatar'], axis=1, inplace=True)


from sklearn.model_selection import train_test_split


X = data.drop('Yearly Amount Spent', axis=1)


y = data['Yearly Amount Spent']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)


import statsmodels.api as sm


model = sm.OLS(y_train, X_train).fit()


model.summary()


predictions = model.predict(X_test)


predictions

69     418.211323
29     567.097473
471    534.706617
344    425.690888
54     474.931682
          ...    
460    570.877250
152    564.267305
154    557.093996
56     489.285778
392    550.720695
Length: 100, dtype: float64


y_test

69     451.575685
29     554.722084
471    541.049831
344    442.722892
54     522.404141
          ...    
460    618.845970
152    555.892595
154    595.803819
56     520.898794
392    549.131573
Name: Yearly Amount Spent, Length: 100, dtype: float64


sns.scatterplot(x=y_test, y=predictions)

<AxesSubplot:xlabel='Yearly Amount Spent'>


from sklearn import metrics


print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MSE: 482.2890139088915
RMSE: 21.961079525125616

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Ad Topic Line	City	Male	Country	Timestamp	Clicked on Ad
0	68.95	NaN	61833.90	256.09	Cloned 5thgeneration orchestration	Wrightburgh	0	Tunisia	3/27/2016 0:53	0
1	80.23	31.0	68441.85	193.77	Monitored national standardization	West Jodi	1	Nauru	4/4/2016 1:39	0
2	69.47	26.0	59785.94	236.50	Organic bottom-line service-desk	Davidton	0	San Marino	3/13/2016 20:35	0
3	74.15	29.0	54806.18	245.89	Triple-buffered reciprocal time-frame	West Terrifurt	1	Italy	1/10/2016 2:31	0
4	68.37	35.0	73889.99	225.58	Robust logistical utilization	South Manuel	0	Iceland	6/3/2016 3:36	0
...	...	...	...	...	...	...	...	...	...	...
995	72.97	30.0	71384.57	208.58	Fundamental modular algorithm	Duffystad	1	Lebanon	2/11/2016 21:49	1
996	51.30	45.0	67782.17	134.42	Grass-roots cohesive monitoring	New Darlene	1	Bosnia and Herzegovina	4/22/2016 2:07	1
997	51.63	51.0	42415.72	120.37	Expanded intangible solution	South Jessica	1	Mongolia	2/1/2016 17:24	1
998	55.55	19.0	41920.79	187.95	Proactive bandwidth-monitored policy	West Steven	0	Guatemala	3/24/2016 2:35	0
999	45.01	26.0	29875.80	178.35	Virtual 5thgeneration emulation	Ronniemouth	0	Brazil	6/3/2016 21:43	1

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Ad Topic Line	City	Male	Country	Timestamp
0	68.95	NaN	61833.90	256.09	Cloned 5thgeneration orchestration	Wrightburgh	0	Tunisia	3/27/2016 0:53
1	80.23	31.0	68441.85	193.77	Monitored national standardization	West Jodi	1	Nauru	4/4/2016 1:39
2	69.47	26.0	59785.94	236.50	Organic bottom-line service-desk	Davidton	0	San Marino	3/13/2016 20:35
3	74.15	29.0	54806.18	245.89	Triple-buffered reciprocal time-frame	West Terrifurt	1	Italy	1/10/2016 2:31
4	68.37	35.0	73889.99	225.58	Robust logistical utilization	South Manuel	0	Iceland	6/3/2016 3:36

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Ad Topic Line	City	Male	Country	Timestamp	Clicked on Ad
995	72.97	30.0	71384.57	208.58	Fundamental modular algorithm	Duffystad	1	Lebanon	2/11/2016 21:49	1
996	51.30	45.0	67782.17	134.42	Grass-roots cohesive monitoring	New Darlene	1	Bosnia and Herzegovina	4/22/2016 2:07	1
997	51.63	51.0	42415.72	120.37	Expanded intangible solution	South Jessica	1	Mongolia	2/1/2016 17:24	1
998	55.55	19.0	41920.79	187.95	Proactive bandwidth-monitored policy	West Steven	0	Guatemala	3/24/2016 2:35	0
999	45.01	26.0	29875.80	178.35	Virtual 5thgeneration emulation	Ronniemouth	0	Brazil	6/3/2016 21:43	1

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Male	Clicked on Ad
count	1000.000000	916.000000	1000.000000	1000.000000	1000.000000	1000.00000
mean	65.000200	36.128821	55000.000080	180.000100	0.481000	0.50000
std	15.853615	9.018548	13414.634022	43.902339	0.499889	0.50025
min	32.600000	19.000000	13996.500000	104.780000	0.000000	0.00000
25%	51.360000	29.000000	47031.802500	138.830000	0.000000	0.00000
50%	68.215000	35.000000	57012.300000	183.130000	0.000000	0.50000
75%	78.547500	42.000000	65470.635000	218.792500	1.000000	1.00000
max	91.430000	61.000000	79484.800000	269.960000	1.000000	1.00000

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Male	Timestamp	Clicked on Ad
0	68.95	NaN	61833.90	256.09	0	3/27/2016 0:53	0
1	80.23	31.0	68441.85	193.77	1	4/4/2016 1:39	0
2	69.47	26.0	59785.94	236.50	0	3/13/2016 20:35	0
3	74.15	29.0	54806.18	245.89	1	1/10/2016 2:31	0
4	68.37	35.0	73889.99	225.58	0	6/3/2016 3:36	0
...	...	...	...	...	...	...	...
995	72.97	30.0	71384.57	208.58	1	2/11/2016 21:49	1
996	51.30	45.0	67782.17	134.42	1	4/22/2016 2:07	1
997	51.63	51.0	42415.72	120.37	1	2/1/2016 17:24	1
998	55.55	19.0	41920.79	187.95	0	3/24/2016 2:35	0
999	45.01	26.0	29875.80	178.35	0	6/3/2016 21:43	1

햇밤우유의 돌다리

데이터분석/E커머스 데이터분석

Logistic Regression을 통한 고객의 광고 반응률 예측

고객별 광고 반응율 예측¶

Logistic Regression¶

모듈 및 데이터 로드¶

데이터 특성 확인¶

불필요한 컬럼 제거¶

Missing Value 확인 및 처리¶

Train/Test Set 분리¶

Logistic Regression 모델 만들기¶

예측 및 평가¶

참고: 분류 모델의 성능 평가¶

Confusion Matrix = [[TP, FN], [FP, TN]]¶

'데이터분석 > E커머스 데이터분석' 카테고리의 다른 글

Linear Regression을 통한 고객별 연간 지출액 예측

고객별 연간 지출액 예측¶

Linear Regression¶

모듈 및 데이터 로드¶

데이터 특성 확인¶

불필요한 컬럼 제거¶

Train/Test Set 분리¶

Linear Regression 모델 만들기¶

예측 및 평가¶

'데이터분석 > E커머스 데이터분석' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	Daily Time Spent on Site	Age	Area Income	Daily Internet Usage	Ad Topic Line	City	Male	Country	Timestamp	Clicked on Ad
0	68.95	36.0	61833.90	256.09	Cloned 5thgeneration orchestration	Wrightburgh	0	Tunisia	3/27/2016 0:53	0
1	80.23	31.0	68441.85	193.77	Monitored national standardization	West Jodi	1	Nauru	4/4/2016 1:39	0
2	69.47	26.0	59785.94	236.50	Organic bottom-line service-desk	Davidton	0	San Marino	3/13/2016 20:35	0
3	74.15	29.0	54806.18	245.89	Triple-buffered reciprocal time-frame	West Terrifurt	1	Italy	1/10/2016 2:31	0
4	68.37	35.0	73889.99	225.58	Robust logistical utilization	South Manuel	0	Iceland	6/3/2016 3:36	0
...	...	...	...	...	...	...	...	...	...	...
995	72.97	30.0	71384.57	208.58	Fundamental modular algorithm	Duffystad	1	Lebanon	2/11/2016 21:49	1
996	51.30	45.0	67782.17	134.42	Grass-roots cohesive monitoring	New Darlene	1	Bosnia and Herzegovina	4/22/2016 2:07	1
997	51.63	51.0	42415.72	120.37	Expanded intangible solution	South Jessica	1	Mongolia	2/1/2016 17:24	1
998	55.55	19.0	41920.79	187.95	Proactive bandwidth-monitored policy	West Steven	0	Guatemala	3/24/2016 2:35	0
999	45.01	26.0	29875.80	178.35	Virtual 5thgeneration emulation	Ronniemouth	0	Brazil	6/3/2016 21:43	1

	Email	Address	Avatar	Avg. Session Length	Time on App	Time on Website	Length of Membership	Yearly Amount Spent
0	mstephenson@fernandez.com	835 Frank Tunnel\nWrightmouth, MI 82180-9605	Violet	34.497268	12.655651	39.577668	4.082621	587.951054
1	hduke@hotmail.com	4547 Archer Common\nDiazchester, CA 06566-8576	DarkGreen	31.926272	11.109461	37.268959	2.664034	392.204933
2	pallen@yahoo.com	24645 Valerie Unions Suite 582\nCobbborough, D...	Bisque	33.000915	11.330278	37.110597	4.104543	487.547505
3	riverarebecca@gmail.com	1414 David Throughway\nPort Jason, OH 22070-1220	SaddleBrown	34.305557	13.717514	36.721283	3.120179	581.852344
4	mstephens@davidson-herman.com	14023 Rodriguez Passage\nPort Jacobville, PR 3...	MediumAquaMarine	33.330673	12.795189	37.536653	4.446308	599.406092
...	...	...	...	...	...	...	...	...
495	lewisjessica@craig-evans.com	4483 Jones Motorway Suite 872\nLake Jamiefurt,...	Tan	33.237660	13.566160	36.417985	3.746573	573.847438
496	katrina56@gmail.com	172 Owen Divide Suite 497\nWest Richard, CA 19320	PaleVioletRed	34.702529	11.695736	37.190268	3.576526	529.049004
497	dale88@hotmail.com	0787 Andrews Ranch Apt. 633\nSouth Chadburgh, ...	Cornsilk	32.646777	11.499409	38.332576	4.958264	551.620145
498	cwilson@hotmail.com	680 Jennifer Lodge Apt. 808\nBrendachester, TX...	Teal	33.322501	12.391423	36.840086	2.336485	456.469510
499	hannahwilson@davidson.com	49791 Rachel Heights Apt. 898\nEast Drewboroug...	DarkMagenta	33.715981	12.418808	35.771016	2.735160	497.778642

	Avg. Session Length	Time on App	Time on Website	Length of Membership	Yearly Amount Spent
count	500.000000	500.000000	500.000000	500.000000	500.000000
mean	33.053194	12.052488	37.060445	3.533462	499.314038
std	0.992563	0.994216	1.010489	0.999278	79.314782
min	29.532429	8.508152	33.913847	0.269901	256.670582
25%	32.341822	11.388153	36.349257	2.930450	445.038277
50%	33.082008	11.983231	37.069367	3.533975	498.887875
75%	33.711985	12.753850	37.716432	4.126502	549.313828
max	36.139662	15.126994	40.005182	6.922689	765.518462

Dep. Variable:	Yearly Amount Spent	R-squared (uncentered):	0.998
Model:	OLS	Adj. R-squared (uncentered):	0.998
Method:	Least Squares	F-statistic:	4.798e+04
Date:	Sat, 29 Jan 2022	Prob (F-statistic):	0.00
Time:	12:29:00	Log-Likelihood:	-1820.0
No. Observations:	400	AIC:	3648.
Df Residuals:	396	BIC:	3664.
Df Model:	4
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Avg. Session Length	11.9059	0.869	13.696	0.000	10.197	13.615
Time on App	34.3257	1.121	30.610	0.000	32.121	36.530
Time on Website	-14.1405	0.812	-17.405	0.000	-15.738	-12.543
Length of Membership	61.0149	1.144	53.318	0.000	58.765	63.265

Omnibus:	0.490	Durbin-Watson:	1.987
Prob(Omnibus):	0.783	Jarque-Bera (JB):	0.606
Skew:	-0.022	Prob(JB):	0.739
Kurtosis:	2.814	Cond. No.	55.4