本项目研究目的:本项目目的是通过给定的广告信息和用户信息来预测一个广告被点击与否, 如果广告有很大概率被点击就展示广告,如果概率低,就不展示。

# Load Librariesimport numpy as np     #linear algebra
import pandas as pd    #data processing
import matplotlib.pyplot as plt    #visualizations
import seaborn as sns       #visualizations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
import warnings    #hide warning messages
%matplotlib inline
# Load Datadf = pd.read_csv("advertising.csv") #reading the file
# examine the datadf.head(10) #cheking the first 10 rows of the data
Daily Time Spent on SiteAgeArea IncomeDaily Internet UsageAd Topic LineCityMaleCountryTimestampClicked on Ad
068.953561833.90256.09Cloned 5thgeneration orchestrationWrightburgh0Tunisia2016-03-27 00:53:110
180.233168441.85193.77Monitored national standardizationWest Jodi1Nauru2016-04-04 01:39:020
269.472659785.94236.50Organic bottom-line service-deskDavidton0San Marino2016-03-13 20:35:420
374.152954806.18245.89Triple-buffered reciprocal time-frameWest Terrifurt1Italy2016-01-10 02:31:190
468.373573889.99225.58Robust logistical utilizationSouth Manuel0Iceland2016-06-03 03:36:180
559.992359761.56226.74Sharable client-driven softwareJamieberg1Norway2016-05-19 14:30:170
688.913353852.85208.36Enhanced dedicated supportBrandonstad0Myanmar2016-01-28 20:59:320
766.004824593.33131.76Reactive local challengePort Jefferybury1Australia2016-03-07 01:40:151
874.533068862.00221.51Configurable coherent functionWest Colin1Grenada2016-04-18 09:33:420
969.882055642.32183.82Mandatory homogeneous architectureRamirezton1Ghana2016-07-11 01:42:510
# Data type and length of the variables: column dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):#   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  0   Daily Time Spent on Site  1000 non-null   float641   Age                       1000 non-null   int64  2   Area Income               1000 non-null   float643   Daily Internet Usage      1000 non-null   float644   Ad Topic Line             1000 non-null   object 5   City                      1000 non-null   object 6   Male                      1000 non-null   int64  7   Country                   1000 non-null   object 8   Timestamp                 1000 non-null   object 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB
# Duplicates check: number of rows that are exact duplicates of an earlier row
df.duplicated().sum()
# Numerical and categorical variable identification: list the column names
df.columns
Index(['Daily Time Spent on Site', 'Age', 'Area Income','Daily Internet Usage', 'Ad Topic Line', 'City', 'Male', 'Country','Timestamp', 'Clicked on Ad'],dtype='object')
# Columns stored with the generic object dtype, i.e. the ones pandas itself
# treats as non-numeric
object_frame = df.select_dtypes(include=['object'])
object_frame.columns
Index(['Ad Topic Line', 'City', 'Country', 'Timestamp'], dtype='object')
# Columns treated as numerical variables
numeric_cols = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
# Columns treated as categorical variables (includes the 0/1 coded
# 'Male' flag and the 'Clicked on Ad' target)
Categorical_cols = ['Ad Topic Line', 'City', 'Male', 'Country', 'Clicked on Ad']
# Summarizing numerical variables
df[numeric_cols].describe()
Daily Time Spent on SiteAgeArea IncomeDaily Internet Usage


# Summarizing categorical variables; include=['O'] restricts the summary to
# the object-dtype columns, so the integer-coded 'Male' and 'Clicked on Ad'
# columns are left out
df[Categorical_cols].describe(include=['O'])
Ad Topic LineCityCountry
topObject-based neutral policyWilliamsportFrance

数据中的城市取值非常多(唯一值数量大),而同一城市的用户很少(出现频率低),因此该特征很可能没有或只有很小的预测能力。相比之下,国家特征的取值种类较少,因此需要对国家做进一步分析。

# Investigating the Country variable
# Click (1) / no-click (0) counts per country, ordered by the number of clicks
# (column 1) in descending order; only the top 20 rows are shown.
# `by` is passed by keyword: sort_values only accepts `by` positionally in
# current pandas, and the original second positional argument was the axis,
# not a second sort key.
pd.crosstab(df['Country'], df['Clicked on Ad']).sort_values(by=1, ascending=False).head(20)
Clicked on Ad01
South Africa26
Svalbard & Jan Mayen Islands24
Antigua and Barbuda14
Hong Kong24
# Number of records per country, largest first (top 10)
country_counts = pd.crosstab(index=df['Country'], columns='count')
country_counts.sort_values(by=['count'], ascending=False).head(10)
Czech Republic9
South Africa8


# Check for missing values: number of nulls in each column
df.isnull().sum()
Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64
# Extract datetime features from the Timestamp column.
# The column is parsed into datetime objects first so the .dt accessor works.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
# (new column, datetime attribute) pairs; dayofweek numbers Monday as 0 and
# Sunday as 6.
datetime_parts = [
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Weekday', 'dayofweek'),
]
for new_column, attribute in datetime_parts:
    df[new_column] = getattr(df['Timestamp'].dt, attribute)
# The raw timestamp is redundant once its parts have been extracted
df = df.drop(columns=['Timestamp'])
Daily Time Spent on SiteAgeArea IncomeDaily Internet UsageAd Topic LineCityMaleCountryClicked on AdMonthDayHourWeekday
068.953561833.90256.09Cloned 5thgeneration orchestrationWrightburgh0Tunisia032706
180.233168441.85193.77Monitored national standardizationWest Jodi1Nauru04410
269.472659785.94236.50Organic bottom-line service-deskDavidton0San Marino0313206
374.152954806.18245.89Triple-buffered reciprocal time-frameWest Terrifurt1Italy011026
468.373573889.99225.58Robust logistical utilizationSouth Manuel0Iceland06334
# Visualize the target variable 'Clicked on Ad'
plt.figure(figsize=(14, 6))
sns.countplot(x='Clicked on Ad', data=df)
# NOTE(review): distplot is deprecated in recent seaborn releases
# (histplot/displot replace it); kept so older seaborn installs still work.
sns.distplot(df['Clicked on Ad'], bins=20)


# Joint plot of age (x) against daily time spent on site (y)
sns.jointplot(data=df, x='Age', y='Daily Time Spent on Site')


# Distribution and relationship between variables:
# pairwise plots of the numeric features, colored by the click outcome
pairplot_vars = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
sns.pairplot(df, vars=pairplot_vars, hue='Clicked on Ad', palette='husl')


# For every numeric feature draw a boxen plot (left) and a histogram/KDE
# (right) side by side, one figure per feature.
plots = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
for i in plots:
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    # x= is explicit because the meaning of the first positional argument
    # differs between seaborn versions
    sns.boxenplot(x=df[i])
    plt.subplot(1, 2, 2)
    sns.distplot(df[i], bins=20)
    plt.title(i)
    plt.show()


我们可以清楚地看到,每日上网时长(Daily Internet Usage)和每日在网站上花费的时间(Daily Time Spent on Site)的分布都有两个高峰,统计学上称为双峰分布(bimodal)。这表明我们的数据中存在两个不同的用户群体。我们并不期望用户呈正态分布,因为有些人会在互联网/网站上花更多时间,而有些人花的时间更少。

# Report the maximum, minimum and mean age in the dataset
ages = df['Age']
age_report = [
    ('oldest person was of: ', ages.max()),
    ('Youngest person was of: ', ages.min()),
    ('Average age was of: ', ages.mean()),
]
for label, value in age_report:
    print(label, value, 'Years')
oldest person was of:  61 Years
Youngest person was of:  19 Years
Average age was of:  36.009 Years
# 2x2 grid of violin plots: Age and Daily Time Spent on Site, each split by
# gender and by weekday, with the click outcome as hue.
# x/y are passed by keyword: current seaborn no longer accepts them
# positionally, and keywords work on older versions as well.
f, ax = plt.subplots(2, 2, figsize=(20, 10))

sns.violinplot(x='Male', y='Age', hue='Clicked on Ad', data=df, ax=ax[0, 0], palette='spring')
ax[0, 0].set_title('Gender and Age vs Clicked on Ad or not')
ax[0, 0].set_yticks(range(0, 80, 10))

sns.violinplot(x='Weekday', y='Age', hue='Clicked on Ad', data=df, ax=ax[0, 1], palette='summer')
ax[0, 1].set_title('Weekday and Age vs Clicked on Ad or not')
ax[0, 1].set_yticks(range(0, 90, 10))

sns.violinplot(x='Male', y='Daily Time Spent on Site', hue='Clicked on Ad', data=df, ax=ax[1, 0], palette='autumn')
ax[1, 0].set_title('Gender Daily time spent vs Clicked on Ad or not')
ax[1, 0].set_yticks(range(1, 120, 10))

sns.violinplot(x='Weekday', y='Daily Time Spent on Site', hue='Clicked on Ad', data=df, ax=ax[1, 1], palette='winter')
ax[1, 1].set_title('Weekday and Daily time spent vs Clicked on Ad or not')


# Correlation between variables
fig = plt.figure(figsize=(12, 10))
# Only numeric columns can be correlated. df still holds text columns
# (Ad Topic Line, City, Country), and current pandas raises on them instead
# of silently dropping them, so they are filtered out explicitly.
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='Blues', annot=True)

热图使我们可以更好地了解各个特征之间的关系。相关系数的取值范围在 -1 和 1 之间,绝对值越高,变量之间的相关性越强。可以预期,每日上网时长和每日在网站上花费的时间与目标变量的相关性更强。同时,解释变量之间似乎都没有明显相关,这表明数据中不存在共线性。

# Extracted features visualizations
f, ax = plt.subplots(1, 2, figsize=(14, 5))
# Left: line chart of the number of clicks per month
df['Month'][df['Clicked on Ad'] == 1].value_counts().sort_index().plot(ax=ax[0])
ax[0].set_title('Months Vs Clicks')
ax[0].set_ylabel('Count of Clicks')
# Right: grouped bars of click / no-click counts per month
# ('bar' must be lowercase)
pd.crosstab(df["Clicked on Ad"], df["Month"]).T.plot(kind='bar', ax=ax[1])
# Alternative for the per-month click totals:
# df.groupby(['Month'])['Clicked on Ad'].sum()

折线图显示了每月的点击次数。 分组条形图显示了7个月内目标变量的分布。 2月点击量最高。

# Fresh figure for the hour / weekday views. Without it the plots are drawn on
# the axes of the earlier figure.
# NOTE(review): the layout/figsize is an assumption; no subplots call for
# this figure is visible in the source.
f, ax = plt.subplots(1, 2, figsize=(14, 5))
# Left: click / no-click counts per hour of day as lines
pd.crosstab(df['Clicked on Ad'], df['Hour']).T.plot(style=[], ax=ax[0])
# Right: total number of clicks per weekday as bars
pd.pivot_table(df, index=['Weekday'], values=['Clicked on Ad'], aggfunc=np.sum).plot(kind='bar', ax=ax[1])

此处的折线图表明用户倾向于在一天的晚些时候或清晨点击广告。从年龄特征来看,大多数用户都有工作,因此这一结果是合理的:他们只能在早晚抽出时间上网。此外,周日的广告点击量较多。

# Clicked vs not clicked: mean of each numeric feature per click outcome.
# The columns are selected with a list; selecting several columns without one
# is no longer supported by current pandas.
df.groupby('Clicked on Ad')[['Clicked on Ad', 'Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']].mean()
Clicked on AdDaily Time Spent on SiteAgeArea IncomeDaily Internet Usage
Clicked on Ad
# Click / no-click counts per gender: rows are 'Male', columns are the outcome
gender_click_counts = df.groupby(['Male', 'Clicked on Ad'])['Clicked on Ad'].count()
gender_click_counts.unstack()
Clicked on Ad01
# Pivot table: clicks summed per hour (rows) and gender (columns)
hourly_clicks = pd.pivot_table(
    df,
    values=['Clicked on Ad'],
    index=['Hour'],
    columns=['Male'],
    aggfunc=np.sum,
)
hdf = hourly_clicks.rename(columns={'Clicked on Ad': 'Clicked'})
# Light-to-green sequential colormap (color palette)
cm = sns.light_palette('green', as_cmap=True)

每小时和性别分布。 总体而言,女性点击广告的频率高于男性。

# Fresh figure for the two panels. Without it the plots are drawn on the axes
# of an earlier figure.
# NOTE(review): the layout/figsize is an assumption; no subplots call for
# this figure is visible in the source.
f, ax = plt.subplots(1, 2, figsize=(14, 5))
# Left: overall count of males and females, split by click outcome
sns.countplot(x='Male', hue='Clicked on Ad', data=df, palette='bwr', ax=ax[0])
# Cross tabulation: click / no-click frequency per weekday
table = pd.crosstab(df['Weekday'], df['Clicked on Ad'])
# Right: divide each row by its total so the stacked bars show proportions
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, ax=ax[1], grid=False)
ax[1].set_title('Stacked Bar Chart of Weekday vs Clicked')
ax[1].set_ylabel('Proportion by Day')
# Adjust subplot parameters automatically so the panels fill the figure
plt.tight_layout()


# Age distribution per weekday, one box-plot panel per click outcome.
# factorplot was renamed to catplot and its `size` argument to `height`.
sns.catplot(x='Weekday', y='Age', col='Clicked on Ad', data=df, kind='box', height=5, aspect=2.0)

按年龄和工作日比较是否点击过广告的用户。 显然,年龄较大的人倾向于点击广告。

# Mean click rate per month, split by gender.
# catplot replaces factorplot. kind='point' is spelled out because it was
# factorplot's default, while catplot defaults to a strip plot.
sns.catplot(x='Month', y='Clicked on Ad', hue='Male', data=df, kind='point')


# Identifying potential outliers using the IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are suspected outliers.
for i in numeric_cols:
    stat = df[i].describe()
    print(stat)
    IQR = stat['75%'] - stat['25%']
    upper = stat['75%'] + 1.5 * IQR
    lower = stat['25%'] - 1.5 * IQR
    print('The upper and lower bounds for suspected outliers are {} and {}.'.format(upper, lower))
count    1000.000000
mean       65.000200
std        15.853615
min        32.600000
25%        51.360000
50%        68.215000
75%        78.547500
max        91.430000
Name: Daily Time Spent on Site, dtype: float64
The upper and lower bounds for suspected outliers are 119.32875 and 10.57875.
count    1000.000000
mean       36.009000
std         8.785562
min        19.000000
25%        29.000000
50%        35.000000
75%        42.000000
max        61.000000
Name: Age, dtype: float64
The upper and lower bounds for suspected outliers are 61.5 and 9.5.
count     1000.000000
mean     55000.000080
std      13414.634022
min      13996.500000
25%      47031.802500
50%      57012.300000
75%      65470.635000
max      79484.800000
Name: Area Income, dtype: float64
The upper and lower bounds for suspected outliers are 93128.88375000001 and 19373.553749999992.
count    1000.000000
mean      180.000100
std        43.902339
min       104.780000
25%       138.830000
50%       183.130000
75%       218.792500
max       269.960000
Name: Daily Internet Usage, dtype: float64
The upper and lower bounds for suspected outliers are 338.73625000000004 and 18.886250000000004.
# Basic model building based on the actual data
from sklearn.model_selection import train_test_split

# Only numeric columns are used as predictors, since the model takes numbers
feature_columns = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Male']
X = df[feature_columns]
y = df['Clicked on Ad']
# Split into train and test sets: test_size is the share of rows held out for
# testing, and random_state fixes the otherwise random split so it can be
# reproduced
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Report the shape of each (features, target) pair
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)
(670, 5) (670,)
(330, 5) (330,)
# Building a basic model
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with default settings
logreg = LogisticRegression()
# Fit on the training data; fit() returns the estimator itself, which is
# kept under the name `model`
model = logreg.fit(X=X_train, y=y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,intercept_scaling=1, l1_ratio=None, max_iter=100,multi_class='auto', n_jobs=None, penalty='l2',random_state=None, solver='lbfgs', tol=0.0001, verbose=0,warm_start=False)
# Predictions: predict() only needs the test features to produce the
# predicted click labels
predictions = logreg.predict(X=X_test)
# Below are the results of predicted click on Ads
array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1],dtype=int64)
# Performance metrics: precision, recall and F1 score per class
from sklearn.metrics import classification_report

report = classification_report(y_test, predictions)
print(report)
              precision    recall  f1-score   support0       0.86      0.96      0.91       1621       0.96      0.85      0.90       168accuracy                           0.91       330macro avg       0.91      0.91      0.91       330
weighted avg       0.91      0.91      0.91       330
# Confusion matrix of true labels (rows) against predicted labels (columns)
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test, predictions)
print(matrix)
[[156   6][ 25 143]]
# Feature engineering
# Work on a copy so the original dataframe stays unchanged
new_df = df.copy()
# Pair plot to check the effect of the datetime variables on the target
pp = sns.pairplot(new_df, hue='Clicked on Ad', vars=['Month', 'Day', 'Hour', 'Weekday'], palette='husl')


# Dummy encoding of the Month column
new_df = pd.concat([new_df, pd.get_dummies(new_df['Month'], prefix='Month')], axis=1)
# Dummy encoding of the Weekday column
new_df = pd.concat([new_df, pd.get_dummies(new_df['Weekday'], prefix='Weekday')], axis=1)
# Bucket the hour of day into four six-hour bins; include_lowest=True keeps
# hour 0 inside the first bin
new_df['Hour_bins'] = pd.cut(
    new_df['Hour'],
    bins=[0, 5, 11, 17, 23],
    labels=['Hour_0-5', 'Hour_6-11', 'Hour_12-17', 'Hour_18-23'],
    include_lowest=True,
)
# Dummy encoding of the Hour_bins column.
# NOTE(review): the labels already start with 'Hour_', so with prefix='Hour'
# the columns come out as 'Hour_Hour_0-5' etc.; left unchanged here.
new_df = pd.concat([new_df, pd.get_dummies(new_df['Hour_bins'], prefix='Hour')], axis=1)
# Feature engineering on the Age column: mean click rate per age.
# x/y are keywords because current seaborn rejects them positionally. Both
# series come from new_df; the original mixed new_df and df.
sns.barplot(x=new_df['Age'], y=new_df['Clicked on Ad'], ci=None)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,34, 35, 36, 37, 38, 39, 40, 41, 42]),[Text(0, 0, '19'),Text(1, 0, '20'),Text(2, 0, '21'),Text(3, 0, '22'),Text(4, 0, '23'),Text(5, 0, '24'),Text(6, 0, '25'),Text(7, 0, '26'),Text(8, 0, '27'),Text(9, 0, '28'),Text(10, 0, '29'),Text(11, 0, '30'),Text(12, 0, '31'),Text(13, 0, '32'),Text(14, 0, '33'),Text(15, 0, '34'),Text(16, 0, '35'),Text(17, 0, '36'),Text(18, 0, '37'),Text(19, 0, '38'),Text(20, 0, '39'),Text(21, 0, '40'),Text(22, 0, '41'),Text(23, 0, '42'),Text(24, 0, '43'),Text(25, 0, '44'),Text(26, 0, '45'),Text(27, 0, '46'),Text(28, 0, '47'),Text(29, 0, '48'),Text(30, 0, '49'),Text(31, 0, '50'),Text(32, 0, '51'),Text(33, 0, '52'),Text(34, 0, '53'),Text(35, 0, '54'),Text(36, 0, '55'),Text(37, 0, '56'),Text(38, 0, '57'),Text(39, 0, '58'),Text(40, 0, '59'),Text(41, 0, '60'),Text(42, 0, '61')])


# Checking candidate age bins
limit_1 = 18
limit_2 = 35
# countplot places each distinct age at an integer x position, so the number
# of distinct ages below a limit gives the x position where that limit falls
x_limit_1 = np.size(df[df['Age'] < limit_1]['Age'].unique())
x_limit_2 = np.size(df[df['Age'] < limit_2]['Age'].unique())
plt.figure(figsize=(15, 10))
sns.countplot(x='Age', hue='Clicked on Ad', data=df)
# Shade the three candidate age ranges
plt.axvspan(-1, x_limit_1, alpha=0.25, color='green')
plt.axvspan(x_limit_1, x_limit_2, alpha=0.25, color='red')
plt.axvspan(x_limit_2, 50, alpha=0.25, color='yellow')
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,34, 35, 36, 37, 38, 39, 40, 41, 42]),[Text(0, 0, '19'),Text(1, 0, '20'),Text(2, 0, '21'),Text(3, 0, '22'),Text(4, 0, '23'),Text(5, 0, '24'),Text(6, 0, '25'),Text(7, 0, '26'),Text(8, 0, '27'),Text(9, 0, '28'),Text(10, 0, '29'),Text(11, 0, '30'),Text(12, 0, '31'),Text(13, 0, '32'),Text(14, 0, '33'),Text(15, 0, '34'),Text(16, 0, '35'),Text(17, 0, '36'),Text(18, 0, '37'),Text(19, 0, '38'),Text(20, 0, '39'),Text(21, 0, '40'),Text(22, 0, '41'),Text(23, 0, '42'),Text(24, 0, '43'),Text(25, 0, '44'),Text(26, 0, '45'),Text(27, 0, '46'),Text(28, 0, '47'),Text(29, 0, '48'),Text(30, 0, '49'),Text(31, 0, '50'),Text(32, 0, '51'),Text(33, 0, '52'),Text(34, 0, '53'),Text(35, 0, '54'),Text(36, 0, '55'),Text(37, 0, '56'),Text(38, 0, '57'),Text(39, 0, '58'),Text(40, 0, '59'),Text(41, 0, '60'),Text(42, 0, '61')])


# Bin the Age column into four labelled ranges: (0, 18], (18, 30], (30, 45]
# and (45, 70]
new_df['Age_bins'] = pd.cut(new_df['Age'], bins=[0, 18, 30, 45, 70], labels=['Young', 'Adult', 'Mid', 'Elder'])
# Verify the bins by checking the count per bin and click outcome
# (x= is a keyword because current seaborn rejects it positionally)
sns.countplot(x='Age_bins', hue='Clicked on Ad', data=new_df)


# Dummy encode the age buckets and then the country (chosen based on the
# EDA), appending the indicator columns in that order
for source_column, dummy_prefix in (('Age_bins', 'Age'), ('Country', 'Country')):
    indicators = pd.get_dummies(new_df[source_column], prefix=dummy_prefix)
    new_df = pd.concat([new_df, indicators], axis=1)
# Remove redundant features and features without predictive power
redundant_columns = ['Country', 'Ad Topic Line', 'City', 'Day', 'Month', 'Weekday', 'Hour', 'Hour_bins', 'Age', 'Age_bins']
new_df = new_df.drop(columns=redundant_columns)
# Check the final dataframe
new_df.head()
Daily Time Spent on SiteArea IncomeDaily Internet UsageMaleClicked on AdMonth_1Month_2Month_3Month_4Month_5...Country_UruguayCountry_UzbekistanCountry_VanuatuCountry_VenezuelaCountry_VietnamCountry_Wallis and FutunaCountry_Western SaharaCountry_YemenCountry_ZambiaCountry_Zimbabwe

5 rows × 278 columns

# Building the logistic regression model on the engineered features.
# X must be rebuilt from new_df here, otherwise the five-column X of the
# baseline model would be reused. columns= replaces the positional axis
# argument, which current pandas no longer accepts.
X = new_df.drop(columns=['Clicked on Ad'])
y = new_df['Clicked on Ad']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# Standardizing the features: the scaler is fitted on the training split only
# and then applied to the test split
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(800, 277) (800,)
(200, 277) (200,)
import statsmodels.api as sm
from scipy import stats

# Add an intercept column to the standardized training features
X2 = sm.add_constant(X_train_std)
# NOTE(review): this fits ordinary least squares to a 0/1 target, i.e. a
# linear probability model; confirm that OLS rather than a logit fit is
# intended.
est = sm.OLS(y_train, X2)
est2 = est.fit()
# Show the regression summary table
print(est2.summary())
                            OLS Regression Results                            
Dep. Variable:          Clicked on Ad   R-squared:                       0.893
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     17.75
Date:                Sun, 09 Aug 2020   Prob (F-statistic):          8.68e-164
Time:                        02:03:52   Log-Likelihood:                 314.62
No. Observations:                 800   AIC:                            -115.2
Df Residuals:                     543   BIC:                             1089.
Df Model:                         256                                         
Covariance Type:            nonrobust                                         
==============================================================================coef    std err          t      P>|t|      [0.025      0.975]
const          0.4863      0.007     69.390      0.000       0.472       0.500
x1            -0.1816      0.010    -17.728      0.000      -0.202      -0.161
x2            -0.0932      0.009    -10.275      0.000      -0.111      -0.075
x3            -0.2588      0.011    -24.497      0.000      -0.280      -0.238
x4            -0.0166      0.008     -1.974      0.049      -0.033   -8.27e-05
x5            -0.0020      0.002     -0.818      0.414      -0.007       0.003
x6             0.0023      0.002      0.957      0.339      -0.002       0.007
x7            -0.0045      0.002     -1.864      0.063      -0.009       0.000
x8             0.0003      0.002      0.115      0.908      -0.004       0.005
x9             0.0032      0.002      1.296      0.195      -0.002       0.008
x10           -0.0031      0.002     -1.280      0.201      -0.008       0.002
x11            0.0043      0.003      1.694      0.091      -0.001       0.009
x12           -0.0020      0.002     -0.818      0.414      -0.007       0.003
x13            0.0023      0.002      0.957      0.339      -0.002       0.007
x14           -0.0045      0.002     -1.864      0.063      -0.009       0.000
x15            0.0003      0.002      0.115      0.908      -0.004       0.005
x16            0.0032      0.002      1.296      0.195      -0.002       0.008
x17           -0.0031      0.002     -1.280      0.201      -0.008       0.002
x18            0.0043      0.003      1.694      0.091      -0.001       0.009
x19           -0.0020      0.002     -0.818      0.414      -0.007       0.003
x20            0.0023      0.002      0.957      0.339      -0.002       0.007
x21           -0.0045      0.002     -1.864      0.063      -0.009       0.000
x22            0.0003      0.002      0.115      0.908      -0.004       0.005
x23            0.0032      0.002      1.296      0.195      -0.002       0.008
x24           -0.0031      0.002     -1.280      0.201      -0.008       0.002
x25            0.0043      0.003      1.694      0.091      -0.001       0.009
x26            0.0025      0.007      0.339      0.735      -0.012       0.017
x27           -0.0040      0.007     -0.542      0.588      -0.018       0.010
x28           -0.0028      0.007     -0.391      0.696      -0.017       0.011
x29           -0.0011      0.007     -0.150      0.881      -0.015       0.013
x30           -0.0045      0.007     -0.607      0.544      -0.019       0.010
x31           -0.0007      0.008     -0.087      0.930      -0.016       0.014
x32            0.0100      0.007      1.383      0.167      -0.004       0.024
x33           -0.0002      0.006     -0.028      0.978      -0.013       0.013
x34            0.0027      0.006      0.434      0.665      -0.010       0.015
x35           -0.0084      0.006     -1.307      0.192      -0.021       0.004
x36            0.0057      0.006      0.892      0.373      -0.007       0.018
x37         8.946e-19   9.61e-18      0.093      0.926    -1.8e-17    1.98e-17
x38           -0.0305      0.006     -5.329      0.000      -0.042      -0.019
x39            0.0052      0.005      1.066      0.287      -0.004       0.015
x40            0.0316      0.007      4.543      0.000       0.018       0.045
x41           -0.0042      0.007     -0.599      0.550      -0.018       0.010
x42            0.0036      0.007      0.506      0.613      -0.010       0.017
x43            0.0055      0.007      0.775      0.438      -0.008       0.020
x44            0.0010      0.007      0.138      0.891      -0.013       0.015
x45            0.0132      0.007      1.848      0.065      -0.001       0.027
x46           -0.0128      0.007     -1.802      0.072      -0.027       0.001
x47            0.0025      0.007      0.346      0.729      -0.012       0.017
x48           -0.0026      0.007     -0.359      0.720      -0.017       0.011
x49            0.0036      0.007      0.512      0.609      -0.010       0.018
x50           -0.0014      0.007     -0.194      0.846      -0.015       0.013
x51           -0.0055      0.007     -0.771      0.441      -0.019       0.008
x52           -0.0013      0.007     -0.188      0.851      -0.015       0.013
x53            0.0195      0.007      2.745      0.006       0.006       0.033
x54           -0.0066      0.007     -0.926      0.355      -0.021       0.007
x55         -8.59e-05      0.007     -0.012      0.990      -0.014       0.014
x56           -0.0008      0.007     -0.110      0.913      -0.015       0.013
x57           -0.0020      0.007     -0.281      0.779      -0.016       0.012
x58           -0.0056      0.007     -0.788      0.431      -0.020       0.008
x59           -0.0031      0.007     -0.431      0.666      -0.017       0.011
x60           -0.0059      0.007     -0.835      0.404      -0.020       0.008
x61           -0.0078      0.007     -1.105      0.270      -0.022       0.006
x62           -0.0033      0.007     -0.465      0.642      -0.017       0.011
x63            0.0023      0.007      0.329      0.742      -0.012       0.016
x64            0.0029      0.007      0.402      0.688      -0.011       0.017
x65            0.0130      0.007      1.829      0.068      -0.001       0.027
x66           -0.0084      0.007     -1.185      0.237      -0.022       0.006
x67           -0.0021      0.007     -0.294      0.769      -0.016       0.012
x68           -0.0111      0.007     -1.581      0.114      -0.025       0.003
x69           -0.0004      0.007     -0.062      0.950      -0.015       0.014
x70           -0.0058      0.007     -0.806      0.421      -0.020       0.008
x71           -0.0036      0.007     -0.505      0.613      -0.018       0.010
x72           -0.0042      0.007     -0.595      0.552      -0.018       0.010
x73            0.0053      0.007      0.743      0.458      -0.009       0.019
x74            0.0104      0.007      1.465      0.144      -0.004       0.024
x75           -0.0079      0.007     -1.120      0.263      -0.022       0.006
x76           -0.0031      0.007     -0.449      0.653      -0.017       0.011
x77            0.0008      0.007      0.106      0.915      -0.013       0.015
x78            0.0010      0.007      0.136      0.892      -0.013       0.015
x79           -0.0065      0.007     -0.912      0.362      -0.020       0.007
x80            0.0283      0.007      4.009      0.000       0.014       0.042
x81         5.668e-18   1.02e-17      0.555      0.579   -1.44e-17    2.57e-17
x82            0.0065      0.007      0.922      0.357      -0.007       0.020
x83           -0.0039      0.007     -0.545      0.586      -0.018       0.010
x84           -0.0082      0.007     -1.152      0.250      -0.022       0.006
x85            0.0191      0.007      2.701      0.007       0.005       0.033
x86           -0.0035      0.007     -0.488      0.625      -0.017       0.011
x87           -0.0046      0.007     -0.639      0.523      -0.019       0.009
x88            0.0002      0.007      0.024      0.981      -0.014       0.014
x89           -0.0038      0.007     -0.533      0.594      -0.018       0.010
x90           -0.0040      0.007     -0.563      0.573      -0.018       0.010
x91           -0.0069      0.007     -0.971      0.332      -0.021       0.007
x92           -0.0062      0.007     -0.866      0.387      -0.020       0.008
x93            0.0230      0.007      3.188      0.002       0.009       0.037
x94           -0.0169      0.007     -2.369      0.018      -0.031      -0.003
x95           -0.0101      0.007     -1.428      0.154      -0.024       0.004
x96            0.0014      0.007      0.196      0.845      -0.013       0.015
x97           -0.0042      0.007     -0.586      0.558      -0.018       0.010
x98            0.0055      0.007      0.783      0.434      -0.008       0.019
x99            0.0030      0.007      0.425      0.671      -0.011       0.017
x100          -0.0009      0.007     -0.124      0.901      -0.015       0.013
x101           0.0045      0.007      0.635      0.526      -0.009       0.018
x102           0.0069      0.007      0.978      0.329      -0.007       0.021
x103          -0.0028      0.007     -0.391      0.696      -0.017       0.011
x104          -0.0036      0.007     -0.506      0.613      -0.018       0.010
x105          -0.0016      0.007     -0.228      0.820      -0.016       0.012
x106           0.0021      0.007      0.298      0.766      -0.012       0.016
x107           0.0028      0.007      0.390      0.696      -0.011       0.017
x108           0.0003      0.007      0.046      0.963      -0.014       0.014
x109          -0.0100      0.007     -1.406      0.160      -0.024       0.004
x110           0.0002      0.007      0.029      0.977      -0.014       0.014
x111          -0.0054      0.007     -0.762      0.446      -0.019       0.009
x112           0.0004      0.007      0.050      0.961      -0.014       0.014
x113          -0.0061      0.007     -0.863      0.389      -0.020       0.008
x114          -0.0029      0.007     -0.403      0.687      -0.017       0.011
x115          -0.0097      0.007     -1.370      0.171      -0.024       0.004
x116           0.0054      0.007      0.755      0.451      -0.009       0.019
x117          -0.0068      0.007     -0.956      0.339      -0.021       0.007
x118           0.0195      0.007      2.712      0.007       0.005       0.034
x119          -0.0189      0.007     -2.640      0.009      -0.033      -0.005
x120          -0.0113      0.007     -1.585      0.113      -0.025       0.003
x121          -0.0067      0.007     -0.947      0.344      -0.020       0.007
x122          -0.0059      0.007     -0.828      0.408      -0.020       0.008
x123          -0.0095      0.007     -1.346      0.179      -0.023       0.004
x124           0.0089      0.007      1.242      0.215      -0.005       0.023
x125          -0.0013      0.007     -0.189      0.850      -0.015       0.013
x126           0.0138      0.007      1.949      0.052      -0.000       0.028
x127          -0.0049      0.007     -0.687      0.492      -0.019       0.009
x128          -0.0014      0.007     -0.200      0.841      -0.015       0.013
x129           0.0070      0.007      0.980      0.327      -0.007       0.021
x130          -0.0066      0.007     -0.934      0.351      -0.021       0.007
x131           0.0085      0.007      1.203      0.230      -0.005       0.022
x132        8.763e-07      0.007      0.000      1.000      -0.014       0.014
x133           0.0015      0.007      0.217      0.828      -0.012       0.015
x134          -0.0035      0.007     -0.498      0.619      -0.017       0.010
x135          -0.0026      0.007     -0.367      0.713      -0.017       0.011
x136           0.0230      0.007      3.212      0.001       0.009       0.037
x137          -0.0062      0.007     -0.884      0.377      -0.020       0.008
x138           0.0021      0.007      0.298      0.766      -0.012       0.016
x139           0.0039      0.007      0.540      0.589      -0.010       0.018
x140          -0.0010      0.007     -0.140      0.889      -0.015       0.013
x141          -0.0068      0.007     -0.952      0.342      -0.021       0.007
x142          -0.0020      0.007     -0.287      0.774      -0.016       0.012
x143           0.0114      0.007      1.608      0.108      -0.003       0.025
x144          -0.0041      0.007     -0.577      0.564      -0.018       0.010
x145          -0.0012      0.007     -0.169      0.866      -0.015       0.013
x146           0.0125      0.007      1.752      0.080      -0.002       0.027
x147           0.0093      0.007      1.305      0.193      -0.005       0.023
x148          -0.0093      0.007     -1.309      0.191      -0.023       0.005
x149           0.0130      0.007      1.825      0.068      -0.001       0.027
x150          -0.0032      0.007     -0.442      0.659      -0.017       0.011
x151          -0.0025      0.007     -0.354      0.724      -0.017       0.011
x152           0.0019      0.007      0.264      0.792      -0.012       0.016
x153          -0.0128      0.007     -1.791      0.074      -0.027       0.001
x154          -0.0030      0.007     -0.418      0.676      -0.017       0.011
x155          -0.0035      0.007     -0.487      0.626      -0.017       0.010
x156           0.0129      0.007      1.818      0.070      -0.001       0.027
x157          -0.0015      0.007     -0.210      0.834      -0.015       0.012
x158          -0.0044      0.007     -0.614      0.539      -0.018       0.010
x159           0.0044      0.007      0.628      0.530      -0.009       0.018
x160          -0.0028      0.007     -0.400      0.690      -0.017       0.011
x161           0.0167      0.007      2.371      0.018       0.003       0.030
x162          -0.0007      0.007     -0.104      0.917      -0.015       0.013
x163           0.0078      0.007      1.087      0.277      -0.006       0.022
x164          -0.0097      0.007     -1.364      0.173      -0.024       0.004
x165           0.0008      0.007      0.110      0.912      -0.013       0.015
x166          -0.0073      0.007     -1.037      0.300      -0.021       0.007
x167          -0.0144      0.007     -1.990      0.047      -0.029      -0.000
x168           0.0013      0.007      0.185      0.853      -0.013       0.015
x169          -0.0004      0.007     -0.060      0.953      -0.014       0.014
x170          -0.0036      0.007     -0.505      0.614      -0.018       0.010
x171          -0.0017      0.007     -0.244      0.808      -0.016       0.012
x172          -0.0043      0.007     -0.606      0.545      -0.018       0.010
x173           0.0121      0.007      1.690      0.092      -0.002       0.026
x174          -0.0046      0.007     -0.652      0.515      -0.019       0.009
x175          -0.0013      0.007     -0.185      0.853      -0.015       0.013
x176           0.0075      0.007      1.046      0.296      -0.007       0.021
x177          -0.0036      0.007     -0.505      0.614      -0.018       0.010
x178          -0.0029      0.007     -0.405      0.686      -0.017       0.011
x179          -0.0034      0.007     -0.473      0.636      -0.017       0.011
x180           0.0018      0.007      0.253      0.800      -0.012       0.016
x181          -0.0042      0.007     -0.595      0.552      -0.018       0.010
x182           0.0095      0.007      1.337      0.182      -0.004       0.024
x183           0.0034      0.007      0.472      0.637      -0.011       0.017
x184           0.0060      0.007      0.835      0.404      -0.008       0.020
x185           0.0038      0.007      0.529      0.597      -0.010       0.018
x186          -0.0067      0.007     -0.952      0.341      -0.021       0.007
x187           0.0068      0.007      0.951      0.342      -0.007       0.021
x188           0.0004      0.007      0.061      0.951      -0.013       0.014
x189          -0.0026      0.007     -0.368      0.713      -0.017       0.011
x190           0.0060      0.007      0.841      0.401      -0.008       0.020
x191           0.0098      0.007      1.386      0.166      -0.004       0.024
x192           0.0148      0.007      2.076      0.038       0.001       0.029
x193          -0.0101      0.007     -1.424      0.155      -0.024       0.004
x194          -0.0004      0.007     -0.062      0.951      -0.014       0.013
x195          -0.0020      0.007     -0.274      0.784      -0.016       0.012
x196          -0.0024      0.007     -0.338      0.736      -0.016       0.012
x197          -0.0040      0.007     -0.569      0.569      -0.018       0.010
x198           0.0015      0.007      0.212      0.832      -0.012       0.016
x199          -0.0059      0.007     -0.828      0.408      -0.020       0.008
x200          -0.0064      0.007     -0.903      0.367      -0.020       0.008
x201          -0.0113      0.007     -1.595      0.111      -0.025       0.003
x202          -0.0114      0.007     -1.588      0.113      -0.026       0.003
x203          -0.0038      0.007     -0.538      0.591      -0.018       0.010
x204          -0.0001      0.007     -0.018      0.986      -0.014       0.014
x205           0.0018      0.007      0.253      0.800      -0.012       0.016
x206           0.0032      0.007      0.454      0.650      -0.011       0.017
x207           0.0068      0.007      0.962      0.336      -0.007       0.021
x208           0.0003      0.007      0.041      0.967      -0.014       0.014
x209           0.0026      0.007      0.368      0.713      -0.011       0.016
x210           0.0010      0.007      0.142      0.887      -0.013       0.015
x211           0.0011      0.007      0.161      0.872      -0.013       0.015
x212          -0.0057      0.007     -0.801      0.424      -0.020       0.008
x213          -0.0055      0.007     -0.765      0.444      -0.020       0.009
x214           0.0011      0.007      0.157      0.876      -0.013       0.015
x215           0.0013      0.007      0.184      0.854      -0.013       0.015
x216          -0.0069      0.007     -0.969      0.333      -0.021       0.007
x217           0.0050      0.007      0.697      0.486      -0.009       0.019
x218           0.0117      0.007      1.641      0.101      -0.002       0.026
x219           0.0236      0.007      3.309      0.001       0.010       0.038
x220          -0.0012      0.007     -0.170      0.865      -0.015       0.013
x221           0.0002      0.007      0.034      0.973      -0.014       0.014
x222           0.0054      0.007      0.761      0.447      -0.009       0.019
x223          -0.0046      0.007     -0.641      0.522      -0.019       0.009
x224          -0.0064      0.007     -0.908      0.364      -0.020       0.007
x225          -0.0002      0.007     -0.024      0.981      -0.014       0.014
x226           0.0062      0.007      0.869      0.385      -0.008       0.020
x227           0.0054      0.007      0.759      0.448      -0.009       0.019
x228           0.0079      0.007      1.122      0.262      -0.006       0.022
x229          -0.0024      0.007     -0.339      0.735      -0.016       0.012
x230           0.0065      0.007      0.908      0.364      -0.008       0.020
x231           0.0014      0.007      0.196      0.845      -0.013       0.015
x232          -0.0135      0.007     -1.901      0.058      -0.027       0.000
x233          -0.0095      0.007     -1.337      0.182      -0.023       0.004
x234           0.0034      0.007      0.479      0.632      -0.011       0.017
x235           0.0065      0.007      0.908      0.364      -0.008       0.020
x236           0.0146      0.007      2.072      0.039       0.001       0.028
x237           0.0067      0.007      0.950      0.343      -0.007       0.021
x238           0.0165      0.007      2.303      0.022       0.002       0.031
x239          -0.0062      0.007     -0.862      0.389      -0.020       0.008
x240           0.0017      0.007      0.238      0.812      -0.012       0.016
x241           0.0013      0.007      0.181      0.857      -0.013       0.015
x242           0.0064      0.007      0.895      0.371      -0.008       0.020
x243          -0.0061      0.007     -0.850      0.396      -0.020       0.008
x244          -0.0017      0.007     -0.244      0.808      -0.016       0.012
x245           0.0069      0.007      0.977      0.329      -0.007       0.021
x246          -0.0072      0.007     -1.004      0.316      -0.021       0.007
x247          -0.0066      0.007     -0.926      0.355      -0.020       0.007
x248           0.0023      0.007      0.327      0.744      -0.012       0.016
x249           0.0006      0.007      0.080      0.936      -0.013       0.014
x250          -0.0075      0.007     -1.060      0.290      -0.022       0.006
x251          -0.0087      0.007     -1.222      0.222      -0.023       0.005
x252          -0.0098      0.007     -1.374      0.170      -0.024       0.004
x253           0.0040      0.007      0.563      0.574      -0.010       0.018
x254          -0.0015      0.007     -0.207      0.836      -0.015       0.012
x255          -0.0029      0.007     -0.412      0.680      -0.017       0.011
x256           0.0003      0.007      0.037      0.970      -0.014       0.014
x257           0.0075      0.007      1.054      0.292      -0.006       0.021
x258           0.0008      0.007      0.118      0.906      -0.013       0.015
x259           0.0019      0.007      0.264      0.792      -0.012       0.016
x260          -0.0037      0.007     -0.517      0.605      -0.018       0.010
x261           0.0168      0.007      2.379      0.018       0.003       0.031
x262          -0.0033      0.007     -0.472      0.637      -0.017       0.011
x263           0.0130      0.007      1.831      0.068      -0.001       0.027
x264          -0.0047      0.007     -0.666      0.506      -0.019       0.009
x265           0.0032      0.007      0.444      0.657      -0.011       0.017
x266           0.0003      0.007      0.041      0.967      -0.014       0.014
x267          -0.0077      0.007     -1.094      0.274      -0.022       0.006
x268           0.0145      0.007      2.037      0.042       0.001       0.028
x269           0.0046      0.007      0.647      0.518      -0.009       0.019
x270          -0.0067      0.007     -0.938      0.349      -0.021       0.007
x271          -0.0041      0.007     -0.579      0.563      -0.018       0.010
x272          -0.0061      0.007     -0.854      0.393      -0.020       0.008
x273          -0.0067      0.007     -0.948      0.343      -0.021       0.007
x274          -0.0116      0.007     -1.635      0.103      -0.025       0.002
x275          -0.0097      0.007     -1.361      0.174      -0.024       0.004
x276           0.0008      0.007      0.111      0.911      -0.013       0.015
x277          -0.0017      0.007     -0.234      0.815      -0.016       0.012
Omnibus:                      141.574   Durbin-Watson:                   1.962
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              397.037
Skew:                           0.888   Prob(JB):                     6.09e-87
Kurtosis:                       5.959   Cond. No.                     2.76e+16
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.23e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

我们可以看到,特征 Male(性别)对模型没有贡献(见 x4),因此实际上可以从模型中删除该变量。移除该变量后,如果“调整后的 R 平方”与之前的模型相比没有变化,就可以得出结论:该特征确实对模型没有贡献。

# Fit an L2-penalised logistic regression (C=0.1) on the training split.
lr = LogisticRegression(
    penalty='l2',
    C=0.1,
    random_state=42,
)
lr.fit(X_train_std, y_train)

# Score the fitted model on the very data it was trained on, so this is
# training accuracy, not a generalisation estimate.
lr_training_pred = lr.predict(X_train_std)
lr_training_acc = accuracy_score(y_true=y_train, y_pred=lr_training_pred)
print('Accuracy of Logistic regression training set: ', round(lr_training_acc, 3))
Accuracy of Logistic regression training set:  0.992
# 10-fold cross-validation of the logistic regression model.
from sklearn.model_selection import KFold

# Shuffled folds with a fixed seed so the split is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    lr,                  # model
    X_train_std,         # feature matrix
    y_train,             # target vector
    cv=kf,               # cross-validation technique
    scoring='accuracy',  # evaluation metric
    n_jobs=-1,           # use all CPU cores
)
# Mean and standard deviation of the per-fold accuracies.
print('10 fold CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
10 fold CV accuracy: 0.957 +/- 0.018
from sklearn.model_selection import cross_val_predict

# Report the mean of the per-fold accuracies in `scores` as a percentage.
print('the cross validated score for Logistic Regression Classifier is: ', round(scores.mean() * 100, 2))

# Out-of-fold predictions for every training row. Note that this passes the
# integer cv=10 rather than reusing the shuffled `kf` splitter.
y_pred = cross_val_predict(lr, X_train_std, y_train, cv=10)

# Draw the confusion matrix of true vs. out-of-fold labels as an annotated heatmap.
lr_cv_confusion = confusion_matrix(y_train, y_pred)
sns.heatmap(lr_cv_confusion, annot=True, fmt='3.0f', cmap='winter')
plt.title('Confusion_matrix', y=1.05, size=15)
the cross validated score for Logistic Regression Classifier is:  95.75
Text(0.5, 1.05, 'Confusion_matrix')


# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=400,
    min_samples_split=10,
    min_samples_leaf=1,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
    # scikit-learn 1.1 and later removed, so spell the value out explicitly.
    max_features='sqrt',
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train_std, y_train)

# Training accuracy: predictions are made on the same data used for fitting.
rf_training_pred = rf.predict(X_train_std)
rf_training_acc = accuracy_score(y_train, rf_training_pred)
print('Accuracy of Random Forest training set: ', round(rf_training_acc, 3))
Accuracy of Random Forest training set:  0.994
# 10-fold cross-validation of the random forest model.
# Shuffled folds with a fixed seed so the split is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    rf,                  # model
    X_train_std,         # feature matrix
    y_train,             # target vector
    cv=kf,               # cross-validation technique
    scoring='accuracy',  # evaluation metric
    n_jobs=-1,           # use all CPU cores
)
# Mean and standard deviation of the per-fold accuracies.
print('10 fold CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
10 fold CV accuracy: 0.966 +/- 0.011
from sklearn.model_selection import cross_val_predict

# Report the mean of the per-fold accuracies in `scores` as a percentage.
print('the cross validated score for Random Forest Classifier is: ', round(scores.mean() * 100, 2))

# Out-of-fold predictions for every training row. Note that this passes the
# integer cv=10 rather than reusing the shuffled `kf` splitter.
y_pred = cross_val_predict(rf, X_train_std, y_train, cv=10)

# Draw the confusion matrix of true vs. out-of-fold labels as an annotated heatmap.
rf_cv_confusion = confusion_matrix(y_train, y_pred)
sns.heatmap(rf_cv_confusion, annot=True, fmt='3.0f', cmap='winter')
plt.title('Confusion_matrix', y=1.05, size=15)
the cross validated score for Random Forest Classifier is:  96.62
Text(0.5, 1.05, 'Confusion_matrix')


# Test Models Performance
# Imported here so this cell is self-contained.
from sklearn.metrics import classification_report

# NOTE(review): both AUC values below are computed from hard class labels
# (`predict`), not from `predict_proba` scores, so they summarise a single
# operating point rather than the full ROC curve. Left as-is so the recorded
# outputs stay valid -- confirm whether probability-based AUC was intended.

print('\n\n ---Logistic Regression Model---')
lr_test_pred = lr.predict(X_test_std)  # predict once, reuse for both metrics
lr_roc_auc = roc_auc_score(y_test, lr_test_pred)
print('Logistic Regression AUC = %2.2f' % lr_roc_auc)
print(classification_report(y_test, lr_test_pred))

print('\n\n ---Random Forest Model---')
rf_test_pred = rf.predict(X_test_std)  # predict once, reuse for both metrics
rf_roc_auc = roc_auc_score(y_test, rf_test_pred)
print('Random Forest AUC = %2.2f' % rf_roc_auc)
print(classification_report(y_test, rf_test_pred))
 ---Logistic Regression Model---
Logistic Regression AUC = 0.91
              precision    recall  f1-score   support

           0       0.84      0.97      0.90        89
           1       0.97      0.86      0.91       111

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.90       200
weighted avg       0.91      0.91      0.91       200

 ---Random Forest Model---
Random Forest AUC = 0.93
              precision    recall  f1-score   support

           0       0.94      0.90      0.92        89
           1       0.92      0.95      0.94       111

    accuracy                           0.93       200
   macro avg       0.93      0.93      0.93       200
weighted avg       0.93      0.93      0.93       200


# ROC Graph
# Create the ROC graph for both classifiers on the test split.
from sklearn.metrics import roc_curve

# ROC points are derived from the positive-class probability (column 1).
fpr, tpr, thresholds = roc_curve(y_test, lr.predict_proba(X_test_std)[:,1])
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf.predict_proba(X_test_std)[:,1])

plt.figure()

# Plot Logistic Regression ROC
plt.plot(fpr, tpr, label='Logistic Regression (area=%0.2f)' % lr_roc_auc)

# Plot Random Forest ROC
plt.plot(rf_fpr, rf_tpr, label='Random Forest Classifier(area = %0.2f)' % rf_roc_auc)

# Plot Base Rate ROC (the diagonal)
plt.plot([0,1], [0,1], label='Base Rate')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc='lower right')


# Random Forest Feature Importances
columns = X.columns

# Wrap the training array in a DataFrame so each column carries its feature name.
train = pd.DataFrame(np.atleast_2d(X_train_std), columns=columns)

# Pair each feature name with its importance from the fitted forest and
# order the rows from most to least important.
feature_importances = pd.DataFrame(
    rf.feature_importances_,
    index=train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Move the feature names out of the index into a regular 'index' column.
feature_importances = feature_importances.reset_index()
0      Daily Internet Usage    0.364392
1  Daily Time Spent on Site    0.267391
2               Area Income    0.113468
sns.set(style='whitegrid')

# Initialise the matplotlib figure.
f, ax = plt.subplots(figsize=(13, 7))

# Bar chart of the first ten rows of `feature_importances`:
# importance on the x axis, feature name (the 'index' column) on the y axis.
top_features = feature_importances.iloc[:10]
sns.barplot(
    x='importance',
    y='index',
    data=top_features,
    label='Total',
    color='b',
)




    2022/11/19 21:16:57