Predicting an outcome with machine learning models differs based on the nature of that outcome. Whether it is categorical or numerical, further improvements to the dataset are required, including cleaning, engineering features and ensuring data quality.
Ismail Ouahbi
Diamonds price prediction using Machine Learning Regression Models
The data is composed of 10 variables:
- price: price in US dollars (\$326--\$18,823)
- carat: weight of the diamond (0.2--5.01)
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour, from J (worst) to D (best)
- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- x: length in mm (0--10.74)
- y: width in mm (0--58.9)
- z: depth in mm (0--31.8)
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79); see the quick check after this list
- table: width of top of diamond relative to widest point (43--95)
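Since depth is a derived quantity, we can sanity-check the formula directly. The following is a minimal sketch (separate from the notebook's own cells) that reads the same CSV used below and compares the stored depth with the value recomputed from x, y and z:
import numpy as np
import pandas as pd
# load the diamonds data (same path as in the cells below)
df = pd.read_csv('../data/diamonds.csv')
# recompute the total depth percentage: 100 * 2 * z / (x + y)
recomputed = 100 * 2 * df['z'] / (df['x'] + df['y'])
# compare with the stored depth column; guard against the few rows where x = y = 0
diff = (recomputed - df['depth']).abs().replace([np.inf, -np.inf], np.nan)
print(diff.describe())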
Our target variable is price, which is a numerical outcome, so we will use regression algorithms (Linear Regression, Lasso Regression and Ridge Regression) to predict it.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#set seaborn style as default style
sns.set()
# load the dataset
path = '../data/diamonds.csv'
data = pd.read_csv(path)
# overview about data
data.head()
| | Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# data shape
data.shape
(53940, 11)
We have 53,940 rows and 11 columns (including the extra Unnamed: 0 index column, which we drop later).
# quick summary of the dataset
data.describe()
| | Unnamed: 0 | carat | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|
count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
mean | 26970.500000 | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
std | 15571.281097 | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
min | 1.000000 | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 13485.750000 | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
50% | 26970.500000 | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
75% | 40455.250000 | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
max | 53940.000000 | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
Note that describe() summarizes only the numerical columns.
# The Data type of each column
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  53940 non-null  int64
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object
 3   color       53940 non-null  object
 4   clarity     53940 non-null  object
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB
The columns are of type int, float and object (the object columns cut, color and clarity are categorical).
# check for null values
data.isnull().sum().sum()
0
# drop the unimportant column
data = data.drop(['Unnamed: 0'] , axis=1)
#check for duplicated values
data[data.duplicated(keep='first')]
| | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|
1005 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
1006 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
1007 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
1008 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
2025 | 1.52 | Good | E | I1 | 57.3 | 58.0 | 3105 | 7.53 | 7.42 | 4.28 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
47969 | 0.52 | Ideal | D | VS2 | 61.8 | 55.0 | 1919 | 5.19 | 5.16 | 3.20 |
49326 | 0.51 | Ideal | F | VVS2 | 61.2 | 56.0 | 2093 | 5.17 | 5.19 | 3.17 |
49557 | 0.71 | Good | F | SI2 | 64.1 | 60.0 | 2130 | 0.00 | 0.00 | 0.00 |
50079 | 0.51 | Ideal | F | VVS2 | 61.2 | 56.0 | 2203 | 5.19 | 5.17 | 3.17 |
52861 | 0.50 | Fair | E | VS2 | 79.0 | 73.0 | 2579 | 5.21 | 5.18 | 4.09 |
146 rows × 10 columns
# get their indexes
df_duplicated = data[data.duplicated(keep='first')]
# drop them from the original data frame
data = data.drop(df_duplicated.index , axis=0)
# verify again
data[data.duplicated(keep='first')]
carat | cut | color | clarity | depth | table | price | x | y | z
---|---|---|---|---|---|---|---|---|---
0 rows × 10 columns
# the number of entries for each color
data.color.value_counts()
G    11262
E     9776
F     9520
H     8272
D     6755
I     5407
J     2802
Name: color, dtype: int64
There are more diamonds with the best colour, D, than with the worst colour, J (refer to the data description at the top for details).
data
| | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
53936 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 |
53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 |
53938 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 |
53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
53794 rows × 10 columns
# length vs width
#-----scatter plot--------
sns.set_theme(color_codes=True)
sns.lmplot(x='x', y='y', data=data,height=6)
# adding log(y) to check for correlation
data['log(y)'] = np.log1p(data['y'])
sns.lmplot(x='x', y='log(y)', data=data,height=6)
A linear relationship exists between the two variables (y and x).
# the distribution of variables
fig,(ax1,ax2) = plt.subplots(1,2)
ax1.hist(data.x)
ax1.set_title('length(mm)')
ax2.hist(data.y)
ax2.set_title('width(mm)')
The length feature is close to a Gaussian distribution, whereas the width feature is slightly skewed to the right.
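To put a number on this visual impression, a quick check (a small sketch; skew() is also applied to the whole dataset later on) is to compute the sample skewness of both columns:
# skewness near 0 suggests a roughly symmetric, Gaussian-like shape;
# positive values indicate a right (positive) skew
print(data[['x', 'y']].skew())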
# distribution of each feature
# get only numerical variables
num_var = data.describe().columns
#plot the histogram of each feature
axList = data[num_var].hist(bins=18,figsize=(9,9))
# Add some x- and y- labels to columns and rows
for ax,title in zip(axList.flatten(),num_var.tolist()):
ax.set_title(title)
if ax.get_subplotspec().is_first_col():
ax.set_ylabel('Frequency')
Most features are skewed, while some of them are close to a normal distribution; we will apply transformations to some features in order to reduce the skewness.
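As a small sketch of such a transformation (the column is chosen only for illustration, not as the notebook's final preprocessing), we can compare the skewness of carat before and after applying np.log1p:
# compare skewness before and after a log1p transform
before = data['carat'].skew()
after = np.log1p(data['carat']).skew()
print(f'carat skewness before: {before:.3f}, after log1p: {after:.3f}')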
sns.set_context('talk')
sns.pairplot(data, hue='color', height=2)
The pairplot reveals several relationships between the features; adding polynomial features during feature engineering (as we do later with sklearn's PolynomialFeatures) will help us capture them.
# create list of numerical values
l = list(num_var)
# remove the added 'log(y)' column
l.remove('log(y)')
# loop through to create indexes
list_indexes=list()
for i in range(len(l)):
if(i%2==0 and i!=len(l)-1):
list_indexes.append((i,i+1))
if(i==len(l)-1):
list_indexes.append((i,1))
#generate a list of pair indexes
#check
list_indexes
[(0, 1), (2, 3), (4, 5), (6, 1)]
#check for correlation between each two numerical variables
h=1
plt.figure(figsize=(16,20))
for i,j in list_indexes:
plt.subplot(4, 1, h)
plt.scatter(data[num_var[i]],data[num_var[j]])
plt.title(f'the correlation of {num_var[i]} vs {num_var[j]}')
plt.xlabel(num_var[i])
plt.ylabel(num_var[j])
h+=1
plt.show()
# another powerful plot that builds a linear model
sns.lmplot(x='x', y='z', data=data,height=6)
sns.lmplot(x='x', y='y', data=data,height=6)
sns.lmplot(x='y', y='z', data=data,height=6)
sns.lmplot(x='carat', y='x', data=data,height=6)
sns.lmplot(x='carat', y='y', data=data,height=6)
sns.lmplot(x='carat', y='z', data=data,height=6)
plt.show()
The correlation between x and y is very strong, and we also notice a relationship between x and z. Adding the variable carat to the mix brings a lot of additional insight.
# create a new small dataframe
# remove the 'log(y)' column plus the target variable
small_data = data.drop(['price','log(y)'],axis=1)
# remove the "price" column from our list
l.remove('price')
# the correlation of each feature & the target
h=1
plt.figure(figsize=(10,35))
for i in small_data[l]:
plt.subplot(len(small_data.columns), 1, h)
h+=1
plt.scatter(y=data['price'],x=data[i])
plt.xlabel(i,fontdict={'fontsize':13})
plt.ylabel('price')
The relationship between each feature and our target variable is far from linear, but they are strongly correlated.
# The correlation matrix (using a heatmap)
plt.figure(figsize=(10,10))
#Generating the correlation matrix
corr = data.corr()
#heatmap
sns.heatmap(corr,linewidth = 1,annot = True)
# let's create a dataframe of correlations and filter the weak ones
# indices of the lower triangle (including the diagonal) of the correlation matrix
tril_indexes = np.tril_indices_from(corr)
corr_array = np.array(corr)
# mask the redundant lower-triangle values with NaN
corr_array[tril_indexes] = np.nan
# preview result to understand
pd.DataFrame(corr_array)
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|---|---|---|
0 | NaN | 0.027861 | 0.181091 | 0.921548 | 0.975380 | 0.951908 | 0.953542 | 0.947523 |
1 | NaN | NaN | -0.297669 | -0.011048 | -0.025348 | -0.029389 | 0.094757 | -0.026035 |
2 | NaN | NaN | NaN | 0.126566 | 0.194855 | 0.183231 | 0.150270 | 0.187081 |
3 | NaN | NaN | NaN | NaN | 0.884504 | 0.865395 | 0.861208 | 0.852543 |
4 | NaN | NaN | NaN | NaN | NaN | 0.974592 | 0.970686 | 0.990493 |
5 | NaN | NaN | NaN | NaN | NaN | NaN | 0.951844 | 0.982435 |
6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.964685 |
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# create our correlation dataframe
corr_values = pd.DataFrame(corr_array, columns=corr.columns , index = corr.index)
corr_values
| | carat | depth | table | price | x | y | z | log(y) |
|---|---|---|---|---|---|---|---|---|
carat | NaN | 0.027861 | 0.181091 | 0.921548 | 0.975380 | 0.951908 | 0.953542 | 0.947523 |
depth | NaN | NaN | -0.297669 | -0.011048 | -0.025348 | -0.029389 | 0.094757 | -0.026035 |
table | NaN | NaN | NaN | 0.126566 | 0.194855 | 0.183231 | 0.150270 | 0.187081 |
price | NaN | NaN | NaN | NaN | 0.884504 | 0.865395 | 0.861208 | 0.852543 |
x | NaN | NaN | NaN | NaN | NaN | 0.974592 | 0.970686 | 0.990493 |
y | NaN | NaN | NaN | NaN | NaN | NaN | 0.951844 | 0.982435 |
z | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.964685 |
log(y) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# stack the dataframe to ensure data readability
corr_values = (
corr_values
.stack()
.to_frame()
.reset_index()
.rename(columns={
'level_0' : 'feature 1',
'level_1' : 'feature 2',
0 : 'correlation'
})
)
# get the absolute value of correlations for sorting
corr_values['abs correlation'] = corr_values.correlation.abs()
# sort by 'abs correlation' following a descending order
corr_values.sort_values(by='abs correlation',ascending=False)
| | feature 1 | feature 2 | correlation | abs correlation |
|---|---|---|---|---|
24 | x | log(y) | 0.990493 | 0.990493 |
26 | y | log(y) | 0.982435 | 0.982435 |
3 | carat | x | 0.975380 | 0.975380 |
22 | x | y | 0.974592 | 0.974592 |
23 | x | z | 0.970686 | 0.970686 |
27 | z | log(y) | 0.964685 | 0.964685 |
5 | carat | z | 0.953542 | 0.953542 |
4 | carat | y | 0.951908 | 0.951908 |
25 | y | z | 0.951844 | 0.951844 |
6 | carat | log(y) | 0.947523 | 0.947523 |
2 | carat | price | 0.921548 | 0.921548 |
18 | price | x | 0.884504 | 0.884504 |
19 | price | y | 0.865395 | 0.865395 |
20 | price | z | 0.861208 | 0.861208 |
21 | price | log(y) | 0.852543 | 0.852543 |
7 | depth | table | -0.297669 | 0.297669 |
14 | table | x | 0.194855 | 0.194855 |
17 | table | log(y) | 0.187081 | 0.187081 |
15 | table | y | 0.183231 | 0.183231 |
1 | carat | table | 0.181091 | 0.181091 |
16 | table | z | 0.150270 | 0.150270 |
13 | table | price | 0.126566 | 0.126566 |
11 | depth | z | 0.094757 | 0.094757 |
10 | depth | y | -0.029389 | 0.029389 |
0 | carat | depth | 0.027861 | 0.027861 |
12 | depth | log(y) | -0.026035 | 0.026035 |
9 | depth | x | -0.025348 | 0.025348 |
8 | depth | price | -0.011048 | 0.011048 |
# plot to see correlation scores
sns.set_context('talk')
sns.set_style('white')
corr_values['abs correlation'].hist(figsize=(12,8))
plt.xlabel('correlation')
plt.ylabel('frequency')
# Encode categorical variables (since there is an order, we'll use an ordinal encoder so our ML model understands the order)
from sklearn.preprocessing import OrdinalEncoder
# keep a copy of data intact
copy_data = data.copy()
# select the categorical (object) columns
categories = data.dtypes[data.dtypes == 'object'].to_frame().index
# preview
categories
Index(['cut', 'color', 'clarity'], dtype='object')
# we will prepare ordered lists of the distinct values for each categorical variable
ord_color = ['J','I','H','G','F','E','D']
ord_cut = data['cut'].value_counts().sort_values().index.tolist()
ord_clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
# add lists to a dictionary
ordered = {
'color':ord_color,
'cut':ord_cut,
'clarity':ord_clarity
}
data['color']
0        E
1        E
2        E
3        I
4        J
        ..
53935    D
53936    D
53937    D
53938    H
53939    D
Name: color, Length: 53794, dtype: object
# encode the categories using OrdinalEncoder
# perform encoding
for category in categories:
oec = OrdinalEncoder(categories=[ordered[category]])
data[category] = oec.fit_transform(data[[category]])
# verify
print('before')
print(copy_data[categories])
print('-'*20)
print('after')
print(data[categories])
before
              cut color clarity
0           Ideal     E     SI2
1         Premium     E     SI1
2            Good     E     VS1
3         Premium     I     VS2
4            Good     J     SI2
...           ...   ...     ...
53935       Ideal     D     SI1
53936        Good     D     SI1
53937   Very Good     D     SI1
53938     Premium     H     SI2
53939       Ideal     D     SI2
[53794 rows x 3 columns]
--------------------
after
       cut  color  clarity
0      4.0    5.0      1.0
1      3.0    5.0      2.0
2      1.0    5.0      4.0
3      3.0    1.0      3.0
4      1.0    0.0      1.0
...    ...    ...      ...
53935  4.0    6.0      2.0
53936  1.0    6.0      2.0
53937  2.0    6.0      2.0
53938  3.0    2.0      1.0
53939  4.0    6.0      1.0
[53794 rows x 3 columns]
Now we have all-numerical data.
# check for outliers
#via histograms
axList = data[num_var].drop('log(y)',axis=1).hist(bins=100, figsize=(20,20))
# via boxplots
data[num_var].drop('log(y)',axis=1).plot(kind="box",subplots=True,layout=(7,2),figsize=(15,20))
# reset data indexes (to keep the work clean)
data.reset_index(inplace=True,drop=True)
# IQR-based outlier detection: flag values outside [Q1 - factor*IQR, Q3 + factor*IQR]
def iqr_outlier(x,factor):
q1 = x.quantile(0.25)
q3 = x.quantile(0.75)
iqr = q3 - q1
min_ = q1 - factor * iqr
max_ = q3 + factor * iqr
result_ = pd.Series([0] * len(x))
result_[((x < min_) | (x > max_))] = 1
return result_
# after this, let's check the number of outliers for each numerical variable & turn them into nan values(for easy processing)
outliers_nb = dict()
for feature in l:
    # flag the outliers of this feature once
    flags = iqr_outlier(data[feature], 1.5)
    outliers_nb[feature] = flags.value_counts()[1]
    # get the list of "1" (outlier) indexes
    indexes = flags[flags == 1].index.tolist()
    # replace all values matching the given indexes with NaN
    data.loc[indexes, feature] = np.nan
# check results
outliers_nb
{'carat': 1873, 'depth': 2525, 'table': 604, 'x': 31, 'y': 28, 'z': 48}
#check for null values
data.isnull().sum()
carat     1873
cut          0
color        0
clarity      0
depth     2525
table      604
price        0
x           31
y           28
z           48
log(y)       0
dtype: int64
data.shape
(53794, 11)
# We will take the simple approach and delete the outliers (the largest count, 2525, represents under 5% of all rows)
data.dropna(inplace=True)
# another box-plotting
data[num_var].plot(kind="box",subplots=True,layout=(7,2),figsize=(15,20))
After analyzing the distributions of both x and y, we notice a major similarity between them; let's confirm this statistically.
# the last 30 values of x and y
data[['x','y']].tail(30)
| | x | y |
|---|---|---|
53763 | 5.63 | 5.67 |
53764 | 5.74 | 5.77 |
53765 | 5.43 | 5.38 |
53766 | 5.48 | 5.40 |
53767 | 5.84 | 5.81 |
53768 | 5.94 | 5.90 |
53769 | 5.84 | 5.86 |
53770 | 5.71 | 5.74 |
53771 | 6.12 | 6.09 |
53772 | 5.93 | 5.85 |
53773 | 5.89 | 5.87 |
53774 | 5.57 | 5.61 |
53775 | 5.59 | 5.65 |
53776 | 5.67 | 5.58 |
53777 | 5.80 | 5.84 |
53778 | 5.82 | 5.84 |
53779 | 5.95 | 5.97 |
53780 | 5.71 | 5.73 |
53782 | 6.03 | 5.96 |
53783 | 5.76 | 5.73 |
53784 | 5.79 | 5.74 |
53785 | 5.74 | 5.73 |
53786 | 5.71 | 5.76 |
53787 | 5.69 | 5.72 |
53788 | 5.69 | 5.73 |
53789 | 5.75 | 5.76 |
53790 | 5.69 | 5.75 |
53791 | 5.66 | 5.68 |
53792 | 6.15 | 6.12 |
53793 | 5.83 | 5.87 |
# statistics about x and y
data[['x','y']].describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
x | 49191.0 | 5.616158 | 1.027540 | 3.73 | 4.67 | 5.61 | 6.47 | 8.34 |
y | 49191.0 | 5.620879 | 1.021784 | 3.68 | 4.68 | 5.61 | 6.47 | 8.27 |
# distribution of x and y
data[['x','y']].hist(figsize=(8,5))
# scatter plot of x and y
plt.scatter(data['x'],data['y'])
# let's examine the ratio
(data['x']/data['y']).describe().to_frame().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
0 | 49191.0 | 0.998996 | 0.009738 | 0.749169 | 0.9925 | 0.995495 | 1.006766 | 1.615572 |
This suggests removing one of the two features, since it adds no additional information (given one of them, we can easily infer the other).
# drop the `x` variable
data = data.drop('x',axis=1)
# preview
data.head()
| | carat | cut | color | clarity | depth | table | price | y | z | log(y) |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | 4.0 | 5.0 | 1.0 | 61.5 | 55.0 | 326 | 3.98 | 2.43 | 1.605430 |
1 | 0.21 | 3.0 | 5.0 | 2.0 | 59.8 | 61.0 | 326 | 3.84 | 2.31 | 1.576915 |
3 | 0.29 | 3.0 | 1.0 | 3.0 | 62.4 | 58.0 | 334 | 4.23 | 2.63 | 1.654411 |
4 | 0.31 | 1.0 | 0.0 | 1.0 | 63.3 | 58.0 | 335 | 4.35 | 2.75 | 1.677097 |
5 | 0.24 | 2.0 | 0.0 | 5.0 | 62.8 | 57.0 | 336 | 3.96 | 2.48 | 1.601406 |
# save the New data for future
data.to_csv('clean_data.csv',index=False)
----------------------Execute from here------------------------------------
# import data
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# set seaborn as default style
sns.set()
data = pd.read_csv('clean_data.csv')
data.skew()
carat      0.729621
cut       -0.641315
color     -0.213516
clarity    0.526213
depth     -0.245751
table      0.377942
price      1.597436
y          0.256599
z          0.254492
log(y)     0.073074
dtype: float64
# Let's look visually at what happens to one of these features when we apply np.log1p
# Choose a field
field = "table"
# Create two "subplots" and a "figure" using matplotlib
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(16, 5))
# Create a histogram on the "ax_before" subplot
data[field].hist(ax=ax_before)
# Apply a log transformation (numpy syntax) to this column
data[field].apply(np.log1p).hist(ax=ax_after)
# Formatting of titles etc. for each subplot
ax_before.set(title='before np.log1p', ylabel='frequency', xlabel='value')
ax_after.set(title='after np.log1p', ylabel='frequency', xlabel='value')
fig.suptitle('Field "{}"'.format(field));
Next, we apply standardization to ensure that all features have the same scale. For example, a variable that ranges between 0 and 1000 will outweigh a variable that ranges between 0 and 1; using these variables without standardization effectively gives the variable with the larger range a weight of 1000 in the analysis. Transforming the data to comparable scales prevents this problem: typical data standardization procedures equalize the range and/or variability of the data.
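As a minimal illustration of what standardization does (using carat and price purely as an example here, since price is actually our target and is not scaled in the pipeline), StandardScaler subtracts each column's mean and divides by its standard deviation so that both columns end up on a comparable scale:
from sklearn.preprocessing import StandardScaler
# two columns with very different ranges, before and after scaling
example = data[['carat', 'price']]
scaled = pd.DataFrame(StandardScaler().fit_transform(example), columns=example.columns)
# after scaling, each column has mean ~0 and standard deviation ~1
print(example.describe().loc[['mean', 'std']])
print(scaled.describe().loc[['mean', 'std']])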
# review data
data.head()
| | carat | cut | color | clarity | depth | table | price | y | z | log(y) |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | 4.0 | 5.0 | 1.0 | 61.5 | 55.0 | 326 | 3.98 | 2.43 | 1.605430 |
1 | 0.21 | 3.0 | 5.0 | 2.0 | 59.8 | 61.0 | 326 | 3.84 | 2.31 | 1.576915 |
2 | 0.29 | 3.0 | 1.0 | 3.0 | 62.4 | 58.0 | 334 | 4.23 | 2.63 | 1.654411 |
3 | 0.31 | 1.0 | 0.0 | 1.0 | 63.3 | 58.0 | 335 | 4.35 | 2.75 | 1.677097 |
4 | 0.24 | 2.0 | 0.0 | 5.0 | 62.8 | 57.0 | 336 | 3.96 | 2.48 | 1.601406 |
# train test split
# library
from sklearn.model_selection import train_test_split
# split features and target
X = data.drop('price',axis=1)
y = data[['price']]
# perform splitting
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.40,random_state=42)
# define our pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.pipeline import Pipeline
ml_pipeline = Pipeline([
('std_scaler', StandardScaler()),
('lr',LinearRegression())
])
# launch the model
ml_pipeline.fit(X_train,y_train)
# score
ml_pipeline.score(X_test,y_test)
0.9094574950224463
# define a new pipeline
# import polynomial features package
from sklearn.preprocessing import PolynomialFeatures
# we'll limit the degree to 3
pf = PolynomialFeatures(degree=3)
ml2_pipeline = Pipeline([
('std_scaler', StandardScaler()),
('p_features',pf),
('lr',LinearRegression())
])
# launch the model again
# launch the model
ml2_pipeline.fit(X_train,y_train)
# score
ml2_pipeline.score(X_test,y_test)
0.979122879493727
# score for training data
ml2_pipeline.score(X_train,y_train)
0.9819461912261326
# import package
from sklearn.model_selection import cross_val_score
# Launch the model
scores = cross_val_score(ml2_pipeline,X_train,y_train,cv=10,scoring='r2')
# view final score(mean)
np.mean(scores)
0.26025273524455494
The cross-validated score is much worse. One technique that can help here is regularization; let's explore it.
# import necessary packages
from sklearn.model_selection import GridSearchCV
# import Lasso Reg model
from sklearn.linear_model import Lasso
# define a new pipeline for Lasso Reg
# note the naming convention: each pipeline step name must match the '<step>__<hyperparameter>' keys in the params grid
ml3_pipeline = Pipeline([
('std_scaler', StandardScaler()),
('polynomial_features',PolynomialFeatures()),
('lasso_regression',Lasso())
])
# define the combination of hyperparameters
params = {
'polynomial_features__degree': [1, 2, 3],
'lasso_regression__alpha': np.geomspace(1e-5, 1e5, num=30)
}
# define the gridsearch
grid = GridSearchCV(ml3_pipeline, params, cv=10)
grid.fit(X_train,y_train)
# show scores and best params
grid.best_score_, grid.best_params_
(0.9803178376511497, {'lasso_regression__alpha': 0.3039195382313201, 'polynomial_features__degree': 3})
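To actually see the L1 regularization at work, one optional check (a sketch, not part of the original workflow) is to pull the fitted Lasso out of the best pipeline and count how many polynomial coefficients were shrunk to exactly zero:
# grid.best_estimator_ is the pipeline refitted with the best hyperparameters
best_lasso = grid.best_estimator_.named_steps['lasso_regression']
coefs = best_lasso.coef_
print(f'{(coefs == 0).sum()} of {coefs.size} polynomial coefficients are exactly zero')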
# full cross-validation results
grid.cv_results_
# review the r2-score
from sklearn.metrics import r2_score
y_pred_tst = grid.predict(X_test)
# for the test data (r2_score expects y_true first, then y_pred)
r2_score(y_test, y_pred_tst)
0.9792656721005385
# import Ridge Reg model
from sklearn.linear_model import Ridge
# define a new pipeline for Ridge Reg
# note the naming convention: each pipeline step name must match the '<step>__<hyperparameter>' keys in the params grid
ml4_pipeline = Pipeline([
('std_scaler', StandardScaler()),
('polynomial_features',PolynomialFeatures()),
('ridge_regression',Ridge())
])
# define the combination of hyperparameters
params = {
'polynomial_features__degree': [1, 2, 3],
'ridge_regression__alpha': np.geomspace(1e-5, 1e5, num=30)
}
# define the gridsearch
grid = GridSearchCV(ml4_pipeline, params, cv=10)
grid.fit(X_train,y_train)
# show scores and best params
grid.best_score_, grid.best_params_
(0.9801646585771057, {'polynomial_features__degree': 3, 'ridge_regression__alpha': 7.278953843983161})
# review the r2-score
from sklearn.metrics import r2_score
y_pred_tst = grid.predict(X_test)
# for the test data (r2_score expects y_true first, then y_pred)
r2_score(y_test, y_pred_tst)
0.9794036115480247
Stay tuned for more advanced projects, and don't forget to download my full project paper in high quality; to do so, click the button below: