import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer


# Load the raw loan-application training data from the working directory.
train = pd.read_csv('train.csv')


# First rows, to confirm the file parsed as expected.
train.head()


# Work on an independent copy: the cells below add and re-encode columns on
# `df`, and a plain `df = train` would silently change the raw frame as well.
df = train.copy()
df.head()


# Automated EDA report, rendered inline in the notebook.
autoEDA = sweetviz.analyze(df)
autoEDA.show_notebook()


# Global seaborn look for every plot that follows.
sns.set_style('whitegrid')
sns.set_palette('dark')


# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Heatmap of the null mask of the raw data: highlighted cells are missing values.
plt.figure(figsize=(8, 8))
missing_mask = train.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

<AxesSubplot:>


# Number of fully duplicated rows.
df.duplicated().sum()

0


# Number of repeated loan identifiers.
df['Loan_ID'].duplicated().sum()

0


def _annotate_share(ax, total):
    """Label every bar of *ax* with its height as a percentage of *total*.

    Bars without a positive height (zero or NaN, e.g. an empty category/hue
    combination) are skipped so no stray "0.0%" / "nan%" labels are drawn.
    """
    for bar in ax.patches:
        height = bar.get_height()
        if not height > 0:
            continue
        label = '{:.1f}%'.format(100 * height / total)
        # Anchor at the horizontal middle of the bar; together with
        # ha='center' this centres the text on the bar rather than on its
        # right edge.
        x_mid = bar.get_x() + bar.get_width() / 2
        ax.annotate(label, (x_mid, height), ha='center')


# Every percentage below is a share of all records with a known loan status.
n_records = df['Loan_Status'].count()


ax = sns.countplot(x='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.show()

# Data is a bit imbalanced. 68.7% people get loan


# There is no information about defaulting in the training dataset
ax = sns.countplot(x='Gender', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.show()
# Chance of getting the loan is about the same across male and female group
# Lot more male than female in the dataset


# Not listed in the question but I think it is an interesting factor
ax = sns.countplot(x='Married', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.title('Loan and Marriage')
plt.show()
# People are more likely to get the loan if they are married


ax = sns.countplot(x='Dependents', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.show()
# Notice lot of people don't have dependent but get the loan
# Category with 2 dependents, majority of people get loan compare to other group
# Other group's loan status ratio is 2 to 1


ax = sns.countplot(x='Self_Employed', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.show()
# Chance of getting the loan are pretty much same across two group
# No significant pattern


ax = sns.countplot(x='Education', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.show()
# Chance of getting the loan is about the same across two group. Maybe slightly more likely if graduate


# Re-encode the 0/1 flag as 'No'/'Yes' labels; values outside the mapping
# (the missing ones) stay NaN. Later cells work with these string labels.
df['Credit_History'] = df['Credit_History'].map({0:'No',1:'Yes'})


ax = sns.countplot(x='Credit_History', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.xlabel(xlabel='Credit History')
plt.title('Credit History and Loan')
plt.show()
# Lot more likely to get loan if you have credit history almost never rejected
# We don't normally give loan to people who don't have credit history


ax = sns.countplot(x='Property_Area', hue='Loan_Status', data=df)
_annotate_share(ax, n_records)
plt.title('Loan and Property Area')
plt.show()
# About 70% of people from semiurban get loan
# Semiurban has higher chance to get loan than other group


# Overlaid histograms of the loan amount, one colour per loan status.
g = sns.FacetGrid(df, hue='Loan_Status', height=3, aspect=1.5).map(
    plt.hist, 'LoanAmount', bins=20, alpha=0.7)
plt.legend(loc='best', prop={'size': 10})
# The distributions of loan amount are about the same across two groups

<matplotlib.legend.Legend at 0x2653fcc2b50>


# Numeric version of the target: 'Y' -> 1, 'N' -> 0.
status_to_flag = {'Y': 1, 'N': 0}
df['Loan_Issue'] = df['Loan_Status'].map(status_to_flag)
df.head()


plt.figure(figsize=(12, 4))
# Correlate numeric columns only, selected explicitly: pandas >= 2.0 no longer
# drops text columns silently and DataFrame.corr() raises on them.
numeric_corr = df.select_dtypes(include='number').corr()
# Correlation of each numeric feature with the target, weakest to strongest.
numeric_corr['Loan_Issue'].drop('Loan_Issue').sort_values().plot(kind='bar')
# None of the continuous features is a good predictor. We will do feature engineering

<AxesSubplot:>


# Combined income of applicant and co-applicant.
df['Total_Income'] = df['ApplicantIncome']+df['CoapplicantIncome']


# Loan amount relative to the combined income.
df['e1_feature'] = df['LoanAmount']/df['Total_Income']


# Loan amount multiplied by the loan term.
df['e2_feature'] = df['LoanAmount']*df['Loan_Amount_Term']


plt.figure(figsize=(12, 4))
# Correlate numeric columns only, selected explicitly: pandas >= 2.0 no longer
# drops text columns silently and DataFrame.corr() raises on them.
numeric_corr = df.select_dtypes(include='number').corr()
# Same target-correlation chart as before, now including the new features.
numeric_corr['Loan_Issue'].drop('Loan_Issue').sort_values().plot(kind='bar')

<AxesSubplot:>


# Feature selection
# .copy() makes `select` an independent frame, so the imputation and feature
# updates in the following cells modify it directly instead of a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
select = df[['Education', 'Credit_History', 'Married', 'Dependents', 'ApplicantIncome','Property_Area', 'Total_Income',
             'Loan_Issue','CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'e1_feature', 'e2_feature']].copy()
select.head()


# Missing values per selected column.
select.isna().sum()

Education             0
Credit_History       50
Married               3
Dependents           15
ApplicantIncome       0
Property_Area         0
Total_Income          0
Loan_Issue            0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
e1_feature           22
e2_feature           36
dtype: int64


# Class balance of Credit_History; the imputation that follows splits the
# missing values in roughly this proportion.
select['Credit_History'].value_counts()
# Ratio between 'No' (0) and 'Yes' (1) is about 1 to 5.3

Yes    475
No      89
Name: Credit_History, dtype: int64


# Credit_History holds the labels 'No'/'Yes' at this point, so the fill values
# must be those labels: filling with 0.0/1.0 mixes types and later yields
# extra dummy columns such as 'Credit_History_1.0'.
# Fill the first 8 gaps with 'No' and the remaining ones with 'Yes' to keep
# roughly the observed 1 : 5.3 ratio. The result is assigned back because a
# chained `select[col].fillna(..., inplace=True)` may act on a temporary copy.
select['Credit_History'] = select['Credit_History'].fillna('No', limit=8)
select['Credit_History'] = select['Credit_History'].fillna('Yes')

E:\Anaconda\lib\site-packages\pandas\core\series.py:4463: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


# Class balance of Married; the imputation that follows apportions the missing
# values in roughly this proportion.
select['Married'].value_counts()
# Ratio between Yes and No is 1.86 to 1

Yes    398
No     213
Name: Married, dtype: int64


# Fill the first two gaps with 'Yes' and whatever is left with 'No', following
# the observed Yes : No ratio. Assigned back rather than using a chained
# `inplace=True`, which may operate on a temporary copy of the column.
select['Married'] = select['Married'].fillna('Yes', limit=2)
select['Married'] = select['Married'].fillna('No')


# Class balance of Dependents, used to apportion its missing values.
select['Dependents'].value_counts()
# Ratio between them is about 6.5:2:2:1

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64


# Dependents is a text column ('0', '1', '2', '3+'), so the fill values must be
# strings too: filling with the integers 0/1/2 creates separate categories and
# later duplicated dummy columns ('Dependents_1' twice, ...).
# 9 + 3 + 3 fills, in row order, roughly follow the observed category ratio.
# Assigned back instead of a chained `inplace=True` on a column selection.
select['Dependents'] = select['Dependents'].fillna('0', limit=9)
select['Dependents'] = select['Dependents'].fillna('1', limit=3)
select['Dependents'] = select['Dependents'].fillna('2', limit=3)


# Distribution of the loan term.
select['Loan_Amount_Term'].value_counts()
#Mostly 360. Fill the rest 14 with 360

360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
36.0       2
60.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64


# 360 is by far the most frequent term, so use it for the missing values.
# Assigned back instead of a chained `inplace=True` on a column selection.
select['Loan_Amount_Term'] = select['Loan_Amount_Term'].fillna(360)


# Correlation of every numeric column with LoanAmount. Numeric columns are
# selected explicitly because DataFrame.corr() raises on text columns in
# pandas >= 2.0.
select.select_dtypes(include='number').corr()['LoanAmount'].sort_values()
#Let's use a model to fill in this

Loan_Issue          -0.037318
Loan_Amount_Term     0.036981
e1_feature           0.166660
CoapplicantIncome    0.188619
ApplicantIncome      0.570909
Total_Income         0.624621
e2_feature           0.939293
LoanAmount           1.000000
Name: LoanAmount, dtype: float64


# Regression used to impute LoanAmount from the two income columns.
# Only rows missing one of the columns the model actually uses are dropped; a
# bare df.dropna() would also discard every row with a gap in an unrelated
# column (Gender, Self_Employed, Credit_History, ...), shrinking the training
# set for no reason.
income_cols = ['Total_Income', 'ApplicantIncome']
known = df.dropna(subset=['LoanAmount'] + income_cols)
X = known[income_cols]
y = known['LoanAmount']


from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed so the evaluation is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)


from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()


# Fitted slope of each predictor of the imputation model, as a one-column table.
corr = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
corr


# Predicted vs. actual loan amount on the held-out rows.
prediction = lm.predict(X_test)
plt.scatter(y_test, prediction)

<matplotlib.collections.PathCollection at 0x2653b6b4c10>


from sklearn import metrics
# Error of the imputation model on the held-out rows.
mse = metrics.mean_squared_error(y_test, prediction)
print('MAE:', metrics.mean_absolute_error(y_test, prediction))
print('MSE:', mse)
# The square root of the MSE is the RMSE (this line was previously mislabelled 'MAE').
print('RMSE:', np.sqrt(mse))
# The model is only used to fill the missing loan amounts

MAE: 44.945303851049445
MSE: 4741.932486734215
MAE: 68.86169099531476


# Predict a loan amount for every row from the two income columns, in the same
# column order the regression was fitted on.
income_cols = ['Total_Income', 'ApplicantIncome']
data = select[['LoanAmount'] + income_cols]
select['pred'] = lm.predict(data[income_cols])

<ipython-input-736-cdbb127a4a1f>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  select['pred'] = lm.predict(data.drop('LoanAmount', axis=1))


# Keep every positive recorded loan amount; anything else (including NaN, for
# which the comparison is False) is replaced by the model's prediction.
has_amount = select['LoanAmount'] > 0
select['LoanAmount'] = select['LoanAmount'].where(has_amount, select['pred'])
# The helper column is no longer needed.
select = select.drop(columns='pred')

<ipython-input-737-9a7ccfaadc95>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  select['LoanAmount'] = np.where(select['LoanAmount']>0, select['LoanAmount'], select['pred'])


# Recompute the derived columns now that LoanAmount and Loan_Amount_Term have
# no gaps, using the same formulas as when the features were first created.
select['Total_Income'] = select['ApplicantIncome'] + select['CoapplicantIncome']
select['e1_feature'] = select['LoanAmount'] / select['Total_Income']
# e2_feature is defined as amount * term; the recomputation previously divided
# instead, silently turning it into a different feature for every row.
select['e2_feature'] = select['LoanAmount'] * select['Loan_Amount_Term']


# Confirm that no missing values remain.
select.isna().sum()

Education            0
Credit_History       0
Married              0
Dependents           0
ApplicantIncome      0
Property_Area        0
Total_Income         0
Loan_Issue           0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
e1_feature           0
e2_feature           0
dtype: int64


# Names of the text (object-dtype) columns, which still need to be encoded.
select.select_dtypes(include=['object']).columns

Index(['Education', 'Credit_History', 'Married', 'Dependents',
       'Property_Area'],
      dtype='object')


# One-hot encode the categorical columns, dropping the first level of each,
# and swap the original text columns for the resulting indicator columns.
categorical_cols = ['Education', 'Credit_History', 'Married', 'Dependents', 'Property_Area']
dummies = pd.get_dummies(select[categorical_cols], drop_first=True)


select = select.drop(categorical_cols, axis=1)


select = pd.concat([select, dummies], axis=1)


select.columns

Index(['ApplicantIncome', 'Total_Income', 'Loan_Issue', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term', 'e1_feature', 'e2_feature',
       'Education_Not Graduate', 'Credit_History_1.0', 'Credit_History_No',
       'Credit_History_Yes', 'Married_Yes', 'Dependents_1', 'Dependents_2',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'Property_Area_Semiurban', 'Property_Area_Urban'],
      dtype='object')


# Target is the 0/1 Loan_Issue flag; every other column is a feature.
X = select.drop(columns='Loan_Issue')
y = select['Loan_Issue']


# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


# Min-max scaling: fitted on the training part only, then applied unchanged to
# the test part.
scaler = MinMaxScaler()


X_train = scaler.fit_transform(X_train)


X_test = scaler.transform(X_test)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix


# Logistic regression with default settings.
lg = LogisticRegression()
lg.fit(X_train, y_train)

LogisticRegression()


# Hold-out predictions of the logistic regression and their per-class report.
predic_log = lg.predict(X_test)


log_report = classification_report(y_test, predic_log)
print(log_report)

              precision    recall  f1-score   support

           0       0.95      0.44      0.60        43
           1       0.77      0.99      0.86        80

    accuracy                           0.80       123
   macro avg       0.86      0.71      0.73       123
weighted avg       0.83      0.80      0.77       123


from sklearn.neighbors import KNeighborsClassifier


# Find best K: misclassification rate for K = 1..39.
# NOTE(review): the error is measured on X_test/y_test, so a K picked from
# this curve is tuned on the test split and its reported score is optimistic.
k_values = range(1, 40)
error_rate = []

# Will take some time
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))


plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate,
         color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# K is picked by eye from this curve.
# NOTE(review): this cell originally said "I will use 16", but the model below
# is built with n_neighbors=20 - confirm which value is intended.

Text(0, 0.5, 'Error Rate')


# Final KNN classifier with a fixed neighbourhood size.
chosen_k = 20
knn = KNeighborsClassifier(n_neighbors=chosen_k)


knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=35)


# Hold-out predictions of the KNN model and their confusion matrix.
pred_KNN = knn.predict(X_test)


knn_confusion = confusion_matrix(y_test, pred_KNN)
print(knn_confusion)

[[16 27]
 [ 0 80]]


# Per-class precision / recall / F1 of the KNN model on the hold-out split.
knn_report = classification_report(y_test, pred_KNN)
print(knn_report)

              precision    recall  f1-score   support

           0       1.00      0.37      0.54        43
           1       0.75      1.00      0.86        80

    accuracy                           0.78       123
   macro avg       0.87      0.69      0.70       123
weighted avg       0.84      0.78      0.75       123


from sklearn.ensemble import RandomForestClassifier


# Forest of 100 shallow trees (depth 2) with a fixed seed.
forest_params = {'max_depth': 2, 'random_state': 0, 'n_estimators': 100}
rfc = RandomForestClassifier(**forest_params)


rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)


# Hold-out predictions of the random forest and their confusion matrix.
predic_rfc = rfc.predict(X_test)


rfc_confusion = confusion_matrix(y_test, predic_rfc)
print(rfc_confusion)

[[18 25]
 [ 1 79]]


# Per-class precision / recall / F1 of the random forest on the hold-out split.
rfc_report = classification_report(y_test, predic_rfc)
print(rfc_report)

              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


from sklearn.svm import SVC


# Support vector classifier with default settings, fitted on the scaled
# training split.
model = SVC()


model.fit(X_train, y_train)

SVC()


# Hold-out predictions of the default SVC and their confusion matrix.
predic_svc = model.predict(X_test)


svc_confusion = confusion_matrix(y_test, predic_svc)
print(svc_confusion)

[[18 25]
 [ 1 79]]


# Per-class precision / recall / F1 of the default SVC on the hold-out split.
svc_report = classification_report(y_test, predic_svc)
print(svc_report)

              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


# Search space for the RBF-kernel SVC: 6 values of C x 6 values of gamma.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000, 10000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'kernel': ['rbf'],
}


from sklearn.model_selection import GridSearchCV


# Exhaustive search over the grid; refit=True retrains the best combination,
# verbose=3 logs every fit.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)


grid.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=0.1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=0.1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=0.1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 1/5] END .......................C=1, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END .......................C=1, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END .......................C=1, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END .......................C=1, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END .......................C=1, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END .....................C=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END .....................C=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END .....................C=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END .....................C=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END .....................C=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ....................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ....................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ....................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ....................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ....................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=1, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 1/5] END ......................C=10, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END ......................C=10, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END ......................C=10, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END ......................C=10, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END ......................C=10, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=10, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=10, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=10, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=10, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=10, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=10, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=10, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=10, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=10, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=10, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=10, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=10, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=10, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=10, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=10, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 1/5] END .....................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END .....................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END .....................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END .....................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END .....................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=100, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=100, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=100, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=100, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=100, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=100, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=100, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=100, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=100, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=100, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=100, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=100, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=100, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=100, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=100, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=100, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=100, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=100, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=100, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=100, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=100, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=100, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=100, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=100, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=100, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 1/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=1000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=1000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=1000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=1000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=1000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=10000, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=10000, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=10000, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=10000, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=10000, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=10000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END .................C=10000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=10000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=10000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=10000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=10000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=10000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=10000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=10000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=10000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END ...............C=10000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END ...............C=10000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END ...............C=10000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END ...............C=10000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END ...............C=10000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ..............C=10000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ..............C=10000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ..............C=10000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ..............C=10000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 5/5] END ..............C=10000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 1/5] END ...............C=10000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 2/5] END ...............C=10000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 3/5] END ...............C=10000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 4/5] END ...............C=10000, gamma=1e-05, kernel=rbf; total time=   0.0s
[CV 5/5] END ...............C=10000, gamma=1e-05, kernel=rbf; total time=   0.0s

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000, 10000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
                         'kernel': ['rbf']},
             verbose=3)


# Parameter combination selected by the grid search.
grid.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


# Estimator refitted with the selected parameters (the search was created with refit=True).
grid.best_estimator_

SVC(C=1, gamma=0.1)


# Hold-out predictions of the tuned SVC and their confusion matrix.
grid_predictions = grid.predict(X_test)


grid_confusion = confusion_matrix(y_test, grid_predictions)
print(grid_confusion)

[[18 25]
 [ 1 79]]


# Per-class precision / recall / F1 of the tuned SVC on the hold-out split.
grid_report = classification_report(y_test, grid_predictions)
print(grid_report)
# The result isn't improving

              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123

Standard Bank Data Science Virtual Experience Programme

Credit / Home Loans

Import Libraries

Import Datasets

Part One: EDA

Sweetviz

Overview of the data

Data Quality Evaluation

Loan Statuses Distribution

By gender

Married

Have dependent

Loan by Employment type

Education

By Credit History

Property Area

Loan Amount

Part Two: Data Preparation

Data Preparation

Feature Selection

Handle Missing Values

Dummy, Split, Scale

Logistic Regression

KNN

Random Forests

Support Vector Machine

	Loan_ID	Gender	Married	Dependents	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area	Loan_Status
0	LP001002	Male	No	0	Graduate	No	5849	0.0	NaN	360.0	1.0	Urban	Y
1	LP001003	Male	Yes	1	Graduate	No	4583	1508.0	128.0	360.0	1.0	Rural	N
2	LP001005	Male	Yes	0	Graduate	Yes	3000	0.0	66.0	360.0	1.0	Urban	Y
3	LP001006	Male	Yes	0	Not Graduate	No	2583	2358.0	120.0	360.0	1.0	Urban	Y
4	LP001008	Male	No	0	Graduate	No	6000	0.0	141.0	360.0	1.0	Urban	Y

	Coefficient
Total_Income	0.005412
ApplicantIncome	0.001390