gallery/data entry fields
LOGISTIC REGRESSION
CLASS WORK
LogisticRegression_prac_01
In [ ]:

Titan - Train dataset

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
In [ ]:
df = pd.read_csv('/content/titanic_train.csv')
In [ ]:
df.head(2)
Out[ ]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C

Exploratory Analysis on basic import of dataset

In [ ]:
df.isnull()
Out[ ]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False

891 rows × 12 columns

In [ ]:
sns.heatmap(df.isnull(), yticklabels= False, cbar = False, cmap = "Blues")
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9335ad1c10>
In [ ]:
sns.countplot(x='Survived', data = df)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f933c3f0750>
In [ ]:
sns.countplot(x = 'Survived', hue = "Sex", data = df)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f933c2d55d0>
In [ ]:
sns.countplot(x='Survived', hue = "Pclass", data = df)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f933c032c90>
In [ ]:
df.drop('PassengerId', inplace = True, axis = 1)
In [ ]:
sns.pairplot(df)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7f933998e890>
In [ ]:
df.head(2)
Out[ ]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
In [ ]:
sns.boxplot(x = 'Pclass', y = 'Age', data = df)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f933620eb50>
In [ ]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]

    if pd.isnull(Age) == True:
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 32
        else:
            return 24
    else:
        return Age
In [ ]:
df['Age'] = df[['Age', 'Pclass']].apply(impute_age, axis = 1)
In [ ]:
sns.heatmap(df.isnull(), cmap = 'Blues')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9335d6c950>
In [ ]:
df.head(2)
Out[ ]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
In [ ]:
df.drop(['Name', 'Ticket'], inplace = True, axis = 1)
In [ ]:
df.head(2)
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 0 3 male 22.0 1 0 7.2500 NaN S
1 1 1 female 38.0 1 0 71.2833 C85 C
In [ ]:
sex_df = pd.get_dummies(df['Sex'], drop_first=True)
In [ ]:
Embark_df = pd.get_dummies(df['Embarked'], drop_first = True)
In [ ]:
sex_df
Out[ ]:
male
0 1
1 0
2 0
3 0
4 1
... ...
886 1
887 0
888 0
889 1
890 1

891 rows × 1 columns

In [ ]:
Embark_df
Out[ ]:
Q S
0 0 1
1 0 0
2 0 1
3 0 1
4 0 1
... ... ...
886 0 1
887 0 1
888 0 1
889 0 0
890 1 0

891 rows × 2 columns

In [ ]:
df = pd.concat([df, sex_df, Embark_df], axis = 1)
In [ ]:
df.drop('Cabin', inplace = True, axis = 1)
In [ ]:
logreg = LogisticRegression()
In [ ]:
df
Out[ ]:
Survived Pclass Sex Age SibSp Parch Fare Embarked male Q S
0 0 3 male 22.0 1 0 7.2500 S 1 0 1
1 1 1 female 38.0 1 0 71.2833 C 0 0 0
2 1 3 female 26.0 0 0 7.9250 S 0 0 1
3 1 1 female 35.0 1 0 53.1000 S 0 0 1
4 0 3 male 35.0 0 0 8.0500 S 1 0 1
... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S 1 0 1
887 1 1 female 19.0 0 0 30.0000 S 0 0 1
888 0 3 female 24.0 1 2 23.4500 S 0 0 1
889 1 1 male 26.0 0 0 30.0000 C 1 0 0
890 0 3 male 32.0 0 0 7.7500 Q 1 1 0

891 rows × 11 columns

In [ ]:
df.drop('Sex', inplace = True, axis = 1)
In [ ]:
df.drop('Embarked', inplace = True, axis = 1)
In [ ]:
df
Out[ ]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 1 0 1
887 1 1 19.0 0 0 30.0000 0 0 1
888 0 3 24.0 1 2 23.4500 0 0 1
889 1 1 26.0 0 0 30.0000 1 0 0
890 0 3 32.0 0 0 7.7500 1 1 0

891 rows × 9 columns

In [ ]:
from sklearn.model_selection import train_test_split
In [ ]:
X = df.drop('Survived', axis = 1)
y = df['Survived']
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
In [ ]:
logreg.fit(X_train, y_train)
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
Out[ ]:
LogisticRegression()
In [ ]:
predictions = logreg.predict(X_test)
In [ ]:
predictions
Out[ ]:
array([0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1])
In [ ]:
df
Out[ ]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 1 0 1
887 1 1 19.0 0 0 30.0000 0 0 1
888 0 3 24.0 1 2 23.4500 0 0 1
889 1 1 26.0 0 0 30.0000 1 0 0
890 0 3 32.0 0 0 7.7500 1 1 0

891 rows × 9 columns

In [ ]:
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       154
           1       0.79      0.67      0.72       114

    accuracy                           0.78       268
   macro avg       0.79      0.77      0.77       268
weighted avg       0.78      0.78      0.78       268

In [ ]:
confusion_matrix(y_test, predictions)
Out[ ]:
array([[134,  20],
       [ 38,  76]])
In [ ]:
sns.heatmap(df, cmap = 'Blues')
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9335e6c3d0>
In [ ]:
df
Out[ ]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
... ... ... ... ... ... ... ... ... ...
886 0 2 27.0 0 0 13.0000 1 0 1
887 1 1 19.0 0 0 30.0000 0 0 1
888 0 3 24.0 1 2 23.4500 0 0 1
889 1 1 26.0 0 0 30.0000 1 0 0
890 0 3 32.0 0 0 7.7500 1 1 0

891 rows × 9 columns

In [ ]:
df.head(2)
Out[ ]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
In [ ]: