-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbinaryClassificationLogisitcRegression.py
More file actions
47 lines (37 loc) · 1.42 KB
/
binaryClassificationLogisitcRegression.py
File metadata and controls
47 lines (37 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# importing libraries
import numpy as np
import sklearn as sk
import sklearn.datasets
import sklearn.linear_model
import sklearn.model_selection
from pprint import pprint
# Load the breast cancer dataset: X holds the feature matrix, y the class
# label of each sample.
dataset = sk.datasets.load_breast_cancer()
X = dataset.data
y = dataset.target

# Split into training and validation sets. The seed is fixed so the split
# (and therefore every accuracy printed below) is the same on every run;
# without it each run shuffles differently and the numbers drift.
X_train, X_val, y_train, y_val = sk.model_selection.train_test_split(
    X, y, random_state=0
)

# Train the model using scikit-learn. max_iter is raised well above the
# default of 100 because the features are not scaled and the solver
# typically stops before converging (ConvergenceWarning) otherwise.
model = sk.linear_model.LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Accuracy: the fraction of validation samples whose predicted label equals
# the ground-truth label (labels are compared, not probabilities).
accuracy = np.mean(y_pred == y_val)
print(f'accuracy is: {accuracy}')  # a value in [0, 1]; the original unseeded run printed ~0.937
# Approach 1 (vectorized): derive the predicted labels from the predicted
# probabilities using an explicit decision threshold. A sample is labelled
# 1.0 only when column 1 of predict_proba exceeds the threshold, else 0.0.
threshold = 0.7
probabilities = model.predict_proba(X_val)
y_pred2 = np.greater(probabilities[:, 1], threshold).astype(np.float64)
# Fraction of validation samples where the thresholded label matches y_val.
accuracy2 = (y_pred2 == y_val).mean()
print(f'accuracy2 is: {accuracy2}')  # 0.9440559440559441 in the original author's run
# Approach 1 again, written as an explicit for loop: apply the same
# threshold to each sample's column-1 probability, one sample at a time.
N = len(X_val)
# Every entry starts at 0, so only samples above the threshold need to be
# overwritten with 1 inside the loop.
y_pred3 = np.zeros(N)
for i, positive_proba in enumerate(probabilities[:, 1]):
    if positive_proba > threshold:
        y_pred3[i] = 1
# Fraction of validation samples where the loop-built label matches y_val.
accuracy3 = np.mean(y_pred3 == y_val)
print(f'accuracy3 is: {accuracy3}')  # 0.9440559440559441 in the original author's run
# accuracy2 and accuracy3 are always equal: both apply the same threshold
# to the same probabilities, once vectorized and once element by element.