Messidor_features (Tugas 8)
import pandas as pd
import numpy as np
from scipy.io import arff
# Read the ARFF file; per SciPy's loadarff the result is indexable, with the
# data records first. Only that first element is turned into a table.
data = arff.loadarff('messidor_features.arff')
records = data[0]
df = pd.DataFrame(records)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 22.0 | 22.0 | 22.0 | 19.0 | 18.0 | 14.0 | 49.895756 | 17.775994 | 5.270920 | 0.771761 | 0.018632 | 0.006864 | 0.003923 | 0.003923 | 0.486903 | 0.100025 | 1.0 | b'0' |
1 | 1.0 | 1.0 | 24.0 | 24.0 | 22.0 | 18.0 | 16.0 | 13.0 | 57.709936 | 23.799994 | 3.325423 | 0.234185 | 0.003903 | 0.003903 | 0.003903 | 0.003903 | 0.520908 | 0.144414 | 0.0 | b'0' |
2 | 1.0 | 1.0 | 62.0 | 60.0 | 59.0 | 54.0 | 47.0 | 33.0 | 55.831441 | 27.993933 | 12.687485 | 4.852282 | 1.393889 | 0.373252 | 0.041817 | 0.007744 | 0.530904 | 0.128548 | 0.0 | b'1' |
3 | 1.0 | 1.0 | 55.0 | 53.0 | 53.0 | 50.0 | 43.0 | 31.0 | 40.467228 | 18.445954 | 9.118901 | 3.079428 | 0.840261 | 0.272434 | 0.007653 | 0.001531 | 0.483284 | 0.114790 | 0.0 | b'0' |
4 | 1.0 | 1.0 | 44.0 | 44.0 | 44.0 | 41.0 | 39.0 | 27.0 | 18.026254 | 8.570709 | 0.410381 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.475935 | 0.123572 | 0.0 | b'1' |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1146 | 1.0 | 1.0 | 34.0 | 34.0 | 34.0 | 33.0 | 31.0 | 24.0 | 6.071765 | 0.937472 | 0.031145 | 0.003115 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.537470 | 0.116795 | 0.0 | b'0' |
1147 | 1.0 | 1.0 | 49.0 | 49.0 | 49.0 | 49.0 | 45.0 | 37.0 | 63.197145 | 27.377668 | 8.067688 | 0.979548 | 0.001552 | 0.000000 | 0.000000 | 0.000000 | 0.516733 | 0.124190 | 0.0 | b'0' |
1148 | 1.0 | 0.0 | 49.0 | 48.0 | 48.0 | 45.0 | 43.0 | 33.0 | 30.461898 | 13.966980 | 1.763305 | 0.137858 | 0.011221 | 0.000000 | 0.000000 | 0.000000 | 0.560632 | 0.129843 | 0.0 | b'0' |
1149 | 1.0 | 1.0 | 39.0 | 36.0 | 29.0 | 23.0 | 13.0 | 7.0 | 40.525739 | 12.604947 | 4.740919 | 1.077570 | 0.563518 | 0.326860 | 0.239568 | 0.174584 | 0.485972 | 0.106690 | 1.0 | b'1' |
1150 | 1.0 | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 69.423565 | 7.031843 | 1.750548 | 0.046597 | 0.021180 | 0.008472 | 0.000000 | 0.000000 | 0.556192 | 0.088957 | 0.0 | b'0' |
1151 rows × 20 columns
# Descriptive names for the 20 columns, in their original order. The numeric
# suffix on each 'ma'/'exudate' name is that column's positional index.
col_names = (
    ['quality', 'prescreen']
    + ['ma' + str(i) for i in range(2, 8)]
    + ['exudate' + str(i) for i in range(8, 16)]
    + ['euDist', 'diameter', 'amfm_class', 'label']
)
# Assign the flat list itself. Wrapping it in another list ([col_names]) makes
# pandas build a one-level MultiIndex, so every column label becomes a tuple:
# df['label'] then returns a 2-D frame and scikit-learn warns about
# non-string feature names.
df.columns = col_names
df.head()
quality | prescreen | ma2 | ma3 | ma4 | ma5 | ma6 | ma7 | exudate8 | exudate9 | exudate10 | exudate11 | exudate12 | exudate13 | exudate14 | exudate15 | euDist | diameter | amfm_class | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 22.0 | 22.0 | 22.0 | 19.0 | 18.0 | 14.0 | 49.895756 | 17.775994 | 5.270920 | 0.771761 | 0.018632 | 0.006864 | 0.003923 | 0.003923 | 0.486903 | 0.100025 | 1.0 | b'0' |
1 | 1.0 | 1.0 | 24.0 | 24.0 | 22.0 | 18.0 | 16.0 | 13.0 | 57.709936 | 23.799994 | 3.325423 | 0.234185 | 0.003903 | 0.003903 | 0.003903 | 0.003903 | 0.520908 | 0.144414 | 0.0 | b'0' |
2 | 1.0 | 1.0 | 62.0 | 60.0 | 59.0 | 54.0 | 47.0 | 33.0 | 55.831441 | 27.993933 | 12.687485 | 4.852282 | 1.393889 | 0.373252 | 0.041817 | 0.007744 | 0.530904 | 0.128548 | 0.0 | b'1' |
3 | 1.0 | 1.0 | 55.0 | 53.0 | 53.0 | 50.0 | 43.0 | 31.0 | 40.467228 | 18.445954 | 9.118901 | 3.079428 | 0.840261 | 0.272434 | 0.007653 | 0.001531 | 0.483284 | 0.114790 | 0.0 | b'0' |
4 | 1.0 | 1.0 | 44.0 | 44.0 | 44.0 | 41.0 | 39.0 | 27.0 | 18.026254 | 8.570709 | 0.410381 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.475935 | 0.123572 | 0.0 | b'1' |
# Feature table: every column except the class label.
df2 = df.drop(['label'], axis=1)
df2
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:4150: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
obj = obj._drop_axis(labels, axis, level=level, errors=errors)
quality | prescreen | ma2 | ma3 | ma4 | ma5 | ma6 | ma7 | exudate8 | exudate9 | exudate10 | exudate11 | exudate12 | exudate13 | exudate14 | exudate15 | euDist | diameter | amfm_class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 22.0 | 22.0 | 22.0 | 19.0 | 18.0 | 14.0 | 49.895756 | 17.775994 | 5.270920 | 0.771761 | 0.018632 | 0.006864 | 0.003923 | 0.003923 | 0.486903 | 0.100025 | 1.0 |
1 | 1.0 | 1.0 | 24.0 | 24.0 | 22.0 | 18.0 | 16.0 | 13.0 | 57.709936 | 23.799994 | 3.325423 | 0.234185 | 0.003903 | 0.003903 | 0.003903 | 0.003903 | 0.520908 | 0.144414 | 0.0 |
2 | 1.0 | 1.0 | 62.0 | 60.0 | 59.0 | 54.0 | 47.0 | 33.0 | 55.831441 | 27.993933 | 12.687485 | 4.852282 | 1.393889 | 0.373252 | 0.041817 | 0.007744 | 0.530904 | 0.128548 | 0.0 |
3 | 1.0 | 1.0 | 55.0 | 53.0 | 53.0 | 50.0 | 43.0 | 31.0 | 40.467228 | 18.445954 | 9.118901 | 3.079428 | 0.840261 | 0.272434 | 0.007653 | 0.001531 | 0.483284 | 0.114790 | 0.0 |
4 | 1.0 | 1.0 | 44.0 | 44.0 | 44.0 | 41.0 | 39.0 | 27.0 | 18.026254 | 8.570709 | 0.410381 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.475935 | 0.123572 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1146 | 1.0 | 1.0 | 34.0 | 34.0 | 34.0 | 33.0 | 31.0 | 24.0 | 6.071765 | 0.937472 | 0.031145 | 0.003115 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.537470 | 0.116795 | 0.0 |
1147 | 1.0 | 1.0 | 49.0 | 49.0 | 49.0 | 49.0 | 45.0 | 37.0 | 63.197145 | 27.377668 | 8.067688 | 0.979548 | 0.001552 | 0.000000 | 0.000000 | 0.000000 | 0.516733 | 0.124190 | 0.0 |
1148 | 1.0 | 0.0 | 49.0 | 48.0 | 48.0 | 45.0 | 43.0 | 33.0 | 30.461898 | 13.966980 | 1.763305 | 0.137858 | 0.011221 | 0.000000 | 0.000000 | 0.000000 | 0.560632 | 0.129843 | 0.0 |
1149 | 1.0 | 1.0 | 39.0 | 36.0 | 29.0 | 23.0 | 13.0 | 7.0 | 40.525739 | 12.604947 | 4.740919 | 1.077570 | 0.563518 | 0.326860 | 0.239568 | 0.174584 | 0.485972 | 0.106690 | 1.0 |
1150 | 1.0 | 1.0 | 7.0 | 7.0 | 7.0 | 7.0 | 7.0 | 5.0 | 69.423565 | 7.031843 | 1.750548 | 0.046597 | 0.021180 | 0.008472 | 0.000000 | 0.000000 | 0.556192 | 0.088957 | 0.0 |
1151 rows × 19 columns
# Class labels as a flat 1-D array. When the columns are a MultiIndex,
# df['label'] yields an (n_samples, 1) column vector, which scikit-learn
# flags with a DataConversionWarning; ravel() flattens it and leaves an
# already 1-D array unchanged in shape.
y = df['label'].values.ravel()
y
array([[b'0'],
[b'0'],
[b'1'],
...,
[b'0'],
[b'1'],
[b'0']], dtype=object)
from sklearn import preprocessing
# Learn the set of distinct class labels present in y.
le = preprocessing.LabelEncoder()
le.fit(y)
/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
LabelEncoder()
# Replace the raw labels in y with the codes assigned by the fitted encoder.
y=le.transform(y)
/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# Split features and labels in ONE call so their rows stay paired by
# construction (two separate calls only line up while shuffle=False).
# shuffle=False keeps the original row order: the leading 80% of rows are
# used for training and the trailing 20% are held out for testing.
training, test, training_label, test_label = train_test_split(
    df2, y, train_size=0.8, test_size=0.2, shuffle=False
)
# Fit a Gaussian naive Bayes classifier on the training portion.
clf2 = GaussianNB()
clf2.fit(training, training_label)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,
GaussianNB()
from sklearn.metrics import accuracy_score
# Per-class probabilities predicted for the held-out rows.
post = clf2.predict_proba(test)
# Take the second column (index 1) and round it to 0.0/1.0; the rounded
# values serve directly as the hard predictions.
probas = np.round(post[:, 1])
pred = probas
# Fraction of held-out rows whose prediction matches the encoded label.
akurasi = accuracy_score(test_label, pred)
print(akurasi)
0.645021645021645
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
FutureWarning,