Messidor_features (Tugas 8)

import pandas as pd
import numpy as np
from scipy.io import arff
# Parse the ARFF file. The result is indexable; element 0 holds the records
# that become the rows of the DataFrame.
data = arff.loadarff('messidor_features.arff')
records = data[0]
df = pd.DataFrame(records)

df
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 Class
0 1.0 1.0 22.0 22.0 22.0 19.0 18.0 14.0 49.895756 17.775994 5.270920 0.771761 0.018632 0.006864 0.003923 0.003923 0.486903 0.100025 1.0 b'0'
1 1.0 1.0 24.0 24.0 22.0 18.0 16.0 13.0 57.709936 23.799994 3.325423 0.234185 0.003903 0.003903 0.003903 0.003903 0.520908 0.144414 0.0 b'0'
2 1.0 1.0 62.0 60.0 59.0 54.0 47.0 33.0 55.831441 27.993933 12.687485 4.852282 1.393889 0.373252 0.041817 0.007744 0.530904 0.128548 0.0 b'1'
3 1.0 1.0 55.0 53.0 53.0 50.0 43.0 31.0 40.467228 18.445954 9.118901 3.079428 0.840261 0.272434 0.007653 0.001531 0.483284 0.114790 0.0 b'0'
4 1.0 1.0 44.0 44.0 44.0 41.0 39.0 27.0 18.026254 8.570709 0.410381 0.000000 0.000000 0.000000 0.000000 0.000000 0.475935 0.123572 0.0 b'1'
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1146 1.0 1.0 34.0 34.0 34.0 33.0 31.0 24.0 6.071765 0.937472 0.031145 0.003115 0.000000 0.000000 0.000000 0.000000 0.537470 0.116795 0.0 b'0'
1147 1.0 1.0 49.0 49.0 49.0 49.0 45.0 37.0 63.197145 27.377668 8.067688 0.979548 0.001552 0.000000 0.000000 0.000000 0.516733 0.124190 0.0 b'0'
1148 1.0 0.0 49.0 48.0 48.0 45.0 43.0 33.0 30.461898 13.966980 1.763305 0.137858 0.011221 0.000000 0.000000 0.000000 0.560632 0.129843 0.0 b'0'
1149 1.0 1.0 39.0 36.0 29.0 23.0 13.0 7.0 40.525739 12.604947 4.740919 1.077570 0.563518 0.326860 0.239568 0.174584 0.485972 0.106690 1.0 b'1'
1150 1.0 1.0 7.0 7.0 7.0 7.0 7.0 5.0 69.423565 7.031843 1.750548 0.046597 0.021180 0.008472 0.000000 0.000000 0.556192 0.088957 0.0 b'0'

1151 rows × 20 columns

# Readable names for the 20 columns, in their original order. The numeric
# suffix on the 'ma' and 'exudate' names is the column's position in the
# frame (ma2..ma7, exudate8..exudate15).
col_names = (
    ['quality', 'prescreen']
    + ['ma' + str(i) for i in range(2, 8)]
    + ['exudate' + str(i) for i in range(8, 16)]
    + ['euDist', 'diameter', 'amfm_class', 'label']
)
# Assign the flat list itself. Wrapping it in another list ([col_names])
# makes pandas build a MultiIndex: every column label becomes a tuple,
# df['label'] yields a one-column frame instead of a Series, and
# scikit-learn warns about non-string feature names.
df.columns = col_names
df.head()
quality prescreen ma2 ma3 ma4 ma5 ma6 ma7 exudate8 exudate9 exudate10 exudate11 exudate12 exudate13 exudate14 exudate15 euDist diameter amfm_class label
0 1.0 1.0 22.0 22.0 22.0 19.0 18.0 14.0 49.895756 17.775994 5.270920 0.771761 0.018632 0.006864 0.003923 0.003923 0.486903 0.100025 1.0 b'0'
1 1.0 1.0 24.0 24.0 22.0 18.0 16.0 13.0 57.709936 23.799994 3.325423 0.234185 0.003903 0.003903 0.003903 0.003903 0.520908 0.144414 0.0 b'0'
2 1.0 1.0 62.0 60.0 59.0 54.0 47.0 33.0 55.831441 27.993933 12.687485 4.852282 1.393889 0.373252 0.041817 0.007744 0.530904 0.128548 0.0 b'1'
3 1.0 1.0 55.0 53.0 53.0 50.0 43.0 31.0 40.467228 18.445954 9.118901 3.079428 0.840261 0.272434 0.007653 0.001531 0.483284 0.114790 0.0 b'0'
4 1.0 1.0 44.0 44.0 44.0 41.0 39.0 27.0 18.026254 8.570709 0.410381 0.000000 0.000000 0.000000 0.000000 0.000000 0.475935 0.123572 0.0 b'1'
# Feature table: a copy of df without the 'label' column.
df2 = df.drop(labels=['label'], axis=1)
df2
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:4150: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
quality prescreen ma2 ma3 ma4 ma5 ma6 ma7 exudate8 exudate9 exudate10 exudate11 exudate12 exudate13 exudate14 exudate15 euDist diameter amfm_class
0 1.0 1.0 22.0 22.0 22.0 19.0 18.0 14.0 49.895756 17.775994 5.270920 0.771761 0.018632 0.006864 0.003923 0.003923 0.486903 0.100025 1.0
1 1.0 1.0 24.0 24.0 22.0 18.0 16.0 13.0 57.709936 23.799994 3.325423 0.234185 0.003903 0.003903 0.003903 0.003903 0.520908 0.144414 0.0
2 1.0 1.0 62.0 60.0 59.0 54.0 47.0 33.0 55.831441 27.993933 12.687485 4.852282 1.393889 0.373252 0.041817 0.007744 0.530904 0.128548 0.0
3 1.0 1.0 55.0 53.0 53.0 50.0 43.0 31.0 40.467228 18.445954 9.118901 3.079428 0.840261 0.272434 0.007653 0.001531 0.483284 0.114790 0.0
4 1.0 1.0 44.0 44.0 44.0 41.0 39.0 27.0 18.026254 8.570709 0.410381 0.000000 0.000000 0.000000 0.000000 0.000000 0.475935 0.123572 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1146 1.0 1.0 34.0 34.0 34.0 33.0 31.0 24.0 6.071765 0.937472 0.031145 0.003115 0.000000 0.000000 0.000000 0.000000 0.537470 0.116795 0.0
1147 1.0 1.0 49.0 49.0 49.0 49.0 45.0 37.0 63.197145 27.377668 8.067688 0.979548 0.001552 0.000000 0.000000 0.000000 0.516733 0.124190 0.0
1148 1.0 0.0 49.0 48.0 48.0 45.0 43.0 33.0 30.461898 13.966980 1.763305 0.137858 0.011221 0.000000 0.000000 0.000000 0.560632 0.129843 0.0
1149 1.0 1.0 39.0 36.0 29.0 23.0 13.0 7.0 40.525739 12.604947 4.740919 1.077570 0.563518 0.326860 0.239568 0.174584 0.485972 0.106690 1.0
1150 1.0 1.0 7.0 7.0 7.0 7.0 7.0 5.0 69.423565 7.031843 1.750548 0.046597 0.021180 0.008472 0.000000 0.000000 0.556192 0.088957 0.0

1151 rows × 19 columns

# Target labels as a flat 1-D array. ravel() guarantees shape (n_samples,)
# even when df['label'] comes back as a one-column table, which otherwise
# makes scikit-learn emit a DataConversionWarning about a column-vector y.
y = df['label'].values.ravel()
y
array([[b'0'],
       [b'0'],
       [b'1'],
       ...,
       [b'0'],
       [b'1'],
       [b'0']], dtype=object)
from sklearn import preprocessing
# Create a label encoder and fit it on the raw class labels in y.
le = preprocessing.LabelEncoder()
le.fit(y)
/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
LabelEncoder()
# Overwrite y with the encoded form of the labels produced by the fitted encoder.
y = le.transform(y)
/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# Split features and labels in ONE call so each row stays paired with its
# label. Two independent calls only line up while shuffle=False; a single
# call stays correct if shuffling is ever enabled.
# shuffle=False keeps the original row order (first 80% train, last 20% test).
training, test, training_label, test_label = train_test_split(
    df2, y, train_size=0.8, test_size=0.2, shuffle=False
)

# Fit a Gaussian naive Bayes classifier on the training portion.
clf2 = GaussianNB()
clf2.fit(training, training_label)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
  FutureWarning,
GaussianNB()
from sklearn.metrics import accuracy_score
# Class-membership probabilities for every test row.
post = clf2.predict_proba(test)
# Take the second probability column and round it to the nearest integer,
# turning each probability into a hard 0/1 prediction.
probas = np.round(post[:, 1])
pred = probas
# Fraction of test rows whose prediction matches the true label.
akurasi = accuracy_score(test_label, pred)
print(akurasi)
0.645021645021645
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
  FutureWarning,