Messidor_features (Tugas 8)

import pandas as pd
import numpy as np
from scipy.io import arff

# Read the ARFF file. loadarff returns a tuple whose first element holds the
# records; that element is what gets wrapped in a DataFrame.
ARFF_PATH = 'messidor_features.arff'
data = arff.loadarff(ARFF_PATH)
records = data[0]
df = pd.DataFrame(records)

df

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	Class
0	1.0	1.0	22.0	22.0	22.0	19.0	18.0	14.0	49.895756	17.775994	5.270920	0.771761	0.018632	0.006864	0.003923	0.003923	0.486903	0.100025	1.0	b'0'
1	1.0	1.0	24.0	24.0	22.0	18.0	16.0	13.0	57.709936	23.799994	3.325423	0.234185	0.003903	0.003903	0.003903	0.003903	0.520908	0.144414	0.0	b'0'
2	1.0	1.0	62.0	60.0	59.0	54.0	47.0	33.0	55.831441	27.993933	12.687485	4.852282	1.393889	0.373252	0.041817	0.007744	0.530904	0.128548	0.0	b'1'
3	1.0	1.0	55.0	53.0	53.0	50.0	43.0	31.0	40.467228	18.445954	9.118901	3.079428	0.840261	0.272434	0.007653	0.001531	0.483284	0.114790	0.0	b'0'
4	1.0	1.0	44.0	44.0	44.0	41.0	39.0	27.0	18.026254	8.570709	0.410381	0.000000	0.000000	0.000000	0.000000	0.000000	0.475935	0.123572	0.0	b'1'
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1146	1.0	1.0	34.0	34.0	34.0	33.0	31.0	24.0	6.071765	0.937472	0.031145	0.003115	0.000000	0.000000	0.000000	0.000000	0.537470	0.116795	0.0	b'0'
1147	1.0	1.0	49.0	49.0	49.0	49.0	45.0	37.0	63.197145	27.377668	8.067688	0.979548	0.001552	0.000000	0.000000	0.000000	0.516733	0.124190	0.0	b'0'
1148	1.0	0.0	49.0	48.0	48.0	45.0	43.0	33.0	30.461898	13.966980	1.763305	0.137858	0.011221	0.000000	0.000000	0.000000	0.560632	0.129843	0.0	b'0'
1149	1.0	1.0	39.0	36.0	29.0	23.0	13.0	7.0	40.525739	12.604947	4.740919	1.077570	0.563518	0.326860	0.239568	0.174584	0.485972	0.106690	1.0	b'1'
1150	1.0	1.0	7.0	7.0	7.0	7.0	7.0	5.0	69.423565	7.031843	1.750548	0.046597	0.021180	0.008472	0.000000	0.000000	0.556192	0.088957	0.0	b'0'

1151 rows × 20 columns

# Human-readable names for the 20 columns, in file order.
# The numeric suffix on the 'ma*' and 'exudate*' names is the column's
# position in the file (2..7 and 8..15 respectively).
col_names = (
    ['quality', 'prescreen']
    + ['ma' + str(i) for i in range(2, 8)]
    + ['exudate' + str(i) for i in range(8, 16)]
    + ['euDist', 'diameter', 'amfm_class', 'label']
)

# Assign the flat list of names directly. Wrapping it in another list
# (``[col_names]``) makes pandas build a one-level MultiIndex, so every column
# label becomes a tuple: column selection then returns 2-D frames, drop()
# emits a PerformanceWarning, and scikit-learn rejects the non-string feature
# names.
df.columns = col_names
df.head()

	quality	prescreen	ma2	ma3	ma4	ma5	ma6	ma7	exudate8	exudate9	exudate10	exudate11	exudate12	exudate13	exudate14	exudate15	euDist	diameter	amfm_class	label
0	1.0	1.0	22.0	22.0	22.0	19.0	18.0	14.0	49.895756	17.775994	5.270920	0.771761	0.018632	0.006864	0.003923	0.003923	0.486903	0.100025	1.0	b'0'
1	1.0	1.0	24.0	24.0	22.0	18.0	16.0	13.0	57.709936	23.799994	3.325423	0.234185	0.003903	0.003903	0.003903	0.003903	0.520908	0.144414	0.0	b'0'
2	1.0	1.0	62.0	60.0	59.0	54.0	47.0	33.0	55.831441	27.993933	12.687485	4.852282	1.393889	0.373252	0.041817	0.007744	0.530904	0.128548	0.0	b'1'
3	1.0	1.0	55.0	53.0	53.0	50.0	43.0	31.0	40.467228	18.445954	9.118901	3.079428	0.840261	0.272434	0.007653	0.001531	0.483284	0.114790	0.0	b'0'
4	1.0	1.0	44.0	44.0	44.0	41.0	39.0	27.0	18.026254	8.570709	0.410381	0.000000	0.000000	0.000000	0.000000	0.000000	0.475935	0.123572	0.0	b'1'

# Feature matrix: every column except the target.
TARGET_COLUMN = 'label'
df2 = df.drop(columns=[TARGET_COLUMN])
df2

/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:4150: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  obj = obj._drop_axis(labels, axis, level=level, errors=errors)

	quality	prescreen	ma2	ma3	ma4	ma5	ma6	ma7	exudate8	exudate9	exudate10	exudate11	exudate12	exudate13	exudate14	exudate15	euDist	diameter	amfm_class
0	1.0	1.0	22.0	22.0	22.0	19.0	18.0	14.0	49.895756	17.775994	5.270920	0.771761	0.018632	0.006864	0.003923	0.003923	0.486903	0.100025	1.0
1	1.0	1.0	24.0	24.0	22.0	18.0	16.0	13.0	57.709936	23.799994	3.325423	0.234185	0.003903	0.003903	0.003903	0.003903	0.520908	0.144414	0.0
2	1.0	1.0	62.0	60.0	59.0	54.0	47.0	33.0	55.831441	27.993933	12.687485	4.852282	1.393889	0.373252	0.041817	0.007744	0.530904	0.128548	0.0
3	1.0	1.0	55.0	53.0	53.0	50.0	43.0	31.0	40.467228	18.445954	9.118901	3.079428	0.840261	0.272434	0.007653	0.001531	0.483284	0.114790	0.0
4	1.0	1.0	44.0	44.0	44.0	41.0	39.0	27.0	18.026254	8.570709	0.410381	0.000000	0.000000	0.000000	0.000000	0.000000	0.475935	0.123572	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1146	1.0	1.0	34.0	34.0	34.0	33.0	31.0	24.0	6.071765	0.937472	0.031145	0.003115	0.000000	0.000000	0.000000	0.000000	0.537470	0.116795	0.0
1147	1.0	1.0	49.0	49.0	49.0	49.0	45.0	37.0	63.197145	27.377668	8.067688	0.979548	0.001552	0.000000	0.000000	0.000000	0.516733	0.124190	0.0
1148	1.0	0.0	49.0	48.0	48.0	45.0	43.0	33.0	30.461898	13.966980	1.763305	0.137858	0.011221	0.000000	0.000000	0.000000	0.560632	0.129843	0.0
1149	1.0	1.0	39.0	36.0	29.0	23.0	13.0	7.0	40.525739	12.604947	4.740919	1.077570	0.563518	0.326860	0.239568	0.174584	0.485972	0.106690	1.0
1150	1.0	1.0	7.0	7.0	7.0	7.0	7.0	5.0	69.423565	7.031843	1.750548	0.046597	0.021180	0.008472	0.000000	0.000000	0.556192	0.088957	0.0

1151 rows × 19 columns

# Target vector. ravel() guarantees a 1-D (n_samples,) array: selecting
# 'label' can yield an (n_samples, 1) column vector, which scikit-learn
# accepts only with a DataConversionWarning.
y = df['label'].values.ravel()
y

array([[b'0'],
       [b'0'],
       [b'1'],
       ...,
       [b'0'],
       [b'1'],
       [b'0']], dtype=object)

from sklearn import preprocessing

# Learn the set of distinct label values present in y.
le = preprocessing.LabelEncoder()
le.fit(y)

/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

LabelEncoder()

# Replace the raw label values with the integer codes learned by the encoder.
y = le.transform(y)

/usr/local/lib/python3.7/dist-packages/sklearn/preprocessing/_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# Ordered 80/20 hold-out split: shuffle=False keeps the rows in file order, so
# the first 80% train the model and the last 20% evaluate it.
# Features and labels go through a single train_test_split call so the two
# partitions are guaranteed to stay row-aligned (two independent calls only
# line up as long as no shuffling is ever enabled).
training, test, training_label, test_label = train_test_split(
    df2, y, train_size=0.8, test_size=0.2, shuffle=False
)

# Fit a Gaussian naive Bayes classifier on the training partition.
clf2 = GaussianNB()
clf2.fit(training, training_label)

/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
  FutureWarning,

GaussianNB()

from sklearn.metrics import accuracy_score

# Per-class posterior probabilities for the held-out rows.
post = clf2.predict_proba(test)
# Take the second probability column and round it to the nearest integer to
# obtain hard 0/1 predictions.
probas = np.round(post[:, 1])
pred = probas
# Fraction of held-out rows whose prediction matches the encoded label.
akurasi = accuracy_score(test_label, pred)
print(akurasi)

0.645021645021645

/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py:1692: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.
  FutureWarning,

My sample book

Messidor_features (Tugas 8)

Messidor_features (Tugas 8)