Diskritisasi (Tugas 2)
Contents
Diskritisasi (Tugas 2)#
from google.colab import drive
drive.mount('/content/drive')
KeyboardInterruptTraceback (most recent call last)
<ipython-input-1-d5df0069828e> in <module>
1 from google.colab import drive
----> 2 drive.mount('/content/drive')
/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in mount(mountpoint, force_remount, timeout_ms, readonly)
104 timeout_ms=timeout_ms,
105 ephemeral=True,
--> 106 readonly=readonly)
107
108
/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in _mount(mountpoint, force_remount, timeout_ms, ephemeral, readonly)
123 if ephemeral:
124 _message.blocking_request(
--> 125 'request_auth', request={'authType': 'dfs_ephemeral'}, timeout_sec=None)
126
127 mountpoint = _os.path.expanduser(mountpoint)
/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in blocking_request(request_type, request, timeout_sec, parent)
169 request_id = send_request(
170 request_type, request, parent=parent, expect_reply=True)
--> 171 return read_reply_from_input(request_id, timeout_sec)
/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in read_reply_from_input(message_id, timeout_sec)
95 reply = _read_next_input_message()
96 if reply == _NOT_READY or not isinstance(reply, dict):
---> 97 time.sleep(0.025)
98 continue
99 if (reply.get('type') == 'colab_reply' and
KeyboardInterrupt:
Tugas Melakukan diskritisasi
Carilah data yang bertipe numerik ( data klassifikasi)
Lakukan proses diskritisasi dengan equal width dan equal frequency
Lakukan proses diskritisasi dengan basis entropy
import pandas as pd
import numpy as np
import math
Diskritisasi dengan equal width#
rumus $\( w=\frac{x_{min}-x_{max}}k \)$
dimana i = 1,2,3,..k
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length | sepal.width | petal.length | petal.width | variety | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
def equalwidth1(iris):
Xmax=max(iris["sepal.length"])
Xmin=min(iris["sepal.length"])
w=(Xmax-Xmin)/5
v1=Xmin+(1*w)
v2=Xmin+(2*w)
v3=Xmin+(3*w)
v4=Xmin+(4*w)
v5=Xmin+(5*w)
for i in range(len(iris["sepal.length"])):
if iris["sepal.length"][i]<=v1:
iris["sepal.length"][i]="A"
elif iris["sepal.length"][i]>v1 and iris["sepal.length"][i]<=v2:
iris["sepal.length"][i]="B"
elif iris["sepal.length"][i]>v2 and iris["sepal.length"][i]<=v3:
iris["sepal.length"][i]="C"
elif iris["sepal.length"][i]>v3 and iris["sepal.length"][i]<=v4:
iris["sepal.length"][i]="D"
else:
iris["sepal.length"][i]="E"
def equalwidth2(iris):
Xmax=max(iris["sepal.width"])
Xmin=min(iris["sepal.width"])
w=(Xmax-Xmin)/5
v1=Xmin+(1*w)
v2=Xmin+(2*w)
v3=Xmin+(3*w)
v4=Xmin+(4*w)
v5=Xmin+(5*w)
for i in range(len(iris["sepal.width"])):
if iris["sepal.width"][i]<=v1:
iris["sepal.width"][i]="A"
elif iris["sepal.width"][i]>v1 and iris["sepal.width"][i]<=v2:
iris["sepal.width"][i]="B"
elif iris["sepal.width"][i]>v2 and iris["sepal.width"][i]<=v3:
iris["sepal.width"][i]="C"
elif iris["sepal.width"][i]>v3 and iris["sepal.width"][i]<=v4:
iris["sepal.width"][i]="D"
else:
iris["sepal.width"][i]="E"
def equalwidth3(iris):
Xmax=max(iris["petal.length"])
Xmin=min(iris["petal.length"])
w=(Xmax-Xmin)/5
v1=Xmin+(1*w)
v2=Xmin+(2*w)
v3=Xmin+(3*w)
v4=Xmin+(4*w)
v5=Xmin+(5*w)
for i in range(len(iris["petal.length"])):
if iris["petal.length"][i]<=v1:
iris["petal.length"][i]="A"
elif iris["petal.length"][i]>v1 and iris["petal.length"][i]<=v2:
iris["petal.length"][i]="B"
elif iris["petal.length"][i]>v2 and iris["petal.length"][i]<=v3:
iris["petal.length"][i]="C"
elif iris["petal.length"][i]>v3 and iris["petal.length"][i]<=v4:
iris["petal.length"][i]="D"
else:
iris["petal.length"][i]="E"
def equalwidth4(iris):
Xmax=max(iris["petal.width"])
Xmin=min(iris["petal.width"])
w=(Xmax-Xmin)/5
v1=Xmin+(1*w)
v2=Xmin+(2*w)
v3=Xmin+(3*w)
v4=Xmin+(4*w)
v5=Xmin+(5*w)
for i in range(len(iris["petal.width"])):
if iris["petal.width"][i]<=v1:
iris["petal.width"][i]="A"
elif iris["petal.width"][i]>v1 and iris["petal.width"][i]<=v2:
iris["petal.width"][i]="B"
elif iris["petal.width"][i]>v2 and iris["petal.width"][i]<=v3:
iris["petal.width"][i]="C"
elif iris["petal.width"][i]>v3 and iris["petal.width"][i]<=v4:
iris["petal.width"][i]="D"
else:
iris["petal.width"][i]="E"
def equalwidth(iris):
equalwidth1(iris)
equalwidth2(iris)
equalwidth3(iris)
equalwidth4(iris)
equalwidth(iris)
iris.head(5)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._setitem_single_block(indexer, value, name)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:47: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:66: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:91: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sepal.length | sepal.width | petal.length | petal.width | variety | |
---|---|---|---|---|---|
0 | B | D | A | A | Setosa |
1 | A | C | A | A | Setosa |
2 | A | C | A | A | Setosa |
3 | A | C | A | A | Setosa |
4 | A | D | A | A | Setosa |
Diskritisasi dengan equal frequency#
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length | sepal.width | petal.length | petal.width | variety | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
data_sepal_length = iris[["sepal.length"]]
data_sepal_width = iris[["sepal.width"]]
data_petal_length = iris[["petal.length"]]
data_petal_width = iris[["petal.width"]]
def qcut(col, k):
intervals = pd.qcut(iris[col], k).value_counts().index.to_list()
return [[interval.left, interval.right] for interval in intervals]
interval_sepal_length = qcut("sepal.length", 5)
interval_sepal_width = qcut("sepal.width", 5)
interval_petal_length = qcut("petal.length", 5)
interval_petal_width = qcut("petal.width", 5)
print("interval sepal.length :", interval_sepal_length)
print("interval sepal.width :", interval_sepal_width)
print("interval petal.length :", interval_petal_length)
print("interval petal.width :", interval_petal_width)
interval sepal.length : [[5.0, 5.6], [4.2989999999999995, 5.0], [5.6, 6.1], [6.52, 7.9], [6.1, 6.52]]
interval sepal.width : [[2.7, 3.0], [1.999, 2.7], [3.1, 3.4], [3.4, 4.4], [3.0, 3.1]]
interval petal.length : [[0.999, 1.5], [4.64, 5.32], [5.32, 6.9], [3.9, 4.64], [1.5, 3.9]]
interval petal.width : [[1.16, 1.5], [0.099, 0.2], [1.9, 2.5], [0.2, 1.16], [1.5, 1.9]]
def equalfreq(list_interval, col):
# mendapatkan panjang interval
length = len(list_interval)
# mengurutkan interval
sort_interval = np.sort(list_interval, axis=0)
# mendapatkan Category dari interval
categories = np.array([chr(65+i) for i in range(length)])[:, None]
# combine into interval data
intervals = np.hstack((sort_interval, categories))
newCol = []
for i, row in iris.iterrows():
data = row[col]
for interval in intervals:
if data >= interval[0].astype(float) and data <= interval[1].astype(float):
newCol.append(interval[2])
break
return np.array(newCol, dtype=str)
iris["sepal.length"] = equalfreq(interval_sepal_length, "sepal.length")
iris["sepal.width"] = equalfreq(interval_sepal_width, "sepal.width")
iris["petal.length"] = equalfreq(interval_petal_length, "petal.length")
iris["petal.width"] = equalfreq(interval_petal_width, "petal.width")
iris.head(5)
sepal.length | sepal.width | petal.length | petal.width | variety | |
---|---|---|---|---|---|
0 | B | E | A | A | Setosa |
1 | A | B | A | A | Setosa |
2 | A | D | A | A | Setosa |
3 | A | C | A | A | Setosa |
4 | A | E | A | A | Setosa |
Diskritisasi basis entropy#
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length | sepal.width | petal.length | petal.width | variety | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
sample = iris[["sepal.length"]]
sample.describe()
sepal.length | |
---|---|
count | 150.000000 |
mean | 5.843333 |
std | 0.828066 |
min | 4.300000 |
25% | 5.100000 |
50% | 5.800000 |
75% | 6.400000 |
max | 7.900000 |
Membuat category random untuk semua data
np.random.seed(0)
sample["category"] = np.where(np.random.choice(2, sample.shape[0]) < 1, "A", "B" )
sample
sample.shape[0]
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
150
Membuat fungsi getOverollCategory digunakan untuk menghitung data keseluruhan yang nantinya digunakan untuk menghitung entropy
def getOverollCategory(col):
group = sample.loc[:, :].groupby("category").count()
a = group.loc["A", col]
b = group.loc["B", col]
print(a,b, group)
return (a, b, a+b)
Fungsi splitter digunakan untuk membuat split antara value yang telah di tentukan lalu mengembalikan data yang telah di pisahkan
def splitter(value:float, col:str)->tuple:
# get data less and greater from value
less = sample[sample[col] <= value]
greater = sample[sample[col] > value]
# calculate into category for each data
less_group = less.loc[:, :].groupby("category").count()
print(less_group)
greater_group = greater.loc[:, :].groupby("category").count()
# get value based on category
less_category_A = less_group.loc["A", col]
print(less_category_A)
less_category_B = less_group.loc["B", col]
greater_category_A = greater_group.loc["A", col]
greater_category_B = greater_group.loc["B", col]
return (
[less_category_A, less_category_B, less_category_A + less_category_B],
[greater_category_A, greater_category_B, greater_category_A + greater_category_B]
)
Buat Fungsi entropy untuk mencari nilai entropy
def entropy(d):
r1 = (d[0] / d[2]) * np.log2(d[0] / d[2])
r2 = (d[1] / d[2]) * np.log2(d[1] / d[2])
return np.sum([r1, r2]) * -1
Membuat fungsi info dan gain
def info(d):
r1 = (d[0][2] / sample.shape[0]) * entropy(d[0])
r2 = (d[1][2] / sample.shape[0]) * entropy(d[1])
return r1 + r2
Fungsi gain untuk menghitung selsih antara entropy awal dengan yang baru
def gain(Einitial, Enew):
return Einitial - Enew
Membuat Dinitial
D = getOverollCategory("sepal.length")
entropy_d = entropy(D)
print(D)
print(entropy_d)
68 82 sepal.length
category
A 68
B 82
(68, 82, 150)
0.993707106604508
Melakukan beberapa tes split untuk mencri informasi yang terbaik
Split 1: 4.4
split1 = splitter(4.4, "sepal.length")
info_split1 = info(split1)
gain(entropy_d, info_split1)
sepal.length
category
A 1
B 3
1
0.003488151753460178
split 2: 5.5
split2 = splitter(5.5, "sepal.length")
info_split2 = info(split2)
gain(entropy_d, info_split2)
sepal.length
category
A 22
B 37
22
0.012302155146638905
split 3: 7.0
split3 = splitter(7.0, "sepal.length")
info_split3 = info(split3)
gain(entropy_d, info_split3)
sepal.length
category
A 62
B 76
62
0.0005490214732508658