Diskritisasi (Tugas 2)#

# Mount Google Drive inside the Colab runtime; the call blocks while it waits
# for the interactive authorization request to be answered.
from google.colab import drive
drive.mount('/content/drive')
KeyboardInterruptTraceback (most recent call last)
<ipython-input-1-d5df0069828e> in <module>
      1 from google.colab import drive
----> 2 drive.mount('/content/drive')

/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in mount(mountpoint, force_remount, timeout_ms, readonly)
    104       timeout_ms=timeout_ms,
    105       ephemeral=True,
--> 106       readonly=readonly)
    107 
    108 

/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in _mount(mountpoint, force_remount, timeout_ms, ephemeral, readonly)
    123   if ephemeral:
    124     _message.blocking_request(
--> 125         'request_auth', request={'authType': 'dfs_ephemeral'}, timeout_sec=None)
    126 
    127   mountpoint = _os.path.expanduser(mountpoint)

/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in blocking_request(request_type, request, timeout_sec, parent)
    169   request_id = send_request(
    170       request_type, request, parent=parent, expect_reply=True)
--> 171   return read_reply_from_input(request_id, timeout_sec)

/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in read_reply_from_input(message_id, timeout_sec)
     95     reply = _read_next_input_message()
     96     if reply == _NOT_READY or not isinstance(reply, dict):
---> 97       time.sleep(0.025)
     98       continue
     99     if (reply.get('type') == 'colab_reply' and

KeyboardInterrupt: 

Tugas Melakukan diskritisasi

Carilah data yang bertipe numerik (data klasifikasi)

  1. Lakukan proses diskritisasi dengan equal width dan equal frequency

  2. Lakukan proses diskritisasi dengan basis entropy

import pandas as pd
import numpy as np
import math

Diskritisasi dengan equal width#

rumus \( w=\frac{x_{max}-x_{min}}{k} \)

\[v_i=x_{min}+i*w\]

di mana i = 1, 2, 3, ..., k

# Load the Iris dataset from a public gist and preview its first five rows.
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length sepal.width petal.length petal.width variety
0 5.1 3.5 1.4 0.2 Setosa
1 4.9 3.0 1.4 0.2 Setosa
2 4.7 3.2 1.3 0.2 Setosa
3 4.6 3.1 1.5 0.2 Setosa
4 5.0 3.6 1.4 0.2 Setosa
def _equal_width_labels(iris, col, k=5):
    """Replace numeric column ``col`` of ``iris`` with equal-width bin labels.

    The range ``[min, max]`` of the column is cut into ``k`` bins of width
    ``w = (max - min) / k`` with upper edges ``v_i = min + i * w``.  A value
    ``x`` gets the label of the first bin whose upper edge satisfies
    ``x <= v_i``; anything above the last inner edge falls into the final bin.
    Labels are the capital letters ``"A"``, ``"B"``, ... in bin order.

    Args:
        iris: DataFrame that owns the column; it is modified in place.
        col: Name of the numeric column to discretize.
        k: Number of bins (5 reproduces the labels ``A``..``E``).

    Returns:
        None. The column is overwritten with the labels.
    """
    values = iris[col].to_numpy(dtype=float)
    if values.size == 0:
        # Nothing to label; leave the (empty) column untouched.
        return

    x_min = iris[col].min()
    x_max = iris[col].max()
    width = (x_max - x_min) / k

    # Inner cut points v_1 .. v_{k-1}; the last bin is open-ended upwards.
    edges = [x_min + i * width for i in range(1, k)]
    labels = np.array([chr(65 + i) for i in range(k)])

    # side="left" yields the first edge with value <= edge, i.e. the same
    # "<= v_i" rule as an if/elif ladder over the edges.
    bin_index = np.searchsorted(edges, values, side="left")

    # Assign the whole column at once: element-wise chained assignment
    # (iris[col][i] = ...) writes to a temporary and triggers
    # SettingWithCopyWarning, and it only works with a 0..n-1 index.
    iris[col] = labels[bin_index]


def equalwidth1(iris):
    """Discretize ``sepal.length`` in place into five equal-width bins A-E."""
    _equal_width_labels(iris, "sepal.length")


def equalwidth2(iris):
    """Discretize ``sepal.width`` in place into five equal-width bins A-E."""
    _equal_width_labels(iris, "sepal.width")


def equalwidth3(iris):
    """Discretize ``petal.length`` in place into five equal-width bins A-E."""
    _equal_width_labels(iris, "petal.length")


def equalwidth4(iris):
    """Discretize ``petal.width`` in place into five equal-width bins A-E."""
    _equal_width_labels(iris, "petal.width")


def equalwidth(iris):
    """Apply equal-width discretization to all four numeric Iris columns in place."""
    equalwidth1(iris)
    equalwidth2(iris)
    equalwidth3(iris)
    equalwidth4(iris)
# Discretize the four numeric columns of `iris` in place, then preview the result.
equalwidth(iris)
iris.head(5)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:47: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:66: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:91: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sepal.length sepal.width petal.length petal.width variety
0 B D A A Setosa
1 A C A A Setosa
2 A C A A Setosa
3 A C A A Setosa
4 A D A A Setosa

Diskritisasi dengan equal frequency#

# Reload the raw dataset: the equal-width step replaced the numeric columns with labels.
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length sepal.width petal.length petal.width variety
0 5.1 3.5 1.4 0.2 Setosa
1 4.9 3.0 1.4 0.2 Setosa
2 4.7 3.2 1.3 0.2 Setosa
3 4.6 3.1 1.5 0.2 Setosa
4 5.0 3.6 1.4 0.2 Setosa
# One single-column DataFrame per numeric feature.
# NOTE(review): none of these four names is referenced by the other cells visible here.
data_sepal_length = iris[["sepal.length"]]
data_sepal_width = iris[["sepal.width"]]
data_petal_length = iris[["petal.length"]]
data_petal_width = iris[["petal.width"]]    
def qcut(col, k, data=None):
    """Return the equal-frequency (quantile) bins of a column as ``[left, right]`` pairs.

    Args:
        col: Name of the numeric column to bin.
        k: Number of quantile bins requested from ``pandas.qcut``.
        data: DataFrame that holds ``col``.  When omitted, the notebook-level
            ``iris`` frame is used.

    Returns:
        A list with one ``[left, right]`` pair per bin, in ascending order.
    """
    frame = iris if data is None else data
    # The categories of the qcut result are the bins in ascending order.
    # Reading them from value_counts() instead orders them by how many rows
    # each bin received, which scrambles the interval list.
    bins = pd.qcut(frame[col], k).cat.categories
    return [[interval.left, interval.right] for interval in bins]
# Compute five equal-frequency bins for every numeric feature ...
interval_sepal_length = qcut("sepal.length", 5)
interval_sepal_width = qcut("sepal.width", 5)
interval_petal_length = qcut("petal.length", 5)
interval_petal_width = qcut("petal.width", 5)

# ... and show the [left, right] bounds of each bin.
print("interval sepal.length :", interval_sepal_length)
print("interval sepal.width  :", interval_sepal_width)
print("interval petal.length :", interval_petal_length)
print("interval petal.width  :", interval_petal_width)
interval sepal.length : [[5.0, 5.6], [4.2989999999999995, 5.0], [5.6, 6.1], [6.52, 7.9], [6.1, 6.52]]
interval sepal.width  : [[2.7, 3.0], [1.999, 2.7], [3.1, 3.4], [3.4, 4.4], [3.0, 3.1]]
interval petal.length : [[0.999, 1.5], [4.64, 5.32], [5.32, 6.9], [3.9, 4.64], [1.5, 3.9]]
interval petal.width  : [[1.16, 1.5], [0.099, 0.2], [1.9, 2.5], [0.2, 1.16], [1.5, 1.9]]
def equalfreq(list_interval, col, data=None):
    """Label every value of a column with the letter of the interval containing it.

    The intervals are sorted ascending and named ``"A"``, ``"B"``, ... in that
    order.  Each interval is treated as closed on both ends, and a value that
    sits on a boundary shared by two intervals receives the lower one.

    Args:
        list_interval: Sequence of ``[left, right]`` bounds, in any order.
        col: Name of the numeric column to label.
        data: DataFrame that holds ``col``.  When omitted, the notebook-level
            ``iris`` frame is used.

    Returns:
        A NumPy string array with one label per row of the frame.

    Raises:
        ValueError: If some value lies outside every interval.  Skipping such
            rows would return fewer labels than rows and silently misalign
            the result with the frame.
    """
    frame = iris if data is None else data
    values = frame[col].to_numpy(dtype=float)
    if values.size == 0:
        return np.array([], dtype=str)

    # Sort whole (left, right) pairs so the two bounds of an interval stay
    # together; sorting each column on its own is only correct by accident.
    bounds = np.array(
        sorted((float(left), float(right)) for left, right in list_interval),
        dtype=float,
    ).reshape(-1, 2)
    labels = np.array([chr(65 + i) for i in range(len(bounds))])

    # inside[r, j] is True when value r lies within interval j.
    inside = (values[:, None] >= bounds[:, 0]) & (values[:, None] <= bounds[:, 1])
    covered = inside.any(axis=1)
    if not covered.all():
        missing = values[~covered][0]
        raise ValueError(
            f"value {missing!r} in column {col!r} is not covered by any interval"
        )

    # argmax returns the first True per row, i.e. the lowest matching interval.
    return labels[inside.argmax(axis=1)].astype(str)
# Overwrite each numeric column with its equal-frequency bin label, then preview.
iris["sepal.length"] = equalfreq(interval_sepal_length, "sepal.length")
iris["sepal.width"] = equalfreq(interval_sepal_width, "sepal.width")
iris["petal.length"] = equalfreq(interval_petal_length, "petal.length")
iris["petal.width"] = equalfreq(interval_petal_width, "petal.width")
iris.head(5)
sepal.length sepal.width petal.length petal.width variety
0 B E A A Setosa
1 A B A A Setosa
2 A D A A Setosa
3 A C A A Setosa
4 A E A A Setosa

Diskritisasi basis entropy#

# Reload the raw dataset: the equal-frequency step overwrote the numeric columns with labels.
iris=pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")
iris.head(5)
sepal.length sepal.width petal.length petal.width variety
0 5.1 3.5 1.4 0.2 Setosa
1 4.9 3.0 1.4 0.2 Setosa
2 4.7 3.2 1.3 0.2 Setosa
3 4.6 3.1 1.5 0.2 Setosa
4 5.0 3.6 1.4 0.2 Setosa
# Take an independent copy: a bare column selection may be a slice of `iris`,
# and adding the "category" column to it later raises SettingWithCopyWarning.
sample = iris[["sepal.length"]].copy()
sample.describe()
sepal.length
count 150.000000
mean 5.843333
std 0.828066
min 4.300000
25% 5.100000
50% 5.800000
75% 6.400000
max 7.900000

Membuat category random untuk semua data

# Fix the seed so the random category assignment is reproducible.
np.random.seed(0)
# Draw 0 or 1 for every row; 0 becomes category "A" and 1 becomes category "B".
sample["category"] = np.where(np.random.choice(2, sample.shape[0]) < 1, "A", "B" )
sample
sample.shape[0]
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
150

Membuat fungsi getOverollCategory yang digunakan untuk menghitung jumlah data keseluruhan pada tiap kategori, yang nantinya digunakan untuk menghitung entropy

def getOverollCategory(col, data=None):
    """Count the rows of each category over the whole data set.

    Args:
        col: Column whose non-null values are counted per category.
        data: DataFrame with ``col`` and a ``"category"`` column holding
            ``"A"``/``"B"``.  When omitted, the notebook-level ``sample``
            frame is used.

    Returns:
        A tuple ``(count_A, count_B, count_A + count_B)``.
    """
    frame = sample if data is None else data
    group = frame.groupby("category").count()

    # Reindex so that a category with no rows counts as 0 instead of raising
    # KeyError on the lookups below.
    counts = group.reindex(["A", "B"], fill_value=0)
    a = counts.loc["A", col]
    b = counts.loc["B", col]

    # Printed on purpose: the notebook shows these counts next to the result.
    print(a, b, group)
    return (a, b, a + b)

Fungsi splitter digunakan untuk membagi data pada nilai (value) yang telah ditentukan, lalu mengembalikan jumlah data tiap kategori pada kedua bagian yang telah dipisahkan

def splitter(value: float, col: str, data=None) -> tuple:
    """Split the data at ``value`` and count the categories on each side.

    Rows with ``col <= value`` form the lower part, rows with ``col > value``
    the upper part.

    Args:
        value: Cut point applied to ``col``.
        col: Name of the numeric column to split on.
        data: DataFrame with ``col`` and a ``"category"`` column holding
            ``"A"``/``"B"``.  When omitted, the notebook-level ``sample``
            frame is used.

    Returns:
        A tuple ``(lower, upper)`` where each element is the list
        ``[count_A, count_B, count_A + count_B]`` for that part.  A category
        that does not occur in a part is reported as 0.
    """
    frame = sample if data is None else data

    # Partition the rows around the cut point.
    less = frame[frame[col] <= value]
    greater = frame[frame[col] > value]

    # Rows per category on each side.
    less_group = less.groupby("category").count()
    print(less_group)
    greater_group = greater.groupby("category").count()

    # A side may contain only one category (or no rows at all); reindexing
    # fills the missing category with 0 instead of raising KeyError.
    less_counts = less_group.reindex(["A", "B"], fill_value=0)
    greater_counts = greater_group.reindex(["A", "B"], fill_value=0)

    less_category_A = less_counts.loc["A", col]
    print(less_category_A)

    less_category_B = less_counts.loc["B", col]
    greater_category_A = greater_counts.loc["A", col]
    greater_category_B = greater_counts.loc["B", col]

    return (
        [less_category_A, less_category_B, less_category_A + less_category_B],
        [greater_category_A, greater_category_B, greater_category_A + greater_category_B],
    )

Buat Fungsi entropy untuk mencari nilai entropy

def entropy(d):
    """Return the Shannon entropy (base 2) of a class distribution.

    Args:
        d: Sequence ``(count_1, ..., count_n, total)``: the per-class counts
            followed by their total, e.g. ``(count_A, count_B, total)``.

    Returns:
        ``-sum(p_i * log2(p_i))`` with ``p_i = count_i / total``.  Classes with
        a zero count contribute nothing, and an empty distribution
        (``total == 0``) has entropy 0.
    """
    *counts, total = d
    if total == 0:
        # No rows at all: avoid dividing by zero.
        return 0.0

    result = 0.0
    for count in counts:
        # By convention 0 * log2(0) is 0; evaluating it literally gives NaN.
        if count > 0:
            p = count / total
            result -= p * np.log2(p)
    return result

Membuat fungsi info dan gain

def info(d):
    """Return the size-weighted entropy of the two partitions of a split.

    ``d`` holds two ``[count_A, count_B, total]`` lists (lower part first).
    Each part's entropy is weighted by its share of the rows of the
    notebook-level ``sample`` frame, and the two weighted terms are added.
    """
    n_rows = sample.shape[0]
    lower, upper = d[0], d[1]
    lower_term = (lower[2] / n_rows) * entropy(lower)
    upper_term = (upper[2] / n_rows) * entropy(upper)
    return lower_term + upper_term

Fungsi gain digunakan untuk menghitung selisih antara entropy awal dengan entropy yang baru

def gain(Einitial, Enew):
    """Return the information gain: the initial entropy minus the entropy after a split."""
    reduction = Einitial - Enew
    return reduction

Membuat Dinitial

# Category counts and entropy of the whole sample before any split;
# entropy_d is the baseline that each candidate split is compared against.
D = getOverollCategory("sepal.length")
entropy_d = entropy(D)
print(D)
print(entropy_d)
68 82           sepal.length
category              
A                   68
B                   82
(68, 82, 150)
0.993707106604508

Melakukan beberapa tes split untuk mencari informasi yang terbaik

Split 1: 4.4

# Candidate cut at 4.4: weighted entropy after the split, then its gain over entropy_d.
split1 = splitter(4.4, "sepal.length")
info_split1 = info(split1)
gain(entropy_d, info_split1)
          sepal.length
category              
A                    1
B                    3
1
0.003488151753460178

split 2: 5.5

# Candidate cut at 5.5: weighted entropy after the split, then its gain over entropy_d.
split2 = splitter(5.5, "sepal.length")
info_split2 = info(split2)
gain(entropy_d, info_split2)
          sepal.length
category              
A                   22
B                   37
22
0.012302155146638905

split 3: 7.0

# Candidate cut at 7.0: weighted entropy after the split, then its gain over entropy_d.
split3 = splitter(7.0, "sepal.length")
info_split3 = info(split3)
gain(entropy_d, info_split3)
          sepal.length
category              
A                   62
B                   76
62
0.0005490214732508658