Dissimilarity (Tugas 1)

Mengukur jarak (dissimilarity)

  • Ambil data dari Kaggle atau GitHub

  • Ukur jarak d(1,2), d(1,3), d(1,4) dari data tersebut

import pandas as pd

# Raw CSV file hosted on GitHub that serves as the data source.
url = 'https://raw.githubusercontent.com/Athpr123/Binary-Classification-Using-Machine-learning/master/dataset.csv'
# Download the CSV and parse it into a DataFrame (requires network access).
data=pd.read_csv(url)
# Preview the first rows of the DataFrame.
data.head()
ID Age Agency Agency Type Commision (in value) Destination Distribution Channel Duration Gender Net Sales Product Name Claim
0 45341 28 C2B Airlines 28.13 SINGAPORE Online 34 F 112.5 Silver Plan 1
1 12958 37 JZI Airlines 12.95 PHILIPPINES Online 53 F 37.0 Basic Plan 0
2 18233 27 EPX Travel Agency 0.00 UNITED STATES Online 28 NaN 13.0 Cancellation Plan 0
3 31742 36 EPX Travel Agency 0.00 SAUDI ARABIA Online 1 NaN 34.0 Cancellation Plan 0
4 14381 26 CWT Travel Agency 23.76 THAILAND Online 33 NaN 39.6 Rental Vehicle Excess Insurance 0
# Number of columns in the dataset (second entry of the shape tuple).
number_of_columns = data.shape[1]
# Let pandas display every column instead of truncating wide frames.
pd.set_option('display.max_columns', number_of_columns)
# NOTE(review): the row display limit is also set to the column count — confirm this is intended.
pd.set_option('display.max_rows', number_of_columns)
# List the column labels of the DataFrame.
data.columns
Index(['ID', 'Age', 'Agency', 'Agency Type', 'Commision (in value)',
       'Destination', 'Distribution Channel', 'Duration', 'Gender',
       'Net Sales', 'Product Name', 'Claim'],
      dtype='object')
# Preview the first five rows of three selected columns.
data[["Product Name","Agency Type", "Claim"]].head(5)
Product Name Agency Type Claim
0 Silver Plan Airlines 1
1 Basic Plan Airlines 0
2 Cancellation Plan Travel Agency 0
3 Cancellation Plan Travel Agency 0
4 Rental Vehicle Excess Insurance Travel Agency 0
# Labels that occur in the "Agency Type" column.
code_category_for_airlines = "Airlines"
code_category_for_travel = "Travel Agency"

# Binary codes the labels are translated to.
value_of_one = 1
value_of_zero = 0


def change_code_category_to_biner(category):
    """Encode an agency-type label as a binary value.

    Returns ``value_of_one`` when *category* equals the travel-agency label
    and ``value_of_zero`` for any other input.
    """
    if category == code_category_for_travel:
        return value_of_one
    return value_of_zero
# Replace every "Agency Type" label with its binary code.
# NOTE(review): running this a second time maps every value to 0, because the
# already-encoded integers no longer equal the "Travel Agency" label.
data["Agency Type"] = data["Agency Type"].apply(change_code_category_to_biner)
# Preview the encoded column next to "Product Name" and "Claim".
data[["Product Name","Agency Type", "Claim"]].head(5)
Product Name Agency Type Claim
0 Silver Plan 0 1
1 Basic Plan 0 0
2 Cancellation Plan 1 0
3 Cancellation Plan 1 0
4 Rental Vehicle Excess Insurance 1 0
# Constants
DECREMENT_BY_ONE = 1
INCREMENT_BY_ONE = 1

# Value pair (first object, second object) that identifies each cell of the
# binary contingency table for a single attribute:
#   q: 1 in both objects        r: 1 only in the first object
#   s: 1 only in the second     t: 0 in both objects
CONTINGENCY_TABLE_VALUE = {
    "q" : (1,1),
    "r" : (1,0),
    "s" : (0,1),
    "t" : (0,0),
}
def get_series(data, idx, series):
    """Look up row ``idx`` of *data*, restricted to the column label(s) in ``series``.

    The lookup is label-based (``.loc``); the result is whatever ``.loc``
    yields for that row/column selection.
    """
    row_label, column_labels = idx, series
    return data.loc[row_label, column_labels]
def get_dissimilarity_dataset(data, series_index=None, series=None):
    """Build a small frame holding the two objects that are to be compared.

    Args:
        data: DataFrame the objects (rows) are taken from.
        series_index: Sequence whose first two items are the row labels of
            the objects to compare.
        series: Column labels (the attributes) to keep for both objects.

    Returns:
        The two selected rows concatenated side by side and then transposed,
        i.e. one row per object and one column per attribute when ``series``
        is a list of column labels.

    Raises:
        IndexError: If ``series_index`` holds fewer than two labels.
    """
    # None sentinels replace the former mutable ``[]`` defaults; an omitted
    # argument still behaves exactly like an empty list.
    series_index = [] if series_index is None else series_index
    series = [] if series is None else series

    first_series = get_series(data, series_index[0], series)
    second_series = get_series(data, series_index[1], series)
    # concat(axis=1) places each object in its own column; transpose so that
    # objects become rows and attributes become columns.
    return pd.concat([first_series, second_series], axis=1).T
# The helper returns objects as rows; transpose again to show each object as a column.
get_dissimilarity_dataset(data, [1,2], ["Agency Type", "Claim"]).T
1 2
Agency Type 0 1
Claim 0 0
# Show the two binary attributes for rows 0 through 5 (a .loc label slice includes the end label).
data.loc[0:5, ["Agency Type", "Claim"]]
Agency Type Claim
0 0 1
1 0 0
2 1 0
3 1 0
4 1 0
5 1 0
def count_contingency_value(data, start_index=0, last_index=1):
    """Count the binary contingency cells (q, r, s, t) for two objects.

    For every column of *data*, the values found in the label slice
    ``start_index:last_index`` are turned into a tuple and compared with the
    patterns in ``CONTINGENCY_TABLE_VALUE``; the counter of the matching cell
    is incremented. A column whose tuple matches no pattern is not counted.

    Args:
        data: DataFrame with the objects as rows and the binary attributes
            as columns.
        start_index: Row label where the slice starts (the first object).
        last_index: Row label where the slice ends (the second object).

    Returns:
        dict with the keys "q", "r", "s" and "t" mapped to their counts.
    """
    contingency_value = {"q": 0, "r": 0, "s": 0, "t": 0}

    for column in data.columns:
        # ``.loc`` slices by label and includes both end points. The slice is
        # the same for every pattern, so it is read once per column.
        # NOTE(review): assumes the slice yields exactly the two objects.
        pair = tuple(data.loc[start_index:last_index, column])
        for cell, pattern in CONTINGENCY_TABLE_VALUE.items():
            if pair == pattern:
                contingency_value[cell] += 1

    return contingency_value
# For each pair of rows: extract the two objects into their own frame, then
# count the contingency cells between them.
# NOTE(review): the count variables use two prefixes (c_d_* and c_data_*) —
# keep that in mind when referencing them later.
# d(1,2)
data_1_2 = get_dissimilarity_dataset(data, [1,2], ["Agency Type", "Claim"])
c_d_1_2 = count_contingency_value(data_1_2, 1, 2)
# d(1,3)
data_1_3 = get_dissimilarity_dataset(data, [1,3], ["Agency Type", "Claim"])
c_d_1_3 = count_contingency_value(data_1_3, 1, 3)
# d(1,4)
data_1_4 = get_dissimilarity_dataset(data, [1,4], ["Agency Type", "Claim"])
c_data_1_4 = count_contingency_value(data_1_4, 1, 4)
# d(1,5)
data_1_5 = get_dissimilarity_dataset(data, [1,5], ["Agency Type", "Claim"])
c_data_1_5 = count_contingency_value(data_1_5, 1, 5)

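Asymmetric Binary Dissimilarity Formula

\[ d(i, j) = \frac{r + s}{q + r + s} \]

Here q is the number of attributes equal to 1 in both objects, r the number equal to 1 only in object i, and s the number equal to 1 only in object j; the negative matches t are ignored.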
def measure_dissimilarity_binary_value_assymetric_distance(contingency_value):
    """Return the asymmetric binary dissimilarity ``(r + s) / (q + r + s)``.

    Args:
        contingency_value: Mapping that provides the counts "q", "r" and "s"
            (any other key, such as "t", is ignored).

    Returns:
        The dissimilarity as a float; for non-negative counts it lies in
        [0, 1]. When ``q + r + s`` is zero the result is 0.0.
    """
    mismatches = contingency_value["r"] + contingency_value["s"]
    denominator = contingency_value["q"] + mismatches
    if denominator == 0:
        # No attribute is 1 in either object, so the formula would be 0 / 0.
        # With no mismatches at all, the objects are treated as identical.
        return 0.0
    return mismatches / denominator
# Each distance is computed from the contingency counts of its own pair.
d_1_2 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_2)
d_1_3 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_3)
d_1_4 = measure_dissimilarity_binary_value_assymetric_distance(c_data_1_4)
d_1_2
1.0
d_1_3
1.0
d_1_4
1.0