Tuesday, 14 February 2017

Python Program to Cluster Data using the K-medoid method

Here, our given data-set is :


A1
A2
O1
2
6
O2
3
4
O3
3
8
O4
4
7
O5
6
2
O6
6
4
O7
7
3
O8
7
4
O9
8
5
O10
7
6

We implement the K-medoid algorithm for this dataset as follows:


import random

data_set = [[2,6],[3,4],[3,8],[4,7],[6,2],[6,4],[7,3],[7,4],[8,5],[7,6]]
median = [1,7]
m = [[3,4],[7,4]]
c1 = []
c2 = []
count = 0

def assign_mediod():
    median.append(random.randint(0,9))
    while True:
        a = random.randint(0,9)
        if a <> median[0]:
            median.append(a)
            break
    print(median)
    m.append(data_set[median[0]])
    m.append(data_set[median[1]])

def manhattan_distance(a,b):
    return abs(a[0]-b[0]) + abs(a[1]-b[1])

def create_cluster():
    print("data_set = ",data_set)
    for i in range(0,10):
        if i == median[0] or i == median[1]:
            continue
        else:
            if manhattan_distance(data_set[i],m[0]) < manhattan_distance(data_set[i],m[1]):
                c1.append(data_set[i])
            else:
                c2.append(data_set[i])

def AE(c1,c2,m):
    total = 0
    for o in c1:
        total = total + manhattan_distance(o,m[0])
    for o in c2:
        total = total + manhattan_distance(o,m[1])
    return total

def Check_for_Swap():
    global m
    global median
    global c1
    global c2
    global count
    count = count + 1
    swap = False
    tm = list(m)
    tc1 = list(c1)
    tc2 = list(c2)
    for i in range(0,len(c1)):
        temp = tm[0]
        tm[0] = tc1[i]
        tc1[i] = temp
        print("Cost Function for old - ",c1,",",c2,",",m," = ",AE(c1,c2,m))
        print("Cost Function for new - ",tc1,",",tc2,",",tm," = ",AE(tc1,tc2,tm))
        if (AE(tc1,tc2,tm) - AE(c1,c2,m)) < 0:
            m = list(tm)
            swap = True
            break
        print("c1 iteration  = ",i)
    for i in range(0,len(c2)):
        temp = tm[1]
        tm[1] = tc2[i]
        tc2[i] = temp
        print("Cost Function for old - ",c1,",",c2,",",m," = ",AE(c1,c2,m))
        print("Cost Function for new - ",tc1,",",tc2,",",tm," = ",AE(tc1,tc2,tm))
        if (AE(tc1,tc2,tm) - AE(c1,c2,m)) < 0:
            m = list(tm)
            swap = True
            break
        print("c2 iteration  = ",i)
    median = [data_set.index(m[0]), data_set.index(m[1])]
    print ("After iteration -",count)
    print ("medoids = ",m)
    print("medians = ",median)
    print("c1 = ",c1)
    print("c2 = ",c2)
    if swap == True:
        c1 = []
        c2 = []
        create_cluster()
        Check_for_Swap()
    else:
        print(count)
        print(m)
        print(median)
        print(c1)
        print(c2)
            
if __name__ == "__main__":
    assign_mediod()
    print(m)
    print(median)
    create_cluster()
    print(c1)
    print(c2)
    Check_for_Swap()

Let us suppose that the two medoids, the program randomly assigns are [O2,O7], then we get the following output

[[3, 4], [7, 4]]
[1, 7]
('data_set = ', [[2, 6], [3, 4], [3, 8], [4, 7], [6, 2], [6, 4], [7, 3], [7, 4], [8, 5], [7, 6]])
[[2, 6], [3, 8], [4, 7]]
[[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]]
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[3, 4], [7, 4]], ' = ', 20)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 15)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[7, 4], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [6, 2]], ' = ', 26)
('c2 iteration  = ', 0)
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 15)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[7, 4], [6, 2], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [6, 4]], ' = ', 20)
('c2 iteration  = ', 1)
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 15)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[7, 4], [6, 2], [6, 4], [8, 5], [7, 6]], ',', [[2, 6], [7, 3]], ' = ', 20)
('c2 iteration  = ', 2)
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 15)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[7, 4], [6, 2], [6, 4], [7, 3], [7, 6]], ',', [[2, 6], [8, 5]], ' = ', 24)
('c2 iteration  = ', 3)
('Cost Function for old - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 15)
('Cost Function for new - ', [[3, 4], [3, 8], [4, 7]], ',', [[7, 4], [6, 2], [6, 4], [7, 3], [8, 5]], ',', [[2, 6], [7, 6]], ' = ', 24)
('c2 iteration  = ', 4)
('After iteration -', 1)
('medoids = ', [[2, 6], [7, 4]])
('medians = ', [0, 7])
('c1 = ', [[2, 6], [3, 8], [4, 7]])
('c2 = ', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]])
('data_set = ', [[2, 6], [3, 4], [3, 8], [4, 7], [6, 2], [6, 4], [7, 3], [7, 4], [8, 5], [7, 6]])
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[3, 4], [7, 4]], ' = ', 20)
('c1 iteration  = ', 0)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[3, 8], [7, 4]], ' = ', 18)
('c1 iteration  = ', 1)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[4, 7], [7, 4]], ' = ', 18)
('c1 iteration  = ', 2)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[7, 4], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[4, 7], [6, 2]], ' = ', 26)
('c2 iteration  = ', 0)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[7, 4], [6, 2], [7, 3], [8, 5], [7, 6]], ',', [[4, 7], [6, 4]], ' = ', 20)
('c2 iteration  = ', 1)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[7, 4], [6, 2], [6, 4], [8, 5], [7, 6]], ',', [[4, 7], [7, 3]], ' = ', 20)
('c2 iteration  = ', 2)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[7, 4], [6, 2], [6, 4], [7, 3], [7, 6]], ',', [[4, 7], [8, 5]], ' = ', 24)
('c2 iteration  = ', 3)
('Cost Function for old - ', [[3, 4], [3, 8], [4, 7]], ',', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]], ',', [[2, 6], [7, 4]], ' = ', 18)
('Cost Function for new - ', [[2, 6], [3, 4], [3, 8]], ',', [[7, 4], [6, 2], [6, 4], [7, 3], [8, 5]], ',', [[4, 7], [7, 6]], ' = ', 24)
('c2 iteration  = ', 4)
('After iteration -', 2)
('medoids = ', [[2, 6], [7, 4]])
('medians = ', [0, 7])
('c1 = ', [[3, 4], [3, 8], [4, 7]])
('c2 = ', [[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]])
2
[[2, 6], [7, 4]]
[0, 7]
[[3, 4], [3, 8], [4, 7]]
[[6, 2], [6, 4], [7, 3], [8, 5], [7, 6]]
>>> 

No comments:

Post a Comment