2016-09-23 65 views
-1

我已经实现了一个二维的 K-Means 算法。我想把它修改为支持 8 个维度,即数据点可以取 8 维的值,最后返回 8 维的质心。应该如何修改我的 K-Means 聚类算法,把维度增加到 8?

代码如下:

import random 
import math 

# Input variables (example settings, currently disabled)
# k = 3
# Threshold = 1

# Sample observations: seven rows of eight integer features each.
DATA = [
    [2, 1, 1, 2, 1, 1, 1, 5],
    [6, 8, 1, 3, 4, 3, 7, 1],
    [4, 1, 3, 2, 1, 3, 1, 1],
    [3, 1, 1, 2, 1, 2, 1, 1],
    [3, 1, 1, 1, 1, 2, 1, 1],
    [6, 1, 1, 1, 1, 7, 1, 1],
    [6, 10, 2, 8, 10, 7, 3, 3],
]

# Starting value for the running minimum in the nearest-centroid searches.
BIG_NUMBER = 1e10

# Module-level state: `data` collects the DataPoint objects and `centroids`
# collects the Centroid objects appended by the functions in this file.
data = []
centroids = []

class DataPoint:
    """A 2-D sample together with the index of the cluster it belongs to.

    Attributes:
        x: First coordinate of the point.
        y: Second coordinate of the point.
        clusterNumber: Index of the assigned cluster, or ``None`` while the
            point has not been assigned yet.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y
        # Start unassigned so get_cluster() is safe to call before
        # set_cluster() instead of raising AttributeError.
        self.clusterNumber = None

    def set_x(self, x):
        """Replace the first coordinate."""
        self.x = x

    def get_x(self):
        """Return the first coordinate."""
        return self.x

    def set_y(self, y):
        """Replace the second coordinate."""
        self.y = y

    def get_y(self):
        """Return the second coordinate."""
        return self.y

    def set_cluster(self, clusterNumber):
        """Record the index of the cluster this point is assigned to."""
        self.clusterNumber = clusterNumber

    def get_cluster(self):
        """Return the assigned cluster index, or ``None`` if unassigned."""
        return self.clusterNumber

class Centroid:
    """The 2-D centre of one cluster.

    Attributes:
        x: First coordinate of the centre.
        y: Second coordinate of the centre.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def set_x(self, x):
        """Replace the first coordinate."""
        self.x = x

    def get_x(self):
        """Return the first coordinate."""
        return self.x

    def set_y(self, y):
        """Replace the second coordinate."""
        self.y = y

    def get_y(self):
        """Return the second coordinate."""
        return self.y

# Initializing The Centroids 



def initialize_centroids(k, DATA):
    """Append ``k`` randomly placed centroids to the module-level list.

    Each centroid is drawn uniformly from the bounding box of the first two
    columns of ``DATA``. Rows are indexed rather than unpacked as ``x, y`` so
    that rows with more than two columns (such as the 8-column sample data)
    do not raise ``ValueError``; only columns 0 and 1 are used, which matches
    the columns ``initialize_datapoints`` reads.

    Args:
        k: Number of centroids to create.
        DATA: Sequence of rows, each with at least two numeric columns.

    Returns:
        The module-level ``centroids`` list, after the new centroids have
        been appended to it.

    Raises:
        ValueError: If ``DATA`` is empty (raised by ``min``/``max``).
    """
    # Find the data range along the two coordinates that are used.
    xs = [row[0] for row in DATA]
    ys = [row[1] for row in DATA]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    # Choose a random position inside that range for every centroid.
    for _ in range(k):
        random_x = random.uniform(min_x, max_x)
        random_y = random.uniform(min_y, max_y)
        centroids.append(Centroid(random_x, random_y))

    return centroids

# Assigning Datapoints to nearest Centroids 

def initialize_datapoints(k,DATA):
    """Create a DataPoint per row of DATA and give it an initial cluster.

    Every row contributes its first two columns as the point's coordinates.
    The point is appended to the module-level ``data`` list and assigned to
    the first of the ``k`` centroids whose distance is strictly smaller than
    any seen before it, starting from ``BIG_NUMBER``.
    """
    for row in DATA:
        point = DataPoint(row[0], row[1])
        closest = BIG_NUMBER
        data.append(point)

        for idx in range(k):
            centre = centroids[idx]
            gap = get_distance(
                point.get_x(), point.get_y(), centre.get_x(), centre.get_y()
            )
            # Strict comparison: on a tie the earlier centroid is kept.
            if gap < closest:
                closest = gap
                point.set_cluster(idx)
    return

# Calculating Euclidean distance 

def get_distance(dataPointX, dataPointY, centroidX, centroidY):
    """Return the Euclidean distance between a data point and a centroid.

    Uses ``math.hypot`` instead of squaring with ``math.pow``: squaring a
    very large coordinate difference raises ``OverflowError`` even when the
    distance itself is representable as a float.

    Args:
        dataPointX: First coordinate of the data point.
        dataPointY: Second coordinate of the data point.
        centroidX: First coordinate of the centroid.
        centroidY: Second coordinate of the centroid.

    Returns:
        The non-negative distance as a float.
    """
    return math.hypot(centroidX - dataPointX, centroidY - dataPointY)

# Updating Centroid and Clusters till the threshold is met 

def update_centroids_n_clusters(k, DATA, Threshold, max_iterations=1000):
    """Alternate centroid updates and point reassignment until convergence.

    On every pass each of the ``k`` centroids is moved to the mean of the
    points currently assigned to it, and the distances the centroids moved
    are averaged. When that average is below ``Threshold`` the function
    stops; otherwise every point is reassigned to its nearest centroid and
    another pass is made.

    The passes run in a loop rather than through one recursive call per
    pass, so a slowly converging run cannot fail with ``RecursionError``.

    Args:
        k: Number of centroids/clusters.
        DATA: The input rows; only its length is used here, to decide how
            many entries of the module-level ``data`` list are reassigned.
        Threshold: Convergence limit for the mean centroid displacement.
        max_iterations: Upper bound on the number of passes. It keeps a
            non-positive ``Threshold`` (which can never be undercut) from
            looping forever. If the bound is hit, the function returns right
            after a reassignment step.

    Returns:
        None. The module-level ``centroids`` and ``data`` are updated in
        place.

    Raises:
        ZeroDivisionError: If ``k`` is 0.
    """
    for _ in range(max_iterations):
        total_shift = 0.0

        for j in range(k):
            centroid = centroids[j]
            prev_x = centroid.get_x()
            prev_y = centroid.get_y()

            members = [point for point in data if point.get_cluster() == j]

            # A cluster without members keeps its previous position.
            if members:
                centroid.set_x(sum(p.get_x() for p in members) / len(members))
                centroid.set_y(sum(p.get_y() for p in members) / len(members))

            total_shift += get_distance(
                centroid.get_x(), centroid.get_y(), prev_x, prev_y
            )

        # Mean distance the centroids moved during this pass.
        if total_shift / k < Threshold:
            return

        # Not converged yet: reassign every point to its nearest centroid.
        for i in range(len(DATA)):
            point = data[i]
            best_minimum = BIG_NUMBER
            current_cluster = 0

            for j in range(k):
                distance = get_distance(
                    point.get_x(),
                    point.get_y(),
                    centroids[j].get_x(),
                    centroids[j].get_y(),
                )
                # Strict comparison: on a tie the earlier centroid wins.
                if distance < best_minimum:
                    best_minimum = distance
                    current_cluster = j

            point.set_cluster(current_cluster)
    return

# Performing K_Means 

def Kmeans(k, DATA, Threshold):
    """Run the full k-means pipeline and print a per-cluster summary.

    Calls, in order, ``initialize_centroids``, ``initialize_datapoints`` and
    ``update_centroids_n_clusters``. Then, for each of the ``k`` clusters,
    it prints the centroid coordinates and how many of the first
    ``len(DATA)`` entries of the module-level ``data`` list belong to it.
    """
    initialize_centroids(k, DATA)
    initialize_datapoints(k, DATA)
    update_centroids_n_clusters(k, DATA, Threshold)

    for cluster_id in range(k):
        print()
        print("Centroid ", cluster_id, " is at")
        centre = centroids[cluster_id]
        print("(", centre.get_x(), ",", centre.get_y(), ")")

        print("Cluster ", cluster_id, " includes:")
        member_count = sum(
            1
            for row in range(len(DATA))
            if data[row].get_cluster() == cluster_id
        )
        print(member_count, "points")

    return

# Cluster the sample DATA into 3 groups with a convergence threshold of 0.1.
Kmeans(k=3, DATA=DATA, Threshold=0.1)

我应该如何修改这段代码中的 Centroid 类和 DataPoint 类?谢谢!!

注:该代码使用的是 Python 3

回答

1

使用数组来保存坐标,而不是单独的 x 和 y。

你需要的距离函数如下:

def distance(array1, array2):
    """Return the Euclidean distance between two points of any dimension.

    Squaring the difference alone yields one value per coordinate, not a
    distance; the squared differences must be summed and the square root
    taken, which is what ``numpy.linalg.norm`` does for a 1-D vector.

    Args:
        array1: Coordinates of the first point (NumPy array or sequence).
        array2: Coordinates of the second point, same length as ``array1``.

    Returns:
        The distance as a Python float.
    """
    # Imported here because the snippet only states that NumPy is assumed.
    import numpy as np

    diff = np.asarray(array1, dtype=float) - np.asarray(array2, dtype=float)
    return float(np.linalg.norm(diff))

(假设你使用 numpy)