Powered by GitBook

// Databricks notebook source exported at Tue, 28 Jun 2016 10:37:19 UTC

Scalable Data Science

Course Project - High Order Spectral Clustering

by Xin Zhao

supported by and

The html source url of this databricks notebook and its recorded Uji :

Example of High Order Spectral Clustering

Iris flower data set: The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris versicolor and Iris virginica).

setosa: , versicolor: , and virginica:

Based on Fisher's linear discriminant model, this data set became a typical test case for many statistical classification techniques in machine learning such as support vector machines. The use of this data set in cluster analysis however is uncommon, since the data set only contains two clusters with rather obvious separation. One of the clusters contains Iris setosa, while the other cluster contains both Iris virginica and Iris versicolor and is not separable without the species information Fisher used.

The table below shows the iris data set.


// Load the registered "fisheriris" table and render it as a notebook table.
val fisheririsDF: org.apache.spark.sql.DataFrame = sqlContext.table("fisheriris")
display(fisheririsDF)

Scatter plot

Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres.

Clustered using

k-means (left) and
true species in the data set (right).

The k-means result shown on Wikipedia, which appears to be incorrect.

Section 1. K-means clustering via MLlib KMeans


import org.apache.spark.mllib.clustering.{KMeansModel, KMeans}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{sql, SparkContext}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.functions.monotonicallyIncreasingId


/**
  * Clusters the selected feature columns with MLlib k-means and joins the predicted
  * cluster label back onto the input rows.
  *
  * Side effects: registers the temp tables "clusterLabel" (label/id pairs) and
  * "fisheriris" (bound to `fisheririsDF`), and prints the row count of every
  * (specie, label) combination.
  *
  * @param fisheririsDF input rows; the join below reads the columns sepalLength,
  *                     sepalWidth, petalLength, petalWidth, id and specie
  * @param feaNames     columns to select; must be non-empty and contain `idcolName`;
  *                     every other selected column is read as a Double feature
  * @param idcolName    name of the row-id column (read as Long)
  * @param nc           number of clusters to fit
  * @param sc           Spark context used to build the training and label RDDs
  * @param sqlContext   SQL context used for the temp tables and the join
  * @return the joined rows with an extra `label` column, ordered by id
  */
def getKMeanCluster(fisheririsDF:sql.DataFrame, feaNames:Array[String], idcolName: String,
                              nc:Int, sc: SparkContext, sqlContext: SQLContext ): sql.DataFrame ={
  require(feaNames.nonEmpty, "feaNames must contain at least one column name")

  val featureDF: sql.DataFrame = fisheririsDF.select(feaNames.head, feaNames.tail: _*)
  // Everything except the id column is a clustering feature.
  val feasName: Array[String] = featureDF.columns.filter(_ != idcolName)

  // Collect the rows exactly once so that ids and feature vectors are guaranteed to be
  // aligned position by position. (No cache is needed: this is the only action on featureDF.)
  val rows: Array[Row] = featureDF.collect()
  val outputID: Array[Long] = rows.map(_.getAs[Long](idcolName))
  val dataArray: Array[Vector] =
    rows.map(row => Vectors.dense(feasName.map(name => row.getAs[Double](name))))

  // The KMeans setters return the instance itself, so the configuration can be chained.
  val mycluster = new KMeans()
    .setK(nc)
    .setMaxIterations(10)
    .setInitializationSteps(5)
    .setEpsilon(1e-4)

  val data = sc.parallelize(dataArray)
  val clusterModel: KMeansModel = mycluster.run(data)

  // Pair each predicted label with the id of the row it was computed from.
  val labels: Array[(Int, Long)] = clusterModel.predict(data).collect().zip(outputID)

  import sqlContext.implicits._
  val labeldf = sc.makeRDD(labels).toDF("label", "id").cache()

  labeldf.registerTempTable("clusterLabel")
  fisheririsDF.registerTempTable("fisheriris")

  val fisheririsCluster = sqlContext.sql(
    "SELECT fisheriris.sepalLength, fisheriris.sepalWidth, " +
      "fisheriris.petalLength, fisheriris.petalWidth, " +
      "fisheriris.id, fisheriris.specie, clusterLabel.label FROM fisheriris, clusterLabel " +
      "WHERE fisheriris.id=clusterLabel.id"
  ).orderBy("id")

  // Print how many rows of each species ended up in each cluster.
  fisheririsCluster.select("specie", "label").groupBy("specie", "label").count().show()

  fisheririsCluster
}


// Columns handed to the clustering helpers: the four measurements plus the row id.
val feasName: Array[String] =
  Array("sepalLength", "sepalWidth", "petalLength", "petalWidth", "id")
// Number of clusters to fit.
val nc: Int = 3
// Tag every row with an id so predicted labels can be joined back onto the data.
val fisherDFwithIndex: sql.DataFrame =
  fisheririsDF.withColumn("id", monotonicallyIncreasingId).cache()
// Cluster on the raw features.
val clusterOnRawFeature: sql.DataFrame =
  getKMeanCluster(fisherDFwithIndex, feasName, "id", nc, sc, sqlContext)

K-means error ratio: 11.33%


// Expose the k-means result as temp table "kmeanTable" so the %py plotting cell can read it.
clusterOnRawFeature.registerTempTable("kmeanTable")
// Render the labelled rows as a notebook table.
display(clusterOnRawFeature)


%py
# Draw two side-by-side 3-D scatter plots of the iris samples (first three table columns):
# left = coloured by true species, right = coloured by the k-means cluster label.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D  # imported so that projection='3d' is available
import matplotlib.pyplot as plt

figpars = [('r', 'o'), ('m', 'x'), ('b', '^')]  # (colour, marker) per plotted group


def _scatter_by_label(ax, rows, labels, label_order, legend, title):
    """Scatter columns 0-2 of `rows` on `ax`, one colour/marker per label.

    ax          -- 3-D axes to draw on
    rows        -- collected table rows; columns 0, 1, 2 become x, y, z
    labels      -- numpy array with one integer label per row
    label_order -- label values in plotting order; the i-th one uses figpars[i]
    legend      -- legend entries, in the same order as label_order
    title       -- axes title
    """
    # List comprehensions (not map) so the arrays are built correctly on Python 2 and 3.
    fea1 = np.asarray([r[0] for r in rows])
    fea2 = np.asarray([r[1] for r in rows])
    fea3 = np.asarray([r[2] for r in rows])
    for (color, marker), lab in zip(figpars, label_order):
        mask = labels == lab
        ax.scatter(fea1[mask], fea2[mask], fea3[mask], c=color, marker=marker)
    ax.legend(legend)  # built once, after every group has been drawn
    ax.view_init(elev=10., azim=150)  # the final view the original azimuth loop ended on
    ax.axis('tight')
    ax.set_title(title)
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')


fig1 = plt.figure(figsize=(20, 10))

# Left: true species.
# NOTE(review): assumes the table holds 150 rows ordered as 50 setosa, 50 versicolor,
# 50 virginica -- confirm against the table contents.
true_rows = sqlContext.table("fisheriris").collect()
true_labels = np.repeat([0, 1, 2], 50)
_scatter_by_label(fig1.add_subplot(121, projection='3d'), true_rows, true_labels,
                  [0, 1, 2], ['setosa', 'versicolor', 'virginica'], 'Iris flower class')

# Right: k-means clusters; column 6 of "kmeanTable" is the predicted label.
km_rows = sqlContext.table("kmeanTable").collect()
km_labels = np.asarray([r[6] for r in km_rows])
# The label order only decides which colour/marker each cluster gets, so the two
# panels are easier to compare.
_scatter_by_label(fig1.add_subplot(122, projection='3d'), km_rows, km_labels,
                  [0, 1, 2], ['cluster_0', 'cluster_1', 'cluster_2'], 'Kmean-cluster')


%py
# Render the side-by-side figure (true species vs. k-means clusters) in the notebook.
display(fig1)

Section 2. High Order Spectral Clustering


import tensorSVD.{TensorSVD, RddMatrix, RddTensor}
import highOrderSpectralClustering.graph.ConstructGraph
import highOrderSpectralClustering.clustering.TensorSpectralClustering


/**
  * Clusters the selected columns with the project's TensorSpectralClustering and joins
  * the resulting labels back onto the input rows.
  *
  * Side effects: registers the temp tables "clusterLabel" (label/id pairs) and
  * "fisheriris" (bound to `feadf`), and prints the row count of every
  * (specie, label) combination.
  *
  * @param feadf      input rows; the join below reads the columns sepalLength,
  *                   sepalWidth, petalLength, petalWidth, id and specie
  * @param feaNames   columns passed to the clustering algorithm; must be non-empty
  * @param idcolName  not used by this function; kept so the signature mirrors getKMeanCluster
  * @param nc         number of clusters
  * @param K          passed unchanged to TensorSpectralClustering
  *                   (NOTE(review): its meaning is defined by that library -- confirm there)
  * @param sc         Spark context used by the algorithm and to build the label RDD
  * @param sqlContext SQL context used for the temp tables and the join
  * @return the joined rows with an extra `label` column, ordered by id
  */
def getSpectralCluster(feadf:sql.DataFrame,feaNames:Array[String], idcolName: String,
                                 nc:Int,K:Array[Int],sc:SparkContext,sqlContext: SQLContext ):sql.DataFrame={
  require(feaNames.nonEmpty, "feaNames must contain at least one column name")

  val featureSelected: sql.DataFrame = feadf.select(feaNames.head, feaNames.tail: _*).cache()
  val mycluster = TensorSpectralClustering(featureSelected, nc, K, sc)
  // false means use the distributed tensor method instead, which is suggested for larger samples.
  val isLocalTensor = true
  mycluster.run(isLocalTensor, sc)

  import sqlContext.implicits._
  // NOTE(review): mycluster.labels is presumably a collection of (label, id) pairs -- confirm.
  val labeldf = sc.makeRDD(mycluster.labels).toDF("label", "id").cache()

  labeldf.registerTempTable("clusterLabel")
  feadf.registerTempTable("fisheriris")

  val fisheririsCluster = sqlContext.sql(
    "SELECT fisheriris.sepalLength, fisheriris.sepalWidth, " +
      "fisheriris.petalLength, fisheriris.petalWidth, " +
      "fisheriris.id, fisheriris.specie, clusterLabel.label FROM fisheriris, clusterLabel " +
      "WHERE fisheriris.id=clusterLabel.id"
  ).orderBy("id")

  // Print how many rows of each species ended up in each cluster.
  fisheririsCluster.select("specie", "label").groupBy("specie", "label").count().show()

  fisheririsCluster
}


    // Parameter array handed unchanged to the spectral clustering helper.
    val K: Array[Int] = Array(5, 5, 4)
    // Number of clusters to fit.
    val nc: Int = 3
    val HOSpectralCluster: sql.DataFrame =
      getSpectralCluster(fisherDFwithIndex, feasName, "id", nc, K, sc, sqlContext)

High order spectral clustering error ratio: 4%. This clearly shows that high order spectral clustering does a better job of separating "versicolor" and "virginica", which are hard for k-means to distinguish in the original feature space.


// Expose the spectral clustering result as temp table "HOSpectralTable" so the %py plotting cell can read it.
HOSpectralCluster.registerTempTable("HOSpectralTable")
// Render the labelled rows as a notebook table.
display(HOSpectralCluster)


%py
# Draw two side-by-side 3-D scatter plots of the iris samples (first three table columns):
# left = coloured by true species, right = coloured by the high order spectral cluster label.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D  # imported so that projection='3d' is available
import matplotlib.pyplot as plt

figpars = [('r', 'o'), ('m', 'x'), ('b', '^')]  # (colour, marker) per plotted group


def _scatter_by_label(ax, rows, labels, label_order, legend, title):
    """Scatter columns 0-2 of `rows` on `ax`, one colour/marker per label.

    ax          -- 3-D axes to draw on
    rows        -- collected table rows; columns 0, 1, 2 become x, y, z
    labels      -- numpy array with one integer label per row
    label_order -- label values in plotting order; the i-th one uses figpars[i]
    legend      -- legend entries, in the same order as label_order
    title       -- axes title
    """
    # List comprehensions (not map) so the arrays are built correctly on Python 2 and 3.
    fea1 = np.asarray([r[0] for r in rows])
    fea2 = np.asarray([r[1] for r in rows])
    fea3 = np.asarray([r[2] for r in rows])
    for (color, marker), lab in zip(figpars, label_order):
        mask = labels == lab
        ax.scatter(fea1[mask], fea2[mask], fea3[mask], c=color, marker=marker)
    ax.legend(legend)  # built once, after every group has been drawn
    ax.view_init(elev=10., azim=150)  # the final view the original azimuth loop ended on
    ax.axis('tight')
    ax.set_title(title)
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')


fig2 = plt.figure(figsize=(20, 10))

# Left: true species.
# NOTE(review): assumes the table holds 150 rows ordered as 50 setosa, 50 versicolor,
# 50 virginica -- confirm against the table contents.
true_rows = sqlContext.table("fisheriris").collect()
true_labels = np.repeat([0, 1, 2], 50)
_scatter_by_label(fig2.add_subplot(121, projection='3d'), true_rows, true_labels,
                  [0, 1, 2], ['setosa', 'versicolor', 'virginica'], 'Iris flower class')

# Right: high order spectral clusters; column 6 of "HOSpectralTable" is the predicted label.
hos_rows = sqlContext.table("HOSpectralTable").collect()
hos_labels = np.asarray([r[6] for r in hos_rows])
# Labels are drawn in the order 2, 1, 0 so that each cluster gets the colour/marker
# that makes the two panels easier to compare.
_scatter_by_label(fig2.add_subplot(122, projection='3d'), hos_rows, hos_labels,
                  [2, 1, 0], ['cluster_0', 'cluster_1', 'cluster_2'], 'HOSpectral-cluster')


%py
# Render the side-by-side figure (true species vs. high order spectral clusters) in the notebook.
display(fig2)

The figures shown above demonstrate the good performance of the high order spectral clustering algorithm.

Scalable Data Science

Course Project - High Order Spectral Clustering

by Xin Zhao

supported by and

results matching ""

No results matching ""