Produce a new version of kmeans called MRKmeansMultirun Kmea
Solution
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import smile.math.Math;
import smile.util.MulticoreExecutor;
public class KMeans extends PartitionClustering<double[]> {
private static final Logger logger = LoggerFactory.getLogger(KMeans.class);
/**
* The total distortion.
*/
double distortion;
/**
* The centroids of each cluster.
*/
double[][] centroids;
/**
* Constructor.
*/
KMeans() {
}
/**
* Returns the distortion.
*/
public double distortion() {
return distortion;
}
/**
* Returns the centroids.
*/
public double[][] centroids() {
return centroids;
}
/**
* Cluster a new instance.
* @param x a new instance.
* @return the cluster label, which is the index of nearest centroid.
*/
@Override
public int predict(double[] x) {
double minDist = Double.MAX_VALUE;
int bestCluster = 0;
for (int i = 0; i < k; i++) {
double dist = Math.squaredDistance(x, centroids[i]);
if (dist < minDist) {
minDist = dist;
bestCluster = i;
}
}
return bestCluster;
}
public KMeans(double[][] data, int k) {
this(data, k, 100);
}
public KMeans(double[][] data, int k, int maxIter) {
this(new BBDTree(data), data, k, maxIter);
}
/**
* Constructor. Clustering data into k clusters.
* @param bbd the BBD-tree of data for fast clustering.
* @param data the input data of which each row is a sample.
* @param k the number of clusters.
* @param maxIter the maximum number of iterations for each running.
*/
KMeans(BBDTree bbd, double[][] data, int k, int maxIter) {
if (k < 2) {
throw new IllegalArgumentException(\"Invalid number of clusters: \" + k);
}
if (maxIter <= 0) {
throw new IllegalArgumentException(\"Invalid maximum number of iterations: \" + maxIter);
}
int n = data.length;
int d = data[0].length;
this.k = k;
distortion = Double.MAX_VALUE;
y = seed(data, k, ClusteringDistance.EUCLIDEAN);
size = new int[k];
centroids = new double[k][d];
for (int i = 0; i < n; i++) {
size[y[i]]++;
}
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
centroids[y[i]][j] += data[i][j];
}
}
for (int i = 0; i < k; i++) {
for (int j = 0; j < d; j++) {
centroids[i][j] /= size[i];
}
}
double[][] sums = new double[k][d];
for (int iter = 1; iter <= maxIter; iter++) {
double dist = bbd.clustering(centroids, sums, size, y);
for (int i = 0; i < k; i++) {
if (size[i] > 0) {
for (int j = 0; j < d; j++) {
centroids[i][j] = sums[i][j] / size[i];
}
}
}
if (distortion <= dist) {
break;
} else {
distortion = dist;
}
}
}
public KMeans(double[][] data, int k, int maxIter, int runs) {
if (k < 2) {
throw new IllegalArgumentException(\"Invalid number of clusters: \" + k);
}
if (maxIter <= 0) {
throw new IllegalArgumentException(\"Invalid maximum number of iterations: \" + maxIter);
}
if (runs <= 0) {
throw new IllegalArgumentException(\"Invalid number of runs: \" + runs);
}
BBDTree bbd = new BBDTree(data);
List<KMeansThread> tasks = new ArrayList<>();
for (int i = 0; i < runs; i++) {
tasks.add(new KMeansThread(bbd, data, k, maxIter));
}
KMeans best = new KMeans();
best.distortion = Double.MAX_VALUE;
try {
List<KMeans> clusters = MulticoreExecutor.run(tasks);
for (KMeans kmeans : clusters) {
if (kmeans.distortion < best.distortion) {
best = kmeans;
}
}
} catch (Exception ex) {
logger.error(\"Failed to run K-Means on multi-core\", ex);
for (int i = 0; i < runs; i++) {
KMeans kmeans = lloyd(data, k, maxIter);
if (kmeans.distortion < best.distortion) {
best = kmeans;
}
}
}
this.k = best.k;
this.distortion = best.distortion;
this.centroids = best.centroids;
this.y = best.y;
this.size = best.size;
}
/**
* Adapter for running BBD-Tree based K-Means algorithm in thread pool.
*/
static class KMeansThread implements Callable<KMeans> {
final BBDTree bbd;
final double[][] data;
final int k;
final int maxIter;
KMeansThread(BBDTree bbd, double[][] data, int k, int maxIter) {
this.bbd = bbd;
this.data = data;
this.k = k;
this.maxIter = maxIter;
}
@Override
public KMeans call() {
return new KMeans(bbd, data, k, maxIter);
}
}
public static KMeans lloyd(double[][] data, int k) {
return lloyd(data, k, 100);
}
public static KMeans lloyd(double[][] data, int k, int maxIter) {
if (k < 2) {
throw new IllegalArgumentException(\"Invalid number of clusters: \" + k);
}
if (maxIter <= 0) {
throw new IllegalArgumentException(\"Invalid maximum number of iterations: \" + maxIter);
}
int n = data.length;
int d = data[0].length;
int[][] nd = new int[k][d]; // The number of non-missing values per cluster per variable.
double distortion = Double.MAX_VALUE;
int[] size = new int[k];
double[][] centroids = new double[k][d];
int[] y = seed(data, k, ClusteringDistance.EUCLIDEAN_MISSING_VALUES);
int np = MulticoreExecutor.getThreadPoolSize();
List<LloydThread> tasks = null;
if (n >= 1000 && np >= 2) {
tasks = new ArrayList<>(np + 1);
int step = n / np;
if (step < 100) {
step = 100;
}
int start = 0;
int end = step;
for (int i = 0; i < np-1; i++) {
tasks.add(new LloydThread(data, centroids, y, start, end));
start += step;
end += step;
}
tasks.add(new LloydThread(data, centroids, y, start, n));
}
for (int iter = 0; iter < maxIter; iter++) {
Arrays.fill(size, 0);
for (int i = 0; i < k; i++) {
Arrays.fill(centroids[i], 0);
Arrays.fill(nd[i], 0);
}
for (int i = 0; i < n; i++) {
int m = y[i];
size[m]++;
for (int j = 0; j < d; j++) {
if (!Double.isNaN(data[i][j])) {
centroids[m][j] += data[i][j];
nd[m][j]++;
}
}
}
for (int i = 0; i < k; i++) {
for (int j = 0; j < d; j++) {
centroids[i][j] /= nd[i][j];
}
}
double wcss = Double.NaN;
if (tasks != null) {
try {
wcss = 0.0;
for (double ss : MulticoreExecutor.run(tasks)) {
wcss += ss;
}
} catch (Exception ex) {
logger.error(\"Failed to run K-Means on multi-core\", ex);
wcss = Double.NaN;
}
}
if (Double.isNaN(wcss)) {
wcss = 0.0;
for (int i = 0; i < n; i++) {
double nearest = Double.MAX_VALUE;
for (int j = 0; j < k; j++) {
double dist = squaredDistance(data[i], centroids[j]);
if (nearest > dist) {
y[i] = j;
nearest = dist;
}
}
wcss += nearest;
}
}
if (distortion <= wcss) {
break;
} else {
distortion = wcss;
}
}
// In case of early stop, we should recalculate centroids and clusterSize.
Arrays.fill(size, 0);
for (int i = 0; i < k; i++) {
Arrays.fill(centroids[i], 0);
Arrays.fill(nd[i], 0);
}
for (int i = 0; i < n; i++) {
int m = y[i];
size[m]++;
for (int j = 0; j < d; j++) {
if (!Double.isNaN(data[i][j])) {
centroids[m][j] += data[i][j];
nd[m][j]++;
}
}
}
for (int i = 0; i < k; i++) {
for (int j = 0; j < d; j++) {
centroids[i][j] /= nd[i][j];
}
}
KMeans kmeans = new KMeans();
kmeans.k = k;
kmeans.distortion = distortion;
kmeans.size = size;
kmeans.centroids = centroids;
kmeans.y = y;
return kmeans;
}
public static KMeans lloyd(double[][] data, int k, int maxIter, int runs) {
if (k < 2) {
throw new IllegalArgumentException(\"Invalid number of clusters: \" + k);
}
if (maxIter <= 0) {
throw new IllegalArgumentException(\"Invalid maximum number of iterations: \" + maxIter);
}
if (runs <= 0) {
throw new IllegalArgumentException(\"Invalid number of runs: \" + runs);
}
KMeans best = lloyd(data, k, maxIter);
for (int i = 1; i < runs; i++) {
KMeans kmeans = lloyd(data, k, maxIter);
if (kmeans.distortion < best.distortion) {
best = kmeans;
}
}
return best;
}
/**
* Adapter for running Lloyd algorithm in thread pool.
*/
static class LloydThread implements Callable<Double> {
/**
* The start index of data portion for this task.
*/
final int start;
/**
* The end index of data portion for this task.
*/
final int end;
final double[][] data;
final int k;
final double[][] centroids;
int[] y;
LloydThread(double[][] data, double[][] centroids, int[] y, int start, int end) {
this.data = data;
this.k = centroids.length;
this.y = y;
this.centroids = centroids;
this.start = start;
this.end = end;
}
@Override
public Double call() {
double wcss = 0.0;
for (int i = start; i < end; i++) {
double nearest = Double.MAX_VALUE;
for (int j = 0; j < k; j++) {
double dist = squaredDistance(data[i], centroids[j]);
if (nearest > dist) {
y[i] = j;
nearest = dist;
}
}
wcss += nearest;
}
return wcss;
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(String.format(\"K-Means distortion: %.5f%n\", distortion));
sb.append(String.format(\"Clusters of %d data points of dimension %d:%n\", y.length, centroids[0].length));
for (int i = 0; i < k; i++) {
int r = (int) Math.round(1000.0 * size[i] / y.length);
sb.append(String.format(\"%3d\\t%5d (%2d.%1d%%)%n\", i, size[i], r / 10, r % 10));
}
return sb.toString();
}
}






