Miscellaneous code

changeset 139:fac8bb755e41

Fixed a bunch of bugs. Discrete distributions work. KMeans no longer falls into infinite loops (I hope).
author david@david-desktop
date Mon Sep 08 16:10:21 2008 +0100 (2008-09-08)
parents 86768d2ae27a
children dfef34d732fc
files scala/vectors/base/src/main/scala/matrices.scala scala/vectors/base/src/main/scala/vectors.scala scala/vectors/buildfile scala/vectors/clustering/src/main/scala/kmeans.scala scala/vectors/clustering/src/main/scala/spectral.scala scala/vectors/probability/src/main/scala/distribution.scala
line diff
     1.1 --- a/scala/vectors/base/src/main/scala/matrices.scala	Mon Sep 08 11:14:57 2008 +0100
     1.2 +++ b/scala/vectors/base/src/main/scala/matrices.scala	Mon Sep 08 16:10:21 2008 +0100
     1.3 @@ -23,6 +23,18 @@
     1.4      forIn(0, n, (i : Int) =>  result.underlying.setQuick(i, i, 1));
     1.5      result;  
     1.6    }
     1.7 +
     1.8 +  def random(rows : Int, columns : Int) = {
     1.9 +    val random = new scala.util.Random;
    1.10 +    val matrix = sparse(rows, columns);
    1.11 +
    1.12 +    val occupancy = random.nextInt(rows * columns);
    1.13 +
    1.14 +    for(i <- 0 until occupancy){
    1.15 +      matrix(random.nextInt(rows), random.nextInt(columns)) = random.nextDouble();
    1.16 +    }   
    1.17 +    matrix; 
    1.18 +  }
    1.19  }
    1.20  
    1.21  abstract class Matrix{
    1.22 @@ -54,6 +66,23 @@
    1.23    def multiplyTo(that : Vector, target : Vector) = 
    1.24      this.underlying.zMult(that.underlying, target.underlying);
    1.25  
    1.26 +  def dominantEigenvector : Vector = dominantEigenvector(0.01);
    1.27 +
    1.28 +  def dominantEigenvector(precision : Double) : Vector = {
    1.29 +    
    1.30 +    def powerMultiply(from : Vector, to : Vector) : Vector = {
    1.31 +      multiplyTo(from, to);      
    1.32 +      to.normalize;
    1.33 +      if (from.distanceTo(to) < precision) to;
    1.34 +      else powerMultiply(to, from);
    1.35 +    }
    1.36 +  
    1.37 +    val n = rows.length;
    1.38 +    powerMultiply(Vector.const(1.0 / n, n), Vector.zero(n))
    1.39 +  }
    1.40 +
    1.41 +
    1.42 +
    1.43    def *(that : Vector) : Vector = {
    1.44      val z = that.underlying.like;
    1.45      this.underlying.zMult(that.underlying, z);
     2.1 --- a/scala/vectors/base/src/main/scala/vectors.scala	Mon Sep 08 11:14:57 2008 +0100
     2.2 +++ b/scala/vectors/base/src/main/scala/vectors.scala	Mon Sep 08 16:10:21 2008 +0100
     2.3 @@ -28,6 +28,13 @@
     2.4      avg
     2.5    }
     2.6  
     2.7 +  def random(length : Int) = {
     2.8 +    val vect = zero(length);
     2.9 +    val rand = new scala.util.Random;
    2.10 +    vect.updateWith((x : Double) => rand.nextDouble)
    2.11 +    vect 
    2.12 +  }
    2.13 +
    2.14    implicit def wrapVector (x : DoubleMatrix1D) = Vector(x);
    2.15    implicit def wrapRange(range : Range) : Vector = {
    2.16      val x = new DenseDoubleMatrix1D(range.length);
     3.1 --- a/scala/vectors/buildfile	Mon Sep 08 11:14:57 2008 +0100
     3.2 +++ b/scala/vectors/buildfile	Mon Sep 08 16:10:21 2008 +0100
     3.3 @@ -2,8 +2,10 @@
     3.4  NEXT_VERSION = "0.1"
     3.5  GROUP = "vectors"
     3.6  COLT="colt:colt:jar:1.2.0"
     3.7 +SCALAX="org.scalaforge:scalax:jar:0.1"
     3.8  
     3.9  repositories.remote << 'http://www.ibiblio.org/maven2/'
    3.10 +repositories.remote << 'http://scala-tools.org/repo-releases/'
    3.11  
    3.12  VECTORS = FileList["vectors/src/main/scala/**/*.scala"]
    3.13  
    3.14 @@ -36,8 +38,26 @@
    3.15      compile.with COLT, projects("base", "clustering")
    3.16      package :jar
    3.17    end 
    3.18 +
    3.19 +  desc"The magic clusterer"
    3.20 +  define "magic" do
    3.21 +    compile.with projects("base", "clustering", "probability"), COLT, SCALAX
    3.22 +    package(:jar)
    3.23 +  end
    3.24  end
    3.25  
    3.26 +def scala(main)
    3.27 +  classpath = (artifacts(COLT, SCALAX) + projects.map(&:package)).join(File::PATH_SEPARATOR)
    3.28 +  sh "scala -cp #{classpath} #{main}" 
    3.29 +end
    3.30 +
    3.31 +task :run do
    3.32 +  scala("vectors.clustering.magic.Magic")
    3.33 +end
    3.34 +
    3.35 +task :shell do
    3.36 +  scala("")
    3.37 +end
    3.38  
    3.39  task :scaladoc do  
    3.40    mkdir "doc" unless File.exists? "doc"
     4.1 --- a/scala/vectors/clustering/src/main/scala/kmeans.scala	Mon Sep 08 11:14:57 2008 +0100
     4.2 +++ b/scala/vectors/clustering/src/main/scala/kmeans.scala	Mon Sep 08 16:10:21 2008 +0100
     4.3 @@ -14,50 +14,58 @@
     4.4  case class KMeans(count : Int) extends Clustering{
     4.5    def pickCenters(_vectors : Collection[Vector]) = {
     4.6      val vectors = _vectors.toArray;
     4.7 -    val indices = new scala.collection.mutable.HashSet[Int];
     4.8 +    val centers = new scala.collection.mutable.HashSet[Vector];
     4.9      val rnd = new Random;    
    4.10  
    4.11 -    while (indices.size < count){
    4.12 -      indices += rnd.nextInt(vectors.length);
    4.13 +    while (centers.size < count){
    4.14 +      centers += vectors(rnd.nextInt(vectors.length));
    4.15      }
    4.16  
    4.17 -    indices.toArray.map(vectors)
    4.18 +    centers.toArray
    4.19    }
    4.20  
    4.21 -  def apply[T](vectors : Collection[(T, Vector)]) = {  
    4.22 -    var centers = pickCenters(vectors.toArray.map((_ : (T, Vector))._2));
    4.23 +  def apply[T](_vectors : Collection[(T, Vector)]) : Array[Array[(T, Vector)]]= {  
    4.24 +    val vectorsWithKeys = _vectors.toArray;
    4.25 +    val vectors = vectorsWithKeys.map(_._2);
    4.26 +    var centers = pickCenters(vectors);
    4.27      val dimension = centers(0).length
    4.28 -    var clusters : Array[ArrayBuffer[(T, Vector)]] = centers.map(i => new ArrayBuffer[(T, Vector)]);    
    4.29 +    var clusters : Array[Int] = null;
    4.30  
    4.31 -    clusters(0) ++= vectors;
    4.32 +    def computeClusters = {
    4.33 +      val newclusters = new Array[Int](vectors.length);
    4.34 +      for (i <- 0 until vectors.length){
    4.35 +        val vector = vectors(i);
    4.36 +        var minDistance = Double.MaxValue
    4.37  
    4.38 -    var changed = true;
    4.39 -    while(changed){
    4.40 -      changed = false;
    4.41 -      val newclusters = centers.map(i => new ArrayBuffer[(T, Vector)]);
    4.42 -
    4.43 -      for (i <- 0 until clusters.size;
    4.44 -           it@(t, vector) <- clusters(i)){
    4.45 -        var currentCluster = i;
    4.46 -        var minDistance = vector.distanceTo(centers(i)); 
    4.47 -
    4.48 -        for (j <- 0 until centers.size){
    4.49 +        for (j <- 0 until centers.length){
    4.50            val distance = vector.distanceTo(centers(j));
    4.51            if (distance < minDistance){
    4.52              minDistance = distance;
    4.53 -            currentCluster = j; 
    4.54 -          } 
    4.55 +            newclusters(i) = j;
    4.56 +          }
    4.57          }
    4.58 -        
    4.59 -        newclusters(currentCluster) += it;
    4.60 +      }
    4.61 +      newclusters;
    4.62 +    } 
    4.63  
    4.64 -        if (i != currentCluster) changed = true;
    4.65 -      } 
    4.66 +    clusters = computeClusters;
    4.67  
    4.68 -      centers = newclusters.map(x => Vector.average(dimension, x.map(_._2))); 
    4.69 +    def computeCenters = {
    4.70 +      def center(i : Int) = Vector.average(dimension, (0 until vectors.size).filter(j => clusters(j) == i).map(vectors))
    4.71 +      (0 until centers.length).map(center).toArray
    4.72      }
    4.73  
    4.74 -    clusters
    4.75 +    while(true){
    4.76 +      val newClusters = computeClusters;
    4.77 +      if (newClusters.sameElements(clusters)) return centers.indices.map(i => 
    4.78 +          clusters.indices.filter(j => clusters(j) == i).map(vectorsWithKeys)
    4.79 +        ) 
    4.80 +      else {
    4.81 +        clusters = newClusters;
    4.82 +        centers = computeCenters;
    4.83 +      }
    4.84 +    }
    4.85 +    error("fallthrough")
    4.86    }
    4.87  }
    4.88  
     5.1 --- a/scala/vectors/clustering/src/main/scala/spectral.scala	Mon Sep 08 11:14:57 2008 +0100
     5.2 +++ b/scala/vectors/clustering/src/main/scala/spectral.scala	Mon Sep 08 16:10:21 2008 +0100
     5.3 @@ -5,21 +5,6 @@
     5.4  import probability._;
     5.5  
     5.6  object SpectralClustering{
     5.7 -  def dominantEigenvector(matrix : Matrix) : Vector = dominantEigenvector(matrix, 0.01);
     5.8 -
     5.9 -  def dominantEigenvector(matrix : Matrix, precision : Double) : Vector = {
    5.10 -    
    5.11 -    def powerMultiply(from : Vector, to : Vector) : Vector = {
    5.12 -      matrix.multiplyTo(from, to);      
    5.13 -      to.normalize;
    5.14 -      if (from.distanceTo(to) < precision) to;
    5.15 -      else powerMultiply(to, from);
    5.16 -    }
    5.17 -  
    5.18 -    val n = matrix.rows.length;
    5.19 -    powerMultiply(Vector.const(1.0 / n, n), Vector.zero(n))
    5.20 -  }
    5.21 -
    5.22  
    5.23    def split(vectors : Array[Double], indices : Array[Int]) : Array[Array[Int]] = {
    5.24      val gen : Gen = new scala.util.Random;
    5.25 @@ -80,7 +65,7 @@
    5.26      val elements = new HashSet[T];
    5.27      elements ++= tuples.map(_._1);
    5.28      elements ++= tuples.map(_._2);
    5.29 -    fromTuples(tuples).partition(elements);
    5.30 +    fromTuples(tuples)(elements);
    5.31    }
    5.32  }
    5.33  
    5.34 @@ -105,7 +90,7 @@
    5.35  
    5.36    def apply(_elements : Set[T]) : Collection[Collection[T]] = {
    5.37      val (elements, matrix) = partition(_elements);
    5.38 -    val ev = dominantEigenvector(matrix);
    5.39 +    val ev = matrix.dominantEigenvector;
    5.40  
    5.41      val clusterIndices : Array[Array[Int]]= oneDGMeans(matrix.columns.map(ev dot _).toArray); 
    5.42      
     6.1 --- a/scala/vectors/probability/src/main/scala/distribution.scala	Mon Sep 08 11:14:57 2008 +0100
     6.2 +++ b/scala/vectors/probability/src/main/scala/distribution.scala	Mon Sep 08 16:10:21 2008 +0100
     6.3 @@ -84,13 +84,16 @@
     6.4  
     6.5    def apply(x : Gen) = {
     6.6      val p = x.nextDouble;
     6.7 -    ints(java.util.Arrays.binarySearch(cumulativeProbabilities, p));
     6.8 +    val search = java.util.Arrays.binarySearch(cumulativeProbabilities, p);
     6.9 +    if (search >= 0) ints(search); 
    6.10 +    else ints(-search + 1) 
    6.11 +
    6.12    }
    6.13  }
    6.14  
    6.15  object Discrete{
    6.16 -  def apply(ints : Collection[Int], distribution : Int => Double) = new Discrete{
    6.17 -    def range = ints;
    6.18 +  def apply(collection: Collection[Int], distribution : Int => Double) = new Discrete{
    6.19 +    def range = collection;
    6.20      def probability(i : Int) = distribution(i);
    6.21    } 
    6.22  }