Miscellaneous code
changeset 139:fac8bb755e41
Fixed a bunch of bugs. Discrete distributions work. KMeans no longer falls into infinite loops (I hope).
| author | david@david-desktop |
|---|---|
| date | Mon Sep 08 16:10:21 2008 +0100 (2008-09-08) |
| parents | 86768d2ae27a |
| children | dfef34d732fc |
| files | scala/vectors/base/src/main/scala/matrices.scala scala/vectors/base/src/main/scala/vectors.scala scala/vectors/buildfile scala/vectors/clustering/src/main/scala/kmeans.scala scala/vectors/clustering/src/main/scala/spectral.scala scala/vectors/probability/src/main/scala/distribution.scala |
line diff
1.1 --- a/scala/vectors/base/src/main/scala/matrices.scala Mon Sep 08 11:14:57 2008 +0100 1.2 +++ b/scala/vectors/base/src/main/scala/matrices.scala Mon Sep 08 16:10:21 2008 +0100 1.3 @@ -23,6 +23,18 @@ 1.4 forIn(0, n, (i : Int) => result.underlying.setQuick(i, i, 1)); 1.5 result; 1.6 } 1.7 + 1.8 + def random(rows : Int, columns : Int) = { 1.9 + val random = new scala.util.Random; 1.10 + val matrix = sparse(rows, columns); 1.11 + 1.12 + val occupancy = random.nextInt(rows * columns); 1.13 + 1.14 + for(i <- 0 until occupancy){ 1.15 + matrix(random.nextInt(rows), random.nextInt(columns)) = random.nextDouble(); 1.16 + } 1.17 + matrix; 1.18 + } 1.19 } 1.20 1.21 abstract class Matrix{ 1.22 @@ -54,6 +66,23 @@ 1.23 def multiplyTo(that : Vector, target : Vector) = 1.24 this.underlying.zMult(that.underlying, target.underlying); 1.25 1.26 + def dominantEigenvector : Vector = dominantEigenvector(0.01); 1.27 + 1.28 + def dominantEigenvector(precision : Double) : Vector = { 1.29 + 1.30 + def powerMultiply(from : Vector, to : Vector) : Vector = { 1.31 + multiplyTo(from, to); 1.32 + to.normalize; 1.33 + if (from.distanceTo(to) < precision) to; 1.34 + else powerMultiply(to, from); 1.35 + } 1.36 + 1.37 + val n = rows.length; 1.38 + powerMultiply(Vector.const(1.0 / n, n), Vector.zero(n)) 1.39 + } 1.40 + 1.41 + 1.42 + 1.43 def *(that : Vector) : Vector = { 1.44 val z = that.underlying.like; 1.45 this.underlying.zMult(that.underlying, z);
2.1 --- a/scala/vectors/base/src/main/scala/vectors.scala Mon Sep 08 11:14:57 2008 +0100 2.2 +++ b/scala/vectors/base/src/main/scala/vectors.scala Mon Sep 08 16:10:21 2008 +0100 2.3 @@ -28,6 +28,13 @@ 2.4 avg 2.5 } 2.6 2.7 + def random(length : Int) = { 2.8 + val vect = zero(length); 2.9 + val rand = new scala.util.Random; 2.10 + vect.updateWith((x : Double) => rand.nextDouble) 2.11 + vect 2.12 + } 2.13 + 2.14 implicit def wrapVector (x : DoubleMatrix1D) = Vector(x); 2.15 implicit def wrapRange(range : Range) : Vector = { 2.16 val x = new DenseDoubleMatrix1D(range.length);
3.1 --- a/scala/vectors/buildfile Mon Sep 08 11:14:57 2008 +0100 3.2 +++ b/scala/vectors/buildfile Mon Sep 08 16:10:21 2008 +0100 3.3 @@ -2,8 +2,10 @@ 3.4 NEXT_VERSION = "0.1" 3.5 GROUP = "vectors" 3.6 COLT="colt:colt:jar:1.2.0" 3.7 +SCALAX="org.scalaforge:scalax:jar:0.1" 3.8 3.9 repositories.remote << 'http://www.ibiblio.org/maven2/' 3.10 +repositories.remote << 'http://scala-tools.org/repo-releases/' 3.11 3.12 VECTORS = FileList["vectors/src/main/scala/**/*.scala"] 3.13 3.14 @@ -36,8 +38,26 @@ 3.15 compile.with COLT, projects("base", "clustering") 3.16 package :jar 3.17 end 3.18 + 3.19 + desc"The magic clusterer" 3.20 + define "magic" do 3.21 + compile.with projects("base", "clustering", "probability"), COLT, SCALAX 3.22 + package(:jar) 3.23 + end 3.24 end 3.25 3.26 +def scala(main) 3.27 + classpath = (artifacts(COLT, SCALAX) + projects.map(&:package)).join(File::PATH_SEPARATOR) 3.28 + sh "scala -cp #{classpath} #{main}" 3.29 +end 3.30 + 3.31 +task :run do 3.32 + scala("vectors.clustering.magic.Magic") 3.33 +end 3.34 + 3.35 +task :shell do 3.36 + scala("") 3.37 +end 3.38 3.39 task :scaladoc do 3.40 mkdir "doc" unless File.exists? "doc"
4.1 --- a/scala/vectors/clustering/src/main/scala/kmeans.scala Mon Sep 08 11:14:57 2008 +0100 4.2 +++ b/scala/vectors/clustering/src/main/scala/kmeans.scala Mon Sep 08 16:10:21 2008 +0100 4.3 @@ -14,50 +14,58 @@ 4.4 case class KMeans(count : Int) extends Clustering{ 4.5 def pickCenters(_vectors : Collection[Vector]) = { 4.6 val vectors = _vectors.toArray; 4.7 - val indices = new scala.collection.mutable.HashSet[Int]; 4.8 + val centers = new scala.collection.mutable.HashSet[Vector]; 4.9 val rnd = new Random; 4.10 4.11 - while (indices.size < count){ 4.12 - indices += rnd.nextInt(vectors.length); 4.13 + while (centers.size < count){ 4.14 + centers += vectors(rnd.nextInt(vectors.length)); 4.15 } 4.16 4.17 - indices.toArray.map(vectors) 4.18 + centers.toArray 4.19 } 4.20 4.21 - def apply[T](vectors : Collection[(T, Vector)]) = { 4.22 - var centers = pickCenters(vectors.toArray.map((_ : (T, Vector))._2)); 4.23 + def apply[T](_vectors : Collection[(T, Vector)]) : Array[Array[(T, Vector)]]= { 4.24 + val vectorsWithKeys = _vectors.toArray; 4.25 + val vectors = vectorsWithKeys.map(_._2); 4.26 + var centers = pickCenters(vectors); 4.27 val dimension = centers(0).length 4.28 - var clusters : Array[ArrayBuffer[(T, Vector)]] = centers.map(i => new ArrayBuffer[(T, Vector)]); 4.29 + var clusters : Array[Int] = null; 4.30 4.31 - clusters(0) ++= vectors; 4.32 + def computeClusters = { 4.33 + val newclusters = new Array[Int](vectors.length); 4.34 + for (i <- 0 until vectors.length){ 4.35 + val vector = vectors(i); 4.36 + var minDistance = Double.MaxValue 4.37 4.38 - var changed = true; 4.39 - while(changed){ 4.40 - changed = false; 4.41 - val newclusters = centers.map(i => new ArrayBuffer[(T, Vector)]); 4.42 - 4.43 - for (i <- 0 until clusters.size; 4.44 - it@(t, vector) <- clusters(i)){ 4.45 - var currentCluster = i; 4.46 - var minDistance = vector.distanceTo(centers(i)); 4.47 - 4.48 - for (j <- 0 until centers.size){ 4.49 + for (j <- 0 until centers.length){ 4.50 val distance = vector.distanceTo(centers(j)); 4.51 if (distance < minDistance){ 4.52 minDistance = distance; 4.53 - currentCluster = j; 4.54 - } 4.55 + newclusters(i) = j; 4.56 + } 4.57 } 4.58 - 4.59 - newclusters(currentCluster) += it; 4.60 + } 4.61 + newclusters; 4.62 + } 4.63 4.64 - if (i != currentCluster) changed = true; 4.65 - } 4.66 + clusters = computeClusters; 4.67 4.68 - centers = newclusters.map(x => Vector.average(dimension, x.map(_._2))); 4.69 + def computeCenters = { 4.70 + def center(i : Int) = Vector.average(dimension, (0 until vectors.size).filter(j => clusters(j) == i).map(vectors)) 4.71 + (0 until centers.length).map(center).toArray 4.72 } 4.73 4.74 - clusters 4.75 + while(true){ 4.76 + val newClusters = computeClusters; 4.77 + if (newClusters.sameElements(clusters)) return centers.indices.map(i => 4.78 + clusters.indices.filter(j => clusters(j) == i).map(vectorsWithKeys) 4.79 + ) 4.80 + else { 4.81 + clusters = newClusters; 4.82 + centers = computeCenters; 4.83 + } 4.84 + } 4.85 + error("fallthrough") 4.86 } 4.87 } 4.88
5.1 --- a/scala/vectors/clustering/src/main/scala/spectral.scala Mon Sep 08 11:14:57 2008 +0100 5.2 +++ b/scala/vectors/clustering/src/main/scala/spectral.scala Mon Sep 08 16:10:21 2008 +0100 5.3 @@ -5,21 +5,6 @@ 5.4 import probability._; 5.5 5.6 object SpectralClustering{ 5.7 - def dominantEigenvector(matrix : Matrix) : Vector = dominantEigenvector(matrix, 0.01); 5.8 - 5.9 - def dominantEigenvector(matrix : Matrix, precision : Double) : Vector = { 5.10 - 5.11 - def powerMultiply(from : Vector, to : Vector) : Vector = { 5.12 - matrix.multiplyTo(from, to); 5.13 - to.normalize; 5.14 - if (from.distanceTo(to) < precision) to; 5.15 - else powerMultiply(to, from); 5.16 - } 5.17 - 5.18 - val n = matrix.rows.length; 5.19 - powerMultiply(Vector.const(1.0 / n, n), Vector.zero(n)) 5.20 - } 5.21 - 5.22 5.23 def split(vectors : Array[Double], indices : Array[Int]) : Array[Array[Int]] = { 5.24 val gen : Gen = new scala.util.Random; 5.25 @@ -80,7 +65,7 @@ 5.26 val elements = new HashSet[T]; 5.27 elements ++= tuples.map(_._1); 5.28 elements ++= tuples.map(_._2); 5.29 - fromTuples(tuples).partition(elements); 5.30 + fromTuples(tuples)(elements); 5.31 } 5.32 } 5.33 5.34 @@ -105,7 +90,7 @@ 5.35 5.36 def apply(_elements : Set[T]) : Collection[Collection[T]] = { 5.37 val (elements, matrix) = partition(_elements); 5.38 - val ev = dominantEigenvector(matrix); 5.39 + val ev = matrix.dominantEigenvector; 5.40 5.41 val clusterIndices : Array[Array[Int]]= oneDGMeans(matrix.columns.map(ev dot _).toArray); 5.42
6.1 --- a/scala/vectors/probability/src/main/scala/distribution.scala Mon Sep 08 11:14:57 2008 +0100 6.2 +++ b/scala/vectors/probability/src/main/scala/distribution.scala Mon Sep 08 16:10:21 2008 +0100 6.3 @@ -84,13 +84,16 @@ 6.4 6.5 def apply(x : Gen) = { 6.6 val p = x.nextDouble; 6.7 - ints(java.util.Arrays.binarySearch(cumulativeProbabilities, p)); 6.8 + val search = java.util.Arrays.binarySearch(cumulativeProbabilities, p); 6.9 + if (search >= 0) ints(search); 6.10 + else ints(-search + 1) 6.11 + 6.12 } 6.13 } 6.14 6.15 object Discrete{ 6.16 - def apply(ints : Collection[Int], distribution : Int => Double) = new Discrete{ 6.17 - def range = ints; 6.18 + def apply(collection: Collection[Int], distribution : Int => Double) = new Discrete{ 6.19 + def range = collection; 6.20 def probability(i : Int) = distribution(i); 6.21 } 6.22 }
