Mahout Canopy源码分析_mahout canopy源码-CSDN博客

本文深入探讨了Canopy聚类算法的工作原理及其在Apache Mahout中的实现细节。介绍了Canopy算法作为预处理步骤如何快速筛选数据，为后续更复杂的聚类算法提供高质量的初始中心点。

Canopy Clustering

Canopy算法介绍：

1.选择简单、计算代价低的方法计算对象相似性，将相似的对象放在同一个Canopy子集下，通过计算得到若干Canopy。Canopy之间可以重叠，但不存在某个对象不属于任何Canopy的情况。
2.Canopy算法一般作为数据预处理。通过计算得到若干个簇，从每个簇中选取一个离中心点最近的点(中心点)作为其他聚类算法(如k-means)的初始中心。

Canopy 聚类算法思想：

1.包括一个原始数据集list和一个空的canopy列表，以及2个阈值T1、T2，其中T2<T1。
2.从list中取一点P,将P从list中删除并在canopy列表中新建一个以P为中心的canopy。
3.用低成本的计算方式计算list中剩余的点与该canopy的距离：如果距离小于T1，将点加入到该canopy中；如果距离还小于T2，则同时将点从list中删除。
4.重复2-3步骤，直到list为空。

mahout Canopy源码解析：

CanopyConfigKeys类：

<pre name="code" class="java"><span style="white-space:pre">		</span>String T1_KEY = "org.apache.mahout.clustering.canopy.t1";  // configuration key for the T1 threshold
<span style="white-space:pre">		</span>String T2_KEY = "org.apache.mahout.clustering.canopy.t2";  // configuration key for the T2 threshold
<span style="white-space:pre">		</span>String T3_KEY = "org.apache.mahout.clustering.canopy.t3";  // configuration key for T3: the value used as T1 in the reduce phase; equals T1 when not set
<span style="white-space:pre">		</span>String T4_KEY = "org.apache.mahout.clustering.canopy.t4";  // configuration key for T4: the value used as T2 in the reduce phase; equals T2 when not set
<span style="white-space:pre">		</span>public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";// configuration key for the distance measure class name
<span style="white-space:pre">		</span>public static final String CF_KEY = "org.apache.mahout.clustering.canopy.canopyFilter";// configuration key for the canopy filter: a canopy only counts if it holds more vectors than this threshold

Canopy类：

<span style="white-space:pre">		</span>public Canopy(Vector center, int canopyId, DistanceMeasure measure);// creates a canopy centered at `center`, with id `canopyId` and distance measure `measure`, that initially contains only the given point (the center)

CanopyClusterer类：//canopy的实现类

<span style="white-space:pre">		</span>CanopyClusterer(DistanceMeasure measure, double t1, double t2) ;// takes the distance measure and the two thresholds t1 and t2
	<span style="white-space:pre">	</span>public CanopyClusterer(Configuration config);// takes a Configuration; the distance measure class and the T1, T2, T3 and T4 parameters can be supplied through the Configuration object
			{
			 // All initialization is delegated to configure(Configuration).
			 this.configure(config);  
			}
		/**
		 * Initializes this clusterer from a Configuration: instantiates and configures the distance
		 * measure named under {@code CanopyConfigKeys.DISTANCE_MEASURE_KEY}, then parses the T1 and T2
		 * thresholds from their configuration entries.
		 *
		 * @param configuration source of the distance measure class name and the threshold values
		 */
		public void configure(Configuration configuration){
			measure = ClassUtils.instantiateAs(  // instantiate the class named under DISTANCE_MEASURE_KEY as a DistanceMeasure (the named class must be a DistanceMeasure implementation)
				configuration.get(CanopyConfigKeys.DISTANCE_MEASURE_KEY),  DistanceMeasure.class);  
			// Let the freshly created measure read its own settings from the same configuration.
			measure.configure(configuration);  
			// Thresholds are stored as strings and parsed as doubles.
			t1 = Double.parseDouble(configuration.get(CanopyConfigKeys.T1_KEY));  
			t2 = Double.parseDouble(configuration.get(CanopyConfigKeys.T2_KEY));  
			// NOTE(review): the rest of the method is elided in this excerpt (presumably T3/T4 handling) -- confirm against the full source.
			....
			}
			//根据点创建canopy集合
		/**
		 * Reference (in-memory) canopy construction over a list of points.
		 *
		 * <p>While {@code points} is non-empty, its first element is removed and becomes the seed of a
		 * new canopy. Every remaining point is then measured against that seed: a point closer than
		 * {@code t1} is observed by the canopy, and a point closer than {@code t2} is removed from
		 * {@code points} so that it cannot seed a canopy of its own.
		 *
		 * <p>{@code points} is consumed: it is empty when this method returns, and its iterator must
		 * support {@code remove()}.
		 *
		 * @param points  the points to cluster; modified in place
		 * @param measure distance measure used to compare each point with the canopy's seed point
		 * @param t1      loose threshold; points strictly closer than this are added to the canopy
		 * @param t2      tight threshold; points strictly closer than this are removed from the list
		 * @return the canopies in creation order, with ids assigned from 0 upwards
		 */
		public static List<Canopy> createCanopies(List<Vector> points, DistanceMeasure measure, double t1, double t2){
			List<Canopy> built = Lists.newArrayList();
			for (int canopyId = 0; !points.isEmpty(); canopyId++) {
				// Take the first remaining point as the seed and drop it from the list.
				Iterator<Vector> remaining = points.iterator();
				Vector seed = remaining.next();
				remaining.remove();
				Canopy current = new Canopy(seed, canopyId, measure);
				built.add(current);
				// Compare every other remaining point with the seed.
				while (remaining.hasNext()) {
					Vector candidate = remaining.next();
					double distance = measure.distance(seed, candidate);
					if (distance < t1) {
						// Within the loose threshold: the point contributes to this canopy.
						current.observe(candidate);
					}
					if (distance < t2) {
						// Within the tight threshold: the point may not seed another canopy.
						remaining.remove();
					}
				}
				// Refresh the derived parameters of all canopies built so far after each pass.
				for (Canopy each : built) {
					each.computeParameters();
				}
			}
			return built;
		}
		//将一个点加入到canopy中
		/**
		 * Adds a single point to the given canopies, creating a new canopy when needed.
		 *
		 * <p>The point is observed by every existing canopy whose center is closer than {@code t1}.
		 * If no existing canopy center is closer than {@code t2}, a new canopy centered on the point
		 * is appended to {@code canopies} using the next canopy id.
		 *
		 * @param point    the point to assign
		 * @param canopies the canopies built so far; may receive one new canopy
		 */
		public void addPointToCanopies(Vector point, Collection<Canopy> canopies){
			boolean stronglyBound = false;
			for (Canopy existing : canopies) {
				// Distance from the canopy center to the point, passing the center's squared length along.
				double centerLengthSquared = existing.getCenter().getLengthSquared();
				double distance = measure.distance(centerLengthSquared, existing.getCenter(), point);
				if (distance < t1) {
					if (log.isDebugEnabled()) {
						log.debug("Added point: {} to canopy: {}",  AbstractCluster.formatVector(point, null), existing.getIdentifier());
					}
					existing.observe(point);
				}
				if (distance < t2) {
					stronglyBound = true;
				}
			}
			if (stronglyBound) {
				// Some canopy already holds the point within t2; no new canopy is needed.
				return;
			}
			if (log.isDebugEnabled()) {
				log.debug("Created new Canopy:{} at center:{}", nextCanopyId,
					AbstractCluster.formatVector(point, null));
			}
			// The point is not within t2 of any canopy, so it becomes the center of a new one.
			canopies.add(new Canopy(point, nextCanopyId++, measure));
		}

CanopyDriver类

<span style="white-space:pre">		</span>public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double t3,  
				double t4, int clusterFilter, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)  
				throws IOException, InterruptedException, ClassNotFoundException {  
			// Driver entry point: always build the canopies first; the returned path is where they were written.
			Path canopyDir = buildClusters(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runSequential);
			if (!runClustering) {
				// Caller only asked for the canopies themselves.
				return;
			}
			// Optionally classify the input points against the canopies just built.
			clusterData(conf, input, canopyDir, output, clusterClassificationThreshold, runSequential);
		}
			
		/**
		 * Builds the canopies, either in-process or as a MapReduce job.
		 *
		 * @param conf          configuration handed to the MapReduce variant
		 * @param input         path of the input vectors
		 * @param output        base output path
		 * @param measure       distance measure to use
		 * @param t1            T1 threshold
		 * @param t2            T2 threshold
		 * @param t3            T3 threshold; only forwarded to the MapReduce variant
		 * @param t4            T4 threshold; only forwarded to the MapReduce variant
		 * @param clusterFilter canopy size filter threshold
		 * @param runSequential {@code true} to build in a single process, {@code false} to run MapReduce
		 * @return the directory the canopies were written to
		 */
		public static Path buildClusters(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
				double t3, double t4, int clusterFilter, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
			log.info("Build Clusters Input: {} Out: {} Measure: {} t1: {} t2: {}", new Object[] { input, output, measure, t1, t2 });
			if (!runSequential) {
				// Distributed execution.
				return buildClustersMR(conf, input, output, measure, t1, t2, t3, t4, clusterFilter);
			}
			// Single-process execution.
			return buildClustersSeq(input, output, measure, t1, t2, clusterFilter);
		}
			//单机聚类
		/**
		 * Builds canopies in a single process and writes them to a sequence file.
		 *
		 * <p>Every vector read from {@code input} is passed to
		 * {@code CanopyClusterer.addPointToCanopies}. Afterwards each canopy has its parameters
		 * computed and, if it holds more than {@code clusterFilter} observations, is appended to the
		 * file {@code part-r-00000} inside the returned directory.
		 *
		 * <p>The writer is closed with {@code Closeables.close(writer, threw)}: an
		 * {@code IOException} raised while closing is propagated when the write loop succeeded (so a
		 * failed final write is not silently lost) and is suppressed only when the loop itself already
		 * failed, so the original exception is the one the caller sees.
		 *
		 * @param input         path of the input vectors
		 * @param output        base output path
		 * @param measure       distance measure to use
		 * @param t1            T1 threshold
		 * @param t2            T2 threshold
		 * @param clusterFilter canopies must have more observations than this to be written
		 * @return the directory the canopies were written to
		 * @throws IOException if reading the input, writing a canopy, or closing the writer fails
		 */
		private static Path buildClustersSeq(Path input, Path output, DistanceMeasure measure, double t1, double t2, int clusterFilter)
				throws IOException {
			CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
			Collection<Canopy> canopies = Lists.newArrayList();  // starts out empty
			Configuration conf = new Configuration();
			FileSystem fs = FileSystem.get(input.toUri(), conf);
			// Add every input vector to the canopies.
			// Per the original author's note: PathType.LIST treats the path as a directory, and
			// PathFilters.logsCRCFilter skips files starting with "_" or "." and files ending in ".CRC".
			for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
				clusterer.addPointToCanopies(vw.get(), canopies);
			}
			Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
			Path path = new Path(canopyOutputDir, "part-r-00000");
			SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, ClusterWritable.class);
			ClusterWritable clusterWritable = new ClusterWritable();
			// Tracks whether the write loop failed, so close() only swallows its own
			// IOException when there is already a primary exception in flight.
			boolean threw = true;
			try {
				for (Canopy canopy : canopies) {
					canopy.computeParameters();
					if (log.isDebugEnabled()) {
						log.debug( "Writing Canopy:{} center:{} numPoints:{} radius:{}", new Object[] {
                               canopy.getIdentifier(), AbstractCluster.formatVector(canopy.getCenter(), null),
                               canopy.getNumObservations(),AbstractCluster.formatVector(canopy.getRadius(), null) });
					}
					if (canopy.getNumObservations() > clusterFilter) {  // only canopies above the filter threshold are written
						clusterWritable.setValue(canopy);
						writer.append(new Text(canopy.getIdentifier()), clusterWritable);
					}
				}
				threw = false;
			} finally {
				Closeables.close(writer, threw);
			}
			return canopyOutputDir;
		}
	//分布式聚类
		/**
		 * Builds canopies with a MapReduce job.
		 *
		 * <p>The distance measure class name, the four thresholds and the cluster filter are written
		 * into {@code conf}, then a job using {@code CanopyMapper} and {@code CanopyReducer} with one
		 * reduce task is created from it and run to completion.
		 *
		 * @param conf          configuration that receives the canopy settings and backs the job
		 * @param input         path of the input vectors
		 * @param output        base output path
		 * @param measure       distance measure; only its class name is passed to the job
		 * @param t1            T1 threshold
		 * @param t2            T2 threshold
		 * @param t3            T3 threshold
		 * @param t4            T4 threshold
		 * @param clusterFilter canopy size filter threshold
		 * @return the directory the job writes the canopies to
		 * @throws InterruptedException also thrown when the job reports failure
		 */
		private static Path buildClustersMR(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
				double t3, double t4, int clusterFilter) throws IOException, InterruptedException, ClassNotFoundException {
			// Store the canopy settings in the configuration before the job is created from it.
			conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
			conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
			conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
			conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(t3));
			conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(t4));
			conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter));

			Job canopyJob = new Job(conf, "Canopy Driver running buildClusters over input: " + input);
			canopyJob.setJarByClass(CanopyDriver.class);

			// Input side.
			canopyJob.setInputFormatClass(SequenceFileInputFormat.class);
			FileInputFormat.addInputPath(canopyJob, input);

			// Map phase: emits Text keys with VectorWritable values.
			canopyJob.setMapperClass(CanopyMapper.class);
			canopyJob.setMapOutputKeyClass(Text.class);
			canopyJob.setMapOutputValueClass(VectorWritable.class);

			// Reduce phase: one reduce task, emitting Text keys with ClusterWritable values.
			canopyJob.setReducerClass(CanopyReducer.class);
			canopyJob.setNumReduceTasks(1);
			canopyJob.setOutputKeyClass(Text.class);
			canopyJob.setOutputValueClass(ClusterWritable.class);

			// Output side.
			Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
			canopyJob.setOutputFormatClass(SequenceFileOutputFormat.class);
			FileOutputFormat.setOutputPath(canopyJob, canopyOutputDir);

			boolean succeeded = canopyJob.waitForCompletion(true);
			if (!succeeded) {
				throw new InterruptedException("Canopy Job failed processing " + input);
			}
			return canopyOutputDir;
		}

CanopyMapper：

<span style="white-space:pre">			</span>//通过context对象构建一个canopyCluster对象，并设定canopy阀值
			/**
			 * Prepares the mapper: builds the CanopyClusterer from the job configuration and reads the
			 * canopy filter threshold stored under {@code CanopyConfigKeys.CF_KEY}.
			 */
			protected void setup(Context context) throws IOException,InterruptedException {
				super.setup(context);
				canopyClusterer = new CanopyClusterer(context.getConfiguration());
				// The filter threshold is stored as a string in the configuration.
				String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
				clusterFilter = Integer.parseInt(filterSetting);
			}
			//将点加入到canopy中
			/**
			 * Feeds one input vector into this mapper's canopies.
			 *
			 * @param key     input key; not used
			 * @param point   the vector to add
			 * @param context mapper context; nothing is written to it in this method
			 */
			protected void map(WritableComparable<?> key, VectorWritable point,Context context) throws IOException, InterruptedException {
				// Only accumulate here; the canopy centers are emitted later, in cleanup().
				canopyClusterer.addPointToCanopies(point.get(), canopies);
				}
			//更新canopy参数，将大于阀值的canopy 输出到reduce   map输出(centroid，每个canopy的中心)
			/**
			 * Emits the centers of the canopies built by this mapper.
			 *
			 * <p>Each canopy has its parameters computed; those with more observations than the filter
			 * threshold are written under the single key {@code "centroid"} with their center as value.
			 */
			protected void cleanup(Context context) throws IOException,InterruptedException {
				for (Canopy canopy : canopies) {
					canopy.computeParameters();
					boolean passesFilter = canopy.getNumObservations() > clusterFilter;
					if (passesFilter) {
						// Every center is written under the same key.
						Text outputKey = new Text("centroid");
						VectorWritable center = new VectorWritable(canopy.getCenter());
						context.write(outputKey, center);
					}
				}
				super.cleanup(context);
			}

canopyreducer：

<pre name="code" class="java"><span style="white-space:pre">			</span>//更新t1,t2 
			/**
			 * Prepares the reducer: builds the CanopyClusterer from the job configuration, switches it to
			 * the T3/T4 thresholds, and reads the canopy filter threshold stored under
			 * {@code CanopyConfigKeys.CF_KEY}.
			 */
			protected void setup(Context context) throws IOException,InterruptedException {
				super.setup(context);
				canopyClusterer = new CanopyClusterer(context.getConfiguration());
				// The reduce phase clusters with T3/T4 instead of T1/T2.
				canopyClusterer.useT3T4();
				String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
				clusterFilter = Integer.parseInt(filterSetting);
			}
			//聚类canopy中心，得到最终的canopies reduce输出(canopy id,一个canopy)
			/**
			 * Clusters the canopy centers received from the mappers and writes the final canopies.
			 *
			 * <p>Every incoming vector is added to this reducer's canopies. Each canopy then has its
			 * parameters computed, and those with more observations than the filter threshold are
			 * written as (canopy identifier, canopy).
			 *
			 * @param arg0    the input key; not used
			 * @param values  the vectors emitted by the mappers for this key
			 * @param context reducer context that receives the output
			 */
			protected void reduce(Text arg0, Iterable<VectorWritable> values,Context context) throws IOException, InterruptedException {
				// Phase 1: fold every incoming center into the canopies.
				for (VectorWritable incoming : values) {
					canopyClusterer.addPointToCanopies(incoming.get(), canopies);
				}
				// Phase 2: finalize each canopy and emit the ones that pass the filter.
				for (Canopy canopy : canopies) {
					canopy.computeParameters();
					boolean passesFilter = canopy.getNumObservations() > clusterFilter;
					if (passesFilter) {
						ClusterWritable wrapped = new ClusterWritable();
						wrapped.setValue(canopy);
						Text canopyKey = new Text(canopy.getIdentifier());
						context.write(canopyKey, wrapped);
					}
				}
			}