Mit 6.830:SimpleDB Lab3

这篇博客介绍了数据库查询优化中的直方图统计方法,包括IntHistogram和StringHistogram。IntHistogram用于统计整数字段的选择率,通过计算每个值在直方图中的比例来估计查询选择性。StringHistogram类似,但处理字符串转换为数字的情况。TableStats类则负责记录表的字段直方图,用于计算表的统计信息。JoinOptimizer通过枚举不同的子集和计算子查询成本,找到最优的连接顺序。这些工具对于提高数据库查询效率至关重要。

Exercise 1

IntHistogram

这个主要是统计直方图对于某一个限制条件占总体的一个比例

在这里插入图片描述

也就是说,我们需要去计算一个选择率

步骤:
1、首先需要把值传进来放入 buckets
2、然后根据传进来的运算符,计算值所占的比例
3、计算的过程
	首先统计当前值 value 前的所有总数,也就是通过buckets[] 去统计所有的总数
	然后就去计算value所占的数量,其实就是计算面积,index 是 value 所在的索引柱
	buckets[index] / width 计算出当前柱的高度(每个柱子的宽度是相同的,整个柱子的面积除于宽度就是高度)
	value - index * width - min 计算当前柱所占的宽度(减去当前所有柱子前的宽度再减去最小值,也就获得了所占区域的宽度)
	然后对面积进行计算 (1.0 * buckets[index] / width) * (value - index * width - min)
	这里 1.0 的目的是 避免精度丢失

参数

  • private int[] buckets; // 直方图
  • private int min; // 边界最小值
  • private int max; // 边界最大值
  • private double width; // 长度
  • private int tuplesCount = 0; // 行数

方法

  • addValue(int v) :添加新的值

    private int getIndex(int v){
         
         
        return (int) ((v - min) / width);
    }
    public void addValue(int v) {
         
         
        // some code goes here
        if(v >= min && v <= max){
         
         
            buckets[getIndex(v)]++;
            tuplesCount++;
        }
    }
    
  • double estimateSelectivity(Predicate.Op op, int v) : 根据运算符和值 估计选择率

    public double estimateSelectivity(Predicate.Op op, int v) {
         
         
        // some code goes here
        switch (op){
         
         
            case LESS_THAN:
                if(v <= min){
         
         
                    return 0.0;
                }
                else if(v >= max){
         
         
                    return 1.0;
                }
                else{
         
         
                    int index = getIndex(v);
                    double tuples = 0;
                    for (int i = 0; i < index; i++) {
         
         
                        tuples += buckets[i];
                    }
                    // 索引所在柱的高度 * (当前值 - 该柱前的宽度)<这个也就是当前柱所占的宽度>
                    tuples += (1.0 * buckets[index] / width) * (v - index * width - min);
                    return tuples / tuplesCount;
                }
            case GREATER_THAN:
                return 1 - estimateSelectivity(Predicate.Op.LESS_THAN_OR_EQ, v);
            case EQUALS:
                return estimateSelectivity(Predicate.Op.LESS_THAN_OR_EQ, v) - estimateSelectivity(Predicate.Op.LESS_THAN, v);
            case NOT_EQUALS:
                return 1 - estimateSelectivity(Predicate.Op.EQUALS, v);
            case GREATER_THAN_OR_EQ:
                return estimateSelectivity(Predicate.Op.GREATER_THAN, v - 1);
            case LESS_THAN_OR_EQ:
                return estimateSelectivity(Predicate.Op.LESS_THAN, v + 1);
            default:
                throw new IllegalArgumentException("Operation is illegal");
        }
    }
    

全代码

package simpledb.optimizer;

import simpledb.execution.Predicate;

/** A class to represent a fixed-width histogram over a single integer-based field.
 */
public class IntHistogram {
   
   

    // 直方图
    private int[] buckets;
    // 边界最小值
    private int min;
    // 边界最大值
    private int max;
    // 长度
    private double width;
    // 行数
    private int tuplesCount = 0;

    /**
     * Create a new IntHistogram.
     * 
     * This IntHistogram should maintain a histogram of integer values that it receives.
     * It should split the histogram into "buckets" buckets.
     * 
     * The values that are being histogrammed will be provided one-at-a-time through the "addValue()" function.
     * 
     * Your implementation should use space and have execution time that are both
     * constant with respect to the number of values being histogrammed.  For example, you shouldn't 
     * simply store every value that you see in a sorted list.
     * 
     * @param buckets The number of buckets to split the input value into.
     * @param min The minimum integer value that will ever be passed to this class for histogramming
     * @param max The maximum integer value that will ever be passed to this class for histogramming
     */
    public IntHistogram(int buckets, int min, int max) {
   
   
    	// some code goes here
        this.buckets = new int[buckets];
        this.min = min;
        this.max = max;
        this.width = (max - min + 1.0) / buckets;
        this.tuplesCount = 0;
    }

    /**
     * Add a value to the set of values that you are keeping a histogram of.
     * @param v Value to add to the histogram
     */
    private int getIndex(int v){
   
   
        return (int) ((v - min) / width);
    }
    public void addValue(int v) {
   
   
    	// some code goes here
        if(v >= min && v <= max){
   
   
            buckets[getIndex(v)]++;
            tuplesCount++;
        }
    }

    /**
     * Estimate the selectivity of a particular predicate and operand on this table.
     *
     * For example, if "op" is "GREATER_THAN" and "v" is 5,
     * return your estimate of the fraction of elements that are greater than 5.
     * 
     * @param op Operator
     * @param v Value
     * @return Predicted selectivity of this particular operator and value
     */
    public double estimateSelectivity(Predicate.Op op, int v) {
   
   
    	// some code goes here
        switch (op){
   
   
            case LESS_THAN:
                if(v <= min){
   
   
                    return 0.0;
                }
                else if(v >= max){
   
   
                    return 1.0;
                }
                else{
   
   
                    int index = getIndex(v);
                    double tuples = 0;
                    for (int i = 0; i < index; i++) {
   
   
                        tuples += buckets[i];
                    }
                    // 索引所在柱的高度 * (当前值 - 该柱前的宽度)<这个也就是当前柱所占的宽度>
                    tuples += (1.0 * buckets[index] / width) * (v - index * width - min);
                    return tuples / tuplesCount;
                }
            case GREATER_THAN:
                return 1 - estimateSelectivity(Predicate.Op.LESS_THAN_OR_EQ, v);
            case EQUALS:
                return estimateSelectivity(Predicate.Op.LESS_THAN_OR_EQ, v) - estimateSelectivity(Predicate.Op.LESS_THAN, v);
            case NOT_EQUALS:
                return 1 - estimateSelectivity(Predicate.Op.EQUALS, v);
            case GREATER_THAN_OR_EQ:
                return estimateSelectivity(Predicate.Op.GREATER_THAN, v - 1);
            case LESS_THAN_OR_EQ:
                return estimateSelectivity(Predicate.Op.LESS_THAN, v + 1);
            default:
                throw new IllegalArgumentException("Operation is illegal");
        }
    }
    
    /**
     * @return
     *     the average selectivity of this histogram.
     *     
     *     This is not an indispensable method to implement the basic
     *     join optimization. It may be needed if you want to
     *     implement a more efficient optimization
     * */
    public double avgSelectivity()
    {
   
   
        // some code goes here
        int cnt = 0;
        for(int bucket : buckets
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值