R语言学习笔记——估计概率密度函数

原创已于 2025-10-09 17:49:33 修改 · 9.3k 阅读

51 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#r语言 #核函数

于 2021-03-08 16:00:25 首次发布

该文探讨了如何估计概率密度函数，包括使用R语言的fitdistrplus和logspline库进行参数法和非参数法的分布检验。文中涉及正态分布、伽马分布和贝塔分布的拟合，并通过MCMC方法进行伽马分布的参数估计。此外，还利用直方图法、核密度估计和K近邻方法进行非参数估计，展示了不同binwidth和k值对结果的影响。

估计概率密度函数

大概查看服从哪些基本的分布

library(fitdistrplus)
library(logspline)
descdist(data, discrete = FALSE)

在这里插入图片描述

一、参数法

1、正态分布

检验：

shapiro.test(data)

方法1:

fit.normal <- fitdist(data_tmp$VE,"norm")
summary(fit.normal)

方法2:

fit.normal_b<-Bnormal(data_tmp$VE, priors=list(muMean=13.651040, muSD=3.998584))
summary(fit.normal_b)

2、伽马分布

检验：

library(goft)
gamma_test(data)

方法1:

fit.gamma <- fitdist(data_tmp$VA, distr = "gamma", method = "mle")
summary(fit.gamma)

方法2:

mcmc <- function(y, iter, sigma.rw, lambda.alpha, lambda.beta, nu.alpha, nu.beta) {
  
  # find n from the data (ie the number of observations)
  n <- length(y)
  
  # compute the sum of the observations and the sum of the log of the observations
  sum.y <- sum(y)
  sum.log.y <- sum(log(y))
  
  # first create a table which will store the samples; first column will be alpha, second will be beta
  out <- matrix(NA, nrow=iter, ncol=2)
  
  # initial values
  alpha.cur <- 1
  beta.cur <- 1
  out[1,] <- c(alpha.cur, beta.cur)
  
  # mcmc loop starts here
  for (i in 2:iter) {
    
    ###############
    # update alpha (assume beta is fixed)
    ###############
    
    # propose a new value for alpha
    alpha.can <- rnorm(1, alpha.cur, sigma.rw)
    
    # if it is negative reject straight away else compute the M-H ratio
    if (alpha.can > 0) {
      
      # evaluate the loglikelihood at the current values of alpha.
      loglik.cur <- n*alpha.cur*log(beta.cur) - n*lgamma(alpha.cur) + (alpha.cur-1)*sum.log.y 
      
      # compute the log-likelihood at the candidate value of alpha
      loglik.can <- n*alpha.can*log(beta.cur) - n*lgamma(alpha.can) + (alpha.can-1)*sum.log.y 
      
      
      # log prior densities
      log.prior.alpha.cur <- log.gamma.density(alpha.cur, lambda.alpha, nu.alpha)
      log.prior.alpha.can <- log.gamma.density(alpha.can, lambda.alpha, nu.alpha)
      
      logpi.cur <- loglik.cur + log.prior.alpha.cur
      logpi.can <- loglik.can + log.prior.alpha.can
      
      # M-H ratio
      
      # draw from a U(0,1)
      u <- runif(1)
      
      if (log(u) < logpi.can - logpi.cur) {
        alpha.cur <- alpha.can
      }
    }
    
    ###############
    # update beta
    ###############
    
    beta.cur <- rgamma(1, alpha.cur*n + lambda.beta, rate = sum.y + nu.beta)  
    
    # store the samples
    out[i,] <- c(alpha.cur, beta.cur)
    
  }
  
  return(out)
}
res <- mcmc(y = data_tmp$VA, iter = 10000, sigma.rw = 1, lambda.alpha = 1, lambda.beta = 1, nu.alpha = 1e-4, nu.beta = 1e-4)
res <- res[-c(1:1000),]
mean(res[,1])
mean(res[,2])
sd(res[,1])
sd(res[,2])

3、贝塔分布

方法1:

fit.beta <- fitdist(data_tmp$VA, distr = "beta", method="mle")
summary(fit.beta)

方法2:

library("LearnBayes")
library(Lahman)
findBeta <- function(quantile1,quantile2,quantile3)
{
  # find the quantiles specified by quantile1 and quantile2 and quantile3
  quantile1_p <- quantile1[[1]]; quantile1_q <- quantile1[[2]]
  quantile2_p <- quantile2[[1]]; quantile2_q <- quantile2[[2]]
  quantile3_p <- quantile3[[1]]; quantile3_q <- quantile3[[2]]
  
  # find the beta prior using quantile1 and quantile2
  priorA <- beta.select(quantile1,quantile2)
  priorA_a <- priorA[1]; priorA_b <- priorA[2]
  
  # find the beta prior using quantile1 and quantile3
  priorB <- beta.select(quantile1,quantile3)
  priorB_a <- priorB[1]; priorB_b <- priorB[2]
  
  # find the best possible beta prior
  diff_a <- abs(priorA_a - priorB_a); diff_b <- abs(priorB_b - priorB_b)
  step_a <- diff_a / 100; step_b <- diff_b / 100
  if (priorA_a < priorB_a) { start_a <- priorA_a; end_a <- priorB_a }
  else                     { start_a <- priorB_a; end_a <- priorA_a }
  if (priorA_b < priorB_b) { start_b <- priorA_b; end_b <- priorB_b }
  else                     { start_b <- priorB_b; end_b <- priorA_b }
  steps_a <- seq(from=start_a, to=end_a, length.out=1000)
  steps_b <- seq(from=start_b, to=end_b, length.out=1000)
  max_error <- 10000000000000000000
  best_a <- 0; best_b <- 0
  for (a in steps_a)
  {
    for (b in steps_b)
    {
      # priorC is beta(a,b)
      # find the quantile1_q, quantile2_q, quantile3_q quantiles of priorC:
      priorC_q1 <- qbeta(c(quantile1_p), a, b)
      priorC_q2 <- qbeta(c(quantile2_p), a, b)
      priorC_q3 <- qbeta(c(quantile3_p), a, b)
      priorC_error <- abs(priorC_q1-quantile1_q) +
        abs(priorC_q2-quantile2_q) +
        abs(priorC_q3-quantile3_q)
      if (priorC_error < max_error)
      {
        max_error <- priorC_error; best_a <- a; best_b <- b
      }
    }
  }
  print(paste("The best beta prior has a=",best_a,"b=",best_b))
}
quantile1 <- list(p=0.5, x=median(data_tmp$VA)) 
x1<-quantile(data_tmp$VA,probs = 0.8)
x2<-quantile(data_tmp$VA,probs = 0.2)
quantile2 <- list(p=0.8,x=x1) 
quantile3 <- list(p=0.2,x=x2)
findBeta(quantile1,quantile2,quantile3)

二、非参数方法

1、直方图法

a1<-ggplot(data_tmp, aes(x=VA)) + 
  geom_histogram(binwidth=0.005,aes(y=..density..),colour="black",fill="white")+
  geom_density(alpha=.3,col='red') +
  theme_classic()+
  annotate('text', x = 0.8, y = 8, 
           label = "Delta ==0.005 ",parse = TRUE,size=4) 
a2<-ggplot(data_tmp, aes(x=VA)) + 
 geom_histogram(binwidth=0.02,aes(y=..density..),colour="black",fill="white")+
  geom_density(alpha=.3,col='red') +
  theme_classic()+
  annotate('text', x = 0.8, y = 3.8, 
           label = "Delta ==0.02 ",parse = TRUE,size=4) 
a3<-ggplot(data_tmp, aes(x=VA)) + 
  geom_histogram(binwidth=0.15,aes(y=..density..),colour="black",fill="white")+
  geom_density(alpha=.3,col='red') +
  theme_classic()+
  annotate('text', x = 0.8, y = 3.8, 
           label = "Delta ==0.15 ",parse = TRUE,size=4)   
abc<-grid.arrange(a1,a2,a3,
                  ncol=1,nrow=3,
                  as.table=TRUE)

只画下左边半个图

2、Kernel

png(file = "kernel.png",width = 1080, height = 1080, units = "px")
par(mfrow=c(1,2))
plot(density(data_tmp$VA,bw = 0.005, kernel = "gaussian"),col="red",main="Kernel Estimation",xlab="VA")
lines(density(data_tmp$VA,bw = 0.01, kernel = "gaussian"),col="blue")
lines(density(data_tmp$VA,bw = 0.05, kernel = "gaussian"),col="green")
rug(data_tmp$VA)
legend("topright", legend=c("bw=0.005", "bw=0.01","bw=0.05"),
       col=c("red", "blue","green"), lty=1:1, cex=0.7)

只画下左边半个图

3、KNN

library( basicTrendline)
dk_a=function(A,x,k){
  na=nrow(A)
  or=1:na
  dis=NULL
  for(i in 1:na)
  {dis=c(dis,(abs(x-A[i,279])))}
  disnew=sort(dis)
  dk=disnew[k]
  return(dk)
}
knear_a=function(A,x,k){
  knear=c()
  for(i in 1:nrow(A)){
    knear[i]=(k-1)/(2*nrow(A)*dk_a(A,x[i],k))}
  knear}
set.seed(123)
data_k<-data_tmp[sample(1:nrow(data_tmp),5000),]
VA<-sort(data_k$VA)
k1<-knear_a(data_k,VA,100)
k2<-knear_a(data_k,VA,500)
k3<-knear_a(data_k,VA,800)
k4<-knear_a(data_k,VA,1000)
par(mfrow=c(2,2))
plot(VA,k1,type="l",xlab="VA",ylab="Density",main='k=100')
plot(VA,k2,type="l",xlab="VA",ylab="Density",main='k=500')
plot(VA,k3,type="l",xlab="VA",ylab="Density",main='k=800')
plot(VA,k4,type="l",xlab="VA",ylab="Density",main='k=1000')