估计概率密度函数
大概查看服从哪些基本的分布
library(fitdistrplus)
library(logspline)
descdist(data, discrete = FALSE)

一、参数法
1、正态分布
检验:
shapiro.test(data)
方法1:
fit.normal <- fitdist(data_tmp$VE,"norm")
summary(fit.normal)
方法2:
fit.normal_b<-Bnormal(data_tmp$VE, priors=list(muMean=13.651040, muSD=3.998584))
summary(fit.normal_b)
2、伽马分布
检验:
library(goft)
gamma_test(data)
方法1:
fit.gamma <- fitdist(data_tmp$VA, distr = "gamma", method = "mle")
summary(fit.gamma)
方法2:
mcmc <- function(y, iter, sigma.rw, lambda.alpha, lambda.beta, nu.alpha, nu.beta) {
# find n from the data (ie the number of observations)
n <- length(y)
# compute the sum of the observations and the sum of the log of the observations
sum.y <- sum(y)
sum.log.y <- sum(log(y))
# first create a table which will store the samples; first column will be alpha, second will be beta
out <- matrix(NA, nrow=iter, ncol=2)
# initial values
alpha.cur <- 1
beta.cur <- 1
out[1,] <- c(alpha.cur, beta.cur)
# mcmc loop starts here
for (i in 2:iter) {
###############
# update alpha (assume beta is fixed)
###############
# propose a new value for alpha
alpha.can <- rnorm(1, alpha.cur, sigma.rw)
# if it is negative reject straight away else compute the M-H ratio
if (alpha.can > 0) {
# evaluate the loglikelihood at the current values of alpha.
loglik.cur <- n*alpha.cur*log(beta.cur) - n*lgamma(alpha.cur) + (alpha.cur-1)*sum.log.y
# compute the log-likelihood at the candidate value of alpha
loglik.can <- n*alpha.can*log(beta.cur) - n*lgamma(alpha.can) + (alpha.can-1)*sum.log.y
# log prior densities
log.prior.alpha.cur <- log.gamma.density(alpha.cur, lambda.alpha, nu.alpha)
log.prior.alpha.can <- log.gamma.density(alpha.can, lambda.alpha, nu.alpha)
logpi.cur <- loglik.cur + log.prior.alpha.cur
logpi.can <- loglik.can + log.prior.alpha.can
# M-H ratio
# draw from a U(0,1)
u <- runif(1)
if (log(u) < logpi.can - logpi.cur) {
alpha.cur <- alpha.can
}
}
###############
# update beta
###############
beta.cur <- rgamma(1, alpha.cur*n + lambda.beta, rate = sum.y + nu.beta)
# store the samples
out[i,] <- c(alpha.cur, beta.cur)
}
return(out)
}
res <- mcmc(y = data_tmp$VA, iter = 10000, sigma.rw = 1, lambda.alpha = 1, lambda.beta = 1, nu.alpha = 1e-4, nu.beta = 1e-4)
res <- res[-c(1:1000),]
mean(res[,1])
mean(res[,2])
sd(res[,1])
sd(res[,2])
3、贝塔分布
方法1:
fit.beta <- fitdist(data_tmp$VA, distr = "beta", method="mle")
summary(fit.beta)
方法2:
library("LearnBayes")
library(Lahman)
findBeta <- function(quantile1,quantile2,quantile3)
{
# find the quantiles specified by quantile1 and quantile2 and quantile3
quantile1_p <- quantile1[[1]]; quantile1_q <- quantile1[[2]]
quantile2_p <- quantile2[[1]]; quantile2_q <- quantile2[[2]]
quantile3_p <- quantile3[[1]]; quantile3_q <- quantile3[[2]]
# find the beta prior using quantile1 and quantile2
priorA <- beta.select(quantile1,quantile2)
priorA_a <- priorA[1]; priorA_b <- priorA[2]
# find the beta prior using quantile1 and quantile3
priorB <- beta.select(quantile1,quantile3)
priorB_a <- priorB[1]; priorB_b <- priorB[2]
# find the best possible beta prior
diff_a <- abs(priorA_a - priorB_a); diff_b <- abs(priorB_b - priorB_b)
step_a <- diff_a / 100; step_b <- diff_b / 100
if (priorA_a < priorB_a) { start_a <- priorA_a; end_a <- priorB_a }
else { start_a <- priorB_a; end_a <- priorA_a }
if (priorA_b < priorB_b) { start_b <- priorA_b; end_b <- priorB_b }
else { start_b <- priorB_b; end_b <- priorA_b }
steps_a <- seq(from=start_a, to=end_a, length.out=1000)
steps_b <- seq(from=start_b, to=end_b, length.out=1000)
max_error <- 10000000000000000000
best_a <- 0; best_b <- 0
for (a in steps_a)
{
for (b in steps_b)
{
# priorC is beta(a,b)
# find the quantile1_q, quantile2_q, quantile3_q quantiles of priorC:
priorC_q1 <- qbeta(c(quantile1_p), a, b)
priorC_q2 <- qbeta(c(quantile2_p), a, b)
priorC_q3 <- qbeta(c(quantile3_p), a, b)
priorC_error <- abs(priorC_q1-quantile1_q) +
abs(priorC_q2-quantile2_q) +
abs(priorC_q3-quantile3_q)
if (priorC_error < max_error)
{
max_error <- priorC_error; best_a <- a; best_b <- b
}
}
}
print(paste("The best beta prior has a=",best_a,"b=",best_b))
}
quantile1 <- list(p=0.5, x=median(data_tmp$VA))
x1<-quantile(data_tmp$VA,probs = 0.8)
x2<-quantile(data_tmp$VA,probs = 0.2)
quantile2 <- list(p=0.8,x=x1)
quantile3 <- list(p=0.2,x=x2)
findBeta(quantile1,quantile2,quantile3)
二、非参数方法
1、直方图法
a1<-ggplot(data_tmp, aes(x=VA)) +
geom_histogram(binwidth=0.005,aes(y=..density..),colour="black",fill="white")+
geom_density(alpha=.3,col='red') +
theme_classic()+
annotate('text', x = 0.8, y = 8,
label = "Delta ==0.005 ",parse = TRUE,size=4)
a2<-ggplot(data_tmp, aes(x=VA)) +
geom_histogram(binwidth=0.02,aes(y=..density..),colour="black",fill="white")+
geom_density(alpha=.3,col='red') +
theme_classic()+
annotate('text', x = 0.8, y = 3.8,
label = "Delta ==0.02 ",parse = TRUE,size=4)
a3<-ggplot(data_tmp, aes(x=VA)) +
geom_histogram(binwidth=0.15,aes(y=..density..),colour="black",fill="white")+
geom_density(alpha=.3,col='red') +
theme_classic()+
annotate('text', x = 0.8, y = 3.8,
label = "Delta ==0.15 ",parse = TRUE,size=4)
abc<-grid.arrange(a1,a2,a3,
ncol=1,nrow=3,
as.table=TRUE)

只画下左边半个图
2、Kernel
png(file = "kernel.png",width = 1080, height = 1080, units = "px")
par(mfrow=c(1,2))
plot(density(data_tmp$VA,bw = 0.005, kernel = "gaussian"),col="red",main="Kernel Estimation",xlab="VA")
lines(density(data_tmp$VA,bw = 0.01, kernel = "gaussian"),col="blue")
lines(density(data_tmp$VA,bw = 0.05, kernel = "gaussian"),col="green")
rug(data_tmp$VA)
legend("topright", legend=c("bw=0.005", "bw=0.01","bw=0.05"),
col=c("red", "blue","green"), lty=1:1, cex=0.7)

只画下左边半个图
3、KNN
library( basicTrendline)
dk_a=function(A,x,k){
na=nrow(A)
or=1:na
dis=NULL
for(i in 1:na)
{dis=c(dis,(abs(x-A[i,279])))}
disnew=sort(dis)
dk=disnew[k]
return(dk)
}
knear_a=function(A,x,k){
knear=c()
for(i in 1:nrow(A)){
knear[i]=(k-1)/(2*nrow(A)*dk_a(A,x[i],k))}
knear}
set.seed(123)
data_k<-data_tmp[sample(1:nrow(data_tmp),5000),]
VA<-sort(data_k$VA)
k1<-knear_a(data_k,VA,100)
k2<-knear_a(data_k,VA,500)
k3<-knear_a(data_k,VA,800)
k4<-knear_a(data_k,VA,1000)
par(mfrow=c(2,2))
plot(VA,k1,type="l",xlab="VA",ylab="Density",main='k=100')
plot(VA,k2,type="l",xlab="VA",ylab="Density",main='k=500')
plot(VA,k3,type="l",xlab="VA",ylab="Density",main='k=800')
plot(VA,k4,type="l",xlab="VA",ylab="Density",main='k=1000')

该文探讨了如何估计概率密度函数,包括使用R语言的fitdistrplus和logspline库进行参数法和非参数法的分布检验。文中涉及正态分布、伽马分布和贝塔分布的拟合,并通过MCMC方法进行伽马分布的参数估计。此外,还利用直方图法、核密度估计和K近邻方法进行非参数估计,展示了不同binwidth和k值对结果的影响。

858

被折叠的 条评论
为什么被折叠?



