// legendre_p.cu
#include "cuda_runtime.h"
#include <cuda.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
#include <chrono>
#define M 1000000 // M points evenly dividing the interval [-1, 1]
#define N 100 // order of the Legendre polynomial
// https://people.sc.fsu.edu/~jburkardt/cpp_src/legendre_polynomial/legendre_polynomial.cpp
// https://people.sc.fsu.edu/~jburkardt/cpp_src/legendre_polynomial/legendre_polynomial.html
float *p_polynomial_value ( int m, int n, float x[] )
{
int i;
int j;
float *v;
if ( n < 0 ) return NULL;
v = new float[m*(n+1)];
for ( i = 0; i < m; i++ ) v[i+0*m] = 1.0f;
if ( n < 1 ) return v;
for ( i = 0; i < m; i++ ) v[i+1*m] = x[i];
for ( j = 2; j <= n; j++ )
{
for ( i = 0; i < m; i++ )
{
v[i+j*m] = ( ( float ) ( 2 * j - 1 ) * x[i] * v[i+(j-1)*m]
- ( float ) ( j - 1 ) * v[i+(j-2)*m] )
/ ( float ) ( j

本文介绍了使用 CUDA 优化勒让德多项式计算的过程:将 double 类型转换为 float 类型后,速度提升约 3 倍;同时,通过减少全局内存访问,进一步提升了效率。实验结果显示,在计算 1000000 个点、100 阶的勒让德多项式时,GPU 版本相较于 CPU 版本加速了 400 倍。在 Release 编译模式下,GPU 版本依然保持 80 倍的速度优势。
&spm=1001.2101.3001.5002&articleId=122894533&d=1&t=3&u=3a5119593d5e43e589362d08dfed09f6)
598

被折叠的 条评论
为什么被折叠?



