出于好奇,我对使用OpenGL上传纹理的性能做了分析,并注意到一件我认为很奇怪的事情。我使用glTexStorage2D
创建了一个4K纹理,格式为GL_RGBA8
。然后,我在每一帧都使用glTexSubImage2D
重新上传静态图像缓冲区到纹理。基于帧速率,我得到大约5.19GB/s。接下来,我将纹理格式更改为GL_SRGB8_ALPHA8
并重新进行实验。这一次我只得到了2.81GB/s,下降显著。这看起来很奇怪,因为据我所知,上传sRGB数据和上传RGB数据应该没有什么不同:上传时不应该发生任何转换(sRGB转换应该在着色器采样时进行)。
glTexSubImage2D性能怪异
一些附加信息:对于第一次测试,我在调用glTexSubImage2D
时使用GL_RGBA
和GL_UNSIGNED_INT_8_8_8_8_REV
,因为这是驱动程序(通过glGetInternalformativ
)告诉我的理想选择。同样根据驱动程序的建议,我使用GL_UNSIGNED_INT_8_8_8_8
进行第二次测试。一些测试证实这些是分别使用的最快格式。这是使用332.21驱动程序在Windows 7 x64上使用Nvidia GeForce GTX 760。
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <vector>
#include <cstdlib>
#include <cstdio>
// Width and height, in pixels, requested for the window created in main();
// the same values are uploaded to the fragment shader's screenSize uniform.
#define SCREEN_SIZE_X 1024
#define SCREEN_SIZE_Y 1024
// Stringifies its argument (via the # operator) and prefixes it with a
// "#version 440 core" line, so shader source can be written inline without
// quoting. Stringification collapses the argument's line breaks into spaces,
// so everything after the version line reaches the GLSL compiler as one line.
#define GLSL(src) "#version 440 core\n" #src
// Vertex shader source. It takes no vertex attributes: each invocation picks
// one of four hard-coded corner positions (x and y in {-1, 1}) using
// gl_VertexID and emits it with z = 0 and w = 1. main() draws it with a
// 4-vertex GL_TRIANGLE_STRIP.
const char* vertex_shader = GLSL(
const vec2 data[4] = vec2[]
(
vec2(-1.0, 1.0),
vec2(-1.0, -1.0),
vec2(1.0, 1.0),
vec2(1.0, -1.0)
);
void main()
{
gl_Position = vec4(data[gl_VertexID], 0.0, 1.0);
}
);
// Fragment shader source. Samples texture0 at the fragment's window
// coordinate divided by screenSize and writes the result to frag_color.
// The explicit uniform locations (0 = texture0, 1 = screenSize) are the ones
// main() passes to glProgramUniform1i / glProgramUniform2f.
const char* fragment_shader = GLSL(
layout(location = 0) uniform sampler2D texture0;
layout(location = 1) uniform vec2 screenSize;
out vec4 frag_color;
void main()
{
frag_color = texture(texture0, gl_FragCoord.xy/screenSize);
}
);
int main(int argc, char *argv[])
{
if(!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
GLFWwindow* window = glfwCreateWindow(SCREEN_SIZE_X, SCREEN_SIZE_Y, "OpenGL Texture Upload", nullptr, nullptr);
if(!window)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(window);
glfwSwapInterval(0);
glewExperimental = GL_TRUE;
if(glewInit() != GLEW_OK)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
GLuint vao = 0;
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
GLuint vs = glCreateShader(GL_VERTEX_SHADER);
glShaderSource(vs, 1, &vertex_shader, nullptr);
glCompileShader(vs);
GLuint fs = glCreateShader(GL_FRAGMENT_SHADER);
glShaderSource(fs, 1, &fragment_shader, nullptr);
glCompileShader(fs);
GLuint shader_program = glCreateProgram();
glAttachShader(shader_program, fs);
glAttachShader(shader_program, vs);
glLinkProgram(shader_program);
glUseProgram(shader_program);
glProgramUniform2f(shader_program, 1, SCREEN_SIZE_X, SCREEN_SIZE_Y);
GLuint texture = 0;
glGenTextures(1, &texture);
#ifdef USE_SRGB
glTextureStorage2DEXT(texture, GL_TEXTURE_2D, 1, GL_SRGB8_ALPHA8, 4096, 4096);
#else
glTextureStorage2DEXT(texture, GL_TEXTURE_2D, 1, GL_RGBA8, 4096, 4096);
#endif
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTextureParameteriEXT(texture, GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glBindMultiTextureEXT(GL_TEXTURE0, GL_TEXTURE_2D, texture);
glProgramUniform1i(shader_program, 0, 0);
std::vector<unsigned int> image_buffer(4096*4096, 0xFF0000FFul);
double lastTime = glfwGetTime();
double nbFrames = 0;
while(!glfwWindowShouldClose(window))
{
double currentTime = glfwGetTime();
nbFrames++;
if (currentTime - lastTime >= 1.0)
{
char cbuffer[50];
snprintf(cbuffer, sizeof(cbuffer), "OpenGL Texture Upload [%.1f fps, %.3f ms]", nbFrames, 1000.0/nbFrames);
glfwSetWindowTitle(window, cbuffer);
nbFrames = 0;
lastTime++;
}
#ifdef USE_SRGB
glTextureSubImage2DEXT(texture, GL_TEXTURE_2D, 0, 0, 0, 4096, 4096, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, image_buffer.data());
#else
glTextureSubImage2DEXT(texture, GL_TEXTURE_2D, 0, 0, 0, 4096, 4096, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, image_buffer.data());
#endif
glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glfwSwapBuffers(window);
glfwPollEvents();
}
glfwDestroyWindow(window);
glfwTerminate();
exit(EXIT_SUCCESS);
}
这不是解释。首先,'glGetInternalformativ'在两种情况下推荐'GL_RGBA',而不是'GL_BGRA'。其次,实验表明,在任何情况下'GL_BGRA'都不会更快。 –