optimize performance

Signed-off-by: ZhuohaoHe <[email protected]>
Simple-XX · Sep 17, 2024 · 02caa9e · 02caa9e
1 parent 081444c
commit 02caa9e
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 19 deletions.
diff --git a/src/include/face.hpp b/src/include/face.hpp
@@ -39,9 +39,9 @@ class Face {
 
   // Get functions
   // 获取函数
-  const std::array<size_t, 3>& GetIndices() const { return indices_; }
-  const size_t GetIndex(size_t index) const { return indices_[index]; }
-  const Material& GetMaterial() const { return material_; }
   // Returns a const reference to the three vertex indices of this face.
   // NOTE(review): functions defined inside the class body are already
   // implicitly inline, so the keyword added here is redundant.
+  inline const std::array<size_t, 3>& GetIndices() const { return indices_; }
   // Returns the entry of indices_ at position `index`. indices_ holds three
   // entries and operator[] does no bounds checking, so callers must pass 0..2.
+  inline const size_t GetIndex(size_t index) const { return indices_[index]; }
   // Returns a const reference to the material stored on this face.
+  inline const Material& GetMaterial() const { return material_; }
 
  private:
   std::array<size_t, 3> indices_;

diff --git a/src/include/renderer.h b/src/include/renderer.h
@@ -58,7 +58,6 @@ class SimpleRenderer {
  private:
   const size_t height_;
   const size_t width_;
-  std::shared_ptr<float[]> depth_buffer_;
   LogSystem log_system_;
 
   std::shared_ptr<Shader> shader_;
@@ -69,8 +68,7 @@ class SimpleRenderer {
    * @param model 模型
    */
   void DrawModel(const Model &model, uint32_t *buffer);
-
-  void ClearDepthBuffer();
   /**
    * Alternative to DrawModel. According to the comment on its definition in
    * renderer.cpp, it collects all fragments per pixel before processing them,
    * which costs more memory and is slower.
    * @param model  Model to draw.
    * @param buffer Output color buffer.
    */
+  void DrawModelSlower(const Model &model, uint32_t *buffer);
 };
 }  // namespace simple_renderer
 

diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp
@@ -41,10 +41,10 @@ class Vertex {
 
   // Getter functions
   // 获取函数
-  [[nodiscard]] Vector4f GetPosition() const { return position_; }
-  [[nodiscard]] Vector3f GetNormal() const { return normal_; }
-  [[nodiscard]] Vector2f GetTexCoords() const { return texCoords_; }
-  [[nodiscard]] Color GetColor() const { return color_; }
   // Each getter returns a copy of the corresponding member by value.
   // Returns the vertex position (position_).
+  [[nodiscard]] inline Vector4f GetPosition() const { return position_; }
   // Returns the vertex normal (normal_).
+  [[nodiscard]] inline Vector3f GetNormal() const { return normal_; }
   // Returns the texture coordinates (texCoords_).
+  [[nodiscard]] inline Vector2f GetTexCoords() const { return texCoords_; }
   // Returns the vertex color (color_).
+  [[nodiscard]] inline Color GetColor() const { return color_; }
 
  private:
   Vector4f position_;   // 3D position, 3D顶点坐标

diff --git a/src/renderer.cpp b/src/renderer.cpp
@@ -37,28 +37,148 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height)
       width_(width),
       log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)) {
   rasterizer_ = std::make_shared<Rasterizer>(width, height);
-  // init depth buffer
-  depth_buffer_ = std::shared_ptr<float[]>(new float[width * height],
-                                           std::default_delete<float[]>());
 }
 
 // Renders `model` with `shader` into `buffer`: logs the model path, stores a
 // copy of the shader in shader_, then delegates the drawing to DrawModel.
 bool SimpleRenderer::Render(const Model &model, const Shader &shader,
                             uint32_t *buffer) {
   SPDLOG_INFO("render model: {}", model.GetModelPath());
-  ClearDepthBuffer();
   // Copy the caller's shader into a new shared_ptr; DrawModel reads shader_.
   shader_ = std::make_shared<Shader>(shader);
   DrawModel(model, buffer);
   // Always reports success; this function has no failure path.
   return true;
 }
 
-void SimpleRenderer::ClearDepthBuffer() {
-  std::fill(depth_buffer_.get(), depth_buffer_.get() + width_ * height_,
-            std::numeric_limits<float>::infinity());
-}
+/**
+ * Draws the model into the caller's color buffer, depth-testing every
+ * fragment as soon as it is rasterized.
+ *
+ * Only the closest fragment seen so far for a pixel is shaded and kept, so no
+ * per-pixel fragment lists are stored. Each OpenMP thread renders its share of
+ * the faces into its own depth/color buffers; the per-thread results are then
+ * merged per pixel by minimum depth.
+ *
+ * @param model  Model whose vertices and faces are drawn.
+ * @param buffer Output color buffer. Exactly width_ * height_ values are
+ *               written; pixels covered by no fragment are written as 0.
+ */
 void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
   SPDLOG_INFO("draw {}", model.GetModelPath());
 
+  const size_t pixel_count = width_ * height_;
+
+  /* * * Vertex Shader * * */
+  std::vector<std::vector<Vertex>> processed_vertices_all_thread(kNProc);
+  // `model` is only read, so it is shared: firstprivate would copy-construct
+  // the whole model once per thread.
+#pragma omp parallel num_threads(kNProc) default(none) \
+    shared(shader_, processed_vertices_all_thread, model)
+  {
+    int thread_id = omp_get_thread_num();
+    std::vector<Vertex> &processedVertices_per_thread =
+        processed_vertices_all_thread[thread_id];
+
+    // schedule(static) gives each thread at most one contiguous chunk,
+    // assigned in thread-number order. Concatenating the per-thread results
+    // below therefore restores the original vertex order, which the face
+    // indices rely on. The default schedule is implementation-defined.
+#pragma omp for schedule(static)
+    for (const auto &v : model.GetVertices()) {
+      auto vertex = shader_->VertexShader(v);
+      processedVertices_per_thread.push_back(vertex);
+    }
+  }
+
+  size_t vertex_count = 0;
+  for (const auto &processedVertices_per_thread :
+       processed_vertices_all_thread) {
+    vertex_count += processedVertices_per_thread.size();
+  }
+
+  std::vector<Vertex> processedVertices;
+  processedVertices.reserve(vertex_count);
+  for (const auto &processedVertices_per_thread :
+       processed_vertices_all_thread) {
+    processedVertices.insert(processedVertices.end(),
+                             processedVertices_per_thread.begin(),
+                             processedVertices_per_thread.end());
+  }
+  /*  *  *  *  *  *  *  */
+
+  /* * * Rasterization * * */
+  std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
+  std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
+
+  for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+    depthBuffer_all_thread[thread_id] =
+        std::make_unique<float[]>(pixel_count);
+    // make_unique<T[]> value-initializes its elements, so every color buffer
+    // already starts out as all zeros.
+    colorBuffer_all_thread[thread_id] =
+        std::make_unique<uint32_t[]>(pixel_count);
+
+    std::fill_n(depthBuffer_all_thread[thread_id].get(), pixel_count,
+                std::numeric_limits<float>::infinity());
+  }
+
+  // No whitespace may follow the line-continuation backslashes below.
+#pragma omp parallel num_threads(kNProc) default(none) \
+    shared(processedVertices, rasterizer_, shader_, width_, height_, \
+               depthBuffer_all_thread, colorBuffer_all_thread, model)
+  {
+    int thread_id = omp_get_thread_num();
+    auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
+    auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
+#pragma omp for
+    for (const auto &f : model.GetFaces()) {
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
+
+      const Material *material = &f.GetMaterial();
+
+      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+      for (auto &fragment : fragments) {
+        fragment.material = material;
+
+        size_t x = fragment.screen_coord[0];
+        size_t y = fragment.screen_coord[1];
+
+        // Skip fragments that fall outside the framebuffer.
+        if (x >= width_ || y >= height_) {
+          continue;
+        }
+
+        size_t index = x + y * width_;
+
+        // Shade only fragments that are closer than what this thread has
+        // already stored for the pixel.
+        if (fragment.depth < depthBuffer_per_thread[index]) {
+          depthBuffer_per_thread[index] = fragment.depth;
+
+          /* * * Fragment Shader * * */
+          auto color = shader_->FragmentShader(fragment);
+          colorBuffer_per_thread[index] = static_cast<uint32_t>(color);
+        }
+      }
+    }
+  }
+
+  // Merge: for every pixel take the color from the thread that holds the
+  // smallest depth and write it straight into the caller's buffer. Pixels
+  // that no thread touched keep infinite depth and are written as 0.
+#pragma omp parallel for
+  for (size_t i = 0; i < pixel_count; i++) {
+    float min_depth = std::numeric_limits<float>::infinity();
+    uint32_t color = 0;
+
+    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+      const float depth = depthBuffer_all_thread[thread_id][i];
+      if (depth < min_depth) {
+        min_depth = depth;
+        color = colorBuffer_all_thread[thread_id][i];
+      }
+    }
+    buffer[i] = color;
+  }
+}
+
+/*
+Organizes processing to simulate how OpenGL works with GPUs by collecting all
+fragments per pixel before processing, closely mimicking the GPU pipeline but
+leading to increased memory usage and slower performance.
+
+组织处理方式模拟 OpenGL 在 GPU
+上的工作原理，先收集每个像素的所有片段再并行处理屏幕上的每个像素，模仿 GPU
+管线，但导致内存使用增加和渲染速度变慢
+*/
+void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
+  SPDLOG_INFO("draw {}", model.GetModelPath());
+
   /* * * Vertex Shader * * */
   std::vector<Vertex> processedVertex;
   std::vector<std::vector<Vertex>> processed_vertices_per_thread(kNProc);
@@ -152,6 +272,6 @@ void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
     }
   }
   /*  *  *  *  *  *  *  */
-}  // namespace simple_renderer
+}
 
 }  // namespace simple_renderer