clutter/frame-clock: Optimize latency for platforms missing TIMESTAMP_QUERY

Previously if we had no measurements then `compute_max_render_time_us` would pessimise its answer to ensure triple buffering could be reached: ``` if (frame_clock->state == CLUTTER_FRAME_CLOCK_STATE_DISPATCHED_ONE) ret += refresh_interval_us; ``` But that also meant entering triple buffering even when not required. Now we make `compute_max_render_time_us` more honest and return failure if the answer isn't known (or is disabled). This in turn allows us to optimize `calculate_next_update_time_us` for this special case, ensuring triple buffering can be used, but isn't blindly always used. This makes a visible difference to the latency when dragging windows in Xorg, but will also help Wayland sessions on platforms lacking TIMESTAMP_QUERY such as Raspberry Pi. Signed-off-by: Mingi Sung <sungmg@saltyming.net>
clutter/frame-clock: Record measurements of zero for cursor-only updates
2024-09-15 14:31:18 +09:00 · 2024-09-15 14:31:18 +09:00 · 2024-09-15 14:31:18 +09:00 · 2024-09-15 14:31:18 +09:00 · 2024-09-15 14:31:18 +09:00 · 2024-09-15 14:31:18 +09:00
3 changed files with 200 additions and 2 deletions
--- a/src/backends/meta-egl.c
+++ b/src/backends/meta-egl.c
@ -44,6 +44,11 @@ struct _MetaEgl
  PFNEGLCREATEIMAGEKHRPROC eglCreateImageKHR;
  PFNEGLDESTROYIMAGEKHRPROC eglDestroyImageKHR;

+  PFNEGLCREATESYNCPROC eglCreateSync;
+  PFNEGLDESTROYSYNCPROC eglDestroySync;
+  PFNEGLWAITSYNCPROC eglWaitSync;
+  PFNEGLDUPNATIVEFENCEFDANDROIDPROC eglDupNativeFenceFDANDROID;
+
  PFNEGLBINDWAYLANDDISPLAYWL eglBindWaylandDisplayWL;
  PFNEGLQUERYWAYLANDBUFFERWL eglQueryWaylandBufferWL;

@ -1162,6 +1167,90 @@ meta_egl_query_display_attrib (MetaEgl     *egl,
  return TRUE;
 }

+/*
+ * Creates an EGL sync object of the requested type on display by calling
+ * the eglCreateSync entry point stored in egl.
+ *
+ * On success the new sync is written to *egl_sync and TRUE is returned.
+ * FALSE is returned when the entry point does not pass is_egl_proc_valid()
+ * or when EGL hands back EGL_NO_SYNC (reported via set_egl_error()); in
+ * both failure cases *egl_sync is not written.
+ */
+gboolean
+meta_egl_create_sync (MetaEgl           *egl,
+                      EGLDisplay         display,
+                      EGLenum            type,
+                      const EGLAttrib   *attrib_list,
+                      EGLSync           *egl_sync,
+                      GError           **error)
+{
+  EGLSync new_sync;
+
+  if (!is_egl_proc_valid (egl->eglCreateSync, error))
+    return FALSE;
+
+  new_sync = egl->eglCreateSync (display, type, attrib_list);
+  if (new_sync != EGL_NO_SYNC)
+    {
+      /* Only publish the handle once we know it is usable. */
+      *egl_sync = new_sync;
+      return TRUE;
+    }
+
+  set_egl_error (error);
+  return FALSE;
+}
+
+/*
+ * Destroys sync on display by calling the eglDestroySync entry point
+ * stored in egl.
+ *
+ * Returns TRUE when EGL reports success. Returns FALSE when the entry
+ * point does not pass is_egl_proc_valid() or when the EGL call itself
+ * fails, in which case the failure is reported via set_egl_error().
+ */
+gboolean
+meta_egl_destroy_sync (MetaEgl     *egl,
+                       EGLDisplay   display,
+                       EGLSync      sync,
+                       GError     **error)
+{
+  if (!is_egl_proc_valid (egl->eglDestroySync, error))
+    return FALSE;
+
+  if (egl->eglDestroySync (display, sync))
+    return TRUE;
+
+  set_egl_error (error);
+  return FALSE;
+}
+
+/*
+ * Calls the eglWaitSync entry point stored in egl for sync on display,
+ * forwarding flags unchanged.
+ *
+ * Returns TRUE when EGL reports success. Returns FALSE when the entry
+ * point does not pass is_egl_proc_valid() or when the EGL call itself
+ * fails, in which case the failure is reported via set_egl_error().
+ */
+gboolean
+meta_egl_wait_sync (MetaEgl     *egl,
+                    EGLDisplay   display,
+                    EGLSync      sync,
+                    EGLint       flags,
+                    GError     **error)
+{
+  if (!is_egl_proc_valid (egl->eglWaitSync, error))
+    return FALSE;
+
+  if (egl->eglWaitSync (display, sync, flags))
+    return TRUE;
+
+  set_egl_error (error);
+  return FALSE;
+}
+
+/*
+ * Calls the eglDupNativeFenceFDANDROID entry point stored in egl for sync
+ * on display and returns the resulting file descriptor.
+ *
+ * Returns EGL_NO_NATIVE_FENCE_FD_ANDROID when the entry point does not
+ * pass is_egl_proc_valid() or when EGL itself returns that sentinel; in
+ * the latter case the failure is also reported via set_egl_error().
+ */
+EGLint
+meta_egl_duplicate_native_fence_fd (MetaEgl     *egl,
+                                    EGLDisplay   display,
+                                    EGLSync      sync,
+                                    GError     **error)
+{
+  EGLint fence_fd;
+
+  if (!is_egl_proc_valid (egl->eglDupNativeFenceFDANDROID, error))
+    return EGL_NO_NATIVE_FENCE_FD_ANDROID;
+
+  fence_fd = egl->eglDupNativeFenceFDANDROID (display, sync);
+  if (fence_fd == EGL_NO_NATIVE_FENCE_FD_ANDROID)
+    {
+      set_egl_error (error);
+    }
+
+  /* The sentinel is passed through so callers can test for it directly. */
+  return fence_fd;
+}
+
 #define GET_EGL_PROC_ADDR(proc) \
  egl->proc = (void *) eglGetProcAddress (#proc);

@ -1175,6 +1264,11 @@ meta_egl_constructed (GObject *object)
  GET_EGL_PROC_ADDR (eglCreateImageKHR);
  GET_EGL_PROC_ADDR (eglDestroyImageKHR);

+  GET_EGL_PROC_ADDR (eglCreateSync);
+  GET_EGL_PROC_ADDR (eglDestroySync);
+  GET_EGL_PROC_ADDR (eglWaitSync);
+  GET_EGL_PROC_ADDR (eglDupNativeFenceFDANDROID);
+
  GET_EGL_PROC_ADDR (eglBindWaylandDisplayWL);
  GET_EGL_PROC_ADDR (eglQueryWaylandBufferWL);

--- a/src/backends/meta-egl.h
+++ b/src/backends/meta-egl.h
@ -276,3 +276,26 @@ gboolean meta_egl_query_display_attrib (MetaEgl     *egl,
                                        EGLint       attribute,
                                        EGLAttrib   *value,
                                        GError     **error);
+
+gboolean meta_egl_create_sync (MetaEgl           *egl,
+                               EGLDisplay         display,
+                               EGLenum            type,
+                               const EGLAttrib   *attrib_list,
+                               EGLSync           *egl_sync,
+                               GError           **error);
+
+gboolean meta_egl_destroy_sync (MetaEgl     *egl,
+                                EGLDisplay   display,
+                                EGLSync      sync,
+                                GError     **error);
+
+gboolean meta_egl_wait_sync (MetaEgl     *egl,
+                             EGLDisplay   display,
+                             EGLSync      sync,
+                             EGLint       flags,
+                             GError     **error);
+
+EGLint meta_egl_duplicate_native_fence_fd (MetaEgl     *egl,
+                                           EGLDisplay   display,
+                                           EGLSync      sync,
+                                           GError     **error);
--- a/src/backends/native/meta-onscreen-native.c
+++ b/src/backends/native/meta-onscreen-native.c
@ -29,6 +29,7 @@

 #include "backends/native/meta-onscreen-native.h"

+#include <glib/gstdio.h>
 #include <drm_fourcc.h>

 #include "backends/meta-egl-ext.h"
@ -879,19 +880,51 @@ copy_shared_framebuffer_gpu (CoglOnscreen                         *onscreen,
  CoglFramebuffer *framebuffer = COGL_FRAMEBUFFER (onscreen);
  CoglContext *cogl_context = cogl_framebuffer_get_context (framebuffer);
  CoglDisplay *cogl_display = cogl_context_get_display (cogl_context);
+  CoglRendererEGL *cogl_renderer_egl = cogl_context->display->renderer->winsys;
  MetaRenderDevice *render_device;
-  EGLDisplay egl_display;
+  EGLDisplay egl_display = NULL;
  gboolean use_modifiers;
  MetaDeviceFile *device_file;
  MetaDrmBufferFlags flags;
  MetaDrmBufferGbm *buffer_gbm = NULL;
  struct gbm_bo *bo;
+  EGLSync primary_gpu_egl_sync = EGL_NO_SYNC;
+  EGLSync secondary_gpu_egl_sync = EGL_NO_SYNC;
+  g_autofd int primary_gpu_sync_fence = EGL_NO_NATIVE_FENCE_FD_ANDROID;

  COGL_TRACE_BEGIN_SCOPED (CopySharedFramebufferSecondaryGpu,
                           "copy_shared_framebuffer_gpu()");

  if (renderer_gpu_data->secondary.needs_explicit_sync)
-    cogl_framebuffer_finish (COGL_FRAMEBUFFER (onscreen));
+    {
+      if (!meta_egl_create_sync (egl,
+                                cogl_renderer_egl->edpy,
+                                EGL_SYNC_NATIVE_FENCE_ANDROID,
+                                NULL,
+                                &primary_gpu_egl_sync,
+                                error))
+       {
+         g_prefix_error (error, "Failed to create EGLSync on primary GPU: ");
+         return NULL;
+       }
+
+      // According to the EGL_KHR_fence_sync specification we must ensure
+      // the fence command is flushed in this context to be able to await it
+      // in another (secondary GPU context) or we risk waiting indefinitely.
+      cogl_framebuffer_flush (COGL_FRAMEBUFFER (onscreen));
+
+      primary_gpu_sync_fence =
+        meta_egl_duplicate_native_fence_fd (egl,
+                                            cogl_renderer_egl->edpy,
+                                            primary_gpu_egl_sync,
+                                            error);
+
+      if (primary_gpu_sync_fence == EGL_NO_NATIVE_FENCE_FD_ANDROID)
+        {
+          g_prefix_error (error, "Failed to duplicate EGLSync FD on primary GPU: ");
+          goto done;
+        }
+    }

  render_device = renderer_gpu_data->render_device;
  egl_display = meta_render_device_get_egl_display (render_device);
@ -907,6 +940,40 @@ copy_shared_framebuffer_gpu (CoglOnscreen                         *onscreen,
      goto done;
    }

+  if (primary_gpu_sync_fence != EGL_NO_NATIVE_FENCE_FD_ANDROID)
+    {
+      EGLAttrib attribs[3];
+
+      attribs[0] = EGL_SYNC_NATIVE_FENCE_FD_ANDROID;
+      attribs[1] = primary_gpu_sync_fence;
+      attribs[2] = EGL_NONE;
+
+      if (!meta_egl_create_sync (egl,
+                                egl_display,
+                                EGL_SYNC_NATIVE_FENCE_ANDROID,
+                                attribs,
+                                &secondary_gpu_egl_sync,
+                                error))
+        {
+          g_prefix_error (error, "Failed to create EGLSync on secondary GPU: ");
+          goto done;
+        }
+
+      // eglCreateSync takes ownership of an existing fd that is passed, so
+      // don't try to clean it up twice.
+      primary_gpu_sync_fence = EGL_NO_NATIVE_FENCE_FD_ANDROID;
+
+      if (!meta_egl_wait_sync (egl,
+                               egl_display,
+                               secondary_gpu_egl_sync,
+                               0,
+                               error))
+        {
+          g_prefix_error (error, "Failed to wait for EGLSync on secondary GPU: ");
+          goto done;
+        }
+    }
+
  buffer_gbm = META_DRM_BUFFER_GBM (primary_gpu_fb);
  bo = meta_drm_buffer_gbm_get_bo (buffer_gbm);
  if (!meta_renderer_native_gles3_blit_shared_bo (egl,
@ -956,6 +1023,20 @@ copy_shared_framebuffer_gpu (CoglOnscreen                         *onscreen,
 done:
  _cogl_winsys_egl_ensure_current (cogl_display);

+  if (primary_gpu_egl_sync != EGL_NO_SYNC &&
+      !meta_egl_destroy_sync (egl,
+                              cogl_renderer_egl->edpy,
+                              primary_gpu_egl_sync,
+                              error))
+    g_prefix_error (error, "Failed to destroy primary GPU EGLSync: ");
+
+  if (secondary_gpu_egl_sync != EGL_NO_SYNC &&
+      !meta_egl_destroy_sync (egl,
+                              egl_display,
+                              secondary_gpu_egl_sync,
+                              error))
+    g_prefix_error (error, "Failed to destroy secondary GPU EGLSync: ");
+
  return buffer_gbm ? META_DRM_BUFFER (buffer_gbm) : NULL;
 }
Author	SHA1	Message	Date
Daniel van Vugt	5488009f59	clutter/frame-clock: Optimize latency for platforms missing TIMESTAMP_QUERY Previously if we had no measurements then `compute_max_render_time_us` would pessimise its answer to ensure triple buffering could be reached: ``` if (frame_clock->state == CLUTTER_FRAME_CLOCK_STATE_DISPATCHED_ONE) ret += refresh_interval_us; ``` But that also meant entering triple buffering even when not required. Now we make `compute_max_render_time_us` more honest and return failure if the answer isn't known (or is disabled). This in turn allows us to optimize `calculate_next_update_time_us` for this special case, ensuring triple buffering can be used, but isn't blindly always used. This makes a visible difference to the latency when dragging windows in Xorg, but will also help Wayland sessions on platforms lacking TIMESTAMP_QUERY such as Raspberry Pi. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	6e7297e764	clutter/frame-clock: Record measurements of zero for cursor-only updates But only if we've ever got actual swap measurements (COGL_FEATURE_ID_TIMESTAMP_QUERY). If it's supported then we now drop to double buffering and get optimal latency on a burst of cursor-only updates. Closes: https://launchpad.net/bugs/2023363 Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	e3b2344420	onscreen/native: Avoid callbacks on "detached" onscreens Detached onscreens have no valid view so avoid servicing callbacks on them during/after sleep mode. As previously mentioned in `45bda2d969`. Closes: https://launchpad.net/bugs/2020049 Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	b935844a4c	tests/native-kms-render: Fix failing client-scanout test It was assuming an immediate transition from compositing (triple buffering) to direct scanout (double buffering), whereas there is a one frame delay in that transition as the buffer queue shrinks. We don't lose any frames in the transition. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	d345de78c2	clutter/frame-clock: Conditionally disable triple buffering 1. When direct scanout is attempted There's no compositing during direct scanout so the "render" time is zero. Thus there is no need to implement triple buffering for direct scanouts. Stick to double buffering and enjoy the lower latency. 2. If disabled by environment variable MUTTER_DEBUG_TRIPLE_BUFFERING With possible values {never, auto, always} where auto is the default. 3. When VRR is in use VRR calls `clutter_frame_clock_schedule_update_now` which would keep the buffer queue full, which in turn prevented direct scanout mode. Because OnscreenNative currently only supports direct scanout with double buffering. We now break that feedback loop by preventing triple buffering from being scheduled when the frame clock mode becomes variable. Long term this could also be solved by supporting triple buffering in direct scanout mode. But whether or not that would be desirable given the latency penalty remains to be seen. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	6523517350	clutter: Pass ClutterFrameHint(s) to the frame clock Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:18 +09:00
Daniel van Vugt	f366a7d931	backends: Flag that the frame attempted direct scanout We need this hint whether direct scanout succeeds or fails because it's the mechanism by which we will tell the clock to enforce double buffering, thus making direct scanout possible on future frames. Triple buffering will be disabled until such time that direct scanout is not being attempted. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	fcea00f63a	clutter/frame: Add ClutterFrameHint to ClutterFrame This will allow the backend to provide performance hints to the frame clock in future. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	a1e6d2242b	clutter/frame-clock: Log N-buffers in CLUTTER_DEBUG=frame-timings Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	0b2e48db6f	clutter/frame-clock: Add triple buffering support Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	2ae303bb95	clutter/frame-clock: Merge states DISPATCHING and PENDING_PRESENTED Chronologically they already overlap in time as presentation may complete in the middle of the dispatch function, otherwise they are contiguous in time. And most switch statements treated the two states the same already so they're easy to merge into a single `DISPATCHED` state. Having fewer states now will make life easier when we add more states later. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	a4ac229578	clutter/frame-clock: Lower the threshold for disabling error diffusion Error diffusion was introduced in `0555a5bbc1` for Nvidia where last presentation time is always unknown (zero). Dispatch times would drift apart always being a fraction of a frame late, and accumulated to cause periodic frame skips. So error diffusion corrected that precisely and avoided the skips. That works great with double buffering but less great with triple buffering. It's certainly still needed with triple buffering but correcting for a lateness of many milliseconds isn't a good idea. That's because a dispatch being that late is not due to main loop jitter but due to Nvidia's swap buffers blocking when the queue is full. So scheduling the next frame even earlier using last_dispatch_lateness_us would just perpetuate the problem of swap buffers blocking for too long. So now we lower the threshold of when error diffusion gets disabled. It's still high enough to fix the original smoothness problem it was for, but now low enough to detect Nvidia's occasionally blocking swaps and backs off in that case. Since the average duration of a blocking swap is half a frame interval and we want to distinguish between that and sub-millisecond jitter, the logical threshold is halfway again: refresh_interval_us/4. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	a0248cb618	renderer/native: Discard pending swaps when rebuilding views It's analogous to discard_pending_page_flips but represents swaps that might become flips after the next frame notification callbacks, thanks to triple buffering. Since the views are being rebuilt and their onscreens are about to be destroyed, turning those swaps into more flips/posts would just lead to unexpected behaviour (like trying to flip on a half-destroyed inactive CRTC). Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	bc1ec8e24e	onscreen/native: Skip try_post_latest_swap if shutting down Otherwise we could get: meta_kms_prepare_shutdown -> flush_callbacks -> ... -> try_post_latest_swap -> post and queue more callbacks So later in shutdown those callbacks would trigger an assertion failure in meta_kms_impl_device_atomic_finalize: g_hash_table_size (impl_device_atomic->page_flip_datas) == 0 Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	501a5cc512	onscreen/native: Add function meta_onscreen_native_discard_pending_swaps Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:17 +09:00
Daniel van Vugt	66dd0826a8	onscreen/native: Increase secondary GPU dumb_fbs from 2 to 3 So that they don't get overwritten prematurely during triple buffering causing tearing. https://launchpad.net/bugs/1999216 Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	febb9a4261	onscreen/native: Defer posting if there's already a post in progress And when the number of pending posts decreases we know it's safe to submit a new one. Since KMS generally only supports one outstanding post right now, "decreases" means equal to zero. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	167b013b99	onscreen/native: Insert a 'posted' frame between 'next' and 'presented' This will allow us to keep track of up to two buffers that have been swapped but not yet scanning out, for triple buffering. This commit replaces mutter!1968 Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	044997b8cc	onscreen/native: Split swap_buffers_with_damage into two functions 1. The EGL part: meta_onscreen_native_swap_buffers_with_damage 2. The KMS part: post_latest_swap Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	7edfbcceb7	onscreen/native: Deduplicate calls to clutter_frame_set_result All paths out of `meta_onscreen_native_swap_buffers_with_damage` from here onward would set the same `CLUTTER_FRAME_RESULT_PENDING_PRESENTED` (or terminate with `g_assert_not_reached`). Even failed posts set this result because they will do a `meta_onscreen_native_notify_frame_complete` in `page_flip_feedback_discarded`. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	3205e666fe	onscreen/native: Replace an assertion that double buffering is the maximum Because it soon won't be the maximum. But we do want to verify that the frame info queue is not empty, to avoid NULL dereferencing and catch logic errors. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	fbfaeb56a6	onscreen/native: Log swapbuffers and N-buffering when MUTTER_DEBUG=kms Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	5dc8b2f73a	backends/native: Add set/get_damage functions to MetaFrameNative Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:16 +09:00
Daniel van Vugt	d4e9b1f8d5	renderer/native: Steal the power save flip list before iterating over it Because a single iteration might also grow the list again. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:15 +09:00
Daniel van Vugt	fbac742306	renderer/native: Avoid requeuing the same onscreen for a power save flip This is a case that triple buffering will encounter. We don't want it to queue the same onscreen multiple times because that would represent multiple flips occurring simultaneously. It's a linear search but the list length is typically only 1 or 2 so no need for anything fancier yet. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:15 +09:00
Daniel van Vugt	bd521be148	kms: Keep a shutting_down flag Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:15 +09:00
Daniel van Vugt	8ed6470b31	cogl/onscreen: Indent declaration parameters to align with above This fixes warnings from check-code-style. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:15 +09:00
Daniel van Vugt	e02a8e15b1	cogl/onscreen: Add function cogl_onscreen_get_pending_frame_count Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:31:15 +09:00
Gert-dev	4726186224	onscreen/native: Use EGLSyncs instead of cogl_framebuffer_finish cogl_framebuffer_finish can result in a CPU-side stall because it waits for the primary GPU to flush and execute all commands that were queued before that. By using a GPU-side EGLSync we can let the primary GPU inform us when it is done with the queued commands instead. We then create another EGLSync on the secondary GPU using the same fd so the primary GPU effectively signals the secondary GPU when it is done rendering, causing the latter to wait for the former before copying part of the frames it needs for monitors attached to it directly. This solves the corruption that cogl_framebuffer_finish also solved, but without needing a CPU-side stall. Signed-off-by: Mingi Sung <sungmg@saltyming.net>	2024-09-15 14:30:55 +09:00