From 36cfd15fb25a25213199eccfa1289962ee2cd7c7 Mon Sep 17 00:00:00 2001
From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com>
Date: Wed, 28 Aug 2024 17:15:59 -0400
Subject: [PATCH] #10718: Fix issue with negative pipeline queue times (#12010)

* #10718: Filter out jobs with a start time lower than the pipeline start time because that means that it's a re-run and we need to account for already-passing jobs

* #10718: Comment for the thing we just did

* #10718: Add workflow run attempt as an arg so we can test diff attempts
---
 .github/workflows/_produce-data.yaml | 6 +++++-
 infra/data_collection/github/utils.py | 12 +++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml
index 242ed2ca329..8f0edf71f52 100644
--- a/.github/workflows/_produce-data.yaml
+++ b/.github/workflows/_produce-data.yaml
@@ -8,6 +8,10 @@ on:
         description: "Unique GitHub workflow run ID to use for data"
         default: 10066309412
         type: number
+      test_workflow_run_attempt:
+        description: "Run attempt of the workflow run"
+        default: 1
+        type: number
   workflow_run:
     workflows:
       - "All post-commit tests"
@@ -44,7 +48,7 @@ jobs:
           event_name="${{ github.event_name }}"
           if [[ "$event_name" == "workflow_dispatch" ]]; then
             run_id="${{ inputs.test_workflow_run_id }}"
-            attempt_number="1"
+            attempt_number="${{ inputs.test_workflow_run_attempt }}"
           elif [[ "$event_name" == "workflow_run" ]]; then
             run_id="${{ github.event.workflow_run.id }}"
             attempt_number="${{ github.event.workflow_run.run_attempt }}"
diff --git a/infra/data_collection/github/utils.py b/infra/data_collection/github/utils.py
index 54911294573..84ef0d97a70 100644
--- a/infra/data_collection/github/utils.py
+++ b/infra/data_collection/github/utils.py
@@ -59,7 +59,17 @@ def get_pipeline_row_from_github_info(github_runner_environment, github_pipeline
     jobs = github_jobs_json["jobs"]

     jobs_start_times = list(map(lambda job_: get_datetime_from_github_datetime(job_["started_at"]), jobs))
-    sorted_jobs_start_times = sorted(jobs_start_times)
+    # We filter out jobs that started before because that means they're from a previous attempt for that pipeline
+    eligible_jobs_start_times = list(
+        filter(
+            lambda job_start_time_: job_start_time_ >= get_datetime_from_github_datetime(pipeline_submission_ts),
+            jobs_start_times,
+        )
+    )
+    sorted_jobs_start_times = sorted(eligible_jobs_start_times)
+    assert (
+        sorted_jobs_start_times
+    ), f"It seems that this pipeline does not have any jobs that started on or after the pipeline was submitted, which should be impossible. Please directly inspect the JSON objects"
     pipeline_start_ts = get_data_pipeline_datetime_from_datetime(sorted_jobs_start_times[0])
     pipeline_end_ts = github_pipeline_json["updated_at"]