Skip to content

Commit

Permalink
Add lost_communication_with_server_error_total (#445)
Browse files Browse the repository at this point in the history
  • Loading branch information
int128 authored Aug 27, 2022
1 parent 9b1a7de commit 55e05d6
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ This action sends the following metrics if enabled.
- Time from when a job is started until it is completed
- `github.actions.job.queued_duration_second`
- Time from when a job is started until its first step is started
- `github.actions.job.lost_communication_with_server_error_total`
- Count of "lost communication with the server" errors on self-hosted runners.
See issue [#444](https://github.com/int128/datadog-actions-metrics/issues/444) for details.

It has the following tags:

Expand Down
2 changes: 1 addition & 1 deletion src/generated/graphql.ts

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion src/queries/completedCheckSuite.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { CompletedCheckSuiteQuery, CompletedCheckSuiteQueryVariables } from '../generated/graphql'
import { CheckRun, CheckStep } from '../generated/graphql-types'
import { CheckAnnotation, CheckRun, CheckStep } from '../generated/graphql-types'
import { Octokit } from '../types'

const query = /* GraphQL */ `
Expand All @@ -15,6 +15,11 @@ const query = /* GraphQL */ `
completedAt
conclusion
status
annotations(first: 10) {
nodes {
message
}
}
steps(first: 50) {
nodes {
number
Expand Down Expand Up @@ -64,6 +69,9 @@ export type CompletedCheckSuite = {

type CompletedCheckRun = Pick<CheckRun, 'databaseId' | 'name' | 'status'> &
NonNullablePick<CheckRun, 'startedAt' | 'completedAt' | 'conclusion'> & {
annotations: {
nodes: Pick<CheckAnnotation, 'message'>[]
}
steps: {
nodes: CompletedStep[]
}
Expand Down Expand Up @@ -97,6 +105,14 @@ const extractCheckRuns = (r: CompletedCheckSuiteQuery): CompletedCheckSuite['nod
continue
}

const annotations = []
for (const annotation of checkRun.annotations?.nodes ?? []) {
if (annotation == null) {
continue
}
annotations.push(annotation)
}

const steps: CompletedStep[] = []
for (const step of checkRun.steps?.nodes ?? []) {
if (step == null) {
Expand All @@ -123,6 +139,7 @@ const extractCheckRuns = (r: CompletedCheckSuiteQuery): CompletedCheckSuite['nod
startedAt,
completedAt,
conclusion,
annotations: { nodes: annotations },
steps: { nodes: steps },
})
}
Expand Down
13 changes: 13 additions & 0 deletions src/workflowRun/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,16 @@ export const computeJobMetrics = (
points: [[completedAt, duration]],
})

if (checkRun.annotations.nodes.some((a) => isLostCommunicationWithServerError(a.message))) {
series.push({
host: 'github.com',
tags,
metric: 'github.actions.job.lost_communication_with_server_error_total',
type: 'count',
points: [[completedAt, 1]],
})
}

if (checkRun.steps.nodes.length > 0) {
const firstStepStartedAt = Math.min(...checkRun.steps.nodes.map((s) => unixTime(s.startedAt)))
const queued = firstStepStartedAt - startedAt
Expand All @@ -156,6 +166,9 @@ export const computeJobMetrics = (
return series
}

/**
 * Reports whether an annotation message is the "lost communication with the
 * server" error of a self-hosted runner.
 *
 * The message must start with
 * `The self-hosted runner: <name> lost communication with the server.`
 * where `<name>` is any non-empty text; anything may follow the sentence.
 *
 * @param message - Annotation message to inspect.
 * @returns `true` if the message starts with that sentence, otherwise `false`.
 */
export const isLostCommunicationWithServerError = (message: string): boolean =>
  // The final period is escaped so it matches a literal '.' only; an unescaped
  // dot would accept any character after "server".
  /^The self-hosted runner: .+? lost communication with the server\./.test(message)

export const computeStepMetrics = (
e: WorkflowRunCompletedEvent,
checkSuite: CompletedCheckSuite,
Expand Down
1 change: 1 addition & 0 deletions tests/workflowRun/fixtures/completedCheckSuite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const exampleCompletedCheckSuite: CompletedCheckSuiteQuery & CompletedChe
name: 'build',
status: CheckStatusState.Completed,
conclusion: CheckConclusionState.Success,
annotations: { nodes: [] },
steps: {
nodes: [
{
Expand Down
20 changes: 19 additions & 1 deletion tests/workflowRun/metrics.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import { WorkflowDefinition } from '../../src/workflowRun/parse'
import { computeJobMetrics, computeStepMetrics, computeWorkflowRunMetrics } from '../../src/workflowRun/metrics'
import {
computeJobMetrics,
computeStepMetrics,
computeWorkflowRunMetrics,
isLostCommunicationWithServerError,
} from '../../src/workflowRun/metrics'
import { exampleCompletedCheckSuite } from './fixtures/completedCheckSuite'
import { exampleJobMetrics, exampleStepMetrics, exampleWorkflowRunMetrics } from './fixtures/metrics'
import { exampleWorkflowRunEvent } from './fixtures/workflowRunEvent'
Expand All @@ -26,3 +31,16 @@ test('computeStepMetrics', () => {
const series = computeStepMetrics(exampleWorkflowRunEvent, exampleCompletedCheckSuite, exampleWorkflowDefinition)
expect(series).toStrictEqual(exampleStepMetrics)
})

// Tests for isLostCommunicationWithServerError: one matching message and one unrelated message.
describe('isLostCommunicationWithServerError', () => {
  test('matched', () => {
    // Sample annotation message naming a self-hosted runner.
    const message = `The self-hosted runner: POD-NAME lost communication with the server. Verify the machine is running and has a healthy network connection. Anything in your workflow that terminates the runner process, starves it for CPU/Memory, or blocks its network access can cause this error.`
    const matched = isLostCommunicationWithServerError(message)
    expect(matched).toBeTruthy()
  })
  test('not related error', () => {
    // An ordinary failure message must not be counted.
    const matched = isLostCommunicationWithServerError(`Process exit 1`)
    expect(matched).toBeFalsy()
  })
})

0 comments on commit 55e05d6

Please sign in to comment.