diff --git a/metric/system/process/helper_test.go b/metric/system/process/helper_test.go new file mode 100644 index 000000000..841b2f172 --- /dev/null +++ b/metric/system/process/helper_test.go @@ -0,0 +1,76 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package process + +import ( + "errors" + "fmt" + "syscall" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestErrors(t *testing.T) { + cases := []struct { + name string + check func(t *testing.T) + }{ + { + name: "non fatal error", + check: func(t *testing.T) { + err := fmt.Errorf("Faced non-fatal error: %w", NonFatalErr{Err: syscall.EPERM}) + require.True(t, isNonFatal(err), "Should be a non fatal error") + }, + }, + { + name: "non fatal error - unwrapped", + check: func(t *testing.T) { + err := fmt.Errorf("Faced non-fatal error: %w", syscall.EPERM) + require.True(t, isNonFatal(err), "Should be a non fatal error") + }, + }, + { + name: "non fatal error - hierarchy", + check: func(t *testing.T) { + err := fmt.Errorf("Faced non-fatal error: %w", syscall.EPERM) + err2 := errors.Join(toNonFatal(err)) + require.True(t, isNonFatal(err2), "Should be a non fatal error") + }, + }, + { + name: "fatal error", + check: func(t *testing.T) { + err := fmt.Errorf("Faced fatal error: %w", errors.New("FATAL")) + err = toNonFatal(err) // shouldn't have any effect as it's a fatal error + require.Falsef(t, isNonFatal(err), "Should be a fatal error") + }, + }, + { + name: "fatal error - hierarchy", + check: func(t *testing.T) { + err := fmt.Errorf("Faced fatal error: %w", errors.New("FATAL")) + err2 := errors.Join(err) + require.Falsef(t, isNonFatal(err2), "Should be a fatal error") + }, + }, + } + for _, c := range cases { + t.Run(c.name, c.check) + } +} diff --git a/metric/system/process/helpers.go b/metric/system/process/helpers.go index 362d7953d..4c489a628 100644 --- a/metric/system/process/helpers.go +++ b/metric/system/process/helpers.go @@ -98,3 +98,13 @@ func GetProcCPUPercentage(s0, s1 ProcState) ProcState { return s1 } + +func toNonFatal(err error) error { + if err == nil { + return nil + } + if !isNonFatal(err) { + return err + } + return NonFatalErr{Err: err} +} diff --git a/metric/system/process/helpers_others.go b/metric/system/process/helpers_others.go new file mode 100644 index 000000000..2873a5e4c --- /dev/null +++ b/metric/system/process/helpers_others.go @@ -0,0 +1,35 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build !windows + +package process + +import ( + "errors" + "syscall" +) + +func isNonFatal(err error) bool { + if err == nil { + return true + } + return (errors.Is(err, syscall.EACCES) || + errors.Is(err, syscall.EPERM) || + errors.Is(err, syscall.EINVAL) || + errors.Is(err, NonFatalErr{})) +} diff --git a/metric/system/process/helpers_windows.go b/metric/system/process/helpers_windows.go new file mode 100644 index 000000000..94e4b882d --- /dev/null +++ b/metric/system/process/helpers_windows.go @@ -0,0 +1,37 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build windows + +package process + +import ( + "errors" + "syscall" + + "golang.org/x/sys/windows" +) + +func isNonFatal(err error) bool { + if err == nil { + return true + } + return errors.Is(err, windows.ERROR_ACCESS_DENIED) || + errors.Is(err, syscall.EPERM) || + errors.Is(err, syscall.EINVAL) || + errors.Is(err, windows.ERROR_INVALID_PARAMETER) || errors.Is(err, NonFatalErr{}) +} diff --git a/metric/system/process/process.go b/metric/system/process/process.go index 4a9151b61..b0eacdd09 100644 --- a/metric/system/process/process.go +++ b/metric/system/process/process.go @@ -25,6 +25,7 @@ import ( "fmt" "sort" "strings" + "syscall" "time" psutil "github.com/shirou/gopsutil/v3/process" @@ -54,11 +55,11 @@ func ListStates(hostfs resolve.Resolver) ([]ProcState, error) { // actually fetch the PIDs from the OS-specific code _, plist, err := init.FetchPids() - if err != nil { + if err != nil && !isNonFatal(err) { return nil, fmt.Errorf("error gathering PIDs: %w", err) } - return plist, nil + return plist, toNonFatal(err) } // GetPIDState returns the state of a given PID @@ -90,10 +91,10 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) { } // actually fetch the PIDs from the OS-specific code - pidMap, plist, err := procStats.FetchPids() + pidMap, plist, wrappedErr := procStats.FetchPids() - if err != nil { - return nil, nil, fmt.Errorf("error gathering PIDs: %w", err) + if wrappedErr != nil && !isNonFatal(wrappedErr) { + return nil, nil, fmt.Errorf("error gathering PIDs: %w", wrappedErr) } // We use this to track processes over time. procStats.ProcsMap.SetMap(pidMap) @@ -133,13 +134,13 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) { rootEvents = append(rootEvents, rootMap) } - return procs, rootEvents, nil + return procs, rootEvents, toNonFatal(wrappedErr) } // GetOne fetches process data for a given PID if its name matches the regexes provided from the host. func (procStats *Stats) GetOne(pid int) (mapstr.M, error) { pidStat, _, err := procStats.pidFill(pid, false) - if err != nil { + if err != nil && !isNonFatal(err) { return nil, fmt.Errorf("error fetching PID %d: %w", pid, err) } @@ -151,9 +152,9 @@ func (procStats *Stats) GetOne(pid int) (mapstr.M, error) { // GetOneRootEvent is the same as `GetOne()` but it returns an // event formatted as expected by ECS func (procStats *Stats) GetOneRootEvent(pid int) (mapstr.M, mapstr.M, error) { - pidStat, _, err := procStats.pidFill(pid, false) - if err != nil { - return nil, nil, fmt.Errorf("error fetching PID %d: %w", pid, err) + pidStat, _, wrappedErr := procStats.pidFill(pid, false) + if wrappedErr != nil && !isNonFatal(wrappedErr) { + return nil, nil, fmt.Errorf("error fetching PID %d: %w", pid, wrappedErr) } procStats.ProcsMap.SetPid(pid, pidStat) @@ -165,7 +166,7 @@ func (procStats *Stats) GetOneRootEvent(pid int) (mapstr.M, mapstr.M, error) { rootMap := processRootEvent(&pidStat) - return procMap, rootMap, err + return procMap, rootMap, toNonFatal(wrappedErr) } // GetSelf gets process info for the beat itself @@ -180,34 +181,41 @@ func (procStats *Stats) GetSelf() (ProcState, error) { } pidStat, _, err := procStats.pidFill(self, false) - if err != nil { + if err != nil && !isNonFatal(err) { return ProcState{}, fmt.Errorf("error fetching PID %d: %w", self, err) } procStats.ProcsMap.SetPid(self, pidStat) - return pidStat, nil + return pidStat, toNonFatal(err) } // pidIter wraps a few lines of generic code that all OS-specific FetchPids() functions must call. // this also handles the process of adding to the maps/lists in order to limit the code duplication in all the OS implementations -func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) (ProcsMap, []ProcState) { +func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) (ProcsMap, []ProcState, error) { status, saved, err := procStats.pidFill(pid, true) + var nonFatalErr error if err != nil { if !errors.Is(err, NonFatalErr{}) { procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err) - return procMap, proclist + // While monitoring a set of processes, some processes might get killed after we get all the PIDs + // So, there's no need to capture "process not found" error. + if errors.Is(err, syscall.ESRCH) { + return procMap, proclist, nil + } + return procMap, proclist, err } - procStats.logger.Debugf("Non fatal error fetching PID some info for %d, metrics are valid, but partial: %s", pid, err) + nonFatalErr = fmt.Errorf("non fatal error fetching PID some info for %d, metrics are valid, but partial: %w", pid, err) + procStats.logger.Debugf(err.Error()) } if !saved { procStats.logger.Debugf("Process name does not match the provided regex; PID=%d; name=%s", pid, status.Name) - return procMap, proclist + return procMap, proclist, nonFatalErr } procMap[pid] = status proclist = append(proclist, status) - return procMap, proclist + return procMap, proclist, nonFatalErr } // NonFatalErr is returned when there was an error @@ -232,13 +240,17 @@ func (c NonFatalErr) Is(other error) bool { return is } +func (c NonFatalErr) Unwrap() error { + return c.Err +} + // pidFill is an entrypoint used by OS-specific code to fill out a pid. // This in turn calls various OS-specific code to fill out the various bits of PID data // This is done to minimize the code duplication between different OS implementations // The second return value will only be false if an event has been filtered out. func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) { // Fetch proc state so we can get the name for filtering based on user's filter. - + var wrappedErr error // OS-specific entrypoint, get basic info so we can at least run matchProcess status, err := GetInfoForPid(procStats.Hostfs, pid) if err != nil { @@ -265,7 +277,8 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) { if !errors.Is(err, NonFatalErr{}) { return status, true, fmt.Errorf("FillPidMetrics: %w", err) } - procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err) + wrappedErr = errors.Join(wrappedErr, fmt.Errorf("non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %w", pid, err)) + procStats.logger.Debugf(wrappedErr.Error()) } if status.CPU.Total.Ticks.Exists() { @@ -320,7 +333,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) { } } - return status, true, nil + return status, true, wrappedErr } // cacheCmdLine fills out Env and arg metrics from any stored previous metrics for the pid diff --git a/metric/system/process/process_aix.go b/metric/system/process/process_aix.go index 4705d4b7a..44f17fa54 100644 --- a/metric/system/process/process_aix.go +++ b/metric/system/process/process_aix.go @@ -46,6 +46,7 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { pid := C.pid_t(0) procMap := make(ProcsMap, 0) + var wrappedErr err var plist []ProcState for { // getprocs first argument is a void* @@ -53,13 +54,14 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { if err != nil { return nil, nil, fmt.Errorf("error fetching PIDs: %w", err) } - procMap, plist = procStats.pidIter(int(info.pi_pid), procMap, plist) + procMap, plist, err = procStats.pidIter(int(pid), procMap, plist) + wrappedErr = errors.Join(wrappedErr, err) if num == 0 { break } } - return procMap, plist, nil + return procMap, plist, toNonFatal(wrappedErr) } // GetInfoForPid returns basic info for the process diff --git a/metric/system/process/process_container_test.go b/metric/system/process/process_container_test.go index 7bac98754..39fc8baea 100644 --- a/metric/system/process/process_container_test.go +++ b/metric/system/process/process_container_test.go @@ -18,6 +18,7 @@ package process import ( + "fmt" "os" "os/user" "runtime" @@ -112,7 +113,7 @@ func TestSystemHostFromContainer(t *testing.T) { validateProcResult(t, result) } else { _, roots, err := testStats.Get() - require.NoError(t, err) + require.True(t, isNonFatal(err), fmt.Sprintf("Fatal error: %s", err)) for _, proc := range roots { t.Logf("proc: %d: %s", proc["process"].(map[string]interface{})["pid"], diff --git a/metric/system/process/process_darwin.go b/metric/system/process/process_darwin.go index 873714e0a..e4d19d4a5 100644 --- a/metric/system/process/process_darwin.go +++ b/metric/system/process/process_darwin.go @@ -34,6 +34,7 @@ import "C" import ( "bytes" "encoding/binary" + "errors" "fmt" "io" "os" @@ -73,6 +74,8 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { procMap := make(ProcsMap, num) plist := make([]ProcState, 0, num) + var wrappedErr error + var err error for i := 0; i < num; i++ { if err := binary.Read(bbuf, binary.LittleEndian, &pid); err != nil { @@ -82,10 +85,11 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { if pid == 0 { continue } - procMap, plist = procStats.pidIter(int(pid), procMap, plist) + procMap, plist, err = procStats.pidIter(int(pid), procMap, plist) + wrappedErr = errors.Join(wrappedErr, err) } - return procMap, plist, nil + return procMap, plist, toNonFatal(wrappedErr) } // GetInfoForPid returns basic info for the process @@ -98,9 +102,9 @@ func GetInfoForPid(_ resolve.Resolver, pid int) (ProcState, error) { // For docs, see the link below. Check the `proc_taskallinfo` struct, which // is a composition of `proc_bsdinfo` and `proc_taskinfo`. // https://opensource.apple.com/source/xnu/xnu-1504.3.12/bsd/sys/proc_info.h.auto.html - n := C.proc_pidinfo(C.int(pid), C.PROC_PIDTASKALLINFO, 0, ptr, size) + n, err := C.proc_pidinfo(C.int(pid), C.PROC_PIDTASKALLINFO, 0, ptr, size) if n != size { - return ProcState{}, fmt.Errorf("could not read process info for pid %d: proc_pidinfo returned %d", pid, int(n)) + return ProcState{}, fmt.Errorf("could not read process info for pid %d: proc_pidinfo returned %d, err: %w", pid, int(n), err) } status := ProcState{} diff --git a/metric/system/process/process_linux_common.go b/metric/system/process/process_linux_common.go index 5ffbf135d..6954fd902 100644 --- a/metric/system/process/process_linux_common.go +++ b/metric/system/process/process_linux_common.go @@ -85,6 +85,7 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { procMap := make(ProcsMap, len(names)) plist := make([]ProcState, 0, len(names)) + var wrappedErr error // Iterate over the directory, fetch just enough info so we can filter based on user input. logger := logp.L() @@ -99,10 +100,11 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { logger.Debugf("Error converting PID name %s", name) continue } - procMap, plist = procStats.pidIter(pid, procMap, plist) + procMap, plist, err = procStats.pidIter(pid, procMap, plist) + wrappedErr = errors.Join(wrappedErr, err) } - return procMap, plist, nil + return procMap, plist, toNonFatal(wrappedErr) } func FillPidMetrics(hostfs resolve.Resolver, pid int, state ProcState, filter func(string) bool) (ProcState, error) { diff --git a/metric/system/process/process_test.go b/metric/system/process/process_test.go index f5d40db2e..c0c5c151e 100644 --- a/metric/system/process/process_test.go +++ b/metric/system/process/process_test.go @@ -169,12 +169,12 @@ func TestGetOne(t *testing.T) { assert.NoError(t, err, "Init") _, _, err = testConfig.Get() - assert.NoError(t, err, "GetOne") + assert.True(t, isNonFatal(err), fmt.Sprintf("Fatal Error: %s", err)) time.Sleep(time.Second * 2) procData, _, err := testConfig.Get() - assert.NoError(t, err, "GetOne") + assert.True(t, isNonFatal(err), fmt.Sprintf("Fatal Error: %s", err)) t.Logf("Proc: %s", procData[0].StringToPrint()) } @@ -267,7 +267,7 @@ func TestFilter(t *testing.T) { func TestProcessList(t *testing.T) { plist, err := ListStates(resolve.NewTestResolver("/")) - assert.NoError(t, err, "ListStates") + assert.True(t, isNonFatal(err), fmt.Sprintf("Fatal Error: %s", err)) for _, proc := range plist { assert.NotEmpty(t, proc.State) diff --git a/metric/system/process/process_windows.go b/metric/system/process/process_windows.go index 65125a22d..66862da2c 100644 --- a/metric/system/process/process_windows.go +++ b/metric/system/process/process_windows.go @@ -41,14 +41,16 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) { procMap := make(ProcsMap, len(pids)) plist := make([]ProcState, 0, len(pids)) + var wrappedErr error // This is probably the only implementation that doesn't benefit from our // little fillPid callback system. We'll need to iterate over everything // manually. for _, pid := range pids { - procMap, plist = procStats.pidIter(int(pid), procMap, plist) + procMap, plist, err = procStats.pidIter(int(pid), procMap, plist) + wrappedErr = errors.Join(wrappedErr, err) } - return procMap, plist, nil + return procMap, plist, toNonFatal(wrappedErr) } // GetSelfPid is the darwin implementation; see the linux version in