Skip to content

Commit

Permalink
Revert non fatal error changes for now. (#169)
Browse files Browse the repository at this point in the history
- This PR reverts
#166 and
#166
- This is due the agent CI getting blocked due to metricbeat status
reporter.

- We will merge these changes as we fix the root cause of above.
  • Loading branch information
VihasMakwana authored Jul 26, 2024
1 parent 0f81a1e commit 50700ed
Show file tree
Hide file tree
Showing 11 changed files with 57 additions and 240 deletions.
76 changes: 0 additions & 76 deletions metric/system/process/helper_test.go

This file was deleted.

37 changes: 0 additions & 37 deletions metric/system/process/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,40 +98,3 @@ func GetProcCPUPercentage(s0, s1 ProcState) ProcState {
return s1

}

// NonFatalErr indicates an error occurred during metrics
// collection, however the metrics already
// gathered and returned are still valid.
// This error can be safely ignored, this will result
// in having partial metrics for a process rather than
// no metrics at all.
//
// It was introduced to allow for partial metrics collection
// on privileged process on Windows.
type NonFatalErr struct {
Err error
}

func (c NonFatalErr) Error() string {
return "Not enough privileges to fetch information: " + c.Err.Error()
}

func (c NonFatalErr) Is(other error) bool {
_, is := other.(NonFatalErr)
return is
}

func (c NonFatalErr) Unwrap() error {
return c.Err
}

// Wraps a NonFatalError around a generic error, if given error is non-fatal in nature
func toNonFatal(err error) error {
if err == nil {
return nil
}
if !isNonFatal(err) {
return err
}
return NonFatalErr{Err: err}
}
35 changes: 0 additions & 35 deletions metric/system/process/helpers_others.go

This file was deleted.

37 changes: 0 additions & 37 deletions metric/system/process/helpers_windows.go

This file was deleted.

73 changes: 43 additions & 30 deletions metric/system/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
"fmt"
"sort"
"strings"
"syscall"
"time"

psutil "github.com/shirou/gopsutil/v3/process"
Expand Down Expand Up @@ -55,11 +54,11 @@ func ListStates(hostfs resolve.Resolver) ([]ProcState, error) {

// actually fetch the PIDs from the OS-specific code
_, plist, err := init.FetchPids()
if err != nil && !isNonFatal(err) {
if err != nil {
return nil, fmt.Errorf("error gathering PIDs: %w", err)
}

return plist, toNonFatal(err)
return plist, nil
}

// GetPIDState returns the state of a given PID
Expand Down Expand Up @@ -91,10 +90,10 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
}

// actually fetch the PIDs from the OS-specific code
pidMap, plist, wrappedErr := procStats.FetchPids()
pidMap, plist, err := procStats.FetchPids()

if wrappedErr != nil && !isNonFatal(wrappedErr) {
return nil, nil, fmt.Errorf("error gathering PIDs: %w", wrappedErr)
if err != nil {
return nil, nil, fmt.Errorf("error gathering PIDs: %w", err)
}
// We use this to track processes over time.
procStats.ProcsMap.SetMap(pidMap)
Expand Down Expand Up @@ -134,13 +133,13 @@ func (procStats *Stats) Get() ([]mapstr.M, []mapstr.M, error) {
rootEvents = append(rootEvents, rootMap)
}

return procs, rootEvents, toNonFatal(wrappedErr)
return procs, rootEvents, nil
}

// GetOne fetches process data for a given PID if its name matches the regexes provided from the host.
func (procStats *Stats) GetOne(pid int) (mapstr.M, error) {
pidStat, _, err := procStats.pidFill(pid, false)
if err != nil && !isNonFatal(err) {
if err != nil {
return nil, fmt.Errorf("error fetching PID %d: %w", pid, err)
}

Expand All @@ -152,9 +151,9 @@ func (procStats *Stats) GetOne(pid int) (mapstr.M, error) {
// GetOneRootEvent is the same as `GetOne()` but it returns an
// event formatted as expected by ECS
func (procStats *Stats) GetOneRootEvent(pid int) (mapstr.M, mapstr.M, error) {
pidStat, _, wrappedErr := procStats.pidFill(pid, false)
if wrappedErr != nil && !isNonFatal(wrappedErr) {
return nil, nil, fmt.Errorf("error fetching PID %d: %w", pid, wrappedErr)
pidStat, _, err := procStats.pidFill(pid, false)
if err != nil {
return nil, nil, fmt.Errorf("error fetching PID %d: %w", pid, err)
}

procStats.ProcsMap.SetPid(pid, pidStat)
Expand All @@ -166,7 +165,7 @@ func (procStats *Stats) GetOneRootEvent(pid int) (mapstr.M, mapstr.M, error) {

rootMap := processRootEvent(&pidStat)

return procMap, rootMap, toNonFatal(wrappedErr)
return procMap, rootMap, err
}

// GetSelf gets process info for the beat itself
Expand All @@ -181,41 +180,56 @@ func (procStats *Stats) GetSelf() (ProcState, error) {
}

pidStat, _, err := procStats.pidFill(self, false)
if err != nil && !isNonFatal(err) {
if err != nil {
return ProcState{}, fmt.Errorf("error fetching PID %d: %w", self, err)
}

procStats.ProcsMap.SetPid(self, pidStat)

return pidStat, toNonFatal(err)
return pidStat, nil
}

// pidIter wraps a few lines of generic code that all OS-specific FetchPids() functions must call.
// this also handles the process of adding to the maps/lists in order to limit the code duplication in all the OS implementations
func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) (ProcsMap, []ProcState, error) {
func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState) (ProcsMap, []ProcState) {
status, saved, err := procStats.pidFill(pid, true)
var nonFatalErr error
if err != nil {
if !errors.Is(err, NonFatalErr{}) {
procStats.logger.Debugf("Error fetching PID info for %d, skipping: %s", pid, err)
// While monitoring a set of processes, some processes might get killed after we get all the PIDs
// So, there's no need to capture "process not found" error.
if errors.Is(err, syscall.ESRCH) {
return procMap, proclist, nil
}
return procMap, proclist, err
return procMap, proclist
}
nonFatalErr = fmt.Errorf("non fatal error fetching PID some info for %d, metrics are valid, but partial: %w", pid, err)
procStats.logger.Debugf(err.Error())
procStats.logger.Debugf("Non fatal error fetching PID some info for %d, metrics are valid, but partial: %s", pid, err)
}
if !saved {
procStats.logger.Debugf("Process name does not match the provided regex; PID=%d; name=%s", pid, status.Name)
return procMap, proclist, nonFatalErr
return procMap, proclist
}
procMap[pid] = status
proclist = append(proclist, status)

return procMap, proclist, nonFatalErr
return procMap, proclist
}

// NonFatalErr is returned when there was an error
// collecting metrics, however the metrics already
// gathered and returned are still valid.
// This error can be safely ignored, this will result
// in having partial metrics for a process rather than
// no metrics at all.
//
// It was introduced to allow for partial metrics collection
// on privileged process on Windows.
type NonFatalErr struct {
Err error
}

func (c NonFatalErr) Error() string {
return "Not enough privileges to fetch information: " + c.Err.Error()
}

func (c NonFatalErr) Is(other error) bool {
_, is := other.(NonFatalErr)
return is
}

// pidFill is an entrypoint used by OS-specific code to fill out a pid.
Expand All @@ -224,7 +238,7 @@ func (procStats *Stats) pidIter(pid int, procMap ProcsMap, proclist []ProcState)
// The second return value will only be false if an event has been filtered out.
func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
// Fetch proc state so we can get the name for filtering based on user's filter.
var wrappedErr error

// OS-specific entrypoint, get basic info so we can at least run matchProcess
status, err := GetInfoForPid(procStats.Hostfs, pid)
if err != nil {
Expand All @@ -251,8 +265,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
if !errors.Is(err, NonFatalErr{}) {
return status, true, fmt.Errorf("FillPidMetrics: %w", err)
}
wrappedErr = errors.Join(wrappedErr, fmt.Errorf("non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %w", pid, err))
procStats.logger.Debugf(wrappedErr.Error())
procStats.logger.Debugf("Non-fatal error fetching PID metrics for %d, metrics are valid, but partial: %s", pid, err)
}

if status.CPU.Total.Ticks.Exists() {
Expand Down Expand Up @@ -307,7 +320,7 @@ func (procStats *Stats) pidFill(pid int, filter bool) (ProcState, bool, error) {
}
}

return status, true, wrappedErr
return status, true, nil
}

// cacheCmdLine fills out Env and arg metrics from any stored previous metrics for the pid
Expand Down
6 changes: 2 additions & 4 deletions metric/system/process/process_aix.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,20 @@ func (procStats *Stats) FetchPids() (ProcsMap, []ProcState, error) {
pid := C.pid_t(0)

procMap := make(ProcsMap, 0)
var wrappedErr err
var plist []ProcState
for {
// getprocs first argument is a void*
num, err := C.getprocs(unsafe.Pointer(&info), C.sizeof_struct_procsinfo64, nil, 0, &pid, 1)
if err != nil {
return nil, nil, fmt.Errorf("error fetching PIDs: %w", err)
}
procMap, plist, err = procStats.pidIter(int(pid), procMap, plist)
wrappedErr = errors.Join(wrappedErr, err)
procMap, plist = procStats.pidIter(int(info.pi_pid), procMap, plist)

if num == 0 {
break
}
}
return procMap, plist, toNonFatal(wrappedErr)
return procMap, plist, nil
}

// GetInfoForPid returns basic info for the process
Expand Down
Loading

0 comments on commit 50700ed

Please sign in to comment.