Skip to content

Commit

Permalink
Merge branch 'master' into tm/rhtap-dnf-enablement
Browse files Browse the repository at this point in the history
  • Loading branch information
tommartensen committed Jan 4, 2024
2 parents fc6fc25 + fef185c commit 6f85459
Show file tree
Hide file tree
Showing 16 changed files with 280 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration-test-containers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
run: |
ansible-galaxy install -r ansible/requirements.yml
BUILD_MULTI_ARCH="${{ contains(github.event.pull_request.labels.*.name, 'run-multiarch-builds') || github.event_name == 'push' }}"
BUILD_MULTI_ARCH="${{ contains(github.event.pull_request.labels.*.name, 'run-multiarch-builds') || github.event_name == 'push' || github.event_name == 'schedule' }}"
# build_multi_arch passed in as json to ensure boolean type
ansible-playbook \
Expand Down
3 changes: 2 additions & 1 deletion RELEASED_VERSIONS
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@
3.13.4 3.74.7 # Rox release 3.74.7 by roxbot at Tue Oct 24 13:38:02 UTC 2023
3.15.2 4.1.5 # Rox release 4.1.5 by roxbot at Thu Nov 9 02:58:36 UTC 2023
3.17.0 4.3.0 # Rox release 4.3.0 by roxbot at Wed Nov 15 08:14:26 UTC 2023
3.16.2 4.2.3 # Rox release 4.2.3 by roxbot at Wed Nov 22 00:02:20 UTC 2023
3.16.2 4.2.3 # Rox release 4.2.3 by roxbot at Wed Nov 22 00:02:20 UTC 2023
3.17.0 4.3.1 # Rox release 4.3.1 by roxbot at Mon Dec 11 20:36:15 UTC 2023
9 changes: 9 additions & 0 deletions collector/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ COPY LICENSE-kernel-modules.txt /kernel-modules/LICENSE
COPY container/libs/libsinsp-wrapper.so /usr/local/lib/
COPY container/bin/collector /usr/local/bin/
COPY container/bin/self-checks /usr/local/bin/self-checks
COPY container/status-check.sh /usr/local/bin/status-check.sh

RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \
ldconfig && \
Expand All @@ -38,6 +39,14 @@ RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \

EXPOSE 8080 9090

HEALTHCHECK \
# health checks within the first 5s are not counted as failure
--start-period=5s \
# perform health check every 5s
--interval=5s \
# the command uses /ready API
CMD /usr/local/bin/status-check.sh

ENTRYPOINT ["/bootstrap.sh"]

CMD collector-wrapper.sh \
Expand Down
21 changes: 21 additions & 0 deletions collector/container/status-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# /ready API will return the following formatted response:
# {
# "collector" : {
# "drops" : 0,
# "events" : 9330070,
# "node" : "node.name",
# "preemptions" : 0
# },
# "status" : "ok"
# }
#
# Take the status line, split it by ":" and trim spaces and quotes.
STATUS=$(curl -s localhost:8080/ready | grep 'status' | awk -F ':' '{print $2}' | tr -d '"' | tr -d ' ')

if [[ "${STATUS}" = "ok" ]]; then
exit 0
else
exit 1
fi
35 changes: 35 additions & 0 deletions collector/lib/CollectorConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ CollectorConfig::CollectorConfig(CollectorArgs* args) {

HandleAfterglowEnvVars();
HandleConnectionStatsEnvVars();
HandleSinspEnvVars();

host_config_ = ProcessHostHeuristics(*this);
}
Expand All @@ -175,6 +176,15 @@ void CollectorConfig::HandleAfterglowEnvVars() {
afterglow_period_micros_ = static_cast<int64_t>(atof(afterglow_period) * 1000000);
}

const int64_t max_afterglow_period_micros = 300000000; // 5 minutes

if (afterglow_period_micros_ > max_afterglow_period_micros) {
CLOG(WARNING) << "User set afterglow period of " << afterglow_period_micros_ / 1000000
<< "s is greater than the maximum allowed afterglow period of " << max_afterglow_period_micros / 1000000 << "s";
CLOG(WARNING) << "Setting the afterglow period to " << max_afterglow_period_micros / 1000000 << "s";
afterglow_period_micros_ = max_afterglow_period_micros;
}

if (enable_afterglow_ && afterglow_period_micros_ > 0) {
CLOG(INFO) << "Afterglow is enabled";
return;
Expand Down Expand Up @@ -239,6 +249,31 @@ void CollectorConfig::HandleConnectionStatsEnvVars() {
}
}

void CollectorConfig::HandleSinspEnvVars() {
const char* envvar;

sinsp_cpu_per_buffer_ = DEFAULT_CPU_FOR_EACH_BUFFER;
sinsp_buffer_size_ = DEFAULT_DRIVER_BUFFER_BYTES_DIM;

if ((envvar = std::getenv("ROX_COLLECTOR_SINSP_CPU_PER_BUFFER")) != NULL) {
try {
sinsp_cpu_per_buffer_ = std::stoi(envvar);
CLOG(INFO) << "Sinsp cpu per buffer: " << sinsp_cpu_per_buffer_;
} catch (...) {
CLOG(ERROR) << "Invalid cpu per buffer value: '" << envvar << "'";
}
}

if ((envvar = std::getenv("ROX_COLLECTOR_SINSP_BUFFER_SIZE")) != NULL) {
try {
sinsp_buffer_size_ = std::stoi(envvar);
CLOG(INFO) << "Sinsp buffer size: " << sinsp_buffer_size_;
} catch (...) {
CLOG(ERROR) << "Invalid buffer size value: '" << envvar << "'";
}
}
}

bool CollectorConfig::TurnOffScrape() const {
return turn_off_scrape_;
}
Expand Down
8 changes: 8 additions & 0 deletions collector/lib/CollectorConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class CollectorConfig {
const std::vector<double>& GetConnectionStatsQuantiles() const { return connection_stats_quantiles_; }
double GetConnectionStatsError() const { return connection_stats_error_; }
unsigned int GetConnectionStatsWindow() const { return connection_stats_window_; }
unsigned int GetSinspBufferSize() const { return sinsp_buffer_size_; }
unsigned int GetSinspCpuPerBuffer() const { return sinsp_cpu_per_buffer_; }

std::shared_ptr<grpc::Channel> grpc_channel;

Expand Down Expand Up @@ -105,10 +107,16 @@ class CollectorConfig {
double connection_stats_error_;
unsigned int connection_stats_window_;

// One ring buffer will be initialized for this many CPUs
unsigned int sinsp_cpu_per_buffer_;
// Size of one ring buffer, in bytes
unsigned int sinsp_buffer_size_;

Json::Value tls_config_;

void HandleAfterglowEnvVars();
void HandleConnectionStatsEnvVars();
void HandleSinspEnvVars();
};

std::ostream& operator<<(std::ostream& os, const CollectorConfig& c);
Expand Down
2 changes: 1 addition & 1 deletion collector/lib/ConnTracker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ IPNet ConnectionTracker::NormalizeAddressNoLock(const Address& address) const {
}

if (enable_external_ips_) {
return IPNet(address, 0, true);
return IPNet(address, address.length() * 8);
}

// Otherwise, associate it to "rest of the internet".
Expand Down
8 changes: 5 additions & 3 deletions collector/lib/KernelDriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ class KernelDriverEBPF : public IKernelDriver {
std::unordered_set<ppm_sc_code> ppm_sc;

try {
inspector.open_bpf(SysdigService::kProbePath, DEFAULT_DRIVER_BUFFER_BYTES_DIM, ppm_sc, tp_set);
inspector.open_bpf(SysdigService::kProbePath,
config.GetSinspBufferSize(),
ppm_sc, tp_set);
} catch (const sinsp_exception& ex) {
CLOG(WARNING) << ex.what();
return false;
Expand Down Expand Up @@ -99,8 +101,8 @@ class KernelDriverCOREEBPF : public IKernelDriver {
}

try {
inspector.open_modern_bpf(DEFAULT_DRIVER_BUFFER_BYTES_DIM,
DEFAULT_CPU_FOR_EACH_BUFFER,
inspector.open_modern_bpf(config.GetSinspBufferSize(),
config.GetSinspCpuPerBuffer(),
true, ppm_sc, tp_set);
} catch (const sinsp_exception& ex) {
if (config.CoReBPFHardfail()) {
Expand Down
4 changes: 2 additions & 2 deletions collector/test/ConnTrackerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,13 +386,13 @@ TEST(ConnTrackerTest, TestNormalizedEnableExternalIPs) {
// from private network
Connection conn1_normalized("xyz", Endpoint(IPNet(), 9999), Endpoint(IPNet(Address(10, 1, 1, 42), 0, true), 0), L4Proto::TCP, true);
// from unmatched external IP
Connection conn2_normalized("xyz", Endpoint(IPNet(), 9999), Endpoint(IPNet(Address(35, 127, 0, 15), 0, true), 0), L4Proto::TCP, true);
Connection conn2_normalized("xyz", Endpoint(IPNet(), 9999), Endpoint(IPNet(Address(35, 127, 0, 15), 32, false), 0), L4Proto::TCP, true);
// from matched external IP
Connection conn3_normalized("xyz", Endpoint(IPNet(), 9999), Endpoint(IPNet(Address(35, 127, 1, 0), 24, false), 0), L4Proto::TCP, true);
// to private network
Connection conn4_normalized("xyz", Endpoint(), Endpoint(IPNet(Address(10, 1, 1, 42), 0, true), 9999), L4Proto::TCP, false);
// to unmatched external IP
Connection conn5_normalized("xyz", Endpoint(), Endpoint(IPNet(Address(35, 127, 0, 15), 0, true), 54321), L4Proto::TCP, false);
Connection conn5_normalized("xyz", Endpoint(), Endpoint(IPNet(Address(35, 127, 0, 15), 32, false), 54321), L4Proto::TCP, false);
// to matched external IP
Connection conn6_normalized("xyz", Endpoint(), Endpoint(IPNet(Address(35, 127, 1, 0), 24, false), 54321), L4Proto::TCP, false);

Expand Down
10 changes: 10 additions & 0 deletions docs/references.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ This is enabled by default.
- `ROX_COLLECTOR_CONNECTION_STATS_WINDOW`: the length of the sliding time window
in minutes. Default: `60`

* `ROX_COLLECTOR_SINSP_CPU_PER_BUFFER`: Allows to control how many sinsp
buffers are going to be allocated. The resulting number of buffers will be
calculated as the overall number of CPU cores available divided by this
value. The default value is 1, meaning one buffer for each CPU. The value 0 is
special, it instructs sinsp to allocate only one buffer no matter how many CPUs
are there. This parameter affects CO-RE BPF only.

* `ROX_COLLECTOR_SINSP_BUFFER_SIZE`: Specifies the size of a sinsp buffer in
bytes. The default value is 16 MB.

NOTE: Using environment variables is a preferred way of configuring Collector,
so if you're adding a new configuration knob, keep this in mind.

Expand Down
127 changes: 117 additions & 10 deletions integration-tests/suites/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,40 @@ func (s *IntegrationTestSuiteBase) StartCollector(disableGRPC bool, options *com
s.Require().NoError(s.Collector().Setup(options))
s.Require().NoError(s.Collector().Launch())

// wait for self-check process to guarantee collector is started
s.Sensor().WaitProcessesN(s.Collector().ContainerID, 30*time.Second, 1)
// Verify if the image we test has a health check. There are some CI
// configurations, where it's not the case. If something went wrong and we
// get an error, treat it as no health check was found for the sake of
// robustness.
hasHealthCheck, err := s.findContainerHealthCheck("collector",
s.Collector().ContainerID)

if hasHealthCheck && err == nil {
// Wait for collector to report healthy, includes initial setup and
// probes loading. It doesn't make sense to wait for very long, limit
// it to 1 min.
_, err := s.waitForContainerToBecomeHealthy(
"collector",
s.Collector().ContainerID,
defaultWaitTickSeconds, 1*time.Minute)
s.Require().NoError(err)
} else {
fmt.Println("No HealthCheck found, do not wait for collector to become healthy")

// No way to figure out when all the services up and running, skip this
// phase.
}

// wait for the canary process to guarantee collector is started
selfCheckOk := s.Sensor().WaitProcessesN(
s.Collector().ContainerID, 30*time.Second, 1, func() {
// Self-check process is not going to be sent via GRPC, instead
// create at least one canary process to make sure everything is
// fine.
fmt.Println("Spawn a canary process")
_, err = s.execContainer("collector", []string{"echo"})
s.Require().NoError(err)
})
s.Require().True(selfCheckOk)
}

// StopCollector will tear down the collector container and stop
Expand Down Expand Up @@ -235,17 +267,33 @@ func (s *IntegrationTestSuiteBase) launchContainer(name string, args ...string)
return outLines[len(outLines)-1], err
}

func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, containerID string, tickSeconds time.Duration) (bool, error) {
// Wait for a container to become a certain status.
// - tickSeconds -- how often to check for the status
// - timeoutThreshold -- the overall time limit for waiting,
// defaulting to 30 min if zero
// - filter -- description of the desired status
func (s *IntegrationTestSuiteBase) waitForContainerStatus(
containerName string,
containerID string,
tickSeconds time.Duration,
timeoutThreshold time.Duration,
filter string) (bool, error) {

cmd := []string{
common.RuntimeCommand, "ps", "-qa",
"--filter", "id=" + containerID,
"--filter", "status=exited",
"--filter", filter,
}

start := time.Now()
tick := time.Tick(tickSeconds)
tickElapsed := time.Tick(1 * time.Minute)
timeout := time.After(30 * time.Minute)

if timeoutThreshold == 0 {
timeoutThreshold = 30 * time.Minute
}
timeout := time.After(timeoutThreshold)

for {
select {
case <-tick:
Expand All @@ -256,17 +304,76 @@ func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, contain
return true, nil
}
if err != nil {
fmt.Printf("Retrying waitForContainerToExit(%s, %s): Error: %v\n", containerName, containerID, err)
fmt.Printf("Retrying waitForContainerStatus(%s, %s): Error: %v\n",
containerName, containerID, err)
}
case <-timeout:
fmt.Printf("Timed out waiting for container %s to exit, elapsed Time: %s\n", containerName, time.Since(start))
return false, nil
fmt.Printf("Timed out waiting for container %s to become %s, elapsed Time: %s\n",
containerName, filter, time.Since(start))
return false, fmt.Errorf("Timeout waiting for container %s to become %s after %v",
containerName, filter, timeoutThreshold)
case <-tickElapsed:
fmt.Printf("Waiting for container: %s, elapsed time: %s\n", containerName, time.Since(start))
fmt.Printf("Waiting for container %s to become %s, elapsed time: %s\n",
containerName, filter, time.Since(start))
}
}
}

// Find a HealthCheck section in the specified container and verify it's what
// we expect. This function would be used to wait until the health check
// reports healthy, so be conservative and report true only if absolutely
// certain.
func (s *IntegrationTestSuiteBase) findContainerHealthCheck(
containerName string,
containerID string) (bool, error) {

cmd := []string{
common.RuntimeCommand, "inspect", "-f",
"'{{ .Config.Healthcheck }}'", containerID,
}

output, err := s.Executor().Exec(cmd...)
if err != nil {
return false, err
}

outLines := strings.Split(output, "\n")
lastLine := outLines[len(outLines)-1]

// Clearly no HealthCheck section
if lastLine == "<nil>" {
return false, nil
}

// If doesn't contain an expected command, do not consider it to be a valid
// health check
if strings.Contains(lastLine, "CMD-SHELL /usr/local/bin/status-check.sh") {
return true, nil
} else {
return false, nil
}
}

func (s *IntegrationTestSuiteBase) waitForContainerToBecomeHealthy(
containerName string,
containerID string,
tickSeconds time.Duration,
timeoutThreshold time.Duration) (bool, error) {

return s.waitForContainerStatus(containerName, containerID, tickSeconds,
timeoutThreshold, "health=healthy")
}

func (s *IntegrationTestSuiteBase) waitForContainerToExit(
containerName string,
containerID string,
tickSeconds time.Duration,
timeoutThreshold time.Duration) (bool, error) {

return s.waitForContainerStatus(containerName, containerID, tickSeconds,
timeoutThreshold, "status=exited")
}

func (s *IntegrationTestSuiteBase) execContainer(containerName string, command []string) (string, error) {
cmd := []string{common.RuntimeCommand, "exec", containerName}
cmd = append(cmd, command...)
Expand Down Expand Up @@ -342,7 +449,7 @@ func (s *IntegrationTestSuiteBase) RunCollectorBenchmark() {
containerID, err := s.launchContainer(benchmarkName, benchmarkArgs...)
s.Require().NoError(err)

_, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds)
_, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds, 0)
s.Require().NoError(err)

benchmarkLogs, err := s.containerLogs("benchmark")
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/suites/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (b *BenchmarkTestSuiteBase) RunInitContainer() {
containerID, err := b.launchContainer("host-init", cmd...)
require.NoError(b.T(), err)

if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second); !finished {
if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second, 0); !finished {
logs, err := b.containerLogs("host-init")
if err == nil {
fmt.Println(logs)
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/suites/image_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func (s *ImageLabelJSONTestSuite) TestRunImageWithJSONLabel() {
containerID, err := s.launchContainer(name, image)
s.Require().NoError(err)

_, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds)
_, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds, 0)
s.Require().NoError(err)
}

Expand Down
Loading

0 comments on commit 6f85459

Please sign in to comment.