Optimize counter polling interval by making it more accurate #3391

Open · wants to merge 11 commits into master
23 changes: 23 additions & 0 deletions orchagent/flexcounterorch.cpp
@@ -118,6 +118,8 @@ void FlexCounterOrch::doTask(Consumer &consumer)
{
auto itDelay = std::find(std::begin(data), std::end(data), FieldValueTuple(FLEX_COUNTER_DELAY_STATUS_FIELD, "true"));
string poll_interval;
string bulk_chunk_size;
string bulk_chunk_size_per_counter;

if (itDelay != data.end())
{
@@ -141,6 +143,14 @@ void FlexCounterOrch::doTask(Consumer &consumer)
}
}
}
else if (field == BULK_CHUNK_SIZE_FIELD)
{
bulk_chunk_size = value;
}
else if (field == BULK_CHUNK_SIZE_PER_PREFIX_FIELD)
{
bulk_chunk_size_per_counter = value;
}
else if(field == FLEX_COUNTER_STATUS_FIELD)
{
// Currently, the counters are disabled for polling by default
@@ -256,6 +266,19 @@ void FlexCounterOrch::doTask(Consumer &consumer)
SWSS_LOG_NOTICE("Unsupported field %s", field.c_str());
}
}

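// Apply the bulk chunk size parameters when at least one of them is configured;
// "NULL" stands in for a field that is not set. If both fields were removed for a
// group that previously had them, reset the parameters (presumably back to the
// sairedis defaults).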
if (!bulk_chunk_size.empty() || !bulk_chunk_size_per_counter.empty())
{
m_groupsWithBulkChunkSize.insert(key);
setFlexCounterGroupBulkChunkSize(flexCounterGroupMap[key],
bulk_chunk_size.empty() ? "NULL" : bulk_chunk_size,
bulk_chunk_size_per_counter.empty() ? "NULL" : bulk_chunk_size_per_counter);
}
else if (m_groupsWithBulkChunkSize.find(key) != m_groupsWithBulkChunkSize.end())
{
setFlexCounterGroupBulkChunkSize(flexCounterGroupMap[key], "NULL", "NULL");
m_groupsWithBulkChunkSize.erase(key);
}
}

consumer.m_toSync.erase(it++);
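For reviewers who want to exercise the new handler locally, here is a minimal sketch that writes the two new fields into the flex counter group configuration. It assumes BULK_CHUNK_SIZE_FIELD and BULK_CHUNK_SIZE_PER_PREFIX_FIELD expand to the literal strings "BULK_CHUNK_SIZE" and "BULK_CHUNK_SIZE_PER_PREFIX", that the group is configured through CONFIG_DB's FLEX_COUNTER_TABLE, and that the per-prefix value format shown is purely illustrative; none of this is part of the diff itself.

#include <vector>
#include "dbconnector.h"
#include "table.h"

int main()
{
    // Connect to CONFIG_DB and open the flex counter configuration table (assumed names).
    swss::DBConnector cfgDb("CONFIG_DB", 0);
    swss::Table flexCounterTable(&cfgDb, "FLEX_COUNTER_TABLE");

    std::vector<swss::FieldValueTuple> fieldValues = {
        {"FLEX_COUNTER_STATUS", "enable"},
        {"BULK_CHUNK_SIZE", "64"},
        // Hypothetical per-prefix override; the exact value format is an assumption.
        {"BULK_CHUNK_SIZE_PER_PREFIX", "SAI_PORT_STAT_IF_OUT_QLEN:0;SAI_PORT_STAT:1000"}
    };

    // FlexCounterOrch::doTask picks this up and forwards the values via
    // setFlexCounterGroupBulkChunkSize.
    flexCounterTable.set("PORT", fieldValues);
    return 0;
}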
1 change: 1 addition & 0 deletions orchagent/flexcounterorch.h
@@ -67,6 +67,7 @@ class FlexCounterOrch: public Orch
Table m_bufferQueueConfigTable;
Table m_bufferPgConfigTable;
Table m_deviceMetadataConfigTable;
std::unordered_set<std::string> m_groupsWithBulkChunkSize;
};

#endif
44 changes: 40 additions & 4 deletions orchagent/pfc_detect_mellanox.lua
100644 → 100755
@@ -18,13 +18,20 @@ local timestamp_struct = redis.call('TIME')
local timestamp_current = timestamp_struct[1] + timestamp_struct[2] / 1000000
local timestamp_string = tostring(timestamp_current)
redis.call('HSET', 'TIMESTAMP', 'pfcwd_poll_timestamp_last', timestamp_string)
local effective_poll_time = poll_time
local effective_poll_time_lasttime = redis.call('HGET', 'TIMESTAMP', 'effective_pfcwd_poll_time_last')
local global_effective_poll_time = poll_time
local global_effective_poll_time_lasttime = redis.call('HGET', 'TIMESTAMP', 'effective_pfcwd_poll_time_last')
if timestamp_last ~= false then
effective_poll_time = (timestamp_current - tonumber(timestamp_last)) * 1000000
redis.call('HSET', 'TIMESTAMP', 'effective_pfcwd_poll_time_last', effective_poll_time)
global_effective_poll_time = (timestamp_current - tonumber(timestamp_last)) * 1000000
redis.call('HSET', 'TIMESTAMP', 'effective_pfcwd_poll_time_last', global_effective_poll_time)
end

local effective_poll_time
local effective_poll_time_lasttime
local port_timestamp_last_cache = {}

local debug_storm_global = redis.call('HGET', 'DEBUG_STORM', 'enabled') == 'true'
local debug_storm_threshold = tonumber(redis.call('HGET', 'DEBUG_STORM', 'threshold'))

-- Iterate through each queue
local n = table.getn(KEYS)
for i = n, 1, -1 do
@@ -56,12 +63,37 @@ for i = n, 1, -1 do
local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS'
local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US'

-- Get port specific timestamp
local port_timestamp_current = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, 'PFC_WD_time_stamp'))
if port_timestamp_current ~= nil then
local port_timestamp_lasttime = port_timestamp_last_cache[port_id]
if port_timestamp_lasttime == nil then
port_timestamp_lasttime = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, 'PFC_WD_time_stamp_last'))
port_timestamp_last_cache[port_id] = port_timestamp_lasttime
redis.call('HSET', counters_table_name .. ':' .. port_id, 'PFC_WD_time_stamp_last', port_timestamp_current)
end

if port_timestamp_lasttime ~= nil then
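-- The port timestamp is assumed to be reported in nanoseconds; dividing by 1000
-- yields microseconds, the same unit as the global effective poll time above.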
effective_poll_time = (port_timestamp_current - port_timestamp_lasttime) / 1000
else
effective_poll_time = global_effective_poll_time
end
effective_poll_time_lasttime = false
else
effective_poll_time = global_effective_poll_time
effective_poll_time_lasttime = global_effective_poll_time_lasttime
end

-- Get all counters
local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')
local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS')
local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)
local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key)

if debug_storm_global then
redis.call('PUBLISH', 'PFC_WD_DEBUG', 'Port ID ' .. port_id .. ' Queue index ' .. queue_index .. ' occupancy ' .. occupancy_bytes .. ' packets ' .. packets .. ' pfc rx ' .. pfc_rx_packets .. ' pfc duration ' .. pfc_duration .. ' effective poll time ' .. tostring(effective_poll_time) .. '(global ' .. tostring(global_effective_poll_time) .. ')')
end

if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then
occupancy_bytes = tonumber(occupancy_bytes)
packets = tonumber(packets)
@@ -82,6 +114,10 @@ for i = n, 1, -1 do
pfc_duration_last = tonumber(pfc_duration_last)
local storm_condition = (pfc_duration - pfc_duration_last) > (effective_poll_time * 0.99)

if debug_storm_threshold ~= nil and (pfc_duration - pfc_duration_last) > (effective_poll_time * debug_storm_threshold / 100) then
redis.call('PUBLISH', 'PFC_WD_DEBUG', 'Port ID ' .. port_id .. ' Queue index ' .. queue_index .. ' occupancy ' .. occupancy_bytes .. ' packets ' .. packets .. ' pfc rx ' .. pfc_rx_packets .. ' pfc duration ' .. pfc_duration .. ' effective poll time ' .. tostring(effective_poll_time) .. ', triggered by threshold ' .. debug_storm_threshold .. '%')
end

-- Check actual condition of queue being in PFC storm
if (occupancy_bytes > 0 and packets - packets_last == 0 and storm_condition) or
-- DEBUG CODE START. Uncomment to enable
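To make the per-port path concrete (assuming the PFC_WD_time_stamp counters are reported in nanoseconds): if two consecutive polls of a port are stamped 1,200,000,000 ns apart, effective_poll_time becomes 1,200,000,000 / 1000 = 1,200,000 microseconds, so the storm condition fires only if the PFC pause duration grew by more than 0.99 * 1,200,000 microseconds within that window. When a port does not expose its own timestamp, the script falls back to global_effective_poll_time, which is derived from the script's own TIME-based bookkeeping as before.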
23 changes: 23 additions & 0 deletions orchagent/saihelper.cpp
@@ -854,6 +854,8 @@ static inline void initSaiRedisCounterEmptyParameter(sai_redis_flex_counter_grou
initSaiRedisCounterEmptyParameter(flex_counter_group_param.stats_mode);
initSaiRedisCounterEmptyParameter(flex_counter_group_param.plugin_name);
initSaiRedisCounterEmptyParameter(flex_counter_group_param.plugins);
initSaiRedisCounterEmptyParameter(flex_counter_group_param.bulk_chunk_size);
initSaiRedisCounterEmptyParameter(flex_counter_group_param.bulk_chunk_size_per_prefix);
}

static inline void initSaiRedisCounterParameterFromString(sai_s8_list_t &sai_s8_list, const std::string &str)
@@ -938,6 +940,8 @@ void setFlexCounterGroupParameter(const string &group,
attr.id = SAI_REDIS_SWITCH_ATTR_FLEX_COUNTER_GROUP;
attr.value.ptr = &flex_counter_group_param;

initSaiRedisCounterEmptyParameter(flex_counter_group_param.bulk_chunk_size);
initSaiRedisCounterEmptyParameter(flex_counter_group_param.bulk_chunk_size_per_prefix);
initSaiRedisCounterParameterFromString(flex_counter_group_param.counter_group_name, group);
initSaiRedisCounterParameterFromString(flex_counter_group_param.poll_interval, poll_interval);
initSaiRedisCounterParameterFromString(flex_counter_group_param.operation, operation);
@@ -1017,6 +1021,25 @@ void setFlexCounterGroupStatsMode(const std::string &group,
notifySyncdCounterOperation(is_gearbox, attr);
}

void setFlexCounterGroupBulkChunkSize(const std::string &group,
const std::string &bulk_chunk_size,
const std::string &bulk_chunk_size_per_prefix,
bool is_gearbox)
{
sai_attribute_t attr;
sai_redis_flex_counter_group_parameter_t flex_counter_group_param;

attr.id = SAI_REDIS_SWITCH_ATTR_FLEX_COUNTER_GROUP;
attr.value.ptr = &flex_counter_group_param;

initSaiRedisCounterEmptyParameter(flex_counter_group_param);
initSaiRedisCounterParameterFromString(flex_counter_group_param.counter_group_name, group);
initSaiRedisCounterParameterFromString(flex_counter_group_param.bulk_chunk_size, bulk_chunk_size);
initSaiRedisCounterParameterFromString(flex_counter_group_param.bulk_chunk_size_per_prefix, bulk_chunk_size_per_prefix);

notifySyncdCounterOperation(is_gearbox, attr);
}

void delFlexCounterGroup(const std::string &group,
bool is_gearbox)
{
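A minimal usage sketch of the new setFlexCounterGroupBulkChunkSize helper, mirroring the calls made from flexcounterorch.cpp above; the group name is illustrative and "NULL" stands in for a parameter that is not configured:

#include "saihelper.h"

void applyPortBulkChunkSize()
{
    // Set an overall bulk chunk size of 64 for the group, leaving the
    // per-prefix parameter unset.
    setFlexCounterGroupBulkChunkSize("PORT_STAT_COUNTER", "64", "NULL");

    // Later, clear both parameters again (is_gearbox defaults to false).
    setFlexCounterGroupBulkChunkSize("PORT_STAT_COUNTER", "NULL", "NULL");
}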
5 changes: 5 additions & 0 deletions orchagent/saihelper.h
@@ -39,6 +39,11 @@ void setFlexCounterGroupStatsMode(const std::string &group,
const std::string &stats_mode,
bool is_gearbox=false);

void setFlexCounterGroupBulkChunkSize(const std::string &group,
const std::string &bulk_chunk_size,
const std::string &bulk_chunk_size_per_prefix,
bool is_gearbox=false);

void delFlexCounterGroup(const std::string &group,
bool is_gearbox=false);

4 changes: 4 additions & 0 deletions tests/mock_tests/flexcounter_ut.cpp
@@ -111,6 +111,10 @@ namespace flexcounter_test
}
else
{
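// If the bulk chunk size lists are populated, treat this as a chunk-size-only
// update rather than a group removal and leave the mock group entry in place.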
if (flexCounterGroupParam->bulk_chunk_size.list != nullptr || flexCounterGroupParam->bulk_chunk_size_per_prefix.list != nullptr)
{
return SAI_STATUS_SUCCESS;
}
mockFlexCounterGroupTable->del(key);
}
