Commit

Merge pull request ceph#36379 from dzafman/wip-46096-nautilus-smithfarm
nautilus: mon: Warn when too many reads are repaired on an OSD

Reviewed-by: Neha Ojha <[email protected]>
yuriw authored Aug 12, 2020
2 parents 9a584a4 + 1a63a63 commit 6eb6d48
Showing 11 changed files with 159 additions and 1 deletion.
9 changes: 9 additions & 0 deletions PendingReleaseNotes
@@ -12,3 +12,12 @@
* Now when noscrub and/or nodeep-scrub flags are set globally or per pool,
scheduled scrubs of the type disabled will be aborted. All user initiated
scrubs are NOT interrupted.

* Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
  If any OSD has repaired more than this many I/O errors in stored data, an
  ``OSD_TOO_MANY_REPAIRS`` health warning is generated. To allow the warning to
  be cleared, a new command ``ceph tell osd.# clear_shards_repaired [count]``
  has been added. By default it sets the repair count to 0. If you want to be
  warned again when additional repairs are performed, pass the current value of
  ``mon_osd_warn_num_repaired`` as the count.
  This command will be replaced in future releases by the health mute/unmute feature.
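
As a quick illustration of the new command (the OSD id here is hypothetical
and the count argument is optional)::

    # clear the warning by resetting the repair count to 0
    ceph tell osd.2 clear_shards_repaired
    # or seed the count at the threshold so any further repair re-arms the warning
    ceph tell osd.2 clear_shards_repaired 10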
17 changes: 17 additions & 0 deletions doc/rados/operations/health-checks.rst
@@ -584,6 +584,23 @@ paired with *PG_DAMAGED* (see above).

See :doc:`pg-repair` for more information.

OSD_TOO_MANY_REPAIRS
____________________

When a read error occurs and another replica is available, it is used to repair
the error immediately, so that the client can get the object data. Scrub
handles errors for data at rest. To identify possible failing disks that
aren't seeing scrub errors, a count of read repairs is maintained. If the
count exceeds the configured threshold *mon_osd_warn_num_repaired* (default:
10), this health warning is generated.

To allow the warning to be cleared, a new command
``ceph tell osd.# clear_shards_repaired [count]`` has been added.
By default it sets the repair count to 0. If you want to be warned again when
additional repairs are performed, pass the current value of
``mon_osd_warn_num_repaired`` as the count.
This command will be replaced in future releases by the health mute/unmute feature.
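
For reference, the warning surfaces in ``ceph health detail`` roughly as
follows (the OSD id and count are illustrative)::

    $ ceph health detail
    HEALTH_WARN Too many repaired reads on 1 OSDs
    OSD_TOO_MANY_REPAIRS Too many repaired reads on 1 OSDs
        osd.2 had 11 reads repaired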

LARGE_OMAP_OBJECTS
__________________

4 changes: 4 additions & 0 deletions qa/standalone/ceph-helpers.sh
@@ -2063,6 +2063,10 @@ function flush_pg_stats()
seqs=''
for osd in $ids; do
seq=`ceph tell osd.$osd flush_pg_stats`
if test -z "$seq"
then
continue
fi
seqs="$seqs $osd-$seq"
done

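The new guard matters because ``ceph tell`` emits nothing when the target OSD
does not answer; skipping avoids recording a malformed ``$osd-`` entry in the
sequence list. A minimal sketch of the same pattern (osd ids are illustrative):

    for osd in 0 1 2; do
        seq=$(ceph tell osd.$osd flush_pg_stats)
        # skip OSDs that returned no sequence number
        [ -z "$seq" ] && continue
        seqs="$seqs $osd-$seq"
    done
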
90 changes: 89 additions & 1 deletion qa/standalone/osd/osd-rep-recov-eio.sh
@@ -19,6 +19,8 @@

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

warnings=10

function run() {
local dir=$1
shift
@@ -32,7 +34,8 @@ function run() {
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
run_mon $dir a || return 1
# pin the warning threshold in case the default changes
run_mon $dir a --mon_osd_warn_num_repaired=$warnings || return 1
run_mgr $dir x || return 1
ceph osd pool create foo 8 || return 1

@@ -171,6 +174,91 @@ function TEST_rados_get_with_eio() {
delete_pool $poolname
}

function TEST_rados_repair_warning() {
local dir=$1
local OBJS=$(expr $warnings + 1)

setup_osds 4 || return 1

local poolname=pool-rep
create_pool $poolname 1 1 || return 1
wait_for_clean || return 1

local objbase=obj-warn
local inject=eio

for i in $(seq 1 $OBJS)
do
rados_put $dir $poolname ${objbase}-$i || return 1
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
rados_get $dir $poolname ${objbase}-$i || return 1
done
local pgid=$(get_pg $poolname ${objbase}-1)

local object_osds=($(get_osds $poolname ${objbase}-1))
local primary=${object_osds[0]}
local bad_peer=${object_osds[1]}

COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "$OBJS" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$OBJS" || return 1

ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1

ceph tell osd.$primary clear_shards_repaired
sleep 10

set -o pipefail
# the warning should no longer be reported after the reset
! ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
set +o pipefail

ceph tell osd.$primary clear_shards_repaired $OBJS
sleep 10

for i in $(seq 1 $OBJS)
do
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
inject_$inject rep data $poolname ${objbase}-$i $dir 1 || return 1
# Force primary to pull from the bad peer, so we can repair it too!
set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
rados_get $dir $poolname ${objbase}-$i || return 1
done

COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "$(expr $OBJS \* 2)" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$(expr $OBJS \* 3)" || return 1

# Give mon a chance to notice additional OSD and reset num_shards_repaired
# The default tick time is 5 seconds
CHECKTIME=10
LOOPS=0
while true
do
sleep 1
if ceph health | grep -q "Too many repaired reads on 2 OSDs"
then
break
fi
LOOPS=$(expr $LOOPS + 1)
if test "$LOOPS" = "$CHECKTIME"
then
echo "Too many repaired reads not seen after $CHECKTIME seconds"
return 1
fi
done
ceph health detail | grep -q "osd.$primary had $(expr $OBJS \* 2) reads repaired" || return 1
ceph health detail | grep -q "osd.$bad_peer had $OBJS reads repaired" || return 1

delete_pool $poolname
}
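
This test can be exercised on its own via the standalone helper; assuming a
compiled source tree and running from its build directory (invocation details
may vary by branch):

    ../qa/run-standalone.sh osd-rep-recov-eio.sh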

# Test backfill with unfound object
function TEST_rep_backfill_unfound() {
local dir=$1
1 change: 1 addition & 0 deletions qa/suites/rados/singleton/all/random-eio.yaml
@@ -22,6 +22,7 @@ tasks:
- overall HEALTH_
- \(POOL_APP_NOT_ENABLED\)
- \(PG_DEGRADED\)
- \(OSD_TOO_MANY_REPAIRS\)
- full_sequential:
- exec:
client.0:
5 changes: 5 additions & 0 deletions src/common/options.cc
@@ -1511,6 +1511,11 @@ std::vector<Option> get_global_options() {
.add_service("mgr")
.set_description("issue REQUEST_SLOW health warning if OSD ops are slower than this age (seconds)"),

Option("mon_osd_warn_num_repaired", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(10)
.add_service("mon")
.set_description("issue OSD_TOO_MANY_REPAIRS health warning if an OSD has more than this many read repairs"),

Option("mon_osd_err_op_age_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(128)
.add_service("mgr")
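
Because the option is registered for the mon service, it can be adjusted at
runtime through the config subsystem; a sketch (the value 20 is arbitrary):

    # raise the per-OSD repaired-reads warning threshold
    ceph config set mon mon_osd_warn_num_repaired 20
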
13 changes: 13 additions & 0 deletions src/mon/PGMap.cc
@@ -2770,6 +2770,7 @@ void PGMap::get_health_checks(

list<string> detail_back;
list<string> detail_front;
list<string> detail;
set<mon_ping_item_t> back_sorted, front_sorted;
for (auto i : osd_stat) {
for (auto j : i.second.hb_pingtime) {
@@ -2800,6 +2801,18 @@
front_sorted.emplace(front);
}
}
if (i.second.num_shards_repaired >
cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
ostringstream ss;
ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
detail.push_back(ss.str());
}
}
if (!detail.empty()) {
ostringstream ss;
ss << "Too many repaired reads on " << detail.size() << " OSDs";
auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str());
d.detail.swap(detail);
}
int max_detail = 10;
for (auto &sback : boost::adaptors::reverse(back_sorted)) {
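
The per-OSD counter summed by this check is also exposed through pg dump,
which is how the standalone test above verifies it (assuming ``jq`` is
installed):

    # cluster-wide total of repaired shard reads
    ceph pg dump --format=json-pretty | jq '.pg_map.osd_stats_sum.num_shards_repaired'
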
16 changes: 16 additions & 0 deletions src/osd/OSD.cc
@@ -982,6 +982,13 @@ void OSDService::inc_osd_stat_repaired()
return;
}

void OSDService::set_osd_stat_repaired(int64_t count)
{
std::lock_guard l(stat_lock);
osd_stat.num_shards_repaired = count;
return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
uint64_t adjust_used)
{
@@ -6750,6 +6757,10 @@ COMMAND("cluster_log " \
"name=message,type=CephString,n=N",
"log a message to the cluster log",
"osd", "rw")
COMMAND("clear_shards_repaired " \
"name=count,type=CephInt,req=false",
"clear num_shards_repaired to clear health warning",
"osd", "rw")
COMMAND("bench " \
"name=count,type=CephInt,req=false " \
"name=size,type=CephInt,req=false " \
@@ -6968,6 +6979,11 @@ int OSD::_do_command(
}
clog->do_log(level, message);
}
else if (prefix == "clear_shards_repaired") {
int64_t count;
cmd_getval(cct, cmdmap, "count", count, (int64_t) 0);
service.set_osd_stat_repaired(count);
}

// either 'pg <pgid> <command>' or
// 'tell <pgid>' (which comes in without any of that prefix)?
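
Note that this handler resets only the OSD-level ``num_shards_repaired``
counter; the PG-level ``num_objects_repaired`` statistic that the test also
checks is tracked separately and can be read with (the pgid is illustrative):

    # objects repaired within one PG
    ceph pg 1.0 query | jq '.info.stats.stat_sum.num_objects_repaired'
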
1 change: 1 addition & 0 deletions src/osd/OSD.h
@@ -909,6 +909,7 @@ class OSDService {
osd_alert_list_t& alerts);
osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
void inc_osd_stat_repaired(void);
void set_osd_stat_repaired(int64_t);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);
1 change: 1 addition & 0 deletions src/osd/PGBackend.h
@@ -298,6 +298,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;

virtual bool pg_is_repair() = 0;
virtual void inc_osd_stat_repaired() = 0;
virtual void set_osd_stat_repaired(int64_t) = 0;
virtual bool pg_is_remote_backfilling() = 0;
virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
3 changes: 3 additions & 0 deletions src/osd/PrimaryLogPG.h
@@ -410,6 +410,9 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
void inc_osd_stat_repaired() override {
osd->inc_osd_stat_repaired();
}
void set_osd_stat_repaired(int64_t count) override {
osd->set_osd_stat_repaired(count);
}
bool pg_is_remote_backfilling() override {
return is_remote_backfilling();
}
