Commit

Merge pull request ceph#36379 from dzafman/wip-46096-nautilus-smithfarm
nautilus: mon: Warn when too many reads are repaired on an OSD

Reviewed-by: Neha Ojha <[email protected]>
yuriw authored Aug 12, 2020
2 parents 9a584a4 + 1a63a63 commit 6eb6d48
Showing 11 changed files with 159 additions and 1 deletion.
9 changes: 9 additions & 0 deletions PendingReleaseNotes
@@ -12,3 +12,12 @@
* Now when noscrub and/or nodeep-scrub flags are set globally or per pool,
scheduled scrubs of the type disabled will be aborted. All user initiated
scrubs are NOT interrupted.

* Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
  If any OSD has repaired more than this many I/O errors in stored data, an
  ``OSD_TOO_MANY_REPAIRS`` health warning is generated. To allow the warning to
  be cleared, a new command ``ceph tell osd.# clear_shards_repaired [count]``
  has been added. By default it sets the repair count to 0. If you want to be
  warned again when additional repairs are performed, pass the current value of
  ``mon_osd_warn_num_repaired`` as the count.
  This command will be replaced in future releases by the health mute/unmute feature.
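
As a quick illustration of the new command (the OSD id here is hypothetical
and the count argument is optional)::

    # clear the warning by resetting the repair count to 0
    ceph tell osd.2 clear_shards_repaired
    # or seed the count at the threshold so any further repair re-arms the warning
    ceph tell osd.2 clear_shards_repaired 10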
17 changes: 17 additions & 0 deletions doc/rados/operations/health-checks.rst
@@ -584,6 +584,23 @@ paired with *PG_DAMAGED* (see above).

See :doc:`pg-repair` for more information.

OSD_TOO_MANY_REPAIRS
____________________

When a read error occurs and another replica is available, it is used to repair
the error immediately, so that the client can get the object data. Scrub
handles errors for data at rest. To identify possible failing disks that
aren't seeing scrub errors, a count of read repairs is maintained. If the
count exceeds the configured threshold *mon_osd_warn_num_repaired* (default:
10), this health warning is generated.

To allow the warning to be cleared, a new command
``ceph tell osd.# clear_shards_repaired [count]`` has been added.
By default it sets the repair count to 0. If you want to be warned again when
additional repairs are performed, pass the current value of
``mon_osd_warn_num_repaired`` as the count.
This command will be replaced in future releases by the health mute/unmute feature.
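
For reference, the warning surfaces in ``ceph health detail`` roughly as
follows (the OSD id and count are illustrative)::

    $ ceph health detail
    HEALTH_WARN Too many repaired reads on 1 OSDs
    OSD_TOO_MANY_REPAIRS Too many repaired reads on 1 OSDs
        osd.2 had 11 reads repaired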

LARGE_OMAP_OBJECTS
__________________

4 changes: 4 additions & 0 deletions qa/standalone/ceph-helpers.sh
@@ -2063,6 +2063,10 @@ function flush_pg_stats()
seqs=''
for osd in $ids; do
seq=`ceph tell osd.$osd flush_pg_stats`
if test -z "$seq"
then
continue
fi
seqs="$seqs $osd-$seq"
done

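The new guard matters because ``ceph tell`` emits nothing when the target OSD
does not answer; skipping avoids recording a malformed ``$osd-`` entry in the
sequence list. A minimal sketch of the same pattern (osd ids are illustrative):

    for osd in 0 1 2; do
        seq=$(ceph tell osd.$osd flush_pg_stats)
        # skip OSDs that returned no sequence number
        [ -z "$seq" ] && continue
        seqs="$seqs $osd-$seq"
    done
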
90 changes: 89 additions & 1 deletion qa/standalone/osd/osd-rep-recov-eio.sh
@@ -19,6 +19,8 @@

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

warnings=10

function run() {
local dir=$1
shift
@@ -32,7 +34,8 @@ function run() {
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
run_mon $dir a || return 1
# pin the warning threshold in case the default changes
run_mon $dir a --mon_osd_warn_num_repaired=$warnings || return 1
run_mgr $dir x || return 1
ceph osd pool create foo 8 || return 1

@@ -171,6 +174,91 @@ function TEST_rados_get_with_eio() {
delete_pool $poolname
}

function TEST_rados_repair_warning() {
local dir=$1
local OBJS=$(expr $warnings + 1)

setup_osds 4 || return 1

local poolname=pool-rep
create_pool $poolname 1 1 || return 1
wait_for_clean || return 1

local objbase=obj-warn
local inject=eio

for i in $(seq 1 $OBJS)
do
rados_put $dir $poolname ${objbase}-$i || return 1
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
rados_get $dir $poolname ${objbase}-$i || return 1
done
local pgid=$(get_pg $poolname ${objbase}-1)

local object_osds=($(get_osds $poolname ${objbase}-1))
local primary=${object_osds[0]}
local bad_peer=${object_osds[1]}

COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "$OBJS" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$OBJS" || return 1

ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1

ceph tell osd.$primary clear_shards_repaired
sleep 10

set -o pipefail
# the warning should no longer be reported after the reset
! ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
set +o pipefail

ceph tell osd.$primary clear_shards_repaired $OBJS
sleep 10

for i in $(seq 1 $OBJS)
do
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
inject_$inject rep data $poolname ${objbase}-$i $dir 1 || return 1
# Force primary to pull from the bad peer, so we can repair it too!
set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
rados_get $dir $poolname ${objbase}-$i || return 1
done

COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
test "$COUNT" = "$(expr $OBJS \* 2)" || return 1
flush_pg_stats
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$(expr $OBJS \* 3)" || return 1

# Give mon a chance to notice additional OSD and reset num_shards_repaired
# The default tick time is 5 seconds
CHECKTIME=10
LOOPS=0
while true
do
sleep 1
if ceph health | grep -q "Too many repaired reads on 2 OSDs"
then
break
fi
LOOPS=$(expr $LOOPS + 1)
if test "$LOOPS" = "$CHECKTIME"
then
echo "Too many repaired reads not seen after $CHECKTIME seconds"
return 1
fi
done
ceph health detail | grep -q "osd.$primary had $(expr $OBJS \* 2) reads repaired" || return 1
ceph health detail | grep -q "osd.$bad_peer had $OBJS reads repaired" || return 1

delete_pool $poolname
}
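
This test can be exercised on its own via the standalone helper; assuming a
compiled source tree and running from its build directory (invocation details
may vary by branch):

    ../qa/run-standalone.sh osd-rep-recov-eio.sh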

# Test backfill with unfound object
function TEST_rep_backfill_unfound() {
local dir=$1
1 change: 1 addition & 0 deletions qa/suites/rados/singleton/all/random-eio.yaml
@@ -22,6 +22,7 @@ tasks:
- overall HEALTH_
- \(POOL_APP_NOT_ENABLED\)
- \(PG_DEGRADED\)
- \(OSD_TOO_MANY_REPAIRS\)
- full_sequential:
- exec:
client.0:
5 changes: 5 additions & 0 deletions src/common/options.cc
@@ -1511,6 +1511,11 @@ std::vector<Option> get_global_options() {
.add_service("mgr")
.set_description("issue REQUEST_SLOW health warning if OSD ops are slower than this age (seconds)"),

Option("mon_osd_warn_num_repaired", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(10)
.add_service("mon")
.set_description("issue OSD_TOO_MANY_REPAIRS health warning if an OSD has more than this many read repairs"),

Option("mon_osd_err_op_age_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(128)
.add_service("mgr")
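
Because the option is registered for the mon service, it can be adjusted at
runtime through the config subsystem; a sketch (the value 20 is arbitrary):

    # raise the per-OSD repaired-reads warning threshold
    ceph config set mon mon_osd_warn_num_repaired 20
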
13 changes: 13 additions & 0 deletions src/mon/PGMap.cc
@@ -2770,6 +2770,7 @@ void PGMap::get_health_checks(

list<string> detail_back;
list<string> detail_front;
list<string> detail;
set<mon_ping_item_t> back_sorted, front_sorted;
for (auto i : osd_stat) {
for (auto j : i.second.hb_pingtime) {
@@ -2800,6 +2801,18 @@
front_sorted.emplace(front);
}
}
if (i.second.num_shards_repaired >
cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
ostringstream ss;
ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
detail.push_back(ss.str());
}
}
if (!detail.empty()) {
ostringstream ss;
ss << "Too many repaired reads on " << detail.size() << " OSDs";
auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str());
d.detail.swap(detail);
}
int max_detail = 10;
for (auto &sback : boost::adaptors::reverse(back_sorted)) {
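
The per-OSD counter summed by this check is also exposed through pg dump,
which is how the standalone test above verifies it (assuming ``jq`` is
installed):

    # cluster-wide total of repaired shard reads
    ceph pg dump --format=json-pretty | jq '.pg_map.osd_stats_sum.num_shards_repaired'
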
16 changes: 16 additions & 0 deletions src/osd/OSD.cc
@@ -982,6 +982,13 @@ void OSDService::inc_osd_stat_repaired()
return;
}

void OSDService::set_osd_stat_repaired(int64_t count)
{
std::lock_guard l(stat_lock);
osd_stat.num_shards_repaired = count;
return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
uint64_t adjust_used)
{
@@ -6750,6 +6757,10 @@ COMMAND("cluster_log " \
"name=message,type=CephString,n=N",
"log a message to the cluster log",
"osd", "rw")
COMMAND("clear_shards_repaired " \
"name=count,type=CephInt,req=false",
"clear num_shards_repaired to clear health warning",
"osd", "rw")
COMMAND("bench " \
"name=count,type=CephInt,req=false " \
"name=size,type=CephInt,req=false " \
@@ -6968,6 +6979,11 @@ int OSD::_do_command(
}
clog->do_log(level, message);
}
else if (prefix == "clear_shards_repaired") {
int64_t count;
cmd_getval(cct, cmdmap, "count", count, (int64_t) 0);
service.set_osd_stat_repaired(count);
}

// either 'pg <pgid> <command>' or
// 'tell <pgid>' (which comes in without any of that prefix)?
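
Note that this handler resets only the OSD-level ``num_shards_repaired``
counter; the PG-level ``num_objects_repaired`` statistic that the test also
checks is tracked separately and can be read with (the pgid is illustrative):

    # objects repaired within one PG
    ceph pg 1.0 query | jq '.info.stats.stat_sum.num_objects_repaired'
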
1 change: 1 addition & 0 deletions src/osd/OSD.h
@@ -909,6 +909,7 @@ class OSDService {
osd_alert_list_t& alerts);
osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
void inc_osd_stat_repaired(void);
void set_osd_stat_repaired(int64_t);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);
1 change: 1 addition & 0 deletions src/osd/PGBackend.h
@@ -298,6 +298,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;

virtual bool pg_is_repair() = 0;
virtual void inc_osd_stat_repaired() = 0;
virtual void set_osd_stat_repaired(int64_t) = 0;
virtual bool pg_is_remote_backfilling() = 0;
virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
3 changes: 3 additions & 0 deletions src/osd/PrimaryLogPG.h
@@ -410,6 +410,9 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
void inc_osd_stat_repaired() override {
osd->inc_osd_stat_repaired();
}
void set_osd_stat_repaired(int64_t count) override {
osd->set_osd_stat_repaired(count);
}
bool pg_is_remote_backfilling() override {
return is_remote_backfilling();
}
