Skip to content

Commit

Permalink
DNM osd: add 'ceph [tell|daemon] osd.id smart'
Browse files Browse the repository at this point in the history
Also added 'ceph daemon osd.id list_devices', which prints the OSD's
devices to stdout. 'ceph [tell|daemon] osd.id smart' probes the OSD's devices
for SMART data and prints it to stdout in JSON format. It assumes that the
smartctl '--json' feature exists.

Signed-off-by: Yaarit Hatuka <[email protected]>
  • Loading branch information
Yaarit Hatuka committed Jan 16, 2018
1 parent aab2def commit d1d21e7
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/common/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1632,6 +1632,10 @@ std::vector<Option> get_global_options() {
.set_min(2)
.set_description("Number of striping periods to zero head of MDS journal write position"),

// Upper bound on how long a single smartctl invocation may run when the OSD
// probes its devices for SMART data.
Option("osd_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description("Timeout (in seconds) for smartctl to run, default is set to 5"),

Option("osd_check_max_object_name_len_on_startup", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.set_description(""),
Expand Down
107 changes: 107 additions & 0 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "common/version.h"
#include "common/io_priority.h"
#include "common/pick_address.h"
#include "common/SubProcess.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
Expand Down Expand Up @@ -146,6 +147,9 @@
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
Expand Down Expand Up @@ -2349,6 +2353,19 @@ will start to track new ops received afterwards.";
set<int> poollist = get_mapped_pools();
f->dump_stream("pool_list") << poollist;
f->close_section();
} else if (admin_command == "smart") {
probe_smart(ss);
} else if (admin_command == "list_devices") {
set<string> devnames;
store->get_devices(&devnames);
f->open_object_section("list_devices");
for (auto dev : devnames) {
f->dump_string("device", "/dev/" + dev);
if (dev.find("dm-") == 0) {
continue;
}
}
f->close_section();
} else {
assert(0 == "broken asok registration");
}
Expand Down Expand Up @@ -2881,6 +2898,18 @@ void OSD::final_init()
asok_hook,
"dump pools whose PG(s) are mapped to this OSD.");

assert(r == 0);

r = admin_socket->register_command("smart", "smart",
asok_hook,
"probe OSD devices for SMART data.");

assert(r == 0);

r = admin_socket->register_command("list_devices", "list_devices",
asok_hook,
"list OSD devices.");

test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
// Note: pools are CephString instead of CephPoolname because
// these commands traditionally support both pool names and numbers
Expand Down Expand Up @@ -3367,6 +3396,8 @@ int OSD::shutdown()
cct->get_admin_socket()->unregister_command("dump_pgstate_history");
cct->get_admin_socket()->unregister_command("compact");
cct->get_admin_socket()->unregister_command("get_mapped_pools");
cct->get_admin_socket()->unregister_command("smart");
cct->get_admin_socket()->unregister_command("list_devices");
delete asok_hook;
asok_hook = NULL;

Expand Down Expand Up @@ -6043,6 +6074,9 @@ COMMAND("compact",
"compact object store's omap. "
"WARNING: Compaction probably slows your requests",
"osd", "rw", "cli,rest")
COMMAND("smart",
"runs smartctl on this osd devices. ",
"osd", "rw", "cli,rest")
};

void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
Expand Down Expand Up @@ -6457,6 +6491,10 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
ss << "compacted omap in " << duration << " seconds";
}

else if (prefix == "smart") {
probe_smart(ds);
}

else {
ss << "unrecognized command! " << cmd;
r = -EINVAL;
Expand All @@ -6475,6 +6513,75 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
}
}

/**
 * Probe every device reported by the object store for SMART data and write
 * the combined result to @p ss as pretty-printed JSON.
 *
 * The emitted JSON object maps each device name (without the "/dev/" prefix)
 * to the JSON document smartctl printed for that device. Devices whose name
 * starts with "dm-", devices for which probe_smart_device() fails, and devices
 * whose smartctl output does not parse as JSON are logged (where applicable)
 * and left out of the result.
 *
 * @param ss  stream that receives the JSON report
 */
void OSD::probe_smart(ostream& ss)
{
  set<string> devnames;
  store->get_devices(&devnames);
  const uint64_t smart_timeout = cct->_conf->get_val<uint64_t>("osd_smart_report_timeout");

  json_spirit::mObject json_map; // device name -> parsed smartctl report

  // iterate by const reference: no need to copy every device name
  for (const auto& dev : devnames) {
    // smartctl works only on physical devices; filter out any logical device
    if (dev.find("dm-") == 0) {
      continue;
    }

    const std::string devpath = "/dev/" + dev;

    // Per-device temporaries are scoped to the iteration so a failed probe
    // can never expose the previous device's data.
    std::string result;
    // probe_smart_device() takes an int timeout; make the narrowing explicit.
    if (probe_smart_device(devpath.c_str(), static_cast<int>(smart_timeout), &result)) {
      derr << "probe_smart_device failed for " << devpath << ", continuing to next device"<< dendl;
      continue;
    }

    // TODO: change to read_or_throw?
    json_spirit::mValue smart_json;
    if (!json_spirit::read(result, smart_json)) {
      derr << "smartctl JSON output of " << devpath << " is invalid" << dendl;
      continue;
    }

    // JSON is valid; record it under the bare device name
    json_map[dev] = smart_json;
  }

  json_spirit::write(json_map, ss, json_spirit::pretty_print);
}

int OSD::probe_smart_device(const char *device, int timeout, std::string *result)
{
// when using --json, smartctl will report its errors in JSON format to stdout
SubProcessTimed smartctl("sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE, timeout);
smartctl.add_cmd_args(
"smartctl",
"-a",
//"-x",
"--json",
device,
NULL);

int ret = smartctl.spawn();
if (ret != 0) {
derr << "failed run smartctl: " << smartctl.err() << dendl;
return ret;
}

bufferlist output;
ret = output.read_fd(smartctl.get_stdout(), 100*1024);
if (ret < 0) {
derr << "failed read from smartctl: " << cpp_strerror(-ret) << dendl;
return ret;
}

derr << "smartctl output is: " << output.c_str() << dendl;
*result = output.c_str();

if (smartctl.join() != 0) {
derr << smartctl.err() << dendl;
return -EINVAL;
}

return 0;
}

bool OSD::heartbeat_dispatch(Message *m)
{
dout(30) << "heartbeat_dispatch " << m << dendl;
Expand Down
3 changes: 3 additions & 0 deletions src/osd/OSD.h
Original file line number Diff line number Diff line change
Expand Up @@ -2349,6 +2349,9 @@ class OSD : public Dispatcher,

float get_osd_recovery_sleep();

void probe_smart(ostream& ss);
int probe_smart_device(const char *device, int timeout, std::string *result);

public:
static int peek_meta(ObjectStore *store, string& magic,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami);
Expand Down
2 changes: 2 additions & 0 deletions src/pybind/mgr/smart/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

from module import * # NOQA
35 changes: 35 additions & 0 deletions src/pybind/mgr/smart/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

"""
Pulling smart data from OSD
"""

import json
from mgr_module import MgrModule, CommandResult


class Module(MgrModule):
    """Manager module exposing 'osd smart get', which asks an OSD for its SMART data."""

    # Commands advertised to the mgr; handle_command() is invoked for these.
    COMMANDS = [
        {
            "cmd": "osd smart get "
                   "name=osd_id,type=CephString,req=true",
            "desc": "Get smart data for osd.id",
            "perm": "r"
        },
    ]

    def handle_command(self, cmd):
        """Dispatch a command advertised in COMMANDS.

        For 'osd smart get' the 'smart' command is sent to the OSD named by
        cmd['osd_id'] and the (retval, output buffer, status string) tuple
        obtained from waiting on the command result is returned.

        Raises NotImplementedError for any other prefix.
        """
        # Routine trace of every invocation; not an error condition.
        self.log.debug("handle_command")

        if cmd['prefix'] == 'osd smart get':
            result = CommandResult('')
            self.send_command(result, 'osd', cmd['osd_id'], json.dumps({
                'prefix': 'smart',
                'format': 'json',
            }), '')
            r, outb, outs = result.wait()
            return (r, outb, outs)

        else:
            # mgr should respect our self.COMMANDS and not call us for
            # any prefix we don't advertise
            raise NotImplementedError(cmd['prefix'])

0 comments on commit d1d21e7

Please sign in to comment.