Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNM: pybind/mgr: add smart mgr module #1

Open
wants to merge 5 commits into
base: wip-smartctl-pull
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/common/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1632,6 +1632,10 @@ std::vector<Option> get_global_options() {
.set_min(2)
.set_description("Number of striping periods to zero head of MDS journal write position"),

// Upper bound, in seconds, on how long the OSD waits for a smartctl run.
Option("osd_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)
.set_description("Timeout (in seconds) for smartctl to run, default is set to 5"),

Option("osd_check_max_object_name_len_on_startup", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.set_description(""),
Expand Down
107 changes: 107 additions & 0 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "common/version.h"
#include "common/io_priority.h"
#include "common/pick_address.h"
#include "common/SubProcess.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
Expand Down Expand Up @@ -146,6 +147,9 @@
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
Expand Down Expand Up @@ -2349,6 +2353,19 @@ will start to track new ops received afterwards.";
set<int> poollist = get_mapped_pools();
f->dump_stream("pool_list") << poollist;
f->close_section();
} else if (admin_command == "smart") {
probe_smart(ss);
} else if (admin_command == "list_devices") {
// Dump one "device" string per name returned by store->get_devices(),
// each prefixed with "/dev/".
set<string> devnames;
store->get_devices(&devnames);
f->open_object_section("list_devices");
for (const auto& dev : devnames) {
  // NOTE(review): the original loop ended with
  //   if (dev.find("dm-") == 0) continue;
  // placed after the dump, where it had no effect. It is removed here so the
  // output is unchanged. If dm-* devices are meant to be hidden (as
  // probe_smart does), the check has to come before dump_string -- confirm
  // the intended behavior.
  f->dump_string("device", "/dev/" + dev);
}
f->close_section();
} else {
assert(0 == "broken asok registration");
}
Expand Down Expand Up @@ -2881,6 +2898,18 @@ void OSD::final_init()
asok_hook,
"dump pools whose PG(s) are mapped to this OSD.");

assert(r == 0);

r = admin_socket->register_command("smart", "smart",
asok_hook,
"probe OSD devices for SMART data.");

assert(r == 0);

r = admin_socket->register_command("list_devices", "list_devices",
asok_hook,
"list OSD devices.");

test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
// Note: pools are CephString instead of CephPoolname because
// these commands traditionally support both pool names and numbers
Expand Down Expand Up @@ -3367,6 +3396,8 @@ int OSD::shutdown()
cct->get_admin_socket()->unregister_command("dump_pgstate_history");
cct->get_admin_socket()->unregister_command("compact");
cct->get_admin_socket()->unregister_command("get_mapped_pools");
cct->get_admin_socket()->unregister_command("smart");
cct->get_admin_socket()->unregister_command("list_devices");
delete asok_hook;
asok_hook = NULL;

Expand Down Expand Up @@ -6043,6 +6074,9 @@ COMMAND("compact",
"compact object store's omap. "
"WARNING: Compaction probably slows your requests",
"osd", "rw", "cli,rest")
COMMAND("smart",
"runs smartctl on this osd devices. ",
"osd", "rw", "cli,rest")
};

void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
Expand Down Expand Up @@ -6457,6 +6491,10 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
ss << "compacted omap in " << duration << " seconds";
}

else if (prefix == "smart") {
probe_smart(ds);
}

else {
ss << "unrecognized command! " << cmd;
r = -EINVAL;
Expand All @@ -6475,6 +6513,75 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
}
}

/**
 * Probe every device returned by store->get_devices() with smartctl and
 * write the collected reports to @p ss as one pretty-printed JSON object
 * keyed by device name.
 *
 * Devices whose name starts with "dm-" are skipped. A device whose probe
 * fails, or whose output is not valid JSON, is logged and left out of the
 * result; the remaining devices are still reported.
 *
 * @param ss stream that receives the JSON document
 */
void OSD::probe_smart(ostream& ss)
{
  set<string> devnames;
  store->get_devices(&devnames);
  uint64_t smart_timeout = cct->_conf->get_val<uint64_t>("osd_smart_report_timeout");

  json_spirit::mObject json_map; // == typedef std::map<std::string, mValue> mObject;

  // Iterate by const reference: the names are only read, no need to copy them.
  for (const auto& dev : devnames) {
    // smartctl works only on physical devices; filter out any logical device
    if (dev.compare(0, 3, "dm-") == 0) {
      continue;
    }

    const std::string path = "/dev/" + dev;

    // Per-device state is scoped to the iteration so nothing from a
    // previously probed device can carry over.
    std::string result;
    if (probe_smart_device(path.c_str(), smart_timeout, &result)) {
      derr << "probe_smart_device failed for " << path << ", continuing to next device"<< dendl;
      continue;
    }

    // TODO: change to read_or_throw?
    json_spirit::mValue smart_json;
    if (!json_spirit::read(result, smart_json)) {
      derr << "smartctl JSON output of " << path << " is invalid" << dendl;
      continue;
    }

    json_map[dev] = smart_json;
  }
  json_spirit::write(json_map, ss, json_spirit::pretty_print);
}

int OSD::probe_smart_device(const char *device, int timeout, std::string *result)
{
// when using --json, smartctl will report its errors in JSON format to stdout
SubProcessTimed smartctl("sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE, timeout);
smartctl.add_cmd_args(
"smartctl",
"-a",
//"-x",
"--json",
device,
NULL);

int ret = smartctl.spawn();
if (ret != 0) {
derr << "failed run smartctl: " << smartctl.err() << dendl;
return ret;
}

bufferlist output;
ret = output.read_fd(smartctl.get_stdout(), 100*1024);
if (ret < 0) {
derr << "failed read from smartctl: " << cpp_strerror(-ret) << dendl;
return ret;
}

derr << "smartctl output is: " << output.c_str() << dendl;
*result = output.c_str();

if (smartctl.join() != 0) {
derr << smartctl.err() << dendl;
return -EINVAL;
}

return 0;
}

bool OSD::heartbeat_dispatch(Message *m)
{
dout(30) << "heartbeat_dispatch " << m << dendl;
Expand Down
3 changes: 3 additions & 0 deletions src/osd/OSD.h
Original file line number Diff line number Diff line change
Expand Up @@ -2349,6 +2349,9 @@ class OSD : public Dispatcher,

float get_osd_recovery_sleep();

void probe_smart(ostream& ss);
int probe_smart_device(const char *device, int timeout, std::string *result);

public:
static int peek_meta(ObjectStore *store, string& magic,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami);
Expand Down
2 changes: 2 additions & 0 deletions src/pybind/mgr/smart/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

# Re-export the module's public names from this package. The explicit
# relative import works on both Python 2 and Python 3; the implicit form
# ("from module import *") only resolves on Python 2.
from .module import *  # NOQA
Loading