Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow specifying a set of errnos to select from. #24

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions cookbook/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def connect():


def main():
if len(sys.argv) != 2:
if len(sys.argv) < 2:
usage()

client = connect()
Expand All @@ -48,31 +48,35 @@ def main():
sys.exit(0)
elif sys.argv[1] == "--full":
print("Simulating disk full")
client.set_all_fault(False, errno.ENOSPC, 0, "", False, 0, False)
client.set_all_fault(False, [errno.ENOSPC], 0, "", False, 0, False)
elif sys.argv[1] == "--io-error":
print("Simulating IO error")
client.set_all_fault(False, errno.EIO, 0, "", False, 0, False)
client.set_all_fault(False, [errno.EIO], 0, "", False, 0, False)
elif sys.argv[1] == "--quota":
print("Simulating quota exceeded")
client.set_all_fault(False, errno.EDQUOT, 0, "", False, 0, False)
client.set_all_fault(False, [errno.EDQUOT], 0, "", False, 0, False)
elif sys.argv[1] == "--delay":
print("Simulating delayed IO")
client.set_all_fault(False, 0, 0, "", False, 50000, False)
client.set_all_fault(False, [], 0, "", False, 50000, False)
elif sys.argv[1] == "--random":
# Use all errnos, minus any specified in trailing arguments.
errnos_selected = {code: name for code, name in
errno.errorcode.items() if name not in sys.argv[1:]}
print("Simulating random errno")
client.set_all_fault(True, 0, 0, "", False, 0, False)
print("Using errnos: " + ", ".join(errnos_selected.values()))
client.set_all_fault(True, errnos_selected.keys(), 0, "", False, 0, False)
elif sys.argv[1] == "--specific-syscalls":
print("Restricting random IO restricted to specific syscalls")
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, 0, 0, "", False, 0, False)
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, [], 0, "", False, 0, False)
elif sys.argv[1] == "--probability":
print("Restricting random IO restricted to specific syscalls and 1% error probability")
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, 0, 1000, "", False, 0, False)
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, [], 1000, "", False, 0, False)
elif sys.argv[1] == "--file-pattern":
print("Restricting random IO restricted to specific syscalls while cursing *.sendmail.cf")
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, 0, 0, ".*sendmail.cf", False, 0, False)
client.set_fault(['read', 'read_buf', 'write', 'write_buf'], True, [], 0, ".*sendmail.cf", False, 0, False)
elif sys.argv[1] == "--broken-drive":
print("The agonising drive simulator")
client.set_all_fault(False, errno.EIO, 100, "", False, 100000, False)
client.set_all_fault(False, [errno.EIO], 100, "", False, 100000, False)
else:
usage()

Expand Down
4 changes: 2 additions & 2 deletions python_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@

print(client.get_methods())

# client.set_fault(['flush', 'fsync', 'fsyncdir'], False, 0, 100000, "", True, 500000)
client.set_fault(['flush', 'fsync', 'fsyncdir'], False, 0, 99000, "", True, 500000)
# client.set_fault(['flush', 'fsync', 'fsyncdir'], False, [], 100000, "", True, 500000)
client.set_fault(['flush', 'fsync', 'fsyncdir'], False, [], 99000, "", True, 500000)
# client.clear_all_faults()

except Thrift.TException as tx:
Expand Down
60 changes: 35 additions & 25 deletions server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,28 @@ using namespace ::apache::thrift::server;
using std::shared_ptr;

struct fault_descriptor {
bool random; // error code must be randomized
int err_no; // error code to return
int32_t probability; // 0 < probability < 100
std::string regexp; // regular expression on filename
bool kill_caller; // Must we kill the caller
int32_t delay_us; // operation delay in us
bool auto_delay; // must auto delay like an SSD
bool random; // error code must be randomized
std::vector<int32_t> err_nos; // error code(s) to select from
int32_t probability; // 0 < probability < 100
std::string regexp; // regular expression on filename
bool kill_caller; // Must we kill the caller
int32_t delay_us; // operation delay in us
bool auto_delay; // must auto delay like an SSD
};

std::vector<int32_t> default_errnos;
std::set<std::string> valid_methods;
std::map<std::string, fault_descriptor> fault_map;
std::mutex mutex;

void init_default_errnos()
{
int32_t errno;
for (errno = E2BIG; errno < EXFULL; errno++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see where this range comes from, but it doesn't make a lot of sense to be honest.
As a follow up we should be more specific about the random errors and drop those that don't make sense, like ENOSYS or EL2NSYNC and OTOH add some that do make sense like EIO.

default_errnos.push_back(errno);
}
}

void init_valid_methods()
{
valid_methods.insert("getattr");
Expand Down Expand Up @@ -104,13 +113,16 @@ static bool is_valid_method(std::string method)
}

// return a random err_no
static int random_err_no()
static int random_err_no(std::vector<int32_t> err_nos)
{
std::random_device rd;
std::uniform_int_distribution<int> dist(E2BIG, EXFULL);
if (err_nos.empty() == 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean if (err_nos.empty())?
If it's not empty why override it with default_errnos?

err_nos = default_errnos;
}

return dist(rd);
}
std::random_device rd;
std::uniform_int_distribution<int> dist(0, err_nos.size());
return int(err_nos[dist(rd)]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a follow up, we need to use a PRNG (default_random_engine) here rather than std::random_device.
The std::random_device that is often hardware based is too expensive and slow if used too frequently once its entropy pool is exhausted. It should be used just to get a seed for initializing the default_random_engine.

It would also be nice to add an option to provide a seed to have reproducible runs.

}

// return true if random number is not in the probability
static bool get_lucky(int probability)
Expand All @@ -125,7 +137,7 @@ static bool get_lucky(int probability)
if (dist(rd) > probability) {
return true;
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cosmetic changes should be done in cosmetic-only patches, that contain no functional change.

return false;
}

Expand All @@ -139,16 +151,13 @@ int error_inject(volatile int in_flight, std::string path, std::string method)
return 0;
}

// get the fault injection descritor
// get the fault injection descriptor
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto, move to a patch with cosmetic changes only

auto descr = fault_map[method];

int err_no = 0;

// get the err_no to inject
if (descr.err_no) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's instead special case the err_nos.size() == 1 case here.

Although drawing a random number in dist(0, 1) is technically correct, there's no reason to waste cpu cycles on the random code if only one err_no is used.

err_no = descr.err_no;
} else if (descr.random) {
err_no = random_err_no();
int err_no = 0;
if (descr.random || !descr.err_nos.empty()) {
err_no = random_err_no(descr.err_nos);
}

if (descr.regexp.size()) {
Expand Down Expand Up @@ -208,14 +217,14 @@ class server_handler: public serverIf {
}

void set_fault(const std::vector<std::string>& methods, const bool random,
const int32_t err_no, const int32_t probability,
const std::vector<int32_t> & err_nos, const int32_t probability,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: & sticks to angular bracket please.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not provide it by value and move it below rather than copying?

const std::string& regexp, const bool kill_caller,
int32_t delay_us, const bool auto_delay)
{
struct fault_descriptor descr;

descr.random = random;
descr.err_no = err_no;
descr.err_nos = err_nos;
descr.probability = probability;
descr.regexp = regexp;
descr.kill_caller = kill_caller;
Expand All @@ -230,7 +239,7 @@ class server_handler: public serverIf {
}
}

void set_all_fault(const bool random, const int32_t err_no,
void set_all_fault(const bool random, const std::vector<int32_t> & err_nos,
const int32_t probability, const std::string& regexp,
const bool kill_caller, const int32_t delay_us,
const bool auto_delay)
Expand All @@ -240,8 +249,8 @@ class server_handler: public serverIf {
for (auto method: valid_methods) {
methods.push_back(method);
}
set_fault(methods, random, err_no, probability,

set_fault(methods, random, err_nos, probability,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto, & coding style, and consider moving a vector rather than passing a reference around and eventually copy it, just to throw away the original vector.

regexp, kill_caller, delay_us,
auto_delay);
}
Expand All @@ -253,6 +262,7 @@ void server_thread()
int port = 9090;

init_valid_methods();
init_default_errnos();

std::cout << "Server Thread started" << std::endl;
try {
Expand Down
6 changes: 3 additions & 3 deletions server.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
service server {

// Used to get the list of availables systems calls
list<string> get_methods(),
list<string> get_methods(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cosmetic


// Used to clear all faults sources
void clear_all_faults(),
Expand All @@ -18,7 +18,7 @@ service server {
// Set fault on a specific list of methods
void set_fault(list<string> methods, // the list of methods to operate on
bool random, // Must we return random errno
i32 err_no, // A specific errno to return
list<i32> err_nos, // A list of specific errnos to select from
i32 probability, // Fault probability over 100 000
string regexp, // A regexp matching a victim file
bool kill_caller, // Kill -9 the caller process
Expand All @@ -27,7 +27,7 @@ service server {

// Works like set_fault but applies the fault to all methods
void set_all_fault(bool random,
i32 err_no,
list<i32> err_nos,
i32 probability,
string regexp,
bool kill_caller,
Expand Down