-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraphite_smart_exporter.sh
executable file
·562 lines (511 loc) · 25.5 KB
/
graphite_smart_exporter.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
#!/usr/bin/env bash
VERSION=1.1.4
DESTINATION= # The destination (host name or IP address) where the Graphite server is reachable
PORT=2003 # The port the Graphite server listens on for the plaintext protocol
FREQUENCY=300 # The frequency (in seconds) at which data is gathered and sent to Graphite
VERBOSE=0 # Default verbosity level
QUIET=0 # Does not write any output if set
OMIT_DRIVES_IN_STANDBY=1 # Does not send the last known metrics for drives that are in standby
DISABLE_DRIVE_DETECTION=0 # Disable drive detection using smartctl. Only enabled if devices to monitor are supplied as arguments
OMIT_DEVICE_NAME_FROM_INFO_METRIC=0 # Does not include the device_name tag in the smart_disk_info metric if set to 1.
NETCAT_FLAVOR= # Stores the flavor of the netcat implementation, should be GNU, OPENBSD or TRADITIONAL
declare -A DRIVES # Associative array for detected drives, mapping to device type
declare -A DRIVES_SERIALS # Associative array mapping a drive to its serial number
declare -A DRIVE_COMMON_TAGS # Associative array for storing common drive tags
declare -A METRICS # Associative array keeping track of the metrics for each drive
TEMP_FILE_PREIFX=smartctl_output # Name prefix of the temporary file smartctl output is stored in. jq seems to be more efficient when reading from a file vs. getting the input piped to it. NOTE(review): not referenced in the visible part of the script - confirm before removing.
LOG_FILE="" # Log file name. File logging is only enabled if not empty.
SMART_TEMP_FILE_NAME="smart_output.json" # Name of the temp file SMART output is written to. jq is more efficient when reading input from a file instead of piping the output to it.
METRIC_NAME_VALUE_DELIMITER=">>" # Delimiter used between metric name and value when building metrics
SMART_POWER_STATUS_METRIC_NAME="smart_power_status" # Metric name indicating the power status (active or standby/sleep)
##
# Prints the help/usage message to stdout.
#
# Globals:
#   VERSION (read) - version string shown in the first line
#   $0      (read) - script name used in the synopsis and the examples
##
function print_usage() {
  cat << EOF
Graphite S.M.A.R.T. exporter version ${VERSION}
Usage:
$0 [-h] -d <DESTINATION> [-p <PORT>] -n <HOSTNAME> [-f <FREQUENCY>] [-c] [-o] [-m <DEVICE>] [-t <DEVICE=TYPE>] [-v] [-q] [-l <LOG_FILE>] [-s <SMART_TEMP_FILE_NAME>]
Gathers S.M.A.R.T. data about all S.M.A.R.T. capable drives in the system
and sends them as metrics to a Graphite server.
Options:
-d DESTINATION : The destination IP address or host name under which the Graphite
server is reachable.
-p PORT : The port the Graphite server is listening on for the plaintext protocol.
-n HOSTNAME : The host name to set for the metrics' 'instance' tag.
-f FREQUENCY : Frequency metrics are gathered and sent to Graphite with in seconds
(default: 300)
-l LOG_FILE : Name of the log file to log into. File logging is only enabled if a file name is provided. (default: empty)
-c : Continue sending last known/stale data if a drive is in standby/spun down. If a drive is spun down, S.M.A.R.T. attributes
cannot be read without waking it up. If this option is set, the script continues to send the last known S.M.A.R.T.
metrics for a drive that is spun down to prevent gaps in data.
Otherwise no metrics are sent until the drive is awake again.
-m DEVICE : List devices to monitor using this argument, once per drive to monitor, e.g. -m /dev/sda -m /dev/sdc
-t DEVICE=TYPE : Manually specify the device type for a device. Use this if smartctl device type autodetection does not work for your case. Does NOT disable device discovery. Example: -t /dev/sda=nvme
-s SMART_TEMP_FILE_NAME : Name of the temp file the S.M.A.R.T. output is written to during each cycle the script is running.
Explicitly set if you plan on running multiple instances of this script to prevent collisions. (default: smart_output.json)
-o : Omit device name tag from info metric. If you're dealing with a system that changes device names frequently, set this flag to avoid multiple time series after the device changed name.
-q : Quiet mode. Output is suppressed if set. Can not be set if -v is set.
-v : Verbose mode. Prints additional information during execution. File logging is only enabled in verbose mode. Can not be set if -q is set.
-h : Print this help message.
Example usage:
$0 -d graphite.mydomain.com -n myhost
$0 -d graphite.mydomain.com -p 9198 -n myhost -f 600
$0 -d graphite.mydomain.com -n myhost -f 600 -o -m /dev/sda -m /dev/sdc -t /dev/sdc=sat
EOF
}
##
# Determines which netcat implementation is installed by inspecting the
# output of `nc -h` and stores the result in NETCAT_FLAVOR as one of
# GNU, OPENBSD or TRADITIONAL.
#
# If none of the known implementations is recognized, an error is logged
# and the script terminates with exit status 1.
##
detect_netcat_flavor() {
  local help_text=$( nc -h 2>&1 )
  # Deliberately unquoted: folds the help text onto one line so that the
  # patterns below are matched against whitespace-normalized output.
  local flattened=$(echo $help_text)
  case "$flattened" in
    *GNU*)
      NETCAT_FLAVOR="GNU"
      ;;
    *OpenBSD*)
      NETCAT_FLAVOR="OPENBSD"
      ;;
    *"c shell commands"*)
      NETCAT_FLAVOR="TRADITIONAL"
      ;;
    *)
      log_error "No supported netcat utility found."
      exit 1
      ;;
  esac
}
##
# Checks whether all prerequisites for this script to work correctly are fulfilled:
#   - smartctl is available
#   - smartctl understands the --json option
#   - jq is available
#
# Logs an error and terminates the script with exit status 1 if any check fails.
##
function check_prerequisites() {
  # 'command -v' is a shell builtin, so the check itself does not depend on
  # the external 'which' utility being installed.
  if ! command -v smartctl > /dev/null 2>&1; then
    log_error "smartctl not found, aborting."
    exit 1
  fi
  # check if smartctl understands --json
  local smartctl_help
  smartctl_help=$(smartctl --json=c -h 2>&1)
  if [[ "$smartctl_help" == *"UNRECOGNIZED OPTION"* ]]; then
    log_error "smartctl does not support json formatted output, aborting. smartctl 7.0 is required at minimum!"
    exit 1
  fi
  if ! command -v jq > /dev/null 2>&1; then
    log_error "jq not found, aborting."
    exit 1
  fi
}
##
# Writes argument $1 as an info-level log line to stdout if $QUIET is not set.
# If $LOG_FILE is not empty, the same line is also appended to that file.
#
# Arguments:
# $1 Message to write to stdout
##
function log() {
  if [[ $QUIET -eq 0 ]]; then
    # Build the line once so that stdout and the log file carry the same timestamp
    local line="time=$(date --iso-8601=seconds) level=info msg=$1"
    printf '%s\n' "$line"
    if [[ -n "$LOG_FILE" ]]; then
      # Quoted so that log file paths containing whitespace work
      printf '%s\n' "$line" >> "$LOG_FILE"
    fi
  fi
}
##
# Writes argument $1 as a debug-level log line to stdout if $VERBOSE is set
# and $QUIET is not set. If $LOG_FILE is not empty, the same line is also
# appended to that file.
#
# Arguments:
# $1 Message to write to stdout
##
function log_verbose() {
  if [[ $VERBOSE -eq 1 ]] && [[ $QUIET -eq 0 ]]; then
    # Build the line once so that stdout and the log file carry the same timestamp
    local line="time=$(date --iso-8601=seconds) level=debug msg=$1"
    printf '%s\n' "$line"
    if [[ -n "$LOG_FILE" ]]; then
      # Quoted so that log file paths containing whitespace work
      printf '%s\n' "$line" >> "$LOG_FILE"
    fi
  fi
}
##
# Appends argument $1 as a debug-level log line to $LOG_FILE only (nothing is
# written to stdout). The line is written only if $VERBOSE is set, $QUIET is
# not set and $LOG_FILE is not empty.
#
# Arguments:
# $1 Message to write to the log file
##
function log_file() {
  if [[ $VERBOSE -eq 1 ]] && [[ $QUIET -eq 0 ]] && [[ -n "$LOG_FILE" ]]; then
    # Quoted so that log file paths containing whitespace work
    printf '%s\n' "time=$(date --iso-8601=seconds) level=debug msg=$1" >> "$LOG_FILE"
  fi
}
##
# Writes argument $1 as an error-level log line to stderr. Ignores $QUIET.
# If $LOG_FILE is not empty, the same line is also appended to that file.
#
# Arguments:
# $1 Message to write to stderr
##
function log_error() {
  # Build the line once so that stderr and the log file carry the same timestamp
  local line="time=$(date --iso-8601=seconds) level=error msg=$1"
  printf '%s\n' "$line" >&2
  if [[ -n "$LOG_FILE" ]]; then
    # Quoted so that log file paths containing whitespace work
    printf '%s\n' "$line" >> "$LOG_FILE"
  fi
}
##
# Manually sets the device type for a specific device. Useful if
# smartctl auto-detection does not work correctly.
#
# The specification must contain a '=' with a non-empty device in front of it
# and a non-empty type behind it. Otherwise the usage message is printed and
# the script terminates with exit status 1 without touching $DRIVES.
#
# Arguments:
# $1 Drive device type specification string in the form of <device>=<type>, e.g. /dev/sda=sat
##
function set_manual_device_type() {
  local spec="$1"
  # Everything before the first '=' is the device, everything after it the type
  local drive="${spec%%=*}"
  local device_type="${spec#*=}"
  # Without the explicit '=' check a spec such as "/dev/sda" would yield
  # drive == device_type == "/dev/sda" and be registered silently.
  if [[ "$spec" != *=* || -z "$drive" || -z "$device_type" ]]; then
    print_usage
    exit 1
  fi
  DRIVES[$drive]="$device_type"
}
##
# Registers a drive: queries it once with smartctl and stores
#   - its common (static) tags in $DRIVE_COMMON_TAGS,
#   - its device type in $DRIVES (unless a type was already set manually),
#   - its serial number in $DRIVES_SERIALS.
#
# Globals:
#   DRIVES (read/written), DRIVE_COMMON_TAGS (written), DRIVES_SERIALS (written),
#   HOSTNAME (read), OMIT_DEVICE_NAME_FROM_INFO_METRIC (read)
#
# Arguments:
# $1 Device identifier (e.g. /dev/ada0)
#
# Returns:
# 1 if the device identifier is empty
##
function register_drive() {
  local drive="$1"
  if [[ -z "$drive" ]]; then
    log_error "Failed to register drive. Empty name received."
    return 1
  fi
  # Check if we need to use a manually provided device type for querying SMART initially.
  # Kept as an array so that the option and its value are passed as separate,
  # correctly quoted arguments.
  local -a device_type_args=()
  if [[ -n "${DRIVES[$drive]}" ]]; then
    device_type_args=(-d "${DRIVES[$drive]}")
  fi
  local smart_output
  smart_output=$(smartctl --json=c -a "${device_type_args[@]}" "$drive")
  local common_tags=""
  # The two filters differ only in the device_name tag
  if [[ $OMIT_DEVICE_NAME_FROM_INFO_METRIC -eq 0 ]]; then
    common_tags=$(printf '%s\n' "$smart_output" | jq -r --arg HOSTNAME "$HOSTNAME" '
      (.model_family // "" | gsub(" "; "_")) as $model_family
      | (.model_name // "" | gsub(" "; "_")) as $model_name
      | (.serial_number | tostring) as $serial_number
      | .firmware_version as $firmware_version
      | (.user_capacity.bytes | tostring) as $user_capacity_bytes
      | (.device.name | sub("/dev/"; "")) as $device_name
      | .device.type as $device_type
      | if $model_name != "" then "model_name=\($model_name);" else "" end
      + if $model_family != "" then "model_family=\($model_family);" else "" end
      + "serial_number=\($serial_number);"
      + "firmware_version=\($firmware_version);"
      + "user_capacity_bytes=\($user_capacity_bytes);"
      + "device_name=\($device_name);"
      + "device_type=\($device_type);"
      + "instance=\($HOSTNAME)"')
  else
    common_tags=$(printf '%s\n' "$smart_output" | jq -r --arg HOSTNAME "$HOSTNAME" '
      (.model_family // "" | gsub(" "; "_")) as $model_family
      | (.model_name // "" | gsub(" "; "_")) as $model_name
      | (.serial_number | tostring) as $serial_number
      | .firmware_version as $firmware_version
      | (.user_capacity.bytes | tostring) as $user_capacity_bytes
      | .device.type as $device_type
      | if $model_name != "" then "model_name=\($model_name);" else "" end
      + if $model_family != "" then "model_family=\($model_family);" else "" end
      + "serial_number=\($serial_number);"
      + "firmware_version=\($firmware_version);"
      + "user_capacity_bytes=\($user_capacity_bytes);"
      + "device_type=\($device_type);"
      + "instance=\($HOSTNAME)"')
  fi
  DRIVE_COMMON_TAGS[$drive]="$common_tags"
  # detect device type if not provided by command line argument
  if [[ -z "${DRIVES[$drive]}" ]]; then
    local device_type
    device_type=$(printf '%s\n' "$smart_output" | jq -r '.device.type')
    DRIVES[$drive]="$device_type"
  fi
  # store drive serial separately
  local serial_number
  serial_number=$(printf '%s\n' "$smart_output" | jq -r '.serial_number')
  DRIVES_SERIALS[$drive]="$serial_number"
}
##
# Checks if the zpool utility is available. If so, lists the devices of all
# pools and prepends a "zfs_pool=<pool>" tag to the common tags of every
# monitored drive that one of those devices belongs to.
#
# A pool device is attributed to the monitored drive with the LONGEST name
# contained in the device path. This keeps the result independent of the
# iteration order of $DRIVES when one drive name is a prefix of another
# (e.g. /dev/sda vs. /dev/sdaa, /dev/ada1 vs. /dev/ada10). A pool tag is added
# at most once per drive, even if several of its partitions are in the pool.
#
# Globals:
#   DRIVE_COMMON_TAGS (read/written)
##
function detect_drives_zpool() {
  # check for zfs
  if ! command -v zpool > /dev/null 2>&1; then
    return
  fi
  log_verbose "Detected ZFS, trying to find pools for disks"
  local poolnames poolname partitions partition drive best_match zpool_tag
  # Auto detect available pools
  poolnames=$(zpool list -H -o name)
  # Index disks in detected pools
  for poolname in $poolnames; do
    # this command lists the full path to the partition devices contained in the pool
    if ! partitions=$(zpool list -LPHv "$poolname" | grep -Eo "(\/\w+\/\w+)"); then
      continue
    fi
    zpool_tag="zfs_pool=${poolname}"
    for partition in $partitions; do
      # Literal substring match, preferring the most specific (longest) drive name
      best_match=""
      for drive in $(get_drives); do
        if [[ "$partition" == *"$drive"* ]] && (( ${#drive} > ${#best_match} )); then
          best_match="$drive"
        fi
      done
      if [[ -z "$best_match" ]]; then
        continue
      fi
      # Skip if the drive already carries the tag for this pool
      if [[ ";${DRIVE_COMMON_TAGS[$best_match]};" == *";${zpool_tag};"* ]]; then
        continue
      fi
      DRIVE_COMMON_TAGS[$best_match]="${zpool_tag};${DRIVE_COMMON_TAGS[$best_match]}"
      log_verbose "Disk ${best_match} is part of ZFS pool ${poolname}"
    done
  done
}
##
# Discovers drives by asking smartctl to scan for devices
# (`smartctl --json=c --scan-open`) and registers every device name found
# through register_drive.
#
# Note: The global arrays are populated by register_drive, not here.
##
detect_drives_smart() {
  local scanned_devices
  scanned_devices=$(smartctl --json=c --scan-open | jq -r '.devices[].name')
  # Hand each discovered device over to register_drive, one at a time
  for drive in ${scanned_devices}; do
    register_drive "$drive"
  done
}
##
# Prints the identifiers of all monitored drives, i.e. the keys of the
# $DRIVES array, separated by spaces on a single line.
#
# Note: Only meaningful after $DRIVES has been populated.
##
get_drives() {
  local -a drive_ids=("${!DRIVES[@]}")
  echo "${drive_ids[@]}"
}
##
# Gets all SMART attributes for the provided drive, named already as metrics.
#
# smartctl is run with "-n standby" so that a drive in standby is not woken
# up; its JSON output is written to $SMART_TEMP_FILE_NAME, evaluated with jq
# and removed again before the function returns.
#
# Globals:
#   DRIVES, DRIVE_COMMON_TAGS, DRIVES_SERIALS (read)
#   SMART_TEMP_FILE_NAME, METRIC_NAME_VALUE_DELIMITER, SMART_POWER_STATUS_METRIC_NAME (read)
#
# Arguments:
# $1 The drive device ID to get SMART attributes for, e.g. /dev/sda
#
# Returns
# A single line of space-separated metrics, each in the form
# <name;tags><delimiter><value> (refer to $METRIC_NAME_VALUE_DELIMITER).
# The smart_disk_info and smart_power_status metrics are always included
# (if no smartctl error occurred). Example:
# smart_status_passed;serial_number=2JHXXXXX>>1 smart_power_status;serial_number=2JHXXXXX>>1 smart_disk_info;model_name=WDC__WUH721818ALE6L4;serial_number=2JHXXXXX;device_type=sat;instance=myhost.mydomain.com>>1
# OR only the smart_power_status (value 0) and smart_disk_info metrics if the
# device is in standby and attributes could not be retrieved
# OR "error" on any other error returned by smartctl
##
function get_smart_metrics() {
  local drive="$1"
  local device_type="${DRIVES[$drive]}"
  local common_tags="${DRIVE_COMMON_TAGS[$drive]}"
  local serial_number="${DRIVES_SERIALS[$drive]}"
  local delim="$METRIC_NAME_VALUE_DELIMITER"

  # Only pass -d if a device type is known; otherwise the drive path would be
  # consumed as the value of -d.
  local -a device_type_args=()
  if [[ -n "$device_type" ]]; then
    device_type_args=(-d "$device_type")
  fi

  # Read SMART attributes
  smartctl --json=c -a -n standby "${device_type_args[@]}" "$drive" > "$SMART_TEMP_FILE_NAME"

  # If 0 < exit_status <= 7, an error performing smartctl occurred. However, if
  # any message contains the keyword STANDBY or SLEEP, the device is in sleep
  # mode and was not queried.
  # The filter always yields exactly one line: "standby", "error" or "" - no
  # matter how many messages (including none) smartctl reported, and no matter
  # whether messages accompany an exit status of 0 or above 7.
  local query_state
  query_state=$(jq -r '
    .smartctl.exit_status as $exit_status
    | ([(.smartctl.messages // [])[]
        | (.string // "")
        | select(contains("STANDBY") or contains("SLEEP"))]
       | length > 0) as $sleep
    | if ($exit_status > 0 and $exit_status <= 7 and $sleep) then "standby"
      elif ($exit_status > 0 and $exit_status <= 7) then "error"
      else "" end' "$SMART_TEMP_FILE_NAME")

  if [[ "$query_state" == "error" ]]; then
    rm -f -- "$SMART_TEMP_FILE_NAME"
    echo "error"
    return
  fi

  # Create an Info Metric with all the device's static tags
  local info_metric="smart_disk_info;${common_tags}${delim}1"
  # Determine power status (drive active or in standby/sleep)
  local smart_power_status=1
  if [[ "$query_state" == "standby" ]]; then
    smart_power_status=0
  fi
  local smart_power_status_metric="${SMART_POWER_STATUS_METRIC_NAME};serial_number=${serial_number}${delim}${smart_power_status}"
  local disk_metrics="${smart_power_status_metric} ${info_metric}"

  # In standby only the power status and info metrics are returned.
  # Otherwise the SMART attributes are added depending on the drive type.
  if [[ -z "$query_state" ]]; then
    # Special metrics shared by all supported types:
    # Temperature, Power Cycle Count, Power on Time, Smart Status
    local additional_filter='
      "smart_device_temperature;serial_number=\($serial_number)\($delim)\(.temperature.current)",
      "smart_power_cycle_count;serial_number=\($serial_number)\($delim)\(.power_cycle_count)",
      "smart_power_on_time_hours;serial_number=\($serial_number)\($delim)\(.power_on_time.hours)",
      "smart_status_passed;serial_number=\($serial_number)\($delim)\(if .smart_status.passed then "1" else "0" end)"'
    local nvme_attributes temperature_sensors smart_attributes additional_metrics
    case "$device_type" in
      "nvme")
        nvme_attributes=$(jq -r --arg serial_number "$serial_number" --arg delim "$delim" '
          .nvme_smart_health_information_log
          | keys[] as $key
          | "smart_nvme_attribute;serial_number=\($serial_number);value_type=raw;attribute_name=\($key)\($delim)\(.[$key]|numbers)"' "$SMART_TEMP_FILE_NAME")
        temperature_sensors=$(jq -r --arg serial_number "$serial_number" --arg delim "$delim" '
          .nvme_smart_health_information_log.temperature_sensors
          | keys[] as $key
          | "smart_nvme_attribute;serial_number=\($serial_number);value_type=raw;attribute_name=temperature_sensor_\($key)\($delim)\(.[$key]|numbers)"' "$SMART_TEMP_FILE_NAME")
        additional_metrics=$(jq -r --arg serial_number "$serial_number" --arg delim "$delim" "$additional_filter" "$SMART_TEMP_FILE_NAME")
        disk_metrics="${nvme_attributes} ${temperature_sensors} ${additional_metrics} ${disk_metrics}"
        ;;
      "sat")
        smart_attributes=$(jq -r --arg serial_number "$serial_number" --arg delim "$delim" '
          .ata_smart_attributes.table[]
          | "smart_attribute;serial_number=\($serial_number);value_type=value;attribute_id=\(.id);attribute_name=\(.name)\($delim)\(.value)",
            "smart_attribute;serial_number=\($serial_number);value_type=raw;attribute_id=\(.id);attribute_name=\(.name)\($delim)\(.raw.value)",
            "smart_attribute;serial_number=\($serial_number);value_type=thresh;attribute_id=\(.id);attribute_name=\(.name)\($delim)\(.thresh)",
            "smart_attribute;serial_number=\($serial_number);value_type=worst;attribute_id=\(.id);attribute_name=\(.name)\($delim)\(.worst)"' "$SMART_TEMP_FILE_NAME")
        additional_metrics=$(jq -r --arg serial_number "$serial_number" --arg delim "$delim" "$additional_filter" "$SMART_TEMP_FILE_NAME")
        disk_metrics="${smart_attributes} ${additional_metrics} ${disk_metrics}"
        ;;
    esac
  fi

  # jq emits one metric per line; normalize all whitespace so that the result
  # is one line of space-separated metrics (without triggering globbing).
  local -a metric_words=()
  read -r -d '' -a metric_words <<< "$disk_metrics"
  printf '%s\n' "${metric_words[*]}"

  rm -f -- "$SMART_TEMP_FILE_NAME"
}
##
# Sends the metrics for all drives currently stored in global array
# METRICS to the Graphite server at DESTINATION on port PORT.
#
# Every stored metric has the form <name;tags><delimiter><value>. The first
# occurrence of $METRIC_NAME_VALUE_DELIMITER is replaced by a space and the
# current Unix timestamp is appended, one metric per line. All lines are
# piped into a single netcat invocation.
# Nothing is sent (netcat is not started) if METRICS is empty.
#
# Globals:
#   METRICS, DESTINATION, PORT, VERBOSE, NETCAT_FLAVOR,
#   METRIC_NAME_VALUE_DELIMITER (read)
##
function send_metrics {
  # current time in Unix timestamp format
  local timestamp
  timestamp=$(date +%s)
  local -a netcat_args=(-w 2)
  if [[ $VERBOSE -eq 1 ]]; then
    netcat_args=(-v "${netcat_args[@]}")
  fi
  # The GNU flavor requires the -c argument to terminate after sending the payload
  if [[ "$NETCAT_FLAVOR" == "GNU" ]]; then
    netcat_args=(-c "${netcat_args[@]}")
  fi
  # only send if there are actually any metrics available
  if (( ${#METRICS[@]} == 0 )); then
    return
  fi
  local drive metric formatted_metric
  local -a metrics_for_drive
  for drive in "${!METRICS[@]}"; do
    # Split on whitespace without globbing; stays empty if there are no
    # metrics for that drive (might be in standby)
    metrics_for_drive=()
    read -r -d '' -a metrics_for_drive <<< "${METRICS[$drive]}"
    for metric in "${metrics_for_drive[@]}"; do
      # Quoted pattern: the delimiter is replaced literally, not as a pattern
      formatted_metric=${metric/"$METRIC_NAME_VALUE_DELIMITER"/ }
      printf '%s %s\n' "$formatted_metric" "$timestamp"
    done
  done | nc "${DESTINATION}" "${PORT}" "${netcat_args[@]}"
}
##
# Main program loop.
#
# Validates the mandatory settings, detects the netcat flavor and the drives
# to monitor and then loops forever: gathers the metrics of every drive,
# sends them to Graphite and sleeps for $FREQUENCY seconds.
#
# Per drive and cycle:
#   - on "error" from get_smart_metrics the error is logged and the stored
#     metrics of the drive are left untouched;
#   - if the drive is in standby (power status metric has value 0) and
#     $OMIT_DRIVES_IN_STANDBY is 0, the previously stored metrics are kept and
#     only their power status metric is replaced by the current one; if no
#     metrics were stored yet, or $OMIT_DRIVES_IN_STANDBY is not 0, only the
#     metrics returned for the standby drive are stored;
#   - otherwise the freshly gathered metrics are stored.
#
# Terminates the script with exit status 1 if DESTINATION or HOSTNAME is
# empty or if both VERBOSE and QUIET are set.
##
function main() {
  log_verbose "Running SMART Graphite Exporter $VERSION"
  # Verify mandatory arguments
  if [[ -z "$DESTINATION" || -z "$HOSTNAME" ]]; then
    print_usage
    exit 1
  fi
  if [[ $VERBOSE -eq 1 && $QUIET -eq 1 ]]; then
    echo "Either set -v OR -q, not both!"
    print_usage
    exit 1
  fi
  detect_netcat_flavor
  # The host name is used unmodified (dots are kept)
  HOSTNAME_METRIC=$HOSTNAME
  log_verbose "Destination Server: $DESTINATION"
  log_verbose "Port: $PORT"
  log_verbose "Hostname: $HOSTNAME"
  log_verbose "Hostname in metrics: $HOSTNAME_METRIC"
  log_verbose "Frequency: $FREQUENCY"
  log_verbose "Verbose: $VERBOSE"
  log_verbose "Disable drive detection: $DISABLE_DRIVE_DETECTION"
  log_verbose "Manually specified drives: $(get_drives)"
  log_verbose "Omit device name from info metric: $OMIT_DEVICE_NAME_FROM_INFO_METRIC"
  log_verbose "Detected netcat utility flavor: $NETCAT_FLAVOR"
  # Identify drives if no drives were provided as arguments
  if [[ $DISABLE_DRIVE_DETECTION -eq 0 ]]; then
    detect_drives_smart
  fi
  detect_drives_zpool
  local drive
  for drive in $(get_drives); do
    log_verbose "Using drive ${drive} as ${DRIVES[$drive]} device"
  done
  log "Starting to send S.M.A.R.T. metrics with a frequency ${FREQUENCY} seconds: $(get_drives)"
  local smart_metrics power_status_metric metric updated_metrics
  local power_status_prefix="${SMART_POWER_STATUS_METRIC_NAME};"
  local -a metric_words
  # Drive SMART monitoring loop
  while true; do
    for drive in $(get_drives); do
      smart_metrics=$(get_smart_metrics "$drive")
      if [[ "$smart_metrics" == "error" ]]; then
        log "Error querying SMART attributes for drive ${drive}!"
        continue
      fi
      # Locate the power status metric among the whitespace-separated metrics
      metric_words=()
      read -r -d '' -a metric_words <<< "$smart_metrics"
      power_status_metric=""
      for metric in "${metric_words[@]}"; do
        if [[ "$metric" == "$power_status_prefix"* ]]; then
          power_status_metric="$metric"
          break
        fi
      done
      # If the power status metric has the value 0, the device is in standby
      if [[ "$power_status_metric" == *"${METRIC_NAME_VALUE_DELIMITER}0" ]]; then
        if [[ $OMIT_DRIVES_IN_STANDBY -ne 0 ]]; then
          log_verbose "Drive ${drive} is in standby, will not send any metrics this cycle"
          # Sets metrics to only include what get_smart_metrics returned for the standby drive
          METRICS[$drive]="$smart_metrics"
        elif [[ -z "${METRICS[$drive]}" ]]; then
          log_verbose "Drive ${drive} is in standby and no previous metrics are known, sending power status only"
          METRICS[$drive]="$smart_metrics"
        else
          log_verbose "Drive ${drive} is in standby, sending last known metrics"
          # Keep the last known metrics, only swap in the current power status
          metric_words=()
          read -r -d '' -a metric_words <<< "${METRICS[$drive]}"
          updated_metrics=""
          for metric in "${metric_words[@]}"; do
            if [[ "$metric" == "$power_status_prefix"* ]]; then
              metric="$power_status_metric"
            fi
            updated_metrics+="${updated_metrics:+ }${metric}"
          done
          METRICS[$drive]="$updated_metrics"
        fi
        continue
      fi
      # Drive is active: store the freshly gathered metrics
      log_verbose "Metrics for drive ${drive}: ${smart_metrics}"
      METRICS[$drive]="$smart_metrics"
    done
    # Send metrics to Graphite server
    send_metrics
    # Wait for next cycle
    sleep "$FREQUENCY"
  done
}
check_prerequisites
# Parse arguments
# Drives given via -m are only collected here and registered after all options
# have been parsed. register_drive reads HOSTNAME (-n), the manual device
# types (-t) and OMIT_DEVICE_NAME_FROM_INFO_METRIC (-o); registering while
# parsing would ignore any of these options given after -m.
declare -a MONITORED_DRIVES=()
while getopts "hd:p:n:vqof:cm:t:l:s:" opt; do
  case "${opt}" in
    d) DESTINATION="${OPTARG}"
      ;;
    f) FREQUENCY="${OPTARG}"
      ;;
    l) LOG_FILE="${OPTARG}"
      ;;
    m) MONITORED_DRIVES+=("${OPTARG}")
      DISABLE_DRIVE_DETECTION=1
      ;;
    n) HOSTNAME="${OPTARG}"
      ;;
    p) PORT="${OPTARG}"
      ;;
    c) OMIT_DRIVES_IN_STANDBY=0
      ;;
    s) SMART_TEMP_FILE_NAME="${OPTARG}"
      ;;
    t) set_manual_device_type "${OPTARG}"
      ;;
    o) OMIT_DEVICE_NAME_FROM_INFO_METRIC=1
      ;;
    q) QUIET=1
      ;;
    v) VERBOSE=1
      ;;
    h) print_usage; exit
      ;;
    # Unknown option or missing option argument: report failure to the caller
    \?) print_usage; exit 1
      ;;
  esac
done
# Register the manually specified drives now that all options are known
for monitored_drive in "${MONITORED_DRIVES[@]}"; do
  register_drive "${monitored_drive}"
done
main # Start main program