-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbtrfs_health.py
220 lines (185 loc) · 8.95 KB
/
btrfs_health.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""Helper routines for converting btrfs’ output into something processable. In
particular, it fulfills the need to decide whether a btrfs filesystem is
healthy or not. Unfortunately, btrfs’ userland utilities produce output that
is unstable in ordering, unstable in device naming, complex to parse, and not
accompanied by proper exit codes.
"""
import re, subprocess, logging, time, json
from pathlib import Path
def get_filesystems():
    """Return the mounted btrfs filesystems.

    The information is obtained by parsing the output of ``btrfs fi show
    --mounted``.  Blank lines separate the filesystems in that output; the last
    filesystem is returned even if no blank line follows it.

    :returns:
      All mounted btrfs filesystems, as a dictionary mapping the UUID to
      filesystem data.  The filesystem data is a dictionary mapping field names
      to values.  The fields are “label”, “number_devices” (an integer),
      “bytes_used” (a string, copied verbatim from btrfs’ output), and
      “devices”.  For the field “devices”, the value is a dictionary mapping
      the “devid” (a string of digits) to a dictionary with the fields “size”,
      “used”, and “path” (e.g. “/dev/sda”), all strings as printed by btrfs.
    :rtype: dict[str, dict[str, object]]

    :raises subprocess.CalledProcessError: if btrfs exits with a non-zero
      status
    :raises AssertionError: if btrfs writes anything to stderr (this check is
      skipped when Python runs with ``-O``)
    :raises ValueError: if a line of btrfs’ output has an unexpected format
    """
    btrfs = subprocess.run(["btrfs", "fi", "show", "--mounted"], check=True, capture_output=True, text=True)
    assert not btrfs.stderr, btrfs.stderr
    lines = iter(btrfs.stdout.splitlines())

    def expect(pattern, line):
        """Match `line` against `pattern`, failing loudly if it does not fit."""
        match = re.match(pattern, line)
        if match is None:
            raise ValueError(f"Cannot parse line of “btrfs fi show” output: {line!r}")
        return match

    def parse_filesystem(lines):
        """Consume one filesystem from `lines` and return ``(uuid, data)``.

        Raises `StopIteration` if there is no further filesystem.
        """
        # Skip separator lines in front of the header.  Running out of lines
        # here (and only here) means that the output is exhausted.
        header = next(lines)
        while not header.strip():
            header = next(lines)
        match = expect(r"Label: (?P<label>.+) uuid: (?P<uuid>[-0-9a-f]+)", header)
        uuid = match.group("uuid")
        data = {"label": match.group("label")}
        # A missing second line must not look like a regular end of output, so
        # substitute an empty line which then fails to parse.
        match = expect(r"\tTotal devices (?P<number_devices>\d+) FS bytes used (?P<bytes_used>.+)",
                       next(lines, ""))
        data["number_devices"] = int(match.group("number_devices"))
        data["bytes_used"] = match.group("bytes_used")
        devices = {}
        # A ``for`` loop ends normally when the output is exhausted, so the
        # last filesystem is kept even without a trailing blank line.
        for line in lines:
            if not line.strip():
                break
            if "*** Some devices missing" in line:
                continue
            match = expect(r"\tdevid\s* (?P<devid>\d+) size (?P<size>.+) used (?P<used>.+) "
                           r"path (?P<device_path>.*)", line)
            devices[match.group("devid")] = {"size": match.group("size"), "used": match.group("used"),
                                             "path": match.group("device_path")}
        data["devices"] = devices
        return uuid, data

    filesystems = {}
    try:
        while True:
            uuid, data = parse_filesystem(lines)
            filesystems[uuid] = data
    except StopIteration:
        pass
    return filesystems
def mounted_filesystem_ids():
    """Return equivalent IDs for all mounted btrfs filesystems.

    Unfortunately, btrfs filesystems may be identified by three IDs: (1) the
    UUID of the filesystem, (2) the path to the first device, (3) the mount
    point.  All three IDs are used in the btrfs userland tools in a … well …
    chaotic way.  This function makes translation possible by returning all IDs
    in triplets.

    The “first device” is the one with the numerically lowest devid.  The mount
    point is looked up in the output of ``lsblk --json --output
    PATH,MOUNTPOINT``.

    :returns:
      triplets of the form (UUID, device path, mount point) for all mounted
      btrfs filesystems
    :rtype: set[tuple[str, str, str]]

    :raises RuntimeError: if lsblk knows no mount point for the first device of
      a filesystem
    :raises subprocess.CalledProcessError: if lsblk exits with a non-zero
      status
    """
    filesystems = get_filesystems()
    lsblk = subprocess.run(["lsblk", "--json", "--output", "PATH,MOUNTPOINT"],
                           check=True, text=True, capture_output=True)
    mounts = {}

    def collect_mounts(block_devices):
        """Record path → mount point, descending into nested devices if any."""
        for device in block_devices:
            mounts[device["path"]] = device["mountpoint"]
            # Depending on the output mode, lsblk may nest partitions etc. in a
            # “children” list.  For flat output, this is a no-op.
            collect_mounts(device.get("children", ()))

    collect_mounts(json.loads(lsblk.stdout)["blockdevices"])
    filesystem_ids = set()
    for uuid, data in filesystems.items():
        devices = data["devices"]
        # Devids are strings of digits, so compare them numerically.
        device_path = devices[min(devices, key=int)]["path"]
        # lsblk reports ``null`` for devices that are not mounted; treat this
        # like an unknown device instead of handing out ``None`` as mount point.
        mount_point = mounts.get(device_path)
        if mount_point is None:
            raise RuntimeError(f"File system {uuid} is nowhere mounted with subvol=/")
        filesystem_ids.add((uuid, device_path, mount_point))
    return filesystem_ids
def get_errors(filesystems):
    """Return the error counts reported by “btrfs device stats”.

    For every device of every given filesystem, ``btrfs device stats <path>``
    is called once, and all ``…_errs`` counters in its output are summed up.
    This call is rather fast (< one second).

    :param filesystems: the filesystems to be checked, as returned by
      `get_filesystems`.

    :type filesystems: dict[str, dict[str, object]]

    :returns:
      The device status as a dictionary mapping the device path
      (e.g. “/dev/sda”) to the number of recorded errors.  A device for which
      btrfs prints no lines at all does not appear in the result.
    :rtype: dict[str, int]

    :raises subprocess.CalledProcessError: if btrfs exits with a non-zero
      status
    """
    counter_pattern = r"\[(?P<device>.+)\]\..+_errs\s+(?P<errors>\d+)"
    # Lazily walk over all device paths of all filesystems.
    device_paths = (device["path"]
                    for filesystem in filesystems.values()
                    for device in filesystem["devices"].values())
    error_counts = {}
    for device_path in device_paths:
        stats = subprocess.run(["btrfs", "device", "stats", device_path], check=True, capture_output=True,
                               text=True)
        for stats_line in stats.stdout.splitlines():
            counter = re.match(counter_pattern, stats_line)
            error_counts[device_path] = error_counts.get(device_path, 0) + int(counter.group("errors"))
    return error_counts
def read_scrub_status():
    """Read all scrub status files under ``/var/lib/btrfs``.

    Only files named ``scrub.status.<36 characters of hex digits and dashes>``
    are read.  The first line of each file is skipped; every further line
    describes one device as ``<uuid>:<devid>|<key>:<value>|<key>:<value>|…``.

    :return:
      The status of all ongoing, cancelled, or finished scrubs as a dictionary
      mapping UUID to a dictionary mapping the device ID to a dictionary
      mapping field names to values.  Important field names are “finished”,
      “canceled”, or “total_errors”.  The latter is the sum of all error
      counters of the device and the only integer value.
    :rtype: dict[str, dict[str, dict[str, (str or int)]]]
    """
    error_fields = ("read_errors", "csum_errors", "verify_errors", "super_errors",
                    "malloc_errors", "uncorrectable_errors", "corrected_errors")
    results = {}
    for path in Path("/var/lib/btrfs").glob("scrub.status.*"):
        if re.match(r"scrub\.status\.[-0-9a-f]{36}$", path.name) is None:
            # Do not read "…_tmp" files.
            continue
        with open(path) as status_file:
            # Everything but the first line is a device record.
            records = status_file.readlines()[1:]
        for record in records:
            device_id, *fields = record.rstrip().split("|")
            uuid, colon, devid = device_id.partition(":")
            assert colon
            device_data = {}
            results.setdefault(uuid, {})[devid] = device_data
            for field in fields:
                key, colon, value = field.partition(":")
                assert colon, field
                device_data[key] = value
            device_data["total_errors"] = sum(int(device_data[key]) for key in error_fields)
    return results
class ScrubCanceled(RuntimeError):
    """Raised by `scrub` if a cancellation was requested via ``scrub.cancel``."""
def scrub(uuids):
    """Scrub the given filesystems and return the errors that were detected.

    Any scrub still recorded as ongoing for the given filesystems is cancelled
    first.  Then “btrfs scrub start” is called for each of them, and the scrub
    status files are polled every five seconds until all devices are reported
    as finished.  This call is expensive (scrubbing of all devices).

    While this function is waiting, setting the attribute ``scrub.cancel`` to a
    true value makes it reset the attribute and raise `ScrubCanceled` at the
    next poll.  Whenever this function is left by an exception, the scrubs are
    cancelled before the exception is passed on.

    :param set[str] uuids: the uuids of the filesystems to be checked; they
      must be mounted

    :returns: The device status as a dictionary mapping the UUID of the
      filesystem to a dictionary mapping the device IDs (numbers starting at 1)
      to dictionaries mapping field names to values.  The most important field
      name is “total_errors” (calculated by `read_scrub_status` rather than
      coming from btrfs directly) which maps to an integer.  The result is
      everything `read_scrub_status` returns, so it may contain further
      filesystems besides the requested ones.
    :rtype: dict[str, dict[str, dict[str, (str or int)]]]

    :raises ScrubCanceled: if ``scrub.cancel`` was set during the scrub
    :raises AssertionError: if a device of the given filesystems is marked as
      cancelled in its status file
    """
    cancel_scrubs(uuids)
    try:
        for uuid, _device_path, mount_point in mounted_filesystem_ids():
            if uuid not in uuids:
                continue
            logging.debug(f"Launch scrub process for {mount_point}")
            subprocess.run(["btrfs", "scrub", "start", mount_point], check=True, stdout=subprocess.DEVNULL)
        while True:
            time.sleep(5)
            if scrub.cancel:
                scrub.cancel = False
                raise ScrubCanceled
            results = read_scrub_status()
            scrubbed_devices = (device
                                for uuid, devices in results.items() if uuid in uuids
                                for device in devices.values())
            all_finished = True
            for device in scrubbed_devices:
                if device["finished"] != "1":
                    all_finished = False
                # NOTE(review): checked for every device, not only for
                # unfinished ones – confirm that this is the intended nesting.
                assert device["canceled"] != "1"
            if all_finished:
                logging.debug("All scrubs finished")
                return results
    except BaseException:
        # Also covers KeyboardInterrupt & Co, so that no scrub is left running.
        cancel_scrubs(uuids)
        raise


# Flag for requesting the cancellation of a running `scrub` call.
scrub.cancel = False
def cancel_scrubs(uuids):
    """Cancel the scrubs of the given btrfs filesystems.

    If no scrub is ongoing for some or all of them, this is ignored.  The
    function polls the scrub status files once per second and returns as soon
    as no device of the given filesystems is recorded as neither finished nor
    cancelled.

    A status file may claim an ongoing scrub although none is running (e.g. if
    the scrub process was interrupted).  Such a file never changes, so a
    filesystem is no longer waited for once “btrfs scrub cancel” has exited
    with status 2 for it.

    :param set[str] uuids: the uuids of the filesystems the scrubs of which
      should be cancelled; they must be mounted

    :raises AssertionError: if “btrfs scrub cancel” exits with a status other
      than 0 or 2
    """
    # UUIDs for which btrfs has told us that there is nothing to cancel.
    not_running = set()
    while True:
        status = read_scrub_status()
        uncanceled_scrubs = {
            uuid for uuid, devices in status.items()
            if uuid in uuids and uuid not in not_running
            and any(device["canceled"] != "1" and device["finished"] != "1" for device in devices.values())}
        if not uncanceled_scrubs:
            break
        for uuid, _device_path, mount_point in mounted_filesystem_ids():
            if uuid not in uncanceled_scrubs:
                continue
            logging.debug(f"Cancel scrub for {mount_point}")
            process = subprocess.run(["btrfs", "scrub", "cancel", mount_point],
                                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            assert process.returncode in [0, 2], process.returncode
            if process.returncode == 2:
                # Presumably “no scrub running” (see btrfs-scrub documentation
                # – TODO confirm): the status file is stale, so waiting for it
                # to change would never end.
                not_running.add(uuid)
        time.sleep(1)