test-failure-policy

#!/usr/bin/env python3

# This file is part of Cockpit.
#
# Copyright (C) 2021 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

import argparse
import datetime
import fnmatch
import os
import re
import sys
import traceback

from lib.aio.jsonutil import JsonValue, get_str
from lib.constants import BOTS_DIR
from lib.testmap import get_test_image
from task import github

# Do not write .pyc files for source modules imported after this point.
sys.dont_write_bytecode = True


def main() -> int:
    """Classify the test failure output read from stdin.

    Parses the command line, reads the complete failure output from stdin
    and checks it against the known issues ("naughties") and the retry
    policy.

    Returns the process exit status:
      * 77 -- the output matches a known issue for the given image
      * 78 -- with --all: the output matches a known issue of another image
      * 1  -- the failure is attributed to the test harness or framework
      * 0  -- none of the above, or a RuntimeError was reported on stderr
    """
    script = os.path.basename(__file__)

    parser = argparse.ArgumentParser(description='Check a traceback for a known issue')
    parser.add_argument(
        '-o', "--offline", action='store_true',
        help="Work offline, don't fetch new data or contact servers",
    )
    parser.add_argument(
        '-a', '--all', action='store_true',
        help='Check for matching naughties in all other images',
    )
    parser.add_argument('image', help="The image to check against")
    opts = parser.parse_args()

    # No GitHub client at all in offline mode
    api = None if opts.offline else github.GitHub(repo="cockpit-project/bots")

    try:
        output = sys.stdin.read()
        # Empty input can never match a known issue
        number = check_known_issue(api, output, opts.image) if output else None

        if number:
            # Recording the occurrence is best effort and needs a token
            if api and api.token:
                try:
                    update_known_issue(api, number, output, opts.image)
                except (OSError, RuntimeError):
                    traceback.print_exc()
                    sys.stderr.write(f"{script}: posting update to GitHub failed\n")
                    # Fall through

            print(f"Known issue #{number}")
            return 77

        if opts.all:
            number, img = check_all_known_issues(api, output, opts.image)
            if number:
                print(f"Known issue #{number} in {img}")
                return 78

        if checkRetry(output):
            print("due to failure of test harness or framework")
            return 1

    except RuntimeError as ex:
        sys.stderr.write(f"{script}: {ex}\n")

    return 0


# -----------------------------------------------------------------------------
# Retry policy

def checkRetry(trace: str) -> bool:
    """Tell whether a failure should be blamed on the test harness.

    We check for persistent but test harness or framework specific
    failures that otherwise cause flakiness and false positives.

    The things we check here must:
     * have no impact on users of Cockpit in the real world
     * be things we tried to resolve in other ways. This is a last resort

    Returns True when the normalized trace contains one of the known
    harness failure messages, False otherwise.
    """
    harness_failures = (
        # HACK: Interacting with sshd during boot is not always predictable
        # We're using an implementation detail of the server as our "way in" for testing.
        # This often has to do with sshd being restarted for some reason
        "SSH master process exited with code: 255",

        # HACK: Intermittently the new libvirt machine won't get an IP address
        # or SSH will completely fail to start. We've tried various approaches
        # to minimize this, but it happens every 100,000 tests or so
        "Failure: Unable to reach machine ",

        # HACK: For when the verify machine runs out of available processes
        # We should retry this test process
        "self.pid = os.fork()\nOSError: [Errno 11] Resource temporarily unavailable",
    )

    normalized = normalize_traceback(trace)
    return any(message in normalized for message in harness_failures)


# -----------------------------------------------------------------------------
# Known Issue Matching and Filing


def check_all_known_issues(api: github.GitHub | None, trace: str, image: str) -> tuple[int | None, str | None]:
    """Look for a matching known issue among the naughties of all other images.

    Every entry of the "naughty" directory below BOTS_DIR is treated as an
    image name and checked with check_known_issue(), except for symlinks and
    the entry belonging to `image` itself.

    Args:
        api: Passed through to check_known_issue().
        trace: The failure output to match.
        image: The image that was already checked and is skipped here.

    Returns:
        A tuple (issue number, image name) for the first match found, or
        (None, None) when no other image has a matching known issue.
    """
    naughty_dir = os.path.join(BOTS_DIR, "naughty")

    # The image we already checked; the same for every directory entry
    checked_image = get_test_image(image)

    # The context manager closes the directory iterator even though we
    # leave the loop early on the first match
    with os.scandir(naughty_dir) as entries:
        for dir_entry in entries:
            # Skip symlinks so we don't check duplicates
            if dir_entry.is_symlink():
                continue

            img = dir_entry.name

            # Skip the image we already checked
            if img == checked_image:
                continue

            number = check_known_issue(api, trace, img)
            if number:
                return number, img

    return None, None


def normalize_traceback(trace: str) -> str:
    """Return `trace` with run-specific details replaced by fixed values.

    Surrounding whitespace is stripped first, then the substitutions below
    are applied in order.
    """
    substitutions = (
        # All file paths converted to basename
        (r'File "[^"]*/([^/"]+)"', 'File "\\1"'),

        # replace noise in SELinux violations
        (r'audit\([0-9.:]+\)', 'audit(0)'),
        (r'\b(pid|ino)=[0-9]+ ', r'\1=0 '),

        # in Python 3, testlib.Error is shown with namespace
        (r'testlib\.Error', 'Error'),
    )

    normalized = trace.strip()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized


def check_known_issue(api: github.GitHub | None, trace: str, image: str) -> int | None:
    """Match a failure output against the known issues of one image.

    The known issues are the files in the "naughty" directory of the image
    below BOTS_DIR. Each file is named "<issue number>-<name>" and contains
    a pattern which is matched against the normalized trace like a file name
    glob.

    Args:
        api: Not used by this function.
        trace: The failure output to match.
        image: The image name; resolved with get_test_image().

    Returns:
        The issue number of a matching file, or None when the image has no
        naughty directory, the trace is a pixel test report, or nothing
        matches. When several files match, the last one in os.listdir()
        order wins.
    """
    image_naughty = os.path.join(BOTS_DIR, "naughty", get_test_image(image))
    if not os.path.exists(image_naughty):
        return None

    trace = normalize_traceback(trace)

    # Pixel test reports are never treated as known issues
    pixel_patterns = ["Differences in pixel test", "New pixel test reference", "Unused reference image"]
    if any(pattern in trace for pattern in pixel_patterns):
        return None

    number = None
    for naughty in os.listdir(image_naughty):
        prefix, _, _name = naughty.partition("-")
        try:
            n = int(prefix)
        except ValueError:
            # A stray file must not abort the whole check: an uncaught
            # exception would end the process with exit status 1, which is
            # also the status main() uses to request a retry.
            sys.stderr.write(f"Ignoring naughty file without issue number: {os.path.join(image_naughty, naughty)}\n")
            continue

        # Decode independently of the locale of the calling environment
        with open(os.path.join(image_naughty, naughty), encoding="utf-8") as fp:
            match = "*" + normalize_traceback(fp.read()) + "*"
        # Match as in a file name glob, albeit multi line, and account for literal pastes with '[]'
        if fnmatch.fnmatchcase(trace, match) or fnmatch.fnmatchcase(trace, match.replace("[", "?")):
            number = n
    return number


# Update a known issue thread on GitHub
#
# The idea is to combine repeated errors into fewer comments by
# editing them and keeping all relevant information.
#
# For this we keep one comment per context (e.g. 'fedora-coreos')
# and divide that into sections, one each per error description / trace.
# In each section, we keep the error description / trace as well as
# the number of recorded events, the first occurrence and the last 10
# occurrences.
# For each (listed) occurrence we display the timestamp and revision
def update_known_issue(api: github.GitHub, number: int, err: str, context: str) -> JsonValue:
    """Record one more occurrence of a known issue in its GitHub issue thread.

    The newest comment whose body starts with "<context>\\n" is edited in
    place. Its body consists of a header followed by one section per error
    trace, separated by "<hr>". If a section already contains this error,
    its "Times recorded" counter is incremented, the occurrence is added to
    its list of latest occurrences (at most 10 are kept) and the section is
    moved to the end. Otherwise a new section is appended, dropping the
    oldest sections so that at most 10 traces remain. If no such comment
    exists, a new comment is posted.

    Args:
        api: GitHub client used to list, patch and post comments.
        number: Number of the issue to update.
        err: The raw failure output.
        context: The context the comment is kept for (e.g. 'fedora-coreos').

    Returns:
        The result of the api.patch() or api.post() call.

    Exceptions raised by the api calls are not handled here.
    """
    occ = datetime.datetime.now(tz=datetime.UTC).isoformat()

    revision = os.environ.get("TEST_REVISION", None)
    if revision:
        occ = f"{occ} | revision {revision}"

    comments = api.issue_comments(number)

    # try to find an existing comment to update; extract the traceback from the
    # whole output; also ensure to remove the "# duration: XXs" trailer
    err_key = normalize_traceback(err).strip()
    m = re.search("^(Traceback.*^not ok[^#\\n]*)", err_key, re.S | re.M)
    if m:
        err_key = m.group(1)
    comment_key = f"{context}\n"
    latest_occurrences = "Latest occurrences:\n\n"
    for comment in reversed(comments):
        body = get_str(comment, 'body', None)
        if body is not None and body.startswith(comment_key):
            parts = body.split("<hr>")
            updated = False
            for part_idx, part in enumerate(parts):
                part = normalize_traceback(part).strip()
                if err_key in part:
                    latest = part.split(latest_occurrences)
                    if len(latest) < 2:
                        sys.stderr.write("Error while parsing latest occurrences\n")
                    else:
                        # number of times this error was recorded
                        header = latest[0].split("\n")
                        for header_idx, entry in enumerate(header):
                            if entry.startswith("Times recorded: "):
                                rec_entries = entry.split(" ")
                                rec_entries[-1] = str(int(rec_entries[-1]) + 1)
                                header[header_idx] = " ".join(rec_entries)
                        latest[0] = "\n".join(header)
                        # list of recent occurrences
                        occurrences = [_f for _f in latest[1].split("\n") if _f]
                        occurrences.append(f"- {occ}\n")
                        # only keep the last 10, however many were listed before
                        del occurrences[:-10]
                        parts[part_idx] = f"{latest[0]}{latest_occurrences}" + '\n'.join(occurrences)
                        updated = True
                    # only the first section containing the error is considered
                    break

            if updated:
                # shuffle the updated part to the end
                assert len(parts) > part_idx
                parts.append(parts[part_idx])
                del parts[part_idx]

            else:
                # add a new part
                while len(parts) > 10:  # maximum 10 traces
                    # parts[0] is the header and updated parts move to the
                    # end, so parts[1] is the least recently updated trace
                    parts.pop(1)

                parts.append(f"""
```
{err.strip()}
```
First occurrence: {occ}
Times recorded: 1
{latest_occurrences}- {occ}
""")

            # update comment, no need to check others
            body = "<hr>\n".join(parts)

            # ensure that the body is not longer than 64k.
            # drop earlier parts if we need to, but never the header.
            while len(body) >= 65536 and len(parts) > 1:
                parts.pop(1)  # parts[0] is the header

                body = "<hr>\n".join(parts)

            return api.patch(f"issues/comments/{comment['id']}", {"body": body})

    # create a new comment, since we didn't find one to update

    data = {"body": f"""{context}\nOoops, it happened again<hr>
```
{err.strip()}
```
First occurrence: {occ}
Times recorded: 1
{latest_occurrences}- {occ}
"""}
    return api.post(f"issues/{number}/comments", data)


# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    sys.exit(main())