From 5d648e4a1190bf26ce3863bc5de393a418749b24 Mon Sep 17 00:00:00 2001
From: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
Date: Thu, 17 Oct 2024 17:19:42 +0200
Subject: [PATCH 1/3] Move consts to covered_files

Signed-off-by: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
---
 src/reuse/__init__.py      | 38 ----------------------------------
 src/reuse/cli/spdx.py      |  2 +-
 src/reuse/covered_files.py | 42 +++++++++++++++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/src/reuse/__init__.py b/src/reuse/__init__.py
index 416f7d944..8a9a0e741 100644
--- a/src/reuse/__init__.py
+++ b/src/reuse/__init__.py
@@ -22,7 +22,6 @@
 import gettext
 import logging
 import os
-import re
 from dataclasses import dataclass, field
 from enum import Enum
 from importlib.metadata import PackageNotFoundError, version
@@ -54,43 +53,6 @@
     _LOGGER.debug("no translations found at %s", _LOCALE_DIR)
 
 
-_IGNORE_DIR_PATTERNS = [
-    re.compile(r"^\.git$"),
-    re.compile(r"^\.hg$"),
-    re.compile(r"^\.sl$"),  # Used by Sapling SCM
-    re.compile(r"^LICENSES$"),
-    re.compile(r"^\.reuse$"),
-]
-
-_IGNORE_MESON_PARENT_DIR_PATTERNS = [
-    re.compile(r"^subprojects$"),
-]
-
-_IGNORE_FILE_PATTERNS = [
-    # LICENSE, LICENSE-MIT, LICENSE.txt
-    re.compile(r"^LICEN[CS]E([-\.].*)?$"),
-    re.compile(r"^COPYING([-\.].*)?$"),
-    # ".git" as file happens in submodules
-    re.compile(r"^\.git$"),
-    re.compile(r"^\.hgtags$"),
-    re.compile(r".*\.license$"),
-    re.compile(r"^REUSE\.toml$"),
-    # Workaround for https://github.com/fsfe/reuse-tool/issues/229
-    re.compile(r"^CAL-1.0(-Combined-Work-Exception)?(\..+)?$"),
-    re.compile(r"^SHL-2.1(\..+)?$"),
-]
-
-_IGNORE_SPDX_PATTERNS = [
-    # SPDX files from
-    # https://spdx.github.io/spdx-spec/conformance/#44-standard-data-format-requirements
-    re.compile(r".*\.spdx$"),
-    re.compile(r".*\.spdx.(rdf|json|xml|ya?ml)$"),
-]
-
-# Combine SPDX patterns into file patterns to ease default ignore usage
-_IGNORE_FILE_PATTERNS.extend(_IGNORE_SPDX_PATTERNS)
-
-
 class SourceType(Enum):
     """
     An enumeration representing the types of sources for license information.
diff --git a/src/reuse/cli/spdx.py b/src/reuse/cli/spdx.py
index 11a9933b5..1f29e4e0a 100644
--- a/src/reuse/cli/spdx.py
+++ b/src/reuse/cli/spdx.py
@@ -12,7 +12,7 @@
 
 import click
 
-from .. import _IGNORE_SPDX_PATTERNS
+from ..covered_files import _IGNORE_SPDX_PATTERNS
 from ..i18n import _
 from ..report import ProjectReport
 from .common import ClickObj
diff --git a/src/reuse/covered_files.py b/src/reuse/covered_files.py
index ad4e019b6..cbe143550 100644
--- a/src/reuse/covered_files.py
+++ b/src/reuse/covered_files.py
@@ -11,19 +11,51 @@
 import contextlib
 import logging
 import os
+import re
 from pathlib import Path
 from typing import Collection, Generator, Optional, cast
 
-from . import (
-    _IGNORE_DIR_PATTERNS,
-    _IGNORE_FILE_PATTERNS,
-    _IGNORE_MESON_PARENT_DIR_PATTERNS,
-)
 from .types import StrPath
 from .vcs import VCSStrategy
 
 _LOGGER = logging.getLogger(__name__)
 
+_IGNORE_DIR_PATTERNS = [
+    re.compile(r"^\.git$"),
+    re.compile(r"^\.hg$"),
+    re.compile(r"^\.sl$"),  # Used by Sapling SCM
+    re.compile(r"^LICENSES$"),
+    re.compile(r"^\.reuse$"),
+]
+
+_IGNORE_MESON_PARENT_DIR_PATTERNS = [
+    re.compile(r"^subprojects$"),
+]
+
+_IGNORE_FILE_PATTERNS = [
+    # LICENSE, LICENSE-MIT, LICENSE.txt
+    re.compile(r"^LICEN[CS]E([-\.].*)?$"),
+    re.compile(r"^COPYING([-\.].*)?$"),
+    # ".git" as file happens in submodules
+    re.compile(r"^\.git$"),
+    re.compile(r"^\.hgtags$"),
+    re.compile(r".*\.license$"),
+    re.compile(r"^REUSE\.toml$"),
+    # Workaround for https://github.com/fsfe/reuse-tool/issues/229
+    re.compile(r"^CAL-1.0(-Combined-Work-Exception)?(\..+)?$"),
+    re.compile(r"^SHL-2.1(\..+)?$"),
+]
+
+_IGNORE_SPDX_PATTERNS = [
+    # SPDX files from
+    # https://spdx.github.io/spdx-spec/conformance/#44-standard-data-format-requirements
+    re.compile(r".*\.spdx$"),
+    re.compile(r".*\.spdx.(rdf|json|xml|ya?ml)$"),
+]
+
+# Combine SPDX patterns into file patterns to ease default ignore usage
+_IGNORE_FILE_PATTERNS.extend(_IGNORE_SPDX_PATTERNS)
+
 
 def is_path_ignored(
     path: Path,

From 9582ce40a594d62612c2094f13ce37a738b26f61 Mon Sep 17 00:00:00 2001
From: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
Date: Thu, 24 Oct 2024 10:34:17 +0200
Subject: [PATCH 2/3] Split _util into extract and copyright

Kind of. This commit is a bit of a mess, but will help a lot in the
future. All stuff related to the extraction of information out of files
is now in its own module, and all stuff related to the parsing and
handling of copyright lines is also in its own module.

The problem caused by this commit is that the tests are incomplete.
Previously, not all functions in _util were tested, on the understanding
that they were just small helper functions for _actual_ functions that
are under test. Now, there are public functions in public modules that
are not tested, and by chance, all of the remaining _util is currently
untested.

Signed-off-by: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
---
 src/reuse/__init__.py                   |   3 +
 src/reuse/_annotate.py                  |   7 +-
 src/reuse/_util.py                      | 367 +-----------------------
 src/reuse/cli/annotate.py               |   8 +-
 src/reuse/cli/common.py                 |   2 +-
 src/reuse/copyright.py                  | 122 ++++++++
 src/reuse/download.py                   |   3 +-
 src/reuse/extract.py                    | 282 ++++++++++++++++++
 src/reuse/global_licensing.py           |   3 +-
 src/reuse/header.py                     |   7 +-
 src/reuse/project.py                    |   8 +-
 src/reuse/report.py                     |   5 +-
 tests/test_cli_annotate.py              |   2 +-
 tests/test_copyright.py                 | 135 +++++++++
 tests/{test_util.py => test_extract.py} | 198 +++----------
 tests/test_global_licensing.py          |   3 +-
 tests/test_project.py                   |   3 +-
 17 files changed, 602 insertions(+), 556 deletions(-)
 create mode 100644 src/reuse/copyright.py
 create mode 100644 src/reuse/extract.py
 create mode 100644 tests/test_copyright.py
 rename tests/{test_util.py => test_extract.py} (62%)

diff --git a/src/reuse/__init__.py b/src/reuse/__init__.py
index 8a9a0e741..720c52710 100644
--- a/src/reuse/__init__.py
+++ b/src/reuse/__init__.py
@@ -28,6 +28,7 @@
 from typing import Any, Optional
 
 from boolean.boolean import Expression
+from license_expression import Licensing
 
 try:
     __version__ = version("reuse")
@@ -42,6 +43,8 @@
 
 _LOGGER = logging.getLogger(__name__)
 
+_LICENSING = Licensing()
+
 _PACKAGE_PATH = os.path.dirname(__file__)
 _LOCALE_DIR = os.path.join(_PACKAGE_PATH, "locale")
 
diff --git a/src/reuse/_annotate.py b/src/reuse/_annotate.py
index 564c0341b..2c49d6ac2 100644
--- a/src/reuse/_annotate.py
+++ b/src/reuse/_annotate.py
@@ -23,11 +23,7 @@
 from jinja2.exceptions import TemplateNotFound
 
 from . import ReuseInfo
-from ._util import (
-    _determine_license_suffix_path,
-    contains_reuse_info,
-    detect_line_endings,
-)
+from ._util import _determine_license_suffix_path
 from .comment import (
     NAME_STYLE_MAP,
     CommentStyle,
@@ -35,6 +31,7 @@
     get_comment_style,
 )
 from .exceptions import CommentCreateError, MissingReuseInfoError
+from .extract import contains_reuse_info, detect_line_endings
 from .header import add_new_header, find_and_replace_header
 from .i18n import _
 from .project import Project
diff --git a/src/reuse/_util.py b/src/reuse/_util.py
index 04af137d5..fc91d76e7 100644
--- a/src/reuse/_util.py
+++ b/src/reuse/_util.py
@@ -16,22 +16,13 @@
 
 import logging
 import os
-import re
 import shutil
 import subprocess
-from collections import Counter
 from hashlib import sha1
 from inspect import cleandoc
-from itertools import chain
 from pathlib import Path
-from typing import IO, Any, BinaryIO, Iterator, Optional, Union
+from typing import IO, Any, Optional, Union
 
-from boolean.boolean import ParseError
-from license_expression import ExpressionError, Licensing
-
-from . import ReuseInfo, SourceType
-from .comment import _all_style_classes  # TODO: This import is not ideal here.
-from .i18n import _
 from .types import StrPath
 
 GIT_EXE = shutil.which("git")
@@ -39,93 +30,9 @@
 JUJUTSU_EXE = shutil.which("jj")
 PIJUL_EXE = shutil.which("pijul")
 
-REUSE_IGNORE_START = "REUSE-IgnoreStart"
-REUSE_IGNORE_END = "REUSE-IgnoreEnd"
-
-SPDX_SNIPPET_INDICATOR = b"SPDX-SnippetBegin"
-
-_LOGGER = logging.getLogger(__name__)
-_LICENSING = Licensing()
 
 # REUSE-IgnoreStart
 
-_END_PATTERN = r"{}$".format(
-    "".join(
-        {
-            r"(?:{})*".format(item)  # pylint: disable=consider-using-f-string
-            for item in chain(
-                (
-                    re.escape(style.MULTI_LINE.end)
-                    for style in _all_style_classes()
-                    if style.MULTI_LINE.end
-                ),
-                # These are special endings which do not belong to specific
-                # comment styles, but which we want to nonetheless strip away
-                # while parsing.
-                (
-                    ending
-                    for ending in [
-                        # ex: <tag value="Copyright Jane Doe">
-                        r'"\s*/*>',
-                        r"'\s*/*>",
-                        # ex: [SPDX-License-Identifier: GPL-3.0-or-later] ::
-                        r"\]\s*::",
-                    ]
-                ),
-            )
-        }
-    )
-)
-_LICENSE_IDENTIFIER_PATTERN = re.compile(
-    r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
-)
-_CONTRIBUTOR_PATTERN = re.compile(
-    r"^(.*?)SPDX-FileContributor:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
-)
-# The keys match the relevant attributes of ReuseInfo.
-_SPDX_TAGS: dict[str, re.Pattern] = {
-    "spdx_expressions": _LICENSE_IDENTIFIER_PATTERN,
-    "contributor_lines": _CONTRIBUTOR_PATTERN,
-}
-
-_COPYRIGHT_PATTERNS = [
-    re.compile(
-        r"(?P<copyright>(?P<prefix>SPDX-(File|Snippet)CopyrightText:"
-        r"(\s(\([Cc]\)|©|Copyright(\s(©|\([Cc]\)))?))?)\s+"
-        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*?))" + _END_PATTERN
-    ),
-    re.compile(
-        r"(?P<copyright>(?P<prefix>Copyright(\s(\([Cc]\)|©))?)\s+"
-        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*?))" + _END_PATTERN
-    ),
-    re.compile(
-        r"(?P<copyright>(?P<prefix>©)\s+"
-        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*?))" + _END_PATTERN
-    ),
-]
-_COPYRIGHT_PREFIXES = {
-    "spdx": "SPDX-FileCopyrightText:",
-    "spdx-c": "SPDX-FileCopyrightText: (C)",
-    "spdx-string-c": "SPDX-FileCopyrightText: Copyright (C)",
-    "spdx-string": "SPDX-FileCopyrightText: Copyright",
-    "spdx-string-symbol": "SPDX-FileCopyrightText: Copyright ©",
-    "spdx-symbol": "SPDX-FileCopyrightText: ©",
-    "string": "Copyright",
-    "string-c": "Copyright (C)",
-    "string-symbol": "Copyright ©",
-    "symbol": "©",
-}
-
-_LICENSEREF_PATTERN = re.compile("LicenseRef-[a-zA-Z0-9-.]+$")
-
-# Amount of bytes that we assume will be big enough to contain the entire
-# comment header (including SPDX tags), so that we don't need to read the
-# entire file.
-_HEADER_BYTES = 4096
-
 
 def setup_logging(level: int = logging.WARNING) -> None:
     """Configure logging for reuse.
@@ -191,22 +98,6 @@ def find_licenses_directory(root: Optional[StrPath] = None) -> Path:
     return licenses_path
 
 
-def decoded_text_from_binary(
-    binary_file: BinaryIO, size: Optional[int] = None
-) -> str:
-    """Given a binary file object, detect its encoding and return its contents
-    as a decoded string. Do not throw any errors if the encoding contains
-    errors:  Just replace the false characters.
-
-    If *size* is specified, only read so many bytes.
-    """
-    if size is None:
-        size = -1
-    rawdata = binary_file.read(size)
-    result = rawdata.decode("utf-8", errors="replace")
-    return result.replace("\r\n", "\n")
-
-
 def _determine_license_path(path: StrPath) -> Path:
     """Given a path FILE, return FILE.license if it exists, otherwise return
     FILE.
@@ -225,169 +116,6 @@ def _determine_license_suffix_path(path: StrPath) -> Path:
     return Path(f"{path}.license")
 
 
-def _parse_copyright_year(year: Optional[str]) -> list[str]:
-    """Parse copyright years and return list."""
-    ret: list[str] = []
-    if not year:
-        return ret
-    if re.match(r"\d{4}$", year):
-        ret = [year]
-    elif re.match(r"\d{4} ?- ?\d{4}$", year):
-        ret = [year[:4], year[-4:]]
-    return ret
-
-
-def _contains_snippet(binary_file: BinaryIO) -> bool:
-    """Check if a file seems to contain a SPDX snippet"""
-    # Assumes that if SPDX_SNIPPET_INDICATOR (SPDX-SnippetBegin) is found in a
-    # file, the file contains a snippet
-    content = binary_file.read()
-    if SPDX_SNIPPET_INDICATOR in content:
-        return True
-    return False
-
-
-def merge_copyright_lines(copyright_lines: set[str]) -> set[str]:
-    """Parse all copyright lines and merge identical statements making years
-    into a range.
-
-    If a same statement uses multiple prefixes, use only the most frequent one.
-    """
-    # pylint: disable=too-many-locals
-    # TODO: Rewrite this function. It's a bit of a mess.
-    copyright_in = []
-    for line in copyright_lines:
-        for pattern in _COPYRIGHT_PATTERNS:
-            match = pattern.search(line)
-            if match is not None:
-                copyright_in.append(
-                    {
-                        "statement": match.groupdict()["statement"],
-                        "year": _parse_copyright_year(
-                            match.groupdict()["year"]
-                        ),
-                        "prefix": match.groupdict()["prefix"],
-                    }
-                )
-                break
-
-    copyright_out = set()
-    for line_info in copyright_in:
-        statement = str(line_info["statement"])
-        copyright_list = [
-            item for item in copyright_in if item["statement"] == statement
-        ]
-
-        # Get the most common prefix.
-        most_common = str(
-            Counter([item["prefix"] for item in copyright_list]).most_common(1)[
-                0
-            ][0]
-        )
-        prefix = "spdx"
-        for key, value in _COPYRIGHT_PREFIXES.items():
-            if most_common == value:
-                prefix = key
-                break
-
-        # get year range if any
-        years: list[str] = []
-        for copy in copyright_list:
-            years += copy["year"]
-
-        year: Optional[str] = None
-        if years:
-            if min(years) == max(years):
-                year = min(years)
-            else:
-                year = f"{min(years)} - {max(years)}"
-
-        copyright_out.add(make_copyright_line(statement, year, prefix))
-    return copyright_out
-
-
-def extract_reuse_info(text: str) -> ReuseInfo:
-    """Extract REUSE information from comments in a string.
-
-    Raises:
-        ExpressionError: if an SPDX expression could not be parsed.
-        ParseError: if an SPDX expression could not be parsed.
-    """
-    text = filter_ignore_block(text)
-    spdx_tags: dict[str, set[str]] = {}
-    for tag, pattern in _SPDX_TAGS.items():
-        spdx_tags[tag] = set(find_spdx_tag(text, pattern))
-    # License expressions and copyright matches are special cases.
-    expressions = set()
-    copyright_matches = set()
-    for expression in spdx_tags.pop("spdx_expressions"):
-        try:
-            expressions.add(_LICENSING.parse(expression))
-        except (ExpressionError, ParseError):
-            _LOGGER.error(
-                _("Could not parse '{expression}'").format(
-                    expression=expression
-                )
-            )
-            raise
-    for line in text.splitlines():
-        for pattern in _COPYRIGHT_PATTERNS:
-            match = pattern.search(line)
-            if match is not None:
-                copyright_matches.add(match.groupdict()["copyright"].strip())
-                break
-
-    return ReuseInfo(
-        spdx_expressions=expressions,
-        copyright_lines=copyright_matches,
-        **spdx_tags,  # type: ignore
-    )
-
-
-def reuse_info_of_file(
-    path: StrPath, original_path: StrPath, root: StrPath
-) -> ReuseInfo:
-    """Open *path* and return its :class:`ReuseInfo`.
-
-    Normally only the first few :const:`_HEADER_BYTES` are read. But if a
-    snippet was detected, the entire file is read.
-    """
-    path = Path(path)
-    with path.open("rb") as fp:
-        try:
-            read_limit: Optional[int] = _HEADER_BYTES
-            # Completely read the file once
-            # to search for possible snippets
-            if _contains_snippet(fp):
-                _LOGGER.debug(f"'{path}' seems to contain an SPDX Snippet")
-                read_limit = None
-            # Reset read position
-            fp.seek(0)
-            # Scan the file for REUSE info, possibly limiting the read
-            # length
-            file_result = extract_reuse_info(
-                decoded_text_from_binary(fp, size=read_limit)
-            )
-            if file_result.contains_copyright_or_licensing():
-                source_type = SourceType.FILE_HEADER
-                if path.suffix == ".license":
-                    source_type = SourceType.DOT_LICENSE
-                return file_result.copy(
-                    path=relative_from_root(original_path, root).as_posix(),
-                    source_path=relative_from_root(path, root).as_posix(),
-                    source_type=source_type,
-                )
-
-        except (ExpressionError, ParseError):
-            _LOGGER.error(
-                _(
-                    "'{path}' holds an SPDX expression that cannot be"
-                    " parsed, skipping the file"
-                ).format(path=path)
-            )
-    return ReuseInfo()
-
-
 def relative_from_root(path: StrPath, root: StrPath) -> Path:
     """A helper function to get *path* relative to *root*."""
     path = Path(path)
@@ -397,88 +125,6 @@ def relative_from_root(path: StrPath, root: StrPath) -> Path:
         return Path(os.path.relpath(path, start=root))
 
 
-def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]:
-    """Extract all the values in *text* matching *pattern*'s regex, taking care
-    of stripping extraneous whitespace of formatting.
-    """
-    for prefix, value in pattern.findall(text):
-        prefix, value = prefix.strip(), value.strip()
-
-        # Some comment headers have ASCII art to "frame" the comment, like this:
-        #
-        # /***********************\
-        # |*  This is a comment  *|
-        # \***********************/
-        #
-        # To ensure we parse them correctly, if the line ends with the inverse
-        # of the comment prefix, we strip that suffix. See #343 for a real
-        # world example of a project doing this (LLVM).
-        suffix = prefix[::-1]
-        if suffix and value.endswith(suffix):
-            value = value[: -len(suffix)]
-
-        yield value.strip()
-
-
-def filter_ignore_block(text: str) -> str:
-    """Filter out blocks beginning with REUSE_IGNORE_START and ending with
-    REUSE_IGNORE_END to remove lines that should not be treated as copyright and
-    licensing information.
-    """
-    ignore_start = None
-    ignore_end = None
-    if REUSE_IGNORE_START in text:
-        ignore_start = text.index(REUSE_IGNORE_START)
-    if REUSE_IGNORE_END in text:
-        ignore_end = text.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END)
-    if not ignore_start:
-        return text
-    if not ignore_end:
-        return text[:ignore_start]
-    if ignore_end > ignore_start:
-        return text[:ignore_start] + filter_ignore_block(text[ignore_end:])
-    rest = text[ignore_start + len(REUSE_IGNORE_START) :]
-    if REUSE_IGNORE_END in rest:
-        ignore_end = rest.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END)
-        return text[:ignore_start] + filter_ignore_block(rest[ignore_end:])
-    return text[:ignore_start]
-
-
-def contains_reuse_info(text: str) -> bool:
-    """The text contains REUSE info."""
-    try:
-        return bool(extract_reuse_info(text))
-    except (ExpressionError, ParseError):
-        return False
-
-
-def make_copyright_line(
-    statement: str, year: Optional[str] = None, copyright_prefix: str = "spdx"
-) -> str:
-    """Given a statement, prefix it with ``SPDX-FileCopyrightText:`` if it is
-    not already prefixed with some manner of copyright tag.
-    """
-    if "\n" in statement:
-        raise RuntimeError(f"Unexpected newline in '{statement}'")
-
-    prefix = _COPYRIGHT_PREFIXES.get(copyright_prefix)
-    if prefix is None:
-        # TODO: Maybe translate this. Also maybe reduce DRY here.
-        raise RuntimeError(
-            "Unexpected copyright prefix: Need 'spdx', 'spdx-c', "
-            "'spdx-symbol', 'string', 'string-c', "
-            "'string-symbol', or 'symbol'"
-        )
-
-    for pattern in _COPYRIGHT_PATTERNS:
-        match = pattern.search(statement)
-        if match is not None:
-            return statement
-    if year is not None:
-        return f"{prefix} {year} {statement}"
-    return f"{prefix} {statement}"
-
-
 def _checksum(path: StrPath) -> str:
     path = Path(path)
 
@@ -490,17 +136,6 @@ def _checksum(path: StrPath) -> str:
     return file_sha1.hexdigest()
 
 
-def detect_line_endings(text: str) -> str:
-    """Return one of '\n', '\r' or '\r\n' depending on the line endings used in
-    *text*. Return os.linesep if there are no line endings.
-    """
-    line_endings = ["\r\n", "\r", "\n"]
-    for line_ending in line_endings:
-        if line_ending in text:
-            return line_ending
-    return os.linesep
-
-
 def cleandoc_nl(text: str) -> str:
     """Like :func:`inspect.cleandoc`, but with a newline at the end."""
     return cleandoc(text) + "\n"
diff --git a/src/reuse/cli/annotate.py b/src/reuse/cli/annotate.py
index 4d3dd3904..ff40bfd01 100644
--- a/src/reuse/cli/annotate.py
+++ b/src/reuse/cli/annotate.py
@@ -29,12 +29,7 @@
 
 from .. import ReuseInfo
 from .._annotate import add_header_to_file
-from .._util import (
-    _COPYRIGHT_PREFIXES,
-    _determine_license_path,
-    _determine_license_suffix_path,
-    make_copyright_line,
-)
+from .._util import _determine_license_path, _determine_license_suffix_path
 from ..comment import (
     NAME_STYLE_MAP,
     CommentStyle,
@@ -42,6 +37,7 @@
     has_style,
     is_uncommentable,
 )
+from ..copyright import _COPYRIGHT_PREFIXES, make_copyright_line
 from ..i18n import _
 from ..project import Project
 from .common import ClickObj, MutexOption, spdx_identifier
diff --git a/src/reuse/cli/common.py b/src/reuse/cli/common.py
index c2ce22132..0345cf24f 100644
--- a/src/reuse/cli/common.py
+++ b/src/reuse/cli/common.py
@@ -12,7 +12,7 @@
 from boolean.boolean import Expression, ParseError
 from license_expression import ExpressionError
 
-from .._util import _LICENSING
+from .. import _LICENSING
 from ..exceptions import GlobalLicensingConflictError, GlobalLicensingParseError
 from ..i18n import _
 from ..project import Project
diff --git a/src/reuse/copyright.py b/src/reuse/copyright.py
new file mode 100644
index 000000000..4f4df9ff2
--- /dev/null
+++ b/src/reuse/copyright.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: 2024 Free Software Foundation Europe e.V. <https://fsfe.org>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Utilities related to the parsing and storing of copyright notices."""
+
+import re
+from collections import Counter
+from typing import Optional
+
+from .extract import _COPYRIGHT_PATTERNS  # TODO: Get rid of this import.
+
+_COPYRIGHT_PREFIXES = {
+    "spdx": "SPDX-FileCopyrightText:",
+    "spdx-c": "SPDX-FileCopyrightText: (C)",
+    "spdx-string-c": "SPDX-FileCopyrightText: Copyright (C)",
+    "spdx-string": "SPDX-FileCopyrightText: Copyright",
+    "spdx-string-symbol": "SPDX-FileCopyrightText: Copyright ©",
+    "spdx-symbol": "SPDX-FileCopyrightText: ©",
+    "string": "Copyright",
+    "string-c": "Copyright (C)",
+    "string-symbol": "Copyright ©",
+    "symbol": "©",
+}
+
+
+def merge_copyright_lines(copyright_lines: set[str]) -> set[str]:
+    """Parse all copyright lines and merge identical statements making years
+    into a range.
+
+    If one statement uses multiple prefixes, use only the most frequent one.
+    """
+    # pylint: disable=too-many-locals
+    # TODO: Rewrite this function. It's a bit of a mess.
+    copyright_in = []
+    for line in copyright_lines:
+        for pattern in _COPYRIGHT_PATTERNS:
+            match = pattern.search(line)
+            if match is not None:
+                copyright_in.append(
+                    {
+                        "statement": match.groupdict()["statement"],
+                        "year": _parse_copyright_year(
+                            match.groupdict()["year"]
+                        ),
+                        "prefix": match.groupdict()["prefix"],
+                    }
+                )
+                break
+
+    copyright_out = set()
+    for line_info in copyright_in:
+        statement = str(line_info["statement"])
+        copyright_list = [
+            item for item in copyright_in if item["statement"] == statement
+        ]
+
+        # Get the most common prefix.
+        most_common = str(
+            Counter([item["prefix"] for item in copyright_list]).most_common(1)[
+                0
+            ][0]
+        )
+        prefix = "spdx"
+        for key, value in _COPYRIGHT_PREFIXES.items():
+            if most_common == value:
+                prefix = key
+                break
+
+        # get year range if any
+        years: list[str] = []
+        for copy in copyright_list:
+            years += copy["year"]
+
+        year: Optional[str] = None
+        if years:
+            if min(years) == max(years):
+                year = min(years)
+            else:
+                year = f"{min(years)} - {max(years)}"
+
+        copyright_out.add(make_copyright_line(statement, year, prefix))
+    return copyright_out
+
+
+def make_copyright_line(
+    statement: str, year: Optional[str] = None, copyright_prefix: str = "spdx"
+) -> str:
+    """Given a statement, prefix it with ``SPDX-FileCopyrightText:`` if it is
+    not already prefixed with some manner of copyright tag.
+    """
+    if "\n" in statement:
+        raise RuntimeError(f"Unexpected newline in '{statement}'")
+
+    prefix = _COPYRIGHT_PREFIXES.get(copyright_prefix)
+    if prefix is None:
+        # TODO: Maybe translate this. Also maybe reduce repetition (DRY) here.
+        raise RuntimeError(
+            "Unexpected copyright prefix: Need 'spdx', 'spdx-c', "
+            "'spdx-symbol', 'string', 'string-c', "
+            "'string-symbol', or 'symbol'"
+        )
+
+    for pattern in _COPYRIGHT_PATTERNS:
+        match = pattern.search(statement)
+        if match is not None:
+            return statement
+    if year is not None:
+        return f"{prefix} {year} {statement}"
+    return f"{prefix} {statement}"
+
+
+def _parse_copyright_year(year: Optional[str]) -> list[str]:
+    """Parse a copyright year or year range and return the years as a list."""
+    ret: list[str] = []
+    if not year:
+        return ret
+    if re.match(r"\d{4}$", year):
+        ret = [year]
+    elif re.match(r"\d{4} ?- ?\d{4}$", year):
+        ret = [year[:4], year[-4:]]
+    return ret
diff --git a/src/reuse/download.py b/src/reuse/download.py
index 06ef1c944..677a0d094 100644
--- a/src/reuse/download.py
+++ b/src/reuse/download.py
@@ -15,7 +15,8 @@
 from urllib.error import URLError
 from urllib.parse import urljoin
 
-from ._util import _LICENSEREF_PATTERN, find_licenses_directory
+from ._util import find_licenses_directory
+from .extract import _LICENSEREF_PATTERN
 from .project import Project
 from .types import StrPath
 from .vcs import VCSStrategyNone
diff --git a/src/reuse/extract.py b/src/reuse/extract.py
new file mode 100644
index 000000000..34bcd0faf
--- /dev/null
+++ b/src/reuse/extract.py
@@ -0,0 +1,282 @@
+# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
+# SPDX-FileCopyrightText: 2020 Tuomas Siipola <tuomas@zpl.fi>
+# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
+# SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
+# SPDX-FileCopyrightText: 2023 DB Systel GmbH
+# SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl <johannes@zarl-zierl.at>
+# SPDX-FileCopyrightText: 2024 Rivos Inc.
+# SPDX-FileCopyrightText: 2024 Skyler Grey <sky@a.starrysky.fyi>
+# SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Utilities related to the extraction of REUSE information out of files."""
+
+import logging
+import os
+import re
+from itertools import chain
+from pathlib import Path
+from typing import BinaryIO, Iterator, Optional
+
+from boolean.boolean import ParseError
+from license_expression import ExpressionError
+
+from . import _LICENSING, ReuseInfo, SourceType
+from ._util import relative_from_root
+from .comment import _all_style_classes
+from .i18n import _
+from .types import StrPath
+
+REUSE_IGNORE_START = "REUSE-IgnoreStart"
+REUSE_IGNORE_END = "REUSE-IgnoreEnd"
+
+# REUSE-IgnoreStart
+
+SPDX_SNIPPET_INDICATOR = b"SPDX-SnippetBegin"
+
+_LOGGER = logging.getLogger(__name__)
+
+# Matches a sequence of known comment terminators at the end of a line.
+# dict.fromkeys() deduplicates like a set but keeps insertion order; a set of
+# strings iterates in a hash-dependent order that can change between runs.
+_END_PATTERN = r"{}$".format(
+    "".join(
+        dict.fromkeys(
+            r"(?:{})*".format(item)  # pylint: disable=consider-using-f-string
+            for item in chain(
+                (
+                    re.escape(style.MULTI_LINE.end)
+                    for style in _all_style_classes()
+                    if style.MULTI_LINE.end
+                ),
+                # These are special endings which do not belong to specific
+                # comment styles, but which we want to nonetheless strip away
+                # while parsing.
+                [
+                    # ex: <tag value="Copyright Jane Doe">
+                    r'"\s*/*>',
+                    r"'\s*/*>",
+                    # ex: [SPDX-License-Identifier: GPL-3.0-or-later] ::
+                    r"\]\s*::",
+                ],
+            )
+        )
+    )
+)
+_LICENSE_IDENTIFIER_PATTERN = re.compile(
+    r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
+)
+_CONTRIBUTOR_PATTERN = re.compile(
+    r"^(.*?)SPDX-FileContributor:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
+)
+# The keys match the relevant attributes of ReuseInfo.
+_SPDX_TAGS: dict[str, re.Pattern] = {
+    "spdx_expressions": _LICENSE_IDENTIFIER_PATTERN,
+    "contributor_lines": _CONTRIBUTOR_PATTERN,
+}
+
+_COPYRIGHT_PATTERNS = [
+    re.compile(
+        r"(?P<copyright>(?P<prefix>SPDX-(File|Snippet)CopyrightText:"
+        r"(\s(\([Cc]\)|©|Copyright(\s(©|\([Cc]\)))?))?)\s+"
+        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*?))" + _END_PATTERN
+    ),
+    re.compile(
+        r"(?P<copyright>(?P<prefix>Copyright(\s(\([Cc]\)|©))?)\s+"
+        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*?))" + _END_PATTERN
+    ),
+    re.compile(
+        r"(?P<copyright>(?P<prefix>©)\s+"
+        r"((?P<year>\d{4} ?- ?\d{4}|\d{4}),?\s+)?"
+        r"(?P<statement>.*?))" + _END_PATTERN
+    ),
+]
+
+_LICENSEREF_PATTERN = re.compile("LicenseRef-[a-zA-Z0-9-.]+$")
+
+# Amount of bytes that we assume will be big enough to contain the entire
+# comment header (including SPDX tags), so that we don't need to read the
+# entire file.
+_HEADER_BYTES = 4096
+
+
+def decoded_text_from_binary(
+    binary_file: BinaryIO, size: Optional[int] = None
+) -> str:
+    """Given a binary file object, read its contents and return them decoded
+    as UTF-8. Do not raise on bytes that are not valid UTF-8: they are
+    replaced instead. CRLF line endings are converted to LF.
+
+    If *size* is specified, read at most that many bytes.
+    """
+    if size is None:
+        size = -1
+    rawdata = binary_file.read(size)
+    result = rawdata.decode("utf-8", errors="replace")
+    return result.replace("\r\n", "\n")
+
+
+def _contains_snippet(binary_file: BinaryIO) -> bool:
+    """Check if a file seems to contain a SPDX snippet"""
+    # Assumes that if SPDX_SNIPPET_INDICATOR (SPDX-SnippetBegin) is found in a
+    # file, the file contains a snippet
+    content = binary_file.read()
+    if SPDX_SNIPPET_INDICATOR in content:
+        return True
+    return False
+
+
+def extract_reuse_info(text: str) -> ReuseInfo:
+    """Extract REUSE information from comments in a string.
+
+    Raises:
+        ExpressionError: if an SPDX expression could not be parsed.
+        ParseError: if an SPDX expression could not be parsed.
+    """
+    text = filter_ignore_block(text)
+    spdx_tags: dict[str, set[str]] = {}
+    for tag, pattern in _SPDX_TAGS.items():
+        spdx_tags[tag] = set(find_spdx_tag(text, pattern))
+    # License expressions and copyright matches are special cases.
+    expressions = set()
+    copyright_matches = set()
+    for expression in spdx_tags.pop("spdx_expressions"):
+        try:
+            expressions.add(_LICENSING.parse(expression))
+        except (ExpressionError, ParseError):
+            _LOGGER.error(
+                _("Could not parse '{expression}'").format(
+                    expression=expression
+                )
+            )
+            raise
+    for line in text.splitlines():
+        for pattern in _COPYRIGHT_PATTERNS:
+            match = pattern.search(line)
+            if match is not None:
+                copyright_matches.add(match.groupdict()["copyright"].strip())
+                break
+
+    return ReuseInfo(
+        spdx_expressions=expressions,
+        copyright_lines=copyright_matches,
+        **spdx_tags,  # type: ignore
+    )
+
+
+def reuse_info_of_file(
+    path: StrPath, original_path: StrPath, root: StrPath
+) -> ReuseInfo:
+    """Open *path* and return its :class:`ReuseInfo`.
+
+    Normally only the first few :const:`_HEADER_BYTES` are read. But if a
+    snippet was detected, the entire file is read.
+    """
+    path = Path(path)
+    with path.open("rb") as fp:
+        try:
+            read_limit: Optional[int] = _HEADER_BYTES
+            # Completely read the file once
+            # to search for possible snippets
+            if _contains_snippet(fp):
+                _LOGGER.debug(f"'{path}' seems to contain an SPDX Snippet")
+                read_limit = None
+            # Reset read position
+            fp.seek(0)
+            # Scan the file for REUSE info, possibly limiting the read
+            # length
+            file_result = extract_reuse_info(
+                decoded_text_from_binary(fp, size=read_limit)
+            )
+            if file_result.contains_copyright_or_licensing():
+                source_type = SourceType.FILE_HEADER
+                if path.suffix == ".license":
+                    source_type = SourceType.DOT_LICENSE
+                return file_result.copy(
+                    path=relative_from_root(original_path, root).as_posix(),
+                    source_path=relative_from_root(path, root).as_posix(),
+                    source_type=source_type,
+                )
+
+        except (ExpressionError, ParseError):
+            _LOGGER.error(
+                _(
+                    "'{path}' holds an SPDX expression that cannot be"
+                    " parsed, skipping the file"
+                ).format(path=path)
+            )
+    return ReuseInfo()
+
+
+def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]:
+    """Extract all the values in *text* matching *pattern*'s regex, taking care
+    of stripping extraneous whitespace of formatting.
+    """
+    for prefix, value in pattern.findall(text):
+        prefix, value = prefix.strip(), value.strip()
+
+        # Some comment headers have ASCII art to "frame" the comment, like this:
+        #
+        # /***********************\
+        # |*  This is a comment  *|
+        # \***********************/
+        #
+        # To ensure we parse them correctly, if the line ends with the inverse
+        # of the comment prefix, we strip that suffix. See #343 for a real
+        # world example of a project doing this (LLVM).
+        suffix = prefix[::-1]
+        if suffix and value.endswith(suffix):
+            value = value[: -len(suffix)]
+
+        yield value.strip()
+
+
+def filter_ignore_block(text: str) -> str:
+    """Return *text* with every REUSE_IGNORE_START .. REUSE_IGNORE_END block
+    removed, so that its contents are not treated as copyright and licensing
+    information. Both markers are removed together with the block.
+
+    A start marker without a later end marker discards the rest of *text*. An
+    end marker that is not preceded by a start marker is left untouched.
+    """
+    # Compare against -1 rather than testing truthiness: a start marker at
+    # offset 0 is a valid match and must open an ignore block as well.
+    ignore_start = text.find(REUSE_IGNORE_START)
+    if ignore_start == -1:
+        return text
+    kept = text[:ignore_start]
+    # Only an end marker located after the start marker can close the block.
+    rest = text[ignore_start + len(REUSE_IGNORE_START) :]
+    ignore_end = rest.find(REUSE_IGNORE_END)
+    if ignore_end == -1:
+        return kept
+    # More ignore blocks may follow; filter the remainder the same way.
+    remainder = rest[ignore_end + len(REUSE_IGNORE_END) :]
+    return kept + filter_ignore_block(remainder)
+
+
+def contains_reuse_info(text: str) -> bool:
+    """The text contains REUSE info."""
+    try:
+        return bool(extract_reuse_info(text))
+    except (ExpressionError, ParseError):
+        return False
+
+
+def detect_line_endings(text: str) -> str:
+    r"""Return '\r\n', '\r' or '\n' (checked in that order) depending on which
+    line ending occurs in *text*. Return os.linesep if there is none.
+    """
+    line_endings = ["\r\n", "\r", "\n"]
+    for line_ending in line_endings:
+        if line_ending in text:
+            return line_ending
+    return os.linesep
+
+
+# REUSE-IgnoreEnd
diff --git a/src/reuse/global_licensing.py b/src/reuse/global_licensing.py
index bacc8f92a..c8ad9cf1e 100644
--- a/src/reuse/global_licensing.py
+++ b/src/reuse/global_licensing.py
@@ -33,8 +33,7 @@
 from debian.copyright import Error as DebianError
 from license_expression import ExpressionError
 
-from . import ReuseInfo, SourceType
-from ._util import _LICENSING
+from . import _LICENSING, ReuseInfo, SourceType
 from .covered_files import iter_files
 from .exceptions import (
     GlobalLicensingParseError,
diff --git a/src/reuse/header.py b/src/reuse/header.py
index 6000d9a84..0538155a3 100644
--- a/src/reuse/header.py
+++ b/src/reuse/header.py
@@ -23,17 +23,14 @@
 from license_expression import ExpressionError
 
 from . import ReuseInfo
-from ._util import (
-    contains_reuse_info,
-    extract_reuse_info,
-    merge_copyright_lines,
-)
 from .comment import CommentStyle, EmptyCommentStyle, PythonCommentStyle
+from .copyright import merge_copyright_lines
 from .exceptions import (
     CommentCreateError,
     CommentParseError,
     MissingReuseInfoError,
 )
+from .extract import contains_reuse_info, extract_reuse_info
 from .i18n import _
 
 _LOGGER = logging.getLogger(__name__)
diff --git a/src/reuse/project.py b/src/reuse/project.py
index cf6ca24d5..4b7425f5b 100644
--- a/src/reuse/project.py
+++ b/src/reuse/project.py
@@ -23,17 +23,13 @@
 
 from . import ReuseInfo
 from ._licenses import EXCEPTION_MAP, LICENSE_MAP
-from ._util import (
-    _LICENSEREF_PATTERN,
-    _determine_license_path,
-    relative_from_root,
-    reuse_info_of_file,
-)
+from ._util import _determine_license_path, relative_from_root
 from .covered_files import iter_files
 from .exceptions import (
     GlobalLicensingConflictError,
     SpdxIdentifierNotFoundError,
 )
+from .extract import _LICENSEREF_PATTERN, reuse_info_of_file
 from .global_licensing import (
     GlobalLicensing,
     NestedReuseTOML,
diff --git a/src/reuse/report.py b/src/reuse/report.py
index dd2a0355a..4277a9cec 100644
--- a/src/reuse/report.py
+++ b/src/reuse/report.py
@@ -31,8 +31,9 @@
 )
 from uuid import uuid4
 
-from . import __REUSE_version__, __version__
-from ._util import _LICENSEREF_PATTERN, _LICENSING, _checksum
+from . import _LICENSING, __REUSE_version__, __version__
+from ._util import _checksum
+from .extract import _LICENSEREF_PATTERN
 from .global_licensing import ReuseDep5
 from .i18n import _
 from .project import Project, ReuseInfo
diff --git a/tests/test_cli_annotate.py b/tests/test_cli_annotate.py
index 4aecfd85e..0326af280 100644
--- a/tests/test_cli_annotate.py
+++ b/tests/test_cli_annotate.py
@@ -17,8 +17,8 @@
 import pytest
 from click.testing import CliRunner
 
-from reuse._util import _COPYRIGHT_PREFIXES
 from reuse.cli.main import main
+from reuse.copyright import _COPYRIGHT_PREFIXES
 
 # pylint: disable=too-many-public-methods,too-many-lines,unused-argument
 
diff --git a/tests/test_copyright.py b/tests/test_copyright.py
new file mode 100644
index 000000000..662b5ecb5
--- /dev/null
+++ b/tests/test_copyright.py
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
+# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
+# SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
+# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
+# SPDX-FileCopyrightText: 2024 Rivos Inc.
+# SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Tests for reuse.copyright"""
+
+import pytest
+
+from reuse.copyright import make_copyright_line
+
+# REUSE-IgnoreStart
+
+
+def test_make_copyright_line_simple():
+    """Given a simple statement, make it a copyright line."""
+    assert make_copyright_line("hello") == "SPDX-FileCopyrightText: hello"
+
+
+def test_make_copyright_line_year():
+    """Given a simple statement and a year, make it a copyright line."""
+    assert (
+        make_copyright_line("hello", year="2019")
+        == "SPDX-FileCopyrightText: 2019 hello"
+    )
+
+
+def test_make_copyright_line_prefix_spdx():
+    """Given a simple statement and prefix, make it a copyright line."""
+    statement = make_copyright_line("hello", copyright_prefix="spdx")
+    assert statement == "SPDX-FileCopyrightText: hello"
+
+
+def test_make_copyright_line_prefix_spdx_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line("hello", year=2019, copyright_prefix="spdx")
+    assert statement == "SPDX-FileCopyrightText: 2019 hello"
+
+
+def test_make_copyright_line_prefix_spdx_c_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="spdx-c"
+    )
+    assert statement == "SPDX-FileCopyrightText: (C) 2019 hello"
+
+
+def test_make_copyright_line_prefix_spdx_symbol_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="spdx-symbol"
+    )
+    assert statement == "SPDX-FileCopyrightText: © 2019 hello"
+
+
+def test_make_copyright_line_prefix_string_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="string"
+    )
+    assert statement == "Copyright 2019 hello"
+
+
+def test_make_copyright_line_prefix_string_c_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="string-c"
+    )
+    assert statement == "Copyright (C) 2019 hello"
+
+
+def test_make_copyright_line_prefix_spdx_string_c_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="spdx-string-c"
+    )
+    assert statement == "SPDX-FileCopyrightText: Copyright (C) 2019 hello"
+
+
+def test_make_copyright_line_prefix_spdx_string_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="spdx-string"
+    )
+    assert statement == "SPDX-FileCopyrightText: Copyright 2019 hello"
+
+
+def test_make_copyright_line_prefix_spdx_string_symbol_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="spdx-string-symbol"
+    )
+    assert statement == "SPDX-FileCopyrightText: Copyright © 2019 hello"
+
+
+def test_make_copyright_line_prefix_string_symbol_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="string-symbol"
+    )
+    assert statement == "Copyright © 2019 hello"
+
+
+def test_make_copyright_line_prefix_symbol_year():
+    """Given a simple statement, prefix and a year, make it a copyright line."""
+    statement = make_copyright_line(
+        "hello", year=2019, copyright_prefix="symbol"
+    )
+    assert statement == "© 2019 hello"
+
+
+def test_make_copyright_line_existing_spdx_copyright():
+    """Given a copyright line, do nothing."""
+    value = "SPDX-FileCopyrightText: hello"
+    assert make_copyright_line(value) == value
+
+
+def test_make_copyright_line_existing_other_copyright():
+    """Given a non-SPDX copyright line, do nothing."""
+    value = "© hello"
+    assert make_copyright_line(value) == value
+
+
+def test_make_copyright_line_multine_error():
+    """Given a multiline argument, expect an error."""
+    with pytest.raises(RuntimeError):
+        make_copyright_line("hello\nworld")
+
+
+# REUSE-IgnoreEnd
diff --git a/tests/test_util.py b/tests/test_extract.py
similarity index 62%
rename from tests/test_util.py
rename to tests/test_extract.py
index e26445fe8..46bcad68e 100644
--- a/tests/test_util.py
+++ b/tests/test_extract.py
@@ -8,7 +8,7 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
-"""Tests for reuse._util"""
+"""Tests for reuse.extract"""
 
 import os
 from inspect import cleandoc
@@ -17,8 +17,13 @@
 import pytest
 from boolean.boolean import ParseError
 
-from reuse import _util
-from reuse._util import _LICENSING
+from reuse import _LICENSING, ReuseInfo
+from reuse.extract import (
+    decoded_text_from_binary,
+    detect_line_endings,
+    extract_reuse_info,
+    filter_ignore_block,
+)
 
 # REUSE-IgnoreStart
 
@@ -27,15 +32,13 @@ def test_extract_expression():
     """Parse various expressions."""
     expressions = ["GPL-3.0+", "GPL-3.0 AND CC0-1.0", "nonsense"]
     for expression in expressions:
-        result = _util.extract_reuse_info(
-            f"SPDX-License-Identifier: {expression}"
-        )
+        result = extract_reuse_info(f"SPDX-License-Identifier: {expression}")
         assert result.spdx_expressions == {_LICENSING.parse(expression)}
 
 
 def test_extract_expression_from_ascii_art_frame():
     """Parse an expression from an ASCII art frame"""
-    result = _util.extract_reuse_info(
+    result = extract_reuse_info(
         cleandoc(
             """
              /**********************************\\
@@ -51,20 +54,20 @@ def test_extract_erroneous_expression():
     """Parse an incorrect expression."""
     expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)"
     with pytest.raises(ParseError):
-        _util.extract_reuse_info(expression)
+        extract_reuse_info(expression)
 
 
 def test_extract_no_info():
     """Given a string without REUSE information, return an empty ReuseInfo
     object.
     """
-    result = _util.extract_reuse_info("")
-    assert result == _util.ReuseInfo()
+    result = extract_reuse_info("")
+    assert result == ReuseInfo()
 
 
 def test_extract_tab():
     """A tag followed by a tab is also valid."""
-    result = _util.extract_reuse_info("SPDX-License-Identifier:\tMIT")
+    result = extract_reuse_info("SPDX-License-Identifier:\tMIT")
     assert result.spdx_expressions == {_LICENSING.parse("MIT")}
 
 
@@ -72,14 +75,14 @@ def test_extract_many_whitespace():
     """When a tag is followed by a lot of whitespace, the whitespace should be
     filtered out.
     """
-    result = _util.extract_reuse_info("SPDX-License-Identifier:    MIT")
+    result = extract_reuse_info("SPDX-License-Identifier:    MIT")
     assert result.spdx_expressions == {_LICENSING.parse("MIT")}
 
 
 def test_extract_bibtex_comment():
     """A special case for BibTex comments."""
     expression = "@Comment{SPDX-License-Identifier: GPL-3.0-or-later}"
-    result = _util.extract_reuse_info(expression)
+    result = extract_reuse_info(expression)
     assert str(list(result.spdx_expressions)[0]) == "GPL-3.0-or-later"
 
 
@@ -88,23 +91,21 @@ def test_extract_copyright():
     information.
     """
     copyright_line = "SPDX-FileCopyrightText: 2019 Jane Doe"
-    result = _util.extract_reuse_info(copyright_line)
+    result = extract_reuse_info(copyright_line)
     assert result.copyright_lines == {copyright_line}
 
 
 def test_extract_copyright_duplicate():
     """When a copyright line is duplicated, only yield one."""
     copyright_line = "SPDX-FileCopyrightText: 2019 Jane Doe"
-    result = _util.extract_reuse_info(
-        "\n".join((copyright_line, copyright_line))
-    )
+    result = extract_reuse_info("\n".join((copyright_line, copyright_line)))
     assert result.copyright_lines == {copyright_line}
 
 
 def test_extract_copyright_tab():
     """A tag followed by a tab is also valid."""
     copyright_line = "SPDX-FileCopyrightText:\t2019 Jane Doe"
-    result = _util.extract_reuse_info(copyright_line)
+    result = extract_reuse_info(copyright_line)
     assert result.copyright_lines == {copyright_line}
 
 
@@ -113,7 +114,7 @@ def test_extract_copyright_many_whitespace():
     whitespace is not filtered out.
     """
     copyright_line = "SPDX-FileCopyrightText:    2019 Jane Doe"
-    result = _util.extract_reuse_info(copyright_line)
+    result = extract_reuse_info(copyright_line)
     assert result.copyright_lines == {copyright_line}
 
 
@@ -133,7 +134,7 @@ def test_extract_copyright_variations():
         """
     )
 
-    result = _util.extract_reuse_info(text)
+    result = extract_reuse_info(text)
     lines = text.splitlines()
     for line in lines:
         assert line in result.copyright_lines
@@ -155,7 +156,7 @@ def test_extract_with_ignore_block():
         SPDX-FileCopyrightText: 2019 Eve
         """
     )
-    result = _util.extract_reuse_info(text)
+    result = extract_reuse_info(text)
     assert len(result.copyright_lines) == 2
     assert len(result.spdx_expressions) == 1
 
@@ -165,7 +166,7 @@ def test_extract_sameline_multiline():
     do not include the comment end pattern as part of the copyright.
     """
     text = "<!-- SPDX-FileCopyrightText: Jane Doe -->"
-    result = _util.extract_reuse_info(text)
+    result = extract_reuse_info(text)
     assert len(result.copyright_lines) == 1
     assert result.copyright_lines == {"SPDX-FileCopyrightText: Jane Doe"}
 
@@ -185,7 +186,7 @@ def test_extract_special_endings():
         [Copyright 2019 Ajnulo] ::
         """
     )
-    result = _util.extract_reuse_info(text)
+    result = extract_reuse_info(text)
     for item in result.copyright_lines:
         assert ">" not in item
         assert "] ::" not in item
@@ -198,7 +199,7 @@ def test_extract_contributors():
         # SPDX-FileContributor: Jane Doe
         """
     )
-    result = _util.extract_reuse_info(text)
+    result = extract_reuse_info(text)
     assert result.contributor_lines == {"Jane Doe"}
 
 
@@ -217,7 +218,7 @@ def test_filter_ignore_block_with_comment_style():
     )
     expected = "Relevant text\n# \nOther relevant text"
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -242,7 +243,7 @@ def test_filter_ignore_block_non_comment_style():
         """
     )
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -267,7 +268,7 @@ def test_filter_ignore_block_with_ignored_information_on_same_line():
         """
     )
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -284,7 +285,7 @@ def test_filter_ignore_block_with_relevant_information_on_same_line():
     )
     expected = "Relevant textOther relevant text"
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -305,7 +306,7 @@ def test_filter_ignore_block_with_beginning_and_end_on_same_line_correct_order()
         """
     )
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -316,7 +317,7 @@ def test_filter_ignore_block_with_beginning_and_end_on_same_line_wrong_order():
     text = "Relevant textREUSE-IgnoreEndOther relevant textREUSE-IgnoreStartIgnored text"  # pylint: disable=line-too-long
     expected = "Relevant textREUSE-IgnoreEndOther relevant text"
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -334,7 +335,7 @@ def test_filter_ignore_block_without_end():
     )
     expected = "Relevant text\n"
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
@@ -365,166 +366,49 @@ def test_filter_ignore_block_with_multiple_ignore_blocks():
         """
     )
 
-    result = _util.filter_ignore_block(text)
+    result = filter_ignore_block(text)
     assert result == expected
 
 
-def test_make_copyright_line_simple():
-    """Given a simple statement, make it a copyright line."""
-    assert _util.make_copyright_line("hello") == "SPDX-FileCopyrightText: hello"
-
-
-def test_make_copyright_line_year():
-    """Given a simple statement and a year, make it a copyright line."""
-    assert (
-        _util.make_copyright_line("hello", year="2019")
-        == "SPDX-FileCopyrightText: 2019 hello"
-    )
-
-
-def test_make_copyright_line_prefix_spdx():
-    """Given a simple statement and prefix, make it a copyright line."""
-    statement = _util.make_copyright_line("hello", copyright_prefix="spdx")
-    assert statement == "SPDX-FileCopyrightText: hello"
-
-
-def test_make_copyright_line_prefix_spdx_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx"
-    )
-    assert statement == "SPDX-FileCopyrightText: 2019 hello"
-
-
-def test_make_copyright_line_prefix_spdx_c_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx-c"
-    )
-    assert statement == "SPDX-FileCopyrightText: (C) 2019 hello"
-
-
-def test_make_copyright_line_prefix_spdx_symbol_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx-symbol"
-    )
-    assert statement == "SPDX-FileCopyrightText: © 2019 hello"
-
-
-def test_make_copyright_line_prefix_string_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="string"
-    )
-    assert statement == "Copyright 2019 hello"
-
-
-def test_make_copyright_line_prefix_string_c_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="string-c"
-    )
-    assert statement == "Copyright (C) 2019 hello"
-
-
-def test_make_copyright_line_prefix_spdx_string_c_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx-string-c"
-    )
-    assert statement == "SPDX-FileCopyrightText: Copyright (C) 2019 hello"
-
-
-def test_make_copyright_line_prefix_spdx_string_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx-string"
-    )
-    assert statement == "SPDX-FileCopyrightText: Copyright 2019 hello"
-
-
-def test_make_copyright_line_prefix_spdx_string_symbol_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="spdx-string-symbol"
-    )
-    assert statement == "SPDX-FileCopyrightText: Copyright © 2019 hello"
-
-
-def test_make_copyright_line_prefix_string_symbol_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="string-symbol"
-    )
-    assert statement == "Copyright © 2019 hello"
-
-
-def test_make_copyright_line_prefix_symbol_year():
-    """Given a simple statement, prefix and a year, make it a copyright line."""
-    statement = _util.make_copyright_line(
-        "hello", year=2019, copyright_prefix="symbol"
-    )
-    assert statement == "© 2019 hello"
-
-
-def test_make_copyright_line_existing_spdx_copyright():
-    """Given a copyright line, do nothing."""
-    value = "SPDX-FileCopyrightText: hello"
-    assert _util.make_copyright_line(value) == value
-
-
-def test_make_copyright_line_existing_other_copyright():
-    """Given a non-SPDX copyright line, do nothing."""
-    value = "© hello"
-    assert _util.make_copyright_line(value) == value
-
-
-def test_make_copyright_line_multine_error():
-    """Given a multiline argument, expect an error."""
-    with pytest.raises(RuntimeError):
-        _util.make_copyright_line("hello\nworld")
-
-
 def test_decoded_text_from_binary_simple():
     """A unicode string encoded as bytes object decodes back correctly."""
     text = "Hello, world ☺"
     encoded = text.encode("utf-8")
-    assert _util.decoded_text_from_binary(BytesIO(encoded)) == text
+    assert decoded_text_from_binary(BytesIO(encoded)) == text
 
 
 def test_decoded_text_from_binary_size():
     """Only a given amount of bytes is decoded."""
     text = "Hello, world ☺"
     encoded = text.encode("utf-8")
-    assert _util.decoded_text_from_binary(BytesIO(encoded), size=5) == "Hello"
+    assert decoded_text_from_binary(BytesIO(encoded), size=5) == "Hello"
 
 
 def test_decoded_text_from_binary_crlf():
     """Given CRLF line endings, convert to LF."""
     text = "Hello\r\nworld"
     encoded = text.encode("utf-8")
-    assert _util.decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld"
+    assert decoded_text_from_binary(BytesIO(encoded)) == "Hello\nworld"
 
 
 def test_detect_line_endings_windows():
     """Given a CRLF string, detect the line endings."""
-    assert _util.detect_line_endings("hello\r\nworld") == "\r\n"
+    assert detect_line_endings("hello\r\nworld") == "\r\n"
 
 
 def test_detect_line_endings_mac():
     """Given a CR string, detect the line endings."""
-    assert _util.detect_line_endings("hello\rworld") == "\r"
+    assert detect_line_endings("hello\rworld") == "\r"
 
 
 def test_detect_line_endings_linux():
     """Given a LF string, detect the line endings."""
-    assert _util.detect_line_endings("hello\nworld") == "\n"
+    assert detect_line_endings("hello\nworld") == "\n"
 
 
 def test_detect_line_endings_no_newlines():
     """Given a file without line endings, default to os.linesep."""
-    assert _util.detect_line_endings("hello world") == os.linesep
+    assert detect_line_endings("hello world") == os.linesep
 
 
-# REUSE-IgnoreEnd
+# REUSE-IgnoreEnd
diff --git a/tests/test_global_licensing.py b/tests/test_global_licensing.py
index 62a69557a..1b7385381 100644
--- a/tests/test_global_licensing.py
+++ b/tests/test_global_licensing.py
@@ -13,8 +13,7 @@
 from debian.copyright import Copyright
 from license_expression import LicenseSymbol
 
-from reuse import ReuseInfo, SourceType
-from reuse._util import _LICENSING
+from reuse import _LICENSING, ReuseInfo, SourceType
 from reuse.exceptions import (
     GlobalLicensingParseError,
     GlobalLicensingParseTypeError,
diff --git a/tests/test_project.py b/tests/test_project.py
index 448a2bf79..3606816d3 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -19,8 +19,7 @@
 from conftest import RESOURCES_DIRECTORY
 from license_expression import LicenseSymbol
 
-from reuse import ReuseInfo, SourceType
-from reuse._util import _LICENSING
+from reuse import _LICENSING, ReuseInfo, SourceType
 from reuse.covered_files import iter_files
 from reuse.exceptions import (
     GlobalLicensingConflictError,

From 9f9f4136052df6dacec108432b645fc54ca655ce Mon Sep 17 00:00:00 2001
From: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
Date: Thu, 24 Oct 2024 10:52:46 +0200
Subject: [PATCH 3/3] Refactor VCS paths out of _util into vcs

Signed-off-by: Carmen Bianca BAKKER <carmenbianca@fsfe.org>
---
 src/reuse/_util.py |  7 -------
 src/reuse/vcs.py   | 15 +++++++--------
 tests/conftest.py  | 13 ++-----------
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/reuse/_util.py b/src/reuse/_util.py
index fc91d76e7..af226cf1d 100644
--- a/src/reuse/_util.py
+++ b/src/reuse/_util.py
@@ -16,7 +16,6 @@
 
 import logging
 import os
-import shutil
 import subprocess
 from hashlib import sha1
 from inspect import cleandoc
@@ -25,12 +24,6 @@
 
 from .types import StrPath
 
-GIT_EXE = shutil.which("git")
-HG_EXE = shutil.which("hg")
-JUJUTSU_EXE = shutil.which("jj")
-PIJUL_EXE = shutil.which("pijul")
-
-
 # REUSE-IgnoreStart
 
 
diff --git a/src/reuse/vcs.py b/src/reuse/vcs.py
index 135c91393..8c1cb5c8e 100644
--- a/src/reuse/vcs.py
+++ b/src/reuse/vcs.py
@@ -12,19 +12,13 @@
 
 import logging
 import os
+import shutil
 from abc import ABC, abstractmethod
 from inspect import isclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Generator, Optional, Type
 
-from ._util import (
-    GIT_EXE,
-    HG_EXE,
-    JUJUTSU_EXE,
-    PIJUL_EXE,
-    execute_command,
-    relative_from_root,
-)
+from ._util import execute_command, relative_from_root
 from .types import StrPath
 
 if TYPE_CHECKING:
@@ -32,6 +26,11 @@
 
 _LOGGER = logging.getLogger(__name__)
 
+GIT_EXE = shutil.which("git")
+HG_EXE = shutil.which("hg")
+JUJUTSU_EXE = shutil.which("jj")
+PIJUL_EXE = shutil.which("pijul")
+
 
 class VCSStrategy(ABC):
     """Strategy pattern for version control systems."""
diff --git a/tests/conftest.py b/tests/conftest.py
index 075a5fe4d..929533075 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -39,14 +39,9 @@
 except ImportError:
     sys.path.append(os.path.join(Path(__file__).parent.parent, "src"))
 finally:
-    from reuse._util import (
-        GIT_EXE,
-        HG_EXE,
-        JUJUTSU_EXE,
-        PIJUL_EXE,
-        setup_logging,
-    )
+    from reuse._util import setup_logging
     from reuse.global_licensing import ReuseDep5
+    from reuse.vcs import GIT_EXE, HG_EXE, JUJUTSU_EXE, PIJUL_EXE
 
 CWD = Path.cwd()
 
@@ -117,7 +112,6 @@ def optional_git_exe(
     """Run the test with or without git."""
     exe = GIT_EXE if request.param else ""
     monkeypatch.setattr("reuse.vcs.GIT_EXE", exe)
-    monkeypatch.setattr("reuse._util.GIT_EXE", exe)
     yield exe
 
 
@@ -136,7 +130,6 @@ def optional_hg_exe(
     """Run the test with or without mercurial."""
     exe = HG_EXE if request.param else ""
     monkeypatch.setattr("reuse.vcs.HG_EXE", exe)
-    monkeypatch.setattr("reuse._util.HG_EXE", exe)
     yield exe
 
 
@@ -155,7 +148,6 @@ def optional_jujutsu_exe(
     """Run the test with or without Jujutsu."""
     exe = JUJUTSU_EXE if request.param else ""
     monkeypatch.setattr("reuse.vcs.JUJUTSU_EXE", exe)
-    monkeypatch.setattr("reuse._util.JUJUTSU_EXE", exe)
     yield exe
 
 
@@ -174,7 +166,6 @@ def optional_pijul_exe(
     """Run the test with or without Pijul."""
     exe = PIJUL_EXE if request.param else ""
     monkeypatch.setattr("reuse.vcs.PIJUL_EXE", exe)
-    monkeypatch.setattr("reuse._util.PIJUL_EXE", exe)
     yield exe