Skip to content

Commit

Permalink
Allow grouping highlights by color in markdown output (#79)
Browse files Browse the repository at this point in the history
This commit adds new options to the CLI, allowing the user to control how highlights are grouped in the markdown output. The new option is called `--group-highlight-by-color` to group highlights by their color in the grouped output. Additionally, the commit updates the `MarkdownPrinter` class in the `printer/markdown.py` module to include the `group_highlights_by_color` parameter. The `GroupedMarkdownPrinter` class also handles the grouping of highlights based on this new parameter.

Note: This commit does not introduce any behavior changes by default, as the `--group` option is set to `True` and the `--no-group` option is set to `False` by default. The `--group-highlight-by-color` option is also set to `False` by default.
  • Loading branch information
linozen authored Aug 10, 2023
1 parent 4404a3b commit db6ebf0
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 17 deletions.
26 changes: 22 additions & 4 deletions pdfannots/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@
from .printer.json import JsonPrinter


MD_FORMAT_ARGS = ['print_filename', 'remove_hyphens', 'wrap_column', 'condense', 'sections']
"""Named of arguments passed to the markdown printers."""
MD_FORMAT_ARGS = [
'condense',
'group_highlights_by_color',
'print_filename',
'sections',
'wrap_column',
]
"""Names of arguments passed to the markdown printer."""


def _float_or_disabled(x: str) -> typ.Optional[float]:
Expand Down Expand Up @@ -47,15 +53,27 @@ def parse_args() -> typ.Tuple[argparse.Namespace, LAParams]:
help="Output format (default: markdown).")

g = p.add_argument_group('Options controlling markdown output')
mutex_group = g.add_mutually_exclusive_group()
mutex_group.add_argument(
"--no-group",
dest="group",
default=True, action="store_false",
help="Emit annotations in order, don't group into sections."
)
mutex_group.add_argument(
"--group-highlights-by-color",
dest="group_highlights_by_color",
default=False, action="store_true",
help="Group highlights by color in grouped output."
)

g.add_argument("-s", "--sections", metavar="SEC", nargs="*",
choices=GroupedMarkdownPrinter.ALL_SECTIONS,
default=GroupedMarkdownPrinter.ALL_SECTIONS,
help=("sections to emit (default: %s)" %
', '.join(GroupedMarkdownPrinter.ALL_SECTIONS)))
g.add_argument("--no-condense", dest="condense", default=True, action="store_false",
help="Emit annotations as a blockquote regardless of length.")
g.add_argument("--no-group", dest="group", default=True, action="store_false",
help="Emit annotations in order, don't group into sections.")
g.add_argument("--print-filename", dest="print_filename", default=False, action="store_true",
help="Print the name of each file with annotations.")
g.add_argument("-w", "--wrap", dest="wrap_column", metavar="COLS", type=int,
Expand Down
61 changes: 49 additions & 12 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from collections import defaultdict
import logging
import textwrap
import typing as typ

from . import Printer
from ..types import AnnotationType, Pos, Annotation, Document
from ..types import RGB, AnnotationType, Pos, Annotation, Document

logger = logging.getLogger('pdfannots')

Expand Down Expand Up @@ -84,13 +85,15 @@ class MarkdownPrinter(Printer):
def __init__(
self,
*,
condense: bool = True, # Permit use of the condensed format
print_filename: bool = False, # Whether to print file names
remove_hyphens: bool = True, # Whether to remove hyphens across a line break
wrap_column: typ.Optional[int] = None, # Column at which output is word-wrapped
**kwargs: typ.Any # Other args, ignored
condense: bool = True, # Permit use of the condensed format
print_filename: bool = False, # Whether to print file names
group_highlights_by_color: bool = False, # Whether to group highlights by color
remove_hyphens: bool = True, # Whether to remove hyphens across a line break
wrap_column: typ.Optional[int] = None, # Column at which output is word-wrapped
**kwargs: typ.Any # Other args, ignored
) -> None:
self.print_filename = print_filename
self.group_highlights_by_color = group_highlights_by_color
self.remove_hyphens = remove_hyphens
self.wrap_column = wrap_column
self.condense = condense
Expand Down Expand Up @@ -286,29 +289,63 @@ def emit_body(

self._fmt_header_called = False

def fmt_header(name: str) -> str:
def fmt_header(name: str, level: int = 2) -> str:
"""
A function that formats a header with a given name and level.
Parameters:
name (str): The name of the header.
level (int, optional): The level of the header. Defaults to 2.
Returns:
str: The formatted header.
"""
# emit blank separator line if needed
prefix = '\n' if self._fmt_header_called else ''
self._fmt_header_called = True
return prefix + "## " + name + "\n\n"
header = '#' * level
return prefix + header + " " + name + "\n"

# Partition annotations into nits, comments, and highlights.
nits = []
comments = []
# Create a defaultdict to hold grouped highlights by color.
highlights_by_color: typ.DefaultDict[
typ.Union[RGB, str],
typ.List[Annotation]
] = defaultdict(list)
# Create just a normal list for highlights when the defaultdict above is not needed
highlights = []
for a in document.iter_annots():
if a.subtype in self.ANNOT_NITS:
nits.append(a)
elif a.contents:
comments.append(a)
elif a.subtype == AnnotationType.Highlight:
highlights.append(a)
if self.group_highlights_by_color:
if a.color:
color: typ.Union[RGB, str] = a.color
else:
color = "undefined"
highlights_by_color[color].append(a)
else:
highlights.append(a)

for secname in self.sections:
if highlights and secname == 'highlights':
if (
self.group_highlights_by_color
and highlights_by_color
and secname == 'highlights'
):
yield fmt_header("Highlights")
for a in highlights:
yield self.format_annot(a, document)
for color, annots in highlights_by_color.items():
yield fmt_header(f"Color: {color}", level=3)
for a in annots:
yield self.format_annot(a, document)
else:
if highlights and secname == 'highlights':
for a in highlights:
yield self.format_annot(a, document)

if comments and secname == 'comments':
yield fmt_header("Detailed comments")
Expand Down
3 changes: 3 additions & 0 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,3 +494,6 @@ def ashex(self) -> str:
green_hex = format(int(self.green * 255), '02x')
blue_hex = format(int(self.blue * 255), '02x')
return f"#{str(red_hex)}{str(green_hex)}{(blue_hex)}"

def __str__(self) -> str:
return self.ashex()
16 changes: 15 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def test_annots(self) -> None:
'"Broadwell" CPUs with the bug fix shipped in late 2014.'),
(1, AnnotationType.Highlight, 'This is lower in column 1',
'user-mode access to FS/GS registers, and TLB tags for non-VM address spaces'),
(1, AnnotationType.Highlight, None,
'segmentation, task switching, and 16-bit modes.'),
(1, AnnotationType.Highlight, 'This is at the top of column two',
'The jump is due to extensions introduced with the "Skylake" microarchitecture'),
(3, AnnotationType.Squiggly, 'This is a nit.',
Expand Down Expand Up @@ -290,6 +292,18 @@ def test_grouped(self) -> None:
self.assertGreater(linecount, 10)
self.assertGreater(charcount, 900)

def test_multicolorgrouping(self) -> None:
p = GroupedMarkdownPrinter(group_highlights_by_color=True)

linecount = 0
charcount = 0
for line in p.print_file('dummyfile', self.doc):
linecount += line.count('\n')
charcount += len(line)

self.assertGreater(linecount, 10)
self.assertGreater(charcount, 900)


class JsonPrinterTest(PrinterTestBase):
def test_flat(self) -> None:
Expand All @@ -301,7 +315,7 @@ def test_flat(self) -> None:
+ p.end())

self.assertTrue(isinstance(j, list))
self.assertEqual(len(j), 8)
self.assertEqual(len(j), 9)


if __name__ == "__main__":
Expand Down
Binary file modified tests/hotos17.pdf
Binary file not shown.

0 comments on commit db6ebf0

Please sign in to comment.