From db6ebf009daf117d02c2ae1dcbaaee6d84d5be79 Mon Sep 17 00:00:00 2001 From: Linus Sehn <37184648+linozen@users.noreply.github.com> Date: Thu, 10 Aug 2023 09:24:15 +0200 Subject: [PATCH] Allow grouping highlights by color in markdown output (#79) This commit adds a new option to the CLI, allowing the user to control how highlights are grouped in the markdown output. The new option is called `--group-highlights-by-color`; it groups highlights by their color in the grouped output and is mutually exclusive with `--no-group`. Additionally, the commit updates the `MarkdownPrinter` class in the `printer/markdown.py` module to include the `group_highlights_by_color` parameter. The `GroupedMarkdownPrinter` class also handles the grouping of highlights based on this new parameter. Note: This commit is not intended to introduce any behavior changes by default, as grouping remains enabled unless `--no-group` is given (its `group` destination defaults to `True`), and the `--group-highlights-by-color` option defaults to `False`. --- pdfannots/cli.py | 26 ++++++++++++--- pdfannots/printer/markdown.py | 61 +++++++++++++++++++++++++++------- pdfannots/types.py | 3 ++ tests.py | 16 ++++++++- tests/hotos17.pdf | Bin 295224 -> 297132 bytes 5 files changed, 89 insertions(+), 17 deletions(-) diff --git a/pdfannots/cli.py b/pdfannots/cli.py index 54c198d..57ba206 100644 --- a/pdfannots/cli.py +++ b/pdfannots/cli.py @@ -11,8 +11,14 @@ from .printer.json import JsonPrinter -MD_FORMAT_ARGS = ['print_filename', 'remove_hyphens', 'wrap_column', 'condense', 'sections'] -"""Named of arguments passed to the markdown printers.""" +MD_FORMAT_ARGS = [ + 'condense', + 'group_highlights_by_color', + 'print_filename', + 'sections', + 'wrap_column', +] +"""Names of arguments passed to the markdown printer.""" def _float_or_disabled(x: str) -> typ.Optional[float]: @@ -47,6 +53,20 @@ def parse_args() -> typ.Tuple[argparse.Namespace, LAParams]: help="Output format (default: markdown).") g = p.add_argument_group('Options controlling markdown output') + 
mutex_group = g.add_mutually_exclusive_group() + mutex_group.add_argument( + "--no-group", + dest="group", + default=True, action="store_false", + help="Emit annotations in order, don't group into sections." + ) + mutex_group.add_argument( + "--group-highlights-by-color", + dest="group_highlights_by_color", + default=False, action="store_true", + help="Group highlights by color in grouped output." + ) + g.add_argument("-s", "--sections", metavar="SEC", nargs="*", choices=GroupedMarkdownPrinter.ALL_SECTIONS, default=GroupedMarkdownPrinter.ALL_SECTIONS, @@ -54,8 +74,6 @@ def parse_args() -> typ.Tuple[argparse.Namespace, LAParams]: ', '.join(GroupedMarkdownPrinter.ALL_SECTIONS))) g.add_argument("--no-condense", dest="condense", default=True, action="store_false", help="Emit annotations as a blockquote regardless of length.") - g.add_argument("--no-group", dest="group", default=True, action="store_false", - help="Emit annotations in order, don't group into sections.") g.add_argument("--print-filename", dest="print_filename", default=False, action="store_true", help="Print the name of each file with annotations.") g.add_argument("-w", "--wrap", dest="wrap_column", metavar="COLS", type=int, diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py index ce10aee..b3d0410 100644 --- a/pdfannots/printer/markdown.py +++ b/pdfannots/printer/markdown.py @@ -1,9 +1,10 @@ +from collections import defaultdict import logging import textwrap import typing as typ from . 
import Printer -from ..types import AnnotationType, Pos, Annotation, Document +from ..types import RGB, AnnotationType, Pos, Annotation, Document logger = logging.getLogger('pdfannots') @@ -84,13 +85,15 @@ class MarkdownPrinter(Printer): def __init__( self, *, - condense: bool = True, # Permit use of the condensed format - print_filename: bool = False, # Whether to print file names - remove_hyphens: bool = True, # Whether to remove hyphens across a line break - wrap_column: typ.Optional[int] = None, # Column at which output is word-wrapped - **kwargs: typ.Any # Other args, ignored + condense: bool = True, # Permit use of the condensed format + print_filename: bool = False, # Whether to print file names + group_highlights_by_color: bool = False, # Whether to group highlights by color + remove_hyphens: bool = True, # Whether to remove hyphens across a line break + wrap_column: typ.Optional[int] = None, # Column at which output is word-wrapped + **kwargs: typ.Any # Other args, ignored ) -> None: self.print_filename = print_filename + self.group_highlights_by_color = group_highlights_by_color self.remove_hyphens = remove_hyphens self.wrap_column = wrap_column self.condense = condense @@ -286,15 +289,32 @@ def emit_body( self._fmt_header_called = False - def fmt_header(name: str) -> str: + def fmt_header(name: str, level: int = 2) -> str: + """ + A function that formats a header with a given name and level. + + Parameters: + name (str): The name of the header. + level (int, optional): The level of the header. Defaults to 2. + + Returns: + str: The formatted header. + """ # emit blank separator line if needed prefix = '\n' if self._fmt_header_called else '' self._fmt_header_called = True - return prefix + "## " + name + "\n\n" + header = '#' * level + return prefix + header + " " + name + "\n" # Partition annotations into nits, comments, and highlights. nits = [] comments = [] + # Create a defaultdict to hold grouped highlights by color. 
+ highlights_by_color: typ.DefaultDict[ + typ.Union[RGB, str], + typ.List[Annotation] + ] = defaultdict(list) + # Create just a normal list for highlights when the defaultdict above is not needed highlights = [] for a in document.iter_annots(): if a.subtype in self.ANNOT_NITS: @@ -302,13 +322,30 @@ def fmt_header(name: str) -> str: elif a.contents: comments.append(a) elif a.subtype == AnnotationType.Highlight: - highlights.append(a) + if self.group_highlights_by_color: + if a.color: + color: typ.Union[RGB, str] = a.color + else: + color = "undefined" + highlights_by_color[color].append(a) + else: + highlights.append(a) for secname in self.sections: - if highlights and secname == 'highlights': + if ( + self.group_highlights_by_color + and highlights_by_color + and secname == 'highlights' + ): yield fmt_header("Highlights") - for a in highlights: - yield self.format_annot(a, document) + for color, annots in highlights_by_color.items(): + yield fmt_header(f"Color: {color}", level=3) + for a in annots: + yield self.format_annot(a, document) + else: + if highlights and secname == 'highlights': + for a in highlights: + yield self.format_annot(a, document) if comments and secname == 'comments': yield fmt_header("Detailed comments") diff --git a/pdfannots/types.py b/pdfannots/types.py index b957a97..6a1c59a 100644 --- a/pdfannots/types.py +++ b/pdfannots/types.py @@ -494,3 +494,6 @@ def ashex(self) -> str: green_hex = format(int(self.green * 255), '02x') blue_hex = format(int(self.blue * 255), '02x') return f"#{str(red_hex)}{str(green_hex)}{(blue_hex)}" + + def __str__(self) -> str: + return self.ashex() diff --git a/tests.py b/tests.py index 1b1c0e4..b743579 100755 --- a/tests.py +++ b/tests.py @@ -67,6 +67,8 @@ def test_annots(self) -> None: '"Broadwell" CPUs with the bug fix shipped in late 2014.'), (1, AnnotationType.Highlight, 'This is lower in column 1', 'user-mode access to FS/GS registers, and TLB tags for non-VM address spaces'), + (1, AnnotationType.Highlight, 
None, + 'segmentation, task switching, and 16-bit modes.'), (1, AnnotationType.Highlight, 'This is at the top of column two', 'The jump is due to extensions introduced with the "Skylake" microarchitecture'), (3, AnnotationType.Squiggly, 'This is a nit.', @@ -290,6 +292,18 @@ def test_grouped(self) -> None: self.assertGreater(linecount, 10) self.assertGreater(charcount, 900) + def test_multicolorgrouping(self) -> None: + p = GroupedMarkdownPrinter(group_highlights_by_color=True) + + linecount = 0 + charcount = 0 + for line in p.print_file('dummyfile', self.doc): + linecount += line.count('\n') + charcount += len(line) + + self.assertGreater(linecount, 10) + self.assertGreater(charcount, 900) + class JsonPrinterTest(PrinterTestBase): def test_flat(self) -> None: @@ -301,7 +315,7 @@ def test_flat(self) -> None: + p.end()) self.assertTrue(isinstance(j, list)) - self.assertEqual(len(j), 8) + self.assertEqual(len(j), 9) if __name__ == "__main__": diff --git a/tests/hotos17.pdf b/tests/hotos17.pdf index 4cfc78dcfce9cbfa75e6448e88ddc7eb82320032..2db33466bf7c8ca80d5d41db68effbd1d771e0fe 100644 GIT binary patch delta 1780 zcmcJP&ubGw6vwA^PoV)R{%9>inK3u$Sx`9+sbU!p2EU^xqF3cW|=Nze5i*RN4i7fa1bS~?d{ z@MQ8;a8R9#bvf(Qyn1!nft_l&GXJdN;I^VeCnjh-=C}oMV|a^r9Uqs&T~}S#^C=v% z?9-gdDOWA3GEP;6;azj}C10F&y<99kDhI_9Y`#^`7cyQMgUcyKxVQ5S+T0G$K*b|_ zOv?~Aa8UI2m{1@*i-V%~pc!zy4&9*W?lE;1dO^_(!l9z5XhLej1clUY(B#x*&bZ1f zW-E$nX!;oA3C0pBs6J#ySz0rhZdqK_v9j<@knt+@%6^;Vd7AvyBD17STv8`BLdhI) zNQt;`M23o9XM1~q9X_T9oEvEidTv=lw6+PsgB|Dxni58HI1|;5GHFjsXTti1Wp#x$ zS6*`a4_L z+}3SViN{7qr*B@1cL55F2s&WO;aY)bQ%*~)i&@@g78RoWPt5Y;=A}K@9Vj4*KQhY= z;G&BhVgEZiHa8SC8n|9`f;Kx5I#{ifp%YaUI7=NS>DbC3>$XM%4SxZeh7H;;&5`6l zB8*3gbde(>?gzAeh#VCMzC%tBqO(uV*%6n2-(FnY@Yn7qF4jI3`lEw``M2o6{}I4d iB7gw9E`s_2v^6G>*=mtN;ON!)POV