Add a custom parser to insert the divs with lang attributes (#92)

* Added a custom parser to insert the divs with lang attributes
* Added unit tests
* Bumped the version number
* Removed the [[lang]] tags from the email preheader text
* Added newlines around language tags if they do not exist
* Fixed a test and added a function to remove whitespace from lines containing language tags
cds-snc · Jul 20, 2021 · d34694d · d34694d
1 parent 00d4023
commit d34694d
Show file tree

Hide file tree

Showing 6 changed files with 251 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -43,6 +43,7 @@ nosetests.xml
 coverage.xml
 *,cover
 .pytest_cache/
+log/
 
 # Translations
 *.mo

diff --git a/notifications_utils/formatters.py b/notifications_utils/formatters.py
@@ -1,5 +1,6 @@
 import string
 import re
+from typing import List
 import urllib
 
 import mistune
@@ -22,6 +23,13 @@
     "\uFEFF"  # zero width non-breaking space
 )
 
+EMAIL_P_OPEN_TAG = '<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">'
+EMAIL_P_CLOSE_TAG = "</p>"  # with EMAIL_P_OPEN_TAG: the wrapper paragraph() puts around rendered text
+
+FR_OPEN = r"\[\[fr\]\]"  # regex source for the literal tag [[fr]] (brackets escaped)
+FR_CLOSE = r"\[\[/fr\]\]"  # regex source for the literal tag [[/fr]]
+EN_OPEN = r"\[\[en\]\]"  # regex source for the literal tag [[en]]
+EN_CLOSE = r"\[\[/en\]\]"  # regex source for the literal tag [[/en]]
 
 mistune._block_quote_leading_pattern = re.compile(r"^ *\^ ?", flags=re.M)
 mistune.BlockGrammar.block_quote = re.compile(r"^( *\^[^\n]+(\n[^\n]+)*\n*)+")
@@ -48,7 +56,8 @@
 govuk_not_a_link = re.compile(r"(?<!\.|\/)(GOV)\.(UK)(?!\/|\?)", re.IGNORECASE)
 
 dvla_markup_tags = re.compile(
-    str("|".join("<{}>".format(tag) for tag in {"cr", "h1", "h2", "p", "normal", "op", "np", "bul", "tab"})), re.IGNORECASE
+    str("|".join("<{}>".format(tag) for tag in {"cr", "h1", "h2", "p", "normal", "op", "np", "bul", "tab"})),
+    re.IGNORECASE,
 )
 
 smartypants.tags_to_skip = smartypants.tags_to_skip + ["a"]
@@ -129,7 +138,13 @@ def url_encode_full_stops(value):
 
 
 def unescaped_formatted_list(
-    items, conjunction="and", before_each="‘", after_each="’", separator=", ", prefix="", prefix_plural=""
+    items,
+    conjunction="and",
+    before_each="‘",
+    after_each="’",
+    separator=", ",
+    prefix="",
+    prefix_plural="",
 ):
     if prefix:
         prefix += " "
@@ -146,10 +161,24 @@ def unescaped_formatted_list(
         return ("{prefix_plural}{first_items} {conjunction} {last_item}").format(**locals())
 
 
-def formatted_list(items, conjunction="and", before_each="‘", after_each="’", separator=", ", prefix="", prefix_plural=""):
+def formatted_list(
+    items,
+    conjunction="and",
+    before_each="‘",
+    after_each="’",
+    separator=", ",
+    prefix="",
+    prefix_plural="",
+):
     return Markup(
         unescaped_formatted_list(
-            [escape_html(x) for x in items], conjunction, before_each, after_each, separator, prefix, prefix_plural
+            [escape_html(x) for x in items],
+            conjunction,
+            before_each,
+            after_each,
+            separator,
+            prefix,
+            prefix_plural,
         )
     )
 
@@ -200,6 +229,55 @@ def add_trailing_newline(value):
     return "{}\n".format(value)
 
 
+def is_valid_index(index: int, lines: List[str]):  # True when index is a non-negative position inside lines
+    return index >= 0 and index < len(lines)
+
+
+def insert_newline_after(lines: List[str], tag_index: int):  # mutates lines: ensures an empty entry follows lines[tag_index]
+    # a tag on the last line has nothing after it to be separated from, so nothing is inserted
+    if tag_index == len(lines) - 1:
+        return
+    if not is_valid_index(tag_index + 1, lines):  # following position is out of range: leave lines untouched
+        return
+    if lines[tag_index + 1] == "":  # already followed by an empty entry
+        return
+
+    lines.insert(tag_index + 1, "")  # an empty entry becomes a blank line once the caller joins lines with "\n"
+
+
+def insert_newline_before(lines: List[str], tag_index: int):  # mutates lines: ensures an empty entry precedes lines[tag_index]
+    # a tag on the first line has nothing before it to be separated from, so nothing is inserted
+    if tag_index == 0:
+        return
+    if not is_valid_index(tag_index - 1, lines):  # preceding position is out of range: leave lines untouched
+        return
+    if lines[tag_index - 1] == "":  # already preceded by an empty entry
+        return
+
+    lines.insert(tag_index, "")  # inserting at tag_index pushes the tag itself down to tag_index + 1
+
+
+def add_newlines_around_lang_tags(content: str) -> str:  # pads language tags that sit alone on a line with empty lines
+    lines = content.splitlines()  # NOTE: the splitlines()/join round-trip drops a trailing newline from content
+    all_tags = ["[[fr]]", "[[/fr]]", "[[en]]", "[[/en]]"]
+    for tag in all_tags:
+        # reduce lines made of only this tag plus surrounding whitespace to the bare tag
+        for index, line in enumerate(lines):
+            if tag in line and line.strip() == tag:
+                lines[index] = line.strip()
+
+        if tag not in lines:  # tag is absent, or only appears inline with other text: nothing to pad
+            continue
+
+        tag_index = lines.index(tag)  # NOTE(review): index() finds only the first occurrence; repeated tags are not padded
+
+        insert_newline_before(lines, tag_index)
+        new_tag_index = lines.index(tag)  # look the tag up again: it moved down one if a line was inserted before it
+        insert_newline_after(lines, new_tag_index)
+    new_content = "\n".join(lines)
+    return new_content
+
+
 def tweak_dvla_list_markup(value):
     return value.replace("<cr><cr><np>", "<cr><np>").replace("<p><cr><p><cr>", "<p><cr>")
 
@@ -363,7 +441,7 @@ def list_item(self, text):
 
     def paragraph(self, text):
         if text.strip():
-            return ('<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">{}</p>').format(text)
+            return f"{EMAIL_P_OPEN_TAG}{text}{EMAIL_P_CLOSE_TAG}"
         return ""
 
     def block_quote(self, text):
@@ -388,7 +466,9 @@ def autolink(self, link, is_email=False):
         if is_email:
             return link
         return '<a style="{}" href="{}">{}</a>'.format(
-            LINK_STYLE, urllib.parse.quote(urllib.parse.unquote(link), safe=":/?#=&;"), link
+            LINK_STYLE,
+            urllib.parse.quote(urllib.parse.unquote(link), safe=":/?#=&;"),
+            link,
         )
 
     def double_emphasis(self, text):
@@ -515,3 +595,32 @@ def link(self, link, title, content):
     hard_wrap=True,
     use_xhtml=False,
 )
+
+
+def add_language_divs(_content: str) -> str:
+    """
+    Custom parser to add the language divs. We need to search for and remove the EMAIL_P_OPEN_TAG
+    and EMAIL_P_CLOSE_TAG because the mistune parser has already run and put our [[lang]] tags inside
+    paragraphs.
+    """  # tags that are not wrapped in exactly these paragraph tags do not match and are left as-is
+    select_anything = r"([\s\S]*)"  # greedy. NOTE(review): with repeated same-language blocks it spans first opener to last closer
+    fr_regex = re.compile(
+        f"{EMAIL_P_OPEN_TAG}{FR_OPEN}{EMAIL_P_CLOSE_TAG}{select_anything}{EMAIL_P_OPEN_TAG}{FR_CLOSE}{EMAIL_P_CLOSE_TAG}"
+    )  # matches <p ...>[[fr]]</p>anything<p ...>[[/fr]]</p>
+    content = fr_regex.sub(r'<div lang="fr-ca">\1</div>', _content)  # \1 re-inserts the captured "anything"; the tag paragraphs are dropped
+
+    en_regex = re.compile(
+        f"{EMAIL_P_OPEN_TAG}{EN_OPEN}{EMAIL_P_CLOSE_TAG}{select_anything}{EMAIL_P_OPEN_TAG}{EN_CLOSE}{EMAIL_P_CLOSE_TAG}"
+    )  # matches <p ...>[[en]]</p>anything<p ...>[[/en]]</p>
+    content = en_regex.sub(r'<div lang="en-ca">\1</div>', content)  # runs on the French-substituted text, so both languages are handled
+    return content
+
+
+def remove_language_divs(_content: str) -> str:
+    """Remove the tags from content. This fn is for use in the email
+    preheader, since this is plain text not html"""  # removes the [[lang]] tags themselves; no <div> elements are involved
+    content = re.compile(FR_OPEN).sub("", _content)  # each step deletes every occurrence of one tag, keeping surrounding text
+    content = re.compile(FR_CLOSE).sub("", content)
+    content = re.compile(EN_OPEN).sub("", content)
+    content = re.compile(EN_CLOSE).sub("", content)
+    return content
diff --git a/notifications_utils/template.py b/notifications_utils/template.py
@@ -14,7 +14,9 @@
     unlink_govuk_escaped,
     nl2br,
     nl2li,
+    add_language_divs,
     add_prefix,
+    add_newlines_around_lang_tags,
     autolink_sms,
     notify_email_markdown,
     notify_email_preheader_markdown,
@@ -26,6 +28,7 @@
     strip_dvla_markup,
     strip_pipes,
     remove_whitespace_before_punctuation,
+    remove_language_divs,
     make_quotes_smart,
     replace_hyphens_with_en_dashes,
     replace_hyphens_with_non_breaking_hyphens,
@@ -57,7 +60,13 @@ class Template:
 
     encoding = "utf-8"
 
-    def __init__(self, template, values=None, redact_missing_personalisation=False, jinja_path=None):
+    def __init__(
+        self,
+        template,
+        values=None,
+        redact_missing_personalisation=False,
+        jinja_path=None,
+    ):
         if not isinstance(template, dict):
             raise TypeError("Template must be a dict")
         if values is not None and not isinstance(values, dict):
@@ -140,7 +149,15 @@ def is_message_too_long(self):
 
 
 class SMSMessageTemplate(Template):
-    def __init__(self, template, values=None, prefix=None, show_prefix=True, sender=None, jinja_path=None):
+    def __init__(
+        self,
+        template,
+        values=None,
+        prefix=None,
+        show_prefix=True,
+        sender=None,
+        jinja_path=None,
+    ):
         self.prefix = prefix
         self.show_prefix = show_prefix
         self.sender = sender
@@ -222,7 +239,10 @@ def __str__(self):
                             redact_missing_personalisation=self.redact_missing_personalisation,
                         )
                     )
-                    .then(add_prefix, (escape_html(self.prefix) or None) if self.show_prefix else None)
+                    .then(
+                        add_prefix,
+                        (escape_html(self.prefix) or None) if self.show_prefix else None,
+                    )
                     .then(sms_encode if self.downgrade_non_sms_characters else str)
                     .then(remove_whitespace_before_punctuation)
                     .then(nl2br)
@@ -241,7 +261,12 @@ def __init__(
         jinja_path=None,
     ):
         self._subject = template["subject"]
-        super().__init__(template, values, redact_missing_personalisation=redact_missing_personalisation, jinja_path=jinja_path)
+        super().__init__(
+            template,
+            values,
+            redact_missing_personalisation=redact_missing_personalisation,
+            jinja_path=jinja_path,
+        )
 
     def __str__(self):
         return str(
@@ -354,6 +379,7 @@ def preheader(self):
             .then(strip_unsupported_characters)
             .then(add_trailing_newline)
             .then(notify_email_preheader_markdown)
+            .then(remove_language_divs)
             .then(do_nice_typography)
             .split()
         )[: self.PREHEADER_LENGTH_IN_CHARACTERS].strip()
@@ -396,7 +422,12 @@ def __init__(
         logo_with_background_colour=None,
         asset_domain=None,
     ):
-        super().__init__(template, values, redact_missing_personalisation=redact_missing_personalisation, jinja_path=jinja_path)
+        super().__init__(
+            template,
+            values,
+            redact_missing_personalisation=redact_missing_personalisation,
+            jinja_path=jinja_path,
+        )
         self.from_name = from_name
         self.from_address = from_address
         self.reply_to = reply_to
@@ -415,7 +446,9 @@ def __str__(self):
             self.jinja_template.render(
                 {
                     "body": get_html_email_body(
-                        self.content, self.values, redact_missing_personalisation=self.redact_missing_personalisation
+                        self.content,
+                        self.values,
+                        redact_missing_personalisation=self.redact_missing_personalisation,
                     ),
                     "subject": self.subject,
                     "from_name": escape_html(self.from_name),
@@ -439,7 +472,10 @@ def subject(self):
         return (
             Take(
                 Field(
-                    self._subject, self.values, html="escape", redact_missing_personalisation=self.redact_missing_personalisation
+                    self._subject,
+                    self.values,
+                    html="escape",
+                    redact_missing_personalisation=self.redact_missing_personalisation,
                 )
             )
             .then(do_nice_typography)
@@ -474,7 +510,11 @@ def __init__(
         date=None,
     ):
         self.contact_block = (contact_block or "").strip()
-        super().__init__(template, values, redact_missing_personalisation=redact_missing_personalisation)
+        super().__init__(
+            template,
+            values,
+            redact_missing_personalisation=redact_missing_personalisation,
+        )
         self.admin_base_url = admin_base_url
         self.logo_file_name = logo_file_name
         self.date = date or datetime.utcnow()
@@ -696,8 +736,10 @@ def get_html_email_body(template_content, template_values, redact_missing_person
         )
         .then(unlink_govuk_escaped)
         .then(strip_unsupported_characters)
+        .then(add_newlines_around_lang_tags)
         .then(add_trailing_newline)
         .then(notify_email_markdown)
+        .then(add_language_divs)
         .then(do_nice_typography)
     )
 

diff --git a/notifications_utils/version.py b/notifications_utils/version.py
@@ -1,2 +1,2 @@
-__version__ = "43.11.1"
+__version__ = "44.0.0"
 # GDS version '34.0.1'
diff --git a/tests/test_formatters.py b/tests/test_formatters.py
@@ -2,6 +2,8 @@
 from flask import Markup
 
 from notifications_utils.formatters import (
+    add_language_divs,
+    remove_language_divs,
     unlink_govuk_escaped,
     notify_email_markdown,
     notify_letter_preview_markdown,
@@ -947,3 +949,42 @@ def test_strip_unsupported_characters():
 
 def test_normalise_whitespace():
     assert normalise_whitespace("\u200C Your tax   is\ndue\n\n") == "Your tax is due"
+
+
+@pytest.mark.parametrize("lang", ("en", "fr"))
+def test_add_language_divs_fr_replaces(lang: str):  # despite the "fr" in the name, runs for both en and fr
+    _content = (  # rendered HTML in which the opening and closing tags each sit in their own email paragraph
+        f'<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">[[{lang}]]</p>'
+        '<h2 style="Margin: 0 0 20px 0; padding: 0; font-size: 27px; line-height: 35px; font-weight: bold; color: #0B0C0C;">'
+        "title</h2>"
+        '<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">'
+        "Comment vas-tu aujourd'hui?</p>"
+        f'<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">[[/{lang}]]</p>'
+    )
+    content = (  # expected: both tag paragraphs replaced by one <div lang="..-ca"> around the inner HTML
+        f'<div lang="{lang}-ca">'
+        '<h2 style="Margin: 0 0 20px 0; padding: 0; font-size: 27px; line-height: 35px; font-weight: bold; color: #0B0C0C;">'
+        "title</h2>"
+        '<p style="Margin: 0 0 20px 0; font-size: 19px; line-height: 25px; color: #0B0C0C;">'
+        "Comment vas-tu aujourd'hui?</p></div>"
+    )
+    assert add_language_divs(_content) == content
+
+
+@pytest.mark.parametrize("lang", ("en", "fr"))
+def test_add_language_divs_fr_does_not_replace(lang: str):  # despite the "fr" in the name, runs for both en and fr
+    _content = f"[[{lang}]] asdf [[/{lang}]]"  # bare tags, not wrapped in the email paragraph tags
+    assert add_language_divs(_content) == _content  # input must come back unchanged
+
+
+@pytest.mark.parametrize(
+    "input,output",
+    (
+        ("abc 123", "abc 123"),  # no tags: text is returned unchanged
+        ("[[fr]]\n\nabc\n\n[[/fr]]", "\n\nabc\n\n"),  # only the tags go; the newlines around them stay
+        ("[[en]]\n\nabc\n\n[[/en]]", "\n\nabc\n\n"),
+        ("[[en]]\n\nabc\n\n[[/en]]\n\n[[fr]]\n\n123\n\n[[/fr]]", "\n\nabc\n\n\n\n\n\n123\n\n"),
+    ),
+)
+def test_remove_language_divs(input: str, output: str):  # covers no tags, one language, and both languages together
+    assert remove_language_divs(input) == output
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,6 +43,7 @@ nosetests.xml @@
     coverage.xml
     *,cover
     .pytest_cache/
+    log/
     # Translations
     *.mo
@@ Expand Down @@