From a8e1c7e6551d390a07d9f5b02bafa5000dce7911 Mon Sep 17 00:00:00 2001 From: Jeremy Howard Date: Sat, 14 Sep 2024 17:51:47 +1000 Subject: [PATCH] fixes #626 --- fastcore/__init__.py | 2 +- fastcore/_modidx.py | 1 + fastcore/xml.py | 61 ++++++---- nbs/11_xml.ipynb | 276 ++++++++++++++++++++++++++++++++----------- settings.ini | 2 +- 5 files changed, 248 insertions(+), 94 deletions(-) diff --git a/fastcore/__init__.py b/fastcore/__init__.py index 9f0329de..c77aefc0 100644 --- a/fastcore/__init__.py +++ b/fastcore/__init__.py @@ -1 +1 @@ -__version__ = "1.7.7" +__version__ = "1.7.8" diff --git a/fastcore/_modidx.py b/fastcore/_modidx.py index efc1a370..3d86c296 100644 --- a/fastcore/_modidx.py +++ b/fastcore/_modidx.py @@ -599,6 +599,7 @@ 'fastcore.xml.__getattr__': ('xml.html#__getattr__', 'fastcore/xml.py'), 'fastcore.xml._escape': ('xml.html#_escape', 'fastcore/xml.py'), 'fastcore.xml._flatten_tuple': ('xml.html#_flatten_tuple', 'fastcore/xml.py'), + 'fastcore.xml._is_whitespace_significant': ('xml.html#_is_whitespace_significant', 'fastcore/xml.py'), 'fastcore.xml._noescape': ('xml.html#_noescape', 'fastcore/xml.py'), 'fastcore.xml._preproc': ('xml.html#_preproc', 'fastcore/xml.py'), 'fastcore.xml._to_attr': ('xml.html#_to_attr', 'fastcore/xml.py'), diff --git a/fastcore/xml.py b/fastcore/xml.py index dda6d932..2a9ad653 100644 --- a/fastcore/xml.py +++ b/fastcore/xml.py @@ -118,8 +118,6 @@ def __html__(self): return self # %% ../nbs/11_xml.ipynb def _escape(s): return '' if s is None else s.__html__() if hasattr(s, '__html__') else escape(s) if isinstance(s, str) else s - -# %% ../nbs/11_xml.ipynb def _noescape(s): return '' if s is None else s.__html__() if hasattr(s, '__html__') else s # %% ../nbs/11_xml.ipynb @@ -137,34 +135,57 @@ def _to_attr(k,v): return f'{k}={qt}{v}{qt}' # %% ../nbs/11_xml.ipynb -def _to_xml(elm, lvl, indent, do_escape): +_block_tags = {'div', 'p', 'ul', 'ol', 'li', 'table', 'thead', 'tbody', 'tfoot', + 'html', 'head', 'body', 'meta', '!doctype', 'input', 'script', 'link', 'style', + 'tr', 'th', 'td', 'section', 'article', 'nav', 'aside', 'header', + 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote'} +_inline_tags = {'a', 'span', 'b', 'i', 'u', 'em', 'strong', 'img', 'br', 'small', + 'big', 'sub', 'sup', 'label', 'input', 'select', 'option'} + +def _is_whitespace_significant(elm): + return elm.tag in {'pre', 'code', 'textarea', 'script'} or elm.get('contenteditable') == 'true' + +# %% ../nbs/11_xml.ipynb +def _to_xml(elm, lvl=0, indent=True, do_escape=True): + "Convert `FT` element tree into an XML string" esc_fn = _escape if do_escape else _noescape - nl = '\n' - if not indent: lvl,nl = 0,'' if elm is None: return '' if hasattr(elm, '__ft__'): elm = elm.__ft__() - if isinstance(elm, tuple): return f'{nl}'.join(_to_xml(o, lvl=lvl, indent=indent, do_escape=do_escape) for o in elm) + if isinstance(elm, tuple): + return ''.join(_to_xml(o, lvl=lvl, indent=indent, do_escape=do_escape) for o in elm) if isinstance(elm, bytes): return elm.decode('utf-8') - sp = ' ' * lvl - if not isinstance(elm, FT): return f'{esc_fn(elm)}{nl}' + if not isinstance(elm, FT): return f'{esc_fn(elm)}' + + tag, cs, attrs = elm.list + is_void = getattr(elm, 'void_', False) + is_block = tag in _block_tags + if _is_whitespace_significant(elm): indent = False + + sp,nl = (' ' * lvl,'\n') if indent and is_block else ('','') + nl_end = nl - tag,cs,attrs = elm.list stag = tag if attrs: - sattrs = (_to_attr(k,v) for k,v in attrs.items()) - stag += ' ' + ' '.join(sattrs) - - isvoid = getattr(elm, 'void_', False) - cltag = '' if isvoid else f'' - if not cs: return f'{sp}<{stag}>{cltag}{nl}' - if len(cs)==1 and not isinstance(cs[0],(list,tuple,FT)) and not hasattr(cs[0],'__ft__'): - return f'{sp}<{stag}>{esc_fn(cs[0])}{cltag}{nl}' + sattrs = ' '.join(_to_attr(k, v) for k, v in attrs.items() if v not in (False, None, '')) + stag += f' {sattrs}' if sattrs else stag + + cltag = '' if is_void else f'' + + if not cs: + if is_void: return f'{sp}<{stag}>{nl_end}' + else: return f'{sp}<{stag}>{cltag}{nl_end}' + if len(cs) == 1 and not isinstance(cs[0], (list, tuple, FT)) and not hasattr(cs[0], '__ft__'): + content = esc_fn(cs[0]) + return f'{sp}<{stag}>{content}{cltag}{nl_end}' + res = f'{sp}<{stag}>{nl}' - res += ''.join(_to_xml(c, lvl=lvl+2, indent=indent, do_escape=do_escape) for c in cs) - if not isvoid: res += f'{sp}{cltag}{nl}' + for c in cs: + res += _to_xml(c, lvl=lvl+2 if indent else 0, indent=indent, do_escape=do_escape) + if not is_void: res += f'{sp}{cltag}{nl_end}' return Safe(res) -def to_xml(elm, lvl=0, indent:bool=True, do_escape:bool=True): +# %% ../nbs/11_xml.ipynb +def to_xml(elm, lvl=0, indent=True, do_escape=True): "Convert `ft` element tree into an XML string" return Safe(_to_xml(elm, lvl, indent, do_escape=do_escape)) diff --git a/nbs/11_xml.ipynb b/nbs/11_xml.ipynb index 8c05eb6e..bc88403c 100644 --- a/nbs/11_xml.ipynb +++ b/nbs/11_xml.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "id": "d0c84aa5", + "execution_count": 1, + "id": "8a46bf54", "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "61bf8203", + "id": "3e03bb3d", "metadata": {}, "source": [ "# XML\n", @@ -22,8 +22,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "55944805", + "execution_count": 2, + "id": "f6c9a7f5", "metadata": {}, "outputs": [], "source": [ @@ -40,18 +40,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "42d18e5c", "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown\n", - "from pprint import pprint" + "from pprint import pprint\n", + "\n", + "from fastcore.test import test_eq" + ] + }, + { + "cell_type": "markdown", + "id": "a7448678", + "metadata": {}, + "source": [ + "## FT functions" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "159d3560", "metadata": {}, "outputs": [], @@ -68,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "6f000a63", "metadata": {}, "outputs": [], @@ -82,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "ddc7d705", "metadata": {}, "outputs": [], @@ -99,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "df5d12c7", "metadata": {}, "outputs": [], @@ -113,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "b06c10f6", "metadata": {}, "outputs": [], @@ -154,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "06718948", "metadata": {}, "outputs": [], @@ -167,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "45489975", "metadata": {}, "outputs": [], @@ -190,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "306844ba", "metadata": {}, "outputs": [ @@ -200,7 +210,7 @@ "body((div(('hi',),{'a': 1, 'b': True, 'class': None}), p(('hi',),{'class': 'a 1', 'style': 'a:1; b:2'})),{})" ] }, - "execution_count": null, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -212,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "500f358a", "metadata": {}, "outputs": [ @@ -222,7 +232,7 @@ "body((div(('hi',),{'a': 1, 'b': True, 'class': None}), p(('hi',),{'class': 'a 1', 'style': 'a:1; b:2'}), p(('a',),{}), p(('b',),{})),{})" ] }, - "execution_count": null, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "39834fcb", "metadata": {}, "outputs": [], @@ -258,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "9a8b4ddb", "metadata": {}, "outputs": [ @@ -283,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "5c6c57e9", "metadata": {}, "outputs": [ @@ -314,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "5c7f175d", "metadata": {}, "outputs": [ @@ -331,7 +341,7 @@ "p(('Some text',),{'id': 'newid'})" ] }, - "execution_count": null, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -344,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "116c886e", "metadata": {}, "outputs": [], @@ -355,30 +365,28 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "254c8ff3", + "cell_type": "markdown", + "id": "99c9e1cc", "metadata": {}, - "outputs": [], "source": [ - "#| export\n", - "def _escape(s): return '' if s is None else s.__html__() if hasattr(s, '__html__') else escape(s) if isinstance(s, str) else s" + "## Conversion to XML/HTML" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f302aac1", + "execution_count": 28, + "id": "254c8ff3", "metadata": {}, "outputs": [], "source": [ "#| export\n", + "def _escape(s): return '' if s is None else s.__html__() if hasattr(s, '__html__') else escape(s) if isinstance(s, str) else s\n", "def _noescape(s): return '' if s is None else s.__html__() if hasattr(s, '__html__') else s" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "0255b96f", "metadata": {}, "outputs": [], @@ -400,40 +408,79 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "b89d088a", + "execution_count": 30, + "id": "ea224c94", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "_block_tags = {'div', 'p', 'ul', 'ol', 'li', 'table', 'thead', 'tbody', 'tfoot',\n", + " 'html', 'head', 'body', 'meta', '!doctype', 'input', 'script', 'link', 'style',\n", + " 'tr', 'th', 'td', 'section', 'article', 'nav', 'aside', 'header',\n", + " 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote'}\n", + "_inline_tags = {'a', 'span', 'b', 'i', 'u', 'em', 'strong', 'img', 'br', 'small',\n", + " 'big', 'sub', 'sup', 'label', 'input', 'select', 'option'}\n", + "\n", + "def _is_whitespace_significant(elm):\n", + " return elm.tag in {'pre', 'code', 'textarea', 'script'} or elm.get('contenteditable') == 'true'" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a1ed9c01", "metadata": {}, "outputs": [], "source": [ "#| export\n", - "def _to_xml(elm, lvl, indent, do_escape):\n", + "def _to_xml(elm, lvl=0, indent=True, do_escape=True):\n", + " \"Convert `FT` element tree into an XML string\"\n", " esc_fn = _escape if do_escape else _noescape\n", - " nl = '\\n'\n", - " if not indent: lvl,nl = 0,''\n", " if elm is None: return ''\n", " if hasattr(elm, '__ft__'): elm = elm.__ft__()\n", - " if isinstance(elm, tuple): return f'{nl}'.join(_to_xml(o, lvl=lvl, indent=indent, do_escape=do_escape) for o in elm)\n", + " if isinstance(elm, tuple):\n", + " return ''.join(_to_xml(o, lvl=lvl, indent=indent, do_escape=do_escape) for o in elm)\n", " if isinstance(elm, bytes): return elm.decode('utf-8')\n", - " sp = ' ' * lvl\n", - " if not isinstance(elm, FT): return f'{esc_fn(elm)}{nl}'\n", + " if not isinstance(elm, FT): return f'{esc_fn(elm)}'\n", + "\n", + " tag, cs, attrs = elm.list\n", + " is_void = getattr(elm, 'void_', False)\n", + " is_block = tag in _block_tags\n", + " if _is_whitespace_significant(elm): indent = False\n", + "\n", + " sp,nl = (' ' * lvl,'\\n') if indent and is_block else ('','')\n", + " nl_end = nl\n", "\n", - " tag,cs,attrs = elm.list\n", " stag = tag\n", " if attrs:\n", - " sattrs = (_to_attr(k,v) for k,v in attrs.items())\n", - " stag += ' ' + ' '.join(sattrs)\n", + " sattrs = ' '.join(_to_attr(k, v) for k, v in attrs.items() if v not in (False, None, ''))\n", + " stag += f' {sattrs}' if sattrs else stag\n", "\n", - " isvoid = getattr(elm, 'void_', False)\n", - " cltag = '' if isvoid else f''\n", - " if not cs: return f'{sp}<{stag}>{cltag}{nl}'\n", - " if len(cs)==1 and not isinstance(cs[0],(list,tuple,FT)) and not hasattr(cs[0],'__ft__'):\n", - " return f'{sp}<{stag}>{esc_fn(cs[0])}{cltag}{nl}'\n", - " res = f'{sp}<{stag}>{nl}'\n", - " res += ''.join(_to_xml(c, lvl=lvl+2, indent=indent, do_escape=do_escape) for c in cs)\n", - " if not isvoid: res += f'{sp}{cltag}{nl}'\n", - " return Safe(res)\n", + " cltag = '' if is_void else f''\n", "\n", - "def to_xml(elm, lvl=0, indent:bool=True, do_escape:bool=True):\n", + " if not cs:\n", + " if is_void: return f'{sp}<{stag}>{nl_end}'\n", + " else: return f'{sp}<{stag}>{cltag}{nl_end}'\n", + " if len(cs) == 1 and not isinstance(cs[0], (list, tuple, FT)) and not hasattr(cs[0], '__ft__'):\n", + " content = esc_fn(cs[0])\n", + " return f'{sp}<{stag}>{content}{cltag}{nl_end}'\n", + "\n", + " res = f'{sp}<{stag}>{nl}'\n", + " for c in cs:\n", + " res += _to_xml(c, lvl=lvl+2 if indent else 0, indent=indent, do_escape=do_escape)\n", + " if not is_void: res += f'{sp}{cltag}{nl_end}'\n", + " return Safe(res)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "dd054392", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def to_xml(elm, lvl=0, indent=True, do_escape=True):\n", " \"Convert `ft` element tree into an XML string\"\n", " return Safe(_to_xml(elm, lvl, indent, do_escape=do_escape))\n", "\n", @@ -442,7 +489,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, + "id": "c2a921a2", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "test_eq(to_xml(Div(\"Hello\")), '
Hello
\\n')\n", + "test_eq(to_xml(P(\"Text\", Class=\"test\")), '

Text

\\n')\n", + "test_eq(to_xml(Div(P(\"Nested\"))), '
\\n

Nested

\\n
\\n')\n", + "test_eq(to_xml(Pre(\" Whitespace\\n Significant \")), '
  Whitespace\\n  Significant  
')\n", + "test_eq(to_xml(Img(src=\"image.jpg\")), '')\n", + "test_eq(to_xml(Div(\"Text\", contenteditable=\"true\")), '
Text
')\n", + "test_eq(to_xml(None), '')\n", + "test_eq(to_xml((\"Text\", P(\"Paragraph\"))), 'Text

Paragraph

\\n')\n", + "test_eq(to_xml(b\"Bytes\"), 'Bytes')\n", + "test_eq(to_xml(Div(P(\"Text\"), B(\"Bold\")), indent=False), '

Text

Bold
')\n", + "test_eq(to_xml(Div(\"\"), do_escape=True),\n", + " '
<script>alert('XSS')</script>
\\n')\n", + "test_eq(to_xml(Div(\"\"), do_escape=False),\n", + " \"
\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "eef16b38", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "test_eq(to_xml(B('Bold Text')), 'Bold Text')\n", + "test_eq(to_xml(Div(P('Paragraph Text'))), '
\\n

Paragraph Text

\\n
\\n')\n", + "test_eq(to_xml(Pre(' Preformatted\\n Text')), '
   Preformatted\\n   Text
')\n", + "editable_div = Div('Editable Content', contenteditable='true')\n", + "test_eq(to_xml(editable_div), '
Editable Content
')\n", + "test_eq(to_xml(Div(Span('Inline Text'), P('Paragraph'))),\n", + " '
\\nInline Text

Paragraph

\\n
\\n')\n", + "test_eq(to_xml(Br()), '
')\n", + "test_eq(to_xml(P(None)), '

\\n')\n", + "test_eq(to_xml(Div()), '
\\n')\n", + "test_eq(to_xml(Input(type='text', disabled=True)), '\\n')\n", + "special_attr_tag = Div(id='main\"div', data_info=\"Some 'info'\")\n", + "expected_special_attr = \"
\\n\"\n", + "test_eq(to_xml(special_attr_tag), expected_special_attr)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "id": "d3d23c48", "metadata": {}, "outputs": [ @@ -451,18 +546,14 @@ "output_type": "stream", "text": [ "\n", - "\n", "\n", " \n", - " Some page\n", - " \n", + "Some page \n", " \n", "
\n", "Some text\n", - "another line\n", - " \n", - " \n", - "
\n", + "another line \n", + " \n", " \n", "\n", "\n" @@ -476,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "85a16341", "metadata": {}, "outputs": [], @@ -500,7 +591,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, + "id": "61d63a79", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "
\n", + "
\n", + "

Hello

\n", + "
hello
\n", + "
\n", + "
\n", + "\n" + ] + } + ], + "source": [ + "print(h)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "id": "51e58821", "metadata": {}, "outputs": [ @@ -528,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "798ae1d2", "metadata": {}, "outputs": [ @@ -553,6 +668,14 @@ "print(_esc(Div(P('Hello from fastcore <3'))))" ] }, + { + "cell_type": "markdown", + "id": "5ad30d7c", + "metadata": {}, + "source": [ + "## Display" + ] + }, { "cell_type": "code", "execution_count": null, @@ -602,11 +725,8 @@ "```html\n", "\n", "
\n", - "Some text 1<2\n", - " in italics\n", - " \n", - " \n", - "
\n", + "Some text 1<2in italics \n", + " \n", "\n", "\n", "```" @@ -656,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "ad32b076", "metadata": {}, "outputs": [], @@ -676,9 +796,21 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" } }, "nbformat": 4, diff --git a/settings.ini b/settings.ini index 71da2e2a..dbf46a3d 100644 --- a/settings.ini +++ b/settings.ini @@ -8,7 +8,7 @@ author = Jeremy Howard and Sylvain Gugger author_email = infos@fast.ai copyright = fast.ai branch = master -version = 1.7.7 +version = 1.7.8 min_python = 3.8 audience = Developers language = English