From f3499c6efd7190ab87795dd0c6770df2be443910 Mon Sep 17 00:00:00 2001 From: Rick Littel Date: Tue, 14 Feb 2023 19:33:53 +0100 Subject: [PATCH] Pass arguments to parser modules (#446) * Add optional arguments to pass to the parser module when parsing documents and fragments * Add tests for parse_document and parse_fragment with arguments --------- Co-authored-by: Rick Littel --- lib/floki/html_parser.ex | 12 +++-- lib/floki/html_parser/fast_html.ex | 8 ++-- lib/floki/html_parser/html5ever.ex | 4 +- lib/floki/html_parser/mochiweb.ex | 4 +- test/floki_test.exs | 74 ++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 12 deletions(-) diff --git a/lib/floki/html_parser.ex b/lib/floki/html_parser.ex index 55f2e1b3..18e49e70 100644 --- a/lib/floki/html_parser.ex +++ b/lib/floki/html_parser.ex @@ -21,15 +21,19 @@ defmodule Floki.HTMLParser do @default_parser Floki.HTMLParser.Mochiweb - @callback parse_document(binary()) :: {:ok, Floki.html_tree()} | {:error, String.t()} - @callback parse_fragment(binary()) :: {:ok, Floki.html_tree()} | {:error, String.t()} + @callback parse_document(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()} + @callback parse_fragment(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()} def parse_document(html, opts \\ []) do - parser(opts).parse_document(html) + parser_args = opts[:parser_args] || [] + + parser(opts).parse_document(html, parser_args) end def parse_fragment(html, opts \\ []) do - parser(opts).parse_fragment(html) + parser_args = opts[:parser_args] || [] + + parser(opts).parse_fragment(html, parser_args) end defp parser(opts) do diff --git a/lib/floki/html_parser/fast_html.ex b/lib/floki/html_parser/fast_html.ex index dd0431d3..a66691ef 100644 --- a/lib/floki/html_parser/fast_html.ex +++ b/lib/floki/html_parser/fast_html.ex @@ -3,13 +3,13 @@ defmodule Floki.HTMLParser.FastHtml do @moduledoc false @impl true - def parse_document(html) do - execute_with_module(fn module -> module.decode(html) end) + def parse_document(html, args) do + execute_with_module(fn module -> module.decode(html, args) end) end @impl true - def parse_fragment(html) do - execute_with_module(fn module -> module.decode_fragment(html) end) + def parse_fragment(html, args) do + execute_with_module(fn module -> module.decode_fragment(html, args) end) end defp execute_with_module(fun) do diff --git a/lib/floki/html_parser/html5ever.ex b/lib/floki/html_parser/html5ever.ex index 70a2c26e..d8250c87 100644 --- a/lib/floki/html_parser/html5ever.ex +++ b/lib/floki/html_parser/html5ever.ex @@ -4,7 +4,7 @@ defmodule Floki.HTMLParser.Html5ever do @moduledoc false @impl true - def parse_document(html) do + def parse_document(html, _args) do case Code.ensure_loaded(Html5ever) do {:module, module} -> case module.parse(html) do @@ -22,5 +22,5 @@ defmodule Floki.HTMLParser.Html5ever do # NOTE: html5ever does not implement parse_fragment yet. @impl true - def parse_fragment(html), do: parse_document(html) + def parse_fragment(html, args), do: parse_document(html, args) end diff --git a/lib/floki/html_parser/mochiweb.ex b/lib/floki/html_parser/mochiweb.ex index 6bdf0e7d..7ac47dc1 100644 --- a/lib/floki/html_parser/mochiweb.ex +++ b/lib/floki/html_parser/mochiweb.ex @@ -5,7 +5,7 @@ defmodule Floki.HTMLParser.Mochiweb do @root_node "floki" @impl true - def parse_document(html) do + def parse_document(html, _args) do html = "<#{@root_node}>#{html}" {@root_node, [], parsed} = :floki_mochi_html.parse(html) @@ -14,5 +14,5 @@ defmodule Floki.HTMLParser.Mochiweb do # NOTE: mochi_html cannot make a distinction of a fragment and document. @impl true - def parse_fragment(html), do: parse_document(html) + def parse_fragment(html, args), do: parse_document(html, args) end diff --git a/test/floki_test.exs b/test/floki_test.exs index 014fa4ce..f0f131cc 100644 --- a/test/floki_test.exs +++ b/test/floki_test.exs @@ -175,6 +175,80 @@ defmodule FlokiTest do Enum.each(@plain_text_tags, validate_html) end + + @tag only_parser: FastHtml + test "parses all elements as strings by default" do + html = html_body(~s(

Content

Custom
)) + + {:ok, parsed} = Floki.parse_document(html) + + assert [ + { + "html", + [], + [ + {"head", [], []}, + { + "body", + [], + [ + {"div", [], [{"p", [], ["Content"]}, {"custom", [], ["Custom"]}]} + ] + } + ] + } + ] = parsed + end + + @tag only_parser: FastHtml + test "parses known elements as atoms when :html_atoms format argument is given" do + html = html_body(~s(

Content

Custom
)) + + {:ok, parsed} = Floki.parse_document(html, parser_args: [format: [:html_atoms]]) + + assert [ + { + :html, + [], + [ + {:head, [], []}, + { + :body, + [], + [ + {:div, [], [{:p, [], ["Content"]}, {"custom", [], ["Custom"]}]} + ] + } + ] + } + ] == parsed + end + end + + describe "parse_fragment/2" do + @tag only_parser: FastHtml + test "does not parse a table row with missing parent table tag by default" do + html = "Column 1Column 2" + + {:ok, parsed} = Floki.parse_fragment(html) + + assert ["Column 1Column 2"] == parsed + end + + @tag only_parser: FastHtml + test "parses a table row with missing parent table tag when table context is given" do + html = "12" + + {:ok, parsed} = Floki.parse_fragment(html, parser_args: [context: "table"]) + + assert [ + { + "tbody", + [], + [{"tr", [], [{"td", [], ["1"]}, {"td", [], ["2"]}]}] + } + ] == parsed + end end # Floki.raw_html/2