From deb580732abefcfa4aaeb6ca7212c924af23c202 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Wed, 27 Dec 2023 21:00:37 -0300 Subject: [PATCH] Optimize HTMLTree.build (#511) * Avoid updating existing entries on tree.nodes * Don't use HTMLTree on build_tree --- lib/floki/html_tree.ex | 219 ++++++++++++++++++++++++----------------- 1 file changed, 129 insertions(+), 90 deletions(-) diff --git a/lib/floki/html_tree.ex b/lib/floki/html_tree.ex index 774a70fd..69d732bd 100644 --- a/lib/floki/html_tree.ex +++ b/lib/floki/html_tree.ex @@ -31,75 +31,92 @@ defmodule Floki.HTMLTree do root_id = IDSeeder.seed([]) root_node = %HTMLNode{type: tag, attributes: attrs, node_id: root_id} - build_tree( - %HTMLTree{root_nodes_ids: [root_id], node_ids: [root_id], nodes: %{root_id => root_node}}, - children, - root_id, - [] - ) + {node_ids, nodes} = + build_tree( + [root_id], + [], + children, + root_node, + [] + ) + + %HTMLTree{ + root_nodes_ids: [root_id], + node_ids: node_ids, + nodes: Map.new(nodes) + } end def build(html_tuples) when is_list(html_tuples) do reducer = fn - {:pi, _, _}, tree -> - tree - - {tag, attrs, children}, tree -> - root_id = IDSeeder.seed(tree.node_ids) + {:pi, _, _}, state -> + state + {tag, attrs, children}, {root_nodes_ids, node_ids, nodes} -> + root_id = IDSeeder.seed(node_ids) root_node = %HTMLNode{type: tag, attributes: attrs, node_id: root_id} - build_tree( - %{ - tree - | nodes: Map.put(tree.nodes, root_id, root_node), - node_ids: [root_id | tree.node_ids], - root_nodes_ids: [root_id | tree.root_nodes_ids] - }, - children, - root_id, - [] - ) - - text, tree when is_binary(text) -> - root_id = IDSeeder.seed(tree.node_ids) + root_nodes_ids = [root_id | root_nodes_ids] + node_ids = [root_id | node_ids] + {node_ids, nodes} = + build_tree( + node_ids, + nodes, + children, + root_node, + [] + ) + + {root_nodes_ids, node_ids, nodes} + + text, {root_nodes_ids, node_ids, nodes} when is_binary(text) -> + root_id = IDSeeder.seed(node_ids) root_node = %Text{content: text, node_id: root_id} - build_tree( - %{ - tree - | nodes: Map.put(tree.nodes, root_id, root_node), - node_ids: [root_id | tree.node_ids], - root_nodes_ids: [root_id | tree.root_nodes_ids] - }, - [], - root_id, - [] - ) - - {:comment, comment}, tree -> - root_id = IDSeeder.seed(tree.node_ids) + root_nodes_ids = [root_id | root_nodes_ids] + node_ids = [root_id | node_ids] + + {node_ids, nodes} = + build_tree( + node_ids, + nodes, + [], + root_node, + [] + ) + + {root_nodes_ids, node_ids, nodes} + {:comment, comment}, {root_nodes_ids, node_ids, nodes} -> + root_id = IDSeeder.seed(node_ids) root_node = %Comment{content: comment, node_id: root_id} - build_tree( - %{ - tree - | nodes: Map.put(tree.nodes, root_id, root_node), - node_ids: [root_id | tree.node_ids], - root_nodes_ids: [root_id | tree.root_nodes_ids] - }, - [], - root_id, - [] - ) - - _, tree -> - tree + root_nodes_ids = [root_id | root_nodes_ids] + node_ids = [root_id | node_ids] + + {node_ids, nodes} = + build_tree( + node_ids, + nodes, + [], + root_node, + [] + ) + + {root_nodes_ids, node_ids, nodes} + + _, state -> + state end - Enum.reduce(html_tuples, %HTMLTree{}, reducer) + {root_nodes_ids, node_ids, nodes} = Enum.reduce(html_tuples, {[], [], []}, reducer) + + %HTMLTree{ + root_nodes_ids: root_nodes_ids, + node_ids: node_ids, + nodes: Map.new(nodes) + } end def build(_), do: %HTMLTree{} @@ -174,70 +191,92 @@ defmodule Floki.HTMLTree do defp get_ids_for_delete_stack(%HTMLNode{children_nodes_ids: ids}), do: ids defp get_ids_for_delete_stack(_), do: [] - defp build_tree(tree, [], _, []), do: tree + defp build_tree(node_ids, nodes, [], parent_node, []) do + {node_ids, [{parent_node.node_id, parent_node} | nodes]} + end - defp build_tree(tree, [{:pi, _, _} | children], parent_id, stack), - do: build_tree(tree, children, parent_id, stack) + defp build_tree(node_ids, nodes, [{:pi, _, _} | children], parent_node, stack) do + build_tree(node_ids, nodes, children, parent_node, stack) + end - defp build_tree(tree, [{tag, attrs, child_children} | children], parent_id, stack) do - new_id = IDSeeder.seed(tree.node_ids) + defp build_tree(node_ids, nodes, [{tag, attrs, child_children} | children], parent_node, stack) do + new_id = IDSeeder.seed(node_ids) - new_node = %HTMLNode{type: tag, attributes: attrs, node_id: new_id, parent_node_id: parent_id} + new_node = %HTMLNode{ + type: tag, + attributes: attrs, + node_id: new_id, + parent_node_id: parent_node.node_id + } - nodes = put_new_node(tree.nodes, new_node) + parent_node = %{ + parent_node + | children_nodes_ids: [new_id | parent_node.children_nodes_ids] + } build_tree( - %{tree | nodes: nodes, node_ids: [new_id | tree.node_ids]}, + [new_id | node_ids], + nodes, child_children, - new_id, - [{parent_id, children} | stack] + new_node, + [{parent_node, children} | stack] ) end - defp build_tree(tree, [{:comment, comment} | children], parent_id, stack) do - new_id = IDSeeder.seed(tree.node_ids) - new_node = %Comment{content: comment, node_id: new_id, parent_node_id: parent_id} + defp build_tree(node_ids, nodes, [{:comment, comment} | children], parent_node, stack) do + new_id = IDSeeder.seed(node_ids) + new_node = %Comment{content: comment, node_id: new_id, parent_node_id: parent_node.node_id} - nodes = put_new_node(tree.nodes, new_node) + parent_node = %{ + parent_node + | children_nodes_ids: [new_id | parent_node.children_nodes_ids] + } build_tree( - %{tree | nodes: nodes, node_ids: [new_id | tree.node_ids]}, + [new_id | node_ids], + [{new_id, new_node} | nodes], children, - parent_id, + parent_node, stack ) end - defp build_tree(tree, [text | children], parent_id, stack) when is_binary(text) do - new_id = IDSeeder.seed(tree.node_ids) - new_node = %Text{content: text, node_id: new_id, parent_node_id: parent_id} + defp build_tree(node_ids, nodes, [text | children], parent_node, stack) when is_binary(text) do + new_id = IDSeeder.seed(node_ids) + new_node = %Text{content: text, node_id: new_id, parent_node_id: parent_node.node_id} - nodes = put_new_node(tree.nodes, new_node) + parent_node = %{ + parent_node + | children_nodes_ids: [new_id | parent_node.children_nodes_ids] + } build_tree( - %{tree | nodes: nodes, node_ids: [new_id | tree.node_ids]}, + [new_id | node_ids], + [{new_id, new_node} | nodes], children, - parent_id, + parent_node, stack ) end - defp build_tree(tree, [_other | children], parent_id, stack) do - build_tree(tree, children, parent_id, stack) - end - - defp build_tree(tree, [], _, [{parent_node_id, children} | stack]) do - build_tree(tree, children, parent_node_id, stack) + defp build_tree(node_ids, nodes, [_other | children], parent_node, stack) do + build_tree(node_ids, nodes, children, parent_node, stack) end - defp put_new_node(nodes, new_node) do - parent_node = Map.get(nodes, new_node.parent_node_id) - children_ids = parent_node.children_nodes_ids - updated_parent = %{parent_node | children_nodes_ids: [new_node.node_id | children_ids]} - - nodes - |> Map.put(new_node.node_id, new_node) - |> Map.put(new_node.parent_node_id, updated_parent) + defp build_tree( + node_ids, + nodes, + [], + parent_node, + [{next_parent_node, next_parent_children} | stack] + ) do + build_tree( + node_ids, + [{parent_node.node_id, parent_node} | nodes], + next_parent_children, + next_parent_node, + stack + ) end def patch_nodes(html_tree, operation_with_nodes) do