From adee38d20d981fdc7be95eba54030ef040b43c83 Mon Sep 17 00:00:00 2001 From: Andrea Amantini Date: Wed, 10 Apr 2024 11:04:00 +0200 Subject: [PATCH] Use open inline parsing --- deps.edn | 3 + notebooks/benchmarks.clj | 59 +++++++++++-------- src/nextjournal/markdown/parser2.clj | 32 ++-------- src/nextjournal/markdown/parser2/formulas.clj | 39 ++++++++++++ 4 files changed, 82 insertions(+), 51 deletions(-) create mode 100644 src/nextjournal/markdown/parser2/formulas.clj diff --git a/deps.edn b/deps.edn index 40efb69..4bd370f 100644 --- a/deps.edn +++ b/deps.edn @@ -47,6 +47,9 @@ :git/sha "e8f275b5cf077ec9441e404c1885ff0b6ee0aef9" :deps/root "render"}}} + :commonmark-java-local + {:extra-paths ["../commonmark-java/commonmark/target/classes"]} + :build {:ns-default build :deps {io.github.clojure/tools.build {:git/tag "v0.6.1" :git/sha "515b334"} diff --git a/notebooks/benchmarks.clj b/notebooks/benchmarks.clj index 9ea387a..3c8fab7 100644 --- a/notebooks/benchmarks.clj +++ b/notebooks/benchmarks.clj @@ -4,6 +4,7 @@ [nextjournal.clerk :as clerk] [nextjournal.clerk.eval :as clerk.eval] [nextjournal.markdown :as md] + [nextjournal.markdown.parser2 :as parser2] parsing-extensibility [nextjournal.markdown.parser :as md.parser])) @@ -22,30 +23,40 @@ (md.parser/parse (update md.parser/empty-doc :text-tokenizers concat extra-tokenizers) (md/tokenize text)))) -;; Default set of tokenizers -(time-ms (parse reference-text)) - -;; With an extra brace-brace parser -(time-ms (parse [{:regex #"\{\{([^\{]+)\}\}" - :handler (fn [m] {:type :var :text (m 1)})}] - reference-text)) - -;; With the losange reader -(time-ms (parse [{:regex #"\{\{([^\{]+)\}\}" - :handler (fn [m] {:type :var :text (m 1)})} - {:tokenizer-fn parsing-extensibility/losange-tokenizer-fn - :handler (fn [data] {:type :losange :data data})}] - reference-text)) - -;; With hashtags and internal links -(time-ms - (parse [md.parser/hashtag-tokenizer - md.parser/internal-link-tokenizer - {:regex #"\{\{([^\{]+)\}\}" - 
:handler (fn [m] {:type :var :text (m 1)})} - {:tokenizer-fn parsing-extensibility/losange-tokenizer-fn - :handler (fn [data] {:type :losange :data data})}] - reference-text)) +(comment + + ;; Default set of tokenizers + (time-ms (parse reference-text)) + (time-ms (parser2/parse reference-text)) + + (-> (parse reference-text) + :content count ) + + (-> (parser2/parse reference-text) + :content count ) + + + ;; With an extra brace-brace parser + (time-ms (parse [{:regex #"\{\{([^\{]+)\}\}" + :handler (fn [m] {:type :var :text (m 1)})}] + reference-text)) + + ;; With the losange reader + (time-ms (parse [{:regex #"\{\{([^\{]+)\}\}" + :handler (fn [m] {:type :var :text (m 1)})} + {:tokenizer-fn parsing-extensibility/losange-tokenizer-fn + :handler (fn [data] {:type :losange :data data})}] + reference-text)) + + ;; With hashtags and internal links + (time-ms + (parse [md.parser/hashtag-tokenizer + md.parser/internal-link-tokenizer + {:regex #"\{\{([^\{]+)\}\}" + :handler (fn [m] {:type :var :text (m 1)})} + {:tokenizer-fn parsing-extensibility/losange-tokenizer-fn + :handler (fn [data] {:type :losange :data data})}] + reference-text))) ^{::clerk/visibility {:code :hide :result :hide}} (comment diff --git a/src/nextjournal/markdown/parser2.clj b/src/nextjournal/markdown/parser2.clj index e90427f..ec46531 100644 --- a/src/nextjournal/markdown/parser2.clj +++ b/src/nextjournal/markdown/parser2.clj @@ -3,7 +3,8 @@ [clojure.zip :as z] [nextjournal.markdown.parser :as parser] [nextjournal.markdown.parser2.types] - [nextjournal.markdown.parser2.footnotes :as footnotes]) + [nextjournal.markdown.parser2.footnotes :as footnotes] + [nextjournal.markdown.parser2.formulas :as formulas]) (:import (org.commonmark.parser Parser Parser$ParserExtension Parser$Builder) (org.commonmark.parser.delimiter DelimiterProcessor) (org.commonmark.ext.task.list.items TaskListItemsExtension TaskListItemMarker) @@ -49,29 +50,6 @@ ;; - [ ] promote single images as blocks ;; - [ ] [[TOC]] (although not 
used in Clerk)
 
-(def InlineFormulaExtension
-  (proxy [Object Parser$ParserExtension] []
-    (extend [^Parser$Builder pb]
-      (.customDelimiterProcessor
-        pb
-        (proxy [Object DelimiterProcessor] []
-          (getOpeningCharacter [] \$)
-          (getClosingCharacter [] \$)
-          (getMinLength [] 1)
-          (process [open close]
-            (if (and (= 1 (.length open))
-                     (= 1 (.length close)))
-              (let [text (str/join
-                          (keep #(when (instance? Text %) (.getLiteral %))
-                                (Nodes/between (.. open getOpener) (.. close getCloser))))]
-                (doseq [^Node n (Nodes/between (.. open getOpener)
-                                               (.. close getCloser))]
-                  (.unlink n))
-                (.. open getOpener
-                    ;; needs a named class `gen-class`
-                    (insertAfter (new InlineFormula text)))
-                1)
-              0)))))))
 
 (comment
   (parse "* this is inline $\\phi$ math
@@ -80,9 +58,9 @@
 (def ^Parser parser
   (.. Parser
       builder
-      (extensions [(TaskListItemsExtension/create)
-                   InlineFormulaExtension
-                   (footnotes/extension)])
+      (extensions [(formulas/extension)
+                   (footnotes/extension)
+                   (TaskListItemsExtension/create)])
       build))
 
 ;; helpers / ctx
diff --git a/src/nextjournal/markdown/parser2/formulas.clj b/src/nextjournal/markdown/parser2/formulas.clj
new file mode 100644
index 0000000..028a172
--- /dev/null
+++ b/src/nextjournal/markdown/parser2/formulas.clj
@@ -0,0 +1,46 @@
+(ns nextjournal.markdown.parser2.formulas
+  (:import (nextjournal.markdown.parser2.types InlineFormula)
+           (org.commonmark.node Node)
+           (org.commonmark.internal InlineParserImpl)
+           (org.commonmark.internal.inline InlineContentParser InlineParserState ParsedInline)
+           (org.commonmark.parser InlineParserFactory Parser Parser$ParserExtension Parser$Builder)))
+
+(defn inline-formula-parser
+  "Returns an InlineContentParser whose `tryParse` wraps the text found between an
+  opening and a closing run of `$` in an InlineFormula node. Both runs must have
+  the same length; in every other case the result is `ParsedInline/none`."
+  []
+  (proxy [InlineContentParser] []
+    (tryParse [^InlineParserState parser-state]
+      (let [scanner (.scanner parser-state)
+            dollars-open (.matchMultiple scanner \$)
+            after-opening (.position scanner)]
+        ;; NOTE(review): relies on Scanner.find returning a positive count when a later `$` exists - confirm
+        (if (< 0 (.find scanner \$))
+          (let [before-closing (.position scanner)
+                dollars-close (.matchMultiple scanner \$)]
+            (if (= dollars-open dollars-close)
+              (let [^String source (.getContent (.getSource scanner after-opening before-closing))]
+                (ParsedInline/of (new InlineFormula source) (.position scanner)))
+              ;; opening/closing runs differ: answer `none` explicitly instead of an implicit nil
+              (ParsedInline/none)))
+          (ParsedInline/none))))))
+
+(defn extension
+  "Returns a Parser$ParserExtension that sets an InlineParserFactory on the builder;
+  the factory creates an InlineParserImpl with `inline-formula-parser` added for `$`."
+  []
+  (proxy [Object Parser$ParserExtension] []
+    (extend [^Parser$Builder pb]
+      (.inlineParserFactory pb (proxy [InlineParserFactory] []
+                                 (create [ctx]
+                                   (.addInlineParser (new InlineParserImpl ctx)
+                                                     \$ (list (inline-formula-parser)))))))))
+
+(comment
+
+  (nextjournal.markdown.parser2/parse "
+# Ok
+Aloha, that costs
+* a $\\int_a^b\\phi(t)dt$ with discount
+* and what"))