Include working MathJax server in docker image (#258)

## Summary of changes
- Include MathJax server deps (Node and JS libs) in the Docker image (resolves #256)
- Fix a bug in parsing JSON data in the `tex2mml` endpoint (resolves #257)
- Update `docker-compose.img2mml.yml` to include a networked MathJax service and a unified REST API for the latex2mml + img2mml services (with examples in Python)
- Update the development environment setup instructions
- The MathML response is now XML (avoids the funky escapes previously encountered)
ml4ai · Jun 23, 2023 · 04eb5ee · 04eb5ee
1 parent 085e0b4
commit 04eb5ee
Show file tree

Hide file tree

Showing 15 changed files with 262 additions and 33 deletions.
diff --git a/Dockerfile.skema-py b/Dockerfile.skema-py
@@ -13,11 +13,15 @@ RUN apt-get update &&\
         build-essential \
         graphviz \
         libgraphviz-dev \
-        python3-venv  &&\
-    # The two commands below are to reduce the size of the Docker image
-    apt-get clean &&\
-    rm -rf /var/lib/apt/lists/*
+        python3-venv 
+
+# Node needed for img2mml
+# Clean the apt cache in the same RUN as the install: a cleanup issued in a
+# later layer cannot shrink the layers that were already committed.
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - &&\
+    apt-get install -y nodejs &&\
+    apt-get clean &&\
+    rm -rf /var/lib/apt/lists/*
 
 # =====================
 # Setup the repository
@@ -44,9 +48,11 @@ RUN tree /app
 #RUN pip install ".[all]"
 # exclude dependencies for docs
 RUN pip install ".[core,dev]"
-
 # Build tree-sitter-fortran grammar required by Fortran code2fn
+RUN skema-tree-sitter-build-fortran-grammar
 #RUN python /app/skema/program_analysis/TS2CAST/build_tree_sitter_fortran.py
 
+# img2mml JS dependencies for MathJax server
+RUN (cd skema/img2mml/data_generation && npm install)
 
 #CMD ["uvicorn", "skema.skema_py.server:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
@@ -1,4 +1,8 @@
-![](http://ci.kraken.sista.arizona.edu/api/badges/ml4ai/skema/status.svg)
+![](http://ci.kraken.sista.arizona.edu/api/badges/ml4ai/skema/status.svg)  
+![Docker lumai/askem-skema-py Image Version (latest by date)](https://img.shields.io/docker/v/lumai/askem-skema-py?sort=date&logo=docker&label=lumai%2Faskem-skema-py)  
+![Docker lumai/askem-skema-img2mml Image Version (latest by date)](https://img.shields.io/docker/v/lumai/askem-skema-img2mml?sort=date&logo=docker&label=lumai%2Faskem-skema-img2mml)  
+![Docker lumai/askem-skema-rs Image Version (latest by date)](https://img.shields.io/docker/v/lumai/askem-skema-rs?sort=date&logo=docker&label=lumai%2Faskem-skema-rs)  
+![Docker lumai/askem-skema-text-reading Image Version (latest by date)](https://img.shields.io/docker/v/lumai/askem-skema-text-reading?sort=date&logo=docker&label=lumai%2Faskem-skema-text-reading)
 
 # SKEMA: Scientific Knowledge Extraction and Model Analysis
 
@@ -97,6 +101,6 @@ To run the img2mml Dockerized service, run
 docker-compose -f docker-compose.img2mml.yml up --build
 ```
 
-(make sure the appropriate img2mml model is in the
-`skema/img2mml/trained_models` directory - see the `README.md` file in
+(if you're not using a published docker image with the default model, make sure that the appropriate img2mml model is in the
+`skema/img2mml/trained_models` directory before building - see the `README.md` file in
 `skema/img2mml` for details)
diff --git a/docker-compose.img2mml.yml b/docker-compose.img2mml.yml
@@ -1,12 +1,37 @@
 # Docker Compose file for the img2mml service
 version: "3"
 services:
-  img2mml:
-    image: lumai/askem-skema-img2mml:local
+  eq2mml:
+    #image: lumai/askem-skema-py:latest
+    # ... or to build locally:
+    image: lumai/askem-skema-eq2mml:local
     build:
       context: .
       dockerfile: Dockerfile.skema-py
+    depends_on:
+      # FIXME: add `condition: service_healthy` and healthcheck
+      - mathjax
     # open browser to http://localhost:8000/docs
-    entrypoint: uvicorn skema.img2mml.img2mml:app --host=0.0.0.0 --port 8000
+    entrypoint: uvicorn skema.img2mml.eq2mml:app --host=0.0.0.0 --port 8000
     ports:
       - "8000:8000" # Change port mapping appropriately before deploying.
+    environment:
+      - "SKEMA_MATHJAX_HOST=mathjax"
+      - "SKEMA_MATHJAX_PORT=8031"
+  # example cmd: curl -X POST http://0.0.0.0:8031/tex2mml \
+  # -H "Content-Type: application/json" \
+  # -d '{"tex_src": "E = mc^{2}"}'
+  mathjax:
+    #image: lumai/askem-skema-py:latest
+    # ... or to build locally:
+    image: lumai/askem-skema-mathjax:local
+    build:
+      context: .
+      dockerfile: Dockerfile.skema-py
+    environment:
+      - "SKEMA_MATHJAX_HOST=0.0.0.0"
+      - "SKEMA_MATHJAX_PORT=8031"
+    ports:
+      - "8031:8031"
+    working_dir: /app/skema/img2mml/data_generation
+    entrypoint: ["npm", "start"]
diff --git a/docs/dev/env.md b/docs/dev/env.md
@@ -4,7 +4,11 @@
 We recommend configuring your local development environment using [`conda`](https://docs.conda.io/en/latest/miniconda.html):
 
 ```bash
-conda create -n skema python=3.8 -c conda-forge rust=1.70.0 openjdk=11 sbt=1.9.0
+conda create -n skema python=3.8 -c conda-forge rust=1.70.0 openjdk=11 sbt=1.9.0 nodejs=18.15.0
 conda activate skema
 pip install -e ".[all]"
+# fortran grammar for pa
+skema-tree-sitter-build-fortran-grammar
+# mathjax deps for img2mml
+(cd skema/img2mml/data_generation && npm install)
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -72,13 +72,14 @@ all = ["skema[core]", "skema[dev]", "skema[doc]"]
 "skema.skema_py" = "skema/skema_py"
 # re-map skema/text_reading/python to skema.text_reading
 "skema.text_reading.mention_linking" = "skema/text_reading/mention_linking"
+"skema.data" = "skema/data"
 
 # [tool.setuptools.packages]
 # find = {}  # Scan the project directory with the default parameters
 
 [tool.setuptools.package-data]
 # needed to ensure models are included in package/discoverable
-"*" = ["*.json", "vocab.txt", "*.pt"]
+"*" = ["*.json", "vocab.txt", "*.pt", "*.png"]
 
 [tool.setuptools.dynamic]
 readme = {file = ["README.md"], content-type = "text/markdown"}

diff --git a/skema/data/__init__.py b/skema/data/__init__.py
diff --git a/skema/data/eq2mml/__init__.py b/skema/data/eq2mml/__init__.py
@@ -0,0 +1,14 @@
+from typing import Text
+from pathlib import Path
+import base64
+
+# NOTE: generate additional images with https://latex2png.com/
+
+def _img2b64(img_name: Text) -> str:
+  """Return the base64 encoding, as UTF-8 text, of a file in the sibling `images` directory.
+
+  `img_name` is the file name relative to the `images` directory that sits
+  next to this module. The result is a `str` (the encoded bytes are decoded
+  before being returned), not `bytes`.
+  """
+  img_path = Path(__file__).parent / "images" / img_name
+  return base64.b64encode(img_path.read_bytes()).decode("utf-8")
+
+img_b64_bayes_transparent = _img2b64("bayes-rule-transparent.png")
+img_b64_bayes_white_bg = _img2b64("bayes-rule-white-bg.png")
diff --git a/skema/data/eq2mml/images/bayes-rule-transparent.png b/skema/data/eq2mml/images/bayes-rule-transparent.png
diff --git a/skema/data/eq2mml/images/bayes-rule-white-bg.png b/skema/data/eq2mml/images/bayes-rule-white-bg.png
diff --git a/skema/img2mml/api.py b/skema/img2mml/api.py
@@ -38,32 +38,24 @@ def get_mathml_from_file(filepath) -> str:
     return get_mathml_from_bytes(data)
 
 
-def get_mathml_from_latex(eqn) -> str:
+def get_mathml_from_latex(eqn: str) -> str:
     """Read a LaTeX equation string and convert it to presentation MathML"""
 
     # Define the webservice address from the MathJAX service
+    protocol = os.environ.get('SKEMA_MATHJAX_PROTOCOL', 'http://')
     host = os.environ.get('SKEMA_MATHJAX_HOST', '127.0.0.1')
-    port = os.environ.get('SKEMA_MATHJAX_PORT', 8031)
-    webservice = host + ':' + port
+    port = str(os.environ.get('SKEMA_MATHJAX_PORT', 8031))
+    webservice = protocol + host + ':' + port
     print('Connecting to ' + webservice)
 
     # Translate and save each LaTeX string using the NodeJS service for MathJax
     res = requests.post(
         f"{webservice}/tex2mml",
         headers={"Content-type": "application/json"},
-        json={"tex_src": json.dumps(eqn)},
+        json={"tex_src": eqn},
     )
-
     if res.status_code == 200:
-        clean_res = (
-            res.content.decode("utf-8")[1:-1]
-            .replace("\\n", "")
-            .replace('\\"', '"')
-            .replace("\\\\", "\\")
-            .strip()
-        )
-        clean_res = re.sub(r"\s+", " ", clean_res)
-        return clean_res
+        return res.text
     else:
         try:
             res.raise_for_status()

diff --git a/skema/img2mml/data_generation/.gitignore b/skema/img2mml/data_generation/.gitignore
@@ -0,0 +1,2 @@
+node_modules/
+package-lock.json
diff --git a/skema/img2mml/data_generation/mathjax_server.js b/skema/img2mml/data_generation/mathjax_server.js
@@ -59,10 +59,11 @@ function tex2mml(tex) {
 // Process a single TeX equation string into a MathML string
 app.post('/tex2mml', function (req, res) {
     // Access the LaTeX source from the request object
-    var tex_str = JSON.parse(req.body.tex_src)
+    var tex_str = req.body.tex_src
     tex2mml(tex_str)
         .then((data) => {
-            res.send(JSON.stringify(data.mml));
+            res.setHeader('Content-Type', "application/xml");
+            res.send(data.mml);
         })
         .catch((err) => {
             res.send(JSON.stringify(`FAILED::${err}::${tex_str}`));

diff --git a/skema/img2mml/data_generation/package.json b/skema/img2mml/data_generation/package.json
@@ -0,0 +1,18 @@
+{
+  "name": "mathjax-server",
+  "version": "1.0.0",
+  "description": "",
+  "main": "mathjax_server.js",
+  "scripts": {
+    "start": "node mathjax_server.js",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "Apache-2.0",
+  "dependencies": {
+    "express": "^4.18.2",
+    "lodash": "^4.17.21",
+    "mathjax-node": "^2.1.1"
+  }
+}
diff --git a/skema/img2mml/eq2mml.py b/skema/img2mml/eq2mml.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+"""
+REST API for converting LaTeX equations and equation images to MathML.
+LaTeX conversion is delegated to the MathJax service, which can be started with:
+node data_generation/mathjax_server.js
+"""
+
+from typing import Text, Union
+from typing_extensions import Annotated
+from fastapi import FastAPI, Body, File, Response, Request, Query
+from skema.img2mml.api import (get_mathml_from_bytes, get_mathml_from_latex)
+from skema.data.eq2mml import img_b64_bayes_white_bg
+from pydantic import BaseModel, Field
+import base64
+
+# Query-parameter type for a LaTeX equation string; the `examples` mapping
+# supplies two sample values (a simple and a more feature-rich equation).
+EquationQueryParameter = Annotated[
+  Text,
+  Query(
+      examples={
+          "simple": {
+              "summary": "A familiar equation",
+              "description": "A simple equation (mass-energy equivalence)",
+              "value": "E = mc^{2}",
+          },
+          "complex": {
+              "summary": "A more feature-rich equation (Bayes' rule)",
+              "description": "A equation drawing on latex features",
+              "value": "\\frac{P(\\textrm{a } | \\textrm{ b}) \\times P(\\textrm{b})}{P(\\textrm{a})}",
+          }
+      },
+  ),
+]
+
+# Uploaded-file type carrying the raw bytes of an equation PNG.
+# NOTE(review): the commented-out `examples` below references
+# `img_path_bayes_rule_eqn`, which is not among the names imported in the
+# visible part of this module.
+ImageBytes = Annotated[
+  bytes,
+  File(
+      description="bytes of a PNG of an equation",
+      # examples={
+      #     "bayes-rule": {
+      #         "summary": "PNG of Bayes' rule",
+      #         "description": "PNG of Bayes' rule",
+      #         "value": str(img_path_bayes_rule_eqn),
+      #     }
+      # },
+  ),
+]
+
+# Request body accepted by POST /latex/mml: a single LaTeX source string.
+class LatexEquation(BaseModel):
+    tex_src: Text = Field(title="LaTeX equation", description="The LaTeX equation to process")
+    class Config:
+        # Example payload attached to the model's schema; kept in line with
+        # the "E = mc^{2}" sample used by the query-parameter examples.
+        schema_extra = {
+            "example": {
+                "tex_src": "E = mc^{2}",
+            }
+        }
+
+
+app = FastAPI()
+
+def process_latex_equation(eqn: Text) -> Response:
+    """Convert a LaTeX string to MathML and wrap the result in an XML response.
+
+    Shared by the GET and POST `/latex/mml` endpoints.
+    """
+    return Response(
+        content=get_mathml_from_latex(eqn),
+        media_type="application/xml",
+    )
+
+# FIXME: have this test the mathjax endpoint (and perhaps check the pt model loaded)
+@app.get(
+    "/healthcheck",
+    summary="Ping endpoint to test health of service",
+    response_model=Text,
+    status_code=200,
+)
+def healthcheck():
+    # Static liveness reply; no downstream service is contacted here.
+    status_message = "The eq2mml service is running."
+    return status_message
+
+@app.post("/image/mml", summary="Get MathML representation of an equation image")
+async def post_image_to_mathml(data: ImageBytes) -> Response:
+    """
+    Endpoint for generating MathML from an input image.
+
+    ### Python example
+    ```
+    import requests
+
+    files = {
+      "data": open("bayes-rule-white-bg.png", "rb"),
+    }
+    r = requests.post("http://0.0.0.0:8000/image/mml", files=files)
+    print(r.text)
+    ```
+    """
+    # Hand the uploaded PNG bytes to the img2mml conversion and return the
+    # resulting MathML as an XML body.
+    res = get_mathml_from_bytes(data)
+    return Response(content=res, media_type="application/xml")
+
+@app.post("/image/base64/mml", summary="Get MathML representation of an equation image")
+async def post_b64image_to_mathml(request: Request) -> Response:
+    """
+    Endpoint for generating MathML from a base64-encoded input image
+    sent as the raw request body.
+
+    Responds with status 400 if the body cannot be base64-decoded.
+
+    ### Python example
+    ```
+    from pathlib import Path
+    import base64
+    import requests
+
+    url = "http://0.0.0.0:8000/image/base64/mml"
+    with Path("bayes-rule-white-bg.png").open("rb") as infile:
+      img_bytes = infile.read()
+    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
+    r = requests.post(url, data=img_b64)
+    print(r.text)
+    ```
+    """
+    img_b64 = await request.body()
+    try:
+        img_bytes = base64.b64decode(img_b64)
+    except ValueError:
+        # binascii.Error (a ValueError subclass) signals malformed base64,
+        # e.g. incorrect padding; report it as a client error.
+        return Response(
+            content="Request body is not valid base64.",
+            status_code=400,
+            media_type="text/plain",
+        )
+    # Hand the decoded PNG bytes to the img2mml conversion.
+    res = get_mathml_from_bytes(img_bytes)
+    return Response(content=res, media_type="application/xml")
+
+@app.get("/latex/mml", summary="Get MathML representation of a LaTeX equation")
+async def get_tex_to_mathml(tex_src: EquationQueryParameter) -> Response:
+    """
+    GET endpoint for generating MathML from an input LaTeX equation.
+
+    ### Python example
+    ```
+    import requests
+
+    r = requests.get("http://0.0.0.0:8000/latex/mml", params={"tex_src":"E = mc^{2}"})
+    print(r.text)
+    ```
+    """
+    # Conversion and response construction are shared with the POST variant.
+    return process_latex_equation(tex_src)
+
+@app.post("/latex/mml", summary="Get MathML representation of a LaTeX equation")
+async def post_tex_to_mathml(eqn: LatexEquation) -> Response:
+    """
+    Endpoint for generating MathML from an input LaTeX equation.
+
+    ### Python example
+    ```
+    import requests
+
+    r = requests.post("http://0.0.0.0:8000/latex/mml", json={"tex_src":"E = mc^{2}"})
+    print(r.text)
+    ```
+    """
+    # convert latex string to presentation mathml
+    return process_latex_equation(eqn.tex_src)