From 98f59c3206818c7caf28e333dc9732dcab6fce43 Mon Sep 17 00:00:00 2001
From: Rick Staa <rick.staa@outlook.com>
Date: Wed, 4 Sep 2024 20:31:44 +0200
Subject: [PATCH] docs(ai): add segment anything 2 documentation (#638)

* docs(ai): add segment anything 2 documentation

This commit adds documentation for the [Segment Anything 2](https://ai.meta.com/blog/segment-anything-2/)
pipeline now that it has been released to the AI subnet software.

Co-authored-by: Rick Staa <rick.staa@outlook.com>

* chore(ai): update AI OpenAPI spec

This commit updates the AI OpenAPI spec since the speakeasy integration
has not yet been enabled.

* docs(ai): fix HTML closing bracket

This commit ensures that no error is thrown anymore; previously there
was an unclosed div in the Segment Anything pipeline docs.

* docs(ai): add SAM2 docker warning

This commit ensures that people who want to serve the SAM2 model are
aware that we don't yet host it on Docker Hub.

* docs(ai): fix incorrect SAM2 pricing

This commit fixes a syntax error in the SAM2 pricing.

Co-authored-by: Peter Schroedl <peter@livepeer.org>

* fixup! docs(ai): fix incorrect SAM2 pricing

---------

Co-authored-by: ea_superstar <edidiongarchibong1@gmail.com>
Co-authored-by: Peter Schroedl <peter@livepeer.org>
---
 ai/api-reference/gateway.openapi.yaml   | 124 +++++++++++++++++++++++-
 ai/api-reference/segment-anything-2.mdx |  21 ++++
 ai/orchestrators/models-config.mdx      |  16 +++
 ai/pipelines/overview.mdx               |   3 +
 ai/pipelines/segment-anything-2.mdx     | 104 ++++++++++++++++++++
 mint.json                               |   6 +-
 6 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 ai/api-reference/segment-anything-2.mdx
 create mode 100644 ai/pipelines/segment-anything-2.mdx

diff --git a/ai/api-reference/gateway.openapi.yaml b/ai/api-reference/gateway.openapi.yaml
index 811dcbcc..38208652 100644
--- a/ai/api-reference/gateway.openapi.yaml
+++ b/ai/api-reference/gateway.openapi.yaml
@@ -2,10 +2,12 @@ openapi: 3.1.0
 info:
   title: Livepeer AI Runner
   description: An application to run AI pipelines
-  version: v0.1.2
+  version: v0.2.0
 servers:
 - url: https://dream-gateway.livepeer.cloud
   description: Livepeer Cloud Community Gateway
+- url: https://livepeer.studio/api/beta/generate
+  description: Livepeer Studio Gateway
 paths:
   /text-to-image:
     post:
@@ -233,6 +235,50 @@ paths:
                 $ref: '#/components/schemas/HTTPValidationError'
       security:
       - HTTPBearer: []
+  /segment-anything-2:
+    post:
+      summary: Segment Anything 2
+      description: Segment objects in an image.
+      operationId: segment_anything_2
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: '#/components/schemas/Body_segment_anything_2_segment_anything_2_post'
+        required: true
+      responses:
+        '200':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/MasksResponse'
+        '400':
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '500':
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPError'
+        '422':
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+      security:
+      - HTTPBearer: []
 components:
   schemas:
     APIError:
@@ -390,6 +436,61 @@ components:
       - image
       - model_id
       title: Body_image_to_video_image_to_video_post
+    Body_segment_anything_2_segment_anything_2_post:
+      properties:
+        image:
+          type: string
+          format: binary
+          title: Image
+          description: Image to segment.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for image segmentation.
+          default: ''
+        point_coords:
+          type: string
+          title: Point Coords
+          description: Nx2 array of point prompts to the model, where each point is
+            in (X,Y) in pixels.
+        point_labels:
+          type: string
+          title: Point Labels
+          description: Labels for the point prompts, where 1 indicates a foreground
+            point and 0 indicates a background point.
+        box:
+          type: string
+          title: Box
+          description: A length 4 array given as a box prompt to the model, in XYXY
+            format.
+        mask_input:
+          type: string
+          title: Mask Input
+          description: A low-resolution mask input to the model, typically from a
+            previous prediction iteration, with the form 1xHxW (H=W=256 for SAM).
+        multimask_output:
+          type: boolean
+          title: Multimask Output
+          description: If true, the model will return three masks for ambiguous input
+            prompts, often producing better masks than a single prediction.
+          default: true
+        return_logits:
+          type: boolean
+          title: Return Logits
+          description: If true, returns un-thresholded mask logits instead of a binary
+            mask.
+          default: true
+        normalize_coords:
+          type: boolean
+          title: Normalize Coords
+          description: If true, the point coordinates will be normalized to the range
+            [0,1], with point_coords expected to be with respect to image dimensions.
+          default: true
+      type: object
+      required:
+      - image
+      - model_id
+      title: Body_segment_anything_2_segment_anything_2_post
     Body_upscale_upscale_post:
       properties:
         prompt:
@@ -461,6 +562,27 @@ components:
       - images
       title: ImageResponse
       description: Response model for image generation.
+    MasksResponse:
+      properties:
+        masks:
+          type: string
+          title: Masks
+          description: The generated masks.
+        scores:
+          type: string
+          title: Scores
+          description: The model's confidence scores for each generated mask.
+        logits:
+          type: string
+          title: Logits
+          description: The raw, unnormalized predictions (logits) for the masks.
+      type: object
+      required:
+      - masks
+      - scores
+      - logits
+      title: MasksResponse
+      description: Response model for object segmentation.
     Media:
       properties:
         url:
diff --git a/ai/api-reference/segment-anything-2.mdx b/ai/api-reference/segment-anything-2.mdx
new file mode 100644
index 00000000..983936b3
--- /dev/null
+++ b/ai/api-reference/segment-anything-2.mdx
@@ -0,0 +1,21 @@
+---
+openapi: post /segment-anything-2
+---
+
+<Info>
+  The public [Livepeer.cloud](https://www.livepeer.cloud/) Gateway used in this
+  guide is intended for experimentation and is not guaranteed for production
+  use. It is a free, non-token-gated, but rate-limited service designed for
+  testing purposes. For production-ready applications, consider setting up your
+  own Gateway node or partnering with one via the `ai-video` channel on
+  [Discord](https://discord.gg/livepeer).
+</Info>
+
+<Note>
+  Please note that the **optimal** parameters for a given model may vary
+  depending on the specific model and use case. The parameters provided in this
+  guide are not model-specific and should be used as a starting point.
+  Additionally, some models may have parameters such as `guidance_scale` and
+  `num_inference_steps` disabled by default. For more information on
+  model-specific parameters, please refer to the respective model documentation.
+</Note>
diff --git a/ai/orchestrators/models-config.mdx b/ai/orchestrators/models-config.mdx
index 14bd9810..8db564ef 100644
--- a/ai/orchestrators/models-config.mdx
+++ b/ai/orchestrators/models-config.mdx
@@ -11,6 +11,14 @@ for their download. For details on supported pipelines and models, consult
 
 ## Configuration File Format
 
+<Warning>
+  The `livepeer/ai-runner:segment-anything-2` container required to serve the [segment-anything-2](/ai/pipelines/segment-anything-2) pipeline has not yet been released to our Docker registry. To serve this model as an orchestrator, please use the following command to download the source code and build the Docker container locally:
+
+  ```bash
+  docker build https://github.com/livepeer/ai-worker.git#main:runner/docker -t livepeer/ai-runner:segment-anything-2 -f Dockerfile.segment_anything_2
+  ```
+</Warning>
+
 Orchestrators specify supported AI models in an `aiModels.json` file, typically
 located in the `~/.lpData` directory. Below is an example configuration showing
 currently **recommended** models and their respective prices.
@@ -37,6 +45,14 @@ currently **recommended** models and their respective prices.
     "model_id": "openai/whisper-large-v3",
     "price_per_unit": 12882811
   },
+  {
+    "pipeline": "segment-anything-2",
+    "model_id": "facebook/sam2-hiera-large",
+    "price_per_unit": 3565,
+    "pixels_per_unit": 1e13,
+    "currency": "USD",
+    "warm": true
+  },
   {
     "pipeline": "image-to-video",
     "model_id": "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
diff --git a/ai/pipelines/overview.mdx b/ai/pipelines/overview.mdx
index 030cb888..3a78c1ff 100644
--- a/ai/pipelines/overview.mdx
+++ b/ai/pipelines/overview.mdx
@@ -67,4 +67,7 @@ The subnet currently supports the following generative AI pipelines:
   <Card title="Audio-to-Text" icon="message-dots" href="/ai/pipelines/audio-to-text">
     The audio-to-text pipeline uses automatic speech recognition (ASR) to translate audio to text with timestamps
   </Card>
+  <Card title="Segment-Anything-2" icon="video" href="/ai/pipelines/segment-anything-2">
+    The segment-anything-2 pipeline offers promptable visual segmentation for images, with video support planned.
+  </Card>
 </CardGroup>
diff --git a/ai/pipelines/segment-anything-2.mdx b/ai/pipelines/segment-anything-2.mdx
new file mode 100644
index 00000000..0cee5870
--- /dev/null
+++ b/ai/pipelines/segment-anything-2.mdx
@@ -0,0 +1,104 @@
+---
+title: Segment-anything-2
+---
+
+## Overview
+
+The `segment-anything-2` pipeline provides direct access to the
+[Segment Anything 2 model](https://ai.meta.com/sam2/) developed by
+[Meta AI Research](https://research.facebook.com/). In its current version, it
+supports only image segmentation, enabling it to segment any object in an image.
+Future versions will also support direct video input, allowing the object to be
+consistently tracked across all frames of a video in real-time. This advancement
+will unlock new possibilities for video editing and enhance experiences in mixed
+reality. The pipeline is powered by the
+[facebook/sam2-hiera-large](https://huggingface.co/facebook/sam2-hiera-large)
+segmentation model hosted on Hugging Face.
+
+## Models
+
+### Warm Models
+
+The current warm model requested for the `segment-anything-2` pipeline is:
+
+- [facebook/sam2-hiera-large](https://huggingface.co/facebook/sam2-hiera-large):
+  The largest model in the Segment Anything 2 model suite, designed for the most
+  accurate image segmentation.
+
+<Tip>
+  For faster responses with different
+  [segment-anything-2](https://github.com/facebookresearch/segment-anything-2)
+  models, ask Orchestrators to load them on their GPUs via the `ai-video`
+  channel on the [Livepeer Discord server](https://discord.gg/livepeer).
+</Tip>
+
+### On-Demand Models
+
+The following models have been tested and verified for the `segment-anything-2`
+pipeline:
+
+<Note>
+  If a specific model you wish to use is not listed, please submit a [feature
+  request](https://github.com/livepeer/ai-worker/issues/new?assignees=&labels=enhancement%2Cmodel&projects=&template=model_request.yml)
+  on GitHub to get the model verified and added to the list.
+</Note>
+
+{/* prettier-ignore */}
+<Accordion title="Tested and Verified Segmentation Models">
+- [facebook/sam2-hiera-base-plus](https://huggingface.co/facebook/sam2-hiera-base-plus): The second largest model in the Segment Anything 2 model suite, providing a balance between speed and accuracy.
+- [facebook/sam2-hiera-small](https://huggingface.co/facebook/sam2-hiera-small): A smaller model in the Segment Anything 2 model suite, designed for faster image segmentation.
+- [facebook/sam2-hiera-tiny](https://huggingface.co/facebook/sam2-hiera-tiny): The smallest model in the Segment Anything 2 model suite, optimized for real-time image segmentation.
+</Accordion>
+
+## Basic Usage Instructions
+
+<Tip>
+  For a detailed understanding of the `segment-anything-2` endpoint and to
+  experiment with the API, see the [AI Subnet API
+  Reference](/ai/api-reference/segment-anything-2).
+</Tip>
+
+To segment an image with the `segment-anything-2` pipeline, send a `POST`
+request to the Gateway's `segment-anything-2` API endpoint:
+
+```bash
+curl -X POST http://<gateway-ip>/segment-anything-2 \
+    -F model_id="facebook/sam2-hiera-large" \
+    -F point_coords="[[120,100],[120,50]]" \
+    -F point_labels="[1,0]" \
+    -F image=@<PATH_TO_IMAGE>/cool-cat.png
+```
+
+In this command:
+
+- `<gateway-ip>` should be replaced with your AI Gateway's IP address.
+- `model_id` is the segmentation model to use.
+- The `point_coords` field holds the (X,Y) pixel coordinates of the point prompts.
+- The `point_labels` field holds one label per point prompt (`1` for foreground, `0` for background).
+- The `image` field holds the **absolute** path to the image file to be
+  segmented.
+
+For additional optional parameters, refer to the
+[AI Subnet API Reference](/ai/api-reference/segment-anything-2).
+
+After execution, the Orchestrator processes the request and returns the response
+to the Gateway:
+
+```json
+{
+  "masks": "[[[2.84, 2.83, ...], [2.92, 2.91, ...], [3.22, 3.56, ...], ...]]",
+  "scores": "[0.50, 0.37, ...]",
+  "logits": "[[[2.84, 2.66, ...], [3.59, 5.20, ...], [5.07, 5.68, ...], ...]]"
+}
+```
+
+## API Reference
+
+<Card
+  href="/ai/api-reference/segment-anything-2"
+  title="API Reference"
+  icon="rectangle-terminal"
+>
+  Explore the `segment-anything-2` endpoint and experiment with the API in the
+  Livepeer AI API Reference.
+</Card>
diff --git a/mint.json b/mint.json
index af391fb5..7c45d5b4 100644
--- a/mint.json
+++ b/mint.json
@@ -527,7 +527,8 @@
             "ai/pipelines/text-to-image",
             "ai/pipelines/image-to-image",
             "ai/pipelines/image-to-video",
-            "ai/pipelines/upscale"
+            "ai/pipelines/upscale",
+            "ai/pipelines/segment-anything-2"
           ]
         },
         {
@@ -572,7 +573,8 @@
             "ai/api-reference/image-to-image",
             "ai/api-reference/image-to-video",
             "ai/api-reference/upscale",
-            "ai/api-reference/audio-to-text"
+            "ai/api-reference/audio-to-text",
+            "ai/api-reference/segment-anything-2"
           ]
         }
       ]