diff --git a/examples/openai-htr/Dockerfile b/examples/openai-htr/Dockerfile
index ffb1eab..83ff518 100644
--- a/examples/openai-htr/Dockerfile
+++ b/examples/openai-htr/Dockerfile
@@ -12,9 +12,8 @@ RUN --mount=type=bind,from=imagemagick,source=/packages,target=/packages \
 
 RUN magick -list format | grep "JPEG-2000"
 
-ENV OPENAI_MODEL=gpt-4o-mini \
-    PROMPT="Transcribe this image that contains handwritten text. Include all text you see in the image. In your response, say absolutely nothing except the text from the image." \
-    MAX_TOKENS=300
+ENV OPENAI_MODEL=gpt-4o \
+    CHAT_PROMPT="changeme" \
+    MAX_TOKENS=16384
 
-COPY scyllaridae.yml /app/scyllaridae.yml
-COPY cmd.sh /app/cmd.sh
+COPY . /app/
diff --git a/examples/openai-htr/cmd.sh b/examples/openai-htr/cmd.sh
index 676fc13..dd60fa1 100755
--- a/examples/openai-htr/cmd.sh
+++ b/examples/openai-htr/cmd.sh
@@ -3,10 +3,26 @@ set -eou pipefail
 
 TMP_DIR=$(mktemp -d)
 
+HOCR_URL="$1"
+DOMAIN=$(echo "$HOCR_URL"| awk -F/ '{print $1"//"$3}')
+# our hOCR filenames are the node ID
+NID=$(echo "$HOCR_URL" | xargs basename | awk -F '.hocr' '{print $1}')
+# take the base prompt and move it into place
+cp /app/prompt.txt "$TMP_DIR/prompt.txt"
+
+# the hOCR document is being streamed into this script
+# append the hOCR document into the prompt
+# since we're asking the LLM to improve the hOCR doc
+cat >> "$TMP_DIR/prompt.txt"
+CHAT_PROMPT=$(jq --null-input --rawfile rawstring "$TMP_DIR/prompt.txt" '$rawstring')
+
+# find the service file
+SERVICE_FILE_PATH=$(curl -s "$DOMAIN/node/$NID/service-file" | jq -r '.[0].file')
 # convert service file to jpg
-magick - "$TMP_DIR/img.jpg"
+curl -s "${DOMAIN}${SERVICE_FILE_PATH}" | magick - "$TMP_DIR/img.jpg"
 
+# chatgpt needs it base64 encoded
 BASE64_IMAGE=$(base64 -w 0 "$TMP_DIR/img.jpg")
 
 cat <<EOF > "$TMP_DIR/payload.json"
@@ -18,7 +34,7 @@ cat <<EOF > "$TMP_DIR/payload.json"
       "content": [
         {
           "type": "text",
-          "text": "$PROMPT"
+          "text": $CHAT_PROMPT
        },
        {
          "type": "image_url",
diff --git a/examples/openai-htr/prompt.txt b/examples/openai-htr/prompt.txt
new file mode 100644
index 0000000..218d028
--- /dev/null
+++ b/examples/openai-htr/prompt.txt
@@ -0,0 +1,8 @@
+Transcribe the text in the provided image.
+Place the transcribed text into the appropriate ocrx_word span in the hOCR template provided. Ensure the following:
+Never use tesseract to aid you.
+Include all text in the transcription.
+Never return an incomplete document. Always finish the complete job, with every word wrapped in a span, and the words accurately transcribed from the image.
+Never edit the hOCR dimensions provided. Only replace the text within <span>'s.
+Do not wrap your response in backticks. Your response should only be the hOCR document provided below with your transcribed text placed in the appropriate spots in the hOCR document.
+Do not add any additional spans or attributes. You only should be adding plain text within the hocr HTML document.
diff --git a/examples/openai-htr/scyllaridae.yml b/examples/openai-htr/scyllaridae.yml
index 36d1b3a..a527b7e 100644
--- a/examples/openai-htr/scyllaridae.yml
+++ b/examples/openai-htr/scyllaridae.yml
@@ -3,3 +3,5 @@ allowedMimeTypes:
 cmdByMimeType:
   default:
     cmd: /app/cmd.sh
+    args:
+      - "%source-uri"