From 40151e616b7dad9bda77a32558b05e3d4886bab4 Mon Sep 17 00:00:00 2001 From: Kevin Birk Date: Tue, 3 Dec 2024 12:10:55 -0500 Subject: [PATCH] Decode Project Overview properly before sending for embeddings (#5674) --- .../models/dataservice/model/Model.java | 33 +++++++++++++++++-- .../models/dataservice/project/Project.java | 26 +++++++++++++-- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/model/Model.java b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/model/Model.java index 4faf296d36..68d966044e 100644 --- a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/model/Model.java +++ b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/model/Model.java @@ -10,9 +10,12 @@ import jakarta.persistence.Entity; import java.io.Serial; import java.util.ArrayList; +import java.util.Base64; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.Accessors; @@ -358,18 +361,40 @@ public boolean isPetrinet() { return this.getHeader().getSchemaName().equalsIgnoreCase("petrinet"); } + private String getDescriptionAsReadableString() { + if (getDescription() == null) { + return null; + } + + // decode from base64 + final byte[] decodedBytes = Base64.getDecoder().decode(getDescription()); + final String decodedString = new String(decodedBytes); + + // remove image tags + final String regex = "<img[^>]*>(.*?)<\\/img>|<img[^>]*\\/>"; + final Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE); + final Matcher matcher = pattern.matcher(decodedString); + final String result = matcher.replaceAll(""); + + return result; + } + @JsonIgnore @TSIgnore public String getEmbeddingSourceText() { + String source = ""; try { + 
if (getDescription() != null) { + source += getDescriptionAsReadableString(); + } final ObjectMapper objectMapper = new ObjectMapper(); if (getMetadata() != null && getMetadata().getGollmCard() != null) { - return objectMapper.writeValueAsString(getMetadata().getGollmCard()); + source += objectMapper.writeValueAsString(getMetadata().getGollmCard()); } - return objectMapper.writeValueAsString(this); } catch (final Exception e) { throw new RuntimeException("Failed to serialize model embedding text into JSON", e); } + return source; } @JsonIgnore @@ -388,6 +413,10 @@ public Map<TerariumAssetEmbeddingType, String> getEmbeddingsSourceByType() { log.warn("Failed to serialize card embedding text into JSON", e); } + if (getDescription() != null) { + sources.put(TerariumAssetEmbeddingType.DESCRIPTION, getDescriptionAsReadableString()); + } + return sources; } } diff --git a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/project/Project.java b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/project/Project.java index ece06316c3..46a69fea34 100644 --- a/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/project/Project.java +++ b/packages/server/src/main/java/software/uncharted/terarium/hmiserver/models/dataservice/project/Project.java @@ -13,9 +13,12 @@ import java.io.Serial; import java.sql.Types; import java.util.ArrayList; +import java.util.Base64; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.Accessors; @@ -132,8 +135,7 @@ public Project clone() { public String getEmbeddingSourceText() { try { if (overviewContent != null) { - log.info(new String(overviewContent)); - return new String(overviewContent); + return getOverviewAsReadableString(); } final ObjectMapper objectMapper = new ObjectMapper(); return 
objectMapper.writeValueAsString(this); @@ -142,13 +144,31 @@ public String getEmbeddingSourceText() { } } + private String getOverviewAsReadableString() { + if (overviewContent == null) { + return null; + } + + // decode from base64 + final byte[] decodedBytes = Base64.getDecoder().decode(overviewContent); + final String decodedString = new String(decodedBytes); + + // remove image tags + final String regex = "<img[^>]*>(.*?)<\\/img>|<img[^>]*\\/>"; + final Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE); + final Matcher matcher = pattern.matcher(decodedString); + final String result = matcher.replaceAll(""); + + return result; + } + @JsonIgnore @TSIgnore public Map<TerariumAssetEmbeddingType, String> getEmbeddingsSourceByType() { final Map<TerariumAssetEmbeddingType, String> sources = super.getEmbeddingsSourceByType(); if (overviewContent != null) { - sources.put(TerariumAssetEmbeddingType.OVERVIEW, new String(overviewContent)); + sources.put(TerariumAssetEmbeddingType.OVERVIEW, getOverviewAsReadableString()); } if (metadata != null) { sources.put(TerariumAssetEmbeddingType.METADATA, metadata.toString());