Skip to content

Commit

Permalink
Decode Project Overview properly before sending for embeddings (#5674)
Browse files Browse the repository at this point in the history
  • Loading branch information
kbirk authored Dec 3, 2024
1 parent 8868c15 commit 40151e6
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import jakarta.persistence.Entity;
import java.io.Serial;
import java.util.ArrayList;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;
Expand Down Expand Up @@ -358,18 +361,40 @@ public boolean isPetrinet() {
return this.getHeader().getSchemaName().equalsIgnoreCase("petrinet");
}

/**
 * Matches image elements, case-insensitively: a paired {@code <img ...>...</img>} (including
 * whatever sits between the tags on the same line), a self-closing {@code <img .../>}, and a bare
 * {@code <img ...>} with no closing slash. Compiled once; {@link Pattern} instances are immutable
 * and safe to share.
 */
private static final Pattern IMG_TAG_PATTERN =
    Pattern.compile("<img\\b[^>]*>(.*?)</img>|<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

/**
 * Returns the description decoded from Base64 with all image tags removed.
 *
 * @return the decoded description without {@code <img>} tags, or {@code null} when no description
 *     is set
 * @throws IllegalArgumentException if the description is not valid Base64 (raised by the decoder)
 */
private String getDescriptionAsReadableString() {
    if (getDescription() == null) {
        return null;
    }

    // Decode with an explicit charset: the no-charset String constructor uses the platform
    // default on older JVMs, which makes the result host-dependent for non-ASCII text.
    // NOTE(review): assumes the encoded payload is UTF-8 text — confirm with the producer.
    final byte[] decodedBytes = Base64.getDecoder().decode(getDescription());
    final String decodedString = new String(decodedBytes, java.nio.charset.StandardCharsets.UTF_8);

    // Strip image tags so only the remaining markup/text is returned.
    return IMG_TAG_PATTERN.matcher(decodedString).replaceAll("");
}

/**
 * Builds the embedding source text for this asset.
 *
 * <p>NOTE(review): this method is shown as a rendered diff without +/- markers, so lines that were
 * replaced and the lines that replaced them appear next to each other. Read literally, the two
 * {@code return objectMapper...} statements make the statements after them unreachable; they look
 * like the pre-change lines — confirm against the actual commit before relying on this text.
 *
 * @return the accumulated source text (see the note above about which return statements apply)
 * @throws RuntimeException wrapping any exception raised while building the text
 */
@JsonIgnore
@TSIgnore
public String getEmbeddingSourceText() {
String source = "";
try {
// Start with the decoded, image-free description when one is set.
if (getDescription() != null) {
source += getDescriptionAsReadableString();
}
final ObjectMapper objectMapper = new ObjectMapper();
if (getMetadata() != null && getMetadata().getGollmCard() != null) {
// NOTE(review): the next two lines look like the old statement (early return of the card JSON)
// followed by its replacement (append the card JSON to the description, with no separator).
return objectMapper.writeValueAsString(getMetadata().getGollmCard());
source += objectMapper.writeValueAsString(getMetadata().getGollmCard());
}
// NOTE(review): presumably the old fallback that serialized the whole object; the trailing
// `return source` below appears to be its replacement — verify.
return objectMapper.writeValueAsString(this);
} catch (final Exception e) {
// Any failure (Base64 decoding or JSON serialization) is rethrown unchecked, keeping the cause.
throw new RuntimeException("Failed to serialize model embedding text into JSON", e);
}
return source;
}

@JsonIgnore
Expand All @@ -388,6 +413,10 @@ public Map<TerariumAssetEmbeddingType, String> getEmbeddingsSourceByType() {
log.warn("Failed to serialize card embedding text into JSON", e);
}

if (getDescription() != null) {
sources.put(TerariumAssetEmbeddingType.DESCRIPTION, getDescriptionAsReadableString());
}

return sources;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
import java.io.Serial;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;
Expand Down Expand Up @@ -132,8 +135,7 @@ public Project clone() {
public String getEmbeddingSourceText() {
try {
if (overviewContent != null) {
log.info(new String(overviewContent));
return new String(overviewContent);
return getOverviewAsReadableString();
}
final ObjectMapper objectMapper = new ObjectMapper();
return objectMapper.writeValueAsString(this);
Expand All @@ -142,13 +144,31 @@ public String getEmbeddingSourceText() {
}
}

/**
 * Matches image elements, case-insensitively: a paired {@code <img ...>...</img>} (including
 * whatever sits between the tags on the same line), a self-closing {@code <img .../>}, and a bare
 * {@code <img ...>} with no closing slash. Compiled once; {@link Pattern} instances are immutable
 * and safe to share.
 */
private static final Pattern IMG_TAG_PATTERN =
    Pattern.compile("<img\\b[^>]*>(.*?)</img>|<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

/**
 * Returns the overview content decoded from Base64 with all image tags removed.
 *
 * @return the decoded overview without {@code <img>} tags, or {@code null} when no overview
 *     content is set
 * @throws IllegalArgumentException if the overview content is not valid Base64 (raised by the
 *     decoder)
 */
private String getOverviewAsReadableString() {
    if (overviewContent == null) {
        return null;
    }

    // Decode with an explicit charset: the no-charset String constructor uses the platform
    // default on older JVMs, which makes the result host-dependent for non-ASCII text.
    // NOTE(review): assumes the encoded payload is UTF-8 text — confirm with the producer.
    final byte[] decodedBytes = Base64.getDecoder().decode(overviewContent);
    final String decodedString = new String(decodedBytes, java.nio.charset.StandardCharsets.UTF_8);

    // Strip image tags so only the remaining markup/text is returned.
    return IMG_TAG_PATTERN.matcher(decodedString).replaceAll("");
}

@JsonIgnore
@TSIgnore
public Map<TerariumAssetEmbeddingType, String> getEmbeddingsSourceByType() {
final Map<TerariumAssetEmbeddingType, String> sources = super.getEmbeddingsSourceByType();

if (overviewContent != null) {
sources.put(TerariumAssetEmbeddingType.OVERVIEW, new String(overviewContent));
sources.put(TerariumAssetEmbeddingType.OVERVIEW, getOverviewAsReadableString());
}
if (metadata != null) {
sources.put(TerariumAssetEmbeddingType.METADATA, metadata.toString());
Expand Down

0 comments on commit 40151e6

Please sign in to comment.