Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Vector Embeddings #121

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions includes/classes/Feature/VectorEmbeddings/Indexable.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
<?php
/**
* Vector Embeddings - Indexable
*
* As each indexable type (posts, terms, comments, users) uses different hooks, this abstract class is used to
* keep implementations independent.
*
* @since 2.4.0
* @package ElasticPressLabs
*/

namespace ElasticPressLabs\Feature\VectorEmbeddings;

/**
* Vector Embeddings Indexable abstract class
*/
abstract class Indexable {
	/**
	 * VectorEmbeddings instance
	 *
	 * @var VectorEmbeddings
	 */
	protected $feature;

	/**
	 * Class constructor
	 *
	 * @param VectorEmbeddings $feature The VectorEmbeddings feature instance
	 */
	public function __construct( VectorEmbeddings $feature ) {
		$this->feature = $feature;
	}

	/**
	 * Given an object and its content pieces, return the embeddings and clean up unused embeddings stored.
	 *
	 * Each content piece is split into chunks and every chunk is hashed. A chunk whose hash is
	 * already present in the stored hashes reuses the stored embedding; any other chunk is sent
	 * to the feature's `get_embedding()`. Stored hashes that no current chunk produced are
	 * deleted from the storage.
	 *
	 * @param int    $object_id      The object ID
	 * @param string $object_type    The object type
	 * @param array  $content_pieces Content pieces to get embeddings for
	 * @return array List of embeddings, in the order the chunks were processed
	 */
	public function get_updated_embeddings( int $object_id, string $object_type, array $content_pieces ): array {
		// The lookup must use the same object type as the generate/delete calls below,
		// otherwise non-post objects would be compared against a post's stored hashes.
		$all_hashes = $this->feature->storage->get_all_object_hashes( $object_id, $object_type );

		$hashes_in_use = [];
		$embeddings    = [];

		foreach ( $content_pieces as $content_piece ) {
			$content_chunks = $this->feature->chunk_content( $content_piece );

			if ( empty( $content_chunks ) ) {
				continue;
			}

			// Get the embeddings for each chunk.
			foreach ( $content_chunks as $chunk ) {
				$hash = $this->feature->storage->hash_content( $chunk );

				// Track the hash even if generating the embedding fails below.
				$hashes_in_use[] = $hash;

				// Reuse the stored embedding when this exact chunk was already processed.
				if ( isset( $all_hashes[ $hash ] ) ) {
					$embeddings[] = $all_hashes[ $hash ];
					continue;
				}

				$embedding = $this->feature->get_embedding( $object_id, $object_type, $chunk );
				if ( $embedding ) {
					$embeddings[] = $embedding;
				}
			}
		}

		$hashes_in_use = array_unique( $hashes_in_use );

		// Anything stored that was not produced by the current content is stale.
		$unused_hashes = array_diff( array_keys( $all_hashes ), $hashes_in_use );
		foreach ( $unused_hashes as $unused_hash ) {
			$this->feature->storage->delete( $object_id, $object_type, $unused_hash );
		}

		return $embeddings;
	}

	/**
	 * Add the embedding data to the sync args.
	 *
	 * Each embedding becomes one entry of the `chunks` field, with its values cast to float.
	 * When there are no embeddings the args are returned untouched.
	 *
	 * @param array $args       The current sync args (an Elasticsearch document)
	 * @param array $embeddings The embeddings to add to the sync args
	 * @return array
	 */
	public function add_chuncks_field_value( array $args, array $embeddings ): array {
		// Without embeddings there is nothing to add.
		if ( empty( $embeddings ) ) {
			return $args;
		}

		// Add the embeddings data to the sync args, replacing any previous `chunks` value.
		$args['chunks'] = [];

		foreach ( $embeddings as $embedding ) {
			$args['chunks'][] = [
				'vector' => array_map( 'floatval', $embedding ),
			];
		}

		return $args;
	}
}
101 changes: 101 additions & 0 deletions includes/classes/Feature/VectorEmbeddings/Indexables/Post.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?php
/**
* Vector Embeddings - Post Indexable
*
* @since 2.4.0
* @package ElasticPressLabs
*/

namespace ElasticPressLabs\Feature\VectorEmbeddings\Indexables;

use ElasticPressLabs\Feature\VectorEmbeddings\Indexable;

/**
* Vector Embeddings - Post Indexable class
*/
class Post extends Indexable {
	/**
	 * Setup hooks
	 */
	public function setup() {
		// Alter the post mapping to store our vector embeddings
		add_filter( 'ep_post_mapping', [ $this, 'add_post_vector_field_mapping' ] );

		// Exclude designated meta field holding the vector embeddings from search
		add_filter( 'ep_prepare_meta_excluded_public_keys', [ $this, 'exclude_vector_meta' ] );

		// Only trigger embeddings when external embeddings are turned off
		if ( ! $this->feature->get_setting( 'ep_external_embedding' ) ) {
			add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 );
		}
	}

	/**
	 * Add our vector field mapping to the Elasticsearch post index.
	 *
	 * @param array $mapping Current mapping.
	 * @return array
	 */
	public function add_post_vector_field_mapping( array $mapping ): array {
		return $this->feature->add_vector_mapping_field( $mapping );
	}

	/**
	 * Exclude our vector meta from being synced.
	 *
	 * @param array $excluded_keys Current excluded keys.
	 * @return array
	 */
	public function exclude_vector_meta( array $excluded_keys ): array {
		$excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' );
		return $excluded_keys;
	}

	/**
	 * Add the embedding data to the post vector sync args.
	 *
	 * @param array $args    Current sync args.
	 * @param int   $post_id Post ID being synced.
	 * @return array
	 */
	public function add_vector_field_to_post_sync( array $args, int $post_id ): array {
		if ( ! $this->should_add_vector_field_to_post( $post_id ) ) {
			return $args;
		}

		$content_pieces = $this->get_object_content_pieces( $post_id );
		$embeddings     = $this->get_updated_embeddings( $post_id, 'post', $content_pieces );

		return $this->add_chuncks_field_value( $args, $embeddings );
	}

	/**
	 * Whether or not we should add the vector field to the post.
	 *
	 * Currently only checks that the post exists.
	 *
	 * @param int $post_id The Post ID
	 * @return boolean
	 */
	public function should_add_vector_field_to_post( int $post_id ): bool {
		$post = get_post( $post_id );
		return ! empty( $post );
	}

	/**
	 * Return all content pieces for a given post ID.
	 *
	 * By default includes the post content, the title, and the slug, but could also add
	 * meta fields and taxonomy terms, for example.
	 *
	 * @param int $post_id The Post ID
	 * @return array The content pieces, or an empty array if the post does not exist
	 */
	public function get_object_content_pieces( int $post_id ): array {
		$post = get_post( $post_id );

		// This method is public, so it can be reached without the existence check done
		// in add_vector_field_to_post_sync(). Do not dereference a missing post.
		if ( empty( $post ) ) {
			return [];
		}

		return [
			$post->post_content,
			$post->post_title,
			$post->post_name,
		];
	}
}
109 changes: 109 additions & 0 deletions includes/classes/Feature/VectorEmbeddings/Indexables/Term.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<?php
/**
* Vector Embeddings - Term Indexable
*
* @since 2.4.0
* @package ElasticPressLabs
*/

namespace ElasticPressLabs\Feature\VectorEmbeddings\Indexables;

use ElasticPressLabs\Feature\VectorEmbeddings\Indexable;

/**
* Vector Embeddings - Term Indexable class
*/
class Term extends Indexable {
	/**
	 * Setup hooks
	 */
	public function setup() {
		// Alter the term mapping to store our vector embeddings
		add_filter( 'ep_term_mapping', [ $this, 'add_term_vector_field_mapping' ] );

		// Exclude designated meta field holding the vector embeddings from search
		add_filter( 'ep_prepare_term_meta_excluded_public_keys', [ $this, 'exclude_vector_meta' ] );

		// Only trigger embeddings when external embeddings are turned off
		if ( ! $this->feature->get_setting( 'ep_external_embedding' ) ) {
			add_filter( 'ep_term_sync_args', [ $this, 'add_vector_field_to_term_sync' ], 10, 2 );
		}
	}

	/**
	 * Add our vector field mapping to the Elasticsearch term index.
	 *
	 * @param array $mapping Current mapping.
	 * @return array
	 */
	public function add_term_vector_field_mapping( array $mapping ): array {
		return $this->feature->add_vector_mapping_field( $mapping );
	}

	/**
	 * Exclude our vector meta from being synced.
	 *
	 * @param array $excluded_keys Current excluded keys.
	 * @return array
	 */
	public function exclude_vector_meta( array $excluded_keys ): array {
		$excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' );
		return $excluded_keys;
	}

	/**
	 * Add the embedding data to the term vector sync args.
	 *
	 * Embeddings stored in the term meta are used when available. Otherwise they are
	 * generated from the term name, slug, and description, and stored for future syncs.
	 *
	 * @param array $args    Current sync args.
	 * @param int   $term_id Term ID being synced.
	 * @return array
	 */
	public function add_vector_field_to_term_sync( array $args, int $term_id ): array {
		$meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' );

		// Try to use the stored embeddings first.
		$embeddings = get_term_meta( $term_id, $meta_field, true );

		// If they don't exist (or are not in a usable format), generate them.
		if ( empty( $embeddings ) || ! is_array( $embeddings ) ) {
			$embeddings = $this->generate_term_embeddings( $term_id );

			// Store the embeddings for future use.
			if ( ! empty( $embeddings ) ) {
				update_term_meta( $term_id, $meta_field, $embeddings );
			}
		}

		// Shared with the other indexables; returns $args untouched when there are no embeddings.
		return $this->add_chuncks_field_value( $args, $embeddings );
	}

	/**
	 * Generate the embeddings for a term from its name, slug, and description.
	 *
	 * @param int $term_id Term ID.
	 * @return array List of embeddings (each a list of floats). Empty if the term could not
	 *               be loaded or no embedding could be generated.
	 */
	private function generate_term_embeddings( int $term_id ): array {
		$term = get_term( $term_id );

		// get_term() does not always return a term object; bail out instead of dereferencing it.
		if ( empty( $term ) || is_wp_error( $term ) ) {
			return [];
		}

		// Build up the content we want to generate embeddings for.
		$content = $term->name . ' ' . $term->slug . ' ' . $term->description;

		$content_chunks = $this->feature->chunk_content( $content );
		if ( empty( $content_chunks ) ) {
			return [];
		}

		$embeddings = [];

		// Get the embeddings for each chunk.
		foreach ( $content_chunks as $chunk ) {
			// Same ( object ID, object type, chunk ) call shape used by Indexable::get_updated_embeddings().
			// NOTE(review): get_embedding()'s signature is not visible from this class - confirm.
			$embedding = $this->feature->get_embedding( $term_id, 'term', $chunk );

			if ( $embedding && ! is_wp_error( $embedding ) ) {
				$embeddings[] = array_map( 'floatval', $embedding );
			}
		}

		return $embeddings;
	}
}
Loading
Loading