From 64a7ef2be77a243ac2e2a95fb8980c1233c6031e Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Thu, 16 Jan 2025 11:35:42 -0300 Subject: [PATCH 1/3] Vector Embeddings - Initial Commit --- .../Feature/VectorEmbeddings/Indexable.php | 33 ++ .../VectorEmbeddings/Indexables/Post.php | 123 +++++ .../VectorEmbeddings/Indexables/Term.php | 109 ++++ .../VectorEmbeddings/VectorEmbeddings.php | 493 ++++++++++++++++++ includes/functions/core.php | 3 + 5 files changed, 761 insertions(+) create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexable.php create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexables/Post.php create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexables/Term.php create mode 100644 includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php new file mode 100644 index 0000000..e129d62 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -0,0 +1,33 @@ +feature = $feature; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php new file mode 100644 index 0000000..01fb95c --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -0,0 +1,123 @@ +feature->get_setting( 'ep_external_embedding' ) ) { + add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); + } + } + + /** + * Add our vector field mapping to the Elasticsearch post index. + * + * @param array $mapping Current mapping. + * @return array + */ + public function add_post_vector_field_mapping( array $mapping ): array { + return $this->feature->add_vector_mapping_field( $mapping ); + } + + /** + * Exclude our vector meta from being synced. + * + * @param array $excluded_keys Current excluded keys. 
+ * @return array + */ + public function exclude_vector_meta( array $excluded_keys ): array { + $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + return $excluded_keys; + } + + /** + * Add the embedding data to the post vector sync args. + * + * @param array $args Current sync args. + * @param int $post_id Post ID being synced. + * @return array + */ + public function add_vector_field_to_post_sync( array $args, int $post_id ): array { + // No need to add vector data if no content exists. + $post = get_post( $post_id ); + if ( empty( $post->post_content ) ) { + return $args; + } + $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + + // Try to use the stored embeddings first. + $embeddings = get_post_meta( $post_id, $meta_field, true ); + + // If they don't exist, make API requests to generate them. + if ( ! $embeddings ) { + $embeddings = []; + + $content_chunks = $this->feature->chunk_content( $post->post_content ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->feature->get_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } + + // Add embeddings for title. + $title_embedding = $this->feature->get_embedding( $this->feature->normalize_content( $post->post_title ) ); + if ( $title_embedding && ! is_wp_error( $title_embedding ) ) { + $embeddings[] = array_map( 'floatval', $title_embedding ); + } + + // Add embeddings for slug. + $slug_embedding = $this->feature->get_embedding( $post->post_name ); + if ( $slug_embedding && ! is_wp_error( $slug_embedding ) ) { + $embeddings[] = array_map( 'floatval', $slug_embedding ); + } + + // Store the embeddings for future use. + if ( ! 
empty( $embeddings ) ) { + update_post_meta( $post_id, $meta_field, $embeddings ); + } + } + + // If we still don't have embeddings, return early. + if ( ! $embeddings || empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args. + $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php new file mode 100644 index 0000000..766b1f2 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php @@ -0,0 +1,109 @@ +feature->get_setting( 'ep_external_embedding' ) ) { + add_filter( 'ep_term_sync_args', [ $this, 'add_vector_field_to_term_sync' ], 10, 2 ); + } + } + + /** + * Add our vector field mapping to the Elasticsearch term index. + * + * @param array $mapping Current mapping. + * @return array + */ + public function add_term_vector_field_mapping( array $mapping ): array { + return $this->feature->add_vector_mapping_field( $mapping ); + } + + /** + * Exclude our vector meta from being synced. + * + * @param array $excluded_keys Current excluded keys. + * @return array + */ + public function exclude_vector_meta( array $excluded_keys ): array { + $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + return $excluded_keys; + } + + /** + * Add the embedding data to the term vector sync args. + * + * @param array $args Current sync args. + * @param int $term_id Term ID being synced. + * @return array + */ + public function add_vector_field_to_term_sync( array $args, int $term_id ): array { + $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + // Try to use the stored embeddings first. 
+ $embeddings = get_term_meta( $term_id, $meta_field, true ); + + // If they don't exist, make API requests to generate them. + if ( ! $embeddings ) { + $term = get_term( $term_id ); + + // Build up the content we want to generate embeddings for. + $content = $term->name . ' ' . $term->slug . ' ' . $term->description; + + $embeddings = []; + $content_chunks = $this->feature->chunk_content( $content ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->feature->get_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } + + // Store the embeddings for future use. + if ( ! empty( $embeddings ) ) { + update_term_meta( $term_id, $meta_field, $embeddings ); + } + } + + // If we still don't have embeddings, return early. + if ( ! $embeddings || empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args. 
+ $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php new file mode 100644 index 0000000..e8b82c2 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -0,0 +1,493 @@ + '', + 'ep_openai_embeddings_api_url' => 'https://api.openai.com/v1/embeddings', + 'ep_openai_embedding_model' => 'text-embedding-3-small', + 'ep_vector_embeddings_meta_field' => 'vector_embeddings', + 'ep_external_embedding' => '0', + ]; + + /** + * Initialize feature setting it's config + */ + public function __construct() { + $this->slug = 'vector_embeddings'; + + $this->title = esc_html__( 'Vector Embeddings', 'elasticpress-labs' ); + + $this->requires_install_reindex = true; + + $this->summary = __( + 'This feature enables storage of vector embeddings, a numerical representation of the indexed content that can capture semantic relationships and similarities between data points. These embeddings are often used by AI models to process and understand complex information more efficiently and are used for features like natural language processing, recommendations and computer vision.', + 'elasticpress-labs' + ); + + $this->es_version = Elasticsearch::factory()->get_elasticsearch_version(); + + parent::__construct(); + } + + /** + * Connects the Module with WordPress using Hooks and/or Filters. + * + * @return void + */ + public function setup() { + $post_indexable = new Indexables\Post( $this ); + $post_indexable->setup(); + + $term_indexable = new Indexables\Term( $this ); + $term_indexable->setup(); + } + + /** + * Tell user whether requirements for feature are met or not. 
+ * + * @return FeatureRequirementsStatus Requirements object + */ + public function requirements_status() { + $status = new \ElasticPress\FeatureRequirementsStatus( 1 ); + + // Vector support was added in Elasticsearch 7.0. + if ( version_compare( $this->es_version, '7.0', '<=' ) ) { + $status->code = 2; + $status->message = esc_html__( 'You need to have Elasticsearch with version >7.0.', 'elasticpress-labs' ); + } + + return $status; + } + + /** + * Set the `settings_schema` attribute + */ + public function set_settings_schema() { + $this->settings_schema = [ + [ + 'key' => 'ep_openai_api_key', + 'label' => __( 'OpenAI API Key', 'elasticpress-labs' ), + 'help' => sprintf( + wp_kses( + /* translators: %1$s: OpenAI sign up URL */ + __( 'Don\'t have an OpenAI account yet? Sign up for one in order to get your API key.', 'elasticpress-labs' ), + [ + 'a' => [ + 'href' => [], + 'title' => [], + ], + ] + ), + esc_url( 'https://platform.openai.com/signup' ) + ), + 'type' => 'text', + ], + [ + 'help' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'key' => 'ep_openai_embeddings_api_url', + 'label' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'help' => __( 'OpenAI Embedding model', 'elasticpress-labs' ), + 'key' => 'ep_openai_embedding_model', + 'label' => __( 'The name of the embedding model to use', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'help' => __( 'Specify the postmeta field name that will hold vector embeddings and will be added as dense vector in Elasticsearch mapping.', 'elasticpress-labs' ), + 'key' => 'ep_vector_embeddings_meta_field', + 'label' => __( 'Meta field holding the vector_embeddings', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'key' => 'ep_external_embedding', + 'help' => __( 'Enable this if an external process is providing the vector_embeddings meta field provided above with content. 
This will disable ElasticPress\'s control over embedding generation', 'elasticpress-labs' ), + 'label' => __( 'External embedding processing', 'elasticpress-labs' ), + 'type' => 'checkbox', + ], + ]; + } + + /** + * Add a vector field to the Elasticsearch mapping. + * + * @param array $mapping Current mapping. + * @param null|int $dimensions Number of dimensions for the vector field. Falls back to get_dimensions() when empty. + * @param bool $quantization Whether to use quantization for the vector field. Default true. + * @return array + */ + public function add_vector_mapping_field( array $mapping, $dimensions = null, bool $quantization = true ): array { + // Don't add the field if it already exists. + if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + return $mapping; + } + + // This needs to match the dimensions your model uses and be between 1 and 4096. + if ( ! $dimensions ) { + $dimensions = $this->get_dimensions(); + } + $calc_dimensions = max( 1, min( 4096, $dimensions ) ); + + // Add the default vector field mapping. + $mapping['mappings']['properties']['chunks'] = [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => (int) $calc_dimensions, + ], + ], + ]; + + // Add extra vector fields for newer versions of Elasticsearch. + if ( version_compare( $this->es_version, '8.0', '>=' ) ) { + // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields + // were added in 8.0. The similarity field must be set if index is true. + $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( + $mapping['mappings']['properties']['chunks']['properties']['vector'], + [ + 'index' => true, + 'similarity' => 'cosine', + ] + ); + + // The element_type field was added in 8.6. This can be either float (default) or byte.
+ if ( version_compare( $this->es_version, '8.6', '>=' ) ) { + $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; + } + + // The int8_hnsw type was added in 8.12. + if ( $quantization && version_compare( $this->es_version, '8.12', '>=' ) ) { + // This is supposed to result in better performance but slightly less accurate results. + // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. + // Can test with this on and off and compare results to see what works best. + $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; + } + } + + return $mapping; + } + + /** + * Get an embedding from a given text. + * + * @param string $text Text to get the embedding for. + * @param bool $cache Whether to cache the result. Default false. + * @return array|WP_Error + */ + public function get_embedding( string $text, bool $cache = false ) { + // Check to see if we have a stored embedding. + if ( $cache ) { + $key = 'ep_embedding_' . sanitize_title( $text ); + $query_embedding = wp_cache_get( $key, 'ep_embeddings' ); + + if ( $query_embedding ) { + return $query_embedding; + } + } + + // Generate the embedding. + $embedding = $this->generate_embedding( $text ); + + if ( is_wp_error( $embedding ) ) { + return $embedding; + } + + // Store the embedding for future use if desired. + if ( $cache ) { + wp_cache_set( $key, $embedding, 'ep_embeddings', false ); + } + + return $embedding; + } + + /** + * Generate an embedding for a particular piece of text. + * + * @param string $text Text to generate the embedding for. + * @return array|boolean|WP_Error + */ + public function generate_embedding( string $text = '' ) { + /** + * Filter the URL for the post request. + * + * @hook ep_openai_embeddings_api_url + * @since 2.4.0 + * + * @param {string} $url The URL for the request. + * + * @return {string} The URL for the request. 
+ */ + $url = apply_filters( 'ep_openai_embeddings_api_url', $this->get_setting( 'ep_openai_embeddings_api_url' ) ); + + /** + * Filter the request body before sending to OpenAI. + * + * @hook ep_openai_embeddings_request_body + * @since 2.4.0 + * + * @param {array} $body Request body that will be sent to OpenAI. + * @param {string} $text Text we are getting embeddings for. + * + * @return {array} Request body. + */ + $body = apply_filters( + 'ep_openai_embeddings_request_body', + [ + 'model' => $this->get_setting( 'ep_openai_embedding_model' ), + 'input' => $text, + 'dimensions' => $this->get_dimensions(), + ], + $text + ); + + /** + * Filter the options for the post request. + * + * @hook ep_openai_embeddings_options + * @since 2.4.0 + * + * @param {array} $options The options for the request. + * @param {string} $url The URL for the request. + * + * @return {array} The options for the request. + */ + $options = apply_filters( + 'ep_openai_embeddings_options', + [ + 'body' => wp_json_encode( $body ), + 'timeout' => 60, // phpcs:ignore WordPressVIPMinimum.Performance.RemoteRequestTimeout.timeout_timeout + ], + $url + ); + + $this->add_headers( $options ); + + // Make our API request. + $response = $this->get_result( + wp_remote_post( + $url, + $options + ) + ); + + if ( is_wp_error( $response ) ) { + return $response; + } + + if ( empty( $response['data'] ) ) { + return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'elasticpress-labs' ) ); + } + + $return = []; + + // Parse out the embeddings response. + foreach ( $response['data'] as $data ) { + if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) { + continue; + } + + $return = $data['embedding']; + break; + } + + return $return; + } + + /** + * Get results from the response. + * + * @param object $response The API response. 
+ * @return array|WP_Error + */ + public function get_result( $response ) { + if ( is_wp_error( $response ) ) { + return $response; + } + + $headers = wp_remote_retrieve_headers( $response ); + $content_type = false; + + if ( ! empty( $headers ) ) { + $content_type = isset( $headers['content-type'] ) ? $headers['content-type'] : false; + } + + $body = wp_remote_retrieve_body( $response ); + $code = wp_remote_retrieve_response_code( $response ); + + if ( false === $content_type || false !== strpos( $content_type, 'application/json' ) ) { + $json = json_decode( $body, true ); + + if ( json_last_error() === JSON_ERROR_NONE ) { + if ( empty( $json['error'] ) ) { + return $json; + } else { + $message = $json['error']['message'] ?? esc_html__( 'An error occurred', 'elasticpress-labs' ); + return new WP_Error( $code, $message ); + } + } else { + return new WP_Error( 'Invalid JSON: ' . json_last_error_msg(), $body ); + } + } elseif ( $content_type && false !== strpos( $content_type, 'audio/mpeg' ) ) { + return $response; + } else { + return new WP_Error( 'Invalid content type', $response ); + } + } + + /** + * Normalizes content into plain text. + * + * @param string $content Content to normalize. + * @return string + */ + public function normalize_content( string $content = '' ): string { + $content = apply_filters( 'the_content', $content ); + + // Strip shortcodes but keep internal caption text. + $content = preg_replace( '#\[.+\](.+)\[/.+\]#', '$1', $content ); + + // Strip HTML entities. + $content = preg_replace( '/&#?[a-z0-9]{2,8};/i', '', $content ); + + // Replace HTML linebreaks with newlines (must run before the tags are stripped below). + $content = preg_replace( '#<br\s?/?>#', "\n\n", $content ); + + // Strip all HTML tags. + $content = wp_strip_all_tags( $content ); + + return $content; + } + + /** + * Chunk content into smaller pieces with an overlap. + * + * @param string $content Content to chunk. + * @param int $chunk_size Size of each chunk, in words.
+ * @param int $overlap_size Overlap size for each chunk, in words. + * @return array + */ + public function chunk_content( string $content = '', int $chunk_size = 150, $overlap_size = 25 ): array { + // Normalize our content. + $content = $this->normalize_content( $content ); + + // Remove multiple whitespaces. + $content = preg_replace( '/\s+/', ' ', $content ); + + // Split text by single whitespace, dropping empty tokens so blank content yields no chunks. + $words = array_values( array_filter( explode( ' ', (string) $content ), 'strlen' ) ); + + $chunks = []; + $text_count = count( $words ); + + // Iterate through & chunk data with an overlap. + for ( $i = 0; $i < $text_count; $i += $chunk_size ) { + // Join the window of words into a string. array_slice() takes a length, not an end offset. + $chunk = implode( + ' ', + array_slice( + $words, + max( $i - $overlap_size, 0 ), + $chunk_size + min( $i, $overlap_size ) + ) + ); + + array_push( $chunks, $chunk ); + } + + return $chunks; + } + + /** + * Get the number of dimensions for the embeddings. + * + * @return int + */ + public function get_dimensions(): int { + /** + * Filter the dimensions we want for each embedding. + * + * Useful if you want to increase or decrease the length + * of each embedding. + * + * @hook ep_openai_embeddings_dimensions + * @since 2.4.0 + * + * @param {int} $dimensions The default dimensions. + * @return {int} The dimensions. + */ + return apply_filters( 'ep_openai_embeddings_dimensions', $this->dimensions ); + } + + /** + * Add the headers. + * + * @param array $options The header options, passed by reference. + */ + public function add_headers( array &$options = [] ) { + if ( empty( $options['headers'] ) ) { + $options['headers'] = []; + } + + if ( ! isset( $options['headers']['Authorization'] ) ) { + $options['headers']['Authorization'] = $this->get_auth_header(); + } + + if ( ! isset( $options['headers']['Content-Type'] ) ) { + $options['headers']['Content-Type'] = 'application/json'; + } + } + + /** + * Get the auth header. + * + * @return string + */ + public function get_auth_header() { + return 'Bearer ' .
$this->get_setting( 'ep_openai_api_key' ); + } +} diff --git a/includes/functions/core.php b/includes/functions/core.php index f7cae24..05ffcd7 100644 --- a/includes/functions/core.php +++ b/includes/functions/core.php @@ -232,6 +232,9 @@ function maybe_load_features() { \ElasticPress\Features::factory()->register_feature( $subfeature ); } } + + $vector_embeddings = new \ElasticPressLabs\Feature\VectorEmbeddings\VectorEmbeddings(); + \ElasticPress\Features::factory()->register_feature( $vector_embeddings ); } /** From 265b928e44f1776a673f1a95af6e1c2cba669d87 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 20 Jan 2025 18:02:13 -0300 Subject: [PATCH 2/3] Custom table to store vectors --- .../Feature/VectorEmbeddings/Indexable.php | 71 ++++++ .../VectorEmbeddings/Indexables/Post.php | 84 +++---- .../VectorEmbeddings/Storage/DbTable.php | 207 ++++++++++++++++++ .../VectorEmbeddings/VectorEmbeddings.php | 33 ++- 4 files changed, 332 insertions(+), 63 deletions(-) create mode 100644 includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index e129d62..4187896 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -30,4 +30,75 @@ abstract class Indexable { public function __construct( VectorEmbeddings $feature ) { $this->feature = $feature; } + + /** + * Given an object and its content pieces, return the embeddings and clean up unused embeddings stored. 
+ * + * @param int $object_id The object ID + * @param string $object_type The object type + * @param array $content_pieces Content pieces to get embeddings for + * @return array + */ + public function get_updated_embeddings( int $object_id, string $object_type, array $content_pieces ): array { + $all_hashes = $this->feature->storage->get_all_object_hashes( $object_id, $object_type ); + + $hashes_in_use = []; + $embeddings = []; + foreach ( $content_pieces as $content_piece ) { + $content_chunks = $this->feature->chunk_content( $content_piece ); + + // Reuse the stored embedding when the chunk hash is already known; otherwise request a new one. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $hash = $this->feature->storage->hash_content( $chunk ); + + $hashes_in_use[] = $hash; + + if ( isset( $all_hashes[ $hash ] ) ) { + $embeddings[] = $all_hashes[ $hash ]; + continue; + } + + $embedding = $this->feature->get_embedding( $object_id, $object_type, $chunk ); + if ( $embedding ) { + $embeddings[] = $embedding; + } + } + } + } + + $hashes_in_use = array_unique( $hashes_in_use ); + + $unused_hashes = array_diff( array_keys( $all_hashes ), $hashes_in_use ); + foreach ( $unused_hashes as $unused_hash ) { + $this->feature->storage->delete( $object_id, $object_type, $unused_hash ); + } + + return $embeddings; + } + + /** + * Add the embedding data to the post vector sync args. + * + * @param array $args The current sync args (an Elasticsearch document) + * @param array $embeddings The embeddings to add to the sync args + * @return array + */ + public function add_chuncks_field_value( array $args, array $embeddings ): array { + // If we still don't have embeddings, return early. + if ( empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args.
+ $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } } diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 01fb95c..f78ea55 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -59,65 +59,43 @@ public function exclude_vector_meta( array $excluded_keys ): array { * @return array */ public function add_vector_field_to_post_sync( array $args, int $post_id ): array { - // No need to add vector data if no content exists. - $post = get_post( $post_id ); - if ( empty( $post->post_content ) ) { + if ( ! $this->should_add_vector_field_to_post( $post_id ) ) { return $args; } - $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); - - // Try to use the stored embeddings first. - $embeddings = get_post_meta( $post_id, $meta_field, true ); - - // If they don't exist, make API requests to generate them. - if ( ! $embeddings ) { - $embeddings = []; - - $content_chunks = $this->feature->chunk_content( $post->post_content ); - - // Get the embeddings for each chunk. - if ( ! empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $embedding = $this->feature->get_embedding( $chunk ); - - if ( $embedding && ! is_wp_error( $embedding ) ) { - $embeddings[] = array_map( 'floatval', $embedding ); - } - } - } - // Add embeddings for title. - $title_embedding = $this->feature->get_embedding( $this->feature->normalize_content( $post->post_title ) ); - if ( $title_embedding && ! 
is_wp_error( $title_embedding ) ) { - $embeddings[] = array_map( 'floatval', $title_embedding ); - } + $content_pieces = $this->get_object_content_pieces( $post_id ); + $embeddings = $this->get_updated_embeddings( $post_id, 'post', $content_pieces ); - // Add embeddings for slug. - $slug_embedding = $this->feature->get_embedding( $post->post_name ); - if ( $slug_embedding && ! is_wp_error( $slug_embedding ) ) { - $embeddings[] = array_map( 'floatval', $slug_embedding ); - } - - // Store the embeddings for future use. - if ( ! empty( $embeddings ) ) { - update_post_meta( $post_id, $meta_field, $embeddings ); - } - } - - // If we still don't have embeddings, return early. - if ( ! $embeddings || empty( $embeddings ) ) { - return $args; - } + return $this->add_chuncks_field_value( $args, $embeddings ); + } - // Add the embeddings data to the sync args. - $args['chunks'] = []; + /** + * Whether or not we should add the vector field to the post. + * + * @param int $post_id The Post ID + * @return boolean + */ + public function should_add_vector_field_to_post( int $post_id ): bool { + $post = get_post( $post_id ); + return ! empty( $post ); + } - foreach ( $embeddings as $embedding ) { - $args['chunks'][] = [ - 'vector' => array_map( 'floatval', $embedding ), - ]; - } + /** + * Return all content pieces for a given post ID. + * + * By default includes the title, the slug, and the post content, but could also add + * meta fields and taxonomy terms, for example. 
+ * + * @param int $post_id The Post ID + * @return array + */ + public function get_object_content_pieces( int $post_id ): array { + $post = get_post( $post_id ); - return $args; + return [ + $post->post_content, + $post->post_title, + $post->post_name, + ]; } } diff --git a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php new file mode 100644 index 0000000..46950e3 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php @@ -0,0 +1,207 @@ +feature = $feature; + } + + /** + * Setup hooks + */ + public function setup() { + add_action( 'init', [ $this, 'create_table' ] ); + } + + /** + * Return the custom table name + * + * @return string + */ + public function get_table_name(): string { + global $wpdb; + + return $wpdb->prefix . 'ep_embeddings_table'; + } + + /** + * Create the table + */ + public function create_table() { + global $wpdb; + + if ( $this->table_exists() ) { + return; + } + + $table_name = $this->get_table_name(); + + $charset_collate = $wpdb->get_charset_collate(); + + $sql = "CREATE TABLE $table_name ( + id bigint(20) unsigned NOT NULL AUTO_INCREMENT, + object_id bigint(20) unsigned NOT NULL, + object_type varchar(32) NOT NULL, + hash varchar(32) NOT NULL, + vectors longtext NOT NULL, + PRIMARY KEY (id), + INDEX object (object_id, object_type) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Whether or not the table exists + * + * @return boolean + */ + public function table_exists(): bool { + global $wpdb; + + $table_name = $this->get_table_name(); + + $table_exists = $wpdb->get_var( + $wpdb->prepare( 'SHOW TABLES LIKE %s', $table_name ) + ); + return ! \is_wp_error( $table_exists ) && ! 
\is_null( $table_exists ); + } + + /** + * Insert a new entry in the database + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @param string $text The text. It will be hashed and used as a key + * @param array $vectors Array of vectors + * @return void + */ + public function insert( int $object_id, string $object_type, string $text, array $vectors ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $wpdb->insert( + $table_name, + [ + 'object_id' => $object_id, + 'object_type' => $object_type, + 'hash' => $this->hash_content( $text ), + 'vectors' => wp_json_encode( $vectors ), + ] + ); + } + + /** + * Given a text, return the vectors if they exist in the database + * + * @param string $text The text. It will be hashed and used as a key + * @return array|null + */ + public function get( string $text ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $vectors = $wpdb->get_var( + $wpdb->prepare( + 'SELECT vectors FROM %s WHERE hash = %s', + $table_name, + $this->hash_content( $text ) + ) + ); + + return $vectors ? 
json_decode( $vectors ) : null; + } + + /** + * Get a full list of hashes for a given object + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @return array + */ + public function get_all_object_hashes( int $object_id, string $object_type ): array { + global $wpdb; + + $table_name = $this->get_table_name(); + + $rows = $wpdb->get_results( + $wpdb->prepare( + 'SELECT hash, vectors FROM %s WHERE object_id = %d AND object_type = %s', + $table_name, + $object_id, + $object_type + ) + ); + + return array_reduce( + $rows, + function ( $carry, $row ) { + $carry[ $row->hash ] = json_decode( $row->vectors ); + return $carry; + }, + [] + ); + } + + /** + * Delete a hash of a given object + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @param string $hash The hash + * @return void + */ + public function delete( int $object_id, string $object_type, string $hash ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $wpdb->delete( + $table_name, + [ + 'object_id' => $object_id, + 'object_type' => $object_type, + 'hash' => $hash, + ] + ); + } + + /** + * Hash the content. Uses md5 by default. + * + * @param string $content The content + * @return string + */ + public function hash_content( string $content ): string { + return md5( $content ); + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index e8b82c2..ecb46ba 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -39,6 +39,13 @@ class VectorEmbeddings extends Feature { */ protected $dimensions = 512; + /** + * Storage class instance. 
+ * + * @var Storage\DbTable + */ + public $storage; + /** * Default settings * @@ -83,6 +90,9 @@ public function setup() { $term_indexable = new Indexables\Term( $this ); $term_indexable->setup(); + + $this->storage = new Storage\DbTable( $this ); + $this->storage->setup(); } /** @@ -215,18 +225,20 @@ public function add_vector_mapping_field( array $mapping, $dimensions = null, bo /** * Get an embedding from a given text. * - * @param string $text Text to get the embedding for. - * @param bool $cache Whether to cache the result. Default false. - * @return array|WP_Error + * @param int $object_id The Object ID. + * @param string $object_type The Object type. + * @param string $text Text to get the embedding for. + * @param string $return_type Return type ('array' or 'raw'). Default 'array'. + * @param bool $cache Whether to cache the result. Default true. + * @return array|null|WP_Error */ - public function get_embedding( string $text, bool $cache = false ) { + public function get_embedding( int $object_id, string $object_type, string $text, string $return_type = 'array', bool $cache = true ) { // Check to see if we have a stored embedding. if ( $cache ) { - $key = 'ep_embedding_' . sanitize_title( $text ); - $query_embedding = wp_cache_get( $key, 'ep_embeddings' ); + $cached = $this->storage->get( $text ); - if ( $query_embedding ) { - return $query_embedding; + if ( $cached ) { + return $cached; } } @@ -234,17 +246,18 @@ public function get_embedding( string $text, bool $cache = false ) { $embedding = $this->generate_embedding( $text ); if ( is_wp_error( $embedding ) ) { - return $embedding; + return 'raw' === $return_type ? $embedding : null; } // Store the embedding for future use if desired. if ( $cache ) { - wp_cache_set( $key, $embedding, 'ep_embeddings', false ); + $this->storage->insert( $object_id, $object_type, $text, $embedding ); } return $embedding; } + /** * Generate an embedding for a particular piece of text. 
* From 689fedb465a41352e518802f2374fdb566907b90 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 20 Jan 2025 18:24:26 -0300 Subject: [PATCH 3/3] Fix tablename --- .../classes/Feature/VectorEmbeddings/Storage/DbTable.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php index 46950e3..e8614e9 100644 --- a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php +++ b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php @@ -132,8 +132,8 @@ public function get( string $text ) { $vectors = $wpdb->get_var( $wpdb->prepare( - 'SELECT vectors FROM %s WHERE hash = %s', - $table_name, + // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared + "SELECT vectors FROM {$table_name} WHERE hash = %s", $this->hash_content( $text ) ) ); @@ -155,8 +155,8 @@ public function get_all_object_hashes( int $object_id, string $object_type ): ar $rows = $wpdb->get_results( $wpdb->prepare( - 'SELECT hash, vectors FROM %s WHERE object_id = %d AND object_type = %s', - $table_name, + // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared + "SELECT hash, vectors FROM {$table_name} WHERE object_id = %d AND object_type = %s", $object_id, $object_type )