Skip to content

Commit

Permalink
Add deletions + Fix image OCR
Browse files Browse the repository at this point in the history
* add new memory API to delete documents, supporting retry and long running operation, + examples
* add Azure Form Recognizer interactive setup
* add image OCR examples
* allow to request max results to vector DBs
* move default handlers registration code to memory client builder, with full support for DI. This will allow cleaning up/reducing the orchestrators interface and reduce coupling/complexity.
* fix Azure Form Recognizer, crashing both service and serverless memory and other issues
* fix potential overflow in Azure Search and handle correctly max results
* fix tags filtering in SimpleVectorDb
  • Loading branch information
dluc committed Aug 29, 2023
1 parent 50c369e commit 5541455
Show file tree
Hide file tree
Showing 29 changed files with 977 additions and 386 deletions.
2 changes: 2 additions & 0 deletions SemanticMemory.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=MQTT/@EntryIndexedValue">MQTT</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=MS/@EntryIndexedValue">MS</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=MSAL/@EntryIndexedValue">MSAL</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=OCR/@EntryIndexedValue">OCR</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=OID/@EntryIndexedValue">OID</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=OK/@EntryIndexedValue">OK</s:String>
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=OS/@EntryIndexedValue">OS</s:String>
Expand Down Expand Up @@ -254,6 +255,7 @@ public void It$SOMENAME$()
<s:Boolean x:Key="/Default/UserDictionary/Words/=skprompt/@EntryIndexedValue">True</s:Boolean>
<s:String x:Key="/Default/CodeInspection/Highlighting/InspectionSeverities/=Xunit_002EXunitTestWithConsoleOutput/@EntryIndexedValue">DO_NOT_SHOW</s:String>
<s:Boolean x:Key="/Default/UserDictionary/Words/=smemory/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=SVCS/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=testsettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=tldr/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=typeparam/@EntryIndexedValue">True</s:Boolean>
Expand Down
7 changes: 6 additions & 1 deletion dotnet/ClientLib/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,12 @@ public static class Constants
public const string HttpSearchEndpoint = "/search";
public const string HttpUploadEndpoint = "/upload";
public const string HttpUploadStatusEndpoint = "/upload-status";
public const string HttpUploadStatusEndpointWithParams = $"/upload-status?{WebServiceIndexField}={HttpIndexPlaceholder}&{WebServiceDocumentIdField}={HttpDocumentIdPlaceholder}";
public const string HttpDocumentsEndpoint = "/documents";
public const string HttpDeleteEndpointWithParams = $"{HttpDocumentsEndpoint}?{WebServiceIndexField}={HttpIndexPlaceholder}&{WebServiceDocumentIdField}={HttpDocumentIdPlaceholder}";
public const string HttpUploadStatusEndpointWithParams = $"{HttpUploadStatusEndpoint}?{WebServiceIndexField}={HttpIndexPlaceholder}&{WebServiceDocumentIdField}={HttpDocumentIdPlaceholder}";
public const string HttpIndexPlaceholder = "{index}";
public const string HttpDocumentIdPlaceholder = "{documentId}";

// Handlers
public const string DeleteDocumentPipelineStepName = "private_delete_document";
}
11 changes: 11 additions & 0 deletions dotnet/ClientLib/ISemanticMemoryClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,17 @@ public Task<string> ImportWebPageAsync(
IEnumerable<string>? steps = null,
CancellationToken cancellationToken = default);

/// <summary>
/// Delete a specified document from memory, and update all derived memories.
/// </summary>
/// <param name="documentId">Document ID</param>
/// <param name="index">Optional index name</param>
/// <param name="cancellationToken">Async task cancellation token</param>
public Task DeleteDocumentAsync(
string documentId,
string? index = null,
CancellationToken cancellationToken = default);

/// <summary>
/// Check if a document ID exists in the given index and is ready for usage.
/// The logic checks if the uploaded document has been fully processed.
Expand Down
30 changes: 30 additions & 0 deletions dotnet/ClientLib/MemoryWebClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,36 @@ public async Task<string> ImportWebPageAsync(
.ConfigureAwait(false);
}

/// <inheritdoc />
public async Task DeleteDocumentAsync(string documentId, string? index = null, CancellationToken cancellationToken = default)
{
if (string.IsNullOrWhiteSpace(documentId))
{
throw new SemanticMemoryException("The document ID is empty");
}

index = IndexExtensions.CleanName(index);
var url = Constants.HttpDeleteEndpointWithParams
.Replace(Constants.HttpIndexPlaceholder, index)
.Replace(Constants.HttpDocumentIdPlaceholder, documentId);
HttpResponseMessage? response = await this._client.DeleteAsync(url, cancellationToken).ConfigureAwait(false);

// No error if the document doesn't exist
if (response.StatusCode == HttpStatusCode.NotFound)
{
return;
}

try
{
response.EnsureSuccessStatusCode();
}
catch (Exception e)
{
throw new SemanticMemoryException($"Delete failed, status code: {response.StatusCode}", e);
}
}

/// <inheritdoc />
public async Task<bool> IsDocumentReadyAsync(
string documentId,
Expand Down
74 changes: 64 additions & 10 deletions dotnet/CoreLib/AppBuilders/MemoryClientBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
using Microsoft.SemanticMemory.ContentStorage.DevTools;
using Microsoft.SemanticMemory.DataFormats.Image;
using Microsoft.SemanticMemory.DataFormats.Image.AzureFormRecognizer;
using Microsoft.SemanticMemory.Handlers;
using Microsoft.SemanticMemory.MemoryStorage;
using Microsoft.SemanticMemory.MemoryStorage.DevTools;
using Microsoft.SemanticMemory.MemoryStorage.Qdrant;
Expand All @@ -33,6 +34,7 @@ public class MemoryClientBuilder
{
private readonly IServiceCollection? _sharedServiceCollection;
private const string ConfigRoot = "SemanticMemory";
private bool _useDefaultHandlers = true;

private enum ClientTypes
{
Expand Down Expand Up @@ -83,6 +85,12 @@ public MemoryClientBuilder(WebApplicationBuilder appBuilder)
this.WithSimpleFileStorage(new SimpleFileStorageConfig { Directory = Path.Join(Path.GetTempPath(), "content") });
}

public MemoryClientBuilder WithoutDefaultHandlers()
{
this._useDefaultHandlers = false;
return this;
}

public MemoryClientBuilder WithCustomIngestionQueueClientFactory(QueueClientFactory service)
{
service = service ?? throw new ConfigurationException("The ingestion queue client factory instance is NULL");
Expand Down Expand Up @@ -419,6 +427,28 @@ public Memory BuildServerlessClient()
try
{
this.CompleteServerlessClient();

if (this._useDefaultHandlers)
{
this._appBuilder.Services.AddTransient<TextExtractionHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<TextExtractionHandler>(serviceProvider, "extract"));

this._appBuilder.Services.AddTransient<TextPartitioningHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<TextPartitioningHandler>(serviceProvider, "partition"));

this._appBuilder.Services.AddTransient<SummarizationHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<SummarizationHandler>(serviceProvider, "summarize"));

this._appBuilder.Services.AddTransient<GenerateEmbeddingsHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<GenerateEmbeddingsHandler>(serviceProvider, "gen_embeddings"));

this._appBuilder.Services.AddTransient<SaveEmbeddingsHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<SaveEmbeddingsHandler>(serviceProvider, "save_embeddings"));

this._appBuilder.Services.AddTransient<DeleteDocumentHandler>(serviceProvider
=> ActivatorUtilities.CreateInstance<DeleteDocumentHandler>(serviceProvider, Constants.DeleteDocumentPipelineStepName));
}

this._app = this._appBuilder.Build();

// In case the user didn't set the embedding generator and vector DB to use for ingestion, use the values set for retrieval
Expand All @@ -427,9 +457,20 @@ public Memory BuildServerlessClient()

var orchestrator = this._app.Services.GetService<InProcessPipelineOrchestrator>() ?? throw new ConfigurationException("Unable to build orchestrator");
var searchClient = this._app.Services.GetService<SearchClient>() ?? throw new ConfigurationException("Unable to build search client");
var ocrEngine = this._app.Services.GetService<IOcrEngine>();

return new Memory(orchestrator, searchClient, ocrEngine);
var instance = new Memory(orchestrator, searchClient);

if (this._useDefaultHandlers)
{
instance.AddHandler(this._app.Services.GetService<TextExtractionHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(TextExtractionHandler)));
instance.AddHandler(this._app.Services.GetService<TextPartitioningHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(TextPartitioningHandler)));
instance.AddHandler(this._app.Services.GetService<SummarizationHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(SummarizationHandler)));
instance.AddHandler(this._app.Services.GetService<GenerateEmbeddingsHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(GenerateEmbeddingsHandler)));
instance.AddHandler(this._app.Services.GetService<SaveEmbeddingsHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(SaveEmbeddingsHandler)));
instance.AddHandler(this._app.Services.GetService<DeleteDocumentHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(DeleteDocumentHandler)));
}

return instance;
}
catch (Exception e)
{
Expand All @@ -450,6 +491,27 @@ public MemoryService BuildAsyncClient()
var orchestrator = app.Services.GetService<DistributedPipelineOrchestrator>() ?? throw new ConfigurationException("Unable to build orchestrator");
var searchClient = app.Services.GetService<SearchClient>() ?? throw new ConfigurationException("Unable to build search client");

if (this._useDefaultHandlers)
{
if (this._sharedServiceCollection == null)
{
const string ClassName = nameof(MemoryClientBuilder);
const string MethodName = nameof(this.WithoutDefaultHandlers);
throw new ConfigurationException("Service collection not available, unable to register default handlers. " +
$"If you'd like using the default handlers use `new {ClassName}(<your service collection provider>)`, " +
$"otherwise use `{ClassName}(...).{MethodName}()` to manage the list of handlers manually.");
}

// Handlers - Register these handlers to run as hosted services in the caller app.
// At start each hosted handler calls IPipelineOrchestrator.AddHandlerAsync() to register in the orchestrator.
this._sharedServiceCollection.AddHandlerAsHostedService<TextExtractionHandler>("extract");
this._sharedServiceCollection.AddHandlerAsHostedService<SummarizationHandler>("summarize");
this._sharedServiceCollection.AddHandlerAsHostedService<TextPartitioningHandler>("partition");
this._sharedServiceCollection.AddHandlerAsHostedService<GenerateEmbeddingsHandler>("gen_embeddings");
this._sharedServiceCollection.AddHandlerAsHostedService<SaveEmbeddingsHandler>("save_embeddings");
this._sharedServiceCollection.AddHandlerAsHostedService<DeleteDocumentHandler>(Constants.DeleteDocumentPipelineStepName);
}

return new MemoryService(orchestrator, searchClient);
}

Expand All @@ -473,14 +535,6 @@ private MemoryClientBuilder CompleteAsyncClient()
return this;
}

private MemoryClientBuilder AddSingleton<TService>(Func<IServiceProvider, TService> serviceFactory)
where TService : class
{
this._appBuilder.Services.AddSingleton<TService>(serviceFactory);
this._sharedServiceCollection?.AddSingleton<TService>(serviceFactory);
return this;
}

private MemoryClientBuilder AddSingleton<TService>(TService implementationInstance)
where TService : class
{
Expand Down
92 changes: 86 additions & 6 deletions dotnet/CoreLib/ContentStorage/AzureBlobs/AzureBlobsStorage.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,24 @@ public AzureBlobsStorage(
}

/// <inherit />
public string JoinPaths(string path1, string path2)
public async Task CreateIndexDirectoryAsync(
string index,
CancellationToken cancellationToken = default)
{
return $"{path1}/{path2}";
this._log.LogTrace("Creating container '{0}' ...", this._containerName);

await this._containerClient
.CreateIfNotExistsAsync(PublicAccessType.None, cancellationToken: cancellationToken)
.ConfigureAwait(false);

this._log.LogTrace("Container '{0}' ready", this._containerName);
}

/// <inherit />
public async Task CreateDirectoryAsync(string directoryName, CancellationToken cancellationToken = default)
public async Task CreateDocumentDirectoryAsync(
string index,
string documentId,
CancellationToken cancellationToken = default)
{
this._log.LogTrace("Creating container '{0}' ...", this._containerName);

Expand All @@ -121,20 +132,38 @@ await this._containerClient
}

/// <inherit />
public Task WriteTextFileAsync(string directoryName, string fileName, string fileContent, CancellationToken cancellationToken = default)
public Task WriteTextFileAsync(
string index,
string documentId,
string fileName,
string fileContent,
CancellationToken cancellationToken = default)
{
var directoryName = JoinPaths(index, documentId);
return this.InternalWriteAsync(directoryName, fileName, fileContent, cancellationToken);
}

/// <inherit />
public Task<long> WriteStreamAsync(string directoryName, string fileName, Stream contentStream, CancellationToken cancellationToken = default)
public Task<long> WriteStreamAsync(
string index,
string documentId,
string fileName,
Stream contentStream,
CancellationToken cancellationToken = default)
{
var directoryName = JoinPaths(index, documentId);
return this.InternalWriteAsync(directoryName, fileName, contentStream, cancellationToken);
}

/// <inherit />
public async Task<BinaryData> ReadFileAsync(string directoryName, string fileName, bool errIfNotFound = true, CancellationToken cancellationToken = default)
public async Task<BinaryData> ReadFileAsync(
string index,
string documentId,
string fileName,
bool errIfNotFound = true,
CancellationToken cancellationToken = default)
{
var directoryName = JoinPaths(index, documentId);
var blobName = $"{directoryName}/{fileName}";
BlobClient blobClient = this.GetBlobClient(blobName);

Expand All @@ -158,6 +187,57 @@ public async Task<BinaryData> ReadFileAsync(string directoryName, string fileNam
}
}

/// <inherit />
public async Task DeleteDocumentDirectoryAsync(string index, string documentId, CancellationToken cancellationToken = default)
{
var directoryName = JoinPaths(index, documentId);
if (string.IsNullOrWhiteSpace(index) || string.IsNullOrWhiteSpace(documentId) || string.IsNullOrWhiteSpace(directoryName))
{
throw new ContentStorageException("The blob prefix is empty, stopping the process to prevent data loss");
}

this._log.LogInformation("Deleting blobs at {0}", directoryName);

AsyncPageable<BlobItem>? blobList = this._containerClient.GetBlobsAsync(prefix: directoryName, cancellationToken: cancellationToken);
await foreach (Page<BlobItem> page in blobList.AsPages().WithCancellation(cancellationToken))
{
foreach (BlobItem blob in page.Values)
{
// Skip root and empty names
if (string.IsNullOrWhiteSpace(blob.Name) || blob.Name == directoryName) { continue; }

// Remove the prefix, skip root and empty names
var fileName = blob.Name.Trim('/').Substring(directoryName.Trim('/').Length).Trim('/');
if (string.IsNullOrWhiteSpace(fileName)) { continue; }

// Don't delete the pipeline status file
if (fileName == Constants.PipelineStatusFilename) { continue; }

this._log.LogInformation("Deleting blob {0}", blob.Name);
Response? response = await this.GetBlobClient(blob.Name).DeleteAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
if (response.Status < 300)
{
this._log.LogDebug("Delete response: {0}", response.Status);
}
else
{
this._log.LogWarning("Unexpected delete response: {0}", response.Status);
}
}
}
}

/// <summary>
/// Join index name and document ID, using the platform specific logic, to calculate the directory name
/// </summary>
/// <param name="index">Index name, left side of the path</param>
/// <param name="documentId">Document ID, right side of the path (appended to index)</param>
/// <returns>Index name concatenated with Document Id into a single path</returns>
private static string JoinPaths(string index, string documentId)
{
return $"{index}/{documentId}";
}

private async Task<long> InternalWriteAsync(string directoryName, string fileName, object content, CancellationToken cancellationToken)
{
var blobName = $"{directoryName}/{fileName}";
Expand Down
Loading

0 comments on commit 5541455

Please sign in to comment.