
build: merging dev changes to main branch #1599

Merged: 37 commits from dev into main on Jan 5, 2025
2dbb0b1
ci: Updated workflow to handle main, dev and demo branch | Dependabot…
Roopan-Microsoft Nov 25, 2024
97ced9f
fix: SFI Fixes & scope reverted to subscription (#1513)
Roopan-Microsoft Nov 25, 2024
903c259
ci: workflow updated for build docker (#1514)
Roopan-Microsoft Nov 25, 2024
4cf2972
ci: build docker updated for dev (#1522)
Roopan-Microsoft Nov 25, 2024
9158819
ci: workflow branch code updated (#1525)
Roopan-Microsoft Nov 25, 2024
4545a48
updated comment
AjitPadhi-Microsoft Nov 25, 2024
84a33b5
updated workflow
AjitPadhi-Microsoft Nov 25, 2024
f8c1c8f
updated workflow
AjitPadhi-Microsoft Nov 25, 2024
e31a661
updated workflow
AjitPadhi-Microsoft Nov 26, 2024
847ce7e
updated bicep for registry
AjitPadhi-Microsoft Nov 26, 2024
8f8cc98
fixed bicep
AjitPadhi-Microsoft Nov 26, 2024
15b948c
fix: bicep updated (#1527)
AjitPadhi-Microsoft Nov 26, 2024
1645e43
Merge branch 'main' into dev
Roopan-Microsoft Nov 27, 2024
8df387a
Merge branch 'dev' of https://github.com/Azure-Samples/chat-with-your…
Roopan-Microsoft Nov 27, 2024
e4dd5f2
fix: Container issue fix for multiple branch (#1539)
AjitPadhi-Microsoft Nov 27, 2024
88d06dc
fix: Updated workflow (#1540)
AjitPadhi-Microsoft Nov 27, 2024
e5cf4dd
ci: Psl container fix for checkout code from head branch (#1541)
Roopan-Microsoft Nov 27, 2024
02d0056
fix: multiple container tag issue fix (#1552)
AjitPadhi-Microsoft Dec 9, 2024
90e1040
fix: Workflow issue fix on docker image (#1554)
AjitPadhi-Microsoft Dec 9, 2024
8a5a1cb
fix: Downmerge dev (#1566)
Pavan-Microsoft Dec 18, 2024
131c323
fix: CWYD Citation Links to Documents Break After Specific Timeframe …
Harmanpreet-Microsoft Dec 18, 2024
93b84ed
fix: Commit changes bug (#1568)
UtkarshMishra-Microsoft Dec 18, 2024
0875b92
fix: Post-Deployment Script for Managing Bicep Outputs in .env File …
Pavan-Microsoft Dec 19, 2024
76190b3
fix: import error in env_helper.py (#1571)
Pavan-Microsoft Dec 19, 2024
c65bf01
fix: add conversation flow in environment variable for adminweb app (…
Priyanka-Microsoft Dec 19, 2024
1f326d9
fix: Update Hosting Model Configuration in ARM/Bicep Template (#1570)
Prasanjeet-Microsoft Dec 19, 2024
fc688c2
fix: Update main.json for changes conversation flow changes in bicep …
Priyanka-Microsoft Dec 19, 2024
56adb59
Merge branch 'main' into dev
Roopan-Microsoft Dec 24, 2024
afaabdc
build: Dependabotchanges merge to Dev branch (#1602)
Roopan-Microsoft Dec 30, 2024
03f52b3
feat: Configurable System Prompts for Flexibility and Maintenance - C…
Pavan-Microsoft Dec 31, 2024
e92eba1
fix: Add Missing Logs (#1609)
Pavan-Microsoft Jan 2, 2025
b0860ee
commented the prompt flow in azure yaml file
Roopan-Microsoft Jan 3, 2025
983b1f7
Merge branches 'dev' and 'dev' of https://github.com/Azure-Samples/ch…
Roopan-Microsoft Jan 3, 2025
dae4043
Add execute permissions parse_env.sh in postprovision hook
Pavan-Microsoft Jan 3, 2025
08ad207
fix: Build issue with docker
Prajwal-Microsoft Jan 4, 2025
c49ddfd
Update build-docker.yml
Prajwal-Microsoft Jan 5, 2025
5994f99
fix: Docker file failure issue
Prajwal-Microsoft Jan 5, 2025
22 changes: 14 additions & 8 deletions .env.sample
@@ -22,8 +22,9 @@ AZURE_SEARCH_DATASOURCE_NAME=
# Azure OpenAI for generating the answer and computing the embedding of the documents
AZURE_OPENAI_RESOURCE=
AZURE_OPENAI_API_KEY=
AZURE_OPENAI_MODEL_INFO="{\"model\":\"gpt-35-turbo-16k\",\"modelName\":\"gpt-35-turbo-16k\",\"modelVersion\":\"0613\"}"
AZURE_OPENAI_EMBEDDING_MODEL_INFO="{\"model\":\"text-embedding-ada-002\",\"modelName\":\"text-embedding-ada-002\",\"modelVersion\":\"2\"}"
AZURE_OPENAI_MODEL=gpt-35-turbo
AZURE_OPENAI_MODEL_NAME=gpt-35-turbo
AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002
AZURE_OPENAI_TEMPERATURE=0
AZURE_OPENAI_TOP_P=1.0
AZURE_OPENAI_MAX_TOKENS=1000
@@ -35,10 +36,12 @@ AZURE_OPENAI_STREAM=True
AzureWebJobsStorage=
BACKEND_URL=http://localhost:7071
DOCUMENT_PROCESSING_QUEUE_NAME=
# Azure Blob Storage for storing the original documents to be processed
AZURE_BLOB_STORAGE_INFO="{\"containerName\":\"documents\",\"accountName\":\"\",\"accountKey\":\"\"}"
AZURE_BLOB_ACCOUNT_NAME=
AZURE_BLOB_ACCOUNT_KEY=
AZURE_BLOB_CONTAINER_NAME=
# Azure Form Recognizer for extracting the text from the documents
AZURE_FORM_RECOGNIZER_INFO="{\"endpoint\":\"\",\"key\":\"\"}"
AZURE_FORM_RECOGNIZER_ENDPOINT=
AZURE_FORM_RECOGNIZER_KEY=
# Azure AI Content Safety for filtering out the inappropriate questions or answers
AZURE_CONTENT_SAFETY_ENDPOINT=
AZURE_CONTENT_SAFETY_KEY=
@@ -60,8 +63,11 @@ AZURE_KEY_VAULT_ENDPOINT=
# Chat conversation type to decide between custom or byod (bring your own data) conversation type
CONVERSATION_FLOW=
# Chat History CosmosDB Integration Settings
AZURE_COSMOSDB_INFO="{\"accountName\":\"cosmos-abc123\",\"databaseName\":\"db_conversation_history\",\"containerName\":\"conversations\"}"
AZURE_COSMOSDB_ACCOUNT_KEY=
AZURE_COSMOSDB_ACCOUNT_NAME=
AZURE_COSMOSDB_DATABASE_NAME=
AZURE_COSMOSDB_CONVERSATIONS_CONTAINER_NAME=
AZURE_COSMOSDB_ENABLE_FEEDBACK=
AZURE_POSTGRESQL_INFO="{\"user\":\"\",\"dbname\":\"postgres\",\"host\":\"\"}"
AZURE_POSTGRESQL_HOST_NAME=
AZURE_POSTGRESQL_DATABASE_NAME=
AZURE_POSTGRESQL_USER=
DATABASE_TYPE="CosmosDB"
11 changes: 4 additions & 7 deletions .github/workflows/build-docker-images.yml
@@ -1,9 +1,7 @@
name: Build Docker Images

on:
workflow_run:
workflows: [Tests]
types: [completed]
push:
branches:
- main
- dev
@@ -22,7 +20,6 @@ on:

jobs:
docker-build:
if: ${{ github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success' }}
strategy:
matrix:
include:
@@ -34,9 +31,9 @@ jobs:
dockerfile: docker/Frontend.Dockerfile
uses: ./.github/workflows/build-docker.yml
with:
registry: ${{ github.event.workflow_run.head_branch == 'main' && 'fruoccopublic.azurecr.io' || 'cwydcontainerreg.azurecr.io'}}
username: ${{ github.event.workflow_run.head_branch == 'main' && 'fruoccopublic' || 'cwydcontainerreg'}}
registry: ${{ github.ref_name == 'main' && 'fruoccopublic.azurecr.io' || 'cwydcontainerreg.azurecr.io'}}
username: ${{ github.ref_name == 'main' && 'fruoccopublic' || 'cwydcontainerreg'}}
app_name: ${{ matrix.app_name }}
dockerfile: ${{ matrix.dockerfile }}
push: ${{ github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'dev' || github.event.workflow_run.head_branch == 'demo' }}
push: ${{ github.ref_name == 'main' || github.ref_name == 'dev' || github.ref_name == 'demo' }}
secrets: inherit
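The updated workflow picks its container registry from `github.ref_name` instead of the old `workflow_run` payload. A minimal Python sketch of that expression's logic — registry names are taken from the diff above, the function names are illustrative:

```python
def pick_registry(ref_name: str) -> tuple[str, str]:
    """Mirror the workflow expression: main publishes to the public
    registry, every other branch to the dev registry."""
    if ref_name == "main":
        return "fruoccopublic.azurecr.io", "fruoccopublic"
    return "cwydcontainerreg.azurecr.io", "cwydcontainerreg"


def should_push(ref_name: str) -> bool:
    """Images are pushed only for the three long-lived branches."""
    return ref_name in ("main", "dev", "demo")
```

With `push:` triggers, `github.ref_name` is always the pushed branch, so the `workflow_run.conclusion` guard removed above is no longer needed.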
5 changes: 2 additions & 3 deletions .github/workflows/build-docker.yml
@@ -28,7 +28,6 @@ jobs:
docker-build:
runs-on: ubuntu-latest
steps:

- name: Checkout
uses: actions/checkout@v4

@@ -61,7 +60,7 @@ jobs:
context: .
file: ${{ inputs.dockerfile }}
push: ${{ inputs.push }}
cache-from: type=registry,ref=${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || 'latest' }}
cache-from: type=registry,ref=${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.head_ref || github.ref_name }}
tags: |
${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || 'latest' }}
${{ inputs.registry }}/${{ inputs.app_name}}:${{ github.ref_name == 'main' && 'latest' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.head_ref || 'default' }}
${{ inputs.registry }}/${{ inputs.app_name}}:${{ steps.date.outputs.date }}_${{ github.run_number }}
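The tag expressions above chain GitHub Actions `&&`/`||` operators so that a feature branch falls through to `github.head_ref` and finally to a default value. A hedged Python sketch of that fallback order (the function name is illustrative):

```python
def resolve_tag(ref_name: str, head_ref: str = "") -> str:
    """Emulate: ref_name == 'main' && 'latest'
                || ref_name == 'dev' && 'dev'
                || ref_name == 'demo' && 'demo'
                || head_ref || 'default'"""
    branch_tags = {"main": "latest", "dev": "dev", "demo": "demo"}
    # In Actions expressions, `a && b || c` returns b when a is truthy,
    # otherwise c; an empty head_ref falls through like a falsy value.
    return branch_tags.get(ref_name) or head_ref or "default"
```

This is why the PR adds `github.head_ref` to the cache-from and tag lines: on pull requests `ref_name` is a merge ref, and the head branch name keeps PR builds from all colliding on one tag.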
2 changes: 1 addition & 1 deletion .github/workflows/sync-branches.yml
@@ -15,7 +15,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for accurate branch comparison

1 change: 1 addition & 0 deletions README.md
@@ -50,6 +50,7 @@ Welcome to the *Chat with your data* Solution accelerator repository! The *Chat




### About this repo

This repository provides an end-to-end solution for users who want to query their data with natural language. It includes a well-designed ingestion mechanism for multiple file types, easy deployment, and a support team for maintenance. The accelerator demonstrates both Push and Pull ingestion and a choice of orchestration (Semantic Kernel, LangChain, OpenAI Functions or [Prompt Flow](docs/prompt_flow.md)); it is intended to be the minimum set of components needed to implement a RAG pattern. It is not intended to be put into production as-is without experimentation or evaluation of your data. It provides the following features:
8 changes: 7 additions & 1 deletion azure.yaml
@@ -5,7 +5,13 @@ metadata:
template: [email protected]
hooks:
postprovision:
run: ./infra/prompt-flow/create-prompt-flow.sh
# run: ./infra/prompt-flow/create-prompt-flow.sh
posix:
shell: sh
run: chmod +x ./scripts/parse_env.sh && ./scripts/parse_env.sh
windows:
shell: pwsh
run: ./scripts/parse_env.ps1
services:
web:
project: ./code
5 changes: 4 additions & 1 deletion code/backend/batch/batch_push_results.py
@@ -28,19 +28,22 @@ def _get_file_name_from_message(message_body) -> str:
)
def batch_push_results(msg: func.QueueMessage) -> None:
message_body = json.loads(msg.get_body().decode("utf-8"))
logger.debug("Process Document Event queue function triggered: %s", message_body)
logger.info("Process Document Event queue function triggered: %s", message_body)

event_type = message_body.get("eventType", "")
# We handle "" in this scenario for backwards compatibility
# This function is primarily triggered by an Event Grid queue message from the blob storage
# However, it can also be triggered using a legacy schema from BatchStartProcessing
if event_type in ("", "Microsoft.Storage.BlobCreated"):
logger.info("Handling 'Blob Created' event with message body: %s", message_body)
_process_document_created_event(message_body)

elif event_type == "Microsoft.Storage.BlobDeleted":
logger.info("Handling 'Blob Deleted' event with message body: %s", message_body)
_process_document_deleted_event(message_body)

else:
logger.exception("Received an unrecognized event type: %s", event_type)
raise NotImplementedError(f"Unknown event type received: {event_type}")


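The queue handler above dispatches on `eventType`, treating an empty string as the legacy `BatchStartProcessing` schema. A simplified, self-contained sketch of that dispatch logic (the handlers are reduced to return values for illustration):

```python
import logging

logger = logging.getLogger(__name__)


def dispatch_event(message_body: dict) -> str:
    """Route a queue message the way batch_push_results does.

    An empty eventType is accepted for backwards compatibility with
    the legacy BatchStartProcessing message schema."""
    event_type = message_body.get("eventType", "")
    if event_type in ("", "Microsoft.Storage.BlobCreated"):
        logger.info("Handling 'Blob Created' event: %s", message_body)
        return "created"
    if event_type == "Microsoft.Storage.BlobDeleted":
        logger.info("Handling 'Blob Deleted' event: %s", message_body)
        return "deleted"
    raise NotImplementedError(f"Unknown event type received: {event_type}")
```

Note the PR also promotes the trigger log from `debug` to `info`, so routine queue activity shows up in standard logs.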
@@ -247,7 +247,7 @@ def get_container_sas(self):
user_delegation_key=self.user_delegation_key,
account_key=self.account_key,
permission="r",
expiry=datetime.utcnow() + timedelta(hours=1),
expiry=datetime.utcnow() + timedelta(days=365 * 5),
)

def get_blob_sas(self, file_name):
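The diff above extends the container SAS expiry from one hour to roughly five years, matching the commit "CWYD Citation Links to Documents Break After Specific Timeframe". A stdlib sketch of the two expiry computations (a long-lived, account-key-signed SAS trades revocability for stable citation links; this is only the date arithmetic, not the SAS signing itself):

```python
from datetime import datetime, timedelta


def sas_expiry(now: datetime, long_lived: bool = True) -> datetime:
    """Compute the SAS expiry the way get_container_sas now does.

    long_lived=True matches the new behaviour (days=365 * 5);
    False matches the old one-hour expiry."""
    delta = timedelta(days=365 * 5) if long_lived else timedelta(hours=1)
    return now + delta
```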
@@ -1,10 +1,13 @@
import logging
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.identity import DefaultAzureCredential
import html
import traceback
from .env_helper import EnvHelper

logger = logging.getLogger(__name__)


class AzureFormRecognizerClient:
def __init__(self) -> None:
@@ -75,6 +78,8 @@ def begin_analyze_document_from_url(
model_id = "prebuilt-layout" if use_layout else "prebuilt-read"

try:
logger.info("Method begin_analyze_document_from_url started")
logger.info(f"Model ID selected: {model_id}")
poller = self.document_analysis_client.begin_analyze_document_from_url(
model_id, document_url=source_url
)
@@ -144,4 +149,7 @@ def begin_analyze_document_from_url(

return page_map
except Exception as e:
logger.exception(f"Exception in begin_analyze_document_from_url: {e}")
raise ValueError(f"Error: {traceback.format_exc()}. Error: {e}")
finally:
logger.info("Method begin_analyze_document_from_url ended")
18 changes: 12 additions & 6 deletions code/backend/batch/utilities/helpers/config/config_helper.py
@@ -52,6 +52,9 @@ def __init__(self, config: dict):
)
self.enable_chat_history = config["enable_chat_history"]
self.database_type = config.get("database_type", self.env_helper.DATABASE_TYPE)
self.conversational_flow = config.get(
"conversational_flow", self.env_helper.CONVERSATION_FLOW
)

def get_available_document_types(self) -> list[str]:
document_types = {
@@ -187,21 +190,27 @@ def _set_new_config_properties(config: dict, default_config: dict):
@staticmethod
@functools.cache
def get_active_config_or_default():
logger.info("Method get_active_config_or_default started")
env_helper = EnvHelper()
config = ConfigHelper.get_default_config()

if env_helper.LOAD_CONFIG_FROM_BLOB_STORAGE:
logger.info("Loading configuration from Blob Storage")
blob_client = AzureBlobStorageClient(container_name=CONFIG_CONTAINER_NAME)

if blob_client.file_exists(CONFIG_FILE_NAME):
logger.info("Configuration file found in Blob Storage")
default_config = config
config_file = blob_client.download_file(CONFIG_FILE_NAME)
config = json.loads(config_file)

ConfigHelper._set_new_config_properties(config, default_config)
else:
logger.info("Returning default config")
logger.info(
"Configuration file not found in Blob Storage, using default configuration"
)

logger.info("Method get_active_config_or_default ended")
return Config(config)

@staticmethod
@@ -247,11 +256,7 @@ def get_default_config():
logger.info("Loading default config from %s", config_file_path)
ConfigHelper._default_config = json.loads(
Template(f.read()).substitute(
ORCHESTRATION_STRATEGY=(
OrchestrationStrategy.SEMANTIC_KERNEL.value
if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
else env_helper.ORCHESTRATION_STRATEGY
),
ORCHESTRATION_STRATEGY=env_helper.ORCHESTRATION_STRATEGY,
LOG_USER_INTERACTIONS=(
False
if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
@@ -262,6 +267,7 @@
if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
else True
),
CONVERSATION_FLOW=env_helper.CONVERSATION_FLOW,
DATABASE_TYPE=env_helper.DATABASE_TYPE,
)
)
2 changes: 1 addition & 1 deletion code/backend/batch/utilities/helpers/config/default.json
@@ -9,7 +9,7 @@
"enable_post_answering_prompt": false,
"ai_assistant_type": "default",
"enable_content_safety": true,
"conversational_flow": "custom"
"conversational_flow": "${CONVERSATION_FLOW}"
},
"example": {
"documents": "{\n \"retrieved_documents\": [\n {\n \"[doc1]\": {\n \"content\": \"Dual Transformer Encoder (DTE) DTE (https://dev.azure.com/TScience/TSciencePublic/_wiki/wikis/TSciencePublic.wiki/82/Dual-Transformer-Encoder) DTE is a general pair-oriented sentence representation learning framework based on transformers. It provides training, inference and evaluation for sentence similarity models. Model Details DTE can be used to train a model for sentence similarity with the following features: - Build upon existing transformer-based text representations (e.g.TNLR, BERT, RoBERTa, BAG-NLR) - Apply smoothness inducing technology to improve the representation robustness - SMART (https://arxiv.org/abs/1911.03437) SMART - Apply NCE (Noise Contrastive Estimation) based similarity learning to speed up training of 100M pairs We use pretrained DTE model\"\n }\n },\n {\n \"[doc2]\": {\n \"content\": \"trained on internal data. You can find more details here - Models.md (https://dev.azure.com/TScience/_git/TSciencePublic?path=%2FDualTransformerEncoder%2FMODELS.md&version=GBmaster&_a=preview) Models.md DTE-pretrained for In-context Learning Research suggests that finetuned transformers can be used to retrieve semantically similar exemplars for e.g. KATE (https://arxiv.org/pdf/2101.06804.pdf) KATE . They show that finetuned models esp. tuned on related tasks give the maximum boost to GPT-3 in-context performance. DTE have lot of pretrained models that are trained on intent classification tasks. We can use these model embedding to find natural language utterances which are similar to our test utterances at test time. The steps are: 1. Embed\"\n }\n },\n {\n \"[doc3]\": {\n \"content\": \"train and test utterances using DTE model 2. For each test embedding, find K-nearest neighbors. 3. Prefix the prompt with nearest embeddings. 
The following diagram from the above paper (https://arxiv.org/pdf/2101.06804.pdf) the above paper visualizes this process: DTE-Finetuned This is an extension of DTE-pretrained method where we further finetune the embedding models for prompt crafting task. In summary, we sample random prompts from our training data and use them for GPT-3 inference for the another part of training data. Some prompts work better and lead to right results whereas other prompts lead\"\n }\n },\n {\n \"[doc4]\": {\n \"content\": \"to wrong completions. We finetune the model on the downstream task of whether a prompt is good or not based on whether it leads to right or wrong completion. This approach is similar to this paper: Learning To Retrieve Prompts for In-Context Learning (https://arxiv.org/pdf/2112.08633.pdf) this paper: Learning To Retrieve Prompts for In-Context Learning . This method is very general but it may require a lot of data to actually finetune a model to learn how to retrieve examples suitable for the downstream inference model like GPT-3.\"\n }\n }\n ]\n}",
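default.json now takes `conversational_flow` from the `${CONVERSATION_FLOW}` placeholder, which `ConfigHelper.get_default_config` fills with `string.Template.substitute` (as the config_helper.py diff above adds `CONVERSATION_FLOW=env_helper.CONVERSATION_FLOW`). A minimal sketch of that substitution, with the JSON fragment abbreviated:

```python
import json
from string import Template

# Abbreviated stand-in for the default.json template file.
raw = '{"prompts": {"conversational_flow": "${CONVERSATION_FLOW}"}}'

# ConfigHelper substitutes env-derived values into the template, then
# parses the result as JSON.
config = json.loads(Template(raw).substitute(CONVERSATION_FLOW="custom"))
```

This replaces the previously hardcoded `"conversational_flow": "custom"`, so the default flow follows the `CONVERSATION_FLOW` environment variable.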
@@ -15,11 +15,16 @@ class IntegratedVectorizationEmbedder(EmbedderBase):
def __init__(self, env_helper: EnvHelper):
self.env_helper = env_helper
self.llm_helper: LLMHelper = LLMHelper()
logger.info("Initialized IntegratedVectorizationEmbedder.")

def embed_file(self, source_url: str, file_name: str = None):
logger.info(
f"Starting embed_file for source_url: {source_url}, file_name: {file_name}."
)
self.process_using_integrated_vectorization(source_url=source_url)

def process_using_integrated_vectorization(self, source_url: str):
logger.info(f"Starting integrated vectorization for source_url: {source_url}.")
config = ConfigHelper.get_active_config_or_default()
try:
search_datasource = AzureSearchDatasource(self.env_helper)
@@ -35,14 +40,20 @@ def process_using_integrated_vectorization(self, source_url: str):
self.env_helper.AZURE_SEARCH_INDEXER_NAME,
skillset_name=search_skillset_result.name,
)
logger.info("Integrated vectorization process completed successfully.")
return indexer_result
except Exception as e:
logger.error(f"Error processing {source_url}: {e}")
raise e

def reprocess_all(self):
logger.info("Starting reprocess_all operation.")
search_indexer = AzureSearchIndexer(self.env_helper)
if search_indexer.indexer_exists(self.env_helper.AZURE_SEARCH_INDEXER_NAME):
logger.info(
f"Running indexer: {self.env_helper.AZURE_SEARCH_INDEXER_NAME}."
)
search_indexer.run_indexer(self.env_helper.AZURE_SEARCH_INDEXER_NAME)
else:
logger.info("Indexer does not exist. Starting full processing.")
self.process_using_integrated_vectorization(source_url="all")
@@ -20,6 +20,7 @@

class PostgresEmbedder(EmbedderBase):
def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
logger.info("Initializing PostgresEmbedder.")
self.env_helper = env_helper
self.llm_helper = LLMHelper()
self.azure_postgres_helper = AzurePostgresHelper()
@@ -33,6 +34,7 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
self.embedding_configs[ext] = processor

def embed_file(self, source_url: str, file_name: str):
logger.info(f"Embedding file: {file_name} from source: {source_url}")
file_extension = file_name.split(".")[-1].lower()
embedding_config = self.embedding_configs.get(file_extension)
self.__embed(
@@ -48,32 +50,42 @@ def embed_file(self, source_url: str, file_name: str):
def __embed(
self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig
):
logger.info(f"Starting embedding process for source: {source_url}")
documents_to_upload: List[SourceDocument] = []
if (
embedding_config.use_advanced_image_processing
and file_extension
in self.config.get_advanced_image_processing_image_types()
):
logger.error(
"Advanced image processing is not supported in PostgresEmbedder."
)
raise NotImplementedError(
"Advanced image processing is not supported in PostgresEmbedder."
)
else:
logger.info(f"Loading documents from source: {source_url}")
documents: List[SourceDocument] = self.document_loading.load(
source_url, embedding_config.loading
)
documents = self.document_chunking.chunk(
documents, embedding_config.chunking
)
logger.info("Chunked into document chunks.")

for document in documents:
documents_to_upload.append(self.__convert_to_search_document(document))

if documents_to_upload:
logger.info(
f"Uploading {len(documents_to_upload)} documents to vector store."
)
self.azure_postgres_helper.create_vector_store(documents_to_upload)
else:
logger.warning("No documents to upload.")

def __convert_to_search_document(self, document: SourceDocument):
logger.info(f"Generating embeddings for document ID: {document.id}")
embedded_content = self.llm_helper.generate_embeddings(document.content)
metadata = {
"id": document.id,
@@ -84,6 +96,7 @@ def __convert_to_search_document(self, document: SourceDocument):
"offset": document.offset,
"page_number": document.page_number,
}
logger.info(f"Metadata generated for document ID: {document.id}")
return {
"id": document.id,
"content": document.content,