nulib · mbklein · Jun 24, 2024 · Jun 13, 2024 · Jun 13, 2024 · Jun 12, 2024
diff --git a/.github/workflows/test-node.yml b/.github/workflows/test-node.yml
@@ -18,7 +18,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-node@v3
         with:
-          node-version: 16.x
+          node-version: 20.x
           cache: "npm"
           cache-dependency-path: 'node/package-lock.json'
       - run: npm ci

diff --git a/.tool-versions b/.tool-versions
@@ -1,4 +1,4 @@
-nodejs 16.14.0
+nodejs 20.15.0
 java corretto-19.0.1.10.1
 aws-sam-cli 1.107.0
 python 3.10.5
diff --git a/chat/src/handlers/chat.py b/chat/src/handlers/chat.py
@@ -24,6 +24,10 @@ def handler(event, context):
     if not config.is_logged_in:
         config.socket.send({"type": "error", "message": "Unauthorized"})
         return {"statusCode": 401, "body": "Unauthorized"}
+
+    if config.question is None or config.question == "":
+        config.socket.send({"type": "error", "message": "Question cannot be blank"})
+        return {"statusCode": 400, "body": "Question cannot be blank"}
 
     debug_message = config.debug_message()
     if config.debug_mode:

diff --git a/chat/src/handlers/opensearch_neural_search.py b/chat/src/handlers/opensearch_neural_search.py
@@ -2,7 +2,7 @@
 from langchain_core.vectorstores import VectorStore
 from opensearchpy import OpenSearch
 from typing import Any, List, Tuple
-
+from helpers.hybrid_query import hybrid_query
 
 class OpenSearchNeuralSearch(VectorStore):
     """Read-only OpenSearch vectorstore with neural search."""
@@ -40,33 +40,8 @@ def similarity_search_with_score(
         self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any
     ) -> List[Tuple[Document, float]]:
         """Return docs most similar to query."""
-        dsl = {
-            "size": k,
-            "query": {
-                "hybrid": {
-                    "queries": [
-                        {
-                            "neural": {
-                                self.vector_field: {
-                                    "query_text": query,
-                                    "model_id": self.model_id,
-                                    "k": k,
-                                }
-                            }
-                        }
-                    ]
-                }
-            },
-        }
-
-        if subquery:
-            dsl["query"]["hybrid"]["queries"].append(subquery)
-
-        for key, value in kwargs.items():
-            dsl[key] = value
-
+        dsl = hybrid_query(query=query, model_id=self.model_id, vector_field=self.vector_field, k=k, subquery=subquery, **kwargs)
         response = self.client.search(index=self.index, body=dsl, params={"search_pipeline": self.search_pipeline} if self.search_pipeline else None)
-
         documents_with_scores = [
             (
                 Document(

diff --git a/chat/src/helpers/hybrid_query.py b/chat/src/helpers/hybrid_query.py
@@ -0,0 +1,71 @@
+from typing import Any
+
+def filter(query: dict):
+    """Combine `query` with the visibility and published constraints.
+
+    Returns a `bool` clause whose `must` list is `query`, then a `terms` match
+    on `visibility` (Public or Institution), then a `term` match on `published`.
+    """
+    visibility_clause = {"terms": {"visibility": ["Public", "Institution"]}}
+    published_clause = {"term": {"published": True}}
+    must_clauses = [query, visibility_clause, published_clause]
+    return {"bool": {"must": must_clauses}}
+
+def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: int = 10, subquery: Any = None, **kwargs: Any):
+    """Build the request body for a hybrid keyword + neural search.
+
+    The `hybrid` clause holds a `query_string` query and a `neural` query,
+    plus `subquery` when one is given; every clause is wrapped by `filter`.
+    A `search_pipeline` with a normalization-processor is included in the body.
+
+    Args:
+        query: text used by both the `query_string` and `neural` clauses.
+        model_id: model id handed to the `neural` clause.
+        vector_field: field name the `neural` clause is keyed on.
+        k: used for the top-level `size` and the `neural` clause's `k`.
+        subquery: optional extra query appended as a third clause.
+        **kwargs: set as top-level keys, replacing same-named keys.
+
+    Returns:
+        The assembled body as a dict.
+    """
+    keyword_clause = {
+        "query_string": {
+            "default_operator": "AND",
+            "fields": ["title^5", "all_controlled_labels", "all_ids^5"],
+            "query": query,
+        }
+    }
+    neural_clause = {
+        "neural": {
+            vector_field: {"k": k, "model_id": model_id, "query_text": query}
+        }
+    }
+
+    clauses = [filter(keyword_clause), filter(neural_clause)]
+    if subquery:
+        clauses.append(filter(subquery))
+
+    # Three weights when a subquery adds a third clause, two otherwise.
+    weights = [0.5, 0.3, 0.2] if subquery else [0.7, 0.3]
+    normalization_processor = {
+        "combination": {
+            "parameters": {"weights": weights},
+            "technique": "arithmetic_mean",
+        },
+        "normalization": {"technique": "l2"},
+    }
+
+    result = {
+        "size": k,
+        "query": {"hybrid": {"queries": clauses}},
+        "search_pipeline": {
+            "phase_results_processors": [
+                {"normalization-processor": normalization_processor}
+            ]
+        },
+    }
+    result.update(kwargs)
+    return result
+
+
diff --git a/chat/src/setup.py b/chat/src/setup.py
@@ -39,7 +39,6 @@ def opensearch_vector_store(region_name=os.getenv("AWS_REGION")):
         endpoint=os.getenv("OPENSEARCH_ENDPOINT"),
         connection_class=RequestsHttpConnection,
         http_auth=awsauth,
-        search_pipeline=prefix("dc-v2-work-pipeline"),
         text_field= "id"
     )
     return docsearch

diff --git a/chat/template.yaml b/chat/template.yaml
@@ -241,7 +241,7 @@ Resources:
           - logs:PutLogEvents
           Resource: !Sub "${ChatMetricsLog.Arn}:*"
     Metadata:
-      BuildMethod: nodejs18.x
+      BuildMethod: nodejs20.x
   ChatMetricsLog:
     Type: AWS::Logs::LogGroup
     Properties:

diff --git a/chat/test/handlers/test_chat.py b/chat/test/handlers/test_chat.py
@@ -35,11 +35,11 @@ class TestHandler(TestCase):
     def test_handler_unauthorized(self):        
         event = {"socket": Websocket(client=MockClient(), endpoint_url="test", connection_id="test", ref="test")}
         self.assertEqual(handler(event, MockContext()), {'body': 'Unauthorized', 'statusCode': 401})
-      
+
     @patch.object(ApiToken, 'is_logged_in')
     def test_handler_success(self, mock_is_logged_in):
       mock_is_logged_in.return_value = True
-      event = {"socket": Websocket(client=MockClient(), endpoint_url="test", connection_id="test", ref="test")}
+      event = {"socket": Websocket(client=MockClient(), endpoint_url="test", connection_id="test", ref="test"), "body": '{"question": "Question?"}' }
       self.assertEqual(handler(event, MockContext()), {'statusCode': 200})
 
     @patch.object(ApiToken, 'is_logged_in')
@@ -51,7 +51,7 @@ def test_handler_debug_mode(self, mock_is_debug_enabled, mock_is_logged_in, mock
       mock_is_superuser.return_value = True
       mock_client = MockClient()
       mock_websocket = Websocket(client=mock_client, endpoint_url="test", connection_id="test", ref="test")
-      event = {"socket": mock_websocket, "debug": True}
+      event = {"socket": mock_websocket, "debug": True, "body": '{"question": "Question?"}' }
       handler(event, MockContext())
       response = json.loads(mock_client.received_data)
       self.assertEqual(response["type"], "debug")
@@ -65,7 +65,29 @@ def test_handler_debug_mode_for_superusers_only(self, mock_is_debug_enabled, moc
       mock_is_superuser.return_value = False
       mock_client = MockClient()
       mock_websocket = Websocket(client=mock_client, endpoint_url="test", connection_id="test", ref="test")
-      event = {"socket": mock_websocket, "debug": True}
+      event = {"socket": mock_websocket, "debug": True, "body": '{"question": "Question?"}' }
       handler(event, MockContext())
       response = json.loads(mock_client.received_data)
       self.assertEqual(response["type"], "error")
+
+    @patch.object(ApiToken, 'is_logged_in')
+    def test_handler_question_missing(self, mock_is_logged_in):
+        """An event without a body gets the 'Question cannot be blank' error over the socket."""
+        mock_is_logged_in.return_value = True
+        client = MockClient()
+        socket = Websocket(client=client, endpoint_url="test", connection_id="test", ref="test")
+        handler({"socket": socket}, MockContext())
+        sent = json.loads(client.received_data)
+        self.assertEqual(sent["type"], "error")
+        self.assertEqual(sent["message"], "Question cannot be blank")
+
+    @patch.object(ApiToken, 'is_logged_in')
+    def test_handler_question_blank(self, mock_is_logged_in):
+        """Send an explicitly empty question ("question": "") and expect the blank-question error."""
+        mock_is_logged_in.return_value = True
+        mock_client = MockClient()
+        mock_websocket = Websocket(client=mock_client, endpoint_url="test", connection_id="test", ref="test")
+        handler({"socket": mock_websocket, "body": '{"question": ""}'}, MockContext())
+        response = json.loads(mock_client.received_data)
+        self.assertEqual(response["type"], "error")
+        self.assertEqual(response["message"], "Question cannot be blank")
diff --git a/chat/test/helpers/test_hybrid_query.py b/chat/test/helpers/test_hybrid_query.py
@@ -0,0 +1,26 @@
+import sys
+from unittest import TestCase
+
+sys.path.append('./src')  # must run before the project import below so `helpers` can be resolved
+from helpers.hybrid_query import hybrid_query  # noqa: E402
+
+class TestFunction(TestCase):
+    """Checks the clauses that hybrid_query places under query.hybrid.queries."""
+    def test_hybrid_query(self):
+        title_clause = { "term": { "title": { "value": "The Title" } } }
+        clauses = hybrid_query("Question?", "MODEL_ID", k=10, subquery=title_clause)["query"]["hybrid"]["queries"]
+
+        # One (extractor, expected value) pair per clause, in the order the clauses are emitted.
+        expectations = [
+            (lambda q: q["query_string"]["query"], "Question?"),
+            (lambda q: q["neural"]["embedding"]["model_id"], "MODEL_ID"),
+            (lambda q: q["term"]["title"]["value"], "The Title"),
+        ]
+
+        self.assertEqual(len(clauses), 3)
+
+        for clause, (extract, expected) in zip(clauses, expectations):
+            must = clause["bool"]["must"]
+            self.assertEqual(extract(must[0]), expected)
+            self.assertIn({ "terms": { "visibility": ["Public", "Institution"] } }, must)
+            self.assertIn({ "term": { "published": True } }, must)