From 33f72f86e23a4072ae221b006f12dcb7605a5b96 Mon Sep 17 00:00:00 2001 From: JobSmithManipulation <143315462+JobSmithManipulation@users.noreply.github.com> Date: Wed, 18 Sep 2024 18:46:37 +0800 Subject: [PATCH] rename some attributes in document sdk (#2481) ### What problem does this PR solve? #1102 ### Type of change - [x] Performance Improvement --------- Co-authored-by: Kevin Hu --- api/apps/sdk/doc.py | 81 ++++++++++++++------------ sdk/python/ragflow/modules/chunk.py | 10 ++-- sdk/python/ragflow/modules/dataset.py | 2 +- sdk/python/ragflow/modules/document.py | 18 +++--- sdk/python/ragflow/ragflow.py | 11 ++-- sdk/python/test/t_document.py | 6 +- 6 files changed, 66 insertions(+), 62 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 812a6219e4b..77a63ae79ef 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -99,6 +99,7 @@ def docinfos(tenant_id): "chunk_num": "chunk_count", "kb_id": "knowledgebase_id", "token_num": "token_count", + "parser_id":"parser_method", } renamed_doc = {} for key, value in doc.to_dict().items(): @@ -125,10 +126,14 @@ def save_doc(tenant_id): if not e: return get_data_error_result(retmsg="Document not found!") #other value can't be changed - if "chunk_num" in req: - if req["chunk_num"] != doc.chunk_num: + if "chunk_count" in req: + if req["chunk_count"] != doc.chunk_num: return get_data_error_result( retmsg="Can't change chunk_count.") + if "token_count" in req: + if req["token_count"] != doc.token_num: + return get_data_error_result( + retmsg="Can't change token_count.") if "progress" in req: if req['progress'] != doc.progress: return get_data_error_result( @@ -158,9 +163,9 @@ def save_doc(tenant_id): FileService.update_by_id(file.id, {"name": req["name"]}) except Exception as e: return server_error_response(e) - if "parser_id" in req: + if "parser_method" in req: try: - if doc.parser_id.lower() == req["parser_id"].lower(): + if doc.parser_id.lower() == req["parser_method"].lower(): if "parser_config" in req: if req["parser_config"] == doc.parser_config: return get_json_result(data=True) @@ -172,7 +177,7 @@ def save_doc(tenant_id): return get_data_error_result(retmsg="Not supported yet!") e = DocumentService.update_by_id(doc.id, - {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", + {"parser_id": req["parser_method"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value}) if not e: return get_data_error_result(retmsg="Document not found!") @@ -183,7 +188,7 @@ def save_doc(tenant_id): doc.process_duation * -1) if not e: return get_data_error_result(retmsg="Document not found!") - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) + tenant_id = DocumentService.get_tenant_id(req["id"]) if not tenant_id: return get_data_error_result(retmsg="Tenant not found!") ELASTICSEARCH.deleteByQuery( @@ -272,7 +277,7 @@ def rename(): @manager.route("/", methods=["GET"]) @token_required -def download_document(dataset_id, document_id,tenant_id): +def download_document(document_id,tenant_id): try: # Check whether there is this document exist, document = DocumentService.get_by_id(document_id) @@ -304,7 +309,7 @@ def download_document(dataset_id, document_id,tenant_id): @manager.route('/dataset//documents', methods=['GET']) @token_required def list_docs(dataset_id, tenant_id): - kb_id = request.args.get("kb_id") + kb_id = request.args.get("knowledgebase_id") if not kb_id: return get_json_result( data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR) @@ -334,6 +339,7 @@ def list_docs(dataset_id, tenant_id): "chunk_num": "chunk_count", "kb_id": "knowledgebase_id", "token_num": "token_count", + "parser_id":"parser_method" } renamed_doc = {} for key, value in doc.items(): @@ -349,10 +355,10 @@ def list_docs(dataset_id, tenant_id): @token_required def rm(tenant_id): req = request.args - if "doc_id" not in req: + if "document_id" not in req: return get_data_error_result( retmsg="doc_id is required") - doc_ids = req["doc_id"] + doc_ids = req["document_id"] if isinstance(doc_ids, str): doc_ids = [doc_ids] root_folder = FileService.get_root_folder(tenant_id) pf_id = root_folder["id"] @@ -413,7 +419,7 @@ def show_parsing_status(tenant_id, document_id): def run(tenant_id): req = request.json try: - for id in req["doc_ids"]: + for id in req["document_ids"]: info = {"run": str(req["run"]), "progress": 0} if str(req["run"]) == TaskStatus.RUNNING.value: info["progress_msg"] = "" @@ -442,15 +448,15 @@ def run(tenant_id): @manager.route('/chunk/list', methods=['POST']) @token_required -@validate_request("doc_id") +@validate_request("document_id") def list_chunk(tenant_id): req = request.json - doc_id = req["doc_id"] + doc_id = req["document_id"] page = int(req.get("page", 1)) size = int(req.get("size", 30)) question = req.get("keywords", "") try: - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) + tenant_id = DocumentService.get_tenant_id(req["document_id"]) if not tenant_id: return get_data_error_result(retmsg="Tenant not found!") e, doc = DocumentService.get_by_id(doc_id) @@ -509,15 +515,15 @@ def list_chunk(tenant_id): @manager.route('/chunk/create', methods=['POST']) @token_required -@validate_request("doc_id", "content_with_weight") +@validate_request("document_id", "content") def create(tenant_id): req = request.json md5 = hashlib.md5() - md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) + md5.update((req["content"] + req["document_id"]).encode("utf-8")) chunk_id = md5.hexdigest() - d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), - "content_with_weight": req["content_with_weight"]} + d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), + "content_with_weight": req["content"]} d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["important_kwd"] = req.get("important_kwd", []) d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) @@ -525,22 +531,22 @@ def create(tenant_id): d["create_timestamp_flt"] = datetime.datetime.now().timestamp() try: - e, doc = DocumentService.get_by_id(req["doc_id"]) + e, doc = DocumentService.get_by_id(req["document_id"]) if not e: return get_data_error_result(retmsg="Document not found!") d["kb_id"] = [doc.kb_id] d["docnm_kwd"] = doc.name d["doc_id"] = doc.id - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) + tenant_id = DocumentService.get_tenant_id(req["document_id"]) if not tenant_id: return get_data_error_result(retmsg="Tenant not found!") - embd_id = DocumentService.get_embd_id(req["doc_id"]) + embd_id = DocumentService.get_embd_id(req["document_id"]) embd_mdl = TenantLLMService.model_instance( tenant_id, LLMType.EMBEDDING.value, embd_id) - v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) + v, c = embd_mdl.encode([doc.name, req["content"]]) v = 0.1 * v[0] + 0.9 * v[1] d["q_%d_vec" % len(v)] = v.tolist() ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) @@ -568,14 +574,14 @@ def create(tenant_id): @manager.route('/chunk/rm', methods=['POST']) @token_required -@validate_request("chunk_ids", "doc_id") +@validate_request("chunk_ids", "document_id") def rm_chunk(tenant_id): req = request.json try: if not ELASTICSEARCH.deleteByQuery( Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): return get_data_error_result(retmsg="Index updating failure") - e, doc = DocumentService.get_by_id(req["doc_id"]) + e, doc = DocumentService.get_by_id(req["document_id"]) if not e: return get_data_error_result(retmsg="Document not found!") deleted_chunk_ids = req["chunk_ids"] @@ -587,30 +593,30 @@ def rm_chunk(tenant_id): @manager.route('/chunk/set', methods=['POST']) @token_required -@validate_request("doc_id", "chunk_id", "content_with_weight", - "important_kwd") +@validate_request("document_id", "chunk_id", "content", + "important_keywords") def set(tenant_id): req = request.json d = { "id": req["chunk_id"], - "content_with_weight": req["content_with_weight"]} - d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) + "content_with_weight": req["content"]} + d["content_ltks"] = rag_tokenizer.tokenize(req["content"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["important_kwd"] = req["important_kwd"] - d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) + d["important_kwd"] = req["important_keywords"] + d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) if "available_int" in req: d["available_int"] = req["available_int"] try: - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) + tenant_id = DocumentService.get_tenant_id(req["document_id"]) if not tenant_id: return get_data_error_result(retmsg="Tenant not found!") - embd_id = DocumentService.get_embd_id(req["doc_id"]) + embd_id = DocumentService.get_embd_id(req["document_id"]) embd_mdl = TenantLLMService.model_instance( tenant_id, LLMType.EMBEDDING.value, embd_id) - e, doc = DocumentService.get_by_id(req["doc_id"]) + e, doc = DocumentService.get_by_id(req["document_id"]) if not e: return get_data_error_result(retmsg="Document not found!") @@ -618,7 +624,7 @@ def set(tenant_id): arr = [ t for t in re.split( r"[\n\t]", - req["content_with_weight"]) if len(t) > 1] + req["content"]) if len(t) > 1] if len(arr) != 2: return get_data_error_result( retmsg="Q&A must be separated by TAB/ENTER key.") @@ -626,7 +632,7 @@ def set(tenant_id): d = beAdoc(d, arr[0], arr[1], not any( [rag_tokenizer.is_chinese(t) for t in q + a])) - v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) + v, c = embd_mdl.encode([doc.name, req["content"]]) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] d["q_%d_vec" % len(v)] = v.tolist() ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) @@ -636,13 +642,13 @@ def set(tenant_id): @manager.route('/retrieval_test', methods=['POST']) @token_required -@validate_request("kb_id", "question") +@validate_request("knowledgebase_id", "question") def retrieval_test(tenant_id): req = request.json page = int(req.get("page", 1)) size = int(req.get("size", 30)) question = req["question"] - kb_id = req["kb_id"] + kb_id = req["knowledgebase_id"] if isinstance(kb_id, str): kb_id = [kb_id] doc_ids = req.get("doc_ids", []) similarity_threshold = float(req.get("similarity_threshold", 0.2)) @@ -693,6 +699,7 @@ def retrieval_test(tenant_id): "content_with_weight": "content", "doc_id": "document_id", "important_kwd": "important_keywords", + "docnm_kwd":"document_keyword" } rename_chunk={} for key, value in chunk.items(): diff --git a/sdk/python/ragflow/modules/chunk.py b/sdk/python/ragflow/modules/chunk.py index f20e967c6b1..ee135404c73 100644 --- a/sdk/python/ragflow/modules/chunk.py +++ b/sdk/python/ragflow/modules/chunk.py @@ -22,7 +22,7 @@ def delete(self) -> bool: Delete the chunk in the document. """ res = self.post('/doc/chunk/rm', - {"doc_id": self.document_id, 'chunk_ids': [self.id]}) + {"document_id": self.document_id, 'chunk_ids': [self.id]}) res = res.json() if res.get("retmsg") == "success": return True @@ -34,13 +34,13 @@ def save(self) -> bool: """ res = self.post('/doc/chunk/set', {"chunk_id": self.id, - "kb_id": self.knowledgebase_id, + "knowledgebase_id": self.knowledgebase_id, "name": self.document_name, - "content_with_weight": self.content, - "important_kwd": self.important_keywords, + "content": self.content, + "important_keywords": self.important_keywords, "create_time": self.create_time, "create_timestamp_flt": self.create_timestamp_float, - "doc_id": self.document_id, + "document_id": self.document_id, "status": self.status, }) res = res.json() diff --git a/sdk/python/ragflow/modules/dataset.py b/sdk/python/ragflow/modules/dataset.py index a0ef8ffed49..4efd3b9598c 100644 --- a/sdk/python/ragflow/modules/dataset.py +++ b/sdk/python/ragflow/modules/dataset.py @@ -65,7 +65,7 @@ def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int """ # Construct the request payload for listing documents payload = { - "kb_id": self.id, + "knowledgebase_id": self.id, "keywords": keywords, "offset": offset, "limit": limit diff --git a/sdk/python/ragflow/modules/document.py b/sdk/python/ragflow/modules/document.py index 0e3b352395a..ff822b70ff7 100644 --- a/sdk/python/ragflow/modules/document.py +++ b/sdk/python/ragflow/modules/document.py @@ -34,10 +34,10 @@ def save(self) -> bool: Save the document details to the server. """ res = self.post('/doc/save', - {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.knowledgebase_id, - "parser_id": self.parser_method, "parser_config": self.parser_config.to_json(), + {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "knowledgebase_id": self.knowledgebase_id, + "parser_method": self.parser_method, "parser_config": self.parser_config.to_json(), "source_type": self.source_type, "type": self.type, "created_by": self.created_by, - "size": self.size, "token_num": self.token_count, "chunk_num": self.chunk_count, + "size": self.size, "token_count": self.token_count, "chunk_count": self.chunk_count, "progress": self.progress, "progress_msg": self.progress_msg, "process_begin_at": self.process_begin_at, "process_duation": self.process_duration }) @@ -51,7 +51,7 @@ def delete(self) -> bool: Delete the document from the server. """ res = self.rm('/doc/delete', - {"doc_id": self.id}) + {"document_id": self.id}) res = res.json() if res.get("retmsg") == "success": return True @@ -83,7 +83,7 @@ def async_parse(self): """ try: # Construct request data including document ID and run status (assuming 1 means to run) - data = {"doc_ids": [self.id], "run": 1} + data = {"document_ids": [self.id], "run": 1} # Send a POST request to the specified parsing status endpoint to start parsing res = self.post(f'/doc/run', data) @@ -112,7 +112,7 @@ def join(self, interval=5, timeout=3600): start_time = time.time() while time.time() - start_time < timeout: # Check the parsing status - res = self.get(f'/doc/{self.id}/status', {"doc_ids": [self.id]}) + res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]}) res_data = res.json() data = res_data.get("data", []) @@ -133,7 +133,7 @@ def cancel(self): """ try: # Construct request data, including document ID and action to cancel (assuming 2 means cancel) - data = {"doc_ids": [self.id], "run": 2} + data = {"document_ids": [self.id], "run": 2} # Send a POST request to the specified parsing status endpoint to cancel parsing res = self.post(f'/doc/run', data) @@ -162,7 +162,7 @@ def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available list: A list of chunks returned from the API. """ data = { - "doc_id": self.id, + "document_id": self.id, "page": page, "size": size, "keywords": keywords, @@ -188,7 +188,7 @@ def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available raise Exception(f"API request failed with status code {res.status_code}") def add_chunk(self, content: str): - res = self.post('/doc/chunk/create', {"doc_id": self.id, "content_with_weight":content}) + res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content}) if res.status_code == 200: res_data = res.json().get("data") chunk_data = res_data.get("chunk") diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index d4fc6de3643..f30433d9614 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -150,14 +150,11 @@ def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool: files = { 'file': (name, blob) } - data = { - 'kb_id': ds.id - } headers = { 'Authorization': f"Bearer {ds.rag.user_key}" } - response = requests.post(self.api_url + url, data=data, files=files, + response = requests.post(self.api_url + url, files=files, headers=headers) if response.status_code == 200 and response.json().get('retmsg') == 'success': @@ -184,7 +181,7 @@ def async_parse_documents(self, doc_ids): if not doc_ids or not isinstance(doc_ids, list): raise ValueError("doc_ids must be a non-empty list of document IDs") - data = {"doc_ids": doc_ids, "run": 1} + data = {"document_ids": doc_ids, "run": 1} res = self.post(f'/doc/run', data) @@ -206,7 +203,7 @@ def async_cancel_parse_documents(self, doc_ids): try: if not doc_ids or not isinstance(doc_ids, list): raise ValueError("doc_ids must be a non-empty list of document IDs") - data = {"doc_ids": doc_ids, "run": 2} + data = {"document_ids": doc_ids, "run": 2} res = self.post(f'/doc/run', data) if res.status_code != 200: @@ -252,7 +249,7 @@ def retrieval(self, "similarity_threshold": similarity_threshold, "vector_similarity_weight": vector_similarity_weight, "top_k": top_k, - "kb_id": datasets, + "knowledgebase_id": datasets, } # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) diff --git a/sdk/python/test/t_document.py b/sdk/python/test/t_document.py index 2e375fc52cf..eed572f341d 100644 --- a/sdk/python/test/t_document.py +++ b/sdk/python/test/t_document.py @@ -255,14 +255,14 @@ def test_parse_document_and_chunk_list(self): def test_add_chunk_to_chunk_list(self): rag = RAGFlow(API_KEY, HOST_ADDRESS) doc = rag.get_document(name='story.txt') - chunk = doc.add_chunk(content="assss") + chunk = doc.add_chunk(content="assssdd") assert chunk is not None, "Chunk is None" assert isinstance(chunk, Chunk), "Chunk was not added to chunk list" def test_delete_chunk_of_chunk_list(self): rag = RAGFlow(API_KEY, HOST_ADDRESS) doc = rag.get_document(name='story.txt') - chunk = doc.add_chunk(content="assss") + chunk = doc.add_chunk(content="assssdd") assert chunk is not None, "Chunk is None" assert isinstance(chunk, Chunk), "Chunk was not added to chunk list" doc = rag.get_document(name='story.txt') @@ -274,7 +274,7 @@ def test_delete_chunk_of_chunk_list(self): def test_update_chunk_content(self): rag = RAGFlow(API_KEY, HOST_ADDRESS) doc = rag.get_document(name='story.txt') - chunk = doc.add_chunk(content="assssd") + chunk = doc.add_chunk(content="assssddd") assert chunk is not None, "Chunk is None" assert isinstance(chunk, Chunk), "Chunk was not added to chunk list" chunk.content = "ragflow123"