From 6d0c2c2ecabc09c29febc2a347c0ce1ba1931414 Mon Sep 17 00:00:00 2001
From: gkowalc
Date: Tue, 12 Dec 2023 15:36:19 +0100
Subject: [PATCH 1/2] [Confluence] new method added confluence_get_tables_from_page +requirements.txt

---
 atlassian/confluence.py | 59 ++++++++++++++++++++++++++++++++++++++++-
 docs/confluence.rst     |  2 ++
 requirements.txt        |  2 ++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/atlassian/confluence.py b/atlassian/confluence.py
index b0a2757ee..61e12aaca 100644
--- a/atlassian/confluence.py
+++ b/atlassian/confluence.py
@@ -3,7 +3,7 @@
 import os
 import time
 import json
-
+from bs4 import BeautifulSoup
 from requests import HTTPError
 import requests
 from deprecated import deprecated
@@ -356,6 +356,63 @@ def get_page_by_id(self, page_id, expand=None, status=None, version=None):
 
         return response
 
+    def get_tables_from_page(self, page_id):
+        """
+        Scrape the tables embedded in a Confluence page.
+
+        The page body is fetched in storage format and parsed with
+        BeautifulSoup (lxml parser). Every <table> becomes a list of rows and
+        every row a list of cell texts (all <th> cells first, then all <td>).
+
+        :param page_id: integer confluence page_id
+        :return: JSON string with page_id, number_of_tables_in_page and
+                 tables_content (list of tables -> rows -> cell texts) when the
+                 page contains tables; a dict keyed "No tables found for page: "
+                 when it has none; a one-element set when the page body is
+                 empty; None when an unexpected non-HTTP error was logged
+        :raises ApiError: when the page request fails with HTTP 404
+        :raises HTTPError: when the page request fails with any other status
+        """
+        try:
+            page_content = self.get_page_by_id(page_id, expand="body.storage")["body"]["storage"]["value"]
+
+            if not page_content:
+                # NOTE(review): this literal is a set, not a dict; kept as-is for compatibility
+                return {"Page content is empty"}
+
+            tables_raw = [
+                [[cell.text for cell in row("th") + row("td")] for row in table("tr")]
+                for table in BeautifulSoup(page_content, features="lxml")("table")
+            ]
+            if not tables_raw:
+                return {
+                    "No tables found for page: ": page_id,
+                }
+
+            # logging interpolates lazily, so the message needs %s placeholders
+            log.info("Found: %s for pageid: %s", len(tables_raw), page_id)
+            return json.dumps(
+                {
+                    "page_id": page_id,
+                    "number_of_tables_in_page": len(tables_raw),
+                    "tables_content": tables_raw,
+                }
+            )
+        except HTTPError as e:
+            if e.response.status_code == 404:
+                # Raise ApiError as the documented reason is ambiguous
+                log.error("Couldn't retrieve tables from page %s", page_id)
+                raise ApiError(
+                    "There is no content with the given pageid, pageid params is not an integer "
+                    "or the calling user does not have permission to view the page",
+                    reason=e,
+                )
+            # Any other HTTP failure must not be silently turned into None
+            raise
+        except Exception as e:
+            # Best effort: log and fall through (returns None)
+            log.error("error occurred: %s", e)
+
     def get_page_labels(self, page_id, prefix=None, start=None, limit=None):
         """
         Returns the list of labels on a piece of Content.
diff --git a/docs/confluence.rst b/docs/confluence.rst
index 6e4e48520..ca65bfc08 100644
--- a/docs/confluence.rst
+++ b/docs/confluence.rst
@@ -152,6 +152,8 @@ Page actions
 
     # Add comment into page
     confluence.add_comment(page_id, text)
+    # Fetch tables from Confluence page
+    confluence.get_tables_from_page(page_id)
 
 Template actions
 ----------------
diff --git a/requirements.txt b/requirements.txt
index 27d6a56a0..f480a0a47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ six
 oauthlib
 requests_oauthlib
 requests-kerberos==0.14.0
+lxml
+beautifulsoup4

From 3f92e760f176c4fe2c4e31979cfb51274f3da037 Mon Sep 17 00:00:00 2001
From: gkowalc
Date: Tue, 12 Dec 2023 15:38:56 +0100
Subject: [PATCH 2/2] [Confluence] new method added confluence_get_tables_from_page +requirements.txt

---
 atlassian/confluence.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/atlassian/confluence.py b/atlassian/confluence.py
index 61e12aaca..5f825bcec 100644
--- a/atlassian/confluence.py
+++ b/atlassian/confluence.py
@@ -4,6 +4,7 @@
 import time
 import json
 from bs4 import BeautifulSoup
+import lxml
 from requests import HTTPError
 import requests
 from deprecated import deprecated