def get_tables_from_page(self, page_id):
    """
    Fetch the HTML tables embedded in a Confluence page.

    The page is requested through ``self.get_page_by_id`` with
    ``expand="body.storage"``. Every ``<table>`` element found in the storage
    body is converted to a list of rows, each row being the list of its cell
    texts (``<th>`` cells first, then ``<td>`` cells).

    :param page_id: integer confluence page_id
    :return: one of
        - a JSON *string* with the keys ``page_id``,
          ``number_of_tables_in_page`` and ``tables_content`` (list of tables,
          each a list of rows, each a list of cell texts) when the page
          contains at least one table;
        - the dict ``{"No tables found for page: ": page_id}`` when the page
          has content but no table;
        - the dict ``{"Page content is empty": page_id}`` when the storage
          body is empty;
        - ``None`` when the lookup failed with anything other than an
          HTTP 404 (the failure is logged).
    :raises ApiError: when the request fails with HTTP 404.
    """
    try:
        page_content = self.get_page_by_id(page_id, expand="body.storage")["body"]["storage"]["value"]

        if not page_content:
            # A dict (not a set literal) so the result stays JSON-serializable
            # and has the same shape as the "no tables" answer below. The
            # message is kept as the key so membership tests still work.
            return {"Page content is empty": page_id}

        # Calling a BeautifulSoup tag is shorthand for find_all().
        tables_raw = [
            [[cell.text for cell in row("th") + row("td")] for row in table("tr")]
            for table in BeautifulSoup(page_content, features="lxml")("table")
        ]
        if not tables_raw:
            return {
                "No tables found for page: ": page_id,
            }

        return json.dumps(
            {
                "page_id": page_id,
                "number_of_tables_in_page": len(tables_raw),
                "tables_content": tables_raw,
            }
        )
    except HTTPError as e:
        if e.response.status_code == 404:
            # Raise ApiError as the documented reason is ambiguous
            log.error("Couldn't retrieve tables from page %s", page_id)
            raise ApiError(
                "There is no content with the given pageid, pageid params is not an integer "
                "or the calling user does not have permission to view the page",
                reason=e,
            )
        # Other HTTP failures remain best-effort (None is returned), but they
        # are no longer dropped without a trace.
        log.error("HTTP error while retrieving tables from page %s: %s", page_id, e)
    except Exception as e:
        # Best-effort: log and fall through to an implicit ``None``.
        # The placeholder is required, otherwise logging cannot format the record.
        log.error("Error occured: %s", e)
diff --git a/docs/confluence.rst b/docs/confluence.rst index 6e4e48520..8580130ea 100644 --- a/docs/confluence.rst +++ b/docs/confluence.rst @@ -152,6 +152,9 @@ Page actions # Add comment into page confluence.add_comment(page_id, text) + # Fetch tables from Confluence page + confluence.get_tables_from_page(page_id) + Template actions ---------------- diff --git a/examples/confluence/confluence_get_tables_from_page.py b/examples/confluence/confluence_get_tables_from_page.py new file mode 100644 index 000000000..d09464487 --- /dev/null +++ b/examples/confluence/confluence_get_tables_from_page.py @@ -0,0 +1,17 @@ +from atlassian import Confluence +import logging + +confluence = Confluence( + url="", + username="", + password="api_key", +) +page_id = 393464 +logging.basicConfig(level=logging.INFO) +# Page_id is the page id of the page you want to get the tables from. + +result = confluence.get_tables_from_page(page_id) +print(result) +# Let's say the page has two tables, each with 3 columns and 2 rows. +# The method should return the following output: {"page_id": 393464, "number_of_tables_in_page": 2, "tables_content": [[["header1", "header2", "header3"], ["h1r1", "h2r1", "h3r1"], ["h1r2", "h2r2", "h3r2"]], [["table2 header1", "table2 header2", "table2 header3"], ["h1r1", "h2r1", "h3r1"], ["h1r2", "h2r2", "h3r2"]]]} +# tables_content is a list of lists of lists. Each nested list represents a table. Each nested list inside a table represents a row. diff --git a/requirements.txt b/requirements.txt index 27d6a56a0..f155b7940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ six oauthlib requests_oauthlib requests-kerberos==0.14.0 +bs4 +lxml \ No newline at end of file diff --git a/tox.ini b/tox.ini index 00a00c6a4..1638b7389 100644 --- a/tox.ini +++ b/tox.ini @@ -11,6 +11,7 @@ deps = pytest-cov coverage requests + bs4 commands = coverage erase pytest -v --cov=atlassian --cov-branch --cov-report=xml