From 6d0c2c2ecabc09c29febc2a347c0ce1ba1931414 Mon Sep 17 00:00:00 2001
From: gkowalc <gkowalc>
Date: Tue, 12 Dec 2023 15:36:19 +0100
Subject: [PATCH 1/2] [Confluence] add get_tables_from_page method
 and bs4/lxml requirements

---
 atlassian/confluence.py | 42 ++++++++++++++++++++++++++++++++++++++++-
 docs/confluence.rst     |  2 ++
 requirements.txt        |  2 ++
 3 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/atlassian/confluence.py b/atlassian/confluence.py
index b0a2757ee..61e12aaca 100644
--- a/atlassian/confluence.py
+++ b/atlassian/confluence.py
@@ -3,7 +3,7 @@
 import os
 import time
 import json
-
+from bs4  import BeautifulSoup
 from requests import HTTPError
 import requests
 from deprecated import deprecated
@@ -356,6 +356,46 @@ def get_page_by_id(self, page_id, expand=None, status=None, version=None):
 
         return response
 
+    def get_tables_from_page(self, page_id):
+        """
+        Scrape the tables found in the storage-format body of a Confluence page.
+        :param page_id: integer confluence page_id
+        :return: JSON string with page_id, number_of_tables_in_page and tables_content
+                 (list of tables, each a list of rows, each a list of cell texts);
+                 a dict when the page has no tables, a set when its body is empty
+        """
+        try:
+            page_content = self.get_page_by_id(page_id, expand="body.storage")["body"]["storage"]["value"]
+            if not page_content:
+                return {"Page content is empty"}
+            tables_raw = [
+                [[cell.text for cell in row("th") + row("td")] for row in table("tr")]
+                for table in BeautifulSoup(page_content, features="lxml")("table")
+            ]
+            if not tables_raw:
+                return {"No tables found for page: ": page_id}
+            # Lazy %-style arguments: the logging module formats only when emitted
+            log.info("Found: %s for pageid: %s", len(tables_raw), page_id)
+            return json.dumps(
+                {
+                    "page_id": page_id,
+                    "number_of_tables_in_page": len(tables_raw),
+                    "tables_content": tables_raw,
+                }
+            )
+        except HTTPError as e:
+            if e.response.status_code == 404:
+                # Raise ApiError as the documented reason is ambiguous
+                log.error("Couldn't retrieve tables  from page %s", page_id)
+                raise ApiError(
+                    "There is no content with the given pageid, pageid params is not an integer "
+                    "or the calling user does not have permission to view the page",
+                    reason=e,
+                )
+            raise  # any other HTTP failure must not be silently turned into None
+        except Exception as e:
+            log.error("error occured: %s", e)
+
     def get_page_labels(self, page_id, prefix=None, start=None, limit=None):
         """
         Returns the list of labels on a piece of Content.
diff --git a/docs/confluence.rst b/docs/confluence.rst
index 6e4e48520..ca65bfc08 100644
--- a/docs/confluence.rst
+++ b/docs/confluence.rst
@@ -152,6 +152,8 @@ Page actions
     # Add comment into page
     confluence.add_comment(page_id, text)
 
+    # Fetch tables from Confluence page
+    confluence.get_tables_from_page(page_id)
 Template actions
 ----------------
 
diff --git a/requirements.txt b/requirements.txt
index 27d6a56a0..f480a0a47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ six
 oauthlib
 requests_oauthlib
 requests-kerberos==0.14.0
+lxml
+beautifulsoup4

From 3f92e760f176c4fe2c4e31979cfb51274f3da037 Mon Sep 17 00:00:00 2001
From: gkowalc <gkowalc>
Date: Tue, 12 Dec 2023 15:38:56 +0100
Subject: [PATCH 2/2] [Confluence] import lxml in confluence.py
 for get_tables_from_page

---
 atlassian/confluence.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/atlassian/confluence.py b/atlassian/confluence.py
index 61e12aaca..5f825bcec 100644
--- a/atlassian/confluence.py
+++ b/atlassian/confluence.py
@@ -4,6 +4,7 @@
 import time
 import json
 from bs4  import BeautifulSoup
+import lxml
 from requests import HTTPError
 import requests
 from deprecated import deprecated