Add function to read PDF data

ruchernchong · May 25, 2024 · 9e311cc · 9e311cc
1 parent 2a4ed69
commit 9e311cc
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 21 deletions.
diff --git a/main.py b/main.py
@@ -1,43 +1,48 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
+import read_pdf
 from utils.calculate_growth_rate import calculate_growth_rate
 from utils.replace_text import replace_text
 
-# Data
-quarters = ["Q1 FY24", "Q2 FY24", "Q3 FY24", "Q4 FY24", "Q1 FY25"]
-data_center = [4284, 10323, 14514, 18404, 22563]
-gaming = [2240, 2486, 2856, 2865, 2647]
-professional_visualization = [295, 379, 416, 463, 427]
-auto = [296, 253, 261, 281, 329]
-oem_other = [77, 66, 73, 90, 78]
-total = [7192, 13507, 18120, 22103, 26044]
+# Step 1: Extract data from the PDF
+data = read_pdf.extract_data_from_pdf('Rev_by_Mkt_Qtrly_Trend_Q125.pdf')
 
-# Calculate growth rates as percentages with + or -
-growth_rates = [calculate_growth_rate(total[i], total[i - 1]) for i in range(len(total))]
+# Step 2: Assign data to variables
+quarters = data['quarters']
+data_center = data['data_center']
+gaming = data['gaming']
+professional_visualization = data['professional_visualization']
+auto = data['auto']
+oem_other = data['oem_other']
+total = data['total']
 
-# Print growth rates
+# Step 3: Calculate growth rates as percentages with + or -
+growth_rates = [calculate_growth_rate(total[i], total[i - 1]) if i != 0 else 0 for i in range(len(total))]
+
+# Step 4: Print growth rates
 for quarter, rate in zip(quarters[1:], growth_rates[1:]):
-    print(f"{quarter}: {rate}")
+    print(f"{quarter}: {rate}%")
 
-# Plotting
+# Step 5: Plotting
 x = np.arange(len(quarters))  # the label locations
-width = 0.1  # the width of the bars
+width = 0.15  # the width of the bars
 bar_positions = [x - 2 * width, x - width, x, x + width, x + 2 * width, x + 3 * width]
-bar_labels = ['Data Center', 'Gaming', 'Professional Visualization', 'Auto', 'OEM & Other', 'Total']
+bar_labels = ['data_center', 'gaming', 'professional_visualization', 'auto', 'oem_other', 'total']
+bar_data = [data_center, gaming, professional_visualization, auto, oem_other, total]
 
 fig, ax = plt.subplots(figsize=(14, 8))
 
 rects = []
-for pos, label in zip(bar_positions, bar_labels):
-    rect = ax.bar(pos, eval(replace_text(label)), width, label=label)
+for pos, label, data in zip(bar_positions, bar_labels, bar_data):
+    rect = ax.bar(pos, data, width, label=replace_text(label))
     rects.append(rect)
 
-# Add growth rate annotations
+# Step 6: Add growth rate annotations
 for i, rate in enumerate(growth_rates):
-    ax.annotate(f'{rate}', (x[i], total[i]), textcoords="offset points", xytext=(0, 0), ha='center')
+    ax.annotate(f'{rate}%', (x[i], total[i]), textcoords="offset points", xytext=(0, 5), ha='center')
 
-# Add some text for labels, title and custom x-axis tick labels, etc.
+# Step 7: Add some text for labels, title, and custom x-axis tick labels, etc.
 ax.set_xlabel('Quarter')
 ax.set_ylabel('Revenue ($ in millions)')
 ax.set_title('NVIDIA Quarterly Revenue Trend by Market')
@@ -48,7 +53,7 @@
 # Rotate the tick labels for better readability
 plt.xticks(rotation=45)
 
+# Step 8: Adjust layout and save the figure
 fig.tight_layout()
-
 plt.savefig('nvidia-revenue-trend.png')
 plt.show()
diff --git a/nvidia-revenue-trend.png b/nvidia-revenue-trend.png
diff --git a/read_pdf.py b/read_pdf.py
@@ -0,0 +1,36 @@
+import pdfplumber
+
+from utils.replace_text import replace_text
+
+
+def extract_data_from_pdf(pdf_path):
+    data = {}
+
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            # Assuming the relevant data is on the first page
+            page = pdf.pages[0]
+            table = page.extract_table()
+
+            if table:
+                # Extract quarters from the first row, skipping the first column header
+                quarters = table[0][1:]
+                data['quarters'] = quarters[::-1]
+
+                # Process the rest of the rows, skipping the first row (headers)
+                for row in table[1:]:
+                    if row:  # Ensure the row is not empty
+                        key = replace_text(row[0].lower())
+                        values = [int(item.replace('$', '').replace(',', '')) for item in row[1:] if item]
+                        data[key] = values[::-1]
+            else:
+                raise ValueError("No table found on the first page.")
+
+    except FileNotFoundError:
+        print(f"Error: The file '{pdf_path}' was not found.")
+    except ValueError as ve:
+        print(f"Error processing the PDF: {ve}")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+
+    return data
diff --git a/requirements.txt b/requirements.txt
@@ -1,14 +1,21 @@
+cffi==1.16.0
+charset-normalizer==3.3.2
 contourpy==1.2.1
+cryptography==42.0.7
 cycler==0.12.1
 fonttools==4.52.1
 iniconfig==2.0.0
 kiwisolver==1.4.5
 matplotlib==3.9.0
 numpy==1.26.4
 packaging==24.0
+pdfminer.six==20231228
+pdfplumber==0.11.0
 pillow==10.3.0
 pluggy==1.5.0
+pycparser==2.22
 pyparsing==3.1.2
+pypdfium2==4.30.0
 pytest==8.2.1
 python-dateutil==2.9.0.post0
 six==1.16.0