-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_pdf.py
96 lines (81 loc) · 3.52 KB
/
extract_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import re
import shutil
import time
import win32com.client
import extract_msg # You need to install this via pip (pip install extract-msg)
from win32com.client import Dispatch
# Define the folder where you want to save the extracted PDFs
save_folder = r"C:\Users\Public\Downloads"

# Create the folder if it doesn't exist. exist_ok=True makes this one call
# instead of a separate existence check followed by a create, so it cannot
# fail if the folder appears between the check and the creation.
os.makedirs(save_folder, exist_ok=True)
# Function to sanitize filenames
def sanitize_filename(filename):
    """Return *filename* with characters Windows rejects in file names removed.

    Removes the reserved punctuation ``\\ / * ? : " < > |`` and the ASCII
    control characters (0x00-0x1F), then strips trailing spaces and dots,
    which Windows does not preserve at the end of a file name.

    Args:
        filename: The candidate file name (a name, not a full path; any
            path separators it contains are removed).

    Returns:
        The cleaned name. It may be an empty string if every character of
        the input was removed.
    """
    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", filename)
    return cleaned.rstrip(" .")
# Function to save the .msg file
def save_msg_file(attachment, save_path):
    """Write an attachment to disk and report whether that worked.

    Args:
        attachment: Object exposing ``SaveAsFile(path)`` and ``FileName``.
        save_path: Destination path handed to ``SaveAsFile``.

    Returns:
        True if the attachment was saved, False if any exception was raised
        (the exception is printed, not propagated).
    """
    saved = False
    try:
        attachment.SaveAsFile(save_path)
        print(f"Saved MSG file: {save_path}")
        saved = True
    except Exception as err:
        print(f"Failed to save MSG file {attachment.FileName}: {err}")
    return saved
# Recursive function to extract PDF from nested .msg files
def extract_pdf_from_msg(msg):
    """Extract the first PDF found in a message, searching nested messages.

    Attachments are visited in order. An embedded message is searched
    recursively; if it yields no PDF the remaining attachments are still
    examined. The search stops at the first PDF that is written.

    Args:
        msg: Either a path to a .msg file (str), which is opened here and
            closed before returning, or an already-parsed message object
            such as the ``data`` of an embedded-message attachment.

    Returns:
        True if a PDF was written into ``save_folder``; False if none was
        found or any exception occurred (the exception is printed).
    """
    opened_here = False
    try:
        if isinstance(msg, str):
            print(f"Attempting to extract PDF from: {msg}")
            msg = extract_msg.Message(msg)
            opened_here = True

        if not msg.attachments:
            print(f"No attachments found in {msg}")
            return False

        for att in msg.attachments:
            print(f"Processing attachment: {att}")
            if att is None:
                print(f"Encountered a NoneType attachment in {msg}, skipping.")
                continue
            print(f"Attachment type: {type(att)}")

            # Handle EmbeddedMsgAttachment (nested .msg files)
            if isinstance(att, extract_msg.attachments.emb_msg_att.EmbeddedMsgAttachment):
                print("Processing nested MSG attachment.")
                # Only stop on success; a nested message without a PDF must
                # not hide PDFs carried by the attachments that follow it.
                if extract_pdf_from_msg(att.data):
                    return True
                continue

            if _save_pdf_attachment(att):
                return True

        return False
    except Exception as e:
        print(f"Failed to extract PDF from {msg}: {e}")
        return False
    finally:
        # Release the file handle only for messages opened in this call;
        # nested message objects belong to their parent.
        if opened_here:
            msg.close()


def _save_pdf_attachment(att):
    """Write *att* into ``save_folder`` if it is a PDF; return True if written."""
    name = att.longFilename
    # Case-insensitive so names such as "Invoice.PDF" are recognised.
    if not name or not name.lower().endswith(".pdf"):
        return False

    payload = att.data
    if not isinstance(payload, (bytes, bytearray)):
        print(f"Attachment {name} has no binary payload, skipping.")
        return False

    # Write the bytes straight to the destination rather than saving into
    # the current directory and moving the file afterwards: the result does
    # not depend on the working directory or on the name the library picks.
    pdf_save_path = os.path.join(save_folder, sanitize_filename(name))
    with open(pdf_save_path, "wb") as pdf_file:
        pdf_file.write(payload)
    print(f"Extracted PDF: {pdf_save_path}")
    return True
# Connect to Outlook
outlook = Dispatch("Outlook.Application").GetNamespace("MAPI")

# Get all the accounts
accounts = outlook.Folders

# Specify the name of the account you want to target
# NOTE(review): this value looks like a redacted placeholder; it is compared
# against each top-level folder's Name below, so set it to the real account.
account_name = "[email protected]"

# Find the inbox for the specified account
for account in accounts:
    if account.Name == account_name:
        inbox = account.Folders("Inbox")
        break
else:
    # for/else: this branch runs only when the loop finished without a match.
    print(f"Account '{account_name}' not found.")
    # Exit with a non-zero status so callers can detect the failure; unlike
    # exit(), SystemExit does not rely on the site module being loaded.
    raise SystemExit(1)

# Start the extraction process from unread emails in the inbox
for message in inbox.Items:
    if not message.UnRead:  # Process only unread emails
        continue
    print(f"Processing unread email: {message.Subject}")

    extracted_any = False
    for attachment in message.Attachments:
        # Case-insensitive so attachments named "*.MSG" are handled too.
        if not attachment.FileName.lower().endswith(".msg"):
            continue
        sanitized_filename = sanitize_filename(attachment.FileName)
        msg_save_path = os.path.join(save_folder, sanitized_filename)
        if save_msg_file(attachment, msg_save_path) and extract_pdf_from_msg(msg_save_path):
            extracted_any = True

    # Mark the email as read once, after all of its attachments were tried.
    if extracted_any:
        message.UnRead = False
        message.Save()  # Save changes to the email