-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsheethelper.py
130 lines (104 loc) · 4.78 KB
/
sheethelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import pandas as pd
import gspread
from google.oauth2 import service_account
class SheetHelper:
"""
A helper class to interact with Google Sheets using the gspread library.
"""
def __init__(self, sheet_url=None, sheet_id=0, secret_file_path=secret_path):
"""
Initializes the SheetHelper instance and authenticates with the Google Sheets API.
Parameters:
- sheet_url (str): The URL of the Google Sheet.
- sheet_id (int): The index of the worksheet to interact with (default is 0).
- secret_file_path (str): The file path to the Google service account credentials.
"""
self.sheet_instance = self.authenticate(sheet_url, sheet_id, secret_file_path)
def authenticate(self, sheet_url, sheet_id, secret_file_path):
"""
Authenticates with the Google Sheets API using service account credentials and returns a worksheet instance.
Parameters:
- sheet_url (str): The URL of the Google Sheet.
- sheet_id (int): The index of the worksheet to interact with.
- secret_file_path (str): The file path to the Google service account credentials.
Returns:
- gspread.models.Worksheet: The authenticated worksheet instance.
"""
credentials = service_account.Credentials.from_service_account_file(
secret_file_path
)
scope = [
"https://spreadsheets.google.com/feeds",
"https://www.googleapis.com/auth/drive",
]
creds = credentials.with_scopes(scope)
client = gspread.authorize(creds)
sheet = client.open_by_url(sheet_url)
return sheet.get_worksheet(sheet_id)
def append_row(self, row_list):
"""
Appends a new row to the worksheet.
Parameters:
- row_list (list): A list containing the data to be appended as a new row.
"""
self.sheet_instance.append_row(row_list)
return "Wrote to Gsheet."
def get_last_row_index(self):
"""
Retrieves the index of the last row with data in the worksheet.
Returns:
- int: The index of the last row with data.
"""
return len(self.sheet_instance.get_all_records())
def update_cell(self, row, col, value):
"""
Updates a specific cell in the worksheet with a new value.
Parameters:
- row (int): The row number of the cell to be updated.
- col (int): The column number of the cell to be updated.
- value: The new value to set in the cell.
"""
self.sheet_instance.update_cell(row, col, value)
def gsheet_to_df(self, num_rows=None) -> pd.DataFrame:
"""
Converts the Google Sheet data into a pandas DataFrame with an optional
parameter to limit the number of rows.
Parameters:
- num_rows (int, optional): The number of rows to return from the sheet.
If None, all rows are returned.
Returns:
- pd.DataFrame: A DataFrame containing the data from the Google Sheet.
"""
records = self.sheet_instance.get_all_records()
# Limit the number of rows if num_rows is specified
if num_rows is not None:
records = records[:num_rows]
return pd.DataFrame.from_dict(records)
def get_unloaded_emails(self) -> pd.DataFrame:
"""
Retrieves emails from the Google Sheet that have not been loaded into Chromedb.
Returns:
- pd.DataFrame: A DataFrame containing the unloaded emails.
"""
records = self.sheet_instance.get_all_records()
df = pd.DataFrame.from_dict(records)
# Assuming 'Status' is the column indicating if the email is loaded
unloaded_emails = df[df['Status'] != 'Loaded']
return unloaded_emails
def mark_emails_as_loaded(self, email_ids):
"""
Marks the specified emails as loaded in the Google Sheet by setting 'chroma_status' to 1.
Parameters:
- email_ids (list): A list of email identifiers to mark as loaded.
"""
# Fetch all records to find the rows and update them
records = self.sheet_instance.get_all_records()
# Determine the index of 'chroma_status' and 'Email' columns
header = records[0].keys()
chroma_status_col = list(header).index('chroma_status') + 1 # gspread is 1-indexed
email_col = list(header).index('Email') + 1 # assuming 'Email' is the column with the identifier
for i, record in enumerate(records):
if record['Email'] in email_ids:
# Update the 'chroma_status' column to 1 for this email
self.update_cell(i + 2, chroma_status_col, 1) # '+2' because records start from index 0 and Google Sheets rows start from 1 (excluding header)