Skip to content

Commit

Permalink
Better logging for Google Drive follow shortcuts (#367)
Browse files: browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Aug 31, 2023
1 parent c1727e6 commit 8bf82ac
Showing 1 changed file with 27 additions and 8 deletions.
35 changes: 27 additions & 8 deletions backend/danswer/connectors/google_drive/connector.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,6 +10,7 @@
import docx2txt # type:ignore
from google.auth.credentials import Credentials # type: ignore
from googleapiclient import discovery # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from PyPDF2 import PdfReader

from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
Expand Down Expand Up @@ -41,7 +42,7 @@

logger = setup_logger()

# allow 10 minutes for modifiedTime to get propogated
# allow 10 minutes for modifiedTime to get propagated
DRIVE_START_TIME_OFFSET = 60 * 10
SUPPORTED_DRIVE_DOC_TYPES = [
"application/vnd.google-apps.document",
Expand All @@ -58,6 +59,7 @@
def _run_drive_file_query(
service: discovery.Resource,
query: str,
continue_on_failure: bool,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
batch_size: int = INDEX_BATCH_SIZE,
Expand All @@ -84,12 +86,20 @@ def _run_drive_file_query(
files = results["files"]
for file in files:
if follow_shortcuts and "shortcutDetails" in file:
file = service.files().get(
fileId=file["shortcutDetails"]["targetId"],
supportsAllDrives=include_shared,
fields="mimeType, id, name, webViewLink, shortcutDetails",
)
file = file.execute()
try:
file = service.files().get(
fileId=file["shortcutDetails"]["targetId"],
supportsAllDrives=include_shared,
fields="mimeType, id, name, webViewLink, shortcutDetails",
)
file = file.execute()
except HttpError:
logger.error(
f"Failed to follow shortcut with details: {file['shortcutDetails']}"
)
if continue_on_failure:
continue
raise
yield file


Expand Down Expand Up @@ -133,6 +143,7 @@ def _get_folder_id(

def _get_folders(
service: discovery.Resource,
continue_on_failure: bool,
folder_id: str | None = None, # if specified, only fetches files within this folder
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
Expand All @@ -149,6 +160,7 @@ def _get_folders(
for file in _run_drive_file_query(
service=service,
query=query,
continue_on_failure=continue_on_failure,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
Expand All @@ -163,6 +175,7 @@ def _get_folders(

def _get_files(
service: discovery.Resource,
continue_on_failure: bool,
time_range_start: SecondsSinceUnixEpoch | None = None,
time_range_end: SecondsSinceUnixEpoch | None = None,
folder_id: str | None = None, # if specified, only fetches files within this folder
Expand All @@ -187,6 +200,7 @@ def _get_files(
files = _run_drive_file_query(
service=service,
query=query,
continue_on_failure=continue_on_failure,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
Expand All @@ -198,6 +212,7 @@ def _get_files(

def get_all_files_batched(
service: discovery.Resource,
continue_on_failure: bool,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
batch_size: int = INDEX_BATCH_SIZE,
Expand All @@ -214,6 +229,7 @@ def get_all_files_batched(
"""
valid_files = _get_files(
service=service,
continue_on_failure=continue_on_failure,
time_range_start=time_range_start,
time_range_end=time_range_end,
folder_id=folder_id,
Expand All @@ -234,6 +250,7 @@ def get_all_files_batched(
subfolders = _get_folders(
service=service,
folder_id=folder_id,
continue_on_failure=continue_on_failure,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
Expand All @@ -244,6 +261,7 @@ def get_all_files_batched(
folder_ids_traversed.append(subfolder["id"])
yield from get_all_files_batched(
service=service,
continue_on_failure=continue_on_failure,
include_shared=include_shared,
follow_shortcuts=follow_shortcuts,
batch_size=batch_size,
Expand Down Expand Up @@ -408,6 +426,7 @@ def _fetch_docs_from_drive(
*[
get_all_files_batched(
service=service,
continue_on_failure=self.continue_on_failure,
include_shared=self.include_shared,
follow_shortcuts=self.follow_shortcuts,
batch_size=self.batch_size,
Expand Down Expand Up @@ -481,7 +500,7 @@ def poll_source(
if delegated_user:
credentials_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user

connector = GoogleDriveConnector()
connector = GoogleDriveConnector(include_shared=True, follow_shortcuts=True)
connector.load_credentials(credentials_dict)
document_batch_generator = connector.load_from_state()
for document_batch in document_batch_generator:
Expand Down

1 comment on commit 8bf82ac

@vercel
Copy link

@vercel vercel bot commented on 8bf82ac Aug 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.