8
8
from datetime import datetime
9
9
from functools import lru_cache
10
10
from io import IOBase
11
- from os import makedirs , path
12
11
from os .path import getsize
13
12
from typing import Any , Callable , Dict , Iterable , List , MutableMapping , Optional , Tuple
14
13
30
29
from airbyte_cdk .sources .file_based .remote_file import RemoteFile
31
30
from source_microsoft_sharepoint .spec import SourceMicrosoftSharePointSpec
32
31
33
- from .exceptions import ErrorDownloadingFile , ErrorFetchingMetadata
32
+ from .exceptions import ErrorFetchingMetadata
34
33
from .utils import (
35
34
FolderNotFoundException ,
36
35
MicrosoftSharePointRemoteFile ,
@@ -167,7 +166,7 @@ def config(self, value: SourceMicrosoftSharePointSpec):
167
166
assert isinstance (value , SourceMicrosoftSharePointSpec )
168
167
self ._config = value
169
168
170
- def _get_shared_drive_object (self , drive_id : str , object_id : str , path : str ) -> List [ Tuple [ str , str , datetime ] ]:
169
+ def _get_shared_drive_object (self , drive_id : str , object_id : str , path : str ) -> Iterable [ MicrosoftSharePointRemoteFile ]:
171
170
"""
172
171
Retrieves a list of all nested files under the specified object.
173
172
@@ -176,7 +175,7 @@ def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) ->
176
175
object_id: The ID of the object to start the search from.
177
176
178
177
Returns:
179
- A list of tuples containing file information (name, download URL, and last modified datetime) .
178
+ An iterable of MicrosoftSharePointRemoteFile instances containing file information.
180
179
181
180
Raises:
182
181
RuntimeError: If an error occurs during the request.
@@ -186,7 +185,7 @@ def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) ->
186
185
headers = {"Authorization" : f"Bearer { access_token } " }
187
186
base_url = f"https://graph.microsoft.com/v1.0/drives/{ drive_id } "
188
187
189
- def get_files (url : str , path : str ) -> List [ Tuple [ str , str , datetime ] ]:
188
+ def get_files (url : str , path : str ) -> Iterable [ MicrosoftSharePointRemoteFile ]:
190
189
response = requests .get (url , headers = headers )
191
190
if response .status_code != 200 :
192
191
error_info = response .json ().get ("error" , {}).get ("message" , "No additional error information provided." )
@@ -196,8 +195,15 @@ def get_files(url: str, path: str) -> List[Tuple[str, str, datetime]]:
196
195
for child in data .get ("value" , []):
197
196
new_path = path + "/" + child ["name" ]
198
197
if child .get ("file" ): # Object is a file
198
+ # last_modified and created_at are type string e.g. "2025-04-16T14:41:00Z"
199
199
last_modified = datetime .strptime (child ["lastModifiedDateTime" ], "%Y-%m-%dT%H:%M:%SZ" )
200
- yield (new_path , child ["@microsoft.graph.downloadUrl" ], last_modified )
200
+ created_at = datetime .strptime (child ["createdDateTime" ], "%Y-%m-%dT%H:%M:%SZ" )
201
+ yield MicrosoftSharePointRemoteFile (
202
+ uri = new_path ,
203
+ download_url = child ["@microsoft.graph.downloadUrl" ],
204
+ last_modified = last_modified ,
205
+ created_at = created_at
206
+ )
201
207
else : # Object is a folder, retrieve children
202
208
child_url = f"{ base_url } /items/{ child ['id' ]} /children" # Use item endpoint for nested objects
203
209
yield from get_files (child_url , new_path )
@@ -218,23 +224,35 @@ def get_files(url: str, path: str) -> List[Tuple[str, str, datetime]]:
218
224
if item_data .get ("file" ): # Initial object is a file
219
225
new_path = path + "/" + item_data ["name" ]
220
226
last_modified = datetime .strptime (item_data ["lastModifiedDateTime" ], "%Y-%m-%dT%H:%M:%SZ" )
221
- yield (new_path , item_data ["@microsoft.graph.downloadUrl" ], last_modified )
227
+ created_at = datetime .strptime (item_data ["createdDateTime" ], "%Y-%m-%dT%H:%M:%SZ" )
228
+ yield MicrosoftSharePointRemoteFile (
229
+ uri = new_path ,
230
+ download_url = item_data ["@microsoft.graph.downloadUrl" ],
231
+ last_modified = last_modified ,
232
+ created_at = created_at
233
+ )
222
234
else :
223
235
# Initial object is a folder, start file retrieval
224
236
yield from get_files (f"{ item_url } /children" , path )
225
237
226
def _list_directories_and_files(self, root_folder, path) -> Iterable[MicrosoftSharePointRemoteFile]:
    """
    Walk the tree below ``root_folder`` depth-first and yield every file found.

    Args:
        root_folder: Item whose ``children`` collection is fetched through
            ``execute_query_with_retry``.
        path: Path prefix accumulated so far. When it is empty, a child's
            name is used on its own instead of being joined with "/".

    Yields:
        A MicrosoftSharePointRemoteFile for each child flagged ``is_file``;
        any other child is treated as a folder and descended into recursively.
    """
    children = execute_query_with_retry(root_folder.children.get())
    for entry in children:
        entry_path = entry.name if not path else path + "/" + entry.name

        if not entry.is_file:
            # Not a file: recurse, carrying the extended path down.
            yield from self._list_directories_and_files(entry, entry_path)
            continue

        # Per the original note, these two timestamps are already
        # datetime.datetime values here, e.g. (2025, 2, 18, 19, 32, 4),
        # so they are forwarded without any string parsing.
        properties = entry.properties
        yield MicrosoftSharePointRemoteFile(
            uri=entry_path,
            download_url=properties["@microsoft.graph.downloadUrl"],
            last_modified=properties["lastModifiedDateTime"],
            created_at=properties["createdDateTime"],
        )
236
254
237
- def _get_files_by_drive_name (self , drives , folder_path ):
255
+ def _get_files_by_drive_name (self , drives , folder_path ) -> Iterable [ MicrosoftSharePointRemoteFile ] :
238
256
"""Yields files from the specified drive."""
239
257
path_levels = [level for level in folder_path .split ("/" ) if level ]
240
258
folder_path = "/" .join (path_levels )
@@ -350,7 +368,7 @@ def drives(self) -> EntityCollection:
350
368
351
369
return drives
352
370
353
- def _get_shared_files_from_all_drives (self , parsed_drives ):
371
+ def _get_shared_files_from_all_drives (self , parsed_drives ) -> Iterable [ MicrosoftSharePointRemoteFile ] :
354
372
drive_ids = [drive .id for drive in parsed_drives ]
355
373
356
374
shared_drive_items = execute_query_with_retry (self .one_drive_client .me .drive .shared_with_me ())
@@ -361,7 +379,7 @@ def _get_shared_files_from_all_drives(self, parsed_drives):
361
379
if parent_reference and parent_reference ["driveId" ] not in drive_ids :
362
380
yield from self ._get_shared_drive_object (parent_reference ["driveId" ], drive_item .id , drive_item .web_url )
363
381
364
- def get_all_files (self ):
382
+ def get_all_files (self ) -> Iterable [ MicrosoftSharePointRemoteFile ] :
365
383
if self .config .search_scope in ("ACCESSIBLE_DRIVES" , "ALL" ):
366
384
# Get files from accessible drives
367
385
yield from self ._get_files_by_drive_name (self .drives , self .config .folder_path )
@@ -383,17 +401,7 @@ def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: lo
383
401
files = self .get_all_files ()
384
402
385
403
files_generator = filter_http_urls (
386
- self .filter_files_by_globs_and_start_date (
387
- [
388
- MicrosoftSharePointRemoteFile (
389
- uri = path ,
390
- download_url = download_url ,
391
- last_modified = last_modified ,
392
- )
393
- for path , download_url , last_modified in files
394
- ],
395
- globs ,
396
- ),
404
+ self .filter_files_by_globs_and_start_date ([file for file in files ], globs ),
397
405
logger ,
398
406
)
399
407
@@ -500,6 +508,7 @@ def upload(
500
508
filename = file_name ,
501
509
bytes = file_size ,
502
510
source_uri = file .uri ,
511
+ created_at = file .created_at .strftime ("%Y-%m-%dT%H:%M:%S.%fZ" ),
503
512
updated_at = file .last_modified .strftime ("%Y-%m-%dT%H:%M:%S.%fZ" ),
504
513
)
505
514
0 commit comments