Skip to content

Commit 919b5a3

Browse files
committed
source-microsoft-sharepoint: add created_at
1 parent 01594a1 commit 919b5a3

File tree

5 files changed

+76
-45
lines changed

5 files changed

+76
-45
lines changed

airbyte-integrations/connectors/source-microsoft-sharepoint/poetry.lock

+4-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

airbyte-integrations/connectors/source-microsoft-sharepoint/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ python = "^3.11,<3.12"
2020
msal = "==1.27.0"
2121
Office365-REST-Python-Client = "==2.5.5"
2222
smart-open = "==6.4.0"
23-
airbyte-cdk = {extras = ["file-based"], version = "6.45.4.post48.dev14477787653"}
23+
airbyte-cdk = {extras = ["file-based"], version = "6.45.4.post49.dev14495925594"}
2424

2525
[tool.poetry.scripts]
2626
source-microsoft-sharepoint = "source_microsoft_sharepoint.run:run"

airbyte-integrations/connectors/source-microsoft-sharepoint/source_microsoft_sharepoint/stream_reader.py

+32-23
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from datetime import datetime
99
from functools import lru_cache
1010
from io import IOBase
11-
from os import makedirs, path
1211
from os.path import getsize
1312
from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
1413

@@ -30,7 +29,7 @@
3029
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
3130
from source_microsoft_sharepoint.spec import SourceMicrosoftSharePointSpec
3231

33-
from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
32+
from .exceptions import ErrorFetchingMetadata
3433
from .utils import (
3534
FolderNotFoundException,
3635
MicrosoftSharePointRemoteFile,
@@ -167,7 +166,7 @@ def config(self, value: SourceMicrosoftSharePointSpec):
167166
assert isinstance(value, SourceMicrosoftSharePointSpec)
168167
self._config = value
169168

170-
def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) -> List[Tuple[str, str, datetime]]:
169+
def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) -> Iterable[MicrosoftSharePointRemoteFile]:
171170
"""
172171
Retrieves a list of all nested files under the specified object.
173172
@@ -176,7 +175,7 @@ def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) ->
176175
object_id: The ID of the object to start the search from.
177176
178177
Returns:
179-
A list of tuples containing file information (name, download URL, and last modified datetime).
178+
An iterable of MicrosoftSharePointRemoteFile instances containing file information.
180179
181180
Raises:
182181
RuntimeError: If an error occurs during the request.
@@ -186,7 +185,7 @@ def _get_shared_drive_object(self, drive_id: str, object_id: str, path: str) ->
186185
headers = {"Authorization": f"Bearer {access_token}"}
187186
base_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}"
188187

189-
def get_files(url: str, path: str) -> List[Tuple[str, str, datetime]]:
188+
def get_files(url: str, path: str) -> Iterable[MicrosoftSharePointRemoteFile]:
190189
response = requests.get(url, headers=headers)
191190
if response.status_code != 200:
192191
error_info = response.json().get("error", {}).get("message", "No additional error information provided.")
@@ -196,8 +195,15 @@ def get_files(url: str, path: str) -> List[Tuple[str, str, datetime]]:
196195
for child in data.get("value", []):
197196
new_path = path + "/" + child["name"]
198197
if child.get("file"): # Object is a file
198+
# lastModifiedDateTime and createdDateTime arrive as strings, e.g. "2025-04-16T14:41:00Z", so parse them into datetimes
199199
last_modified = datetime.strptime(child["lastModifiedDateTime"], "%Y-%m-%dT%H:%M:%SZ")
200-
yield (new_path, child["@microsoft.graph.downloadUrl"], last_modified)
200+
created_at = datetime.strptime(child["createdDateTime"], "%Y-%m-%dT%H:%M:%SZ")
201+
yield MicrosoftSharePointRemoteFile(
202+
uri=new_path,
203+
download_url=child["@microsoft.graph.downloadUrl"],
204+
last_modified=last_modified,
205+
created_at=created_at
206+
)
201207
else: # Object is a folder, retrieve children
202208
child_url = f"{base_url}/items/{child['id']}/children" # Use item endpoint for nested objects
203209
yield from get_files(child_url, new_path)
@@ -218,23 +224,35 @@ def get_files(url: str, path: str) -> List[Tuple[str, str, datetime]]:
218224
if item_data.get("file"): # Initial object is a file
219225
new_path = path + "/" + item_data["name"]
220226
last_modified = datetime.strptime(item_data["lastModifiedDateTime"], "%Y-%m-%dT%H:%M:%SZ")
221-
yield (new_path, item_data["@microsoft.graph.downloadUrl"], last_modified)
227+
created_at = datetime.strptime(item_data["createdDateTime"], "%Y-%m-%dT%H:%M:%SZ")
228+
yield MicrosoftSharePointRemoteFile(
229+
uri=new_path,
230+
download_url=item_data["@microsoft.graph.downloadUrl"],
231+
last_modified=last_modified,
232+
created_at=created_at
233+
)
222234
else:
223235
# Initial object is a folder, start file retrieval
224236
yield from get_files(f"{item_url}/children", path)
225237

226-
def _list_directories_and_files(self, root_folder, path):
238+
def _list_directories_and_files(self, root_folder, path) -> Iterable[MicrosoftSharePointRemoteFile]:
227239
"""Enumerates folders and files starting from a root folder."""
228240
drive_items = execute_query_with_retry(root_folder.children.get())
229241
for item in drive_items:
230242
item_path = path + "/" + item.name if path else item.name
231243
if item.is_file:
232-
yield (item_path, item.properties["@microsoft.graph.downloadUrl"], item.properties["lastModifiedDateTime"])
244+
# lastModifiedDateTime and createdDateTime are already datetime.datetime objects here, e.g. datetime(2025, 2, 18, 19, 32, 4)
245+
yield MicrosoftSharePointRemoteFile(
246+
uri=item_path,
247+
download_url=item.properties["@microsoft.graph.downloadUrl"],
248+
last_modified=item.properties["lastModifiedDateTime"],
249+
created_at=item.properties["createdDateTime"]
250+
)
233251
else:
234252
yield from self._list_directories_and_files(item, item_path)
235253
yield from []
236254

237-
def _get_files_by_drive_name(self, drives, folder_path):
255+
def _get_files_by_drive_name(self, drives, folder_path) -> Iterable[MicrosoftSharePointRemoteFile]:
238256
"""Yields files from the specified drive."""
239257
path_levels = [level for level in folder_path.split("/") if level]
240258
folder_path = "/".join(path_levels)
@@ -350,7 +368,7 @@ def drives(self) -> EntityCollection:
350368

351369
return drives
352370

353-
def _get_shared_files_from_all_drives(self, parsed_drives):
371+
def _get_shared_files_from_all_drives(self, parsed_drives) -> Iterable[MicrosoftSharePointRemoteFile]:
354372
drive_ids = [drive.id for drive in parsed_drives]
355373

356374
shared_drive_items = execute_query_with_retry(self.one_drive_client.me.drive.shared_with_me())
@@ -361,7 +379,7 @@ def _get_shared_files_from_all_drives(self, parsed_drives):
361379
if parent_reference and parent_reference["driveId"] not in drive_ids:
362380
yield from self._get_shared_drive_object(parent_reference["driveId"], drive_item.id, drive_item.web_url)
363381

364-
def get_all_files(self):
382+
def get_all_files(self) -> Iterable[MicrosoftSharePointRemoteFile]:
365383
if self.config.search_scope in ("ACCESSIBLE_DRIVES", "ALL"):
366384
# Get files from accessible drives
367385
yield from self._get_files_by_drive_name(self.drives, self.config.folder_path)
@@ -383,17 +401,7 @@ def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: lo
383401
files = self.get_all_files()
384402

385403
files_generator = filter_http_urls(
386-
self.filter_files_by_globs_and_start_date(
387-
[
388-
MicrosoftSharePointRemoteFile(
389-
uri=path,
390-
download_url=download_url,
391-
last_modified=last_modified,
392-
)
393-
for path, download_url, last_modified in files
394-
],
395-
globs,
396-
),
404+
self.filter_files_by_globs_and_start_date([file for file in files], globs),
397405
logger,
398406
)
399407

@@ -500,6 +508,7 @@ def upload(
500508
filename=file_name,
501509
bytes=file_size,
502510
source_uri=file.uri,
511+
created_at=file.created_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
503512
updated_at=file.last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
504513
)
505514

airbyte-integrations/connectors/source-microsoft-sharepoint/source_microsoft_sharepoint/utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class FolderNotFoundException(Exception):
2929

3030
class MicrosoftSharePointRemoteFile(RemoteFile):
3131
download_url: str
32+
created_at: datetime
3233

3334

3435
def filter_http_urls(files, logger):

airbyte-integrations/connectors/source-microsoft-sharepoint/unit_tests/test_stream_reader.py

+38-17
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@
3030

3131
def create_mock_drive_item(is_file, name, children=None):
3232
"""Helper function to create a mock drive item."""
33-
mock_item = MagicMock(properties={"@microsoft.graph.downloadUrl": "test_url", "lastModifiedDateTime": "1991-08-24"})
33+
mock_item = MagicMock(properties={
34+
"@microsoft.graph.downloadUrl": "test_url",
35+
"lastModifiedDateTime": datetime(1991, 8, 24),
36+
"createdDateTime": datetime(1991, 8, 24),
37+
})
3438
mock_item.is_file = is_file
3539
mock_item.name = name
3640
mock_item.children.get.return_value.execute_query = Mock(return_value=children or [])
@@ -62,8 +66,18 @@ def create_mock_drive_files():
6266
Provides mock data for SharePoint drive files (personal drive).
6367
"""
6468
return [
65-
("file1.csv", "https://example.com/file1.csv", datetime(2021, 1, 1)),
66-
("file2.txt", "https://example.com/file2.txt", datetime(2021, 1, 1)),
69+
MicrosoftSharePointRemoteFile(
70+
uri="file1.csv",
71+
download_url="https://example.com/file1.csv",
72+
last_modified=datetime(2021, 1, 1),
73+
created_at=datetime(2021, 1, 1)
74+
),
75+
MicrosoftSharePointRemoteFile(
76+
uri="file2.txt",
77+
download_url="https://example.com/file2.txt",
78+
last_modified=datetime(2021, 1, 1),
79+
created_at=datetime(2021, 1, 1)
80+
),
6781
]
6882

6983

@@ -73,8 +87,18 @@ def create_mock_shared_drive_files():
7387
Provides mock data for SharePoint drive files (shared drives).
7488
"""
7589
return [
76-
("file3.csv", "https://example.com/file3.csv", datetime(2021, 3, 1)),
77-
("file4.txt", "https://example.com/file4.txt", datetime(2021, 4, 1)),
90+
MicrosoftSharePointRemoteFile(
91+
uri="file3.csv",
92+
download_url="https://example.com/file3.csv",
93+
last_modified=datetime(2021, 3, 1),
94+
created_at=datetime(2021, 3, 1)
95+
),
96+
MicrosoftSharePointRemoteFile(
97+
uri="file4.txt",
98+
download_url="https://example.com/file4.txt",
99+
last_modified=datetime(2021, 4, 1),
100+
created_at=datetime(2021, 4, 1)
101+
),
78102
]
79103

80104

@@ -258,6 +282,7 @@ def test_get_file(mock_requests_head, mock_requests_get, mock_get_access_token,
258282
file_uri = f"{file_uri}.{file_extension}"
259283
mock_file = Mock(download_url=f"https://example.com/file.{file_extension}", uri=file_uri)
260284
mock_file.last_modified = datetime(2021, 1, 1)
285+
mock_file.created_at = datetime(2021, 1, 1)
261286
mock_logger = Mock()
262287
mock_get_access_token.return_value = "dummy_access_token"
263288

@@ -381,8 +406,10 @@ def test_list_directories_and_files():
381406

382407
assert len(result) == 2
383408
assert result == [
384-
("https://example.com/root/folder1/file1.txt", "test_url", "1991-08-24"),
385-
("https://example.com/root/file2.txt", "test_url", "1991-08-24"),
409+
MicrosoftSharePointRemoteFile(uri='https://example.com/root/folder1/file1.txt', last_modified=datetime(1991, 8, 24, 0, 0),
410+
mime_type=None, download_url='test_url', created_at=datetime(1991, 8, 24, 0, 0)),
411+
MicrosoftSharePointRemoteFile(uri='https://example.com/root/file2.txt', last_modified=datetime(1991, 8, 24, 0, 0),
412+
mime_type=None, download_url='test_url', created_at=datetime(1991, 8, 24, 0, 0)),
386413
]
387414

388415

@@ -480,6 +507,7 @@ def test_get_shared_files_from_all_drives(
480507
"name": "TestFile.txt",
481508
"@microsoft.graph.downloadUrl": "http://example.com/download",
482509
"lastModifiedDateTime": "2021-01-01T00:00:00Z",
510+
"createdDateTime": "2021-01-01T00:00:00Z"
483511
}
484512

485513
empty_folder_response = {"folder": True, "value": []}
@@ -504,6 +532,7 @@ def test_get_shared_files_from_all_drives(
504532
"name": "NestedFile.txt",
505533
"@microsoft.graph.downloadUrl": "http://example.com/nested",
506534
"lastModifiedDateTime": "2021-01-02T00:00:00Z",
535+
"createdDateTime": "2021-01-02T00:00:00Z"
507536
}
508537
],
509538
"name": "subfolder2",
@@ -518,11 +547,7 @@ def test_get_shared_files_from_all_drives(
518547
file_response,
519548
[],
520549
[
521-
(
522-
"http://example.com/TestFile.txt",
523-
"http://example.com/download",
524-
datetime.strptime("2021-01-01T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"),
525-
)
550+
MicrosoftSharePointRemoteFile(uri='http://example.com/TestFile.txt', last_modified=datetime(2021, 1, 1, 0, 0), mime_type=None, download_url='http://example.com/download', created_at=datetime(2021, 1, 1, 0, 0)),
526551
],
527552
False,
528553
None,
@@ -539,11 +564,7 @@ def test_get_shared_files_from_all_drives(
539564
not_empty_subfolder_response,
540565
],
541566
[
542-
(
543-
"http://example.com/subfolder2/NestedFile.txt",
544-
"http://example.com/nested",
545-
datetime.strptime("2021-01-02T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"),
546-
)
567+
MicrosoftSharePointRemoteFile(uri='http://example.com/subfolder2/NestedFile.txt', last_modified=datetime(2021, 1, 2, 0, 0), mime_type=None, download_url='http://example.com/nested', created_at=datetime(2021, 1, 2, 0, 0))
547568
],
548569
False,
549570
None,

0 commit comments

Comments
 (0)