Skip to content

Commit ce9ce98

Browse files
committed
files: add file format detection on file commit
* this functionality uses a tool called siegfried https://github.com/richardlehane/siegfried
1 parent 47f1cf9 commit ce9ce98

File tree

4 files changed

+87
-0
lines changed

4 files changed

+87
-0
lines changed

invenio_records_resources/services/files/components/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
#
33
# Copyright (C) 2021 CERN.
4+
# Copyright (C) 2024 TU Wien.
45
#
56
# Invenio-Records-Resources is free software; you can redistribute it and/or
67
# modify it under the terms of the MIT License; see LICENSE file for more
@@ -10,6 +11,7 @@
1011

1112
from .base import FileServiceComponent
1213
from .content import FileContentComponent
14+
from .filetype import FileTypeDetectionComponent
1315
from .metadata import FileMetadataComponent
1416
from .processor import FileProcessorComponent
1517

@@ -18,4 +20,5 @@
1820
"FileMetadataComponent",
1921
"FileProcessorComponent",
2022
"FileServiceComponent",
23+
"FileTypeDetectionComponent",
2124
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (C) 2024 TU Wien.
4+
#
5+
# Invenio-Records-Resources is free software; you can redistribute it and/or
6+
# modify it under the terms of the MIT License; see LICENSE file for more
7+
# details.
8+
9+
"""Service component for detecting file types."""
10+
11+
from ...uow import TaskOp
12+
from ..tasks import detect_file_type
13+
from .base import FileServiceComponent
14+
15+
16+
class FileTypeDetectionComponent(FileServiceComponent):
    """File service component that triggers file format detection."""

    def commit_file(self, identity, id, file_key, record):
        """Schedule the file format detection once a file has been committed.

        The detection itself is not run here: a ``TaskOp`` wrapping the
        ``detect_file_type`` task is registered on the unit of work, with the
        ID of the record's bucket (as a string) and the file key as arguments.
        """
        bucket_id = str(record.bucket.id)
        detection_op = TaskOp(detect_file_type, bucket_id, file_key)
        self.uow.register(detection_op)

invenio_records_resources/services/files/config.py

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# Copyright (C) 2020-2022 CERN.
44
# Copyright (C) 2020 Northwestern University.
5+
# Copyright (C) 2024 TU Wien.
56
#
67
# Invenio-Records-Resources is free software; you can redistribute it and/or
78
# modify it under the terms of the MIT License; see LICENSE file for more
@@ -15,6 +16,7 @@
1516
FileContentComponent,
1617
FileMetadataComponent,
1718
FileProcessorComponent,
19+
FileTypeDetectionComponent,
1820
)
1921
from .links import FileLink
2022
from .processors import ImageMetadataExtractor
@@ -54,6 +56,7 @@ class FileServiceConfig(ServiceConfig):
5456
FileMetadataComponent,
5557
FileContentComponent,
5658
FileProcessorComponent,
59+
FileTypeDetectionComponent,
5760
]
5861

5962
file_processors = [

invenio_records_resources/services/files/tasks.py

+60
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
# -*- coding: utf-8 -*-
22
#
33
# Copyright (C) 2022 CERN.
4+
# Copyright (C) 2024 TU Wien.
45
#
56
# Invenio-Records-Resources is free software; you can redistribute it and/or
67
# modify it under the terms of the MIT License; see LICENSE file for more
78
# details.
89

910
"""Files tasks."""
1011

12+
import json
13+
import subprocess as sp
14+
1115
import requests
1216
from celery import shared_task
1317
from flask import current_app
1418
from invenio_access.permissions import system_identity
19+
from invenio_db import db
20+
from invenio_files_rest.models import ObjectVersion, ObjectVersionTag
1521

1622
from ...proxies import current_service_registry
1723
from ...services.errors import FileKeyNotFoundError
@@ -39,3 +45,57 @@ def fetch_file(service_id, record_id, file_key):
3945

4046
except FileKeyNotFoundError as e:
4147
current_app.logger.error(e)
48+
49+
50+
# TODO update siegfried signatures (`sf -update`) regularly
51+
@shared_task(ignore_result=True)
def detect_file_type(bucket_id, file_key):
    """Detect the format of the file using siegfried.

    Runs the ``sf`` binary on the URI of the stored file and parses its JSON
    report.  If a match in the "pronom" namespace is reported, its ID is
    stored as the ``pronom_id`` tag of the object version and its mimetype
    (if any) is set on the object version.

    Detection is best-effort: any failure while running ``sf`` or parsing its
    output is logged and no format information is stored.

    :param bucket_id: ID of the bucket holding the file.
    :param file_key: Key of the file inside the bucket.
    """
    # TODO maybe we should go through the Records-Resources files API instead?
    ov = ObjectVersion.get(bucket_id, file_key)
    if ov is None or ov.file is None:
        # no such object version, or no file attached to it: nothing to inspect
        return

    # TODO the original filename is lost (renamed to 'data'), but sf uses the filename
    #      for parts of its algorithm; possible solutions:
    #      * create a temporary alias (link?) to the file and pass that to sf
    #      * pipe the file's contents into sf via stdin and use the `-name` arg

    # TODO question: could we utilize siegfried's server mode?

    file_uri = ov.file.uri
    mimetype, pronom_id = None, None
    try:
        sf_bin = "sf"
        # TODO this may only be possible for 'local' storage?
        sf_output = sp.check_output([sf_bin, "-json", file_uri], text=True)
        mimetype, pronom_id = _parse_siegfried_result(
            json.loads(sf_output), file_uri
        )

    except Exception:
        # deliberately broad: a missing `sf` binary, a non-zero exit code or
        # malformed output must not fail the task, but should be visible in
        # the application log rather than on stdout
        current_app.logger.exception(
            "File type detection failed for '%s' in bucket '%s'",
            file_key,
            bucket_id,
        )

    if mimetype is not None:
        ov.mimetype = mimetype
    if pronom_id is not None:
        ObjectVersionTag.create_or_update(ov, "pronom_id", pronom_id)

    db.session.commit()


def _parse_siegfried_result(result, file_uri):
    """Extract ``(mimetype, pronom_id)`` for ``file_uri`` from an sf report.

    :param result: The parsed JSON report produced by ``sf -json``.
    :param file_uri: The path that was passed to ``sf``; report entries for
        other filenames are ignored.
    :returns: A ``(mimetype, pronom_id)`` tuple; either value is ``None`` if
        it was not reported.  If several "pronom" matches are present, the
        last one wins.
    """
    mimetype, pronom_id = None, None

    for file_info in result.get("files", []):
        # only consider results for the file in question
        if file_info.get("filename") != file_uri:
            continue

        # skip entries that reported errors or have no matches at all
        if file_info.get("errors") or not file_info.get("matches"):
            continue

        for match in file_info["matches"]:
            if match.get("ns") != "pronom":
                continue

            if match.get("id") is not None:
                pronom_id = match["id"]

            # NOTE: there may be results other than for the "pronom" ns
            #       which may actually deliver better matches
            #       e.g. for the `sway-vulkan` script, the sf website
            #       (https://www.itforarchivists.com/siegfried)
            #       reports "plain text file" and no mimetype for PRONOM
            #       but "shell script" (and a mimetype) for the
            #       "freedesktop.org" ns
            if match.get("mime"):
                mimetype = match["mime"]

    return mimetype, pronom_id

0 commit comments

Comments
 (0)