|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 | #
|
3 | 3 | # Copyright (C) 2022 CERN.
|
| 4 | +# Copyright (C) 2024 TU Wien. |
4 | 5 | #
|
5 | 6 | # Invenio-Records-Resources is free software; you can redistribute it and/or
|
6 | 7 | # modify it under the terms of the MIT License; see LICENSE file for more
|
7 | 8 | # details.
|
8 | 9 |
|
9 | 10 | """Files tasks."""
|
10 | 11 |
|
| 12 | +import json |
| 13 | +import subprocess as sp |
| 14 | + |
11 | 15 | import requests
|
12 | 16 | from celery import shared_task
|
13 | 17 | from flask import current_app
|
14 | 18 | from invenio_access.permissions import system_identity
|
| 19 | +from invenio_db import db |
| 20 | +from invenio_files_rest.models import ObjectVersion, ObjectVersionTag |
15 | 21 |
|
16 | 22 | from ...proxies import current_service_registry
|
17 | 23 | from ...services.errors import FileKeyNotFoundError
|
@@ -39,3 +45,57 @@ def fetch_file(service_id, record_id, file_key):
|
39 | 45 |
|
40 | 46 | except FileKeyNotFoundError as e:
|
41 | 47 | current_app.logger.error(e)
|
| 48 | + |
| 49 | + |
def _extract_siegfried_match(result, filename):
    """Pick the PRONOM identification for one file out of siegfried's output.

    :param result: The decoded JSON document printed by ``sf -json``.
    :param filename: The name that was handed to siegfried; entries reported
        for any other file name are ignored.
    :returns: A ``(mimetype, pronom_id)`` tuple. Either element is ``None``
        if siegfried did not report a usable value for it.
    """
    mimetype, pronom_id = None, None
    if not isinstance(result, dict):
        return mimetype, pronom_id

    for file_info in result.get("files") or []:
        # only consider results for the file in question
        if file_info.get("filename") != filename:
            continue

        # results that come with errors are not trusted
        if file_info.get("errors"):
            continue

        for match in file_info.get("matches") or []:
            # NOTE: there may be results other than for the "pronom" ns
            #       which may actually deliver better matches
            #       e.g. for the `sway-vulkan` script, the sf website
            #       (https://www.itforarchivists.com/siegfried)
            #       reports "plain text file" and no mimetype for PRONOM
            #       but "shell script" (and a mimetype) for the
            #       "freedesktop.org" ns
            if match.get("ns") != "pronom":
                continue

            # siegfried reports the placeholder id "UNKNOWN" when nothing
            # matched; that is not a PRONOM identifier worth storing
            puid = match.get("id")
            if puid and puid != "UNKNOWN":
                pronom_id = puid

            if match.get("mime"):
                mimetype = match["mime"]

    return mimetype, pronom_id


# TODO update siegfried signatures (`sf -update`) regularly
@shared_task(ignore_result=True)
def detect_file_type(bucket_id, file_key):
    """Detect the format of the file using siegfried.

    Runs the ``sf`` binary on the file's storage URI. If a mimetype was
    detected it is set on the object version, and if a PRONOM id was detected
    it is stored as the ``pronom_id`` tag. Detection is best-effort: failures
    are logged and leave the object version untouched.

    :param bucket_id: The bucket that holds the object version.
    :param file_key: The key of the object version inside the bucket.
    """
    # TODO maybe we should go through the Records-Resources files API instead?
    ov = ObjectVersion.get(bucket_id, file_key)
    if ov is None or ov.file is None:
        # unknown object, or no file attached to it: nothing to inspect
        return

    # TODO the original filename is lost (renamed to 'data'), but sf uses the filename
    #      for parts of its algorithm; possible solutions:
    #      * create a temporary alias (link?) to the file and pass that to sf
    #      * pipe the file's contents into sf via stdin and use the `-name` arg

    # TODO question: could we utilize siegfried's server mode?

    mimetype, pronom_id = None, None
    try:
        sf_bin = "sf"
        # TODO this may only be possible for 'local' storage?
        sf_output = sp.check_output([sf_bin, "-json", ov.file.uri], text=True)
        result = json.loads(sf_output)
        mimetype, pronom_id = _extract_siegfried_match(result, ov.file.uri)

    except Exception as e:
        # deliberately broad: a missing binary, a non-zero exit code or
        # malformed output must not make the task fail, but should be visible
        # in the application log rather than on stdout
        current_app.logger.error(e)

    if mimetype is not None:
        ov.mimetype = mimetype
    if pronom_id is not None:
        ObjectVersionTag.create_or_update(ov, "pronom_id", pronom_id)

    db.session.commit()
0 commit comments