Skip to content

Commit

Permalink
Try to extract creation time of generic media from internal metadata
Browse files Browse the repository at this point in the history
The generic media provider pulls images from the filesystem, but file creation times are unreliable when working from copies of the source, so try to examine built-in metadata. Start with creation time as it's very important for RIME but plan to extend to other metadata such as location.
  • Loading branch information
Nicholas FitzRoy-Dale committed Aug 12, 2024
1 parent 992d8d9 commit 4edd9fe
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 3 deletions.
9 changes: 9 additions & 0 deletions rime/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,21 @@ def __hash__(self):
return hash(key)


@dataclass(kw_only=True)
class MediaMetadata:
    """
    Represents metadata associated with a media file.

    Fields are keyword-only and optional: a field left at None means no value was supplied for it.
    """
    # Creation time of the media; None (the default) when not supplied.
    ctime: datetime | None = None


@dataclass(kw_only=True)
class Media:
    """
    Represents either standalone media or media associated with a MessageEvent.

    Fields are keyword-only (kw_only=True), which is why the defaulted 'metadata' field may be declared before the
    required 'local_id' field.
    """
    mime_type: str
    # Optional metadata for this media; defaults to None.
    metadata: MediaMetadata | None = None
    local_id: str # A provider-specific reference to the media.


Expand Down
18 changes: 17 additions & 1 deletion rime/filesystem/direntry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import stat

from filetype import guess as filetype_guess
from .metadatainfo import can_try_extract_metadata, try_extract_metadata

# MIME type for DirEntries which haven't yet had their MIME type determined
MIME_TYPE_NOT_YET_DETERMINED = 'rime/mime-type-not-yet-determined'
Expand All @@ -28,6 +29,10 @@ class DirEntry:
stat_val: os.stat_result
mime_type: str

# Creation time if embedded in file metadata (such as in EXIF for images). If there is no embedded creation time,
# this should be None
embedded_ctime: int | None = None

def is_dir(self):
return stat.S_ISDIR(self.stat_val.st_mode)

Expand All @@ -41,6 +46,7 @@ def stat(self):
def from_path(cls, fs, path):
stat_val = fs.stat(path)

# Determine MIME type
if stat.S_ISDIR(stat_val.st_mode):
mime_type = MIME_TYPE_DIRECTORY
elif stat.S_ISREG(stat_val.st_mode):
Expand All @@ -58,4 +64,14 @@ def from_path(cls, fs, path):
else:
mime_type = MIME_TYPE_CANNOT_DETERMINE

return cls(path, stat_val, mime_type)
# Attempt to find embedded ctime.
embedded_ctime = None

if can_try_extract_metadata(mime_type):
with fs.open(path) as f:
metadata = try_extract_metadata(mime_type, f)

if metadata:
embedded_ctime = metadata.ctime.timestamp()

return cls(path, stat_val, mime_type, embedded_ctime)
29 changes: 29 additions & 0 deletions rime/filesystem/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,21 @@
"""
import os
import pickle
from sqlite3 import OperationalError

from ..sql import sqlite3_connect_filename, Table, Query, Parameter, Column
from .direntry import DirEntry


METADATA_DB_VERSION = 2


class MetadataDb:
def __init__(self, db_pathname):
self.settings_table = Table('settings')
self.dir_entries_table = Table('dir_entries')
self.mime_types_table = Table('mime_types')
self.version_table = Table('version')

self.db = self._init_db(db_pathname)
self._db_pathname = db_pathname
Expand All @@ -26,6 +31,29 @@ def _init_db(self, db_pathname):
conn = sqlite3_connect_filename(db_pathname, read_only=False)

with conn:
query = Query.from_(self.version_table).select('version')
try:
result = conn.execute(query.get_sql()).fetchone()
except OperationalError:
result = None

upgrade_required = result is None or result[0] < METADATA_DB_VERSION

if upgrade_required:
conn.close()
os.remove(db_pathname)
conn = sqlite3_connect_filename(db_pathname, read_only=False)

with conn:
query = Query.create_table(Table('version')).if_not_exists().columns(
Column('version', 'INT'))\
.primary_key('version')
conn.execute(query.get_sql())

if upgrade_required:
query = Query.into(self.version_table).columns(self.version_table.version).insert(Parameter('?'))
conn.execute(str(query), (METADATA_DB_VERSION,))

query = Query.create_table(Table('settings')).if_not_exists().columns(
Column('key', 'TEXT'),
Column('value', 'TEXT'))\
Expand All @@ -42,6 +70,7 @@ def _init_db(self, db_pathname):
Column('id', 'INT'),
Column('path', 'TEXT'),
Column('mime_type_id', 'INT'),
Column('embedded_ctime', 'INT'),
Column('stat_val', 'TEXT'))\
.primary_key('id')
conn.execute(query.get_sql())
Expand Down
124 changes: 124 additions & 0 deletions rime/filesystem/metadatainfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
Attempt to read metadata from a file
"""
import dataclasses
import datetime
import json
import struct
import shutil
import subprocess

BAIL_AFTER_BYTES = 1024 * 1024 * 10


@dataclasses.dataclass
class Metadata:
ctime: datetime.datetime | None = None


def from_mp4(handle):
    """
    Attempt to read metadata from an MP4 file.

    Rewinds the handle, checks that the first box is 'ftyp', skips over top-level boxes until 'moov' is found and
    then reads the creation time from the 'mvhd' box, which must be the first box inside 'moov'. Both the 32-bit
    (version 0) and 64-bit (version 1) mvhd layouts are understood.

    :param handle: seekable binary file object.
    :return: Metadata whose ctime is a timezone-aware UTC datetime.
    :raises ValueError: if the data does not parse as described above (including truncated files), if 'moov' is
        not reached within BAIL_AFTER_BYTES, or if the stored creation time is zero or out of range.
    """
    def read_exact(count):
        # A short read means a truncated file; report it like any other parse failure rather than letting
        # struct.unpack raise struct.error.
        data = handle.read(count)
        if len(data) != count:
            raise ValueError(f'Unexpected end of file: wanted {count} bytes, got {len(data)}')
        return data

    def read_box():
        # Returns (payload_size, box_type). payload_size is the number of bytes left in the box after its header,
        # or None for a box declared (size 0) to run to the end of the file.
        box_size, raw_type = struct.unpack('>I4s', read_exact(8))
        box_type = raw_type.decode('ascii')
        header_size = 8

        if box_size == 0:
            return None, box_type

        if box_size == 1:
            # Size 1 flags a 64-bit size stored immediately after the type.
            box_size = struct.unpack('>Q', read_exact(8))[0]
            header_size = 16

        if box_size < header_size:
            raise ValueError(f'Invalid size {box_size} for {box_type} box')

        return box_size - header_size, box_type

    handle.seek(0, 0)
    payload_size, typ = read_box()
    if typ != 'ftyp':
        raise ValueError(f'Expected ftyp, got {typ}')

    # Find the moov box
    while typ != 'moov':
        if payload_size is None:
            # The current box swallows the rest of the file, so there is nothing left to search.
            raise ValueError('No moov box found')

        handle.seek(payload_size, 1)
        payload_size, typ = read_box()
        if handle.tell() >= BAIL_AFTER_BYTES:
            raise ValueError('Bailed out of searching for moov box')

    _, typ = read_box()
    if typ != 'mvhd':
        raise ValueError(f'Expected mvhd, got {typ}')

    # The top byte of this word is the box version; the low 24 bits are flags, which are not needed here.
    version_and_flags = struct.unpack('>I', read_exact(4))[0]
    version = version_and_flags >> 24

    if version == 0:
        ctime_secs = struct.unpack('>I', read_exact(4))[0]
    elif version == 1:
        ctime_secs = struct.unpack('>Q', read_exact(8))[0]
    else:
        raise ValueError(f'Expected mvhd version 0 or 1, got {version}')

    if ctime_secs == 0:
        raise ValueError('No creation time found')

    # The stored value counts seconds since 1904-01-01 UTC.
    epoch = datetime.datetime(1904, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    try:
        ctime = epoch + datetime.timedelta(seconds=ctime_secs)
    except OverflowError as e:
        raise ValueError(f'Creation time out of range: {ctime_secs}') from e

    return Metadata(ctime=ctime)


def from_ffprobe(ffprobe, handle):
    """
    Attempt to read metadata from a file using ffprobe.

    The handle is rewound, read fully into memory and piped to ffprobe on stdin; the 'creation_time' tag of the
    reported container format is then parsed as an ISO 8601 timestamp.

    :param ffprobe: path of the ffprobe executable to run.
    :param handle: seekable binary file object.
    :return: Metadata whose ctime is the parsed creation time.
    :raises ValueError: if ffprobe exits with an error, its output cannot be parsed, or no creation time is present.
    """
    handle.seek(0)
    p = subprocess.run([ffprobe, '-v', 'quiet', '-print_format', 'json', '-show_format', '-'], input=handle.read(),
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if p.returncode != 0:
        # errors='replace' so that undecodable diagnostics cannot mask the real failure.
        raise ValueError(f'ffprobe failed: {p.stderr.decode(errors="replace")}')

    data = json.loads(p.stdout)

    # Files without tags omit the 'tags' object entirely; treat that the same as a missing creation time.
    tags = (data.get('format') or {}).get('tags') or {}
    ctime = tags.get('creation_time')

    if ctime is None:
        raise ValueError('No creation time found')

    # A trailing 'Z' (UTC) is only accepted by datetime.fromisoformat from Python 3.11 onwards, so spell the
    # offset out explicitly.
    if ctime.endswith(('Z', 'z')):
        ctime = ctime[:-1] + '+00:00'

    return Metadata(ctime=datetime.datetime.fromisoformat(ctime))


def try_extract_video_metadata(mime_type, handle):
    """
    Best-effort extraction of metadata from a video file.

    Tries the built-in decoder for the MIME type first (currently only 'video/mp4') and falls back to ffprobe if
    it is installed.

    :param mime_type: MIME type of the data behind handle.
    :param handle: seekable binary file object.
    :return: a Metadata on success, or None if no extractor succeeded. Extractor failures are never propagated.
    """
    # Try to extract using native decoder if possible.
    if mime_type == 'video/mp4':
        try:
            return from_mp4(handle)
        except Exception:
            # Deliberately broad: media files are untrusted input and any parse failure just means "try the
            # next extractor".
            pass

    # Fall back to ffprobe if it exists. The PATH lookup is done only here so that files handled by the native
    # decoder do not pay for it.
    ffprobe = shutil.which('ffprobe')

    if ffprobe is not None:
        try:
            return from_ffprobe(ffprobe, handle)
        except Exception:
            # Same best-effort policy as above.
            pass

    return None


def try_extract_metadata(mime_type, handle):
    """
    Best-effort extraction of embedded metadata from an open file.

    Only 'video/*' MIME types are dispatched to an extractor; every other type yields None.

    :param mime_type: MIME type of the data behind handle.
    :param handle: binary file object passed through to the extractor.
    :return: the extractor's result, or None if the type is not handled or the extractor raised ValueError.
    """
    if not mime_type.startswith('video/'):
        return None

    try:
        return try_extract_video_metadata(mime_type, handle)
    except ValueError:
        return None


def can_try_extract_metadata(mime_type):
    """
    Return True if mime_type is a video, image or audio type, i.e. one worth passing to try_extract_metadata.
    """
    media_prefixes = ('video/', 'image/', 'audio/')
    return mime_type.startswith(media_prefixes)


if __name__ == '__main__':
    # Manual smoke test: print whatever metadata can be extracted from the file named on the command line,
    # treating it as MP4 video.
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(f'usage: {sys.argv[0]} <video-file>')

    with open(sys.argv[1], 'rb') as handle:
        metadata = try_extract_metadata('video/mp4', handle)

    print(metadata)
16 changes: 14 additions & 2 deletions rime/providers/androidgenericmedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dataclasses import dataclass

from ..provider import Provider
from ..event import MediaEvent, GenericEventInfo
from ..event import MediaEvent, GenericEventInfo, MediaMetadata
from ..media import MediaData

from . import providernames
Expand Down Expand Up @@ -88,11 +88,23 @@ def search_events(self, device, filter_):
is_user_generated=is_user_generated,
)

# The entry's timestamp determines how it is displayed in the GUI, so we want to
# return the most accurate one. Since file ctimes can be modified easily (e.g. by
# copying), we assume that the embedded creation time from the file's metadata, if
# it has one, is most accurate.
if direntry.embedded_ctime:
timestamp = datetime.fromtimestamp(direntry.embedded_ctime)
else:
timestamp = datetime.fromtimestamp(direntry.stat().st_ctime)

metadata = MediaMetadata(ctime=direntry.embedded_ctime)

yield MediaEvent(
mime_type=direntry.mime_type,
metadata=metadata,
local_id=direntry.path,
id_=direntry.path,
timestamp=datetime.fromtimestamp(direntry.stat().st_ctime),
timestamp=timestamp,
generic_event_info=generic_event_info,
provider=self,
sender=sender,
Expand Down
9 changes: 9 additions & 0 deletions rime/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -298,13 +298,21 @@ interface Media {
url: String!
}

"""
Metadata embedded in media in the form of, for example, EXIF (pictures) or MVHD (mp4).
"""
type MediaMetadata {
  # Creation time taken from the embedded metadata. Nullable: absent when none is available.
  ctime: DateTime
}

"""
Media which is attached to something else, such as a message.
"""
type AttachedMedia implements Media {
  id: ID!
  mime_type: String!
  url: String!
  # Embedded metadata for this media. Nullable: may be absent.
  metadata: MediaMetadata
}

"""
Expand Down Expand Up @@ -354,4 +362,5 @@ type MediaEvent implements Media & Event {
timestamp: DateTime
mime_type: String!
url: String!
metadata: MediaMetadata
}

0 comments on commit 4edd9fe

Please sign in to comment.