Import trust source

This commit is contained in:
Eliot Berriot 2018-09-23 12:38:42 +00:00
parent ad7e6a97e5
commit 1bee3a4675
14 changed files with 872 additions and 429 deletions

View file

@ -1,9 +1,10 @@
import collections
import logging
import os
from django.utils import timezone
from django.db import transaction
from django.db.models import F
from django.db.models import F, Q
from django.dispatch import receiver
from musicbrainzngs import ResponseError
@ -14,7 +15,6 @@ from funkwhale_api.common import preferences
from funkwhale_api.federation import activity, actors, routes
from funkwhale_api.federation import library as lb
from funkwhale_api.federation import library as federation_serializers
from funkwhale_api.providers.acoustid import get_acoustid_client
from funkwhale_api.taskapp import celery
from . import lyrics as lyrics_utils
@ -26,102 +26,32 @@ from . import serializers
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@celery.app.task(name="acoustid.set_on_upload")
@celery.require_instance(models.Upload, "upload")
def set_acoustid_on_upload(upload):
    """Ask the AcoustID client for the best match of the upload's audio file.

    When a match is reported, its ``"id"`` is stored on
    ``upload.acoustid_track_id`` (only that field is saved) and returned.
    Returns ``None`` when the client reports no match.
    """
    best_match = get_acoustid_client().get_best_match(upload.audio_file.path)
    if not best_match:
        return None

    track_id = best_match["id"]
    upload.acoustid_track_id = track_id
    upload.save(update_fields=["acoustid_track_id"])
    return track_id
def import_track_from_remote(metadata):
    """Return the Track described by *metadata*, creating it if needed.

    Strategies are tried from most to least specific:

    1. ``metadata["recording"]["musicbrainz_id"]``: track fetched through
       ``Track.get_or_create_from_api``;
    2. ``metadata["release"]["musicbrainz_id"]``: album fetched through the
       API, track matched/created by title on that album;
    3. ``metadata["artist"]["musicbrainz_id"]``: artist fetched through the
       API, album and track matched/created by title;
    4. no usable id at all: artist matched/created by ``artist_name``, album
       and track by title.

    A missing section, a missing ``musicbrainz_id`` key and a null/empty id
    are all treated as "no id" and fall through to the next strategy.

    Raises ``KeyError`` when a title/name required by the selected strategy
    (``title``, ``album_title``, ``artist_name``) is absent from *metadata*.
    """

    def _musicbrainz_id(section):
        # Explicit check rather than ``assert``: assertions are stripped when
        # Python runs with -O, which would let null/empty ids reach the API
        # lookups. TypeError covers a section that is present but null.
        try:
            return metadata[section]["musicbrainz_id"] or None
        except (KeyError, TypeError):
            return None

    track_mbid = _musicbrainz_id("recording")
    if track_mbid:
        return models.Track.get_or_create_from_api(mbid=track_mbid)[0]

    album_mbid = _musicbrainz_id("release")
    if album_mbid:
        album, _ = models.Album.get_or_create_from_api(mbid=album_mbid)
        return models.Track.get_or_create_from_title(
            metadata["title"], artist=album.artist, album=album
        )[0]

    artist_mbid = _musicbrainz_id("artist")
    if artist_mbid:
        artist, _ = models.Artist.get_or_create_from_api(mbid=artist_mbid)
    else:
        # worst case scenario, we have absolutely no way to link to a
        # musicbrainz resource, we rely on the name/titles
        artist, _ = models.Artist.get_or_create_from_name(metadata["artist_name"])

    album, _ = models.Album.get_or_create_from_title(
        metadata["album_title"], artist=artist
    )
    return models.Track.get_or_create_from_title(
        metadata["title"], artist=artist, album=album
    )[0]
# NOTE(review): this span carries two back-to-back signatures and repeated
# statement runs with no indentation. It reads like a flattened diff in which
# an (album, upload, replace) variant and an
# (album, source, cover_data, replace) variant are interleaved. Confirm which
# lines are live before changing any code here.
#
# What the visible statements do, in order of appearance:
# - return early when the album already has a cover and `replace` is falsy;
# - pass `cover_data`, or a "cover_front" picture read from the upload's
#   metadata, to album.get_image(data=...);
# - for sources starting with "file://", scan the containing directory with
#   get_cover_from_fs() and use what it returns;
# - when the album has an mbid, call album.get_image() without data and log a
#   warning if a ResponseError is raised.
def update_album_cover(album, upload, replace=False):
def update_album_cover(album, source=None, cover_data=None, replace=False):
if album.cover and not replace:
return
if upload:
# maybe the file has a cover embedded?
if cover_data:
return album.get_image(data=cover_data)
if source and source.startswith("file://"):
# let's look for a cover in the same directory
path = os.path.dirname(source.replace("file://", "", 1))
logger.info("[Album %s] scanning covers from %s", album.pk, path)
cover = get_cover_from_fs(path)
if cover:
return album.get_image(data=cover)
if album.mbid:
try:
metadata = upload.get_metadata()
except FileNotFoundError:
metadata = None
if metadata:
cover = metadata.get_picture("cover_front")
if cover:
# best case scenario, cover is embedded in the track
logger.info("[Album %s] Using cover embedded in file", album.pk)
return album.get_image(data=cover)
if upload.source and upload.source.startswith("file://"):
# let's look for a cover in the same directory
path = os.path.dirname(upload.source.replace("file://", "", 1))
logger.info("[Album %s] scanning covers from %s", album.pk, path)
cover = get_cover_from_fs(path)
if cover:
return album.get_image(data=cover)
if not album.mbid:
return
try:
logger.info(
"[Album %s] Fetching cover from musicbrainz release %s",
album.pk,
str(album.mbid),
)
return album.get_image()
except ResponseError as exc:
logger.warning(
"[Album %s] cannot fetch cover from musicbrainz: %s", album.pk, str(exc)
)
logger.info(
"[Album %s] Fetching cover from musicbrainz release %s",
album.pk,
str(album.mbid),
)
return album.get_image()
except ResponseError as exc:
logger.warning(
"[Album %s] cannot fetch cover from musicbrainz: %s", album.pk, str(exc)
)
# (file extension, MIME type) pairs for image formats; presumably used when
# scanning for cover files, but the consumer is outside this view (confirm).
IMAGE_TYPES = [("jpg", "image/jpeg"), ("png", "image/png")]
@ -244,15 +174,15 @@ def scan_library_page(library_scan, page_url):
scan_library_page.delay(library_scan_id=library_scan.pk, page_url=next_page)
def getter(data, *keys, default=None):
    """Walk nested containers along *keys* and return the value reached.

    ``getter(d, "a", "b")`` is ``d["a"]["b"]``, except that ``default`` is
    returned instead of raising when:

    - ``data`` is falsy (``None``, empty mapping, ...);
    - a key is missing at any level;
    - an intermediate value cannot be subscripted with the next key
      (e.g. it is ``None`` or a string).

    ``default`` is keyword-only and defaults to ``None``, so positional
    callers that only pass keys keep getting ``None`` on a miss.
    """
    if not data:
        return default
    value = data
    for key in keys:
        try:
            value = value[key]
        except (KeyError, IndexError, TypeError):
            # TypeError/IndexError: the path runs through something that is
            # not a mapping; treat it like a missing key.
            return default
    return value
@ -269,12 +199,17 @@ def fail_import(upload, error_code):
upload.import_details = {"error_code": error_code}
upload.import_date = timezone.now()
upload.save(update_fields=["import_details", "import_status", "import_date"])
signals.upload_import_status_updated.send(
old_status=old_status,
new_status=upload.import_status,
upload=upload,
sender=None,
broadcast = getter(
upload.import_metadata, "funkwhale", "config", "broadcast", default=True
)
if broadcast:
signals.upload_import_status_updated.send(
old_status=old_status,
new_status=upload.import_status,
upload=upload,
sender=None,
)
@celery.app.task(name="music.process_upload")
@ -285,22 +220,29 @@ def fail_import(upload, error_code):
"upload",
)
# Resolve the track an upload belongs to from its import metadata (combined
# with the audio file's own metadata when a file is available), then send the
# import-status signal and dispatch the outbox activity.
# NOTE(review): this span mixes what look like removed and added diff lines
# (both `data = ...` and `import_metadata = ...`, two variants of the
# `except Exception` handler, two variants of the signal/dispatch calls), has
# no indentation, and is cut by a hunk marker part-way through. Confirm every
# statement against the real file before editing.
#
# Visible opt-outs: the status signal and the outbox dispatch are each guarded
# by a flag read with getter() from import_metadata["funkwhale"]["config"]
# ("broadcast" and "dispatch_outbox"); both fall back to True when absent.
def process_upload(upload):
data = upload.import_metadata or {}
import_metadata = upload.import_metadata or {}
old_status = upload.import_status
audio_file = upload.get_audio_file()
try:
track = get_track_from_import_metadata(upload.import_metadata or {})
if not track and upload.audio_file:
# easy ways did not work. Now we have to be smart and use
# metadata from the file itself if any
track = import_track_data_from_file(upload.audio_file.file, hints=data)
if not track and upload.metadata:
# we can try to import using federation metadata
track = import_track_from_remote(upload.metadata)
additional_data = {}
if not audio_file:
# we can only rely on user provided data
final_metadata = import_metadata
else:
# we use user provided data and data from the file itself
m = metadata.Metadata(audio_file)
file_metadata = m.all()
# ChainMap searches its maps left to right, so additional_data overrides
# import_metadata, which overrides the values read from the file. It keeps
# a reference to additional_data, so the keys added just below are visible
# through final_metadata.
final_metadata = collections.ChainMap(
additional_data, import_metadata, file_metadata
)
additional_data["cover_data"] = m.get_picture("cover_front")
additional_data["upload_source"] = upload.source
track = get_track_from_import_metadata(final_metadata)
except UploadImportError as e:
return fail_import(upload, e.code)
except Exception:
fail_import(upload, "unknown_error")
raise
return fail_import(upload, "unknown_error")
# under some situations, we want to skip the import (
# for instance if the user already owns the files)
owned_duplicates = get_owned_duplicates(upload, track)
@ -342,33 +284,69 @@ def process_upload(upload):
"bitrate",
]
)
signals.upload_import_status_updated.send(
old_status=old_status,
new_status=upload.import_status,
upload=upload,
sender=None,
broadcast = getter(
import_metadata, "funkwhale", "config", "broadcast", default=True
)
routes.outbox.dispatch(
{"type": "Create", "object": {"type": "Audio"}}, context={"upload": upload}
if broadcast:
signals.upload_import_status_updated.send(
old_status=old_status,
new_status=upload.import_status,
upload=upload,
sender=None,
)
dispatch_outbox = getter(
import_metadata, "funkwhale", "config", "dispatch_outbox", default=True
)
if not track.album.cover:
update_album_cover(track.album, upload)
if dispatch_outbox:
routes.outbox.dispatch(
{"type": "Create", "object": {"type": "Audio"}}, context={"upload": upload}
)
# NOTE(review): two definitions are interleaved here without indentation:
# a `get_track_from_import_metadata(data)` variant that looks tracks up by
# mbid/uuid, and `federation_audio_track_to_metadata(payload)`. This looks
# like a flattened diff; confirm which lines are live.
#
# federation_audio_track_to_metadata flattens a federation track payload into
# the flat keys that get_track_from_import_metadata reads elsewhere in this
# file (title, album, artist, musicbrainz_* ids, *_fid ids, *_fdate dates and
# an optional cover_data entry). MusicBrainz ids are converted with str() when
# present and set to None otherwise. It indexes payload["artists"][0] and
# payload["album"]["artists"][0], so each list needs at least one entry.
def get_track_from_import_metadata(data):
track_mbid = getter(data, "track", "mbid")
track_uuid = getter(data, "track", "uuid")
def federation_audio_track_to_metadata(payload):
"""
Given a valid payload as returned by federation.serializers.TrackSerializer.validated_data,
returns a correct metadata payload for use with get_track_from_import_metadata.
"""
musicbrainz_recordingid = payload.get("musicbrainzId")
musicbrainz_artistid = payload["artists"][0].get("musicbrainzId")
musicbrainz_albumartistid = payload["album"]["artists"][0].get("musicbrainzId")
musicbrainz_albumid = payload["album"].get("musicbrainzId")
if track_mbid:
# easiest case: there is a MBID provided in the import_metadata
return models.Track.get_or_create_from_api(mbid=track_mbid)[0]
if track_uuid:
# another easy case, we have a reference to a uuid of a track that
# already exists in our database
try:
return models.Track.objects.get(uuid=track_uuid)
except models.Track.DoesNotExist:
raise UploadImportError(code="track_uuid_not_found")
new_data = {
"title": payload["name"],
"album": payload["album"]["name"],
"track_number": payload["position"],
"artist": payload["artists"][0]["name"],
"album_artist": payload["album"]["artists"][0]["name"],
"date": payload["album"].get("released"),
# musicbrainz
"musicbrainz_recordingid": str(musicbrainz_recordingid)
if musicbrainz_recordingid
else None,
"musicbrainz_artistid": str(musicbrainz_artistid)
if musicbrainz_artistid
else None,
"musicbrainz_albumartistid": str(musicbrainz_albumartistid)
if musicbrainz_albumartistid
else None,
"musicbrainz_albumid": str(musicbrainz_albumid)
if musicbrainz_albumid
else None,
# federation
"fid": payload["id"],
"artist_fid": payload["artists"][0]["id"],
"album_artist_fid": payload["album"]["artists"][0]["id"],
"album_fid": payload["album"]["id"],
"fdate": payload["published"],
"album_fdate": payload["album"]["published"],
"album_artist_fdate": payload["album"]["artists"][0]["published"],
"artist_fdate": payload["artists"][0]["published"],
}
# The cover is optional; when present only its media type and URL are kept.
cover = payload["album"].get("cover")
if cover:
new_data["cover_data"] = {"mimetype": cover["mediaType"], "url": cover["href"]}
return new_data
def get_owned_duplicates(upload, track):
@ -385,45 +363,191 @@ def get_owned_duplicates(upload, track):
)
def get_best_candidate_or_create(model, query, defaults, sort_fields):
    """
    Fetch-or-create that tolerates several rows matching ``query``.

    Rows of ``model`` matching ``query`` are ranked with ``sort_candidates``
    on ``sort_fields`` and the top one is returned. When nothing matches, a
    new row is created from ``defaults``.

    Returns an ``(instance, created)`` tuple, like ``queryset.get_or_create()``.
    """
    matches = model.objects.filter(query)
    if not matches:
        created_instance = model.objects.create(**defaults)
        return created_instance, True

    ranked = sort_candidates(matches, sort_fields)
    return ranked[0], False
def sort_candidates(candidates, important_fields):
    """
    Given a list of objects and a list of fields,
    will return a sorted list of those objects by score.

    Score is higher for objects that have a non-empty attribute
    that is also present in important fields::

        artist1 = Artist(mbid=None, fid=None)
        artist2 = Artist(mbid="something", fid=None)

        # artist2 has a mbid, so is sorted first
        assert sort_candidates([artist1, artist2], ['mbid'])[0] == artist2

    Fields are weighted by their position in ``important_fields``: the first
    field weighs the most, the last one weighs 1. A candidate's score is the
    sum of the weights of its non-empty fields; a missing attribute counts as
    empty. Candidates with equal scores come out in reverse input order.

    Only supports string fields.
    """
    # Map each field to its score, giving a higher score to first fields.
    # The caller's order is what matters here, not the alphabetical one.
    fields_scores = {
        field: weight
        for weight, field in enumerate(reversed(list(important_fields)), start=1)
    }

    candidates_with_scores = [
        (
            candidate,
            sum(
                score
                for field, score in fields_scores.items()
                if getattr(candidate, field, "")
            ),
        )
        for candidate in candidates
    ]

    ranked = sorted(candidates_with_scores, key=lambda pair: pair[1])
    return [candidate for candidate, _ in reversed(ranked)]
# Resolve, or create, the Track described by `data`, a mapping of import
# metadata. The decorator makes the enclosed database work one transaction.
# NOTE(review): this span interleaves, without indentation, what look like
# removed lines of `import_track_data_from_file(file, hints={})` and added
# lines of `get_track_from_import_metadata(data)`. Confirm which statements
# are live before editing.
#
# Visible flow of get_track_from_import_metadata:
# 1. data["funkwhale"]["track"]["uuid"], when set, must match an existing
#    Track; otherwise UploadImportError(code="track_uuid_not_found") is
#    raised. The album cover is filled in if the album has none.
# 2. MusicBrainz recording/release ids and the federation id (`fid`) are
#    combined into Q lookups; matches are ranked with sort_candidates().
# 3. Otherwise artist, album artist, album and track are each resolved with
#    get_best_candidate_or_create(), ranked on ["mbid", "fid"].
# data["artist"], data["album"] and data["title"] are read with plain
# subscripts, so they raise KeyError when absent.
@transaction.atomic
def import_track_data_from_file(file, hints={}):
data = metadata.Metadata(file)
album = None
def get_track_from_import_metadata(data):
track_uuid = getter(data, "funkwhale", "track", "uuid")
if track_uuid:
# easy case, we have a reference to a uuid of a track that
# already exists in our database
try:
track = models.Track.objects.get(uuid=track_uuid)
except models.Track.DoesNotExist:
raise UploadImportError(code="track_uuid_not_found")
if not track.album.cover:
update_album_cover(
track.album,
source=data.get("upload_source"),
cover_data=data.get("cover_data"),
)
return track
from_activity_id = data.get("from_activity_id", None)
track_mbid = data.get("musicbrainz_recordingid", None)
album_mbid = data.get("musicbrainz_albumid", None)
track_fid = getter(data, "fid")
query = None
if album_mbid and track_mbid:
# to gain performance and avoid additional mb lookups,
# we import from the release data, which is already cached
return models.Track.get_or_create_from_release(album_mbid, track_mbid)[0]
elif track_mbid:
return models.Track.get_or_create_from_api(track_mbid)[0]
elif album_mbid:
album = models.Album.get_or_create_from_api(album_mbid)[0]
query = Q(mbid=track_mbid, album__mbid=album_mbid)
artist = album.artist if album else None
if track_fid:
query = query | Q(fid=track_fid) if query else Q(fid=track_fid)
if query:
# second easy case: we have a (track_mbid, album_mbid) pair or
# a federation uuid we can check on
try:
return sort_candidates(models.Track.objects.filter(query), ["mbid", "fid"])[
0
]
except IndexError:
pass
# get / create artist and album artist
artist_mbid = data.get("musicbrainz_artistid", None)
if not artist:
if artist_mbid:
artist = models.Artist.get_or_create_from_api(artist_mbid)[0]
else:
artist = models.Artist.objects.get_or_create(
name__iexact=data.get("artist"), defaults={"name": data.get("artist")}
)[0]
artist_fid = data.get("artist_fid", None)
artist_name = data["artist"]
query = Q(name__iexact=artist_name)
if artist_mbid:
query |= Q(mbid=artist_mbid)
if artist_fid:
query |= Q(fid=artist_fid)
defaults = {
"name": artist_name,
"mbid": artist_mbid,
"fid": artist_fid,
"from_activity_id": from_activity_id,
}
if data.get("artist_fdate"):
defaults["creation_date"] = data.get("artist_fdate")
release_date = data.get("date", default=None)
if not album:
album = models.Album.objects.get_or_create(
title__iexact=data.get("album"),
artist=artist,
defaults={"title": data.get("album"), "release_date": release_date},
)[0]
position = data.get("track_number", default=None)
track = models.Track.objects.get_or_create(
title__iexact=data.get("title"),
album=album,
defaults={"title": data.get("title"), "position": position},
artist = get_best_candidate_or_create(
models.Artist, query, defaults=defaults, sort_fields=["mbid", "fid"]
)[0]
# The album artist falls back to the track artist's name; when both names are
# equal the already-resolved artist row is reused instead of a second lookup.
album_artist_name = data.get("album_artist", artist_name)
if album_artist_name == artist_name:
album_artist = artist
else:
query = Q(name__iexact=album_artist_name)
album_artist_mbid = data.get("musicbrainz_albumartistid", None)
album_artist_fid = data.get("album_artist_fid", None)
if album_artist_mbid:
query |= Q(mbid=album_artist_mbid)
if album_artist_fid:
query |= Q(fid=album_artist_fid)
defaults = {
"name": album_artist_name,
"mbid": album_artist_mbid,
"fid": album_artist_fid,
"from_activity_id": from_activity_id,
}
if data.get("album_artist_fdate"):
defaults["creation_date"] = data.get("album_artist_fdate")
album_artist = get_best_candidate_or_create(
models.Artist, query, defaults=defaults, sort_fields=["mbid", "fid"]
)[0]
# get / create album
album_title = data["album"]
album_fid = data.get("album_fid", None)
query = Q(title__iexact=album_title, artist=album_artist)
if album_mbid:
query |= Q(mbid=album_mbid)
if album_fid:
query |= Q(fid=album_fid)
defaults = {
"title": album_title,
"artist": album_artist,
"mbid": album_mbid,
"release_date": data.get("date"),
"fid": album_fid,
"from_activity_id": from_activity_id,
}
if data.get("album_fdate"):
defaults["creation_date"] = data.get("album_fdate")
album = get_best_candidate_or_create(
models.Album, query, defaults=defaults, sort_fields=["mbid", "fid"]
)[0]
if not album.cover:
update_album_cover(
album, source=data.get("upload_source"), cover_data=data.get("cover_data")
)
# get / create track
track_title = data["title"]
# The track position defaults to 1 when no track number is provided.
track_number = data.get("track_number", 1)
query = Q(title__iexact=track_title, artist=artist, album=album)
if track_mbid:
query |= Q(mbid=track_mbid)
if track_fid:
query |= Q(fid=track_fid)
defaults = {
"title": track_title,
"album": album,
"mbid": track_mbid,
"artist": artist,
"position": track_number,
"fid": track_fid,
"from_activity_id": from_activity_id,
}
if data.get("fdate"):
defaults["creation_date"] = data.get("fdate")
track = get_best_candidate_or_create(
models.Track, query, defaults=defaults, sort_fields=["mbid", "fid"]
)[0]
return track
@ -432,6 +556,7 @@ def broadcast_import_status_update_to_owner(old_status, new_status, upload, **kw
user = upload.library.actor.get_user()
if not user:
return
group = "user.{}.imports".format(user.pk)
channels.group_send(
group,