Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow to configure which fields are used to find duplicates #4199

Merged
merged 11 commits on
Aug 21, 2022
2 changes: 1 addition & 1 deletion beets/autotag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
TrackMatch,
Distance,
)
from .match import tag_item, tag_album, Proposal # noqa
from .match import tag_item, tag_album, current_metadata, Proposal # noqa
from .match import Recommendation # noqa

# Global logger.
Expand Down
3 changes: 3 additions & 0 deletions beets/config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ import:
group_albums: no
pretend: false
search_ids: []
duplicate_keys:
album: albumartist album
item: artist title
duplicate_action: ask
bell: no
set_fields: {}
Expand Down
21 changes: 20 additions & 1 deletion beets/dbcore/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from beets.util import functemplate
from beets.util import py3_path
from beets.dbcore import types
from .query import MatchQuery, NullSort, TrueQuery
from .query import MatchQuery, NullSort, TrueQuery, AndQuery
from collections.abc import Mapping


Expand Down Expand Up @@ -641,6 +641,25 @@ def set_parse(self, key, string):
"""
self[key] = self._parse(key, string)

# Convenient queries.

@classmethod
def field_query(cls, field, pattern, query_cls=MatchQuery):
    """Build a `FieldQuery` for one field of this model.

    The query's third argument flags whether `field` is one of the
    model's fixed fields, which lets fast (SQL-backed) query classes
    use the database column directly.
    """
    is_fixed = field in cls._fields
    return query_cls(field, pattern, is_fixed)

@classmethod
def all_fields_query(cls, pats, query_cls=MatchQuery):
    """Get a query that matches many fields with different patterns.

    `pats` should be a mapping from field names to patterns. The
    resulting query is a conjunction ("and") of per-field queries
    for all of these field/pattern pairs.
    """
    subqueries = []
    for field, pattern in pats.items():
        subqueries.append(cls.field_query(field, pattern, query_cls))
    return AndQuery(subqueries)


# Database controller and supporting interfaces.

Expand Down
71 changes: 45 additions & 26 deletions beets/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,17 +521,18 @@ def skip(self):

# Convenient data.

def chosen_info(self):
    """Return a dictionary of metadata about the current choice.

    May only be called when the choice flag is ASIS or RETAG
    (in which case the data comes from the files' current metadata)
    or APPLY (in which case the data comes from the choice).
    """
    if self.choice_flag in (action.ASIS, action.RETAG):
        # Derive the most likely values from the files themselves.
        likelies, consensus = autotag.current_metadata(self.items)
        return likelies
    elif self.choice_flag is action.APPLY:
        # Copy so callers can mutate the dict without affecting the match.
        return self.match.info.copy()
    assert False

def imported_items(self):
"""Return a list of Items that should be added to the library.
Expand Down Expand Up @@ -667,26 +668,34 @@ def find_duplicates(self, lib):
"""Return a list of albums from `lib` with the same artist and
album name as the task.
"""
artist, album = self.chosen_ident()
info = self.chosen_info()
info['albumartist'] = info['artist']

if artist is None:
if info['artist'] is None:
# As-is import with no artist. Skip check.
return []

duplicates = []
# Construct a query to find duplicates with this metadata. We
# use a temporary Album object to generate any computed fields.
tmp_album = library.Album(lib, **info)
keys = config['import']['duplicate_keys']['album'].as_str_seq()
dup_query = library.Album.all_fields_query({
key: tmp_album.get(key)
for key in keys
})

# Don't count albums with the same files as duplicates.
task_paths = {i.path for i in self.items if i}
duplicate_query = dbcore.AndQuery((
dbcore.MatchQuery('albumartist', artist),
dbcore.MatchQuery('album', album),
))

for album in lib.albums(duplicate_query):
duplicates = []
for album in lib.albums(dup_query):
# Check whether the album paths are all present in the task
# i.e. album is being completely re-imported by the task,
# in which case it is not a duplicate (will be replaced).
album_paths = {i.path for i in album.items()}
if not (album_paths <= task_paths):
duplicates.append(album)

return duplicates

def align_album_level_fields(self):
Expand Down Expand Up @@ -892,12 +901,17 @@ def __init__(self, toppath, item):
self.is_album = False
self.paths = [item.path]

def chosen_info(self):
    """Return a dictionary of metadata about the current choice.

    May only be called when the choice flag is ASIS or RETAG
    (in which case the data comes from the file's current metadata)
    or APPLY (in which case the data comes from the choice).
    """
    assert self.choice_flag in (action.ASIS, action.RETAG, action.APPLY)
    if self.choice_flag in (action.ASIS, action.RETAG):
        # The item's own tags are the chosen metadata.
        return dict(self.item)
    elif self.choice_flag is action.APPLY:
        # Copy so callers can mutate the dict without affecting the match.
        return self.match.info.copy()

def imported_items(self):
return [self.item]
Expand All @@ -918,14 +932,19 @@ def find_duplicates(self, lib):
"""Return a list of items from `lib` that have the same artist
and title as the task.
"""
artist, title = self.chosen_ident()
info = self.chosen_info()

# Query for existing items using the same metadata. We use a
# temporary `Item` object to generate any computed fields.
tmp_item = library.Item(lib, **info)
keys = config['import']['duplicate_keys']['item'].as_str_seq()
dup_query = library.Album.all_fields_query({
key: tmp_item.get(key)
for key in keys
})

found_items = []
query = dbcore.AndQuery((
dbcore.MatchQuery('artist', artist),
dbcore.MatchQuery('title', title),
))
for other_item in lib.items(query):
for other_item in lib.items(dup_query):
# Existing items not considered duplicates.
if other_item.path != self.item.path:
found_items.append(other_item)
Expand Down
4 changes: 4 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ New features:
* :doc:`/plugins/kodiupdate`: Now supports multiple kodi instances
:bug:`4101`
* Add the item fields ``bitrate_mode``, ``encoder_info`` and ``encoder_settings``.
* Add query prefixes ``=`` and ``=~``.
* A new configuration option, :ref:`duplicate_keys`, lets you change which
fields the beets importer uses to identify duplicates.
:bug:`1133` :bug:`4199`
* Add :ref:`exact match <exact-match>` queries, using the prefixes ``=`` and
``=~``.
:bug:`4251`
Expand Down
16 changes: 16 additions & 0 deletions docs/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,22 @@ with the ``-a`` flag to the :ref:`import-cmd` command.)

Default: ``yes``.

.. _duplicate_keys:

duplicate_keys
~~~~~~~~~~~~~~

The fields used to find duplicates when importing.
There are two sub-values here: ``album`` and ``item``.
Each one is a list of field names; if an existing object (album or item) in
the library matches the new object on all of these fields, the importer will
consider it a duplicate.

Default::

album: albumartist album
item: artist title

.. _duplicate_action:

duplicate_action
Expand Down
33 changes: 33 additions & 0 deletions test/test_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,7 @@ def test_album_info(*args, **kwargs):
tracks=[track_info],
album_id='albumid',
artist_id='artistid',
flex='flex',
)
return iter([album_info])

Expand All @@ -1251,6 +1252,7 @@ def setUp(self):
# Create import session
self.importer = self.create_importer()
config['import']['autotag'] = True
config['import']['duplicate_keys']['album'] = 'albumartist album'

def tearDown(self):
self.teardown_beets()
Expand Down Expand Up @@ -1320,6 +1322,24 @@ def test_merge_duplicate_album(self):
def test_twice_in_import_dir(self):
self.skipTest('write me')

def test_keep_when_extra_key_is_different(self):
    """An imported album matching an existing one on the default keys
    but differing in an extra configured key is not a duplicate.
    """
    config['import']['duplicate_keys']['album'] = 'albumartist album flex'

    item = self.lib.items().get()
    import_file = MediaFile(os.path.join(
        self.importer.paths[0], b'album 0', b'track 0.mp3'))
    import_file.artist = item['artist']
    import_file.albumartist = item['artist']
    import_file.album = item['album']
    import_file.title = item['title']
    import_file.flex = 'different'
    # MediaFile does not write tags on attribute assignment; without
    # save() the importer would read the file's original, unmodified
    # tags and the test would not exercise the `flex` mismatch.
    import_file.save()

    self.importer.default_resolution = self.importer.Resolution.SKIP
    self.importer.run()

    # Both albums are kept: the flex mismatch means no duplicate.
    self.assertEqual(len(self.lib.albums()), 2)
    self.assertEqual(len(self.lib.items()), 2)

def add_album_fixture(self, **kwargs):
# TODO move this into upstream
album = super().add_album_fixture()
Expand Down Expand Up @@ -1349,6 +1369,7 @@ def setUp(self):
self.importer = self.create_importer()
config['import']['autotag'] = True
config['import']['singletons'] = True
config['import']['duplicate_keys']['item'] = 'artist title'

def tearDown(self):
self.teardown_beets()
Expand Down Expand Up @@ -1385,6 +1406,18 @@ def test_skip_duplicate(self):
item = self.lib.items().get()
self.assertEqual(item.mb_trackid, 'old trackid')

def test_keep_when_extra_key_is_different(self):
    """A singleton matching an existing item on artist/title but
    differing in an extra configured key is not a duplicate.
    """
    config['import']['duplicate_keys']['item'] = 'artist title flex'

    existing = self.lib.items().get()
    existing.flex = 'different'
    existing.store()
    self.assertEqual(len(self.lib.items()), 1)

    self.importer.default_resolution = self.importer.Resolution.SKIP
    self.importer.run()

    # Both items remain: the flex mismatch means no duplicate.
    self.assertEqual(len(self.lib.items()), 2)

def test_twice_in_import_dir(self):
self.skipTest('write me')

Expand Down