fix(agents): deduplicate mismatched songs in similar songs matching (#4956)
* feat(agents): enhance song matching by removing unwanted duplicates while preserving identical entries Signed-off-by: Deluan <deluan@navidrome.org> * refactor: consolidate duplicate checks Signed-off-by: Deluan <deluan@navidrome.org> --------- Signed-off-by: Deluan <deluan@navidrome.org>
This commit is contained in:
Vendored
+57
-28
@@ -119,17 +119,24 @@ func (e *provider) matchSongsToLibrary(ctx context.Context, songs []agents.Song,
|
||||
// songMatchedIn checks if a song has already been matched in any of the provided match maps.
|
||||
// It checks the song's ID, MBID, and ISRC fields against the corresponding map keys.
|
||||
func songMatchedIn(s agents.Song, priorMatches ...map[string]model.MediaFile) bool {
|
||||
_, found := lookupByIdentifiers(s, priorMatches...)
|
||||
return found
|
||||
}
|
||||
|
||||
// lookupByIdentifiers searches for a song's identifiers (ID, MBID, ISRC) in the provided maps.
|
||||
// Returns the first matching MediaFile found and true, or an empty MediaFile and false if no match.
|
||||
func lookupByIdentifiers(s agents.Song, maps ...map[string]model.MediaFile) (model.MediaFile, bool) {
|
||||
keys := []string{s.ID, s.MBID, s.ISRC}
|
||||
for _, m := range priorMatches {
|
||||
for _, m := range maps {
|
||||
for _, key := range keys {
|
||||
if key != "" {
|
||||
if mf, ok := m[key]; ok && mf.ID != "" {
|
||||
return true
|
||||
return mf, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
return model.MediaFile{}, false
|
||||
}
|
||||
|
||||
// loadTracksByID fetches MediaFiles from the library using direct ID matching.
|
||||
@@ -405,6 +412,9 @@ func (e *provider) findBestMatch(q songQuery, tracks model.MediaFiles, threshold
|
||||
return bestMatch, found
|
||||
}
|
||||
|
||||
// buildTitleQueries converts agent songs into normalized songQuery structs for title+artist matching.
|
||||
// It skips songs that have already been matched in prior phases (by ID, MBID, or ISRC) and sanitizes
|
||||
// all string fields for consistent comparison (lowercase, diacritics removed, articles stripped from artist names).
|
||||
func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[string]model.MediaFile) []songQuery {
|
||||
var queries []songQuery
|
||||
for _, s := range songs {
|
||||
@@ -423,42 +433,61 @@ func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[st
|
||||
return queries
|
||||
}
|
||||
|
||||
// selectBestMatchingSongs assembles the final result by mapping input songs to their best matching
|
||||
// library tracks. It iterates through the input songs in order and selects the first available match
|
||||
// using priority order: ID > MBID > ISRC > title+artist.
|
||||
//
|
||||
// The function also handles deduplication: when multiple different input songs would match the same
|
||||
// library track (e.g., "Song (Live)" and "Song (Remastered)" both matching "Song (Live)" in the library),
|
||||
// only the first match is kept. However, if the same input song appears multiple times (intentional
|
||||
// repetition), duplicates are preserved in the output.
|
||||
//
|
||||
// Returns up to 'count' MediaFiles, preserving the input order. Songs that cannot be matched are skipped.
|
||||
func (e *provider) selectBestMatchingSongs(songs []agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile, count int) model.MediaFiles {
|
||||
var mfs model.MediaFiles
|
||||
mfs := make(model.MediaFiles, 0, len(songs))
|
||||
// Track MediaFile.ID -> input song that added it, for deduplication
|
||||
addedBy := make(map[string]agents.Song, len(songs))
|
||||
|
||||
for _, t := range songs {
|
||||
if len(mfs) == count {
|
||||
break
|
||||
}
|
||||
// Try ID match first
|
||||
if t.ID != "" {
|
||||
if mf, ok := byID[t.ID]; ok {
|
||||
mfs = append(mfs, mf)
|
||||
continue
|
||||
|
||||
mf, found := findMatchingTrack(t, byID, byMBID, byISRC, byTitleArtist)
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for duplicate library track
|
||||
if prevSong, alreadyAdded := addedBy[mf.ID]; alreadyAdded {
|
||||
// Only add duplicate if input songs are identical
|
||||
if t != prevSong {
|
||||
continue // Different input songs → skip mismatch-induced duplicate
|
||||
}
|
||||
} else {
|
||||
addedBy[mf.ID] = t
|
||||
}
|
||||
// Try MBID match second
|
||||
if t.MBID != "" {
|
||||
if mf, ok := byMBID[t.MBID]; ok {
|
||||
mfs = append(mfs, mf)
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Try ISRC match third
|
||||
if t.ISRC != "" {
|
||||
if mf, ok := byISRC[t.ISRC]; ok {
|
||||
mfs = append(mfs, mf)
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Fall back to title+artist match (composite key preserves duplicate titles)
|
||||
key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist)
|
||||
if mf, ok := byTitleArtist[key]; ok {
|
||||
mfs = append(mfs, mf)
|
||||
}
|
||||
|
||||
mfs = append(mfs, mf)
|
||||
}
|
||||
return mfs
|
||||
}
|
||||
|
||||
// findMatchingTrack looks up a song in the match maps using priority order: ID > MBID > ISRC > title+artist.
|
||||
// Returns the matched MediaFile and true if found, or an empty MediaFile and false if no match exists.
|
||||
func findMatchingTrack(t agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile) (model.MediaFile, bool) {
|
||||
// Try identifier-based matches first (ID, MBID, ISRC)
|
||||
if mf, found := lookupByIdentifiers(t, byID, byMBID, byISRC); found {
|
||||
return mf, true
|
||||
}
|
||||
// Fall back to title+artist fuzzy match
|
||||
key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist)
|
||||
if mf, ok := byTitleArtist[key]; ok {
|
||||
return mf, true
|
||||
}
|
||||
return model.MediaFile{}, false
|
||||
}
|
||||
|
||||
// similarityRatio calculates the similarity between two strings using Jaro-Winkler algorithm.
|
||||
// Returns a value between 0.0 (completely different) and 1.0 (identical).
|
||||
// Jaro-Winkler is well-suited for matching song titles because it gives higher scores
|
||||
|
||||
+135
@@ -624,4 +624,139 @@ var _ = Describe("Provider - Song Matching", func() {
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Describe("Deduplication of mismatched songs", func() {
|
||||
var track model.MediaFile
|
||||
|
||||
BeforeEach(func() {
|
||||
DeferCleanup(configtest.SetupConfig())
|
||||
conf.Server.SimilarSongsMatchThreshold = 85 // Allow fuzzy matching
|
||||
|
||||
track = model.MediaFile{ID: "track-1", Title: "Test Track", Artist: "Test Artist"}
|
||||
|
||||
// Setup for GetEntityByID to return the track
|
||||
artistRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once()
|
||||
albumRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once()
|
||||
mediaFileRepo.On("Get", "track-1").Return(&track, nil).Once()
|
||||
})
|
||||
|
||||
It("removes duplicates when different input songs match the same library track", func() {
|
||||
// Agent returns two different versions that will both fuzzy-match to the same library track
|
||||
returnedSongs := []agents.Song{
|
||||
{Name: "Bohemian Rhapsody (Live)", Artist: "Queen"},
|
||||
{Name: "Bohemian Rhapsody (Original Mix)", Artist: "Queen"},
|
||||
}
|
||||
// Library only has one version
|
||||
libraryTrack := model.MediaFile{
|
||||
ID: "br-live", Title: "Bohemian Rhapsody (Live)", Artist: "Queen",
|
||||
}
|
||||
|
||||
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
|
||||
|
||||
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// Should only return one track, not two duplicates
|
||||
Expect(songs).To(HaveLen(1))
|
||||
Expect(songs[0].ID).To(Equal("br-live"))
|
||||
})
|
||||
|
||||
It("preserves duplicates when identical input songs match the same library track", func() {
|
||||
// Agent returns the exact same song twice (intentional repetition)
|
||||
returnedSongs := []agents.Song{
|
||||
{Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"},
|
||||
{Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"},
|
||||
}
|
||||
// Library has matching track
|
||||
libraryTrack := model.MediaFile{
|
||||
ID: "br", Title: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera",
|
||||
}
|
||||
|
||||
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
|
||||
|
||||
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// Should return two tracks since input songs were identical
|
||||
Expect(songs).To(HaveLen(2))
|
||||
Expect(songs[0].ID).To(Equal("br"))
|
||||
Expect(songs[1].ID).To(Equal("br"))
|
||||
})
|
||||
|
||||
It("handles mixed scenario with both identical and different input songs", func() {
|
||||
// Agent returns: Song A, Song B (different from A), Song A again (same as first)
|
||||
// All three match to the same library track
|
||||
returnedSongs := []agents.Song{
|
||||
{Name: "Yesterday", Artist: "The Beatles", Album: "Help!"},
|
||||
{Name: "Yesterday (Remastered)", Artist: "The Beatles", Album: "1"}, // Different version
|
||||
{Name: "Yesterday", Artist: "The Beatles", Album: "Help!"}, // Same as first
|
||||
{Name: "Yesterday (Anthology)", Artist: "The Beatles", Album: "Anthology"}, // Another different version
|
||||
}
|
||||
// Library only has one version
|
||||
libraryTrack := model.MediaFile{
|
||||
ID: "yesterday", Title: "Yesterday", Artist: "The Beatles", Album: "Help!",
|
||||
}
|
||||
|
||||
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
|
||||
|
||||
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// Should return 2 tracks:
|
||||
// 1. First "Yesterday" (original)
|
||||
// 2. Third "Yesterday" (same as first, so kept)
|
||||
// Skip: Second "Yesterday (Remastered)" (different input, same library track)
|
||||
// Skip: Fourth "Yesterday (Anthology)" (different input, same library track)
|
||||
Expect(songs).To(HaveLen(2))
|
||||
Expect(songs[0].ID).To(Equal("yesterday"))
|
||||
Expect(songs[1].ID).To(Equal("yesterday"))
|
||||
})
|
||||
|
||||
It("does not deduplicate songs that match different library tracks", func() {
|
||||
// Agent returns different songs that match different library tracks
|
||||
returnedSongs := []agents.Song{
|
||||
{Name: "Song A", Artist: "Artist"},
|
||||
{Name: "Song B", Artist: "Artist"},
|
||||
{Name: "Song C", Artist: "Artist"},
|
||||
}
|
||||
// Library has all three songs
|
||||
trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"}
|
||||
trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"}
|
||||
trackC := model.MediaFile{ID: "track-c", Title: "Song C", Artist: "Artist"}
|
||||
|
||||
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB, trackC})
|
||||
|
||||
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// All three should be returned since they match different library tracks
|
||||
Expect(songs).To(HaveLen(3))
|
||||
Expect(songs[0].ID).To(Equal("track-a"))
|
||||
Expect(songs[1].ID).To(Equal("track-b"))
|
||||
Expect(songs[2].ID).To(Equal("track-c"))
|
||||
})
|
||||
|
||||
It("respects count limit after deduplication", func() {
|
||||
// Agent returns 4 songs: 2 unique + 2 that would create duplicates
|
||||
returnedSongs := []agents.Song{
|
||||
{Name: "Song A", Artist: "Artist"},
|
||||
{Name: "Song A (Live)", Artist: "Artist"}, // Different, matches same track
|
||||
{Name: "Song B", Artist: "Artist"},
|
||||
{Name: "Song B (Remix)", Artist: "Artist"}, // Different, matches same track
|
||||
}
|
||||
trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"}
|
||||
trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"}
|
||||
|
||||
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB})
|
||||
|
||||
// Request only 2 songs
|
||||
songs, err := provider.SimilarSongs(ctx, "track-1", 2)
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
// Should return exactly 2: Song A and Song B (skipping duplicates)
|
||||
Expect(songs).To(HaveLen(2))
|
||||
Expect(songs[0].ID).To(Equal("track-a"))
|
||||
Expect(songs[1].ID).To(Equal("track-b"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user