fix(agents): deduplicate mismatched songs in similar songs matching (#4956)

* feat(agents): enhance song matching by removing unwanted duplicates while preserving identical entries

Signed-off-by: Deluan <deluan@navidrome.org>

* refactor: consolidate duplicate checks

Signed-off-by: Deluan <deluan@navidrome.org>

---------

Signed-off-by: Deluan <deluan@navidrome.org>
This commit is contained in:
Deluan Quintão
2026-01-30 15:25:00 +01:00
committed by GitHub
parent 7d5e13672d
commit 36252823ce
2 changed files with 192 additions and 28 deletions
+57 -28
View File
@@ -119,17 +119,24 @@ func (e *provider) matchSongsToLibrary(ctx context.Context, songs []agents.Song,
// songMatchedIn checks if a song has already been matched in any of the provided match maps.
// It checks the song's ID, MBID, and ISRC fields against the corresponding map keys.
func songMatchedIn(s agents.Song, priorMatches ...map[string]model.MediaFile) bool {
_, found := lookupByIdentifiers(s, priorMatches...)
return found
}
// lookupByIdentifiers searches for a song's identifiers (ID, MBID, ISRC) in the provided maps.
// Returns the first matching MediaFile found and true, or an empty MediaFile and false if no match.
func lookupByIdentifiers(s agents.Song, maps ...map[string]model.MediaFile) (model.MediaFile, bool) {
keys := []string{s.ID, s.MBID, s.ISRC}
for _, m := range priorMatches {
for _, m := range maps {
for _, key := range keys {
if key != "" {
if mf, ok := m[key]; ok && mf.ID != "" {
return true
return mf, true
}
}
}
}
return false
return model.MediaFile{}, false
}
// loadTracksByID fetches MediaFiles from the library using direct ID matching.
@@ -405,6 +412,9 @@ func (e *provider) findBestMatch(q songQuery, tracks model.MediaFiles, threshold
return bestMatch, found
}
// buildTitleQueries converts agent songs into normalized songQuery structs for title+artist matching.
// It skips songs that have already been matched in prior phases (by ID, MBID, or ISRC) and sanitizes
// all string fields for consistent comparison (lowercase, diacritics removed, articles stripped from artist names).
func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[string]model.MediaFile) []songQuery {
var queries []songQuery
for _, s := range songs {
@@ -423,42 +433,61 @@ func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[st
return queries
}
// selectBestMatchingSongs assembles the final result by mapping input songs to their best matching
// library tracks. It iterates through the input songs in order and selects the first available match
// using priority order: ID > MBID > ISRC > title+artist.
//
// The function also handles deduplication: when multiple different input songs would match the same
// library track (e.g., "Song (Live)" and "Song (Remastered)" both matching "Song (Live)" in the library),
// only the first match is kept. However, if the same input song appears multiple times (intentional
// repetition), duplicates are preserved in the output.
//
// Returns up to 'count' MediaFiles, preserving the input order. Songs that cannot be matched are skipped.
func (e *provider) selectBestMatchingSongs(songs []agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile, count int) model.MediaFiles {
var mfs model.MediaFiles
mfs := make(model.MediaFiles, 0, len(songs))
// Track MediaFile.ID -> input song that added it, for deduplication
addedBy := make(map[string]agents.Song, len(songs))
for _, t := range songs {
if len(mfs) == count {
break
}
// Try ID match first
if t.ID != "" {
if mf, ok := byID[t.ID]; ok {
mfs = append(mfs, mf)
continue
mf, found := findMatchingTrack(t, byID, byMBID, byISRC, byTitleArtist)
if !found {
continue
}
// Check for duplicate library track
if prevSong, alreadyAdded := addedBy[mf.ID]; alreadyAdded {
// Only add duplicate if input songs are identical
if t != prevSong {
continue // Different input songs → skip mismatch-induced duplicate
}
} else {
addedBy[mf.ID] = t
}
// Try MBID match second
if t.MBID != "" {
if mf, ok := byMBID[t.MBID]; ok {
mfs = append(mfs, mf)
continue
}
}
// Try ISRC match third
if t.ISRC != "" {
if mf, ok := byISRC[t.ISRC]; ok {
mfs = append(mfs, mf)
continue
}
}
// Fall back to title+artist match (composite key preserves duplicate titles)
key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist)
if mf, ok := byTitleArtist[key]; ok {
mfs = append(mfs, mf)
}
mfs = append(mfs, mf)
}
return mfs
}
// findMatchingTrack looks up a song in the match maps using priority order: ID > MBID > ISRC > title+artist.
// Returns the matched MediaFile and true if found, or an empty MediaFile and false if no match exists.
func findMatchingTrack(t agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile) (model.MediaFile, bool) {
// Try identifier-based matches first (ID, MBID, ISRC)
if mf, found := lookupByIdentifiers(t, byID, byMBID, byISRC); found {
return mf, true
}
// Fall back to title+artist fuzzy match
key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist)
if mf, ok := byTitleArtist[key]; ok {
return mf, true
}
return model.MediaFile{}, false
}
// similarityRatio calculates the similarity between two strings using Jaro-Winkler algorithm.
// Returns a value between 0.0 (completely different) and 1.0 (identical).
// Jaro-Winkler is well-suited for matching song titles because it gives higher scores
+135
View File
@@ -624,4 +624,139 @@ var _ = Describe("Provider - Song Matching", func() {
})
})
})
Describe("Deduplication of mismatched songs", func() {
var track model.MediaFile
BeforeEach(func() {
DeferCleanup(configtest.SetupConfig())
conf.Server.SimilarSongsMatchThreshold = 85 // Allow fuzzy matching
track = model.MediaFile{ID: "track-1", Title: "Test Track", Artist: "Test Artist"}
// Setup for GetEntityByID to return the track
artistRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once()
albumRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once()
mediaFileRepo.On("Get", "track-1").Return(&track, nil).Once()
})
It("removes duplicates when different input songs match the same library track", func() {
// Agent returns two different versions that will both fuzzy-match to the same library track
returnedSongs := []agents.Song{
{Name: "Bohemian Rhapsody (Live)", Artist: "Queen"},
{Name: "Bohemian Rhapsody (Original Mix)", Artist: "Queen"},
}
// Library only has one version
libraryTrack := model.MediaFile{
ID: "br-live", Title: "Bohemian Rhapsody (Live)", Artist: "Queen",
}
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
Expect(err).ToNot(HaveOccurred())
// Should only return one track, not two duplicates
Expect(songs).To(HaveLen(1))
Expect(songs[0].ID).To(Equal("br-live"))
})
It("preserves duplicates when identical input songs match the same library track", func() {
// Agent returns the exact same song twice (intentional repetition)
returnedSongs := []agents.Song{
{Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"},
{Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"},
}
// Library has matching track
libraryTrack := model.MediaFile{
ID: "br", Title: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera",
}
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
Expect(err).ToNot(HaveOccurred())
// Should return two tracks since input songs were identical
Expect(songs).To(HaveLen(2))
Expect(songs[0].ID).To(Equal("br"))
Expect(songs[1].ID).To(Equal("br"))
})
It("handles mixed scenario with both identical and different input songs", func() {
// Agent returns: Song A, Song B (different from A), Song A again (same as first)
// All three match to the same library track
returnedSongs := []agents.Song{
{Name: "Yesterday", Artist: "The Beatles", Album: "Help!"},
{Name: "Yesterday (Remastered)", Artist: "The Beatles", Album: "1"}, // Different version
{Name: "Yesterday", Artist: "The Beatles", Album: "Help!"}, // Same as first
{Name: "Yesterday (Anthology)", Artist: "The Beatles", Album: "Anthology"}, // Another different version
}
// Library only has one version
libraryTrack := model.MediaFile{
ID: "yesterday", Title: "Yesterday", Artist: "The Beatles", Album: "Help!",
}
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack})
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
Expect(err).ToNot(HaveOccurred())
// Should return 2 tracks:
// 1. First "Yesterday" (original)
// 2. Third "Yesterday" (same as first, so kept)
// Skip: Second "Yesterday (Remastered)" (different input, same library track)
// Skip: Fourth "Yesterday (Anthology)" (different input, same library track)
Expect(songs).To(HaveLen(2))
Expect(songs[0].ID).To(Equal("yesterday"))
Expect(songs[1].ID).To(Equal("yesterday"))
})
It("does not deduplicate songs that match different library tracks", func() {
// Agent returns different songs that match different library tracks
returnedSongs := []agents.Song{
{Name: "Song A", Artist: "Artist"},
{Name: "Song B", Artist: "Artist"},
{Name: "Song C", Artist: "Artist"},
}
// Library has all three songs
trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"}
trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"}
trackC := model.MediaFile{ID: "track-c", Title: "Song C", Artist: "Artist"}
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB, trackC})
songs, err := provider.SimilarSongs(ctx, "track-1", 5)
Expect(err).ToNot(HaveOccurred())
// All three should be returned since they match different library tracks
Expect(songs).To(HaveLen(3))
Expect(songs[0].ID).To(Equal("track-a"))
Expect(songs[1].ID).To(Equal("track-b"))
Expect(songs[2].ID).To(Equal("track-c"))
})
It("respects count limit after deduplication", func() {
// Agent returns 4 songs: 2 unique + 2 that would create duplicates
returnedSongs := []agents.Song{
{Name: "Song A", Artist: "Artist"},
{Name: "Song A (Live)", Artist: "Artist"}, // Different, matches same track
{Name: "Song B", Artist: "Artist"},
{Name: "Song B (Remix)", Artist: "Artist"}, // Different, matches same track
}
trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"}
trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"}
setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB})
// Request only 2 songs
songs, err := provider.SimilarSongs(ctx, "track-1", 2)
Expect(err).ToNot(HaveOccurred())
// Should return exactly 2: Song A and Song B (skipping duplicates)
Expect(songs).To(HaveLen(2))
Expect(songs[0].ID).To(Equal("track-a"))
Expect(songs[1].ID).To(Equal("track-b"))
})
})
})