From 36252823ce1c8c22db19090fa80db3c802f54c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Deluan=20Quint=C3=A3o?= Date: Fri, 30 Jan 2026 15:25:00 +0100 Subject: [PATCH] fix(agents): deduplicate mismatched songs in similar songs matching (#4956) * feat(agents): enhance song matching by removing unwanted duplicates while preserving identical entries Signed-off-by: Deluan * refactor: consolidate duplicate checks Signed-off-by: Deluan --------- Signed-off-by: Deluan --- core/external/provider_matching.go | 85 ++++++++++----- core/external/provider_matching_test.go | 135 ++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 28 deletions(-) diff --git a/core/external/provider_matching.go b/core/external/provider_matching.go index 8a6c6a09..74ad56d4 100644 --- a/core/external/provider_matching.go +++ b/core/external/provider_matching.go @@ -119,17 +119,24 @@ func (e *provider) matchSongsToLibrary(ctx context.Context, songs []agents.Song, // songMatchedIn checks if a song has already been matched in any of the provided match maps. // It checks the song's ID, MBID, and ISRC fields against the corresponding map keys. func songMatchedIn(s agents.Song, priorMatches ...map[string]model.MediaFile) bool { + _, found := lookupByIdentifiers(s, priorMatches...) + return found +} + +// lookupByIdentifiers searches for a song's identifiers (ID, MBID, ISRC) in the provided maps. +// Returns the first matching MediaFile found and true, or an empty MediaFile and false if no match. +func lookupByIdentifiers(s agents.Song, maps ...map[string]model.MediaFile) (model.MediaFile, bool) { keys := []string{s.ID, s.MBID, s.ISRC} - for _, m := range priorMatches { + for _, m := range maps { for _, key := range keys { if key != "" { if mf, ok := m[key]; ok && mf.ID != "" { - return true + return mf, true } } } } - return false + return model.MediaFile{}, false } // loadTracksByID fetches MediaFiles from the library using direct ID matching. @@ -405,6 +412,9 @@ func (e *provider) findBestMatch(q songQuery, tracks model.MediaFiles, threshold return bestMatch, found } +// buildTitleQueries converts agent songs into normalized songQuery structs for title+artist matching. +// It skips songs that have already been matched in prior phases (by ID, MBID, or ISRC) and sanitizes +// all string fields for consistent comparison (lowercase, diacritics removed, articles stripped from artist names). func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[string]model.MediaFile) []songQuery { var queries []songQuery for _, s := range songs { @@ -423,42 +433,61 @@ func (e *provider) buildTitleQueries(songs []agents.Song, priorMatches ...map[st return queries } +// selectBestMatchingSongs assembles the final result by mapping input songs to their best matching +// library tracks. It iterates through the input songs in order and selects the first available match +// using priority order: ID > MBID > ISRC > title+artist. +// +// The function also handles deduplication: when multiple different input songs would match the same +// library track (e.g., "Song (Live)" and "Song (Remastered)" both matching "Song (Live)" in the library), +// only the first match is kept. However, if the same input song appears multiple times (intentional +// repetition), duplicates are preserved in the output. +// +// Returns up to 'count' MediaFiles, preserving the input order. Songs that cannot be matched are skipped. func (e *provider) selectBestMatchingSongs(songs []agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile, count int) model.MediaFiles { - var mfs model.MediaFiles + mfs := make(model.MediaFiles, 0, len(songs)) + // Track MediaFile.ID -> input song that added it, for deduplication + addedBy := make(map[string]agents.Song, len(songs)) + for _, t := range songs { if len(mfs) == count { break } - // Try ID match first - if t.ID != "" { - if mf, ok := byID[t.ID]; ok { - mfs = append(mfs, mf) - continue + + mf, found := findMatchingTrack(t, byID, byMBID, byISRC, byTitleArtist) + if !found { + continue + } + + // Check for duplicate library track + if prevSong, alreadyAdded := addedBy[mf.ID]; alreadyAdded { + // Only add duplicate if input songs are identical + if t != prevSong { + continue // Different input songs → skip mismatch-induced duplicate } + } else { + addedBy[mf.ID] = t } - // Try MBID match second - if t.MBID != "" { - if mf, ok := byMBID[t.MBID]; ok { - mfs = append(mfs, mf) - continue - } - } - // Try ISRC match third - if t.ISRC != "" { - if mf, ok := byISRC[t.ISRC]; ok { - mfs = append(mfs, mf) - continue - } - } - // Fall back to title+artist match (composite key preserves duplicate titles) - key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist) - if mf, ok := byTitleArtist[key]; ok { - mfs = append(mfs, mf) - } + + mfs = append(mfs, mf) } return mfs } +// findMatchingTrack looks up a song in the match maps using priority order: ID > MBID > ISRC > title+artist. +// Returns the matched MediaFile and true if found, or an empty MediaFile and false if no match exists. +func findMatchingTrack(t agents.Song, byID, byMBID, byISRC, byTitleArtist map[string]model.MediaFile) (model.MediaFile, bool) { + // Try identifier-based matches first (ID, MBID, ISRC) + if mf, found := lookupByIdentifiers(t, byID, byMBID, byISRC); found { + return mf, true + } + // Fall back to title+artist fuzzy match + key := str.SanitizeFieldForSorting(t.Name) + "|" + str.SanitizeFieldForSortingNoArticle(t.Artist) + if mf, ok := byTitleArtist[key]; ok { + return mf, true + } + return model.MediaFile{}, false +} + // similarityRatio calculates the similarity between two strings using Jaro-Winkler algorithm. // Returns a value between 0.0 (completely different) and 1.0 (identical). // Jaro-Winkler is well-suited for matching song titles because it gives higher scores diff --git a/core/external/provider_matching_test.go b/core/external/provider_matching_test.go index 3a902cba..b3624ef3 100644 --- a/core/external/provider_matching_test.go +++ b/core/external/provider_matching_test.go @@ -624,4 +624,139 @@ var _ = Describe("Provider - Song Matching", func() { }) }) }) + + Describe("Deduplication of mismatched songs", func() { + var track model.MediaFile + + BeforeEach(func() { + DeferCleanup(configtest.SetupConfig()) + conf.Server.SimilarSongsMatchThreshold = 85 // Allow fuzzy matching + + track = model.MediaFile{ID: "track-1", Title: "Test Track", Artist: "Test Artist"} + + // Setup for GetEntityByID to return the track + artistRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once() + albumRepo.On("Get", "track-1").Return(nil, model.ErrNotFound).Once() + mediaFileRepo.On("Get", "track-1").Return(&track, nil).Once() + }) + + It("removes duplicates when different input songs match the same library track", func() { + // Agent returns two different versions that will both fuzzy-match to the same library track + returnedSongs := []agents.Song{ + {Name: "Bohemian Rhapsody (Live)", Artist: "Queen"}, + {Name: "Bohemian Rhapsody (Original Mix)", Artist: "Queen"}, + } + // Library only has one version + libraryTrack := model.MediaFile{ + ID: "br-live", Title: "Bohemian Rhapsody (Live)", Artist: "Queen", + } + + setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack}) + + songs, err := provider.SimilarSongs(ctx, "track-1", 5) + + Expect(err).ToNot(HaveOccurred()) + // Should only return one track, not two duplicates + Expect(songs).To(HaveLen(1)) + Expect(songs[0].ID).To(Equal("br-live")) + }) + + It("preserves duplicates when identical input songs match the same library track", func() { + // Agent returns the exact same song twice (intentional repetition) + returnedSongs := []agents.Song{ + {Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"}, + {Name: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera"}, + } + // Library has matching track + libraryTrack := model.MediaFile{ + ID: "br", Title: "Bohemian Rhapsody", Artist: "Queen", Album: "A Night at the Opera", + } + + setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack}) + + songs, err := provider.SimilarSongs(ctx, "track-1", 5) + + Expect(err).ToNot(HaveOccurred()) + // Should return two tracks since input songs were identical + Expect(songs).To(HaveLen(2)) + Expect(songs[0].ID).To(Equal("br")) + Expect(songs[1].ID).To(Equal("br")) + }) + + It("handles mixed scenario with both identical and different input songs", func() { + // Agent returns: Song A, Song B (different from A), Song A again (same as first) + // All three match to the same library track + returnedSongs := []agents.Song{ + {Name: "Yesterday", Artist: "The Beatles", Album: "Help!"}, + {Name: "Yesterday (Remastered)", Artist: "The Beatles", Album: "1"}, // Different version + {Name: "Yesterday", Artist: "The Beatles", Album: "Help!"}, // Same as first + {Name: "Yesterday (Anthology)", Artist: "The Beatles", Album: "Anthology"}, // Another different version + } + // Library only has one version + libraryTrack := model.MediaFile{ + ID: "yesterday", Title: "Yesterday", Artist: "The Beatles", Album: "Help!", + } + + setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{libraryTrack}) + + songs, err := provider.SimilarSongs(ctx, "track-1", 5) + + Expect(err).ToNot(HaveOccurred()) + // Should return 2 tracks: + // 1. First "Yesterday" (original) + // 2. Third "Yesterday" (same as first, so kept) + // Skip: Second "Yesterday (Remastered)" (different input, same library track) + // Skip: Fourth "Yesterday (Anthology)" (different input, same library track) + Expect(songs).To(HaveLen(2)) + Expect(songs[0].ID).To(Equal("yesterday")) + Expect(songs[1].ID).To(Equal("yesterday")) + }) + + It("does not deduplicate songs that match different library tracks", func() { + // Agent returns different songs that match different library tracks + returnedSongs := []agents.Song{ + {Name: "Song A", Artist: "Artist"}, + {Name: "Song B", Artist: "Artist"}, + {Name: "Song C", Artist: "Artist"}, + } + // Library has all three songs + trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"} + trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"} + trackC := model.MediaFile{ID: "track-c", Title: "Song C", Artist: "Artist"} + + setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB, trackC}) + + songs, err := provider.SimilarSongs(ctx, "track-1", 5) + + Expect(err).ToNot(HaveOccurred()) + // All three should be returned since they match different library tracks + Expect(songs).To(HaveLen(3)) + Expect(songs[0].ID).To(Equal("track-a")) + Expect(songs[1].ID).To(Equal("track-b")) + Expect(songs[2].ID).To(Equal("track-c")) + }) + + It("respects count limit after deduplication", func() { + // Agent returns 4 songs: 2 unique + 2 that would create duplicates + returnedSongs := []agents.Song{ + {Name: "Song A", Artist: "Artist"}, + {Name: "Song A (Live)", Artist: "Artist"}, // Different, matches same track + {Name: "Song B", Artist: "Artist"}, + {Name: "Song B (Remix)", Artist: "Artist"}, // Different, matches same track + } + trackA := model.MediaFile{ID: "track-a", Title: "Song A", Artist: "Artist"} + trackB := model.MediaFile{ID: "track-b", Title: "Song B", Artist: "Artist"} + + setupSimilarSongsExpectations(returnedSongs, model.MediaFiles{trackA, trackB}) + + // Request only 2 songs + songs, err := provider.SimilarSongs(ctx, "track-1", 2) + + Expect(err).ToNot(HaveOccurred()) + // Should return exactly 2: Song A and Song B (skipping duplicates) + Expect(songs).To(HaveLen(2)) + Expect(songs[0].ID).To(Equal("track-a")) + Expect(songs[1].ID).To(Equal("track-b")) + }) + }) })