package persistence import ( "fmt" "regexp" "strings" "unicode" "unicode/utf8" . "github.com/Masterminds/squirrel" "github.com/navidrome/navidrome/log" ) // containsCJK returns true if the string contains any CJK (Chinese/Japanese/Korean) characters. // CJK text doesn't use spaces between words, so FTS5's unicode61 tokenizer treats entire // CJK phrases as single tokens, making token-based search ineffective for CJK content. func containsCJK(s string) bool { for _, r := range s { if unicode.Is(unicode.Han, r) || unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r) || unicode.Is(unicode.Hangul, r) { return true } } return false } // fts5SpecialChars matches characters that should be stripped from user input. // We keep only Unicode letters, numbers, whitespace, * (prefix wildcard), " (phrase quotes), // and \x00 (internal placeholder marker). All punctuation is removed because the unicode61 // tokenizer treats it as token separators, and characters like ' can cause FTS5 parse errors // as unbalanced string delimiters. var fts5SpecialChars = regexp.MustCompile(`[^\p{L}\p{N}\s*"\x00]`) // fts5PunctStrip strips everything except letters and numbers (no whitespace, wildcards, or quotes). // Used for normalizing words at index time to create concatenated forms (e.g., "R.E.M." → "REM"). var fts5PunctStrip = regexp.MustCompile(`[^\p{L}\p{N}]`) // fts5Operators matches FTS5 boolean operators as whole words (case-insensitive). var fts5Operators = regexp.MustCompile(`(?i)\b(AND|OR|NOT|NEAR)\b`) // fts5LeadingStar matches a * at the start of a token. FTS5 only supports * at the end (prefix queries). var fts5LeadingStar = regexp.MustCompile(`(^|[\s])\*+`) // normalizeForFTS takes multiple strings, strips non-letter/non-number characters from each word, // and returns a space-separated string of words that changed after stripping (deduplicated). // This is used at index time to create concatenated forms: "R.E.M." → "REM", "AC/DC" → "ACDC". func normalizeForFTS(values ...string) string { seen := make(map[string]struct{}) var result []string for _, v := range values { for _, word := range strings.Fields(v) { stripped := fts5PunctStrip.ReplaceAllString(word, "") if stripped == "" || stripped == word { continue } lower := strings.ToLower(stripped) if _, ok := seen[lower]; ok { continue } seen[lower] = struct{}{} result = append(result, stripped) } } return strings.Join(result, " ") } // isSingleUnicodeLetter returns true if token is exactly one Unicode letter. func isSingleUnicodeLetter(token string) bool { r, size := utf8.DecodeRuneInString(token) return size == len(token) && size > 0 && unicode.IsLetter(r) } // namePunctuation is the set of characters commonly used as separators in artist/album // names (hyphens, slashes, dots, apostrophes). Only words containing these are candidates // for punctuated-word processing; other special characters (^, :, &) are just stripped. const namePunctuation = `-/.''` // processPunctuatedWords handles words with embedded name punctuation before the general // special-character stripping. For each punctuated word it produces either: // - A quoted phrase for dotted abbreviations: R.E.M. → "R E M" // - A phrase+concat OR for other patterns: a-ha → ("a ha" OR aha*) func processPunctuatedWords(input string, phrases []string) (string, []string) { words := strings.Fields(input) var result []string for _, w := range words { if strings.HasPrefix(w, "\x00") || strings.ContainsAny(w, `*"`) || !strings.ContainsAny(w, namePunctuation) { result = append(result, w) continue } concat := fts5PunctStrip.ReplaceAllString(w, "") if concat == "" || concat == w { result = append(result, w) continue } subTokens := strings.Fields(fts5SpecialChars.ReplaceAllString(w, " ")) if len(subTokens) < 2 { // Single sub-token after splitting (e.g., N' → N): just use the stripped form result = append(result, concat) continue } // Dotted abbreviations (R.E.M., U.K.) — all single letters separated by dots only if isDottedAbbreviation(w, subTokens) { phrases = append(phrases, fmt.Sprintf(`"%s"`, strings.Join(subTokens, " "))) } else { // Punctuated names (a-ha, AC/DC, Jay-Z) — phrase for adjacency + concat for search_normalized phrases = append(phrases, fmt.Sprintf(`("%s" OR %s*)`, strings.Join(subTokens, " "), concat)) } result = append(result, fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1)) } return strings.Join(result, " "), phrases } // isDottedAbbreviation returns true if w uses only dots as punctuation and all sub-tokens // are single letters (e.g., "R.E.M.", "U.K." but not "a-ha" or "AC/DC"). func isDottedAbbreviation(w string, subTokens []string) bool { for _, r := range w { if !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '.' { return false } } for _, st := range subTokens { if !isSingleUnicodeLetter(st) { return false } } return true } // buildFTS5Query preprocesses user input into a safe FTS5 MATCH expression. // It preserves quoted phrases and * prefix wildcards, neutralizes FTS5 operators // (by lowercasing them, since FTS5 operators are case-sensitive) and strips // special characters to prevent query injection. func buildFTS5Query(userInput string) string { q := strings.TrimSpace(userInput) if q == "" || q == `""` { return "" } var phrases []string result := q for { start := strings.Index(result, `"`) if start == -1 { break } end := strings.Index(result[start+1:], `"`) if end == -1 { // Unmatched quote — remove it result = result[:start] + result[start+1:] break } end += start + 1 phrase := result[start : end+1] // includes quotes phrases = append(phrases, phrase) result = result[:start] + fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1) + result[end+1:] } // Neutralize FTS5 operators by lowercasing them (FTS5 operators are case-sensitive: // AND, OR, NOT, NEAR are operators, but and, or, not, near are plain tokens) result = fts5Operators.ReplaceAllStringFunc(result, strings.ToLower) // Handle words with embedded punctuation (a-ha, AC/DC, R.E.M.) before stripping result, phrases = processPunctuatedWords(result, phrases) result = fts5SpecialChars.ReplaceAllString(result, " ") result = fts5LeadingStar.ReplaceAllString(result, "$1") tokens := strings.Fields(result) // Append * to plain tokens for prefix matching (e.g., "love" → "love*"). // Skip tokens that are already wildcarded or are quoted phrase placeholders. for i, t := range tokens { if strings.HasPrefix(t, "\x00") || strings.HasSuffix(t, "*") { continue } tokens[i] = t + "*" } result = strings.Join(tokens, " ") for i, phrase := range phrases { placeholder := fmt.Sprintf("\x00PHRASE%d\x00", i) result = strings.ReplaceAll(result, placeholder, phrase) } return result } // likeSearchColumns defines the core columns to search with LIKE queries. // These are the primary user-visible fields for each entity type. // Used as a fallback when FTS5 cannot handle the query (e.g., CJK text, punctuation-only input). var likeSearchColumns = map[string][]string{ "media_file": {"title", "album", "artist", "album_artist"}, "album": {"name", "album_artist"}, "artist": {"name"}, } // likeSearchExpr generates LIKE-based search filters against core columns. // Each word in the query must match at least one column (AND between words), // and each word can match any column (OR within a word). // Used as a fallback when FTS5 cannot handle the query (e.g., CJK text, punctuation-only input). func likeSearchExpr(tableName string, s string) Sqlizer { s = strings.TrimSpace(s) if s == "" { log.Trace("Search using LIKE backend, query is empty", "table", tableName) return nil } columns, ok := likeSearchColumns[tableName] if !ok { log.Trace("Search using LIKE backend, couldn't find columns for this table", "table", tableName) return nil } words := strings.Fields(s) wordFilters := And{} for _, word := range words { colFilters := Or{} for _, col := range columns { colFilters = append(colFilters, Like{tableName + "." + col: "%" + word + "%"}) } wordFilters = append(wordFilters, colFilters) } log.Trace("Search using LIKE backend", "query", wordFilters, "table", tableName) return wordFilters } // ftsSearchColumns defines which FTS5 columns are included in general search. // Columns not listed here are indexed but not searched by default, // enabling future additions (comments, lyrics, bios) without affecting general search. var ftsSearchColumns = map[string]string{ "media_file": "{title album artist album_artist sort_title sort_album_name sort_artist_name sort_album_artist_name disc_subtitle search_participants search_normalized}", "album": "{name sort_album_name album_artist search_participants discs catalog_num album_version search_normalized}", "artist": "{name sort_artist_name search_normalized}", } // ftsSearchExpr generates an FTS5 MATCH-based search filter. // If the query produces no FTS tokens (e.g., punctuation-only like "!!!!!!!"), // it falls back to LIKE-based search. func ftsSearchExpr(tableName string, s string) Sqlizer { q := buildFTS5Query(s) if q == "" { s = strings.TrimSpace(strings.ReplaceAll(s, `"`, "")) if s != "" { log.Trace("Search using LIKE fallback for non-tokenizable query", "table", tableName, "query", s) return likeSearchExpr(tableName, s) } return nil } ftsTable := tableName + "_fts" matchExpr := q if cols, ok := ftsSearchColumns[tableName]; ok { matchExpr = cols + " : (" + q + ")" } filter := Expr( tableName+".rowid IN (SELECT rowid FROM "+ftsTable+" WHERE "+ftsTable+" MATCH ?)", matchExpr, ) log.Trace("Search using FTS5 backend", "table", tableName, "query", q, "filter", filter) return filter }