mirror of
https://github.com/navidrome/navidrome.git
synced 2026-06-19 07:37:15 +00:00
5ec6e6a8d4
* fix(subsonic): make search3 empty-query pagination fast at large offsets Empty-query search3 (used by clients like Symfonium to sync the whole library) degraded linearly with songOffset: the offset optimization in optimizePagination keeps the original query's LEFT JOINs (annotation, bookmark, library) inside its rowid NOT IN subquery, making it as slow as plain OFFSET (~5s per page at offset 900K on a 920K-track library). Rewrite the empty-query branch of doSearch to use the same two-phase approach as the FTS search: Phase 1 paginates rowids on the bare main table, which SQLite satisfies with a covering index at any offset; Phase 2 hydrates only the page's rows with all JOINs. The Phase 2 hydration logic is extracted into hydrateRowidPage, now shared with ftsSearch.execute. Also replace the media_file_missing index with a composite covering index on (missing, library_id), so Phase 1 stays covering for non-admin users, whose queries include a library_id filter. The composite serves all missing-only lookups via its prefix. With a 920K-track / 85K-album test library, search3 empty-query responses are now flat (~0.1s) at every offset, for both admin and non-admin users (previously 3-5s at offsets above 600K). * refactor(persistence): share search Phase 1 contract and dedup junction fan-out Extract the Phase 1 query assembly that was duplicated between the FTS search and the empty-query search into executeTwoPhase: both paths now supply only their strategy-specific FROM/JOINs and ORDER BY, while the shared contract (missing filter, library access, options.Filters, and Max/Offset semantics) lives in one place. Also fix a pagination integrity bug: the artist library filter joins the library_artist junction table, so an artist present in multiple libraries produced duplicate rowids in Phase 1, corrupting offset-based pagination (short pages and repeated artists during full-library syncs). Phase 1 now applies DISTINCT whenever a junction-based LibraryFilter is set. DISTINCT is used instead of GROUP BY because bm25() cannot be evaluated in a grouped query; plain-filter tables (media_file, album) skip the dedup so their Phase 1 keeps the streaming covering-index plan. This also fixes the same duplication in the pre-existing FTS search path. * fix(persistence): pin artist search Phase 1 join order with CROSS JOIN search3 always filters artists by library (library_artist.library_id IN ...), and with the junction JOIN in the search Phase 1 rowid query SQLite chose to drive from library_artist, sorting every junction row with a temp b-tree on each page — a flat ~200ms penalty per request at 405K artists, even at offset 0 (the previous code avoided this by accident: its GROUP BY artist.id pinned an artist-driven plan). Use CROSS JOIN (SQLite's explicit join-order override) in a search-only variant of the artist library filter, keeping artist as the outer table so Phase 1 streams rowids in artist.id order from the primary key index and LIMIT/OFFSET short-circuits. The DISTINCT dedup stays and costs nothing under the streaming plan. Other artist queries keep the planner's freedom. With 405K artists, empty-query artist search is now 0.07s at offset 0 and 0.25s at offset 399K end-to-end (was 0.31s/0.34s before this fix, and up to 1.2s on master at deep offsets). Artist FTS text search is unaffected.
406 lines
15 KiB
Go
406 lines
15 KiB
Go
package persistence
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
. "github.com/Masterminds/squirrel"
|
|
"github.com/deluan/sanitize"
|
|
"github.com/navidrome/navidrome/log"
|
|
"github.com/navidrome/navidrome/model"
|
|
)
|
|
|
|
// containsCJK returns true if the string contains any CJK (Chinese/Japanese/Korean) characters.
|
|
// CJK text doesn't use spaces between words, so FTS5's unicode61 tokenizer treats entire
|
|
// CJK phrases as single tokens, making token-based search ineffective for CJK content.
|
|
func containsCJK(s string) bool {
|
|
for _, r := range s {
|
|
if unicode.Is(unicode.Han, r) ||
|
|
unicode.Is(unicode.Hiragana, r) ||
|
|
unicode.Is(unicode.Katakana, r) ||
|
|
unicode.Is(unicode.Hangul, r) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// fts5SpecialChars matches characters that should be stripped from user input.
|
|
// We keep only Unicode letters, numbers, whitespace, * (prefix wildcard), " (phrase quotes),
|
|
// and \x00 (internal placeholder marker). All punctuation is removed because the unicode61
|
|
// tokenizer treats it as token separators, and characters like ' can cause FTS5 parse errors
|
|
// as unbalanced string delimiters.
|
|
var fts5SpecialChars = regexp.MustCompile(`[^\p{L}\p{N}\s*"\x00]`)
|
|
|
|
// fts5PunctStrip strips everything except letters and numbers (no whitespace, wildcards, or quotes).
|
|
// Used for normalizing words at index time to create concatenated forms (e.g., "R.E.M." → "REM").
|
|
var fts5PunctStrip = regexp.MustCompile(`[^\p{L}\p{N}]`)
|
|
|
|
// fts5Operators matches FTS5 boolean operators as whole words (case-insensitive).
|
|
var fts5Operators = regexp.MustCompile(`(?i)\b(AND|OR|NOT|NEAR)\b`)
|
|
|
|
// fts5LeadingStar matches a * at the start of a token. FTS5 only supports * at the end (prefix queries).
|
|
var fts5LeadingStar = regexp.MustCompile(`(^|[\s])\*+`)
|
|
|
|
// normalizeForFTS takes multiple strings and returns a space-separated, deduplicated list of
|
|
// alternative searchable forms for each word: punctuation-stripped (R.E.M. → REM, AC/DC → ACDC)
|
|
// and ASCII-transliterated (Bjørk → Bjork, œuvre → oeuvre). The transliterated form is needed
|
|
// because FTS5's `unicode61 remove_diacritics 2` only handles NFKD-decomposable diacritics —
|
|
// atomic letters like ø/æ/œ/ß survive tokenization, so the query side and index side disagree
|
|
// without an explicit transliterated entry here.
|
|
func normalizeForFTS(values ...string) string {
|
|
seen := make(map[string]struct{})
|
|
var result []string
|
|
add := func(orig, variant string) {
|
|
if variant == "" || variant == orig {
|
|
return
|
|
}
|
|
lower := strings.ToLower(variant)
|
|
if _, ok := seen[lower]; ok {
|
|
return
|
|
}
|
|
seen[lower] = struct{}{}
|
|
result = append(result, variant)
|
|
}
|
|
for _, v := range values {
|
|
for word := range strings.FieldsSeq(v) {
|
|
transliterated := sanitize.Accents(word)
|
|
// Concatenated ASCII form: R.E.M. → REM, AC/DC → ACDC, St-Étienne → StEtienne.
|
|
add(word, fts5PunctStrip.ReplaceAllString(transliterated, ""))
|
|
// Accent-only transliteration for words without name-punctuation (Bjørk → Bjork).
|
|
add(word, transliterated)
|
|
}
|
|
}
|
|
return strings.Join(result, " ")
|
|
}
|
|
|
|
// isSingleUnicodeLetter returns true if token is exactly one Unicode letter.
|
|
func isSingleUnicodeLetter(token string) bool {
|
|
r, size := utf8.DecodeRuneInString(token)
|
|
return size == len(token) && size > 0 && unicode.IsLetter(r)
|
|
}
|
|
|
|
// namePunctuation is the set of characters commonly used as separators in artist/album
|
|
// names (hyphens, slashes, dots, apostrophes). Only words containing these are candidates
|
|
// for punctuated-word processing; other special characters (^, :, &) are just stripped.
|
|
const namePunctuation = `-/.''`
|
|
|
|
// processPunctuatedWords handles words with embedded name punctuation before the general
|
|
// special-character stripping. For each punctuated word it produces either:
|
|
// - A quoted phrase for dotted abbreviations: R.E.M. → "R E M"
|
|
// - A phrase+concat OR for other patterns: a-ha → ("a ha" OR aha*)
|
|
func processPunctuatedWords(input string, phrases []string) (string, []string) {
|
|
words := strings.Fields(input)
|
|
var result []string
|
|
for _, w := range words {
|
|
if strings.HasPrefix(w, "\x00") || strings.ContainsAny(w, `*"`) || !strings.ContainsAny(w, namePunctuation) {
|
|
result = append(result, w)
|
|
continue
|
|
}
|
|
concat := fts5PunctStrip.ReplaceAllString(w, "")
|
|
if concat == "" || concat == w {
|
|
result = append(result, w)
|
|
continue
|
|
}
|
|
subTokens := strings.Fields(fts5SpecialChars.ReplaceAllString(w, " "))
|
|
if len(subTokens) < 2 {
|
|
// Single sub-token after splitting (e.g., N' → N): just use the stripped form
|
|
result = append(result, concat)
|
|
continue
|
|
}
|
|
// Dotted abbreviations (R.E.M., U.K.) — all single letters separated by dots only
|
|
if isDottedAbbreviation(w, subTokens) {
|
|
phrases = append(phrases, fmt.Sprintf(`"%s"`, strings.Join(subTokens, " ")))
|
|
} else {
|
|
// Punctuated names (a-ha, AC/DC, Jay-Z) — phrase for adjacency + concat for search_normalized
|
|
phrases = append(phrases, fmt.Sprintf(`("%s" OR %s*)`, strings.Join(subTokens, " "), concat))
|
|
}
|
|
result = append(result, fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1))
|
|
}
|
|
return strings.Join(result, " "), phrases
|
|
}
|
|
|
|
// isDottedAbbreviation returns true if w uses only dots as punctuation and all sub-tokens
|
|
// are single letters (e.g., "R.E.M.", "U.K." but not "a-ha" or "AC/DC").
|
|
func isDottedAbbreviation(w string, subTokens []string) bool {
|
|
for _, r := range w {
|
|
if !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '.' {
|
|
return false
|
|
}
|
|
}
|
|
for _, st := range subTokens {
|
|
if !isSingleUnicodeLetter(st) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// buildFTS5Query preprocesses user input into a safe FTS5 MATCH expression.
|
|
// It preserves quoted phrases and * prefix wildcards, neutralizes FTS5 operators
|
|
// (by lowercasing them, since FTS5 operators are case-sensitive) and strips
|
|
// special characters to prevent query injection.
|
|
func buildFTS5Query(userInput string) string {
|
|
q := strings.TrimSpace(userInput)
|
|
if q == "" || q == `""` {
|
|
return ""
|
|
}
|
|
|
|
var phrases []string
|
|
result := q
|
|
for {
|
|
start := strings.Index(result, `"`)
|
|
if start == -1 {
|
|
break
|
|
}
|
|
end := strings.Index(result[start+1:], `"`)
|
|
if end == -1 {
|
|
// Unmatched quote — remove it
|
|
result = result[:start] + result[start+1:]
|
|
break
|
|
}
|
|
end += start + 1
|
|
phrase := result[start : end+1] // includes quotes
|
|
phrases = append(phrases, phrase)
|
|
result = result[:start] + fmt.Sprintf("\x00PHRASE%d\x00", len(phrases)-1) + result[end+1:]
|
|
}
|
|
|
|
// Transliterate non-ASCII letters in the unquoted portion (ø→o, æ→ae, œ→oe, ß→ss, …)
|
|
// so the query matches the ASCII variants emitted by normalizeForFTS at index time.
|
|
// FTS5's own `remove_diacritics 2` only strips NFKD-decomposable marks, so without
|
|
// this step queries for words containing these letters can miss. Quoted phrases are
|
|
// left untouched so they continue to match the original text in title/artist columns.
|
|
result = sanitize.Accents(result)
|
|
|
|
// Neutralize FTS5 operators by lowercasing them (FTS5 operators are case-sensitive:
|
|
// AND, OR, NOT, NEAR are operators, but and, or, not, near are plain tokens)
|
|
result = fts5Operators.ReplaceAllStringFunc(result, strings.ToLower)
|
|
|
|
// Handle words with embedded punctuation (a-ha, AC/DC, R.E.M.) before stripping
|
|
result, phrases = processPunctuatedWords(result, phrases)
|
|
|
|
result = fts5SpecialChars.ReplaceAllString(result, " ")
|
|
result = fts5LeadingStar.ReplaceAllString(result, "$1")
|
|
tokens := strings.Fields(result)
|
|
|
|
// Append * to plain tokens for prefix matching (e.g., "love" → "love*").
|
|
// Skip tokens that are already wildcarded or are quoted phrase placeholders.
|
|
for i, t := range tokens {
|
|
if strings.HasPrefix(t, "\x00") || strings.HasSuffix(t, "*") {
|
|
continue
|
|
}
|
|
tokens[i] = t + "*"
|
|
}
|
|
|
|
// Use explicit AND between tokens — FTS5's implicit AND (space-separated)
|
|
// doesn't work correctly with parenthesized OR groups from processPunctuatedWords.
|
|
result = strings.Join(tokens, " AND ")
|
|
|
|
for i, phrase := range phrases {
|
|
placeholder := fmt.Sprintf("\x00PHRASE%d\x00", i)
|
|
result = strings.ReplaceAll(result, placeholder, phrase)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ftsColumn pairs an FTS5 column name with its BM25 relevance weight.
|
|
type ftsColumn struct {
|
|
Name string
|
|
Weight float64
|
|
}
|
|
|
|
// ftsColumnDefs defines FTS5 columns and their BM25 relevance weights.
|
|
// The order MUST match the column order in the FTS5 table definition (see migrations).
|
|
// All columns are both searched and ranked. When adding indexed-but-not-searched
|
|
// columns in the future, use Weight: 0 to exclude from the search column filter.
|
|
var ftsColumnDefs = map[string][]ftsColumn{
|
|
"media_file": {
|
|
{"title", 10.0},
|
|
{"album", 5.0},
|
|
{"artist", 3.0},
|
|
{"album_artist", 3.0},
|
|
{"sort_title", 1.0},
|
|
{"sort_album_name", 1.0},
|
|
{"sort_artist_name", 1.0},
|
|
{"sort_album_artist_name", 1.0},
|
|
{"disc_subtitle", 1.0},
|
|
{"search_participants", 2.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
"album": {
|
|
{"name", 10.0},
|
|
{"sort_album_name", 1.0},
|
|
{"album_artist", 3.0},
|
|
{"search_participants", 2.0},
|
|
{"discs", 1.0},
|
|
{"catalog_num", 1.0},
|
|
{"album_version", 1.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
"artist": {
|
|
{"name", 10.0},
|
|
{"sort_artist_name", 1.0},
|
|
{"search_normalized", 1.0},
|
|
},
|
|
}
|
|
|
|
// ftsColumnFilters and ftsBM25Weights are precomputed from ftsColumnDefs at init time
|
|
// to avoid per-query allocations.
|
|
var (
|
|
ftsColumnFilters = map[string]string{}
|
|
ftsBM25Weights = map[string]string{}
|
|
)
|
|
|
|
func init() {
|
|
for table, cols := range ftsColumnDefs {
|
|
var names []string
|
|
weights := make([]string, len(cols))
|
|
for i, c := range cols {
|
|
if c.Weight > 0 {
|
|
names = append(names, c.Name)
|
|
}
|
|
weights[i] = fmt.Sprintf("%.1f", c.Weight)
|
|
}
|
|
ftsColumnFilters[table] = "{" + strings.Join(names, " ") + "}"
|
|
ftsBM25Weights[table] = strings.Join(weights, ", ")
|
|
}
|
|
}
|
|
|
|
// ftsSearch implements searchStrategy using FTS5 full-text search with BM25 ranking.
|
|
type ftsSearch struct {
|
|
tableName string
|
|
ftsTable string
|
|
matchExpr string
|
|
rankExpr string
|
|
}
|
|
|
|
// ToSql returns a single-query fallback for the REST filter path (no two-phase split).
|
|
func (s *ftsSearch) ToSql() (string, []any, error) {
|
|
sql := s.tableName + ".rowid IN (SELECT rowid FROM " + s.ftsTable + " WHERE " + s.ftsTable + " MATCH ?)"
|
|
return sql, []any{s.matchExpr}, nil
|
|
}
|
|
|
|
// execute runs a two-phase FTS5 search (see executeTwoPhase): Phase 1 here contributes the
|
|
// FTS MATCH join and BM25 rank ordering. Complex ORDER BY (function calls, aggregations) are
|
|
// dropped from Phase 1.
|
|
func (s *ftsSearch) execute(r sqlRepository, sq SelectBuilder, dest any, cfg searchConfig, options model.QueryOptions) error {
|
|
qualifiedOrderBys := []string{s.rankExpr}
|
|
for _, ob := range cfg.OrderBy {
|
|
if qualified := qualifyOrderBy(s.tableName, ob); qualified != "" {
|
|
qualifiedOrderBys = append(qualifiedOrderBys, qualified)
|
|
}
|
|
}
|
|
|
|
rowidCore := Select(s.tableName+".rowid").
|
|
From(s.tableName).
|
|
Join(s.ftsTable+" ON "+s.ftsTable+".rowid = "+s.tableName+".rowid AND "+s.ftsTable+" MATCH ?", s.matchExpr).
|
|
OrderBy(qualifiedOrderBys...)
|
|
return r.executeTwoPhase(sq, dest, rowidCore, cfg, options)
|
|
}
|
|
|
|
// qualifyOrderBy prepends tableName to a simple column name. Returns empty string for
|
|
// complex expressions (function calls, aggregations) that can't be used in Phase 1.
|
|
func qualifyOrderBy(tableName, orderBy string) string {
|
|
orderBy = strings.TrimSpace(orderBy)
|
|
if orderBy == "" || strings.ContainsAny(orderBy, "(,") {
|
|
return ""
|
|
}
|
|
parts := strings.Fields(orderBy)
|
|
if !strings.Contains(parts[0], ".") {
|
|
parts[0] = tableName + "." + parts[0]
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
// ftsQueryDegraded returns true when the FTS query lost significant discriminating
|
|
// content compared to the original input. This happens when special characters that
|
|
// are part of the entity name (e.g., "1+", "C++", "!!!", "C#") get stripped by FTS
|
|
// tokenization, leaving only very short/broad tokens. Also detects quoted phrases
|
|
// that would be degraded by FTS5's unicode61 tokenizer (e.g., "1+" → token "1").
|
|
func ftsQueryDegraded(original, ftsQuery string) bool {
|
|
original = strings.TrimSpace(original)
|
|
if original == "" || ftsQuery == "" {
|
|
return false
|
|
}
|
|
// Strip quotes from original for comparison — we want the raw content
|
|
stripped := strings.ReplaceAll(original, `"`, "")
|
|
// Extract the alphanumeric content from the original query
|
|
alphaNum := fts5PunctStrip.ReplaceAllString(stripped, "")
|
|
// If the original is entirely alphanumeric, nothing was stripped — not degraded
|
|
if len(alphaNum) == len(stripped) {
|
|
return false
|
|
}
|
|
// Check if all effective FTS tokens are very short (≤2 chars).
|
|
// Short tokens with prefix matching are too broad when special chars were stripped.
|
|
// For quoted phrases, extract the content and check the tokens inside.
|
|
tokens := strings.FieldsSeq(ftsQuery)
|
|
for t := range tokens {
|
|
t = strings.TrimSuffix(t, "*")
|
|
// Skip internal phrase placeholders
|
|
if strings.HasPrefix(t, "\x00") {
|
|
return false
|
|
}
|
|
// For OR groups from processPunctuatedWords (e.g., ("a ha" OR aha*)),
|
|
// the punctuated word was already handled meaningfully — not degraded.
|
|
if strings.HasPrefix(t, "(") {
|
|
return false
|
|
}
|
|
// For quoted phrases, check the tokens inside as FTS5 will tokenize them
|
|
if strings.HasPrefix(t, `"`) {
|
|
// Extract content between quotes
|
|
inner := strings.Trim(t, `"`)
|
|
innerAlpha := fts5PunctStrip.ReplaceAllString(inner, " ")
|
|
for it := range strings.FieldsSeq(innerAlpha) {
|
|
if len(it) > 2 {
|
|
return false
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if len(t) > 2 {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// newFTSSearch creates an FTS5 search strategy. Falls back to LIKE search if the
|
|
// query produces no FTS tokens (e.g., punctuation-only like "!!!!!!!") or if FTS
|
|
// tokenization stripped significant content from the query (e.g., "1+" → "1*").
|
|
// Returns nil when the query produces no searchable tokens at all.
|
|
func newFTSSearch(tableName, query string) searchStrategy {
|
|
q := buildFTS5Query(query)
|
|
if q == "" || ftsQueryDegraded(query, q) {
|
|
// Fallback: try LIKE search with the raw query
|
|
cleaned := strings.TrimSpace(strings.ReplaceAll(query, `"`, ""))
|
|
if cleaned != "" {
|
|
log.Trace("Search using LIKE fallback for non-tokenizable query", "table", tableName, "query", cleaned)
|
|
return newLikeSearch(tableName, cleaned)
|
|
}
|
|
return nil
|
|
}
|
|
ftsTable := tableName + "_fts"
|
|
matchExpr := q
|
|
if cols, ok := ftsColumnFilters[tableName]; ok {
|
|
matchExpr = cols + " : (" + q + ")"
|
|
}
|
|
|
|
rankExpr := ftsTable + ".rank"
|
|
if weights, ok := ftsBM25Weights[tableName]; ok {
|
|
rankExpr = "bm25(" + ftsTable + ", " + weights + ")"
|
|
}
|
|
|
|
s := &ftsSearch{
|
|
tableName: tableName,
|
|
ftsTable: ftsTable,
|
|
matchExpr: matchExpr,
|
|
rankExpr: rankExpr,
|
|
}
|
|
log.Trace("Search using FTS5 backend", "table", tableName, "query", q, "filter", s)
|
|
return s
|
|
}
|