Update search function

2026-05-04 10:03:53 +02:00
parent c016dcabaa
commit 72edf7b258
2 changed files with 54 additions and 153 deletions
@@ -4,21 +4,17 @@ import (
 	"io/fs"
 	"log"
 	"net/http"
-	"os"
 	"path/filepath"
-	"regexp"
 	"sort"
 	"strings"
 	"unicode"
 )

 type searchResult struct {
-	Name    string
-	URL     string
-	Path    string
-	Score   int    // number of query tokens that hit
-	NameHit bool   // at least one hit came from the folder name
-	Snippet string // ~300 chars around first body hit, or page stub for name-only hits
+	Name  string // folder name as reported by the directory walk
+	URL   string // "/" + Path + "/"
+	Path  string // slash-separated path relative to the walk root
+	Score int    // relevance score from scoreName; higher sorts first
 }

 type searchPageData struct {
@@ -52,15 +48,15 @@ func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) {
 	}
 }

-// searchWiki walks root and scores each directory by how many whitespace-split
-// query tokens hit a word in either the folder name or its index.md body.
-// A word "hits" a token via case-insensitive equality or Levenshtein ≤ 2.
-// Folder-name hits break score ties above content-only hits.
+// searchWiki walks root and scores each directory by how well its folder name
+// matches the query; page contents are not searched. Results are ordered by
+// score (exact matches highest), then shallower nesting, then lowercased name.
 func searchWiki(root, query string) []searchResult {
 	if query == "" {
 		return nil
 	}
-	qTokens := tokenize(query)
+	qLower := strings.ToLower(query)
+	qTokens := tokenize(qLower)
 	if len(qTokens) == 0 {
 		return nil
 	}
@@ -78,40 +74,19 @@ func searchWiki(root, query string) []searchResult {
 			return nil
 		}
 		name := d.Name()
-		body, _ := os.ReadFile(filepath.Join(fsPath, "index.md"))
-
-		nameWords := tokenize(name)
-		bodyStr := string(body)
-		bodyLower := strings.ToLower(bodyStr)
-		bodyWords := tokenize(bodyLower)
-
-		score := 0
-		nameHit := false
-		for _, qt := range qTokens {
-			inName := tokenInWords(qt, nameWords)
-			inBody := tokenInWords(qt, bodyWords)
-			if inName || inBody {
-				score++
-			}
-			if inName {
-				nameHit = true
-			}
-		}
+		score := scoreName(strings.ToLower(name), qLower, qTokens)
 		if score == 0 {
 			return nil
 		}
-
 		rel, relErr := filepath.Rel(walkRoot, fsPath)
 		if relErr != nil {
 			return nil
 		}
 		results = append(results, searchResult{
-			Name:    name,
-			URL:     "/" + filepath.ToSlash(rel) + "/",
-			Path:    filepath.ToSlash(rel),
-			Score:   score,
-			NameHit: nameHit,
-			Snippet: makeSnippet(bodyStr, bodyLower, qTokens),
+			Name:  name,
+			URL:   "/" + filepath.ToSlash(rel) + "/",
+			Path:  filepath.ToSlash(rel),
+			Score: score,
 		})
 		return nil
 	})
@@ -120,14 +95,52 @@ func searchWiki(root, query string) []searchResult {
 		if results[i].Score != results[j].Score {
 			return results[i].Score > results[j].Score
 		}
-		if results[i].NameHit != results[j].NameHit {
-			return results[i].NameHit
+		di, dj := strings.Count(results[i].Path, "/"), strings.Count(results[j].Path, "/")
+		if di != dj {
+			return di < dj
 		}
 		return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name)
 	})
 	return results
 }

+// scoreName ranks how well nameLower matches the query. A whole-name exact
+// match scores 1000; otherwise the score sums each token's best word match
+// (exact 100, prefix 50, substring 20, Levenshtein <= 2 gives 5; 0 if none).
+// Word position does not matter — the caller breaks ties by nesting depth.
+func scoreName(nameLower, qLower string, qTokens []string) int {
+	if nameLower == qLower { // whole folder name equals the whole query
+		return 1000
+	}
+	score := 0
+	nameWords := tokenize(nameLower)
+	for _, qt := range qTokens {
+		best := 0 // highest tier any word in the name reached for this token
+		for _, w := range nameWords {
+			switch { // cases run from strongest to weakest; the first true one is used
+			case w == qt: // exact word match
+				if best < 100 {
+					best = 100
+				}
+			case strings.HasPrefix(w, qt): // word starts with the token
+				if best < 50 {
+					best = 50
+				}
+			case strings.Contains(w, qt): // token appears inside the word
+				if best < 20 {
+					best = 20
+				}
+			case levenshtein(w, qt) <= 2: // fuzzy match; NOTE(review): for very short tokens this may hit unrelated words — confirm
+				if best < 5 {
+					best = 5
+				}
+			}
+		}
+		score += best // a token with no matching word contributes 0
+	}
+	return score
+}
+
 // resolveWalkRoot resolves symlinks so WalkDir descends into the real tree
 // even when the configured wiki root is itself a symlink (as on the NAS).
 func resolveWalkRoot(root string) string {
@@ -172,117 +185,6 @@ func tokenize(s string) []string {
 	return tokens
 }

-// tokenInWords reports whether qt matches any word exactly or within
-// Levenshtein distance 2. qt and words must already be lowercase.
-func tokenInWords(qt string, words []string) bool {
-	for _, w := range words {
-		if w == qt {
-			return true
-		}
-		if levenshtein(w, qt) <= 2 {
-			return true
-		}
-	}
-	return false
-}
-
-var snippetWS = regexp.MustCompile(`\s+`)
-
-const snippetWindow = 300
-
-// makeSnippet returns ~300 characters of body around the earliest substring
-// match of any query token. When no token has an exact substring span (e.g.
-// matched only via Levenshtein, or the hit was folder-name-only), it falls
-// back to the first ~300 chars of the body with the leading heading stripped.
-// Returns "" only when the body itself is empty.
-func makeSnippet(body, bodyLower string, tokens []string) string {
-	pos := -1
-	for _, t := range tokens {
-		i := strings.Index(bodyLower, t)
-		if i < 0 {
-			continue
-		}
-		if pos < 0 || i < pos {
-			pos = i
-		}
-	}
-	if pos < 0 {
-		return makeStub(body)
-	}
-
-	half := snippetWindow / 2
-	start := pos - half
-	if start < 0 {
-		start = 0
-	}
-	end := pos + half
-	if end > len(body) {
-		end = len(body)
-	}
-	start, end = expandToWordBoundaries(body, start, end)
-	out := snippetWS.ReplaceAllString(body[start:end], " ")
-	out = strings.TrimSpace(out)
-	if start > 0 {
-		out = "…" + out
-	}
-	if end < len(body) {
-		out = out + "…"
-	}
-	return out
-}
-
-// makeStub returns ~snippetWindow chars from the start of body, with the
-// leading "# Heading" line stripped. Returns "" for an empty body.
-func makeStub(body string) string {
-	stripped := string(stripFirstHeading([]byte(body)))
-	stripped = strings.TrimSpace(stripped)
-	if stripped == "" {
-		return ""
-	}
-	end := snippetWindow
-	if end > len(stripped) {
-		end = len(stripped)
-	}
-	_, end = expandToWordBoundaries(stripped, 0, end)
-	out := snippetWS.ReplaceAllString(stripped[:end], " ")
-	out = strings.TrimSpace(out)
-	if end < len(stripped) {
-		out = out + "…"
-	}
-	return out
-}
-
-// expandToWordBoundaries adjusts start/end so they don't split a word and
-// don't fall in the middle of a UTF-8 sequence. start moves forward past
-// any partial word at the beginning; end moves backward to the previous
-// word boundary.
-func expandToWordBoundaries(s string, start, end int) (int, int) {
-	for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 {
-		start--
-	}
-	for end < len(s) && s[end]&0xC0 == 0x80 {
-		end++
-	}
-	if start > 0 && start < len(s) && isWordByte(s[start-1]) && isWordByte(s[start]) {
-		for start < end && isWordByte(s[start]) {
-			start++
-		}
-	}
-	if end < len(s) && isWordByte(s[end-1]) && isWordByte(s[end]) {
-		for end > start && isWordByte(s[end-1]) {
-			end--
-		}
-	}
-	return start, end
-}
-
-func isWordByte(b byte) bool {
-	if b&0x80 != 0 {
-		return true // assume any multibyte char is part of a word
-	}
-	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
-}
-
 // levenshtein returns the edit distance between a and b. Operates on runes so
 // multi-byte characters count as one edit.
 func levenshtein(a, b string) int {