From 99579ba7e5637f6e7559e698b2daa8733d1ab771 Mon Sep 17 00:00:00 2001 From: luxick Date: Mon, 4 May 2026 10:03:53 +0200 Subject: [PATCH] Update search function --- assets/search.html | 1 - search.go | 206 ++++++++++++--------------------------------- 2 files changed, 54 insertions(+), 153 deletions(-) diff --git a/assets/search.html b/assets/search.html index cc31320..e90ebe9 100644 --- a/assets/search.html +++ b/assets/search.html @@ -9,7 +9,6 @@
{{.Name}}
/{{.Path}}
- {{if .Snippet}}
{{.Snippet}}
{{end}}
{{end}} {{else}} diff --git a/search.go b/search.go index 67a75ee..7fb74b1 100644 --- a/search.go +++ b/search.go @@ -4,21 +4,17 @@ import ( "io/fs" "log" "net/http" - "os" "path/filepath" - "regexp" "sort" "strings" "unicode" ) type searchResult struct { - Name string - URL string - Path string - Score int // number of query tokens that hit - NameHit bool // at least one hit came from the folder name - Snippet string // ~300 chars around first body hit, or page stub for name-only hits + Name string + URL string + Path string + Score int } type searchPageData struct { @@ -52,15 +48,15 @@ func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) { } } -// searchWiki walks root and scores each directory by how many whitespace-split -// query tokens hit a word in either the folder name or its index.md body. -// A word "hits" a token via case-insensitive equality or Levenshtein ≤ 2. -// Folder-name hits break score ties above content-only hits. +// searchWiki walks root and scores each directory by how well the folder name +// matches the query. Page contents are not searched. Higher score = more +// relevant; exact matches rank first. func searchWiki(root, query string) []searchResult { if query == "" { return nil } - qTokens := tokenize(query) + qLower := strings.ToLower(query) + qTokens := tokenize(qLower) if len(qTokens) == 0 { return nil } @@ -78,40 +74,19 @@ func searchWiki(root, query string) []searchResult { return nil } name := d.Name() - body, _ := os.ReadFile(filepath.Join(fsPath, "index.md")) - - nameWords := tokenize(name) - bodyStr := string(body) - bodyLower := strings.ToLower(bodyStr) - bodyWords := tokenize(bodyLower) - - score := 0 - nameHit := false - for _, qt := range qTokens { - inName := tokenInWords(qt, nameWords) - inBody := tokenInWords(qt, bodyWords) - if inName || inBody { - score++ - } - if inName { - nameHit = true - } - } + score := scoreName(strings.ToLower(name), qLower, qTokens) if score == 0 { return nil } - rel, relErr := filepath.Rel(walkRoot, fsPath) if relErr != nil { return nil } results = append(results, searchResult{ - Name: name, - URL: "/" + filepath.ToSlash(rel) + "/", - Path: filepath.ToSlash(rel), - Score: score, - NameHit: nameHit, - Snippet: makeSnippet(bodyStr, bodyLower, qTokens), + Name: name, + URL: "/" + filepath.ToSlash(rel) + "/", + Path: filepath.ToSlash(rel), + Score: score, }) return nil }) @@ -120,14 +95,52 @@ func searchWiki(root, query string) []searchResult { if results[i].Score != results[j].Score { return results[i].Score > results[j].Score } - if results[i].NameHit != results[j].NameHit { - return results[i].NameHit + di, dj := strings.Count(results[i].Path, "/"), strings.Count(results[j].Path, "/") + if di != dj { + return di < dj } return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name) }) return results } +// scoreName ranks how well nameLower matches the query. Whole-name exact +// match dominates; otherwise score is the sum of each token's best match +// against the words in the name. Position within the name does not matter — +// nesting depth is the tiebreaker, applied by the caller. +func scoreName(nameLower, qLower string, qTokens []string) int { + if nameLower == qLower { + return 1000 + } + score := 0 + nameWords := tokenize(nameLower) + for _, qt := range qTokens { + best := 0 + for _, w := range nameWords { + switch { + case w == qt: + if best < 100 { + best = 100 + } + case strings.HasPrefix(w, qt): + if best < 50 { + best = 50 + } + case strings.Contains(w, qt): + if best < 20 { + best = 20 + } + case levenshtein(w, qt) <= 2: + if best < 5 { + best = 5 + } + } + } + score += best + } + return score +} + // resolveWalkRoot resolves symlinks so WalkDir descends into the real tree // even when the configured wiki root is itself a symlink (as on the NAS). func resolveWalkRoot(root string) string { @@ -172,117 +185,6 @@ func tokenize(s string) []string { return tokens } -// tokenInWords reports whether qt matches any word exactly or within -// Levenshtein distance 2. qt and words must already be lowercase. -func tokenInWords(qt string, words []string) bool { - for _, w := range words { - if w == qt { - return true - } - if levenshtein(w, qt) <= 2 { - return true - } - } - return false -} - -var snippetWS = regexp.MustCompile(`\s+`) - -const snippetWindow = 300 - -// makeSnippet returns ~300 characters of body around the earliest substring -// match of any query token. When no token has an exact substring span (e.g. -// matched only via Levenshtein, or the hit was folder-name-only), it falls -// back to the first ~300 chars of the body with the leading heading stripped. -// Returns "" only when the body itself is empty. -func makeSnippet(body, bodyLower string, tokens []string) string { - pos := -1 - for _, t := range tokens { - i := strings.Index(bodyLower, t) - if i < 0 { - continue - } - if pos < 0 || i < pos { - pos = i - } - } - if pos < 0 { - return makeStub(body) - } - - half := snippetWindow / 2 - start := pos - half - if start < 0 { - start = 0 - } - end := pos + half - if end > len(body) { - end = len(body) - } - start, end = expandToWordBoundaries(body, start, end) - out := snippetWS.ReplaceAllString(body[start:end], " ") - out = strings.TrimSpace(out) - if start > 0 { - out = "…" + out - } - if end < len(body) { - out = out + "…" - } - return out -} - -// makeStub returns ~snippetWindow chars from the start of body, with the -// leading "# Heading" line stripped. Returns "" for an empty body. -func makeStub(body string) string { - stripped := string(stripFirstHeading([]byte(body))) - stripped = strings.TrimSpace(stripped) - if stripped == "" { - return "" - } - end := snippetWindow - if end > len(stripped) { - end = len(stripped) - } - _, end = expandToWordBoundaries(stripped, 0, end) - out := snippetWS.ReplaceAllString(stripped[:end], " ") - out = strings.TrimSpace(out) - if end < len(stripped) { - out = out + "…" - } - return out -} - -// expandToWordBoundaries adjusts start/end so they don't split a word and -// don't fall in the middle of a UTF-8 sequence. start moves forward past -// any partial word at the beginning; end moves backward to the previous -// word boundary. -func expandToWordBoundaries(s string, start, end int) (int, int) { - for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 { - start-- - } - for end < len(s) && s[end]&0xC0 == 0x80 { - end++ - } - if start > 0 && start < len(s) && isWordByte(s[start-1]) && isWordByte(s[start]) { - for start < end && isWordByte(s[start]) { - start++ - } - } - if end < len(s) && isWordByte(s[end-1]) && isWordByte(s[end]) { - for end > start && isWordByte(s[end-1]) { - end-- - } - } - return start, end -} - -func isWordByte(b byte) bool { - if b&0x80 != 0 { - return true // assume any multibyte char is part of a word - } - return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') -} - // levenshtein returns the edit distance between a and b. Operates on runes so // multi-byte characters count as one edit. func levenshtein(a, b string) int {