Update search function
This commit is contained in:
@@ -4,21 +4,17 @@ import (
|
||||
"io/fs"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// searchResult is one directory matched by a wiki search.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name. NameHit and Snippet are kept so that code which still sets or
// reads them continues to compile; code that only uses the first four fields
// simply leaves them at their zero values.
type searchResult struct {
	Name    string // folder name as reported by the directory walk
	URL     string // site-relative link: "/" + slash-separated Path + "/"
	Path    string // slash-separated path relative to the walk root
	Score   int    // relevance; results are ordered by descending Score
	NameHit bool   // at least one hit came from the folder name
	Snippet string // ~300 chars around first body hit, or page stub for name-only hits
}
|
||||
|
||||
type searchPageData struct {
|
||||
@@ -52,15 +48,15 @@ func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// searchWiki walks root and scores each directory by how many whitespace-split
|
||||
// query tokens hit a word in either the folder name or its index.md body.
|
||||
// A word "hits" a token via case-insensitive equality or Levenshtein ≤ 2.
|
||||
// Folder-name hits break score ties above content-only hits.
|
||||
// searchWiki walks root and scores each directory by how well the folder name
|
||||
// matches the query. Page contents are not searched. Higher score = more
|
||||
// relevant; exact matches rank first.
|
||||
func searchWiki(root, query string) []searchResult {
|
||||
if query == "" {
|
||||
return nil
|
||||
}
|
||||
qTokens := tokenize(query)
|
||||
qLower := strings.ToLower(query)
|
||||
qTokens := tokenize(qLower)
|
||||
if len(qTokens) == 0 {
|
||||
return nil
|
||||
}
|
||||
@@ -78,40 +74,19 @@ func searchWiki(root, query string) []searchResult {
|
||||
return nil
|
||||
}
|
||||
name := d.Name()
|
||||
body, _ := os.ReadFile(filepath.Join(fsPath, "index.md"))
|
||||
|
||||
nameWords := tokenize(name)
|
||||
bodyStr := string(body)
|
||||
bodyLower := strings.ToLower(bodyStr)
|
||||
bodyWords := tokenize(bodyLower)
|
||||
|
||||
score := 0
|
||||
nameHit := false
|
||||
for _, qt := range qTokens {
|
||||
inName := tokenInWords(qt, nameWords)
|
||||
inBody := tokenInWords(qt, bodyWords)
|
||||
if inName || inBody {
|
||||
score++
|
||||
}
|
||||
if inName {
|
||||
nameHit = true
|
||||
}
|
||||
}
|
||||
score := scoreName(strings.ToLower(name), qLower, qTokens)
|
||||
if score == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
rel, relErr := filepath.Rel(walkRoot, fsPath)
|
||||
if relErr != nil {
|
||||
return nil
|
||||
}
|
||||
results = append(results, searchResult{
|
||||
Name: name,
|
||||
URL: "/" + filepath.ToSlash(rel) + "/",
|
||||
Path: filepath.ToSlash(rel),
|
||||
Score: score,
|
||||
NameHit: nameHit,
|
||||
Snippet: makeSnippet(bodyStr, bodyLower, qTokens),
|
||||
Name: name,
|
||||
URL: "/" + filepath.ToSlash(rel) + "/",
|
||||
Path: filepath.ToSlash(rel),
|
||||
Score: score,
|
||||
})
|
||||
return nil
|
||||
})
|
||||
@@ -120,14 +95,52 @@ func searchWiki(root, query string) []searchResult {
|
||||
if results[i].Score != results[j].Score {
|
||||
return results[i].Score > results[j].Score
|
||||
}
|
||||
if results[i].NameHit != results[j].NameHit {
|
||||
return results[i].NameHit
|
||||
di, dj := strings.Count(results[i].Path, "/"), strings.Count(results[j].Path, "/")
|
||||
if di != dj {
|
||||
return di < dj
|
||||
}
|
||||
return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name)
|
||||
})
|
||||
return results
|
||||
}
|
||||
|
||||
// scoreName ranks how well nameLower matches the query. Whole-name exact
|
||||
// match dominates; otherwise score is the sum of each token's best match
|
||||
// against the words in the name. Position within the name does not matter —
|
||||
// nesting depth is the tiebreaker, applied by the caller.
|
||||
func scoreName(nameLower, qLower string, qTokens []string) int {
|
||||
if nameLower == qLower {
|
||||
return 1000
|
||||
}
|
||||
score := 0
|
||||
nameWords := tokenize(nameLower)
|
||||
for _, qt := range qTokens {
|
||||
best := 0
|
||||
for _, w := range nameWords {
|
||||
switch {
|
||||
case w == qt:
|
||||
if best < 100 {
|
||||
best = 100
|
||||
}
|
||||
case strings.HasPrefix(w, qt):
|
||||
if best < 50 {
|
||||
best = 50
|
||||
}
|
||||
case strings.Contains(w, qt):
|
||||
if best < 20 {
|
||||
best = 20
|
||||
}
|
||||
case levenshtein(w, qt) <= 2:
|
||||
if best < 5 {
|
||||
best = 5
|
||||
}
|
||||
}
|
||||
}
|
||||
score += best
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// resolveWalkRoot resolves symlinks so WalkDir descends into the real tree
|
||||
// even when the configured wiki root is itself a symlink (as on the NAS).
|
||||
func resolveWalkRoot(root string) string {
|
||||
@@ -172,117 +185,6 @@ func tokenize(s string) []string {
|
||||
return tokens
|
||||
}
|
||||
|
||||
// tokenInWords reports whether qt matches any word exactly or within
|
||||
// Levenshtein distance 2. qt and words must already be lowercase.
|
||||
func tokenInWords(qt string, words []string) bool {
|
||||
for _, w := range words {
|
||||
if w == qt {
|
||||
return true
|
||||
}
|
||||
if levenshtein(w, qt) <= 2 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// snippetWS matches a run of one or more whitespace characters. The snippet
// builders use it to collapse each such run into a single space.
var snippetWS = regexp.MustCompile(`\s+`)
|
||||
|
||||
// snippetWindow is the target snippet size. It is compared against len() of
// the text, so it is measured in bytes rather than characters.
const snippetWindow = 300
|
||||
|
||||
// makeSnippet returns ~300 characters of body around the earliest substring
|
||||
// match of any query token. When no token has an exact substring span (e.g.
|
||||
// matched only via Levenshtein, or the hit was folder-name-only), it falls
|
||||
// back to the first ~300 chars of the body with the leading heading stripped.
|
||||
// Returns "" only when the body itself is empty.
|
||||
func makeSnippet(body, bodyLower string, tokens []string) string {
|
||||
pos := -1
|
||||
for _, t := range tokens {
|
||||
i := strings.Index(bodyLower, t)
|
||||
if i < 0 {
|
||||
continue
|
||||
}
|
||||
if pos < 0 || i < pos {
|
||||
pos = i
|
||||
}
|
||||
}
|
||||
if pos < 0 {
|
||||
return makeStub(body)
|
||||
}
|
||||
|
||||
half := snippetWindow / 2
|
||||
start := pos - half
|
||||
if start < 0 {
|
||||
start = 0
|
||||
}
|
||||
end := pos + half
|
||||
if end > len(body) {
|
||||
end = len(body)
|
||||
}
|
||||
start, end = expandToWordBoundaries(body, start, end)
|
||||
out := snippetWS.ReplaceAllString(body[start:end], " ")
|
||||
out = strings.TrimSpace(out)
|
||||
if start > 0 {
|
||||
out = "…" + out
|
||||
}
|
||||
if end < len(body) {
|
||||
out = out + "…"
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// makeStub returns ~snippetWindow chars from the start of body, with the
|
||||
// leading "# Heading" line stripped. Returns "" for an empty body.
|
||||
func makeStub(body string) string {
|
||||
stripped := string(stripFirstHeading([]byte(body)))
|
||||
stripped = strings.TrimSpace(stripped)
|
||||
if stripped == "" {
|
||||
return ""
|
||||
}
|
||||
end := snippetWindow
|
||||
if end > len(stripped) {
|
||||
end = len(stripped)
|
||||
}
|
||||
_, end = expandToWordBoundaries(stripped, 0, end)
|
||||
out := snippetWS.ReplaceAllString(stripped[:end], " ")
|
||||
out = strings.TrimSpace(out)
|
||||
if end < len(stripped) {
|
||||
out = out + "…"
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// expandToWordBoundaries adjusts the byte window [start, end) of s so that it
// does not fall in the middle of a UTF-8 sequence and, where possible, does
// not begin or end in the middle of a word.
//
// Steps, in order:
//  1. start and end are clamped into [0, len(s)] with start <= end, so the
//     returned pair is always safe to use as s[start:end].
//  2. start moves back and end moves forward off any UTF-8 continuation byte.
//  3. If start sits inside a word it moves forward past that partial word;
//     if end sits inside a word it moves backward to the previous boundary.
//
// When step 3 would consume the whole window (the window lies entirely
// inside one word, e.g. a long URL or text without ASCII separators), that
// side keeps its rune-aligned position from step 2 instead, so a non-empty
// window is never collapsed to nothing.
func expandToWordBoundaries(s string, start, end int) (int, int) {
	if end > len(s) {
		end = len(s)
	}
	if end < 0 {
		end = 0
	}
	if start < 0 {
		start = 0
	}
	if start > end {
		start = end
	}

	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes.
	for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 {
		start--
	}
	for end < len(s) && s[end]&0xC0 == 0x80 {
		end++
	}

	lo, hi := start, end
	if lo > 0 && lo < len(s) && isWordByte(s[lo-1]) && isWordByte(s[lo]) {
		for lo < hi && isWordByte(s[lo]) {
			lo++
		}
		if lo == hi {
			// The whole window is word bytes; keep the aligned start.
			lo = start
		}
	}
	// hi > 0 guards the s[hi-1] access for an empty window at offset 0.
	if hi > 0 && hi < len(s) && isWordByte(s[hi-1]) && isWordByte(s[hi]) {
		for hi > lo && isWordByte(s[hi-1]) {
			hi--
		}
		if hi == lo {
			// Trimming would leave nothing; keep the aligned end.
			hi = end
		}
	}
	return lo, hi
}

// isWordByte reports whether b counts as part of a word: an ASCII letter or
// digit, or any byte with the high bit set (every byte of a multi-byte UTF-8
// character is treated as a word byte).
func isWordByte(b byte) bool {
	if b&0x80 != 0 {
		return true // assume any multibyte char is part of a word
	}
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}
|
||||
|
||||
// levenshtein returns the edit distance between a and b. Operates on runes so
|
||||
// multi-byte characters count as one edit.
|
||||
func levenshtein(a, b string) int {
|
||||
|
||||
Reference in New Issue
Block a user