Update search function
This commit is contained in:
@@ -9,7 +9,6 @@
|
|||||||
<article class="search-card">
|
<article class="search-card">
|
||||||
<a href="{{.URL}}">{{.Name}}</a>
|
<a href="{{.URL}}">{{.Name}}</a>
|
||||||
<div class="muted">/{{.Path}}</div>
|
<div class="muted">/{{.Path}}</div>
|
||||||
{{if .Snippet}}<div>{{.Snippet}}</div>{{end}}
|
|
||||||
</article>
|
</article>
|
||||||
{{end}}
|
{{end}}
|
||||||
{{else}}
|
{{else}}
|
||||||
|
|||||||
@@ -4,21 +4,17 @@ import (
|
|||||||
"io/fs"
|
"io/fs"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode"
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
type searchResult struct {
|
type searchResult struct {
|
||||||
Name string
|
Name string
|
||||||
URL string
|
URL string
|
||||||
Path string
|
Path string
|
||||||
Score int // number of query tokens that hit
|
Score int
|
||||||
NameHit bool // at least one hit came from the folder name
|
|
||||||
Snippet string // ~300 chars around first body hit, or page stub for name-only hits
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type searchPageData struct {
|
type searchPageData struct {
|
||||||
@@ -52,15 +48,15 @@ func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// searchWiki walks root and scores each directory by how many whitespace-split
|
// searchWiki walks root and scores each directory by how well the folder name
|
||||||
// query tokens hit a word in either the folder name or its index.md body.
|
// matches the query. Page contents are not searched. Higher score = more
|
||||||
// A word "hits" a token via case-insensitive equality or Levenshtein ≤ 2.
|
// relevant; exact matches rank first.
|
||||||
// Folder-name hits break score ties above content-only hits.
|
|
||||||
func searchWiki(root, query string) []searchResult {
|
func searchWiki(root, query string) []searchResult {
|
||||||
if query == "" {
|
if query == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
qTokens := tokenize(query)
|
qLower := strings.ToLower(query)
|
||||||
|
qTokens := tokenize(qLower)
|
||||||
if len(qTokens) == 0 {
|
if len(qTokens) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -78,40 +74,19 @@ func searchWiki(root, query string) []searchResult {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
name := d.Name()
|
name := d.Name()
|
||||||
body, _ := os.ReadFile(filepath.Join(fsPath, "index.md"))
|
score := scoreName(strings.ToLower(name), qLower, qTokens)
|
||||||
|
|
||||||
nameWords := tokenize(name)
|
|
||||||
bodyStr := string(body)
|
|
||||||
bodyLower := strings.ToLower(bodyStr)
|
|
||||||
bodyWords := tokenize(bodyLower)
|
|
||||||
|
|
||||||
score := 0
|
|
||||||
nameHit := false
|
|
||||||
for _, qt := range qTokens {
|
|
||||||
inName := tokenInWords(qt, nameWords)
|
|
||||||
inBody := tokenInWords(qt, bodyWords)
|
|
||||||
if inName || inBody {
|
|
||||||
score++
|
|
||||||
}
|
|
||||||
if inName {
|
|
||||||
nameHit = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if score == 0 {
|
if score == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
rel, relErr := filepath.Rel(walkRoot, fsPath)
|
rel, relErr := filepath.Rel(walkRoot, fsPath)
|
||||||
if relErr != nil {
|
if relErr != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
results = append(results, searchResult{
|
results = append(results, searchResult{
|
||||||
Name: name,
|
Name: name,
|
||||||
URL: "/" + filepath.ToSlash(rel) + "/",
|
URL: "/" + filepath.ToSlash(rel) + "/",
|
||||||
Path: filepath.ToSlash(rel),
|
Path: filepath.ToSlash(rel),
|
||||||
Score: score,
|
Score: score,
|
||||||
NameHit: nameHit,
|
|
||||||
Snippet: makeSnippet(bodyStr, bodyLower, qTokens),
|
|
||||||
})
|
})
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
@@ -120,14 +95,52 @@ func searchWiki(root, query string) []searchResult {
|
|||||||
if results[i].Score != results[j].Score {
|
if results[i].Score != results[j].Score {
|
||||||
return results[i].Score > results[j].Score
|
return results[i].Score > results[j].Score
|
||||||
}
|
}
|
||||||
if results[i].NameHit != results[j].NameHit {
|
di, dj := strings.Count(results[i].Path, "/"), strings.Count(results[j].Path, "/")
|
||||||
return results[i].NameHit
|
if di != dj {
|
||||||
|
return di < dj
|
||||||
}
|
}
|
||||||
return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name)
|
return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name)
|
||||||
})
|
})
|
||||||
return results
|
return results
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// scoreName ranks how well nameLower matches the query. Whole-name exact
|
||||||
|
// match dominates; otherwise score is the sum of each token's best match
|
||||||
|
// against the words in the name. Position within the name does not matter —
|
||||||
|
// nesting depth is the tiebreaker, applied by the caller.
|
||||||
|
func scoreName(nameLower, qLower string, qTokens []string) int {
|
||||||
|
if nameLower == qLower {
|
||||||
|
return 1000
|
||||||
|
}
|
||||||
|
score := 0
|
||||||
|
nameWords := tokenize(nameLower)
|
||||||
|
for _, qt := range qTokens {
|
||||||
|
best := 0
|
||||||
|
for _, w := range nameWords {
|
||||||
|
switch {
|
||||||
|
case w == qt:
|
||||||
|
if best < 100 {
|
||||||
|
best = 100
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(w, qt):
|
||||||
|
if best < 50 {
|
||||||
|
best = 50
|
||||||
|
}
|
||||||
|
case strings.Contains(w, qt):
|
||||||
|
if best < 20 {
|
||||||
|
best = 20
|
||||||
|
}
|
||||||
|
case levenshtein(w, qt) <= 2:
|
||||||
|
if best < 5 {
|
||||||
|
best = 5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
score += best
|
||||||
|
}
|
||||||
|
return score
|
||||||
|
}
|
||||||
|
|
||||||
// resolveWalkRoot resolves symlinks so WalkDir descends into the real tree
|
// resolveWalkRoot resolves symlinks so WalkDir descends into the real tree
|
||||||
// even when the configured wiki root is itself a symlink (as on the NAS).
|
// even when the configured wiki root is itself a symlink (as on the NAS).
|
||||||
func resolveWalkRoot(root string) string {
|
func resolveWalkRoot(root string) string {
|
||||||
@@ -172,117 +185,6 @@ func tokenize(s string) []string {
|
|||||||
return tokens
|
return tokens
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenInWords reports whether qt matches any word exactly or within
|
|
||||||
// Levenshtein distance 2. qt and words must already be lowercase.
|
|
||||||
func tokenInWords(qt string, words []string) bool {
|
|
||||||
for _, w := range words {
|
|
||||||
if w == qt {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
if levenshtein(w, qt) <= 2 {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
var snippetWS = regexp.MustCompile(`\s+`)
|
|
||||||
|
|
||||||
const snippetWindow = 300
|
|
||||||
|
|
||||||
// makeSnippet returns ~300 characters of body around the earliest substring
|
|
||||||
// match of any query token. When no token has an exact substring span (e.g.
|
|
||||||
// matched only via Levenshtein, or the hit was folder-name-only), it falls
|
|
||||||
// back to the first ~300 chars of the body with the leading heading stripped.
|
|
||||||
// Returns "" only when the body itself is empty.
|
|
||||||
func makeSnippet(body, bodyLower string, tokens []string) string {
|
|
||||||
pos := -1
|
|
||||||
for _, t := range tokens {
|
|
||||||
i := strings.Index(bodyLower, t)
|
|
||||||
if i < 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if pos < 0 || i < pos {
|
|
||||||
pos = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if pos < 0 {
|
|
||||||
return makeStub(body)
|
|
||||||
}
|
|
||||||
|
|
||||||
half := snippetWindow / 2
|
|
||||||
start := pos - half
|
|
||||||
if start < 0 {
|
|
||||||
start = 0
|
|
||||||
}
|
|
||||||
end := pos + half
|
|
||||||
if end > len(body) {
|
|
||||||
end = len(body)
|
|
||||||
}
|
|
||||||
start, end = expandToWordBoundaries(body, start, end)
|
|
||||||
out := snippetWS.ReplaceAllString(body[start:end], " ")
|
|
||||||
out = strings.TrimSpace(out)
|
|
||||||
if start > 0 {
|
|
||||||
out = "…" + out
|
|
||||||
}
|
|
||||||
if end < len(body) {
|
|
||||||
out = out + "…"
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// makeStub returns ~snippetWindow chars from the start of body, with the
|
|
||||||
// leading "# Heading" line stripped. Returns "" for an empty body.
|
|
||||||
func makeStub(body string) string {
|
|
||||||
stripped := string(stripFirstHeading([]byte(body)))
|
|
||||||
stripped = strings.TrimSpace(stripped)
|
|
||||||
if stripped == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
end := snippetWindow
|
|
||||||
if end > len(stripped) {
|
|
||||||
end = len(stripped)
|
|
||||||
}
|
|
||||||
_, end = expandToWordBoundaries(stripped, 0, end)
|
|
||||||
out := snippetWS.ReplaceAllString(stripped[:end], " ")
|
|
||||||
out = strings.TrimSpace(out)
|
|
||||||
if end < len(stripped) {
|
|
||||||
out = out + "…"
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// expandToWordBoundaries adjusts start/end so they don't split a word and
|
|
||||||
// don't fall in the middle of a UTF-8 sequence. start moves forward past
|
|
||||||
// any partial word at the beginning; end moves backward to the previous
|
|
||||||
// word boundary.
|
|
||||||
func expandToWordBoundaries(s string, start, end int) (int, int) {
|
|
||||||
for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 {
|
|
||||||
start--
|
|
||||||
}
|
|
||||||
for end < len(s) && s[end]&0xC0 == 0x80 {
|
|
||||||
end++
|
|
||||||
}
|
|
||||||
if start > 0 && start < len(s) && isWordByte(s[start-1]) && isWordByte(s[start]) {
|
|
||||||
for start < end && isWordByte(s[start]) {
|
|
||||||
start++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if end < len(s) && isWordByte(s[end-1]) && isWordByte(s[end]) {
|
|
||||||
for end > start && isWordByte(s[end-1]) {
|
|
||||||
end--
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return start, end
|
|
||||||
}
|
|
||||||
|
|
||||||
func isWordByte(b byte) bool {
|
|
||||||
if b&0x80 != 0 {
|
|
||||||
return true // assume any multibyte char is part of a word
|
|
||||||
}
|
|
||||||
return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
|
|
||||||
}
|
|
||||||
|
|
||||||
// levenshtein returns the edit distance between a and b. Operates on runes so
|
// levenshtein returns the edit distance between a and b. Operates on runes so
|
||||||
// multi-byte characters count as one edit.
|
// multi-byte characters count as one edit.
|
||||||
func levenshtein(a, b string) int {
|
func levenshtein(a, b string) int {
|
||||||
|
|||||||
Reference in New Issue
Block a user