328 lines
8.0 KiB
Go
328 lines
8.0 KiB
Go
package main
|
|
|
|
import (
	"io/fs"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"unicode"
)
|
|
|
|
// searchResult describes one wiki directory that matched a search query.
type searchResult struct {
	// Name is the matched directory's own name (last path element).
	Name string
	// URL is the site-relative link to the directory: "/" + Path + "/".
	URL string
	// Path is the slash-separated path of the directory relative to the
	// walked wiki root.
	Path string
	// Score counts how many query tokens hit the folder name or the body.
	Score int
	// NameHit is true when at least one of those hits came from the folder
	// name rather than the index.md body.
	NameHit bool
	// Snippet holds roughly 300 bytes of body text around the first body
	// hit, or a stub taken from the start of the page for name-only hits.
	Snippet string
}
|
|
|
|
type searchPageData struct {
|
|
Title string
|
|
Crumbs []crumb
|
|
EditMode bool
|
|
Query string
|
|
Results []searchResult
|
|
}
|
|
|
|
// handleSearch walks the wiki root and renders a search results page for the
|
|
// query in r.URL.Query().Get("q"). Only invoked when path is "/" and "q" is
|
|
// present.
|
|
func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|
query := strings.TrimSpace(r.URL.Query().Get("q"))
|
|
results := searchWiki(h.root, query)
|
|
|
|
title := "Search"
|
|
if query != "" {
|
|
title = "Search: " + query
|
|
}
|
|
data := searchPageData{
|
|
Title: title,
|
|
Crumbs: []crumb{{Name: "search", URL: "/?q=" + query}},
|
|
Query: query,
|
|
Results: results,
|
|
}
|
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
|
if err := searchTmpl.ExecuteTemplate(w, "layout", data); err != nil {
|
|
log.Printf("search template error: %v", err)
|
|
}
|
|
}
|
|
|
|
// searchWiki walks root and scores each directory by how many whitespace-split
|
|
// query tokens hit a word in either the folder name or its index.md body.
|
|
// A word "hits" a token via case-insensitive equality or Levenshtein ≤ 2.
|
|
// Folder-name hits break score ties above content-only hits.
|
|
func searchWiki(root, query string) []searchResult {
|
|
if query == "" {
|
|
return nil
|
|
}
|
|
qTokens := tokenize(query)
|
|
if len(qTokens) == 0 {
|
|
return nil
|
|
}
|
|
|
|
walkRoot := resolveWalkRoot(root)
|
|
var results []searchResult
|
|
_ = filepath.WalkDir(walkRoot, func(fsPath string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
if skip, walkErr := hiddenSkip(fsPath, walkRoot, d); skip {
|
|
return walkErr
|
|
}
|
|
if !d.IsDir() || fsPath == walkRoot {
|
|
return nil
|
|
}
|
|
name := d.Name()
|
|
body, _ := os.ReadFile(filepath.Join(fsPath, "index.md"))
|
|
|
|
nameWords := tokenize(name)
|
|
bodyStr := string(body)
|
|
bodyLower := strings.ToLower(bodyStr)
|
|
bodyWords := tokenize(bodyLower)
|
|
|
|
score := 0
|
|
nameHit := false
|
|
for _, qt := range qTokens {
|
|
inName := tokenInWords(qt, nameWords)
|
|
inBody := tokenInWords(qt, bodyWords)
|
|
if inName || inBody {
|
|
score++
|
|
}
|
|
if inName {
|
|
nameHit = true
|
|
}
|
|
}
|
|
if score == 0 {
|
|
return nil
|
|
}
|
|
|
|
rel, relErr := filepath.Rel(walkRoot, fsPath)
|
|
if relErr != nil {
|
|
return nil
|
|
}
|
|
results = append(results, searchResult{
|
|
Name: name,
|
|
URL: "/" + filepath.ToSlash(rel) + "/",
|
|
Path: filepath.ToSlash(rel),
|
|
Score: score,
|
|
NameHit: nameHit,
|
|
Snippet: makeSnippet(bodyStr, bodyLower, qTokens),
|
|
})
|
|
return nil
|
|
})
|
|
|
|
sort.SliceStable(results, func(i, j int) bool {
|
|
if results[i].Score != results[j].Score {
|
|
return results[i].Score > results[j].Score
|
|
}
|
|
if results[i].NameHit != results[j].NameHit {
|
|
return results[i].NameHit
|
|
}
|
|
return strings.ToLower(results[i].Name) < strings.ToLower(results[j].Name)
|
|
})
|
|
return results
|
|
}
|
|
|
|
// resolveWalkRoot resolves symlinks so WalkDir descends into the real tree
|
|
// even when the configured wiki root is itself a symlink (as on the NAS).
|
|
func resolveWalkRoot(root string) string {
|
|
if r, err := filepath.EvalSymlinks(root); err == nil {
|
|
return r
|
|
}
|
|
return root
|
|
}
|
|
|
|
// hiddenSkip decides how a WalkDir callback should treat dot-prefixed
// entries. It returns (skipped, walkErr):
//
//   - (false, nil): the name does not start with "."; process the entry.
//   - (true, filepath.SkipDir): a hidden directory other than walkRoot;
//     returning walkErr prunes its whole subtree.
//   - (true, nil): a hidden file, or walkRoot itself when its name starts
//     with "."; returning walkErr simply moves on to the next entry.
//
// When skipped is true the caller should `return walkErr`.
func hiddenSkip(fsPath, walkRoot string, d fs.DirEntry) (bool, error) {
	switch {
	case !strings.HasPrefix(d.Name(), "."):
		return false, nil
	case d.IsDir() && fsPath != walkRoot:
		return true, filepath.SkipDir
	default:
		return true, nil
	}
}
|
|
|
|
// tokenize breaks s into lowercase words. A word is a maximal run of runes
// that are letters or digits according to the unicode package; every other
// rune acts as a separator and is dropped. Non-ASCII letters such as umlauts
// are therefore kept inside their word. It returns nil when s contains no
// letters or digits.
func tokenize(s string) []string {
	var tokens []string
	wordStart := -1 // byte offset where the current word began, -1 if none
	for i, r := range s {
		inWord := unicode.IsLetter(r) || unicode.IsDigit(r)
		switch {
		case inWord && wordStart < 0:
			wordStart = i
		case !inWord && wordStart >= 0:
			tokens = append(tokens, strings.ToLower(s[wordStart:i]))
			wordStart = -1
		}
	}
	// Flush a word that runs to the end of the input.
	if wordStart >= 0 {
		tokens = append(tokens, strings.ToLower(s[wordStart:]))
	}
	return tokens
}
|
|
|
|
// tokenInWords reports whether qt matches any word exactly or within
|
|
// Levenshtein distance 2. qt and words must already be lowercase.
|
|
func tokenInWords(qt string, words []string) bool {
|
|
for _, w := range words {
|
|
if w == qt {
|
|
return true
|
|
}
|
|
if levenshtein(w, qt) <= 2 {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// snippetWS matches any run of whitespace; snippets use it to collapse such
// runs (including newlines) into a single space.
var snippetWS = regexp.MustCompile(`\s+`)

// snippetWindow is the target snippet size, measured in bytes of body text.
const snippetWindow = 300
|
|
|
|
// makeSnippet returns ~300 characters of body around the earliest substring
|
|
// match of any query token. When no token has an exact substring span (e.g.
|
|
// matched only via Levenshtein, or the hit was folder-name-only), it falls
|
|
// back to the first ~300 chars of the body with the leading heading stripped.
|
|
// Returns "" only when the body itself is empty.
|
|
func makeSnippet(body, bodyLower string, tokens []string) string {
|
|
pos := -1
|
|
for _, t := range tokens {
|
|
i := strings.Index(bodyLower, t)
|
|
if i < 0 {
|
|
continue
|
|
}
|
|
if pos < 0 || i < pos {
|
|
pos = i
|
|
}
|
|
}
|
|
if pos < 0 {
|
|
return makeStub(body)
|
|
}
|
|
|
|
half := snippetWindow / 2
|
|
start := pos - half
|
|
if start < 0 {
|
|
start = 0
|
|
}
|
|
end := pos + half
|
|
if end > len(body) {
|
|
end = len(body)
|
|
}
|
|
start, end = expandToWordBoundaries(body, start, end)
|
|
out := snippetWS.ReplaceAllString(body[start:end], " ")
|
|
out = strings.TrimSpace(out)
|
|
if start > 0 {
|
|
out = "…" + out
|
|
}
|
|
if end < len(body) {
|
|
out = out + "…"
|
|
}
|
|
return out
|
|
}
|
|
|
|
// makeStub returns ~snippetWindow chars from the start of body, with the
|
|
// leading "# Heading" line stripped. Returns "" for an empty body.
|
|
func makeStub(body string) string {
|
|
stripped := string(stripFirstHeading([]byte(body)))
|
|
stripped = strings.TrimSpace(stripped)
|
|
if stripped == "" {
|
|
return ""
|
|
}
|
|
end := snippetWindow
|
|
if end > len(stripped) {
|
|
end = len(stripped)
|
|
}
|
|
_, end = expandToWordBoundaries(stripped, 0, end)
|
|
out := snippetWS.ReplaceAllString(stripped[:end], " ")
|
|
out = strings.TrimSpace(out)
|
|
if end < len(stripped) {
|
|
out = out + "…"
|
|
}
|
|
return out
|
|
}
|
|
|
|
// expandToWordBoundaries adjusts the byte window [start, end) of s so that
// it neither begins or ends inside a UTF-8 sequence nor splits a word.
//
// Out-of-range arguments are first normalized so that
// 0 <= start <= end <= len(s). Then:
//
//  1. start is moved back to the first byte of the rune it points into, and
//     end is moved forward past any continuation bytes.
//  2. If start sits in the middle of a word, it is advanced past the rest of
//     that word (never beyond end).
//  3. If end sits in the middle of a word, it is moved back to the start of
//     that word (never before start).
//
// Despite the name, the word-boundary steps only ever shrink the window. The
// returned pair always satisfies 0 <= start <= end <= len(s), so s[start:end]
// is safe.
func expandToWordBoundaries(s string, start, end int) (int, int) {
	// Normalize the window so no index below can go out of range.
	if end > len(s) {
		end = len(s)
	}
	if end < 0 {
		end = 0
	}
	if start < 0 {
		start = 0
	}
	if start > end {
		start = end
	}

	// Snap both edges to rune boundaries (0b10xxxxxx marks a continuation
	// byte).
	for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 {
		start--
	}
	for end < len(s) && s[end]&0xC0 == 0x80 {
		end++
	}

	// Drop a partial word at the front.
	if start > 0 && start < len(s) && isWordByte(s[start-1]) && isWordByte(s[start]) {
		for start < end && isWordByte(s[start]) {
			start++
		}
	}
	// Drop a partial word at the back. The end > 0 guard keeps s[end-1] in
	// range for an empty window at the very start of s.
	if end > 0 && end < len(s) && isWordByte(s[end-1]) && isWordByte(s[end]) {
		for end > start && isWordByte(s[end-1]) {
			end--
		}
	}
	return start, end
}

// isWordByte reports whether b can be part of a word: an ASCII letter or
// digit, or any byte with the high bit set (every byte of a multi-byte UTF-8
// sequence is assumed to belong to a word).
func isWordByte(b byte) bool {
	if b&0x80 != 0 {
		return true // assume any multibyte char is part of a word
	}
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}
|
|
|
|
// levenshtein computes the edit distance between a and b: the minimum number
// of single-rune insertions, deletions and substitutions needed to turn one
// into the other. Both strings are compared rune by rune, so a multi-byte
// character counts as a single edit.
func levenshtein(a, b string) int {
	src, dst := []rune(a), []rune(b)
	switch {
	case len(src) == 0:
		return len(dst)
	case len(dst) == 0:
		return len(src)
	}

	// row[j] holds the distance between the processed prefix of src and
	// dst[:j]. A single row suffices: the only value needed from the
	// previous row beyond row[j+1] itself is the diagonal, kept in diag.
	row := make([]int, len(dst)+1)
	for j := range row {
		row[j] = j
	}
	for i, sr := range src {
		diag := row[0]
		row[0] = i + 1
		for j, dr := range dst {
			above := row[j+1]
			cost := 1
			if sr == dr {
				cost = 0
			}
			row[j+1] = min3(above+1, row[j]+1, diag+cost)
			diag = above
		}
	}
	return row[len(dst)]
}

// min3 returns the smallest of its three arguments.
func min3(a, b, c int) int {
	if b < a {
		a = b
	}
	if c < a {
		a = c
	}
	return a
}
|