// datascape/search.go

package main

import (
	"io/fs"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"unicode"
)

// searchResult describes one wiki directory that matched a search query, as
// assembled by searchWiki.
type searchResult struct {
	// Name is the directory's base name.
	Name    string
	// URL is "/" + Path + "/".
	URL     string
	// Path is the slash-separated directory path relative to the walk root.
	Path    string
	Score   int    // number of query tokens that hit
	NameHit bool   // at least one hit came from the folder name
	Snippet string // ~300 chars around first body hit, or page stub for name-only hits
}

// searchPageData is the value handleSearch passes to the "layout" template
// when rendering the search results page.
type searchPageData struct {
	// Title is "Search", or "Search: <query>" when a query was given.
	Title    string
	// Crumbs is the breadcrumb trail; handleSearch supplies a single "search" crumb.
	Crumbs   []crumb
	// EditMode is never set by handleSearch, so it stays false on this page.
	// NOTE(review): presumably consumed by the shared layout template — confirm.
	EditMode bool
	// Query is the whitespace-trimmed "q" parameter.
	Query    string
	// Results holds the matches returned by searchWiki, best first.
	Results  []searchResult
}

// handleSearch walks the wiki root and renders a search results page for the
// query in r.URL.Query().Get("q"). Only invoked when path is "/" and "q" is
// present.
//
// The query is trimmed of surrounding whitespace before use. Template
// failures are logged rather than reported to the client, because part of the
// response may already have been written by then.
func (h *handler) handleSearch(w http.ResponseWriter, r *http.Request) {
	query := strings.TrimSpace(r.URL.Query().Get("q"))
	results := searchWiki(h.root, query)

	title := "Search"
	if query != "" {
		title = "Search: " + query
	}
	data := searchPageData{
		Title: title,
		// The query is user input: escape it so characters such as '&', '#',
		// '+', '%' or spaces cannot truncate or alter the "q" parameter of
		// the breadcrumb link.
		Crumbs:  []crumb{{Name: "search", URL: "/?q=" + url.QueryEscape(query)}},
		Query:   query,
		Results: results,
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	if err := searchTmpl.ExecuteTemplate(w, "layout", data); err != nil {
		log.Printf("search template error: %v", err)
	}
}

// searchWiki returns every directory below root that matches query, best
// match first.
//
// The query is split into tokens with tokenize; a directory earns one point
// per token that matches a word of its folder name or of its index.md body
// (see tokenInWords for what counts as a match). Directories scoring zero are
// omitted. Results are ordered by score (descending), then folder-name hits
// before body-only hits, then case-insensitively by name. Hidden entries are
// pruned via hiddenSkip and the root directory itself is never a result.
// An empty query, or one without any tokens, yields nil.
func searchWiki(root, query string) []searchResult {
	if query == "" {
		return nil
	}
	terms := tokenize(query)
	if len(terms) == 0 {
		return nil
	}

	base := resolveWalkRoot(root)
	var found []searchResult

	visit := func(p string, entry fs.DirEntry, walkErr error) error {
		// Entries that cannot be read are skipped instead of aborting the walk.
		if walkErr != nil {
			return nil
		}
		if skipped, verdict := hiddenSkip(p, base, entry); skipped {
			return verdict
		}
		if p == base || !entry.IsDir() {
			return nil
		}

		dirName := entry.Name()
		// A missing or unreadable index.md simply means an empty body.
		raw, _ := os.ReadFile(filepath.Join(p, "index.md"))
		text := string(raw)
		lowered := strings.ToLower(text)

		nameWords := tokenize(dirName)
		bodyWords := tokenize(lowered)

		matched, viaName := 0, false
		for _, term := range terms {
			byName := tokenInWords(term, nameWords)
			byBody := tokenInWords(term, bodyWords)
			switch {
			case byName:
				viaName = true
				matched++
			case byBody:
				matched++
			}
		}
		if matched == 0 {
			return nil
		}

		rel, err := filepath.Rel(base, p)
		if err != nil {
			return nil
		}
		slashed := filepath.ToSlash(rel)
		found = append(found, searchResult{
			Name:    dirName,
			URL:     "/" + slashed + "/",
			Path:    slashed,
			Score:   matched,
			NameHit: viaName,
			Snippet: makeSnippet(text, lowered, terms),
		})
		return nil
	}
	// The callback never returns a real error, so the walk result carries no
	// information worth handling.
	_ = filepath.WalkDir(base, visit)

	sort.SliceStable(found, func(i, j int) bool {
		a, b := &found[i], &found[j]
		switch {
		case a.Score != b.Score:
			return a.Score > b.Score
		case a.NameHit != b.NameHit:
			return a.NameHit
		default:
			return strings.ToLower(a.Name) < strings.ToLower(b.Name)
		}
	})
	return found
}

// resolveWalkRoot returns root with all symlinks evaluated, so that WalkDir
// descends into the real tree even when the configured wiki root is itself a
// symlink (as on the NAS). If the path cannot be resolved, root is returned
// unchanged.
func resolveWalkRoot(root string) string {
	resolved, err := filepath.EvalSymlinks(root)
	if err != nil {
		return root
	}
	return resolved
}

// hiddenSkip decides what a WalkDir callback should do with dot-prefixed
// entries. It returns (skipped, walkErr):
//
//   - (false, nil): the entry is not hidden; process it normally.
//   - (true, filepath.SkipDir): hidden directory other than the walk root;
//     returning walkErr prunes its whole subtree.
//   - (true, nil): hidden file, or a walk root that is itself dot-named;
//     returning walkErr just moves on to the next entry.
func hiddenSkip(fsPath, walkRoot string, d fs.DirEntry) (bool, error) {
	switch {
	case !strings.HasPrefix(d.Name(), "."):
		return false, nil
	case d.IsDir() && fsPath != walkRoot:
		return true, filepath.SkipDir
	default:
		return true, nil
	}
}

// tokenize breaks s into lowercase words. A word is a maximal run of runes
// for which unicode.IsLetter or unicode.IsDigit holds; every other rune acts
// as a separator and is dropped. Classification is Unicode-aware, so umlauts
// and other non-ASCII letters stay inside their word. Returns nil when s
// contains no word.
func tokenize(s string) []string {
	var words []string
	wordStart := -1 // byte index where the current word began, -1 when between words
	for i, r := range s {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			if wordStart < 0 {
				wordStart = i
			}
			continue
		}
		if wordStart >= 0 {
			words = append(words, strings.ToLower(s[wordStart:i]))
			wordStart = -1
		}
	}
	if wordStart >= 0 {
		words = append(words, strings.ToLower(s[wordStart:]))
	}
	return words
}

// tokenInWords reports whether any entry of words equals qt or lies within
// an edit distance of two from it (as computed by levenshtein). Both qt and
// words are expected to be lowercase already; no case folding happens here.
func tokenInWords(qt string, words []string) bool {
	const maxEdits = 2
	for i := range words {
		// The equality test is a cheap shortcut ahead of the distance computation.
		if candidate := words[i]; candidate == qt || levenshtein(candidate, qt) <= maxEdits {
			return true
		}
	}
	return false
}

// snippetWS matches runs of whitespace; snippets collapse each run to a
// single space.
var snippetWS = regexp.MustCompile(`\s+`)

// snippetWindow is the target snippet size, in bytes.
const snippetWindow = 300

// makeSnippet returns roughly snippetWindow bytes of body centred on the
// earliest substring match of any query token. When no token occurs as an
// exact substring (e.g. it matched only via Levenshtein, or the hit was
// folder-name-only), it falls back to makeStub(body).
//
// body is the original text and bodyLower must be strings.ToLower(body);
// tokens must already be lowercase. Matching is done on bodyLower, but the
// snippet is cut from body so the original casing is preserved. Runs of
// whitespace are collapsed to one space and "…" marks each side on which
// text was cut off.
func makeSnippet(body, bodyLower string, tokens []string) string {
	pos := -1
	for _, t := range tokens {
		i := strings.Index(bodyLower, t)
		if i < 0 {
			continue
		}
		if pos < 0 || i < pos {
			pos = i
		}
	}
	if pos < 0 {
		return makeStub(body)
	}
	// pos is an offset into bodyLower. Lowercasing can change the encoded
	// length of a rune, so translate it before slicing body; otherwise the
	// window may be misplaced or even lie past the end of body.
	pos = lowerOffsetToBody(body, bodyLower, pos)

	half := snippetWindow / 2
	start := pos - half
	if start < 0 {
		start = 0
	}
	end := pos + half
	if end > len(body) {
		end = len(body)
	}
	start, end = expandToWordBoundaries(body, start, end)
	out := snippetWS.ReplaceAllString(body[start:end], " ")
	out = strings.TrimSpace(out)
	if start > 0 {
		out = "…" + out
	}
	if end < len(body) {
		out = out + "…"
	}
	return out
}

// lowerOffsetToBody converts a byte offset in bodyLower into the byte offset
// of the corresponding rune in body. It relies on bodyLower being a
// rune-for-rune lowercase mapping of body (as strings.ToLower produces), so
// the n-th rune of one corresponds to the n-th rune of the other even when
// their encoded widths differ. Returns len(body) if body has fewer runes than
// precede off in bodyLower.
func lowerOffsetToBody(body, bodyLower string, off int) int {
	runesBefore := 0
	for i := range bodyLower {
		if i >= off {
			break
		}
		runesBefore++
	}
	for i := range body {
		if runesBefore == 0 {
			return i
		}
		runesBefore--
	}
	return len(body)
}

// makeStub builds a preview from the beginning of body: the leading heading
// is removed with stripFirstHeading, surrounding whitespace is trimmed, and
// roughly the first snippetWindow bytes are kept, cut back to a word boundary
// by expandToWordBoundaries. Whitespace runs collapse to a single space and
// "…" is appended when text was cut off. Returns "" when nothing remains
// after stripping and trimming.
func makeStub(body string) string {
	text := strings.TrimSpace(string(stripFirstHeading([]byte(body))))
	if len(text) == 0 {
		return ""
	}
	limit := len(text)
	if limit > snippetWindow {
		limit = snippetWindow
	}
	_, cut := expandToWordBoundaries(text, 0, limit)
	stub := strings.TrimSpace(snippetWS.ReplaceAllString(text[:cut], " "))
	if cut < len(text) {
		return stub + "…"
	}
	return stub
}

// expandToWordBoundaries adjusts the byte range [start, end) of s so that it
// neither splits a UTF-8 sequence nor cuts through a word.
//
// Steps, in order:
//  1. start and end are clamped into [0, len(s)] with start <= end.
//  2. start moves back and end moves forward to the nearest rune boundary.
//  3. If start falls inside a word, it moves forward past the rest of that
//     word; if end falls inside a word, it moves back to the start of that
//     word. (Despite the name, this step shrinks the range.)
//  4. If step 3 would leave nothing at all — the whole range lies inside a
//     single word, which is the usual case for text without ASCII separators
//     such as CJK — the rune-aligned range from step 2 is returned instead,
//     so callers still get a non-empty excerpt.
//
// The returned pair always satisfies 0 <= start <= end <= len(s).
func expandToWordBoundaries(s string, start, end int) (int, int) {
	// Clamp so the index expressions below can never go out of range.
	if start < 0 {
		start = 0
	}
	if start > len(s) {
		start = len(s)
	}
	if end > len(s) {
		end = len(s)
	}
	if end < start {
		end = start
	}

	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes.
	for start > 0 && start < len(s) && s[start]&0xC0 == 0x80 {
		start--
	}
	for end < len(s) && s[end]&0xC0 == 0x80 {
		end++
	}
	alignedStart, alignedEnd := start, end

	if start > 0 && start < len(s) && isWordByte(s[start-1]) && isWordByte(s[start]) {
		for start < end && isWordByte(s[start]) {
			start++
		}
	}
	// end > 0 guards the s[end-1] access for an empty range at offset 0.
	if end > 0 && end < len(s) && isWordByte(s[end-1]) && isWordByte(s[end]) {
		for end > start && isWordByte(s[end-1]) {
			end--
		}
	}

	if start >= end && alignedStart < alignedEnd {
		return alignedStart, alignedEnd
	}
	return start, end
}

// isWordByte reports whether b can be part of a word: an ASCII letter or
// digit, or any byte with the high bit set (every byte of a multibyte UTF-8
// sequence is assumed to belong to a word).
func isWordByte(b byte) bool {
	if b&0x80 != 0 {
		return true // assume any multibyte char is part of a word
	}
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}

// levenshtein computes the edit distance between a and b: the minimum number
// of single-rune insertions, deletions and substitutions that turn one into
// the other. Both strings are compared rune by rune, so a multi-byte
// character counts as a single edit.
func levenshtein(a, b string) int {
	src, dst := []rune(a), []rune(b)
	switch {
	case len(src) == 0:
		return len(dst)
	case len(dst) == 0:
		return len(src)
	}

	// row holds one line of the DP table and is updated in place; diag keeps
	// the value from the previous line that the update is about to overwrite.
	row := make([]int, len(dst)+1)
	for col := range row {
		row[col] = col
	}
	for i, sr := range src {
		diag := row[0]
		row[0] = i + 1
		for j, dr := range dst {
			above := row[j+1]
			substitute := diag
			if sr != dr {
				substitute++
			}
			row[j+1] = min3(above+1, row[j]+1, substitute)
			diag = above
		}
	}
	return row[len(dst)]
}

// min3 returns the smallest of its three arguments.
func min3(a, b, c int) int {
	if b < a {
		a = b
	}
	if c < a {
		a = c
	}
	return a
}