HTML checker is code complete, needs a pass for documentation

2025-11-02 23:08:40 -07:00
parent f6ed77923c
commit 4f9cdde1f2
6 changed files with 885 additions and 56 deletions
@@ -12,8 +12,11 @@ package htmlcheck
 import (
 	"errors"
 	"fmt"
+	"maps"
 	"net/url"
 	"strings"
+	"unicode"
+	"unicode/utf8"

 	"git.erbosoft.com/amy/amsterdam/util"
 	"github.com/bits-and-blooms/bitset"
@@ -35,8 +38,8 @@ type HTMLChecker interface {
 	InternalRefs() ([]string, error)
 }

-var AlreadyFinished = errors.New("the HTML checker has already finished")
-var NotYetFinished = errors.New("the HTML checker has not yet been finished")
+// ErrAlreadyFinished is returned by Append and Finish once the checker has been finished.
+var ErrAlreadyFinished = errors.New("the HTML checker has already finished")
+
+// ErrNotYetFinished is returned by the result accessors (Value, Length, Lines, Counter,
+// ExternalRefs, InternalRefs) when the checker has not been finished yet.
+var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")

 type htmlCheckerBackend interface {
 	getCheckerAttrValue(string) string
@@ -60,6 +63,9 @@ const (
 // htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
 const htmlMarginSlop = 5

+// hyphApos is used to find hyphens and apostrophes.
+const hyphApos = "-'"
+
 type htmlCheckerImpl struct {
 	config             *HTMLCheckerConfig
 	started            bool
@@ -80,6 +86,7 @@ type htmlCheckerImpl struct {
 	tagRewriters       []rewriter
 	parenRewriters     []rewriter
 	outputFilters      []outputFilter
+	rawOutputFilters   []outputFilter
 	contextData        map[string]any
 	externalReferences map[*url.URL]bool
 	internalReferences map[string]bool
@@ -102,6 +109,17 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
 	}
 }

+// copyOutputFilters resolves each filter name in source against outputFilterRegistry and
+// stores the filter at the same index in dest. A name with no registry entry is logged and
+// its slot in dest is left as it was.
+func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
+	for idx, filterName := range source {
+		if filter, found := outputFilterRegistry[filterName]; found {
+			dest[idx] = filter
+			continue
+		}
+		log.Errorf("filter %s is not found", filterName)
+	}
+}
+
 func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
 	config, ok := configsRegistry[configName]
 	if !ok {
@@ -128,6 +146,7 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
 		tagRewriters:       make([]rewriter, len(config.TagRewriters)),
 		parenRewriters:     make([]rewriter, len(config.ParenRewriters)),
 		outputFilters:      make([]outputFilter, len(config.OutputFilters)),
+		rawOutputFilters:   make([]outputFilter, len(config.RawOutputFilters)),
 		contextData:        make(map[string]any),
 		externalReferences: make(map[*url.URL]bool),
 		internalReferences: make(map[string]bool),
@@ -137,15 +156,66 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
 	rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
 	rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
 	rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
-	for i := range config.OutputFilters {
-		f, ok := outputFilterRegistry[config.OutputFilters[i]]
-		if ok {
-			rc.outputFilters[i] = f
-		} else {
-			log.Errorf("filter %s is not found", config.OutputFilters[i])
+	rc.copyOutputFilters(rc.outputFilters, config.OutputFilters)
+	rc.copyOutputFilters(rc.rawOutputFilters, config.RawOutputFilters)
+	return &rc, nil
+}
+
+// getCheckerAttrValue returns the value of a named checker attribute. Only "ANCHORTAIL"
+// is recognized (it maps to the configuration's AnchorTail); any other name yields "".
+func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
+	switch name {
+	case "ANCHORTAIL":
+		return ht.config.AnchorTail
+	default:
+		return ""
+	}
+}
+
+// sendTagMessage reacts to a message sent by a tag: "NOBR" and "/NOBR" raise and lower the
+// no-break nesting count, and "WBR" sets the word-break trigger. Other messages are ignored.
+func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
+	if msg == "NOBR" {
+		ht.noBreakCount++
+	} else if msg == "/NOBR" {
+		ht.noBreakCount--
+	} else if msg == "WBR" {
+		ht.triggerWBR = true
+	}
+}
+
+// getCheckerContextValue returns the context value stored under name, or nil if there is none.
+func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
+	return ht.contextData[name]
+}
+
+// addExternalRef records ref in the set of external references. The set is keyed by
+// pointer, so two distinct *url.URL values for the same address are recorded separately.
+func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
+	ht.externalReferences[ref] = true
+}
+
+// addInternalRef records ref in the set of internal references.
+func (ht *htmlCheckerImpl) addInternalRef(ref string) {
+	ht.internalReferences[ref] = true
+}
+
+// rewriterAttrValue exposes checker attributes to rewriters; it delegates to getCheckerAttrValue.
+func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
+	return ht.getCheckerAttrValue(name)
+}
+
+// rewriterContextValue exposes the context data to rewriters; it returns the value stored
+// under name, or nil if there is none.
+func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
+	return ht.contextData[name]
+}
+
+// emitRune writes a single rune to the output buffer.
+// Parameters:
+//     ch - The rune to be written.
+//     filters - Output filters offered the rune in order; the first one that handles it wins.
+//     countCols - If true (and word-wrapping is enabled), the column counter is advanced by one.
+// If no filter handles the rune, including when filters is empty, the rune is written as is.
+func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
+	handled := false
+	// try each output filter to see what we can do
+	for _, of := range filters {
+		handled = of.tryOutputRune(ht.outputBuffer, ch)
+		if handled {
+			break // found a filter to handle it, done
+		}
+	}
+	if !handled {
+		// nothing claimed the rune (or there are no filters at all): output the raw character
+		ht.outputBuffer.WriteRune(ch)
+	}
+	if countCols {
+		// columns are only tracked when word-wrapping is enabled
+		if ht.config.WordWrap > 0 {
+			ht.columns++
 		}
 	}
-	return &rc
 }

 func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
@@ -154,42 +224,52 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
 	}
 	realCountCols := countCols && (ht.config.WordWrap > 0)
 	if len(filters) == 0 {
+		// if there are no filters, just output the whole thing
 		ht.outputBuffer.WriteString(str)
 		if realCountCols {
-			ht.columns += len(str)
+			ht.columns += utf8.RuneCountInString(str)
 		}
 		return
 	}
 	temp := str
 	for len(temp) > 0 {
+		// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
 		outputLen := len(temp)
+
+		// Now look at each of the output filters to see if we should try outputting a lesser amount
+		// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
 		var stopper outputFilter = nil
 		for _, of := range filters {
+			// find the length of characters that DOESN'T match this filter
 			lnm := of.lengthNoMatch(temp)
 			if lnm >= 0 && lnm < outputLen {
+				// we've found a new stopper - record the length and the filter
 				outputLen = lnm
 				stopper = of
 			}
 			if outputLen <= 0 {
-				break
+				break // nothing left to do here
 			}
 		}
 		if outputLen > 0 {
+			// move over the unaltered characters first
 			ht.outputBuffer.WriteString(temp[:outputLen])
 			if realCountCols {
-				ht.columns += outputLen
+				ht.columns += utf8.RuneCountInString(temp[:outputLen])
 			}
 		}
 		if stopper != nil {
-			tmpch := temp[outputLen]
-			outputLen++
-			if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
-				ht.outputBuffer.WriteByte(tmpch)
+			// one of the output filters stopped us, try invoking it
+			tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
+			outputLen += bsiz
+			if !stopper.tryOutputRune(ht.outputBuffer, tmpch) {
+				ht.outputBuffer.WriteRune(tmpch)
 			}
 			if realCountCols {
 				ht.columns++
 			}
 		}
+		// Chop the string and go around again.
 		if outputLen == len(temp) {
 			temp = ""
 		} else if outputLen > 0 {
@@ -199,7 +279,11 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
 }

+// emitLineBreak writes a CR/LF pair through the raw output filters, resets the column
+// counter when word-wrapping is enabled, and increments the line counter.
 func (ht *htmlCheckerImpl) emitLineBreak() {
-
+	ht.emitString("\r\n", ht.rawOutputFilters, false)
+	if ht.config.WordWrap > 0 {
+		ht.columns = 0
+	}
+	ht.lines++
 }

 func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
@@ -208,17 +292,572 @@ func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
 	}
 }

+// ensureSpaceOnLine emits a line break if fewer than nchars columns remain on the current
+// line. It does nothing when word-wrapping is disabled or a no-break region is active.
+func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) {
+	if ht.config.WordWrap <= 0 || ht.noBreakCount > 0 {
+		return
+	}
+	if ht.config.WordWrap-ht.columns < nchars {
+		// not enough room left on this line
+		ht.emitLineBreak()
+	}
+}
+
+// emitMarkupData outputs rewritten markup: the begin and end markup go through the raw
+// output filters without column counting, the text goes through the normal filters.
+// Nothing is emitted when the markup is flagged for rescanning.
+func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
+	if md.rescan {
+		return
+	}
+	ht.ensureSpaceOnLine(len(md.text))
+	ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
+	ht.emitString(md.text, ht.outputFilters, true)
+	ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
+}
+
+// emitBracketedMarkupData outputs rewritten markup like emitMarkupData, but when the text
+// is non-empty it is surrounded by the prefix and suffix runes (which count as columns).
+// Nothing is emitted when the markup is flagged for rescanning.
+func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
+	if md.rescan {
+		return
+	}
+	hasText := len(md.text) > 0
+	needed := 0
+	if hasText {
+		needed = len(md.text) + 2 // room for the two bracket characters as well
+	}
+	ht.ensureSpaceOnLine(needed)
+	if hasText {
+		ht.emitRune(prefix, ht.outputFilters, true)
+	}
+	ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
+	ht.emitString(md.text, ht.outputFilters, true)
+	ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
+	if hasText {
+		ht.emitRune(suffix, ht.outputFilters, true)
+	}
+}
+
+// doFlushWhitespace emits the whitespace held in the temporary buffer and clears the
+// buffer. With word-wrapping active (and outside no-break regions) the whitespace is
+// truncated to what fits on the line; if none fits, a line break is emitted in its place.
+func (ht *htmlCheckerImpl) doFlushWhitespace() {
+	pending := ht.tempBuffer.Len()
+	if pending == 0 {
+		return
+	}
+	if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
+		// clip the whitespace to the room left on this line
+		pending = min(pending, ht.config.WordWrap-ht.columns)
+		if pending <= 0 {
+			// NONE of the whitespace fits on this line: break the line and drop the whitespace
+			ht.emitLineBreak()
+			pending = 0
+		}
+	}
+	if pending > 0 {
+		ht.emitString(ht.tempBuffer.String()[:pending], ht.outputFilters, true)
+	}
+	ht.tempBuffer.Reset()
+}
+
+// doFlushNewlines turns the run of CR/LF characters held in the temporary buffer into line
+// breaks and leaves the parser in the Whitespace state. A "\n" ends a line (absorbing any
+// "\r" characters just before it); "\r" characters left over at the end of the run count as
+// one more line. When Rewrap is configured, a single line break becomes one space left in
+// the temporary buffer, and any longer run is compressed to exactly two line breaks.
+func (ht *htmlCheckerImpl) doFlushNewlines() {
+	// Measure the number of line breaks we have.
+	lineBreaks, crs := 0, 0
+	// NOTE: range over the byte VALUES; the single-variable form would yield indices.
+	for _, ch := range []byte(ht.tempBuffer.String()) {
+		switch ch {
+		case '\r':
+			crs++
+		case '\n':
+			crs = 0
+			lineBreaks++
+		}
+	}
+	if crs > 0 {
+		lineBreaks++
+	}
+
+	// Adjust the number of line breaks if rewrap is in effect.
+	if ht.config.Rewrap {
+		if lineBreaks < 2 {
+			// convert a single line break to whitespace
+			ht.tempBuffer.Reset()
+			ht.tempBuffer.WriteByte(' ')
+			ht.state = stateWhitespace
+			return
+		}
+		lineBreaks = 2 // compress out multiple blank lines
+	}
+
+	for lineBreaks > 0 {
+		ht.emitLineBreak()
+		lineBreaks--
+	}
+	ht.tempBuffer.Reset()
+	ht.state = stateWhitespace
+}
+
+// emitFromStartOfTempBuffer emits the first nrunes runes of the temporary buffer and
+// removes them from the buffer. With word-wrapping active (and outside no-break regions)
+// the runes are emitted in pieces that fit the remaining width, with a line break each
+// time the line fills up.
+func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
+	if nrunes > 0 {
+		if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
+			for nrunes > 0 {
+				// emit as many runes as still fit on the current line
+				curlen := min(nrunes, ht.config.WordWrap-ht.columns)
+				if curlen > 0 {
+					s := ht.tempBuffer.String()
+					bcurlen := util.RunesToBytes(s, curlen) // rune count -> byte offset
+					ht.emitString(s[:bcurlen], ht.outputFilters, true)
+					// keep only the unemitted remainder in the buffer
+					ht.tempBuffer.Reset()
+					ht.tempBuffer.WriteString(s[bcurlen:])
+					nrunes -= curlen
+				}
+				if ht.columns >= ht.config.WordWrap {
+					ht.emitLineBreak()
+				}
+			}
+		} else {
+			// no wrapping: emit the whole prefix in one go
+			s := ht.tempBuffer.String()
+			bnrunes := util.RunesToBytes(s, nrunes)
+			ht.emitString(s[:bnrunes], ht.outputFilters, true)
+			ht.tempBuffer.Reset()
+			ht.tempBuffer.WriteString(s[bnrunes:])
+		}
+	}
+}
+
+// attemptRewrite offers data to each rewriter in order and returns the first non-nil
+// result, or nil if no rewriter produced one.
+func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
+	for _, rw := range rewriters {
+		if result := rw.Rewrite(data, ht); result != nil {
+			return result
+		}
+	}
+	return nil
+}
+
+// doFlushString processes the character data held in the temporary buffer. The whole
+// string is first offered to the string rewriters; if none takes it, the buffer is consumed
+// as alternating runs of word and non-word characters, with each word offered to the word
+// rewriters. It returns true only when a string rewriter asked for a rescan (the rewritten
+// text has then been fed back through parse); otherwise it returns false.
 func (ht *htmlCheckerImpl) doFlushString() bool {
-	return false // TODO
+	md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
+	if md != nil {
+		ht.emitMarkupData(md)
+		ht.tempBuffer.Reset()
+		if md.rescan {
+			ht.parse(md.all())
+			return true
+		}
+		return false
+	}
+
+	first := true
+	for ht.tempBuffer.Len() > 0 {
+		sublen, isWord := util.WordRunLength(ht.tempBuffer.String())
+		if isWord {
+			// we want to check the word, but first we must eliminate leading hyphens and apostrophes
+			hyphCount := 0
+			for _, ch := range ht.tempBuffer.String() {
+				if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) {
+					break
+				}
+				hyphCount++
+			}
+			ht.emitFromStartOfTempBuffer(hyphCount)
+			sublen -= hyphCount
+
+			// now determine how many hyphens/apostrophes there are at the end of the word
+			runeArray := []rune(ht.tempBuffer.String())
+			wordLen := sublen
+			hyphCount = 0
+			for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) {
+				hyphCount++
+				wordLen--
+			}
+
+			if wordLen > 0 {
+				// extract the word and remove it from the start of the buffer
+				word := string(runeArray[:wordLen])
+				lw := len(word) // byte length of the word, used to slice the buffer contents
+				s := ht.tempBuffer.String()
+				ht.tempBuffer.Reset()
+				ht.tempBuffer.WriteString(s[lw:])
+
+				// try to rewrite this word
+				md := ht.attemptRewrite(ht.wordRewriters, word)
+				if md != nil {
+					// emit and/or reparse
+					ht.emitMarkupData(md)
+					if md.rescan {
+						ht.parse(md.all())
+					}
+				} else {
+					// just output the word normally
+					ht.ensureSpaceOnLine(wordLen)
+					ht.emitString(word, ht.outputFilters, true)
+				}
+			}
+
+			// now emit the rest of the hyphens/apostrophes
+			ht.emitFromStartOfTempBuffer(hyphCount)
+
+		} else {
+			// emit this many characters, line-breaking where required
+			totalRunes := utf8.RuneCountInString(ht.tempBuffer.String())
+			if sublen == totalRunes && !first && sublen <= htmlMarginSlop {
+				// This is intended to handle a small run of non-word characters at the end of a string (i.e.
+				// followed by whitespace) that should stay on the same line with its preceding word, to
+				// eliminate "funnies" in punctuation formatting.
+				ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true)
+				ht.tempBuffer.Reset()
+				break
+			}
+
+			// This is kind of the inverse of the above check; if we have a small run of non-word
+			// characters at the START of a word (preceded by whitespace and followed by at least
+			// one word character), then ensure that we can keep that word and its prefixing non-word
+			// characters on the same line (again, avoiding "funnies" in formatting).
+			if sublen < totalRunes && first && sublen <= htmlMarginSlop {
+				fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen)
+				ht.ensureSpaceOnLine(sublen + fwLen)
+			}
+			ht.emitFromStartOfTempBuffer(sublen)
+		}
+		first = false
+	}
+	return false
+}
+
+// handleAsHTML tries to interpret the temporary buffer (the text found between '<' and
+// '>') as an HTML tag. It returns true when the tag has been dealt with, either emitted or
+// deliberately discarded, and false when the text is not acceptable as HTML and the caller
+// should try something else.
+func (ht *htmlCheckerImpl) handleAsHTML() bool {
+	ht.triggerWBR = false
+	tempString := ht.tempBuffer.String()
+	// Figure out where the start of the command word is.
+	startCmd := 0
+	closingTag := false
+	if startCmd < len(tempString) && tempString[startCmd] == '/' {
+		startCmd++
+		closingTag = true
+	}
+
+	// now figure out where it ends
+	endCmd := startCmd
+	for endCmd < len(tempString) {
+		if unicode.IsSpace(rune(tempString[endCmd])) {
+			break
+		}
+		endCmd++
+	}
+
+	if endCmd == startCmd || (endCmd-startCmd) > tagMaxLength {
+		// command word is empty or is too long to be an HTML tag
+		return false
+	}
+	possTagName := tempString[startCmd:endCmd]
+	tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)] // lookup is by upper-cased name
+	if !ok {
+		// not a known HTML tag
+		return false
+	}
+	tag := tagIndexToObject[tagIndex]
+	if closingTag && !tag.allowClose {
+		// it's a closing tag and this tag doesn't permit the "close" form
+		return false
+	}
+	tagSetID := tagIndexToSetId[tagIndex]
+	if !ht.tagSet.Test(uint(tagSetID)) {
+		// the tag is not allowed - discard it, if one of the flags is set in the config
+		return ht.config.DiscardHTML || ht.config.DiscardRejected
+	}
+	if !ht.config.DiscardHTML && tag.balanceTags {
+		// this tag needs to be balanced - here's where we manipulate the stack
+		var valid bool
+		if closingTag {
+			valid = ht.tagStack.RemoveMostRecent(tag)
+		} else {
+			ht.tagStack.Push(tag)
+			valid = true
+		}
+		if !valid {
+			return false
+		}
+	}
+
+	// Give the tag object one last chance to dictate what we do with the tag.
+	realTagData := tag.rewriteContents(tempString, closingTag, ht)
+	if realTagData == "" || ht.config.DiscardHTML {
+		return true
+	}
+
+	// Emit the tag to the output.
+	ht.emitRune('<', ht.rawOutputFilters, false)
+	ht.emitString(realTagData, ht.rawOutputFilters, false)
+	ht.emitRune('>', ht.rawOutputFilters, false)
+
+	logicalLineBreak := false
+	if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
+		// word break is logical line break, but only within no-break tags
+		logicalLineBreak = true
+	} else {
+		logicalLineBreak = tag.causeLineBreak(closingTag)
+	}
+	if logicalLineBreak {
+		// the tag starts a new line as far as column counting is concerned
+		ht.columns = 0
+	}
+	return true
+}
+
+// containsHTMLComment reports whether the temporary buffer starts with "!--", i.e. whether
+// the tag being collected is the start of an HTML comment.
+func (ht *htmlCheckerImpl) containsHTMLComment() bool {
+	return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
+}
+
+// containsCompleteHTMLComment reports whether the temporary buffer holds a whole HTML
+// comment: at least five bytes, starting with "!--" and ending with "--".
+func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
+	if ht.tempBuffer.Len() < 5 {
+		return false
+	}
+	content := ht.tempBuffer.String()
+	return strings.HasPrefix(content, "!--") && strings.HasSuffix(content, "--")
+}
+
+// containsXMLConstruct reports whether the first word in the temporary buffer (after an
+// optional leading '/') contains a ':' character. finishTag uses this to recognize XML
+// constructs when DiscardXML is configured.
+func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
+	tempString := ht.tempBuffer.String()
+	ptr := 0
+	if len(tempString) > 1 && tempString[0] == '/' {
+		ptr++ // skip the slash of a closing tag
+	}
+	for ptr < len(tempString) {
+		if tempString[ptr] == ':' {
+			return true
+		} else if unicode.IsSpace(rune(tempString[ptr])) {
+			break // end of the first word; anything after it is not examined
+		}
+		ptr++
+	}
+	return false
+}
+
+// finishTag is called when a '>' is seen while collecting a tag. The text between the
+// angle brackets is in the temporary buffer. In order, it is treated as:
+//   - an HTML comment (emitted raw, or dropped when DiscardComments is set);
+//   - a valid HTML tag (see handleAsHTML);
+//   - something a tag rewriter accepts;
+//   - an XML construct to throw away (when DiscardXML is set);
+//   - otherwise rejected, and re-parsed as ordinary character data including the brackets.
+// This may change the parser state and may call parse recursively.
+func (ht *htmlCheckerImpl) finishTag() {
+	if ht.containsHTMLComment() {
+		if !ht.containsCompleteHTMLComment() {
+			// The comment has not been closed yet, so this '>' is part of the comment
+			// text: keep it and continue collecting.
+			ht.tempBuffer.WriteByte('>')
+			return
+		}
+		if !ht.config.DiscardComments {
+			// output the comment in the raw
+			ht.emitRune('<', ht.rawOutputFilters, false)
+			ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
+			ht.emitRune('>', ht.rawOutputFilters, false)
+		}
+		// Clear state and return to parsing. This must happen even when the comment is
+		// discarded, or the parser would stay in the tag state and swallow what follows.
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+	if ht.handleAsHTML() {
+		// this was valid HTML, we're done
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+
+	// try to handle it with a tag rewriter
+	md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String())
+	if md != nil {
+		ht.emitBracketedMarkupData(md, '<', '>')
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		if md.rescan {
+			// feed the rewritten text back through the parser as character data
+			ht.tempBuffer.WriteByte('<')
+			ht.state = stateChars
+			ht.parse(md.all() + ">")
+		}
+		return
+	}
+
+	if ht.config.DiscardXML && ht.containsXMLConstruct() {
+		// this tag is an XML construct, and needs to be discarded
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+
+	// This tag has been rejected! process it normally as character data
+	rejection := ht.tempBuffer.String()
+	ht.tempBuffer.Reset()
+	ht.tempBuffer.WriteByte('<')
+	ht.state = stateChars
+	if len(rejection) > 0 {
+		ht.parse(rejection)
+	}
+	ht.parse(">")
+}
+
+// finishParen is called when the ')' matching the opening '(' is seen. The text between
+// the parentheses is in the temporary buffer. If a paren rewriter accepts it, the result is
+// emitted (or re-parsed when a rescan is requested); otherwise the text is re-parsed as
+// ordinary character data including the parentheses. This may call parse recursively.
+func (ht *htmlCheckerImpl) finishParen() {
+	// Try to handle the element using a paren rewriter
+	md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
+	if md != nil {
+		ht.emitBracketedMarkupData(md, '(', ')')
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		ht.parenLevel = 0
+		if md.rescan {
+			// feed the rewritten text back through the parser as character data
+			ht.tempBuffer.WriteByte('(')
+			ht.state = stateChars
+			ht.parse(md.all() + ")")
+		}
+		return
+	}
+
+	// Tag rejected! Process it normally as character data.
+	rejection := ht.tempBuffer.String()
+	ht.tempBuffer.Reset()
+	ht.tempBuffer.WriteByte('(')
+	ht.state = stateChars
+	ht.parenLevel = 0
+	if len(rejection) > 0 {
+		ht.parse(rejection)
+	}
+	ht.parse(")")
 }

+// parse feeds str through the checker's state machine one byte at a time. Depending on the
+// current state, a byte is appended to the temporary buffer, triggers a flush of the buffer,
+// or switches state. Some bytes are deliberately not consumed (i is left unchanged) so that
+// they are looked at again in the new state. finishTag and finishParen may call parse
+// recursively.
 func (ht *htmlCheckerImpl) parse(str string) {
-
+	i := 0
+	for i < len(str) {
+		ch := str[i]
+		switch ht.state {
+		case stateWhitespace:
+			switch ch {
+			case ' ', '\t': // append space and tab verbatim
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			case '\r', '\n': // flush and go to Newline state
+				ht.doFlushWhitespace()
+				ht.state = stateNewline
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			case '<':
+				ht.doFlushWhitespace()
+				if ht.config.Angles {
+					ht.state = stateLeftAngle
+				} else {
+					// process < as ordinary character
+					ht.state = stateChars
+					ht.tempBuffer.WriteByte(ch)
+				}
+				i++
+			case '(':
+				ht.doFlushWhitespace()
+				if ht.config.Parens {
+					ht.state = stateParen
+				} else {
+					// process ( as ordinary character
+					ht.state = stateChars
+					ht.tempBuffer.WriteByte(ch)
+				}
+				i++
+			case '\\': // backslash processing is tricky - go to Chars state to handle it
+				ht.doFlushWhitespace()
+				ht.state = stateChars
+			default:
+				ht.doFlushWhitespace()
+				ht.state = stateChars
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			}
+		case stateChars:
+			switch ch {
+			case ' ', '\t': // go to Whitespace state
+				ht.doFlushString()
+				ht.state = stateWhitespace
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			case '\r', '\n': // go to Newline state
+				ht.doFlushString()
+				ht.state = stateNewline
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			case '<': // may be a start of tag
+				if ht.config.Angles {
+					ht.doFlushString()
+					ht.state = stateLeftAngle
+				} else {
+					ht.tempBuffer.WriteByte(ch)
+				}
+				i++
+			case '\\':
+				if i < (len(str) - 1) {
+					i++
+					ch = str[i]
+					if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) {
+						// append the escaped character, omitting the backslash
+						ht.tempBuffer.WriteByte(ch)
+						i++
+					} else {
+						// append the backslash and hit the new character
+						ht.tempBuffer.WriteByte('\\')
+					}
+				} else {
+					// just append the backslash normally
+					ht.tempBuffer.WriteByte(ch)
+					i++
+				}
+			default: // just append the next character
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			}
+		case stateLeftAngle:
+			switch ch {
+			case ' ', '\t', '\r', '\n': // output <, go to Whitespace state
+				ht.emitRune('<', ht.outputFilters, true)
+				ht.state = stateWhitespace
+			case '<': // output < and stay in this state
+				ht.emitRune('<', ht.outputFilters, true)
+				i++
+			default:
+				ht.state = stateTag
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			}
+		case stateTag:
+			switch ch {
+			case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
+				ht.finishTag()
+				i++
+			case '\'', '"': // go into "quote string" state inside the tag
+				ht.tempBuffer.WriteByte(ch)
+				ht.state = stateTagQuote
+				ht.quoteChar = ch
+				i++
+			default: // just append the character
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			}
+		case stateParen:
+			switch ch {
+			case '(': // nested parenthesis: keep it and track the depth
+				ht.tempBuffer.WriteByte(ch)
+				ht.parenLevel++
+				i++
+			case ')':
+				if ht.parenLevel == 0 {
+					// this closes the outermost parenthesis
+					ht.finishParen()
+				} else {
+					ht.tempBuffer.WriteByte(ch)
+					ht.parenLevel--
+				}
+				i++
+			default:
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			}
+		case stateTagQuote:
+			// everything is taken verbatim until the matching quote character
+			ht.tempBuffer.WriteByte(ch)
+			if ch == ht.quoteChar {
+				ht.state = stateTag
+			}
+			i++
+		case stateNewline:
+			if ch == '\r' || ch == '\n' {
+				ht.tempBuffer.WriteByte(ch)
+				i++
+			} else {
+				// end of the newline run; the current byte is re-examined in the new state
+				ht.doFlushNewlines()
+			}
+		}
+	}
 }

 func (ht *htmlCheckerImpl) Append(str string) error {
 	if ht.finished {
-		return AlreadyFinished
+		return ErrAlreadyFinished
 	}
 	if !ht.started {
 		ht.started = true
@@ -231,7 +870,7 @@ func (ht *htmlCheckerImpl) Append(str string) error {

 func (ht *htmlCheckerImpl) Finish() error {
 	if ht.finished {
-		return AlreadyFinished
+		return ErrAlreadyFinished
 	}
 	if !ht.started {
 		ht.started = true
@@ -247,7 +886,125 @@ func (ht *htmlCheckerImpl) Finish() error {
 		case stateChars:
 			running = ht.doFlushString() // flush the temporary buffer
 		case stateLeftAngle:
-
+			// just emit a left angle character
+			ht.emitPossibleLineBreak()
+			ht.emitRune('<', ht.outputFilters, true)
+		case stateTag, stateTagQuote:
+			// we won't finish this tag, so it's automagically rejected
+			rejection := ht.tempBuffer.String()
+			ht.tempBuffer.Reset()
+			ht.tempBuffer.WriteByte('<')
+			ht.state = stateChars
+			if len(rejection) > 0 {
+				ht.parse(rejection)
+			}
+			running = true
+		case stateParen:
+			rejection := ht.tempBuffer.String()
+			ht.tempBuffer.Reset()
+			ht.tempBuffer.WriteByte('(')
+			ht.state = stateChars
+			ht.parenLevel = 0
+			if len(rejection) > 0 {
+				ht.parse(rejection)
+			}
+			running = true
 		}
 	}
+
+	// Now close all the HTML tags that were left open.
+	for !ht.tagStack.IsEmpty() {
+		tag, _ := ht.tagStack.Pop()
+		ht.outputBuffer.WriteString(tag.makeClosingTag())
+	}
+
+	ht.lines++
+	ht.finished = true
+	return nil
+}
+
+// Reset returns the checker to its initial state so it can be reused: flags, parser state,
+// counters, both buffers, the stack of open tags and the collected references are all
+// cleared. The context data set through SetContext is left untouched.
+func (ht *htmlCheckerImpl) Reset() {
+	ht.started = false
+	ht.finished = false
+	ht.triggerWBR = false
+	ht.state = stateWhitespace
+	ht.quoteChar = byte(0)
+	ht.columns = 0
+	ht.lines = 0
+	ht.parenLevel = 0
+	ht.noBreakCount = 0
+	// Drop any partially parsed input as well, so a Reset in mid-stream cannot leak text
+	// or open tags into the next run.
+	ht.tempBuffer.Reset()
+	ht.outputBuffer.Reset()
+	for !ht.tagStack.IsEmpty() {
+		ht.tagStack.Pop()
+	}
+	clear(ht.externalReferences)
+	clear(ht.internalReferences)
+	for _, c := range ht.counters {
+		c.Reset()
+	}
+}
+
+// Value returns the checked output text. It fails with ErrNotYetFinished until the checker
+// has been finished.
+func (ht *htmlCheckerImpl) Value() (string, error) {
+	if !ht.finished {
+		return "", ErrNotYetFinished
+	}
+	return ht.outputBuffer.String(), nil
+}
+
+// Length returns the length of the output buffer as reported by its Len method. It fails
+// with ErrNotYetFinished until the checker has been finished.
+func (ht *htmlCheckerImpl) Length() (int, error) {
+	if !ht.finished {
+		return 0, ErrNotYetFinished
+	}
+	return ht.outputBuffer.Len(), nil
+}
+
+// Lines returns the number of lines counted in the output. It fails with ErrNotYetFinished
+// until the checker has been finished.
+func (ht *htmlCheckerImpl) Lines() (int, error) {
+	if !ht.finished {
+		return 0, ErrNotYetFinished
+	}
+	return ht.lines, nil
+}
+
+// Counter returns the count of the named counter, or 0 if there is no counter with that
+// name. It fails with ErrNotYetFinished until the checker has been finished.
+func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
+	if !ht.finished {
+		return 0, ErrNotYetFinished
+	}
+	if counter, found := ht.counters[name]; found {
+		return counter.GetCount(), nil
+	}
+	return 0, nil
+}
+
+// GetContext returns the context value stored under name, or nil if there is none.
+func (ht *htmlCheckerImpl) GetContext(name string) any {
+	return ht.contextData[name]
+}
+
+// SetContext stores value in the context data under name, replacing any previous value.
+func (ht *htmlCheckerImpl) SetContext(name string, value any) {
+	ht.contextData[name] = value
+}
+
+// ExternalRefs returns the external references collected while checking, in no particular
+// order. It fails with ErrNotYetFinished until the checker has been finished.
+func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
+	if !ht.finished {
+		return nil, ErrNotYetFinished
+	}
+	refs := make([]*url.URL, 0, len(ht.externalReferences))
+	for ref := range ht.externalReferences {
+		refs = append(refs, ref)
+	}
+	return refs, nil
+}
+
+// InternalRefs returns the internal references collected while checking, in no particular
+// order. It fails with ErrNotYetFinished until the checker has been finished.
+func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
+	if ht.finished {
+		rc := make([]string, len(ht.internalReferences))
+		p := 0
+		for s := range maps.Keys(ht.internalReferences) {
+			rc[p] = s
+			p++
+		}
+		// hand the collected references back; without this return the function would fall
+		// through and report ErrNotYetFinished even for a finished checker
+		return rc, nil
+	}
+	return nil, ErrNotYetFinished
 }
@@ -17,22 +17,24 @@ import (

 // HTMLCheckerConfig is a configuration that may be used with the HTML Checker.
 type HTMLCheckerConfig struct {
-	Name            string   `yaml:"name"`
-	WordWrap        int      `yaml:"wordWrap"`
-	Rewrap          bool     `yaml:"rewrap"`
-	Angles          bool     `yaml:"angles"`
-	Parens          bool     `yaml:"parens"`
-	DiscardHTML     bool     `yaml:"discardHTML"`
-	DiscardRejected bool     `yaml:"discardRejected"`
-	DiscardComments bool     `yaml:"discardComments"`
-	DiscardXML      bool     `yaml:"discardXML"`
-	OutputFilters   []string `yaml:"outputFilters"`
-	StringRewriters []string `yaml:"stringRewriters"`
-	WordRewriters   []string `yaml:"wordRewriters"`
-	TagRewriters    []string `yaml:"tagRewriters"`
-	ParenRewriters  []string `yaml:"parenRewriters"`
-	TagSet          string   `yaml:"tagSet"`
-	DisallowTags    []string `yaml:"disallowTags"`
+	Name             string   `yaml:"name"`
+	WordWrap         int      `yaml:"wordWrap"`
+	Rewrap           bool     `yaml:"rewrap"`
+	Angles           bool     `yaml:"angles"`
+	Parens           bool     `yaml:"parens"`
+	DiscardHTML      bool     `yaml:"discardHTML"`
+	DiscardRejected  bool     `yaml:"discardRejected"`
+	DiscardComments  bool     `yaml:"discardComments"`
+	DiscardXML       bool     `yaml:"discardXML"`
+	OutputFilters    []string `yaml:"outputFilters"`
+	// RawOutputFilters names the output filters (resolved through outputFilterRegistry)
+	// that the checker applies to markup and line breaks it emits raw.
+	RawOutputFilters []string `yaml:"rawOutputFilters"`
+	StringRewriters  []string `yaml:"stringRewriters"`
+	WordRewriters    []string `yaml:"wordRewriters"`
+	TagRewriters     []string `yaml:"tagRewriters"`
+	ParenRewriters   []string `yaml:"parenRewriters"`
+	TagSet           string   `yaml:"tagSet"`
+	DisallowTags     []string `yaml:"disallowTags"`
+	// AnchorTail is the value of the "ANCHORTAIL" checker attribute; when it is left empty,
+	// init replaces it with defaultAnchorTail.
+	AnchorTail       string   `yaml:"anchorTail"`
 }

 // HTMLCheckerConfigFile represents all the configs as they exist in the file.
@@ -40,6 +42,8 @@ type HTMLCheckerConfigFile struct {
 	Configs []HTMLCheckerConfig `yaml:"configs"`
 }

+// defaultAnchorTail is the AnchorTail value given to configurations that do not set one.
+const defaultAnchorTail = "TARGET=\"Wander\""
+
 //go:embed configs.yaml
 var configData []byte

@@ -55,5 +59,8 @@ func init() {
 	}
 	for i := range cfgdata.Configs {
 		configsRegistry[cfgdata.Configs[i].Name] = &(cfgdata.Configs[i])
+		if cfgdata.Configs[i].AnchorTail == "" {
+			cfgdata.Configs[i].AnchorTail = defaultAnchorTail
+		}
 	}
 }
@@ -13,8 +13,8 @@ import "strings"

 // outputFilter is the interface for an HTML checker output filter.
+// NOTE(review): tryOutputRune receives the strings.Builder by value, so an implementation
+// writes into a copy of the caller's builder (and strings.Builder panics when a copy of a
+// builder that has already been written to is written). This probably needs to be a
+// *strings.Builder here, in the implementations and at the call sites - TODO confirm.
 type outputFilter interface {
-	tryOutputCharacter(strings.Builder, byte) bool
-	matchCharacter(byte) bool
+	tryOutputRune(strings.Builder, rune) bool
+	matchRune(rune) bool
 	lengthNoMatch(string) int
 }

@@ -34,7 +34,7 @@ type htmlEncodingFilter struct{}
 const htmlEscapedChars = "<>&"

 // tryOutputRune outputs a character that needs to be escaped.
-func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bool {
+func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool {
 	switch ch {
 	case '<':
 		buf.WriteString("&lt;")
@@ -49,15 +49,15 @@ func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bo
 }

 // matchRune returns true if this character needs to be escaped.
-func (f *htmlEncodingFilter) matchCharacter(ch byte) bool {
-	return strings.IndexByte(htmlEscapedChars, ch) >= 0
+func (f *htmlEncodingFilter) matchRune(ch rune) bool {
+	return strings.ContainsRune(htmlEscapedChars, ch)
 }

 // lengthNoMatch returns the maximum length of unmatched characters at the start of the string.
 func (f *htmlEncodingFilter) lengthNoMatch(s string) int {
 	rc := len(s)
-	for _, c := range []byte(htmlEscapedChars) {
-		tmp := strings.IndexByte(s, c)
+	for _, c := range htmlEscapedChars {
+		tmp := strings.IndexRune(s, c)
 		if tmp >= 0 && tmp < rc {
 			rc = tmp
 			if rc == 0 {
@@ -26,6 +26,10 @@ type markupData struct {
 	rescan      bool
 }

+// all returns the begin markup, the text and the end markup joined into one string.
+func (md *markupData) all() string {
+	return md.beginMarkup + md.text + md.endMarkup
+}
+
 // rewriterServices is an interface that provides services to rewriters.
 type rewriterServices interface {
 	rewriterAttrValue(string) string
@@ -11,7 +11,7 @@
 package util

 // Stack[T] is a simple generic array-based stack implementation.
+// T must be comparable so that RemoveMostRecent can match elements with ==.
-type Stack[T any] struct {
+type Stack[T comparable] struct {
 	elements []T
 }

@@ -43,8 +43,27 @@ func (stk *Stack[T]) Peek() (T, bool) {
 	return stk.elements[len(stk.elements)-1], true
 }

+// RemoveMostRecent removes the occurrence of data closest to the top of the stack, keeping
+// the other elements in order. It returns true if an element was removed and false if data
+// is not on the stack.
+func (stk *Stack[T]) RemoveMostRecent(data T) bool {
+	// search from the top of the stack downwards
+	for i := len(stk.elements) - 1; i >= 0; i-- {
+		if stk.elements[i] == data {
+			// close the gap by moving the elements above i down one slot
+			stk.elements = append(stk.elements[:i], stk.elements[i+1:]...)
+			return true
+		}
+	}
+	return false
+}
+
 // NewStack creates and returns a new stack.
-func NewStack[T any]() *Stack[T] {
+func NewStack[T comparable]() *Stack[T] {
 	return &Stack[T]{
 		elements: make([]T, 0),
 	}
@@ -14,17 +14,10 @@ import (
 	"regexp"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 )

-var numeric *regexp.Regexp
-
-func init() {
-	re, err := regexp.Compile("^[0-9]+$")
-	if err != nil {
-		panic(err)
-	}
-	numeric = re
-}
+// numeric matches non-empty strings made up of ASCII digits only.
+var numeric = regexp.MustCompile(`^[0-9]+$`)

 /* CapitalizeString changes the first character of the string to a capital.
 * Parameters:
@@ -80,3 +73,52 @@ func SqlEscape(s string, wildcards bool) string {
 func IsNumeric(s string) bool {
 	return numeric.MatchString(s)
 }
+
+/* RunesToBytes converts a count of runes at the start of a string into a count of bytes.
+ * Parameters:
+ *     s - The string to work with.
+ *     runeCount - The number of runes to count from the start of the string.
+ * Returns:
+ *     The number of bytes occupied by the first runeCount runes of s, or len(s) if s
+ *     contains fewer runes than that.
+ */
+func RunesToBytes(s string, runeCount int) int {
+	offset := 0
+	for remaining := runeCount; remaining > 0; remaining-- {
+		if offset >= len(s) {
+			return len(s) // ran out of string before counting all the runes
+		}
+		_, width := utf8.DecodeRuneInString(s[offset:])
+		offset += width
+	}
+	return offset
+}
+
+// IsRuneWord reports whether ch counts as part of a word: any Unicode letter, a hyphen or
+// an apostrophe.
+func IsRuneWord(ch rune) bool {
+	switch ch {
+	case '-', '\'':
+		return true
+	}
+	return unicode.IsLetter(ch)
+}
+
+// WordRunLength measures the run of like characters at the start of s: either all word
+// characters (see IsRuneWord) or all non-word characters, whichever the first rune is.
+// Returns:
+//     The length of the run in runes, and true if it is a run of word characters.
+//     For an empty string the result is 0, false.
+func WordRunLength(s string) (int, bool) {
+	if s == "" {
+		return 0, false // no runes at all, so no run
+	}
+	c1, initLen := utf8.DecodeRuneInString(s)
+	wordChar := IsRuneWord(c1)
+	rlen := 1
+	for _, mch := range s[initLen:] {
+		if IsRuneWord(mch) != wordChar {
+			break
+		}
+		rlen++
+	}
+	return rlen, wordChar
+}
+
+// WordRunLengthAfterPrefix skips the first nrunes runes of s and applies WordRunLength to
+// the rest. If s has fewer than nrunes runes, the rest is the empty string.
+func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) {
+	// Take byte offsets from the range loop itself: an invalid byte decodes to the
+	// replacement rune, whose encoded length differs from the single byte it occupies
+	// in s, so summing utf8.RuneLen could run past the end of the string.
+	ofs := len(s)
+	for i := range s {
+		if nrunes == 0 {
+			ofs = i
+			break
+		}
+		nrunes--
+	}
+	return WordRunLength(s[ofs:])
+}