diff --git a/htmlcheck/checker.go b/htmlcheck/checker.go
index 885234c..32aa830 100644
--- a/htmlcheck/checker.go
+++ b/htmlcheck/checker.go
@@ -12,8 +12,11 @@ package htmlcheck
import (
"errors"
"fmt"
+ "maps"
"net/url"
"strings"
+ "unicode"
+ "unicode/utf8"
"git.erbosoft.com/amy/amsterdam/util"
"github.com/bits-and-blooms/bitset"
@@ -35,8 +38,8 @@ type HTMLChecker interface {
InternalRefs() ([]string, error)
}
-var AlreadyFinished = errors.New("the HTML checker has already finished")
-var NotYetFinished = errors.New("the HTML checker has not yet been finished")
+var ErrAlreadyFinished = errors.New("the HTML checker has already finished")
+var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
type htmlCheckerBackend interface {
getCheckerAttrValue(string) string
@@ -60,6 +63,9 @@ const (
// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
const htmlMarginSlop = 5
+// hyphApos is used to find hyphens and apostrophes.
+const hyphApos = "-'"
+
type htmlCheckerImpl struct {
config *HTMLCheckerConfig
started bool
@@ -80,6 +86,7 @@ type htmlCheckerImpl struct {
tagRewriters []rewriter
parenRewriters []rewriter
outputFilters []outputFilter
+ rawOutputFilters []outputFilter
contextData map[string]any
externalReferences map[*url.URL]bool
internalReferences map[string]bool
@@ -102,6 +109,17 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
}
}
+func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
+ for i := range source {
+ f, ok := outputFilterRegistry[source[i]]
+ if ok {
+ dest[i] = f
+ } else {
+ log.Errorf("filter %s is not found", source[i])
+ }
+ }
+}
+
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
config, ok := configsRegistry[configName]
if !ok {
@@ -128,6 +146,7 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
tagRewriters: make([]rewriter, len(config.TagRewriters)),
parenRewriters: make([]rewriter, len(config.ParenRewriters)),
outputFilters: make([]outputFilter, len(config.OutputFilters)),
+ rawOutputFilters: make([]outputFilter, len(config.RawOutputFilters)),
contextData: make(map[string]any),
externalReferences: make(map[*url.URL]bool),
internalReferences: make(map[string]bool),
@@ -137,15 +156,66 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
- for i := range config.OutputFilters {
- f, ok := outputFilterRegistry[config.OutputFilters[i]]
- if ok {
- rc.outputFilters[i] = f
- } else {
- log.Errorf("filter %s is not found", config.OutputFilters[i])
+ rc.copyOutputFilters(rc.outputFilters, config.OutputFilters)
+ rc.copyOutputFilters(rc.rawOutputFilters, config.RawOutputFilters)
+ return &rc, nil
+}
+
+func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
+ if name == "ANCHORTAIL" {
+ return ht.config.AnchorTail
+ }
+ return ""
+}
+
+func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
+ switch msg {
+ case "NOBR":
+ ht.noBreakCount++
+ case "/NOBR":
+ ht.noBreakCount--
+ case "WBR":
+ ht.triggerWBR = true
+ }
+}
+
+func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
+ return ht.contextData[name]
+}
+
+func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
+ ht.externalReferences[ref] = true
+}
+
+func (ht *htmlCheckerImpl) addInternalRef(ref string) {
+ ht.internalReferences[ref] = true
+}
+
+func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
+ return ht.getCheckerAttrValue(name)
+}
+
+func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
+ return ht.contextData[name]
+}
+
+// emitRune writes a single rune to the output, letting the first filter that claims it rewrite it.
+func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
+	// Ranging over an empty filter list is a no-op, so an unfiltered rune still reaches the raw write below.
+	handled := false
+	for _, of := range filters {
+		if handled = of.tryOutputRune(ht.outputBuffer, ch); handled {
+			break // found a filter to handle it, done
+		}
+	}
+	if !handled { // output the raw character
+		ht.outputBuffer.WriteRune(ch)
+	}
+	if countCols {
+		if ht.config.WordWrap > 0 {
+			ht.columns++
 		}
 	}
-	return &rc
 }
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
@@ -154,42 +224,52 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
}
realCountCols := countCols && (ht.config.WordWrap > 0)
if len(filters) == 0 {
+ // if there are no filters, just output the whole thing
ht.outputBuffer.WriteString(str)
if realCountCols {
- ht.columns += len(str)
+ ht.columns += utf8.RuneCountInString(str)
}
return
}
temp := str
for len(temp) > 0 {
+ // We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
outputLen := len(temp)
+
+ // Now look at each of the output filters to see if we should try outputting a lesser amount
+ // (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
var stopper outputFilter = nil
for _, of := range filters {
+ // find the length of characters that DOESN'T match this filter
lnm := of.lengthNoMatch(temp)
if lnm >= 0 && lnm < outputLen {
+ // we've found a new stopper - record the length and the filter
outputLen = lnm
stopper = of
}
if outputLen <= 0 {
- break
+ break // nothing left to do here
}
}
if outputLen > 0 {
+ // move over the unaltered characters first
ht.outputBuffer.WriteString(temp[:outputLen])
if realCountCols {
- ht.columns += outputLen
+ ht.columns += utf8.RuneCountInString(temp[:outputLen])
}
}
if stopper != nil {
- tmpch := temp[outputLen]
- outputLen++
- if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
- ht.outputBuffer.WriteByte(tmpch)
+ // one of the output filters stopped us, try invoking it
+ tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
+ outputLen += bsiz
+ if !stopper.tryOutputRune(ht.outputBuffer, tmpch) {
+ ht.outputBuffer.WriteRune(tmpch)
}
if realCountCols {
ht.columns++
}
}
+ // Chop the string and go around again.
if outputLen == len(temp) {
temp = ""
} else if outputLen > 0 {
@@ -199,7 +279,11 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
}
func (ht *htmlCheckerImpl) emitLineBreak() {
-
+ ht.emitString("\r\n", ht.rawOutputFilters, false)
+ if ht.config.WordWrap > 0 {
+ ht.columns = 0
+ }
+ ht.lines++
}
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
@@ -208,17 +292,572 @@ func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
}
}
+func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) {
+ if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
+ // add a line break if needed here
+ remainSpace := ht.config.WordWrap - ht.columns
+ if remainSpace < nchars {
+ ht.emitLineBreak()
+ }
+ }
+}
+
+func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
+ if !md.rescan {
+ ht.ensureSpaceOnLine(len(md.text))
+ ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
+ ht.emitString(md.text, ht.outputFilters, true)
+ ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
+ }
+}
+
+func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
+ if !md.rescan {
+ l := len(md.text)
+ if l > 0 {
+ l += 2
+ }
+ ht.ensureSpaceOnLine(l)
+ if len(md.text) > 0 {
+ ht.emitRune(prefix, ht.outputFilters, true)
+ }
+ ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
+ ht.emitString(md.text, ht.outputFilters, true)
+ ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
+ if len(md.text) > 0 {
+ ht.emitRune(suffix, ht.outputFilters, true)
+ }
+ }
+}
+
+func (ht *htmlCheckerImpl) doFlushWhitespace() {
+ outputLen := ht.tempBuffer.Len()
+ if outputLen > 0 {
+ forceLineBreak := false
+ if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
+ // adjust output if necessary for wordwrapping
+ remainSpace := ht.config.WordWrap - ht.columns
+ if remainSpace < outputLen {
+ outputLen = remainSpace
+ }
+ if outputLen <= 0 {
+ // this means that NONE of the whitespace would fit on this line...add a line break
+ forceLineBreak = true
+ outputLen = 0
+ }
+ }
+ if forceLineBreak {
+ ht.emitLineBreak()
+ }
+ if outputLen > 0 {
+ ht.emitString(ht.tempBuffer.String()[:outputLen], ht.outputFilters, true)
+ }
+ ht.tempBuffer.Reset()
+ }
+}
+
+// doFlushNewlines emits the line breaks accumulated in tempBuffer (a run of CR/LF bytes) and
+// switches to the Whitespace state. With Rewrap enabled, a single break becomes one space.
+func (ht *htmlCheckerImpl) doFlushNewlines() {
+	// Measure the number of line breaks we have.
+	lineBreaks, crs := 0, 0
+	// The blank identifier matters: a one-variable range yields the index, not the byte.
+	for _, ch := range []byte(ht.tempBuffer.String()) {
+		switch ch {
+		case '\r':
+			crs++
+		case '\n':
+			crs = 0
+			lineBreaks++
+		}
+	}
+	if crs > 0 {
+		lineBreaks++ // CRs not followed by an LF count as one more break
+	}
+	// Adjust the number of line breaks if rewrap is in effect.
+	if ht.config.Rewrap {
+		if lineBreaks < 2 {
+			// convert a single line break to whitespace
+			ht.tempBuffer.Reset()
+			ht.tempBuffer.WriteByte(' ')
+			ht.state = stateWhitespace
+			return
+		}
+		lineBreaks = 2 // compress out multiple blank lines
+	}
+
+	for ; lineBreaks > 0; lineBreaks-- {
+		ht.emitLineBreak()
+	}
+	ht.tempBuffer.Reset()
+	ht.state = stateWhitespace
+}
+
+func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
+ if nrunes > 0 {
+ if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
+ for nrunes > 0 {
+ curlen := min(nrunes, ht.config.WordWrap-ht.columns)
+ if curlen > 0 {
+ s := ht.tempBuffer.String()
+ bcurlen := util.RunesToBytes(s, curlen)
+ ht.emitString(s[:bcurlen], ht.outputFilters, true)
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteString(s[bcurlen:])
+ nrunes -= curlen
+ }
+ if ht.columns >= ht.config.WordWrap {
+ ht.emitLineBreak()
+ }
+ }
+ } else {
+ s := ht.tempBuffer.String()
+ bnrunes := util.RunesToBytes(s, nrunes)
+ ht.emitString(s[:bnrunes], ht.outputFilters, true)
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteString(s[bnrunes:])
+ }
+ }
+}
+
+func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
+ for _, r := range rewriters {
+ rc := r.Rewrite(data, ht)
+ if rc != nil {
+ return rc
+ }
+ }
+ return nil
+}
+
func (ht *htmlCheckerImpl) doFlushString() bool {
- return false // TODO
+ md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
+ if md != nil {
+ ht.emitMarkupData(md)
+ ht.tempBuffer.Reset()
+ if md.rescan {
+ ht.parse(md.all())
+ return true
+ }
+ return false
+ }
+
+ first := true
+ for ht.tempBuffer.Len() > 0 {
+ sublen, isWord := util.WordRunLength(ht.tempBuffer.String())
+ if isWord {
+ // we want to check the word, but first we must eliminate leading hyphens and apostrophes
+ hyphCount := 0
+ for _, ch := range ht.tempBuffer.String() {
+ if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) {
+ break
+ }
+ hyphCount++
+ }
+ ht.emitFromStartOfTempBuffer(hyphCount)
+ sublen -= hyphCount
+
+ // now determine how many hyphens/apostrophes there are at the end of the word
+ runeArray := []rune(ht.tempBuffer.String())
+ wordLen := sublen
+ hyphCount = 0
+ for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) {
+ hyphCount++
+ wordLen--
+ }
+
+ if wordLen > 0 {
+ // extract the word and remove it from the start of the buffer
+ word := string(runeArray[:wordLen])
+ lw := len(word)
+ s := ht.tempBuffer.String()
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteString(s[lw:])
+
+ // try to rewrite this word
+ md := ht.attemptRewrite(ht.wordRewriters, word)
+ if md != nil {
+ // emit and/or reparse
+ ht.emitMarkupData(md)
+ if md.rescan {
+ ht.parse(md.all())
+ }
+ } else {
+ // just output the word normally
+ ht.ensureSpaceOnLine(wordLen)
+ ht.emitString(word, ht.outputFilters, true)
+ }
+ }
+
+ // now emit the rest of the hyphens/apostrophes
+ ht.emitFromStartOfTempBuffer(hyphCount)
+
+ } else {
+ // emit this many characters, line-breaking where required
+ totalRunes := utf8.RuneCountInString(ht.tempBuffer.String())
+ if sublen == totalRunes && !first && sublen <= htmlMarginSlop {
+ // This is intended to handle a small run of non-word characters at the end of a string (i.e.
+ // followed by whitespace) that should stay on the same line with its preceding word, to
+ // eliminate "funnies" in punctuation formatting.
+ ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true)
+ ht.tempBuffer.Reset()
+ break
+ }
+
+ // This is kind of the inverse of the above check; if we have a small run of non-word
+ // characters at the START of a word (preceded by whitespace and followed by at least
+ // one word character), then ensure that we can keep that word and its prefixing non-word
+ // characters on the same line (again, avoiding "funnies" in formatting).
+ if sublen < totalRunes && first && sublen <= htmlMarginSlop {
+ fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen)
+ ht.ensureSpaceOnLine(sublen + fwLen)
+ }
+ ht.emitFromStartOfTempBuffer(sublen)
+ }
+ first = false
+ }
+ return false
+}
+
+func (ht *htmlCheckerImpl) handleAsHTML() bool {
+ ht.triggerWBR = false
+ tempString := ht.tempBuffer.String()
+ // Figure out where the start of the command word is.
+ startCmd := 0
+ closingTag := false
+ if startCmd < len(tempString) && tempString[startCmd] == '/' {
+ startCmd++
+ closingTag = true
+ }
+
+ // now figure out where it ends
+ endCmd := startCmd
+ for endCmd < len(tempString) {
+ if unicode.IsSpace(rune(tempString[endCmd])) {
+ break
+ }
+ endCmd++
+ }
+
+ if endCmd == startCmd || (endCmd-startCmd) > tagMaxLength {
+ // command word is empty or is too long to be an HTML tag
+ return false
+ }
+ possTagName := tempString[startCmd:endCmd]
+ tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)]
+ if !ok {
+ // not a known HTML tag
+ return false
+ }
+ tag := tagIndexToObject[tagIndex]
+ if closingTag && !tag.allowClose {
+ // it's a closing tag and this tag doesn't permit the "close" form
+ return false
+ }
+ tagSetID := tagIndexToSetId[tagIndex]
+ if !ht.tagSet.Test(uint(tagSetID)) {
+ // the tag is not allowed - discard it, if one of the flags is set in the config
+ return ht.config.DiscardHTML || ht.config.DiscardRejected
+ }
+ if !ht.config.DiscardHTML && tag.balanceTags {
+ // this tag needs to be balanced - here's where we manipulate the stack
+ var valid bool
+ if closingTag {
+ valid = ht.tagStack.RemoveMostRecent(tag)
+ } else {
+ ht.tagStack.Push(tag)
+ valid = true
+ }
+ if !valid {
+ return false
+ }
+ }
+
+ // Give the tag object one last chance to dictate what we do with the tag.
+ realTagData := tag.rewriteContents(tempString, closingTag, ht)
+ if realTagData == "" || ht.config.DiscardHTML {
+ return true
+ }
+
+ // Emit the tag to the output.
+ ht.emitRune('<', ht.rawOutputFilters, false)
+ ht.emitString(realTagData, ht.rawOutputFilters, false)
+ ht.emitRune('>', ht.rawOutputFilters, false)
+
+ logicalLineBreak := false
+ if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
+ // word break is logical line break, but only within no-break tags
+ logicalLineBreak = true
+ } else {
+ logicalLineBreak = tag.causeLineBreak(closingTag)
+ }
+ if logicalLineBreak {
+ ht.columns = 0
+ }
+ return true
+}
+
+func (ht *htmlCheckerImpl) containsHTMLComment() bool {
+ return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
+}
+
+func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
+ if ht.tempBuffer.Len() >= 5 {
+ s := ht.tempBuffer.String()
+ return strings.HasPrefix(s, "!--") && strings.HasSuffix(s, "--")
+ }
+ return false
+}
+
+func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
+ tempString := ht.tempBuffer.String()
+ ptr := 0
+ if len(tempString) > 1 && tempString[0] == '/' {
+ ptr++
+ }
+ for ptr < len(tempString) {
+ if tempString[ptr] == ':' {
+ return true
+ } else if unicode.IsSpace(rune(tempString[ptr])) {
+ break
+ }
+ ptr++
+ }
+ return false
+}
+
+// finishTag handles the '>' that ends the tag text in tempBuffer, trying it in order as an HTML
+// comment, a known HTML tag, a tag-rewriter match, a discardable XML construct, then plain text.
+func (ht *htmlCheckerImpl) finishTag() {
+	if ht.containsHTMLComment() {
+		if !ht.containsCompleteHTMLComment() {
+			// this '>' is part of the comment body - keep it and wait for the closing "-->"
+			ht.tempBuffer.WriteByte('>')
+			return
+		}
+		if !ht.config.DiscardComments {
+			// output the comment in the raw
+			ht.emitRune('<', ht.rawOutputFilters, false)
+			ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
+			ht.emitRune('>', ht.rawOutputFilters, false)
+		}
+		// clear state and return to parsing, whether the comment was emitted or discarded
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+	if ht.handleAsHTML() {
+		// this was valid HTML, we're done
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+	// try to handle it with a tag rewriter
+	md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String())
+	if md != nil {
+		ht.emitBracketedMarkupData(md, '<', '>')
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		if md.rescan {
+			ht.tempBuffer.WriteByte('<')
+			ht.state = stateChars
+			ht.parse(md.all() + ">")
+		}
+		return
+	}
+	if ht.config.DiscardXML && ht.containsXMLConstruct() {
+		// this tag is an XML construct, and needs to be discarded
+		ht.tempBuffer.Reset()
+		ht.state = stateWhitespace
+		return
+	}
+	// This tag has been rejected! process it normally as character data
+	rejection := ht.tempBuffer.String()
+	ht.tempBuffer.Reset()
+	ht.tempBuffer.WriteByte('<')
+	ht.state = stateChars
+	ht.parse(rejection) // parsing an empty string is a no-op, so no length guard is needed
+	ht.parse(">")
+}
+
+func (ht *htmlCheckerImpl) finishParen() {
+ // Try to handle the element using a paren rewriter
+ md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
+ if md != nil {
+ ht.emitBracketedMarkupData(md, '(', ')')
+ ht.tempBuffer.Reset()
+ ht.state = stateWhitespace
+ ht.parenLevel = 0
+ if md.rescan {
+ ht.tempBuffer.WriteByte('(')
+ ht.state = stateChars
+ ht.parse(md.all() + ")")
+ }
+ return
+ }
+
+ // Tag rejected! Process it normally as character data.
+ rejection := ht.tempBuffer.String()
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteByte('(')
+ ht.state = stateChars
+ ht.parenLevel = 0
+ if len(rejection) > 0 {
+ ht.parse(rejection)
+ }
+ ht.parse(")")
}
func (ht *htmlCheckerImpl) parse(str string) {
-
+ i := 0
+ for i < len(str) {
+ ch := str[i]
+ switch ht.state {
+ case stateWhitespace:
+ switch ch {
+ case ' ', '\t': // append space and tab verbatim
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ case '\r', '\n': // flush and go to Newline state
+ ht.doFlushWhitespace()
+ ht.state = stateNewline
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ case '<':
+ ht.doFlushWhitespace()
+ if ht.config.Angles {
+ ht.state = stateLeftAngle
+ } else {
+ // process < as ordinary character
+ ht.state = stateChars
+ ht.tempBuffer.WriteByte(ch)
+ }
+ i++
+ case '(':
+ ht.doFlushWhitespace()
+ if ht.config.Parens {
+ ht.state = stateParen
+ } else {
+ // process ( as ordinary character)
+ ht.state = stateChars
+ ht.tempBuffer.WriteByte(ch)
+ }
+ i++
+ case '\\': // backslash processing is tricky - go to Chars state to handle it
+ ht.doFlushWhitespace()
+ ht.state = stateChars
+ default:
+ ht.doFlushWhitespace()
+ ht.state = stateChars
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ case stateChars:
+ switch ch {
+ case ' ', '\t': // go to Whitespace state
+ ht.doFlushString()
+ ht.state = stateWhitespace
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ case '\r', '\n': // go to Newline state
+ ht.doFlushString()
+ ht.state = stateNewline
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ case '<': // may be a start of tag
+ if ht.config.Angles {
+ ht.doFlushString()
+ ht.state = stateLeftAngle
+ } else {
+ ht.tempBuffer.WriteByte(ch)
+ }
+ i++
+ case '\\':
+ if i < (len(str) - 1) {
+ i++
+ ch = str[i]
+ if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) {
+ // append the escaped character, omitting the backslash
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ } else {
+ // append the backslash and hit the new character
+ ht.tempBuffer.WriteByte('\\')
+ }
+ } else {
+					// just append the backslash normally
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ default: // just append the next character
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ case stateLeftAngle:
+ switch ch {
+ case ' ', '\t', '\r', '\n': // output <, go to Whitespace state
+ ht.emitRune('<', ht.outputFilters, true)
+ ht.state = stateWhitespace
+ case '<': // output < and stay in this state
+ ht.emitRune('<', ht.outputFilters, true)
+ i++
+ default:
+ ht.state = stateTag
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ case stateTag:
+ switch ch {
+ case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
+ ht.finishTag()
+ i++
+ case '\'', '"': // go into "quote string" state inside the tag
+ ht.tempBuffer.WriteByte(ch)
+ ht.state = stateTagQuote
+ ht.quoteChar = ch
+ i++
+ default: // just append the character
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ case stateParen:
+ switch ch {
+ case '(':
+ ht.tempBuffer.WriteByte(ch)
+ ht.parenLevel++
+ i++
+ case ')':
+ if ht.parenLevel == 0 {
+ ht.finishParen()
+ } else {
+ ht.tempBuffer.WriteByte(ch)
+ ht.parenLevel--
+ }
+ i++
+ default:
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ }
+ case stateTagQuote:
+ ht.tempBuffer.WriteByte(ch)
+ if ch == ht.quoteChar {
+ ht.state = stateTag
+ }
+ i++
+ case stateNewline:
+ if ch == '\r' || ch == '\n' {
+ ht.tempBuffer.WriteByte(ch)
+ i++
+ } else {
+ ht.doFlushNewlines()
+ }
+ }
+ }
}
func (ht *htmlCheckerImpl) Append(str string) error {
if ht.finished {
- return AlreadyFinished
+ return ErrAlreadyFinished
}
if !ht.started {
ht.started = true
@@ -231,7 +870,7 @@ func (ht *htmlCheckerImpl) Append(str string) error {
func (ht *htmlCheckerImpl) Finish() error {
if ht.finished {
- return AlreadyFinished
+ return ErrAlreadyFinished
}
if !ht.started {
ht.started = true
@@ -247,7 +886,125 @@ func (ht *htmlCheckerImpl) Finish() error {
case stateChars:
running = ht.doFlushString() // flush the temporary buffer
case stateLeftAngle:
-
+ // just emit a left angle character
+ ht.emitPossibleLineBreak()
+ ht.emitRune('<', ht.outputFilters, true)
+ case stateTag, stateTagQuote:
+ // we won't finish this tag, so it's automagically rejected
+ rejection := ht.tempBuffer.String()
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteByte('<')
+ ht.state = stateChars
+ if len(rejection) > 0 {
+ ht.parse(rejection)
+ }
+ running = true
+ case stateParen:
+ rejection := ht.tempBuffer.String()
+ ht.tempBuffer.Reset()
+ ht.tempBuffer.WriteByte('(')
+ ht.state = stateChars
+ ht.parenLevel = 0
+ if len(rejection) > 0 {
+ ht.parse(rejection)
+ }
+ running = true
}
}
+
+ // Now close all the HTML tags that were left open.
+ for !ht.tagStack.IsEmpty() {
+ tag, _ := ht.tagStack.Pop()
+ ht.outputBuffer.WriteString(tag.makeClosingTag())
+ }
+
+ ht.lines++
+ ht.finished = true
+ return nil
+}
+
+// Reset returns the checker to its initial state so it can be reused, discarding all output,
+// any partially-parsed input, open tags, collected references, and counter values.
+func (ht *htmlCheckerImpl) Reset() {
+	ht.started = false
+	ht.finished = false
+	ht.triggerWBR = false
+	ht.state = stateWhitespace
+	ht.quoteChar = byte(0)
+	ht.columns, ht.lines, ht.parenLevel, ht.noBreakCount = 0, 0, 0, 0
+	ht.outputBuffer.Reset()
+	ht.tempBuffer.Reset() // drop unflushed input left over from an unfinished run
+	for !ht.tagStack.IsEmpty() {
+		ht.tagStack.Pop() // forget unbalanced tags so a later Finish doesn't close them
+	}
+	clear(ht.externalReferences)
+	clear(ht.internalReferences)
+	for c := range maps.Values(ht.counters) {
+		c.Reset()
+	}
+}
+
+func (ht *htmlCheckerImpl) Value() (string, error) {
+ if ht.finished {
+ return ht.outputBuffer.String(), nil
+ }
+ return "", ErrNotYetFinished
+}
+
+func (ht *htmlCheckerImpl) Length() (int, error) {
+ if ht.finished {
+ return ht.outputBuffer.Len(), nil
+ }
+ return 0, ErrNotYetFinished
+}
+
+func (ht *htmlCheckerImpl) Lines() (int, error) {
+ if ht.finished {
+ return ht.lines, nil
+ }
+ return 0, ErrNotYetFinished
+}
+
+func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
+ if ht.finished {
+ cr, ok := ht.counters[name]
+ if ok {
+ return cr.GetCount(), nil
+ }
+ return 0, nil
+ }
+ return 0, ErrNotYetFinished
+}
+
+func (ht *htmlCheckerImpl) GetContext(name string) any {
+ return ht.contextData[name]
+}
+
+func (ht *htmlCheckerImpl) SetContext(name string, value any) {
+ ht.contextData[name] = value
+}
+
+// ExternalRefs returns the external URLs recorded during checking, in no particular order.
+// It returns ErrNotYetFinished while the checker has not been finished.
+func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
+	if !ht.finished {
+		return nil, ErrNotYetFinished
+	}
+	rc := make([]*url.URL, 0, len(ht.externalReferences))
+	for ref := range ht.externalReferences { // named "ref" so the net/url package isn't shadowed
+		rc = append(rc, ref)
+	}
+	return rc, nil
+}
+
+// InternalRefs returns the recorded internal reference names (unordered) once the checker is finished.
+func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
+	if !ht.finished {
+		return nil, ErrNotYetFinished
+	}
+	rc := make([]string, 0, len(ht.internalReferences))
+	for s := range ht.internalReferences {
+		rc = append(rc, s)
+	}
+	return rc, nil // the success path must hand back the slice, not fall through to the error
 }
diff --git a/htmlcheck/checker_config.go b/htmlcheck/checker_config.go
index 3d72674..310c138 100644
--- a/htmlcheck/checker_config.go
+++ b/htmlcheck/checker_config.go
@@ -17,22 +17,24 @@ import (
// HTMLCheckerConfig is a configuration that may be used with the HTML Checker.
type HTMLCheckerConfig struct {
- Name string `yaml:"name"`
- WordWrap int `yaml:"wordWrap"`
- Rewrap bool `yaml:"rewrap"`
- Angles bool `yaml:"angles"`
- Parens bool `yaml:"parens"`
- DiscardHTML bool `yaml:"discardHTML"`
- DiscardRejected bool `yaml:"discardRejected"`
- DiscardComments bool `yaml:"discardComments"`
- DiscardXML bool `yaml:"discardXML"`
- OutputFilters []string `yaml:"outputFilters"`
- StringRewriters []string `yaml:"stringRewriters"`
- WordRewriters []string `yaml:"wordRewriters"`
- TagRewriters []string `yaml:"tagRewriters"`
- ParenRewriters []string `yaml:"parenRewriters"`
- TagSet string `yaml:"tagSet"`
- DisallowTags []string `yaml:"disallowTags"`
+ Name string `yaml:"name"`
+ WordWrap int `yaml:"wordWrap"`
+ Rewrap bool `yaml:"rewrap"`
+ Angles bool `yaml:"angles"`
+ Parens bool `yaml:"parens"`
+ DiscardHTML bool `yaml:"discardHTML"`
+ DiscardRejected bool `yaml:"discardRejected"`
+ DiscardComments bool `yaml:"discardComments"`
+ DiscardXML bool `yaml:"discardXML"`
+ OutputFilters []string `yaml:"outputFilters"`
+ RawOutputFilters []string `yaml:"rawOutputFilters"`
+ StringRewriters []string `yaml:"stringRewriters"`
+ WordRewriters []string `yaml:"wordRewriters"`
+ TagRewriters []string `yaml:"tagRewriters"`
+ ParenRewriters []string `yaml:"parenRewriters"`
+ TagSet string `yaml:"tagSet"`
+ DisallowTags []string `yaml:"disallowTags"`
+ AnchorTail string `yaml:"anchorTail"`
}
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
@@ -40,6 +42,8 @@ type HTMLCheckerConfigFile struct {
Configs []HTMLCheckerConfig `yaml:"configs"`
}
+const defaultAnchorTail = "TARGET=\"Wander\""
+
//go:embed configs.yaml
var configData []byte
@@ -55,5 +59,8 @@ func init() {
}
for i := range cfgdata.Configs {
configsRegistry[cfgdata.Configs[i].Name] = &(cfgdata.Configs[i])
+ if cfgdata.Configs[i].AnchorTail == "" {
+ cfgdata.Configs[i].AnchorTail = defaultAnchorTail
+ }
}
}
diff --git a/htmlcheck/filter.go b/htmlcheck/filter.go
index 73e5648..96af31f 100644
--- a/htmlcheck/filter.go
+++ b/htmlcheck/filter.go
@@ -13,8 +13,8 @@ import "strings"
// outputFilter is the interface for an HTML checker output filter.
type outputFilter interface {
- tryOutputCharacter(strings.Builder, byte) bool
- matchCharacter(byte) bool
+ tryOutputRune(strings.Builder, rune) bool
+ matchRune(rune) bool
lengthNoMatch(string) int
}
@@ -34,7 +34,7 @@ type htmlEncodingFilter struct{}
const htmlEscapedChars = "<>&"
// tryOutputCharacter outputs a character that needs to be escaped.
-func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bool {
+func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool {
switch ch {
case '<':
buf.WriteString("<")
@@ -49,15 +49,15 @@ func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bo
}
// matchCharacter returns true if this character needs to be escaped.
-func (f *htmlEncodingFilter) matchCharacter(ch byte) bool {
- return strings.IndexByte(htmlEscapedChars, ch) >= 0
+func (f *htmlEncodingFilter) matchRune(ch rune) bool {
+ return strings.ContainsRune(htmlEscapedChars, ch)
}
// lengthNoMatch returns the maximum length of unmatched characters at the start of the string.
func (f *htmlEncodingFilter) lengthNoMatch(s string) int {
rc := len(s)
- for _, c := range []byte(htmlEscapedChars) {
- tmp := strings.IndexByte(s, c)
+ for _, c := range htmlEscapedChars {
+ tmp := strings.IndexRune(s, c)
if tmp >= 0 && tmp < rc {
rc = tmp
if rc == 0 {
diff --git a/htmlcheck/rewriter.go b/htmlcheck/rewriter.go
index c9c1374..3dd7353 100644
--- a/htmlcheck/rewriter.go
+++ b/htmlcheck/rewriter.go
@@ -26,6 +26,10 @@ type markupData struct {
rescan bool
}
+func (md *markupData) all() string {
+ return md.beginMarkup + md.text + md.endMarkup
+}
+
// rewriterServices is an interface that provides services to rewriters.
type rewriterServices interface {
rewriterAttrValue(string) string
diff --git a/util/stack.go b/util/stack.go
index 60eaf3b..d041487 100644
--- a/util/stack.go
+++ b/util/stack.go
@@ -11,7 +11,7 @@
package util
// Stack[T] is a simple generic array-based stack implementation.
-type Stack[T any] struct {
+type Stack[T comparable] struct {
elements []T
}
@@ -43,8 +43,27 @@ func (stk *Stack[T]) Peek() (T, bool) {
return stk.elements[len(stk.elements)-1], true
}
+// RemoveMostRecent deletes the occurrence of data nearest the top of the stack, leaving the
+// relative order of the remaining elements unchanged.
+// Parameters:
+//   data - The value to search for, compared with ==.
+// Returns:
+//   true if an element was removed, false if data was not on the stack
+//   (including when the stack is empty).
+func (stk *Stack[T]) RemoveMostRecent(data T) bool {
+	// Scan from the top down; the index must step on every pass or a miss would spin forever.
+	for i := len(stk.elements) - 1; i >= 0; i-- {
+		if stk.elements[i] == data {
+			// appending the tail onto the prefix shifts it down over slot i, in place
+			stk.elements = append(stk.elements[:i], stk.elements[i+1:]...)
+			return true
+		}
+	}
+	return false
+}
+
// NewStack creates and returns a new stack.
-func NewStack[T any]() *Stack[T] {
+func NewStack[T comparable]() *Stack[T] {
return &Stack[T]{
elements: make([]T, 0),
}
diff --git a/util/util.go b/util/util.go
index 7d50053..14de49b 100644
--- a/util/util.go
+++ b/util/util.go
@@ -14,17 +14,10 @@ import (
"regexp"
"strings"
"unicode"
+ "unicode/utf8"
)
-var numeric *regexp.Regexp
-
-func init() {
- re, err := regexp.Compile("^[0-9]+$")
- if err != nil {
- panic(err)
- }
- numeric = re
-}
+var numeric *regexp.Regexp = regexp.MustCompile(`^[0-9]+$`)
/* CapitalizeString changes the first character of the string to a capital.
* Parameters:
@@ -80,3 +73,52 @@ func SqlEscape(s string, wildcards bool) string {
func IsNumeric(s string) bool {
return numeric.MatchString(s)
}
+
+/* RunesToBytes returns the number of bytes in a string counting the number of runes from the beginning.
+ * Parameters:
+ * s - The string to work with.
+ * runeCount - The number of runes to count from the start of the string.
+ * Returns:
+ * The corresponding number of bytes.
+ */
+func RunesToBytes(s string, runeCount int) int {
+ bp := 0
+ for runeCount > 0 {
+ if bp >= len(s) {
+ return len(s)
+ }
+ _, c := utf8.DecodeRuneInString(s[bp:])
+ bp += c
+ runeCount--
+ }
+ return bp
+}
+
+func IsRuneWord(ch rune) bool {
+ return unicode.IsLetter(ch) || ch == '-' || ch == '\''
+}
+
+// WordRunLength returns the length in runes of the leading all-word or all-non-word run of s (0 if s is empty) and whether it is a word run.
+func WordRunLength(s string) (int, bool) {
+	first, _ := utf8.DecodeRuneInString(s)
+	wordChar, rlen := s != "" && IsRuneWord(first), 0
+	for _, ch := range s {
+		if IsRuneWord(ch) != wordChar {
+			break
+		}
+		rlen++
+	}
+	return rlen, wordChar
+}
+
+/* WordRunLengthAfterPrefix skips the first nrunes runes of s and measures the run that follows.
+ * Parameters:
+ *     s - The string to work with.
+ *     nrunes - The number of leading runes to skip.
+ * Returns:
+ *     The results of WordRunLength on the remainder of the string.
+ */
+func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) {
+	// RunesToBytes steps by each rune's encoded width, so malformed bytes can't push the offset past len(s).
+	return WordRunLength(s[RunesToBytes(s, nrunes):])
+}