Files
amsterdam/htmlcheck/checker.go
T

1114 lines
33 KiB
Go

/*
* Amsterdam Web Communities System
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
// The htmlcheck package contains the HTML Checker.
package htmlcheck
import (
"errors"
"fmt"
"net/url"
"strings"
"unicode"
"unicode/utf8"
"git.erbosoft.com/amy/amsterdam/util"
"github.com/bits-and-blooms/bitset"
log "github.com/sirupsen/logrus"
)
/*----------------------------------------------------------------------------
* External definitions
*----------------------------------------------------------------------------
*/
// HTMLChecker is a component that checks HTML and reformats it as needed.
// Text is supplied through Append and parsing is completed with Finish.
type HTMLChecker interface {
	// Append adds an additional string to the checker state.
	Append(string) error
	// Finish finishes parsing the HTML.
	Finish() error
	// Reset clears the checker state.
	Reset()
	// Value returns the checked output.
	Value() (string, error)
	// Length returns the length of the output text.
	Length() (int, error)
	// Lines returns the number of lines in the output.
	Lines() (int, error)
	// Counter returns the value of the named counter.
	Counter(string) (int, error)
	// GetContext gets a context value by name.
	GetContext(string) any
	// SetContext sets a context value by name.
	SetContext(string, any)
	// ExternalRefs returns a list of external references.
	ExternalRefs() ([]*url.URL, error)
	// InternalRefs returns a list of internal references.
	InternalRefs() ([]string, error)
}
// Sentinel errors returned when a checker method is called in the wrong phase.
var (
	// ErrAlreadyFinished is returned if the checker has been finished when it shouldn't be.
	ErrAlreadyFinished = errors.New("the HTML checker has already finished")

	// ErrNotYetFinished is returned if the checker has not been finished when it should be.
	ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
)
/*----------------------------------------------------------------------------
* Internal definitions
*----------------------------------------------------------------------------
*/
// htmlCheckerBackend is the callback surface that subcomponents use to talk
// back to the HTML checker that is driving them.
type htmlCheckerBackend interface {
	// getCheckerAttrValue returns the value of a named checker attribute.
	getCheckerAttrValue(string) string
	// sendTagMessage delivers a message from a tag to the checker.
	sendTagMessage(string)
	// getCheckerContextValue returns a named context value.
	getCheckerContextValue(string) any
	// addExternalRef records an external reference.
	addExternalRef(*url.URL)
	// addInternalRef records an internal reference.
	addInternalRef(string)
}
// States of the parser's state machine, numbered consecutively from zero.
const (
	stateWhitespace = iota // processing whitespace
	stateChars             // processing character data
	stateLeftAngle         // processing a left angle bracket
	stateTag               // processing the contents of a tag
	stateParen             // processing a string in parentheses
	stateTagQuote          // processing a quoted string inside a tag
	stateNewline           // processing newlines
)

const (
	// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
	htmlMarginSlop = 5

	// hyphApos is the set of characters searched for when looking for hyphens and apostrophes.
	hyphApos = "-'"
)
// htmlCheckerImpl is the implementation of the HTML checker. It carries the
// parser state machine, the output under construction, and the rewriters and
// output filters selected by the configuration.
type htmlCheckerImpl struct {
	// Configuration and lifecycle.
	config   *HTMLCheckerConfig // pointer to configuration
	started  bool               // has checker been started?
	finished bool               // has checker been finished?

	// State machine.
	state      int  // current state
	quoteChar  byte // quote character to match in stateTagQuote
	parenLevel int  // parenthesis level in stateParen

	// Output layout bookkeeping.
	columns      int  // current column position - runes, not bytes!
	lines        int  // lines of text
	noBreakCount int  // current NOBR nesting count
	triggerWBR   bool // do we need to trigger a word break?

	// Buffers.
	outputBuffer strings.Builder // output is gathered here
	tempBuffer   strings.Builder // input is gathered here within a state and flushed on transition

	// Tag tracking and rewrite statistics.
	tagStack *util.Stack[*tag]            // keeps track of nested HTML tags
	counters map[string]*countingRewriter // counters for times rewrites have happened

	// Components loaded from the configuration.
	stringRewriters  []rewriter     // loaded string rewriters
	wordRewriters    []rewriter     // loaded word rewriters
	tagRewriters     []rewriter     // loaded tag rewriters
	parenRewriters   []rewriter     // loaded parenthesis rewriters
	outputFilters    []outputFilter // loaded standard output filters
	rawOutputFilters []outputFilter // loaded "raw" output filters

	// Caller-visible data.
	contextData        map[string]any    // holds context values
	externalReferences map[*url.URL]bool // saved external references
	internalReferences map[string]bool   // saved internal references

	tagSet *bitset.BitSet // set of valid tags from configuration
}
/*----------------------------------------------------------------------------
* Construction helpers
*----------------------------------------------------------------------------
*/
// copyRewriters resolves each name in source through rewriterRegistry and
// stores the result at the same index of dest. A rewriter with a non-empty
// name is wrapped in a counting rewriter, which is also recorded in
// ht.counters under that name. A name that is not registered is logged and
// its slot in dest is left untouched.
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
	for idx, name := range source {
		found, registered := rewriterRegistry[name]
		if !registered {
			log.Errorf("rewriter %s is not found", name)
			continue
		}
		if counterName := found.Name(); counterName != "" {
			counting := MakeCountingRewriter(found)
			ht.counters[counterName] = counting
			found = counting
		}
		dest[idx] = found
	}
}
// copyOutputFilters resolves each name in source through outputFilterRegistry
// and stores the result at the same index of dest. A name that is not
// registered is logged and its slot in dest is left untouched.
func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
	for idx, name := range source {
		filter, registered := outputFilterRegistry[name]
		if !registered {
			log.Errorf("filter %s is not found", name)
			continue
		}
		dest[idx] = filter
	}
}
/*----------------------------------------------------------------------------
* The construction function
*----------------------------------------------------------------------------
*/
/* AmNewHTMLChecker creates a new HTML Checker object.
 * The named configuration is looked up in configsRegistry and, when it names
 * a tag set, that set is looked up in tagSetNameToSet. The rewriters and
 * output filters listed in the configuration are then loaded.
 * Parameters:
 *     configName - Name of the configuration to use.
 * Returns:
 *     New HTML checker reference.
 *     Standard Go error status.
 */
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
	config, found := configsRegistry[configName]
	if !found {
		return nil, fmt.Errorf("configuration %s not found", configName)
	}

	// A configuration without a tag set leaves allowedTags nil.
	var allowedTags *bitset.BitSet
	if config.TagSet != "" {
		if allowedTags, found = tagSetNameToSet[config.TagSet]; !found {
			return nil, fmt.Errorf("tag set %s not found", config.TagSet)
		}
	}

	// Flags, counts and buffers start at their zero values.
	checker := &htmlCheckerImpl{
		config:             config,
		state:              stateWhitespace,
		tagStack:           util.NewStack[*tag](),
		counters:           make(map[string]*countingRewriter),
		stringRewriters:    make([]rewriter, len(config.StringRewriters)),
		wordRewriters:      make([]rewriter, len(config.WordRewriters)),
		tagRewriters:       make([]rewriter, len(config.TagRewriters)),
		parenRewriters:     make([]rewriter, len(config.ParenRewriters)),
		outputFilters:      make([]outputFilter, len(config.OutputFilters)),
		rawOutputFilters:   make([]outputFilter, len(config.RawOutputFilters)),
		contextData:        make(map[string]any),
		externalReferences: make(map[*url.URL]bool),
		internalReferences: make(map[string]bool),
		tagSet:             allowedTags,
	}

	checker.copyRewriters(checker.stringRewriters, config.StringRewriters)
	checker.copyRewriters(checker.wordRewriters, config.WordRewriters)
	checker.copyRewriters(checker.tagRewriters, config.TagRewriters)
	checker.copyRewriters(checker.parenRewriters, config.ParenRewriters)
	checker.copyOutputFilters(checker.outputFilters, config.OutputFilters)
	checker.copyOutputFilters(checker.rawOutputFilters, config.RawOutputFilters)
	return checker, nil
}
/*----------------------------------------------------------------------------
* Implementations from htmlCheckerBackend and rewriterServices
*----------------------------------------------------------------------------
*/
// getCheckerAttrValue returns the value of an HTML checker attribute. The only
// attribute recognized is "ANCHORTAIL", which yields the configured anchor
// tail; any other name yields the empty string.
func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
	switch name {
	case "ANCHORTAIL":
		return ht.config.AnchorTail
	default:
		return ""
	}
}
// sendTagMessage lets specific HTML tags send messages that affect the HTML
// checker's state: "NOBR" and "/NOBR" raise and lower the no-break nesting
// count, and "WBR" requests a word break. Any other message is ignored.
func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
	switch msg {
	case "WBR":
		ht.triggerWBR = true
	case "NOBR":
		ht.noBreakCount++
	case "/NOBR":
		ht.noBreakCount--
	}
}
// getCheckerContextValue looks up a context value that was stored on the HTML
// checker; a name that was never stored yields nil.
func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
	value := ht.contextData[name]
	return value
}
// addExternalRef records an external reference found in the text. References
// are kept in a set keyed by the URL pointer itself.
func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
	refs := ht.externalReferences
	refs[ref] = true
}
// addInternalRef records an internal reference found in the text. Recording
// the same reference string more than once has no further effect.
func (ht *htmlCheckerImpl) addInternalRef(ref string) {
	refs := ht.internalReferences
	refs[ref] = true
}
// rewriterAttrValue exposes checker attributes to rewriters. It answers
// exactly as getCheckerAttrValue does.
func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
	value := ht.getCheckerAttrValue(name)
	return value
}
// rewriterContextValue exposes the checker's context values to rewriters; a
// name that was never stored yields nil.
func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
	value := ht.contextData[name]
	return value
}
/*----------------------------------------------------------------------------
* Internal functions forming the meat of the parser
*----------------------------------------------------------------------------
*/
// emitRune emits a rune to the output buffer, respecting the specified output filters.
// Each filter is offered the rune in order; the first one that reports it handled
// the rune wins. If no filter handles it, the rune is written unchanged.
// Empty (nil) filter slots, which copyOutputFilters leaves behind when a configured
// filter name is not registered, are skipped instead of being called.
// Parameters:
// ch - The rune to be written.
// filters - The output filters to be tried.
// countCols - If true (and word wrap is configured), the column count is advanced by one.
func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
	handled := false
	for _, of := range filters {
		if of == nil {
			continue // unresolved filter slot; calling it would panic
		}
		if of.tryOutputRune(&ht.outputBuffer, ch) {
			handled = true
			break // found a filter to handle it, done
		}
	}
	if !handled { // output the raw character
		ht.outputBuffer.WriteRune(ch)
	}
	if countCols && ht.config.WordWrap > 0 {
		ht.columns++
	}
}
// emitString emits an entire string to the output buffer, respecting the specified output filters.
// With no filters the string is written in one piece. Otherwise the string is written in
// chunks: each filter reports how many leading bytes it has no interest in, the shortest
// such prefix is written unchanged, and the rune that follows it is offered to the filter
// that stopped the run (and written unchanged if that filter declines it).
// Empty (nil) filter slots, which copyOutputFilters leaves behind when a configured
// filter name is not registered, are skipped instead of being called.
// Parameters:
// str - The string to be written; an empty string is a no-op.
// filters - The output filters to be applied.
// countCols - If true (and word wrap is configured), the column count is advanced by the
// number of input runes written.
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
	if str == "" {
		return
	}
	realCountCols := countCols && ht.config.WordWrap > 0
	if len(filters) == 0 {
		// if there are no filters, just output the whole thing
		ht.outputBuffer.WriteString(str)
		if realCountCols {
			ht.columns += utf8.RuneCountInString(str)
		}
		return
	}
	temp := str
	for len(temp) > 0 {
		// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
		outputLen := len(temp)
		// Now look at each of the output filters to see if we should try outputting a lesser amount
		// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
		var stopper outputFilter
		for _, of := range filters {
			if of == nil {
				continue // unresolved filter slot; calling it would panic
			}
			// find the length of characters that DOESN'T match this filter
			lnm := of.lengthNoMatch(temp)
			if lnm >= 0 && lnm < outputLen {
				// we've found a new stopper - record the length and the filter
				outputLen = lnm
				stopper = of
			}
			if outputLen <= 0 {
				break // nothing left to do here
			}
		}
		if outputLen > 0 {
			// move over the unaltered characters first
			ht.outputBuffer.WriteString(temp[:outputLen])
			if realCountCols {
				ht.columns += utf8.RuneCountInString(temp[:outputLen])
			}
		}
		if stopper != nil {
			// one of the output filters stopped us, try invoking it
			tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
			outputLen += bsiz
			if !stopper.tryOutputRune(&ht.outputBuffer, tmpch) {
				ht.outputBuffer.WriteRune(tmpch)
			}
			if realCountCols {
				ht.columns++
			}
		}
		// Chop off what was consumed and go around again. outputLen is only ever below
		// len(temp) when a stopper was recorded, and the stopper always consumes one rune.
		temp = temp[outputLen:]
	}
}
// emitLineBreak writes a CR/LF pair to the output through the raw output
// filters and counts one more line. When word wrap is configured the column
// position returns to zero.
func (ht *htmlCheckerImpl) emitLineBreak() {
	const lineEnding = "\r\n"
	ht.emitString(lineEnding, ht.rawOutputFilters, false)
	ht.lines++
	if ht.config.WordWrap > 0 {
		ht.columns = 0
	}
}
// emitPossibleLineBreak emits a line break only if it is warranted: word wrap
// must be configured, no NOBR section may be open, and the current column must
// have reached the wrap width.
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
	wrapWidth := ht.config.WordWrap
	if wrapWidth <= 0 || ht.noBreakCount > 0 {
		return
	}
	if ht.columns >= wrapWidth {
		ht.emitLineBreak()
	}
}
// ensureSpaceOnLine makes sure the current line can still take nrunes more
// runes, emitting a line break first when it cannot. It does nothing when word
// wrap is not configured or while a NOBR section is open.
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nrunes int) {
	wrapWidth := ht.config.WordWrap
	if wrapWidth <= 0 || ht.noBreakCount > 0 {
		return
	}
	// break the line when what is left of it is too short
	if wrapWidth-ht.columns < nrunes {
		ht.emitLineBreak()
	}
}
// emitMarkupData emits the markup data in the specified data structure: the
// begin markup, the text, and the end markup, after making sure the text fits
// on the current line. Only the text counts toward the column position, and
// the markup goes through the raw output filters. Nothing is emitted for
// markup data that is flagged for rescan.
func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
	if md.rescan {
		return
	}
	textRunes := utf8.RuneCountInString(md.text)
	ht.ensureSpaceOnLine(textRunes)
	ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
	ht.emitString(md.text, ht.outputFilters, true)
	ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
}
// emitBracketedMarkupData emits the markup data in the specified data
// structure, with prefix and suffix runes around it. The prefix and suffix are
// only written when the text is non-empty, and they count toward the line
// length. Nothing is emitted for markup data that is flagged for rescan.
func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
	if md.rescan {
		return
	}
	hasText := len(md.text) > 0
	needed := utf8.RuneCountInString(md.text)
	if needed > 0 {
		needed += 2 // room for the prefix and suffix runes
	}
	ht.ensureSpaceOnLine(needed)
	if hasText {
		ht.emitRune(prefix, ht.outputFilters, true)
	}
	ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
	ht.emitString(md.text, ht.outputFilters, true)
	ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
	if hasText {
		ht.emitRune(suffix, ht.outputFilters, true)
	}
}
// doFlushWhitespace flushes out all the whitespace in the temporary buffer and
// empties the buffer. When word wrap applies, only as much whitespace as fits
// on the current line is written; if none of it fits, a line break is written
// instead and the whitespace is dropped.
func (ht *htmlCheckerImpl) doFlushWhitespace() {
	pending := ht.tempBuffer.String()
	if pending == "" {
		return
	}
	keep := len(pending)
	if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
		// clip the run to the space remaining on the line
		if room := ht.config.WordWrap - ht.columns; room < keep {
			keep = room
		}
		if keep <= 0 {
			// NONE of the whitespace would fit on this line...add a line break
			keep = 0
			ht.emitLineBreak()
		}
	}
	if keep > 0 {
		ht.emitString(pending[:keep], ht.outputFilters, true)
	}
	ht.tempBuffer.Reset()
}
// doFlushNewlines flushes all the newlines that are in the temporary buffer and
// returns the parser to the whitespace state. Every LF counts as one line
// break, and CRs that are not followed by a later LF count as one more. When
// rewrap is configured, a single line break becomes a single space left in the
// buffer, and longer runs are compressed to two line breaks.
func (ht *htmlCheckerImpl) doFlushNewlines() {
	// Measure the number of line breaks we have.
	breaks := 0
	danglingCR := false
	for _, b := range []byte(ht.tempBuffer.String()) {
		if b == '\n' {
			breaks++
			danglingCR = false
		} else if b == '\r' {
			danglingCR = true
		}
	}
	if danglingCR {
		breaks++
	}

	ht.state = stateWhitespace
	if ht.config.Rewrap {
		if breaks < 2 {
			// convert a single line break to whitespace
			ht.tempBuffer.Reset()
			ht.tempBuffer.WriteByte(' ')
			return
		}
		breaks = 2 // compress out multiple blank lines
	}

	for ; breaks > 0; breaks-- {
		ht.emitLineBreak()
	}
	ht.tempBuffer.Reset()
}
// emitFromStartOfTempBuffer emits a certain number of runes from the start of
// the temporary buffer and leaves the remainder in the buffer. When word wrap
// applies, the runes are written in pieces that fit the current line, with a
// line break emitted each time the line becomes full.
func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
	if nrunes <= 0 {
		return
	}

	// moveToOutput writes the first count runes of the buffer and keeps the rest.
	moveToOutput := func(count int) {
		buffered := ht.tempBuffer.String()
		cut := util.RunesToBytes(buffered, count)
		ht.emitString(buffered[:cut], ht.outputFilters, true)
		ht.tempBuffer.Reset()
		ht.tempBuffer.WriteString(buffered[cut:])
	}

	if ht.config.WordWrap <= 0 || ht.noBreakCount > 0 {
		moveToOutput(nrunes)
		return
	}
	for remaining := nrunes; remaining > 0; {
		if piece := min(remaining, ht.config.WordWrap-ht.columns); piece > 0 {
			moveToOutput(piece)
			remaining -= piece
		}
		if ht.columns >= ht.config.WordWrap {
			ht.emitLineBreak()
		}
	}
}
// attemptRewrite attempts to apply a list of rewriters on the text, returning the result of
// the first one that matches (i.e. returns non-nil markup data), or nil if none does.
// Empty (nil) rewriter slots, which copyRewriters leaves behind when a configured
// rewriter name is not registered, are skipped instead of being called.
// Parameters:
// rewriters - The rewriters to try, in order.
// data - The text to be offered to each rewriter.
func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
	for _, r := range rewriters {
		if r == nil {
			continue // unresolved rewriter slot; calling it would panic
		}
		if rc := r.Rewrite(data, ht); rc != nil {
			return rc
		}
	}
	return nil
}
// doFlushString attempts to flush a string from the temporary buffer.
//
// The whole buffer is first offered to the string rewriters. If one matches, its markup is
// emitted (emitMarkupData emits nothing for rescan results), the buffer is cleared, and a
// rescan result is fed back through parse.
// Otherwise the buffer is consumed from the front, one run at a time, until it is empty:
// word runs are stripped of leading and trailing hyphens/apostrophes and offered to the word
// rewriters; non-word runs are emitted as-is, with line breaking adjusted so that short runs
// of punctuation stay attached to the neighboring word.
// Returns:
// true only when a string rewriter matched the whole buffer and asked for a rescan;
// false in every other case.
func (ht *htmlCheckerImpl) doFlushString() bool {
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
if md != nil {
ht.emitMarkupData(md)
ht.tempBuffer.Reset()
if md.rescan {
// the rewriter produced replacement text that must itself be parsed
ht.parse(md.all())
return true
}
return false
}
// first is true only while the first run of the buffer is being handled
first := true
for ht.tempBuffer.Len() > 0 {
// NOTE(review): sublen is used below as a count of runes, not bytes - confirm against util.WordRunLength
sublen, isWord := util.WordRunLength(ht.tempBuffer.String())
if isWord {
// we want to check the word, but first we must eliminate leading hyphens and apostrophes
hyphCount := 0
for _, ch := range ht.tempBuffer.String() {
if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) {
break
}
hyphCount++
}
ht.emitFromStartOfTempBuffer(hyphCount)
sublen -= hyphCount
// now determine how many hyphens/apostrophes there are at the end of the word
runeArray := []rune(ht.tempBuffer.String())
wordLen := sublen
hyphCount = 0
for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) {
hyphCount++
wordLen--
}
if wordLen > 0 {
// extract the word and remove it from the start of the buffer
word := string(runeArray[:wordLen])
// lw is the word's length in bytes, which is what slicing the buffer string needs
lw := len(word)
s := ht.tempBuffer.String()
ht.tempBuffer.Reset()
ht.tempBuffer.WriteString(s[lw:])
// try to rewrite this word
md := ht.attemptRewrite(ht.wordRewriters, word)
if md != nil {
// emit and/or reparse
ht.emitMarkupData(md)
if md.rescan {
ht.parse(md.all())
}
} else {
// just output the word normally
ht.ensureSpaceOnLine(wordLen)
ht.emitString(word, ht.outputFilters, true)
}
}
// now emit the rest of the hyphens/apostrophes
ht.emitFromStartOfTempBuffer(hyphCount)
} else {
// emit this many characters, line-breaking where required
totalRunes := utf8.RuneCountInString(ht.tempBuffer.String())
if sublen == totalRunes && !first && sublen <= htmlMarginSlop {
// This is intended to handle a small run of non-word characters at the end of a string (i.e.
// followed by whitespace) that should stay on the same line with its preceding word, to
// eliminate "funnies" in punctuation formatting.
ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true)
ht.tempBuffer.Reset()
break
}
// This is kind of the inverse of the above check; if we have a small run of non-word
// characters at the START of a word (preceded by whitespace and followed by at least
// one word character), then ensure that we can keep that word and its prefixing non-word
// characters on the same line (again, avoiding "funnies" in formatting).
if sublen < totalRunes && first && sublen <= htmlMarginSlop {
fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen)
ht.ensureSpaceOnLine(sublen + fwLen)
}
ht.emitFromStartOfTempBuffer(sublen)
}
first = false
}
return false
}
// handleAsHTML attempts to handle the contents of the tag in the temporary
// buffer as HTML. It returns true when the tag has been dealt with (emitted or
// deliberately discarded) and false when it is not acceptable as an HTML tag
// and the caller should try something else.
func (ht *htmlCheckerImpl) handleAsHTML() bool {
	ht.triggerWBR = false
	contents := ht.tempBuffer.String()

	// A leading slash marks the closing form; the tag name follows it and
	// runs up to the first whitespace byte.
	isClosing := strings.HasPrefix(contents, "/")
	nameStart := 0
	if isClosing {
		nameStart = 1
	}
	nameEnd := nameStart
	for nameEnd < len(contents) && !unicode.IsSpace(rune(contents[nameEnd])) {
		nameEnd++
	}
	if nameLen := nameEnd - nameStart; nameLen == 0 || nameLen > tagMaxLength {
		// the name is empty or is too long to be an HTML tag
		return false
	}

	tagIndex, known := tagNameToIndex[strings.ToUpper(contents[nameStart:nameEnd])]
	if !known {
		return false
	}
	tagObj := tagIndexToObject[tagIndex]
	if isClosing && !tagObj.allowClose {
		// this tag doesn't permit the "close" form
		return false
	}

	if ht.tagSet != nil && !ht.tagSet.Test(uint(tagIndexToSetId[tagIndex])) {
		// the tag is not allowed - it is discarded only if the config says so
		return ht.config.DiscardHTML || ht.config.DiscardRejected
	}

	if !ht.config.DiscardHTML && tagObj.balanceTags {
		// balanced tags are tracked on the stack: push on open, remove on close
		if !isClosing {
			ht.tagStack.Push(tagObj)
		} else if !ht.tagStack.RemoveMostRecent(tagObj) {
			return false
		}
	}

	// Give the tag object one last chance to dictate what we do with the tag.
	rewritten := tagObj.rewriteContents(contents, isClosing, ht)
	if rewritten == "" || ht.config.DiscardHTML {
		return true
	}

	ht.emitRune('<', ht.rawOutputFilters, false)
	ht.emitString(rewritten, ht.rawOutputFilters, false)
	ht.emitRune('>', ht.rawOutputFilters, false)

	// A word break requested inside a no-break section is a "logical line
	// break"; otherwise the tag itself decides.
	if (ht.triggerWBR && !isClosing && ht.noBreakCount > 0) || tagObj.causeLineBreak(isClosing) {
		ht.columns = 0
	}
	return true
}
// containsHTMLComment reports whether the temporary buffer holds (the start
// of) an HTML comment, i.e. whether it begins with "!--".
func (ht *htmlCheckerImpl) containsHTMLComment() bool {
	const commentOpen = "!--"
	return strings.HasPrefix(ht.tempBuffer.String(), commentOpen)
}
// containsCompleteHTMLComment reports whether the temporary buffer holds a
// complete HTML comment: at least five bytes, starting with "!--" and ending
// with "--".
func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
	buffered := ht.tempBuffer.String()
	if len(buffered) < 5 {
		return false
	}
	return strings.HasPrefix(buffered, "!--") && strings.HasSuffix(buffered, "--")
}
// containsXMLConstruct reports whether the temporary buffer holds an XML-style
// namespaced tag: a colon appearing in the tag name, i.e. before the first
// whitespace byte. A leading slash (closing form) is skipped first.
func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
	name := ht.tempBuffer.String()
	if len(name) > 1 && name[0] == '/' {
		name = name[1:]
	}
	for i := 0; i < len(name); i++ {
		switch c := name[i]; {
		case c == ':':
			return true
		case unicode.IsSpace(rune(c)):
			return false
		}
	}
	return false
}
// finishTag processes and outputs the tag in the temporary buffer. It is called by parse
// when a '>' is seen in the tag state; the '>' itself is not in the buffer.
//
// The buffer is handled by the first of these that applies:
//  1. HTML comment: if the comment is not complete yet, the '>' is part of the comment
//     text, so it is appended and the parser stays in the tag state. A complete comment
//     is emitted raw unless the configuration discards comments; either way the buffer
//     is cleared and the parser returns to the whitespace state.
//  2. Valid HTML, as decided by handleAsHTML.
//  3. A tag rewriter match, emitted inside angle brackets or fed back through parse.
//  4. An XML construct, when the configuration discards those.
//  5. Otherwise the tag is rejected and reparsed as ordinary character data.
func (ht *htmlCheckerImpl) finishTag() {
	if ht.containsHTMLComment() {
		if !ht.containsCompleteHTMLComment() {
			// This '>' does not end the comment; keep it as comment text and keep collecting.
			ht.tempBuffer.WriteByte('>')
			return
		}
		if !ht.config.DiscardComments {
			// output the comment in the raw
			ht.emitRune('<', ht.rawOutputFilters, false)
			ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
			ht.emitRune('>', ht.rawOutputFilters, false)
		}
		// The comment is finished whether or not it was emitted: clear state and return
		// to parsing, so that a discarded comment does not swallow the text after it.
		ht.tempBuffer.Reset()
		ht.state = stateWhitespace
		return
	}
	if ht.handleAsHTML() {
		// this was valid HTML, we're done
		ht.tempBuffer.Reset()
		ht.state = stateWhitespace
		return
	}
	// try to handle it with a tag rewriter
	md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String())
	if md != nil {
		ht.emitBracketedMarkupData(md, '<', '>')
		ht.tempBuffer.Reset()
		ht.state = stateWhitespace
		if md.rescan {
			// reparse the rewritten text as character data, brackets included
			ht.tempBuffer.WriteByte('<')
			ht.state = stateChars
			ht.parse(md.all() + ">")
		}
		return
	}
	if ht.config.DiscardXML && ht.containsXMLConstruct() {
		// this tag is an XML construct, and needs to be discarded
		ht.tempBuffer.Reset()
		ht.state = stateWhitespace
		return
	}
	// This tag has been rejected! process it normally as character data
	rejection := ht.tempBuffer.String()
	ht.tempBuffer.Reset()
	ht.tempBuffer.WriteByte('<')
	ht.state = stateChars
	if len(rejection) > 0 {
		ht.parse(rejection)
	}
	ht.parse(">")
}
// finishParen processes and outputs the parenthesized construct in the
// temporary buffer. If a paren rewriter matches, its result is emitted inside
// parentheses (or fed back through parse when it asks for a rescan). Otherwise
// the construct is rejected and reparsed as ordinary character data, with its
// parentheses restored. The parenthesis nesting level is reset either way.
func (ht *htmlCheckerImpl) finishParen() {
	if md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String()); md != nil {
		ht.emitBracketedMarkupData(md, '(', ')')
		ht.tempBuffer.Reset()
		ht.parenLevel = 0
		if !md.rescan {
			ht.state = stateWhitespace
			return
		}
		// reparse the rewritten text as character data, parentheses included
		ht.tempBuffer.WriteByte('(')
		ht.state = stateChars
		ht.parse(md.all() + ")")
		return
	}

	// Rejected! Process it normally as character data.
	rejected := ht.tempBuffer.String()
	ht.tempBuffer.Reset()
	ht.tempBuffer.WriteByte('(')
	ht.parenLevel = 0
	ht.state = stateChars
	if rejected != "" {
		ht.parse(rejected)
	}
	ht.parse(")")
}
// parse handles the meat of parsing an input string; it runs the state machine on the input.
//
// The input is scanned one byte at a time. Each pass through the loop dispatches on ht.state
// and either consumes the byte (i++) or changes state WITHOUT consuming it, so that the same
// byte is examined again under the new state. Bytes are gathered in ht.tempBuffer and flushed
// by the doFlush*/finish* helpers on state transitions.
// All parser state lives on the checker rather than in locals, so parse continues from
// wherever the previous call left off; the helpers rely on this when they call parse
// recursively to reprocess rejected or rewritten text.
// Parameters:
// str - The text to run through the state machine.
func (ht *htmlCheckerImpl) parse(str string) {
i := 0
for i < len(str) {
ch := str[i]
switch ht.state {
case stateWhitespace:
switch ch {
case ' ', '\t': // append space and tab verbatim
ht.tempBuffer.WriteByte(ch)
i++
case '\r', '\n': // flush and go to Newline state
ht.doFlushWhitespace()
ht.state = stateNewline
ht.tempBuffer.WriteByte(ch)
i++
case '<':
ht.doFlushWhitespace()
if ht.config.Angles {
// the bracket itself is not buffered; the LeftAngle state decides what it was
ht.state = stateLeftAngle
} else {
// process < as ordinary character
ht.state = stateChars
ht.tempBuffer.WriteByte(ch)
}
i++
case '(':
ht.doFlushWhitespace()
if ht.config.Parens {
// the opening parenthesis is not buffered; finishParen restores it if needed
ht.state = stateParen
} else {
// process ( as ordinary character)
ht.state = stateChars
ht.tempBuffer.WriteByte(ch)
}
i++
case '\\': // backslash processing is tricky - go to Chars state to handle it
// the backslash is NOT consumed here; it is examined again in the Chars state
ht.doFlushWhitespace()
ht.state = stateChars
default:
ht.doFlushWhitespace()
ht.state = stateChars
ht.tempBuffer.WriteByte(ch)
i++
}
case stateChars:
switch ch {
case ' ', '\t': // go to Whitespace state
ht.doFlushString()
ht.state = stateWhitespace
ht.tempBuffer.WriteByte(ch)
i++
case '\r', '\n': // go to Newline state
ht.doFlushString()
ht.state = stateNewline
ht.tempBuffer.WriteByte(ch)
i++
case '<': // may be a start of tag
if ht.config.Angles {
ht.doFlushString()
ht.state = stateLeftAngle
} else {
ht.tempBuffer.WriteByte(ch)
}
i++
case '\\':
if i < (len(str) - 1) {
// look at the character after the backslash
i++
ch = str[i]
if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) {
// append the escaped character, omitting the backslash
ht.tempBuffer.WriteByte(ch)
i++
} else {
// append the backslash and hit the new character
// (the new character is not consumed; the next pass of the loop handles it)
ht.tempBuffer.WriteByte('\\')
}
} else {
// just append the backslash normally
// (it is the last byte of this input, so there is nothing to look ahead to)
ht.tempBuffer.WriteByte(ch)
i++
}
default: // just append the next character
ht.tempBuffer.WriteByte(ch)
i++
}
case stateLeftAngle:
switch ch {
case ' ', '\t', '\r', '\n': // output <, go to Whitespace state
// the whitespace byte is not consumed; the Whitespace state handles it
ht.emitRune('<', ht.outputFilters, true)
ht.state = stateWhitespace
case '<': // output < and stay in this state
ht.emitRune('<', ht.outputFilters, true)
i++
default: // begin processing tag
ht.state = stateTag
ht.tempBuffer.WriteByte(ch)
i++
}
case stateTag:
switch ch {
case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
ht.finishTag()
i++
case '\'', '"': // go into "quote string" state inside the tag
ht.tempBuffer.WriteByte(ch)
ht.state = stateTagQuote
ht.quoteChar = ch
i++
default: // just append the character
ht.tempBuffer.WriteByte(ch)
i++
}
case stateParen:
switch ch {
case '(': // nest parentheses one level deeper
ht.tempBuffer.WriteByte(ch)
ht.parenLevel++
i++
case ')':
if ht.parenLevel == 0 {
ht.finishParen() // finish paren, changing state and recursively parsing if necessary
} else {
// nest parentheses one LESS level deeper
ht.tempBuffer.WriteByte(ch)
ht.parenLevel--
}
i++
default:
ht.tempBuffer.WriteByte(ch)
i++
}
case stateTagQuote:
// everything is copied into the tag, including a '>', until the matching quote is seen
ht.tempBuffer.WriteByte(ch)
if ch == ht.quoteChar {
ht.state = stateTag
}
i++
case stateNewline:
if ch == '\r' || ch == '\n' {
ht.tempBuffer.WriteByte(ch)
i++
} else {
// doFlushNewlines moves to the Whitespace state; this byte is examined again there
ht.doFlushNewlines()
}
default:
// No branch here advances i, so the loop could not make progress if this call returned.
// NOTE(review): logrus documents Fatalf as exiting the process after logging - confirm.
log.Fatalf("invalid parser state: %d", ht.state)
}
}
}
/*----------------------------------------------------------------------------
* Implementations from the HTMLChecker interface
*----------------------------------------------------------------------------
*/
/* Append adds an additional string to the HTML checker data.
 * The checker is marked as started, and a non-empty string is run through the
 * parser immediately.
 * Parameters:
 *     str - The string to be added and parsed.
 * Returns:
 *     Standard Go error status. ErrAlreadyFinished is returned if the checker
 *     has already been finished.
 */
func (ht *htmlCheckerImpl) Append(str string) error {
	if ht.finished {
		return ErrAlreadyFinished
	}
	ht.started = true
	if len(str) > 0 {
		ht.parse(str)
	}
	return nil
}
/* Finish completes the HTML checker parsing and makes the result available.
 * Whatever the parser still holds is resolved first: pending character data is
 * flushed, a dangling left angle bracket is emitted, and an unfinished tag or
 * parenthesized construct is rejected and reparsed as character data. Trailing
 * whitespace and newlines are discarded. Closing tags are then written for
 * every tag still on the tag stack, and the line count is increased by one.
 * Returns:
 *     Standard Go error status. ErrAlreadyFinished is returned if the checker
 *     has already been finished.
 */
func (ht *htmlCheckerImpl) Finish() error {
	if ht.finished {
		return ErrAlreadyFinished
	}
	ht.started = true

	// rejectPending turns the unfinished construct in the buffer back into
	// character data, restoring the opening delimiter that was not buffered.
	rejectPending := func(opener byte) {
		rejected := ht.tempBuffer.String()
		ht.tempBuffer.Reset()
		ht.tempBuffer.WriteByte(opener)
		ht.state = stateChars
		if rejected != "" {
			ht.parse(rejected)
		}
	}

	// The "end parse" loop: keep going for as long as resolving one state
	// may have left the parser in another one that needs resolving.
	for again := true; again; {
		again = false
		switch ht.state {
		case stateChars:
			again = ht.doFlushString() // flush the temporary buffer
		case stateLeftAngle:
			// just emit a left angle character
			ht.emitPossibleLineBreak()
			ht.emitRune('<', ht.outputFilters, true)
		case stateTag, stateTagQuote:
			// we won't finish this tag, so it's automatically rejected
			rejectPending('<')
			again = true
		case stateParen:
			// we won't finish this, so it's automatically rejected
			ht.parenLevel = 0
			rejectPending('(')
			again = true
		}
	}

	// Now close all the HTML tags that were left open.
	for !ht.tagStack.IsEmpty() {
		open, _ := ht.tagStack.Pop()
		ht.outputBuffer.WriteString(open.makeClosingTag())
	}
	ht.lines++
	ht.finished = true
	return nil
}
// Reset clears the internal state of the HTML Checker so it can be used for new input:
// the lifecycle flags, parser state, line/column bookkeeping, NOBR nesting count, both
// buffers, the tag stack, the recorded references, and all rewrite counters.
// The configuration, the loaded rewriters and filters, and the context values set through
// SetContext are left untouched.
func (ht *htmlCheckerImpl) Reset() {
	ht.started = false
	ht.finished = false
	ht.state = stateWhitespace
	ht.quoteChar = byte(0)
	ht.parenLevel = 0
	ht.columns = 0
	ht.lines = 0
	// An unbalanced NOBR in the previous input must not keep word wrap disabled.
	ht.noBreakCount = 0
	ht.triggerWBR = false
	ht.outputBuffer.Reset()
	ht.tempBuffer.Reset()
	ht.tagStack.Clear()
	for u := range ht.externalReferences {
		delete(ht.externalReferences, u)
	}
	for k := range ht.internalReferences {
		delete(ht.internalReferences, k)
	}
	for _, c := range ht.counters {
		c.Reset()
	}
}
// Value returns the output accumulated by the HTML Checker. It is only
// available once the checker has been finished; before that the result is the
// empty string and ErrNotYetFinished.
func (ht *htmlCheckerImpl) Value() (string, error) {
	if !ht.finished {
		return "", ErrNotYetFinished
	}
	return ht.outputBuffer.String(), nil
}
// Length returns the length in bytes of the HTML Checker result. It is only
// available once the checker has been finished; before that the result is
// zero and ErrNotYetFinished.
func (ht *htmlCheckerImpl) Length() (int, error) {
	if !ht.finished {
		return 0, ErrNotYetFinished
	}
	return ht.outputBuffer.Len(), nil
}
// Lines returns the number of lines of text counted for the HTML Checker
// result. It is only available once the checker has been finished; before
// that the result is zero and ErrNotYetFinished.
func (ht *htmlCheckerImpl) Lines() (int, error) {
	if !ht.finished {
		return 0, ErrNotYetFinished
	}
	return ht.lines, nil
}
// Counter returns the value of a counter maintained by the HTML Checker
// (corresponding to a rewriter). A name with no counter yields zero. It is
// only available once the checker has been finished; before that the result
// is zero and ErrNotYetFinished.
func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
	if !ht.finished {
		return 0, ErrNotYetFinished
	}
	if counter, known := ht.counters[name]; known {
		return counter.GetCount(), nil
	}
	return 0, nil
}
// GetContext returns the HTML checker context value stored under name, or nil
// if nothing is stored under that name.
func (ht *htmlCheckerImpl) GetContext(name string) any {
	value := ht.contextData[name]
	return value
}
// SetContext stores an HTML checker context value under name, replacing any
// value already stored there.
func (ht *htmlCheckerImpl) SetContext(name string, value any) {
	store := ht.contextData
	store[name] = value
}
// ExternalRefs returns the URLs recorded as external references in the parsed
// text, in no particular order. It is only available once the checker has
// been finished; before that the result is nil and ErrNotYetFinished.
func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
	if !ht.finished {
		return nil, ErrNotYetFinished
	}
	refs := make([]*url.URL, 0, len(ht.externalReferences))
	for ref := range ht.externalReferences {
		refs = append(refs, ref)
	}
	return refs, nil
}
// InternalRefs returns a list of internal references in the parsed text, in no particular
// order. It is only available once the checker has been finished; before that the result
// is nil and ErrNotYetFinished.
func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
	if !ht.finished {
		return nil, ErrNotYetFinished
	}
	refs := make([]string, 0, len(ht.internalReferences))
	for ref := range ht.internalReferences {
		refs = append(refs, ref)
	}
	// Hand the collected list back; without this return the finished case fell through to the error.
	return refs, nil
}