1155 lines
34 KiB
Go
1155 lines
34 KiB
Go
/*
|
|
* Amsterdam Web Communities System
|
|
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
*
|
|
* SPDX-License-Identifier: MPL-2.0
|
|
*/
|
|
// Package htmlcheck contains the HTML Checker.
|
|
package htmlcheck
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"net/url"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"git.erbosoft.com/amy/amsterdam/util"
|
|
"github.com/bits-and-blooms/bitset"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* External definitions
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
// HTMLChecker is a component that checks HTML and reformats it as needed.
// Text is fed in through Append, and Finish completes the parse.
type HTMLChecker interface {
	// Append adds an additional string to the checker state.
	Append(string) error

	// Finish finishes parsing the HTML.
	Finish() error

	// Reset clears the checker state.
	Reset()

	// Value returns the resulting value.
	Value() (string, error)

	// Length returns the text length.
	Length() (int, error)

	// Lines returns the number of lines.
	Lines() (int, error)

	// Counter returns the value of a named counter.
	Counter(string) (int, error)

	// GetContext gets a context value by name.
	GetContext(string) any

	// SetContext sets a context value by name.
	SetContext(string, any)

	// ExternalRefs returns a list of external references.
	ExternalRefs() ([]*url.URL, error)

	// InternalRefs returns a list of internal references.
	InternalRefs() ([]string, error)
}
|
|
|
|
// Sentinel errors describing a mismatch between the checker's finished state
// and the operation requested.
var (
	// ErrAlreadyFinished is a common error that's returned if the checker has been finished when it shouldn't be.
	ErrAlreadyFinished = errors.New("the HTML checker has already finished")

	// ErrNotYetFinished is a common error that's returned if the checker has not been finished when it should be.
	ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
)
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* Internal definitions
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
// htmlCheckerBackend is an interface used by subcomponents to communicate back to the HTML checker.
type htmlCheckerBackend interface {
	// getCheckerAttrValue returns the value of a named checker attribute.
	getCheckerAttrValue(string) string

	// sendTagMessage delivers a message from a tag to the checker.
	sendTagMessage(string)

	// getCheckerContextValue returns a named context value held by the checker.
	getCheckerContextValue(string) any

	// addExternalRef records an external reference.
	addExternalRef(*url.URL)

	// addInternalRef records an internal reference.
	addInternalRef(string)
}
|
|
|
|
// State constants for the state machine. The values are consecutive, starting at zero.
const (
	stateWhitespace = iota // processing whitespace
	stateChars             // processing character data
	stateLeftAngle         // processing a left angle bracket
	stateTag               // processing the contents of a tag
	stateParen             // processing a string in parentheses
	stateTagQuote          // processing a quoted string inside a tag
	stateNewline           // processing newlines
)
|
|
|
|
// Tuning constants used while flushing character data.
const (
	// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
	htmlMarginSlop = 5

	// hyphApos is used to find hyphens and apostrophes.
	hyphApos = "-'"
)
|
|
|
|
// htmlCheckerImpl is the implementation of the HTML checker.
|
|
type htmlCheckerImpl struct {
|
|
ctx context.Context // request context, as instances are generally per-request
|
|
config *HTMLCheckerConfig // pointer to configuration
|
|
started bool // has checker been started?
|
|
finished bool // has checker been finished?
|
|
state int // current state
|
|
quoteChar byte // quote character to match in stateTagQuote
|
|
parenLevel int // parenthesis level in stateParen
|
|
columns int // current column position - runes, not bytes!
|
|
lines int // lines of text
|
|
noBreakCount int // current NOBR nesting count
|
|
triggerWBR bool // do we need to trigger a word break?
|
|
outputBuffer strings.Builder // output is gathered here
|
|
tempBuffer strings.Builder // input is gathered here within a state and flushed on transition
|
|
tagStack *util.Stack[*tag] // keeps track of nested HTML tags
|
|
counters map[string]*countingRewriter // counters for times rewrites have happened
|
|
stringRewriters []rewriter // loaded string rewriters
|
|
wordRewriters []rewriter // loaded word rewriters
|
|
tagRewriters []rewriter // loaded tag rewriters
|
|
parenRewriters []rewriter // loaded parenthesis rewriters
|
|
outputFilters []outputFilter // loaded standard output filters
|
|
rawOutputFilters []outputFilter // loaded "raw" output filters
|
|
contextData map[string]any // holds context values
|
|
externalReferences map[*url.URL]bool // saved external references
|
|
internalReferences map[string]bool // saved internal references
|
|
tagSet *bitset.BitSet // set of valid tags from configuration
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* Construction helpers
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
// copyRewriters looks up all rewriters in the source array and builds a target array.
|
|
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
|
for i := range source {
|
|
rw, ok := rewriterRegistry[source[i]]
|
|
if ok {
|
|
if rw.Name() != "" {
|
|
crw := MakeCountingRewriter(rw)
|
|
ht.counters[rw.Name()] = crw
|
|
rw = crw
|
|
}
|
|
dest[i] = rw
|
|
} else {
|
|
log.Errorf("rewriter %s is not found", source[i])
|
|
}
|
|
}
|
|
}
|
|
|
|
// copyOutputFilters looks up all output filters in the source array and builds a target array.
|
|
func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
|
|
for i := range source {
|
|
f, ok := outputFilterRegistry[source[i]]
|
|
if ok {
|
|
dest[i] = f
|
|
} else {
|
|
log.Errorf("filter %s is not found", source[i])
|
|
}
|
|
}
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* The construction function
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
/* AmNewHTMLChecker creates a new HTML Checker object.
|
|
* Parameters:
|
|
* ctx - Standard Go context value.
|
|
* configName - Name of the configuration to use.
|
|
* Returns:
|
|
* New HTML checker reference.
|
|
* Standard Go error status.
|
|
*/
|
|
func AmNewHTMLChecker(ctx context.Context, configName string) (HTMLChecker, error) {
|
|
config, ok := configsRegistry[configName]
|
|
if !ok {
|
|
return nil, fmt.Errorf("configuration %s not found", configName)
|
|
}
|
|
var tset *bitset.BitSet = nil
|
|
if config.TagSet != "" {
|
|
tset, ok = tagSetNameToSet[config.TagSet]
|
|
if !ok {
|
|
return nil, fmt.Errorf("tag set %s not found", config.TagSet)
|
|
}
|
|
}
|
|
rc := htmlCheckerImpl{
|
|
ctx: ctx,
|
|
config: config,
|
|
started: false,
|
|
finished: false,
|
|
state: stateWhitespace,
|
|
parenLevel: 0,
|
|
columns: 0,
|
|
lines: 0,
|
|
noBreakCount: 0,
|
|
triggerWBR: false,
|
|
tagStack: util.NewStack[*tag](),
|
|
counters: make(map[string]*countingRewriter),
|
|
stringRewriters: make([]rewriter, len(config.StringRewriters)),
|
|
wordRewriters: make([]rewriter, len(config.WordRewriters)),
|
|
tagRewriters: make([]rewriter, len(config.TagRewriters)),
|
|
parenRewriters: make([]rewriter, len(config.ParenRewriters)),
|
|
outputFilters: make([]outputFilter, len(config.OutputFilters)),
|
|
rawOutputFilters: make([]outputFilter, len(config.RawOutputFilters)),
|
|
contextData: make(map[string]any),
|
|
externalReferences: make(map[*url.URL]bool),
|
|
internalReferences: make(map[string]bool),
|
|
tagSet: tset,
|
|
}
|
|
rc.copyRewriters(rc.stringRewriters, config.StringRewriters)
|
|
rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
|
|
rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
|
|
rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
|
|
rc.copyOutputFilters(rc.outputFilters, config.OutputFilters)
|
|
rc.copyOutputFilters(rc.rawOutputFilters, config.RawOutputFilters)
|
|
return &rc, nil
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* Implementations from htmlCheckerBackend and rewriterServices
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
// getCheckerAttrValue returns the value of an HTML checker attribute.
|
|
func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
|
|
if name == "ANCHORTAIL" {
|
|
return ht.config.AnchorTail
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// sendTagMessage offers specific HTML tags a way to send messages to affect the HTML checker's state.
|
|
func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
|
|
switch msg {
|
|
case "NOBR":
|
|
ht.noBreakCount++
|
|
case "/NOBR":
|
|
ht.noBreakCount--
|
|
case "WBR":
|
|
ht.triggerWBR = true
|
|
}
|
|
}
|
|
|
|
// getCheckerContextValue returns a context value set on the HTML checker.
|
|
func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
|
|
return ht.contextData[name]
|
|
}
|
|
|
|
// addExternalRef adds an external reference to the checker's logs.
|
|
func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
|
|
ht.externalReferences[ref] = true
|
|
}
|
|
|
|
// addInternalRef adds an internal reference to the checker's logs.
|
|
func (ht *htmlCheckerImpl) addInternalRef(ref string) {
|
|
ht.internalReferences[ref] = true
|
|
}
|
|
|
|
// rewriterAttrValue returns the value of an HTML checker attribute.
|
|
func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
|
|
return ht.getCheckerAttrValue(name)
|
|
}
|
|
|
|
// rewriterContextValue returns a context value set on the HTML checker.
|
|
func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
|
|
return ht.contextData[name]
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* Internal functions forming the meat of the parser
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
// emitRune emits a rune to the output buffer, respecting the specified output filters.
|
|
func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
|
|
handled := false
|
|
if len(filters) > 0 {
|
|
// try each output filter to see what we can do
|
|
for _, of := range filters {
|
|
handled = of.tryOutputRune(&ht.outputBuffer, ch)
|
|
if handled {
|
|
break // found a filter to handle it, done
|
|
}
|
|
}
|
|
}
|
|
if !handled { // output the raw character
|
|
ht.outputBuffer.WriteRune(ch)
|
|
}
|
|
if countCols && ht.config.WordWrap > 0 {
|
|
ht.columns++
|
|
}
|
|
}
|
|
|
|
// emitString emits an entire string to the output buffer, respecting the specified output filters.
|
|
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
|
|
if str != "" {
|
|
realCountCols := countCols && ht.config.WordWrap > 0
|
|
if len(filters) == 0 {
|
|
// if there are no filters, just output the whole thing
|
|
ht.outputBuffer.WriteString(str)
|
|
if realCountCols {
|
|
ht.columns += utf8.RuneCountInString(str)
|
|
}
|
|
} else {
|
|
temp := str
|
|
for len(temp) > 0 {
|
|
// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
|
|
outputLen := len(temp)
|
|
|
|
// Now look at each of the output filters to see if we should try outputting a lesser amount
|
|
// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
|
|
var stopper outputFilter = nil
|
|
for _, of := range filters {
|
|
// find the length of characters that DOESN'T match this filter
|
|
lnm := of.lengthNoMatch(temp)
|
|
if lnm >= 0 && lnm < outputLen {
|
|
// we've found a new stopper - record the length and the filter
|
|
outputLen = lnm
|
|
stopper = of
|
|
}
|
|
if outputLen <= 0 {
|
|
break // nothing left to do here
|
|
}
|
|
}
|
|
if outputLen > 0 {
|
|
// move over the unaltered characters first
|
|
ht.outputBuffer.WriteString(temp[:outputLen])
|
|
if realCountCols {
|
|
ht.columns += utf8.RuneCountInString(temp[:outputLen])
|
|
}
|
|
}
|
|
if stopper != nil {
|
|
// one of the output filters stopped us, try invoking it
|
|
tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
|
|
outputLen += bsiz
|
|
if !stopper.tryOutputRune(&ht.outputBuffer, tmpch) {
|
|
ht.outputBuffer.WriteRune(tmpch)
|
|
}
|
|
if realCountCols {
|
|
ht.columns++
|
|
}
|
|
}
|
|
// Chop the string and go around again.
|
|
if outputLen == len(temp) {
|
|
temp = ""
|
|
} else if outputLen > 0 {
|
|
temp = temp[outputLen:]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// emitLineBreak emits a line break to the output.
|
|
func (ht *htmlCheckerImpl) emitLineBreak() {
|
|
ht.emitString("\r\n", ht.rawOutputFilters, false)
|
|
if ht.config.WordWrap > 0 {
|
|
ht.columns = 0
|
|
}
|
|
ht.lines++
|
|
}
|
|
|
|
// emitPossibleLineBreak emits a line break to the output, if it's warranted.
|
|
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
|
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
|
|
ht.emitLineBreak()
|
|
}
|
|
}
|
|
|
|
// ensureSpaceOnLine makes sure we have enough space on the current line for a certain number of runes, adding a line break if needed.
|
|
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nrunes int) {
|
|
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
|
// add a line break if needed here
|
|
remainSpace := ht.config.WordWrap - ht.columns
|
|
if remainSpace < nrunes {
|
|
ht.emitLineBreak()
|
|
}
|
|
}
|
|
}
|
|
|
|
// emitMarkupData emits the markup data in the specified data structure.
|
|
func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
|
|
if !md.rescan {
|
|
ht.ensureSpaceOnLine(utf8.RuneCountInString(md.text))
|
|
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
|
|
ht.emitString(md.text, ht.outputFilters, true)
|
|
ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
|
|
}
|
|
}
|
|
|
|
// emitBrackedtedMarkupData emits the marketed data in the specified data structure, with prefix and suffix runes.
|
|
func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
|
|
if !md.rescan {
|
|
l := utf8.RuneCountInString(md.text)
|
|
if l > 0 {
|
|
l += 2
|
|
}
|
|
ht.ensureSpaceOnLine(l)
|
|
if len(md.text) > 0 {
|
|
ht.emitRune(prefix, ht.outputFilters, true)
|
|
}
|
|
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
|
|
ht.emitString(md.text, ht.outputFilters, true)
|
|
ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
|
|
if len(md.text) > 0 {
|
|
ht.emitRune(suffix, ht.outputFilters, true)
|
|
}
|
|
}
|
|
}
|
|
|
|
// doFlushWhitespace flushes out all the whitespace in the temporary buffer.
|
|
func (ht *htmlCheckerImpl) doFlushWhitespace() {
|
|
outputLen := ht.tempBuffer.Len()
|
|
if outputLen > 0 {
|
|
forceLineBreak := false
|
|
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
|
// adjust output if necessary for wordwrapping
|
|
remainSpace := ht.config.WordWrap - ht.columns
|
|
if remainSpace < outputLen {
|
|
outputLen = remainSpace
|
|
}
|
|
if outputLen <= 0 {
|
|
// this means that NONE of the whitespace would fit on this line...add a line break
|
|
forceLineBreak = true
|
|
outputLen = 0
|
|
}
|
|
}
|
|
if forceLineBreak {
|
|
ht.emitLineBreak()
|
|
}
|
|
if outputLen > 0 {
|
|
ht.emitString(ht.tempBuffer.String()[:outputLen], ht.outputFilters, true)
|
|
}
|
|
ht.tempBuffer.Reset()
|
|
}
|
|
}
|
|
|
|
// doFlushNewlines flushes all the newlines that are in the temporary buffer.
|
|
func (ht *htmlCheckerImpl) doFlushNewlines() {
|
|
// Measure the number of line breaks we have.
|
|
lineBreaks, crs := 0, 0
|
|
for _, ch := range []byte(ht.tempBuffer.String()) {
|
|
switch ch {
|
|
case '\r':
|
|
crs++
|
|
case '\n':
|
|
crs = 0
|
|
lineBreaks++
|
|
}
|
|
}
|
|
if crs > 0 {
|
|
lineBreaks++
|
|
}
|
|
|
|
// Adjust the number of line breaks if rewrap is in effect.
|
|
if ht.config.Rewrap {
|
|
if lineBreaks < 2 {
|
|
// convert a single line break to whitespace
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteByte(' ')
|
|
ht.state = stateWhitespace
|
|
return
|
|
} else {
|
|
lineBreaks = 2 // compress out multiple blank lines
|
|
}
|
|
}
|
|
|
|
// emit line breaks
|
|
for lineBreaks > 0 {
|
|
ht.emitLineBreak()
|
|
lineBreaks--
|
|
}
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
}
|
|
|
|
// emitFromStartOfTempBuffer emits a certain number of runes from the start of the temporary buffer.
|
|
func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
|
|
if nrunes > 0 {
|
|
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
|
for nrunes > 0 {
|
|
curlen := min(nrunes, ht.config.WordWrap-ht.columns)
|
|
if curlen > 0 {
|
|
s := ht.tempBuffer.String()
|
|
bcurlen := util.RunesToBytes(s, curlen)
|
|
ht.emitString(s[:bcurlen], ht.outputFilters, true)
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteString(s[bcurlen:])
|
|
nrunes -= curlen
|
|
}
|
|
if ht.columns >= ht.config.WordWrap {
|
|
ht.emitLineBreak()
|
|
}
|
|
}
|
|
} else {
|
|
s := ht.tempBuffer.String()
|
|
bnrunes := util.RunesToBytes(s, nrunes)
|
|
ht.emitString(s[:bnrunes], ht.outputFilters, true)
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteString(s[bnrunes:])
|
|
}
|
|
}
|
|
}
|
|
|
|
// attemptRewrite attempts to apply a list of rewriters on the text, returning the first one that matches.
|
|
func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
|
|
for _, r := range rewriters {
|
|
rc := r.Rewrite(ht.ctx, data, ht)
|
|
if rc != nil {
|
|
return rc
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// doFlushString attempts to flush a string from the temporary buffer.
|
|
func (ht *htmlCheckerImpl) doFlushString() (bool, error) {
|
|
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
|
|
if md != nil {
|
|
ht.emitMarkupData(md)
|
|
ht.tempBuffer.Reset()
|
|
if md.rescan {
|
|
err := ht.parse(md.all())
|
|
return true, err
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
first := true
|
|
for ht.tempBuffer.Len() > 0 {
|
|
sublen, isWord := util.WordRunLength(ht.tempBuffer.String())
|
|
if isWord {
|
|
// we want to check the word, but first we must eliminate leading hyphens and apostrophes
|
|
hyphCount := 0
|
|
for _, ch := range ht.tempBuffer.String() {
|
|
if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) {
|
|
break
|
|
}
|
|
hyphCount++
|
|
}
|
|
ht.emitFromStartOfTempBuffer(hyphCount)
|
|
sublen -= hyphCount
|
|
|
|
// now determine how many hyphens/apostrophes there are at the end of the word
|
|
runeArray := []rune(ht.tempBuffer.String())
|
|
wordLen := sublen
|
|
hyphCount = 0
|
|
for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) {
|
|
hyphCount++
|
|
wordLen--
|
|
}
|
|
|
|
if wordLen > 0 {
|
|
// extract the word and remove it from the start of the buffer
|
|
word := string(runeArray[:wordLen])
|
|
lw := len(word)
|
|
s := ht.tempBuffer.String()
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteString(s[lw:])
|
|
|
|
// try to rewrite this word
|
|
md := ht.attemptRewrite(ht.wordRewriters, word)
|
|
if md != nil {
|
|
// emit and/or reparse
|
|
ht.emitMarkupData(md)
|
|
if md.rescan {
|
|
err := ht.parse(md.all())
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
}
|
|
} else {
|
|
// just output the word normally
|
|
ht.ensureSpaceOnLine(wordLen)
|
|
ht.emitString(word, ht.outputFilters, true)
|
|
}
|
|
}
|
|
|
|
// now emit the rest of the hyphens/apostrophes
|
|
ht.emitFromStartOfTempBuffer(hyphCount)
|
|
|
|
} else {
|
|
// emit this many characters, line-breaking where required
|
|
totalRunes := utf8.RuneCountInString(ht.tempBuffer.String())
|
|
if sublen == totalRunes && !first && sublen <= htmlMarginSlop {
|
|
// This is intended to handle a small run of non-word characters at the end of a string (i.e.
|
|
// followed by whitespace) that should stay on the same line with its preceding word, to
|
|
// eliminate "funnies" in punctuation formatting.
|
|
ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true)
|
|
ht.tempBuffer.Reset()
|
|
break
|
|
}
|
|
|
|
// This is kind of the inverse of the above check; if we have a small run of non-word
|
|
// characters at the START of a word (preceded by whitespace and followed by at least
|
|
// one word character), then ensure that we can keep that word and its prefixing non-word
|
|
// characters on the same line (again, avoiding "funnies" in formatting).
|
|
if sublen < totalRunes && first && sublen <= htmlMarginSlop {
|
|
fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen)
|
|
ht.ensureSpaceOnLine(sublen + fwLen)
|
|
}
|
|
ht.emitFromStartOfTempBuffer(sublen)
|
|
}
|
|
first = false
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
// handleAsHTML attempts to handle the contents of the tag in the temporary buffer as HTML.
|
|
func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
|
ht.triggerWBR = false
|
|
tempString := ht.tempBuffer.String()
|
|
|
|
// Figure out where the start of the command word is.
|
|
startCmd := 0
|
|
closingTag := false
|
|
if startCmd < len(tempString) && tempString[startCmd] == '/' {
|
|
startCmd++
|
|
closingTag = true
|
|
}
|
|
|
|
// now figure out where it ends
|
|
endCmd := startCmd
|
|
for endCmd < len(tempString) {
|
|
if unicode.IsSpace(rune(tempString[endCmd])) {
|
|
break
|
|
}
|
|
endCmd++
|
|
}
|
|
|
|
if endCmd == startCmd || (endCmd-startCmd) > tagMaxLength {
|
|
// command word is empty or is too long to be an HTML tag
|
|
return false
|
|
}
|
|
tagIndex, ok := tagNameToIndex[strings.ToUpper(tempString[startCmd:endCmd])]
|
|
if !ok {
|
|
// not a known HTML tag
|
|
return false
|
|
}
|
|
tag := tagIndexToObject[tagIndex]
|
|
if closingTag && !tag.allowClose {
|
|
// it's a closing tag and this tag doesn't permit the "close" form
|
|
return false
|
|
}
|
|
if ht.tagSet != nil {
|
|
tagSetID := tagIndexToSetId[tagIndex]
|
|
if !ht.tagSet.Test(uint(tagSetID)) {
|
|
// the tag is not allowed - discard it, if one of the flags is set in the config
|
|
return ht.config.DiscardHTML || ht.config.DiscardRejected
|
|
}
|
|
}
|
|
if !ht.config.DiscardHTML && tag.balanceTags {
|
|
// this tag needs to be balanced - here's where we manipulate the stack
|
|
var valid bool
|
|
if closingTag {
|
|
valid = ht.tagStack.RemoveMostRecent(tag)
|
|
} else {
|
|
ht.tagStack.Push(tag)
|
|
valid = true
|
|
}
|
|
if !valid {
|
|
return false
|
|
}
|
|
}
|
|
|
|
// Give the tag object one last chance to dictate what we do with the tag.
|
|
realTagData := tag.rewriteContents(tempString, closingTag, ht)
|
|
if realTagData == "" || ht.config.DiscardHTML {
|
|
return true
|
|
}
|
|
|
|
// Emit the tag to the output.
|
|
ht.emitRune('<', ht.rawOutputFilters, false)
|
|
ht.emitString(realTagData, ht.rawOutputFilters, false)
|
|
ht.emitRune('>', ht.rawOutputFilters, false)
|
|
|
|
// Determine whether this tag causes a "logical line break."
|
|
logicalLineBreak := false
|
|
if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
|
|
// word break is logical line break, but only within no-break tags
|
|
logicalLineBreak = true
|
|
} else {
|
|
logicalLineBreak = tag.causeLineBreak(closingTag)
|
|
}
|
|
if logicalLineBreak {
|
|
ht.columns = 0
|
|
}
|
|
return true
|
|
}
|
|
|
|
// containsHTMLComment returns true if the temporary buffer contains (the start of) an HTML comment.
|
|
func (ht *htmlCheckerImpl) containsHTMLComment() bool {
|
|
return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
|
|
}
|
|
|
|
// containsCompleteHTMLComment returns true if the temporary buffer contains a complete HTML comment.
|
|
func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
|
|
if ht.tempBuffer.Len() >= 5 {
|
|
s := ht.tempBuffer.String()
|
|
return strings.HasPrefix(s, "!--") && strings.HasSuffix(s, "--")
|
|
}
|
|
return false
|
|
}
|
|
|
|
// containsXMLConstruct returns true if the temporary buffer contains an XML-style namespaced tag.
|
|
func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
|
|
tempString := ht.tempBuffer.String()
|
|
ptr := 0
|
|
if len(tempString) > 1 && tempString[0] == '/' {
|
|
ptr++
|
|
}
|
|
for ptr < len(tempString) {
|
|
if tempString[ptr] == ':' {
|
|
return true
|
|
} else if unicode.IsSpace(rune(tempString[ptr])) {
|
|
break
|
|
}
|
|
ptr++
|
|
}
|
|
return false
|
|
}
|
|
|
|
// finishTag processes and outputs the tag in the temporary buffer.
|
|
func (ht *htmlCheckerImpl) finishTag() error {
|
|
if ht.containsHTMLComment() {
|
|
if ht.containsCompleteHTMLComment() && !ht.config.DiscardComments {
|
|
// output the comment in the raw
|
|
ht.emitRune('<', ht.rawOutputFilters, false)
|
|
ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
|
|
ht.emitRune('>', ht.rawOutputFilters, false)
|
|
// clear state and return to parsing
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
}
|
|
return nil
|
|
}
|
|
if ht.handleAsHTML() {
|
|
// this was valid HTML, we're done
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
return nil
|
|
}
|
|
|
|
// try to handle it with a tag rewriter
|
|
md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String())
|
|
if md != nil {
|
|
ht.emitBracketedMarkupData(md, '<', '>')
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
var err error = nil
|
|
if md.rescan {
|
|
ht.tempBuffer.WriteByte('<')
|
|
ht.state = stateChars
|
|
err = ht.parse(md.all() + ">")
|
|
}
|
|
return err
|
|
}
|
|
|
|
if ht.config.DiscardXML && ht.containsXMLConstruct() {
|
|
// this tag is an XML construct, and needs to be discarded
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
return nil
|
|
}
|
|
|
|
// This tag has been rejected! process it normally as character data
|
|
rejection := ht.tempBuffer.String()
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteByte('<')
|
|
ht.state = stateChars
|
|
var err error = nil
|
|
if len(rejection) > 0 {
|
|
err = ht.parse(rejection)
|
|
}
|
|
if err == nil {
|
|
err = ht.parse(">")
|
|
}
|
|
return err
|
|
}
|
|
|
|
// finishParen processes and outputs the parenthesized construct in the temporary buffer.
|
|
func (ht *htmlCheckerImpl) finishParen() error {
|
|
// Try to handle the element using a paren rewriter
|
|
md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
|
|
if md != nil {
|
|
ht.emitBracketedMarkupData(md, '(', ')')
|
|
ht.tempBuffer.Reset()
|
|
ht.state = stateWhitespace
|
|
ht.parenLevel = 0
|
|
var err error = nil
|
|
if md.rescan {
|
|
ht.tempBuffer.WriteByte('(')
|
|
ht.state = stateChars
|
|
err = ht.parse(md.all() + ")")
|
|
}
|
|
return err
|
|
}
|
|
|
|
// Tag rejected! Process it normally as character data.
|
|
rejection := ht.tempBuffer.String()
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteByte('(')
|
|
ht.state = stateChars
|
|
ht.parenLevel = 0
|
|
var err error = nil
|
|
if len(rejection) > 0 {
|
|
err = ht.parse(rejection)
|
|
}
|
|
if err == nil {
|
|
err = ht.parse(")")
|
|
}
|
|
return err
|
|
}
|
|
|
|
// parse handles the meat of parsing an input string; it runs the state machine on the input.
|
|
func (ht *htmlCheckerImpl) parse(str string) error {
|
|
i := 0
|
|
for i < len(str) {
|
|
select {
|
|
case <-ht.ctx.Done():
|
|
return ht.ctx.Err()
|
|
default:
|
|
ch := str[i]
|
|
switch ht.state {
|
|
case stateWhitespace:
|
|
switch ch {
|
|
case ' ', '\t': // append space and tab verbatim
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
case '\r', '\n': // flush and go to Newline state
|
|
ht.doFlushWhitespace()
|
|
ht.state = stateNewline
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
case '<':
|
|
ht.doFlushWhitespace()
|
|
if ht.config.Angles {
|
|
ht.state = stateLeftAngle
|
|
} else {
|
|
// process < as ordinary character
|
|
ht.state = stateChars
|
|
ht.tempBuffer.WriteByte(ch)
|
|
}
|
|
i++
|
|
case '(':
|
|
ht.doFlushWhitespace()
|
|
if ht.config.Parens {
|
|
ht.state = stateParen
|
|
} else {
|
|
// process ( as ordinary character)
|
|
ht.state = stateChars
|
|
ht.tempBuffer.WriteByte(ch)
|
|
}
|
|
i++
|
|
case '\\': // backslash processing is tricky - go to Chars state to handle it
|
|
ht.doFlushWhitespace()
|
|
ht.state = stateChars
|
|
default:
|
|
ht.doFlushWhitespace()
|
|
ht.state = stateChars
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
case stateChars:
|
|
switch ch {
|
|
case ' ', '\t': // go to Whitespace state
|
|
_, err := ht.doFlushString()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ht.state = stateWhitespace
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
case '\r', '\n': // go to Newline state
|
|
_, err := ht.doFlushString()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ht.state = stateNewline
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
case '<': // may be a start of tag
|
|
if ht.config.Angles {
|
|
_, err := ht.doFlushString()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ht.state = stateLeftAngle
|
|
} else {
|
|
ht.tempBuffer.WriteByte(ch)
|
|
}
|
|
i++
|
|
case '\\':
|
|
if i < (len(str) - 1) {
|
|
i++
|
|
ch = str[i]
|
|
if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) {
|
|
// append the escaped character, omitting the backslash
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
} else {
|
|
// append the backslash and hit the new character
|
|
ht.tempBuffer.WriteByte('\\')
|
|
}
|
|
} else {
|
|
// just append the backslash normally
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
default: // just append the next character
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
case stateLeftAngle:
|
|
switch ch {
|
|
case ' ', '\t', '\r', '\n': // output <, go to Whitespace state
|
|
ht.emitRune('<', ht.outputFilters, true)
|
|
ht.state = stateWhitespace
|
|
case '<': // output < and stay in this state
|
|
ht.emitRune('<', ht.outputFilters, true)
|
|
i++
|
|
default: // begin processing tag
|
|
ht.state = stateTag
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
case stateTag:
|
|
switch ch {
|
|
case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
|
|
err := ht.finishTag()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
i++
|
|
case '\'', '"': // go into "quote string" state inside the tag
|
|
ht.tempBuffer.WriteByte(ch)
|
|
ht.state = stateTagQuote
|
|
ht.quoteChar = ch
|
|
i++
|
|
default: // just append the character
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
case stateParen:
|
|
switch ch {
|
|
case '(': // nest parentheses one level deeper
|
|
ht.tempBuffer.WriteByte(ch)
|
|
ht.parenLevel++
|
|
i++
|
|
case ')':
|
|
if ht.parenLevel == 0 {
|
|
err := ht.finishParen() // finish paren, changing state and recursively parsing if necessary
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
// nest parentheses one LESS level deeper
|
|
ht.tempBuffer.WriteByte(ch)
|
|
ht.parenLevel--
|
|
}
|
|
i++
|
|
default:
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
}
|
|
case stateTagQuote:
|
|
ht.tempBuffer.WriteByte(ch)
|
|
if ch == ht.quoteChar {
|
|
ht.state = stateTag
|
|
}
|
|
i++
|
|
case stateNewline:
|
|
if ch == '\r' || ch == '\n' {
|
|
ht.tempBuffer.WriteByte(ch)
|
|
i++
|
|
} else {
|
|
ht.doFlushNewlines()
|
|
}
|
|
default:
|
|
log.Fatalf("invalid parser state: %d", ht.state)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
/*----------------------------------------------------------------------------
|
|
* Implementations from the HTMLChecker interface
|
|
*----------------------------------------------------------------------------
|
|
*/
|
|
|
|
/* Append adds an additional string to the HTML checker data.
|
|
* Parameters:
|
|
* str - The string to be added and parsed.
|
|
* Returns:
|
|
* Standard Go error status.
|
|
*/
|
|
func (ht *htmlCheckerImpl) Append(str string) error {
|
|
if ht.finished {
|
|
return ErrAlreadyFinished
|
|
}
|
|
if !ht.started {
|
|
ht.started = true
|
|
}
|
|
var err error = nil
|
|
if str != "" {
|
|
err = ht.parse(str)
|
|
}
|
|
return err
|
|
}
|
|
|
|
/* Finish completes the HTML checker parsing and makes the result available.
|
|
* Returns:
|
|
* Standard Go error status.
|
|
*/
|
|
func (ht *htmlCheckerImpl) Finish() error {
|
|
if ht.finished {
|
|
return ErrAlreadyFinished
|
|
}
|
|
if !ht.started {
|
|
ht.started = true
|
|
}
|
|
// This is the "end parse" loop, in which we resolve any funny state the parser has
|
|
// found itself in and clear out the internal buffers.
|
|
running := true
|
|
for running {
|
|
running = false // make sure we stop unless this is set to true
|
|
var err error = nil
|
|
switch ht.state {
|
|
case stateWhitespace, stateNewline:
|
|
// do nothing - discard whitespace or newlines at end
|
|
case stateChars:
|
|
running, err = ht.doFlushString() // flush the temporary buffer
|
|
case stateLeftAngle:
|
|
// just emit a left angle character
|
|
ht.emitPossibleLineBreak()
|
|
ht.emitRune('<', ht.outputFilters, true)
|
|
case stateTag, stateTagQuote:
|
|
// we won't finish this tag, so it's automagically rejected
|
|
rejection := ht.tempBuffer.String()
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteByte('<')
|
|
ht.state = stateChars
|
|
if len(rejection) > 0 {
|
|
err = ht.parse(rejection)
|
|
}
|
|
running = true
|
|
case stateParen:
|
|
// we won't finish this, so it's automagically rejected
|
|
rejection := ht.tempBuffer.String()
|
|
ht.tempBuffer.Reset()
|
|
ht.tempBuffer.WriteByte('(')
|
|
ht.state = stateChars
|
|
ht.parenLevel = 0
|
|
if len(rejection) > 0 {
|
|
err = ht.parse(rejection)
|
|
}
|
|
running = true
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Now close all the HTML tags that were left open.
|
|
for !ht.tagStack.IsEmpty() {
|
|
tag, _ := ht.tagStack.Pop()
|
|
ht.outputBuffer.WriteString(tag.makeClosingTag())
|
|
}
|
|
|
|
ht.lines++
|
|
ht.finished = true
|
|
return nil
|
|
}
|
|
|
|
// Reset clears the internal state of the HTML Checker.
|
|
func (ht *htmlCheckerImpl) Reset() {
|
|
ht.started = false
|
|
ht.finished = false
|
|
ht.triggerWBR = false
|
|
ht.state = stateWhitespace
|
|
ht.quoteChar = byte(0)
|
|
ht.columns = 0
|
|
ht.lines = 0
|
|
ht.parenLevel = 0
|
|
ht.outputBuffer.Reset()
|
|
ht.tempBuffer.Reset()
|
|
ht.tagStack.Clear()
|
|
clear(ht.externalReferences)
|
|
clear(ht.internalReferences)
|
|
for _, c := range ht.counters {
|
|
c.Reset()
|
|
}
|
|
}
|
|
|
|
// Value returns the value of the output from the HTML Checker.
|
|
func (ht *htmlCheckerImpl) Value() (string, error) {
|
|
if ht.finished {
|
|
return ht.outputBuffer.String(), nil
|
|
}
|
|
return "", ErrNotYetFinished
|
|
}
|
|
|
|
// Length returns the length in bytes of the HTML Checker result.
|
|
func (ht *htmlCheckerImpl) Length() (int, error) {
|
|
if ht.finished {
|
|
return ht.outputBuffer.Len(), nil
|
|
}
|
|
return 0, ErrNotYetFinished
|
|
}
|
|
|
|
// Lines returns the number of lines of text in the HTML Checker result.
|
|
func (ht *htmlCheckerImpl) Lines() (int, error) {
|
|
if ht.finished {
|
|
return ht.lines, nil
|
|
}
|
|
return 0, ErrNotYetFinished
|
|
}
|
|
|
|
// Counter returns the value of a counter maintained by the HTML Checker (corresponding to a rewriter).
|
|
func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
|
|
if ht.finished {
|
|
cr, ok := ht.counters[name]
|
|
if ok {
|
|
return cr.GetCount(), nil
|
|
}
|
|
return 0, nil
|
|
}
|
|
return 0, ErrNotYetFinished
|
|
}
|
|
|
|
// GetContext returns an HTML checker context value.
|
|
func (ht *htmlCheckerImpl) GetContext(name string) any {
|
|
return ht.contextData[name]
|
|
}
|
|
|
|
// SetContext sets an HTML checker context value.
|
|
func (ht *htmlCheckerImpl) SetContext(name string, value any) {
|
|
ht.contextData[name] = value
|
|
}
|
|
|
|
// ExternalRefs returns a list of URLs as external references in the parsed text.
|
|
func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
|
|
if ht.finished {
|
|
rc := make([]*url.URL, len(ht.externalReferences))
|
|
p := 0
|
|
for url := range ht.externalReferences {
|
|
rc[p] = url
|
|
p++
|
|
}
|
|
return rc, nil
|
|
}
|
|
return nil, ErrNotYetFinished
|
|
}
|
|
|
|
// InternalRefs returns a list of internal references in the parsed text.
|
|
func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
|
|
if ht.finished {
|
|
rc := make([]string, len(ht.internalReferences))
|
|
p := 0
|
|
for s := range ht.internalReferences {
|
|
rc[p] = s
|
|
p++
|
|
}
|
|
}
|
|
return nil, ErrNotYetFinished
|
|
}
|