documentation pass done, ready to use the HTML Checker in other code
This commit is contained in:
+216
-118
@@ -12,7 +12,6 @@ package htmlcheck
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"net/url"
|
||||
"strings"
|
||||
"unicode"
|
||||
@@ -23,24 +22,38 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* External definitions
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// HTMLChecker is a component that checks HTML and reformats it as needed.
|
||||
type HTMLChecker interface {
|
||||
Append(string) error
|
||||
Finish() error
|
||||
Reset()
|
||||
Value() (string, error)
|
||||
Length() (int, error)
|
||||
Lines() (int, error)
|
||||
Counter(string) (int, error)
|
||||
GetContext(string) any
|
||||
SetContext(string, any)
|
||||
ExternalRefs() ([]*url.URL, error)
|
||||
InternalRefs() ([]string, error)
|
||||
Append(string) error // add additional string to the checker state
|
||||
Finish() error // finish parsing HTML
|
||||
Reset() // clear state
|
||||
Value() (string, error) // return value
|
||||
Length() (int, error) // return text length
|
||||
Lines() (int, error) // return number of lines
|
||||
Counter(string) (int, error) // return value of a counter
|
||||
GetContext(string) any // get a context value
|
||||
SetContext(string, any) // set a context value
|
||||
ExternalRefs() ([]*url.URL, error) // return a list of external references
|
||||
InternalRefs() ([]string, error) // return a list of internal references
|
||||
}
|
||||
|
||||
// ErrAlreadyFinished is a common error that's returned if the checker has been finished when it shouldn't be.
|
||||
var ErrAlreadyFinished = errors.New("the HTML checker has already finished")
|
||||
|
||||
// ErrNotYetFinished is a common error that's returned if the checker has not been finished when it should be.
|
||||
var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* Internal definitions
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// htmlCheckerBackend is an interface used by subcomponents to communicate back to the HTML checker.
|
||||
type htmlCheckerBackend interface {
|
||||
getCheckerAttrValue(string) string
|
||||
sendTagMessage(string)
|
||||
@@ -66,33 +79,40 @@ const htmlMarginSlop = 5
|
||||
// hyphApos is used to find hyphens and apostrophes.
|
||||
const hyphApos = "-'"
|
||||
|
||||
// htmlCheckerImpl is the implementation of the HTML checker.
|
||||
type htmlCheckerImpl struct {
|
||||
config *HTMLCheckerConfig
|
||||
started bool
|
||||
finished bool
|
||||
state int
|
||||
quoteChar byte
|
||||
parenLevel int
|
||||
columns int
|
||||
lines int
|
||||
noBreakCount int
|
||||
triggerWBR bool
|
||||
outputBuffer strings.Builder
|
||||
tempBuffer strings.Builder
|
||||
tagStack *util.Stack[*tag]
|
||||
counters map[string]*countingRewriter
|
||||
stringRewriters []rewriter
|
||||
wordRewriters []rewriter
|
||||
tagRewriters []rewriter
|
||||
parenRewriters []rewriter
|
||||
outputFilters []outputFilter
|
||||
rawOutputFilters []outputFilter
|
||||
contextData map[string]any
|
||||
externalReferences map[*url.URL]bool
|
||||
internalReferences map[string]bool
|
||||
tagSet *bitset.BitSet
|
||||
config *HTMLCheckerConfig // pointer to configuration
|
||||
started bool // has checker been started?
|
||||
finished bool // has checker been finished?
|
||||
state int // current state
|
||||
quoteChar byte // quote character to match in stateTagQuote
|
||||
parenLevel int // parenthesis level in stateParen
|
||||
columns int // current column position - runes, not bytes!
|
||||
lines int // lines of text
|
||||
noBreakCount int // current NOBR nesting count
|
||||
triggerWBR bool // do we need to trigger a word break?
|
||||
outputBuffer strings.Builder // output is gathered here
|
||||
tempBuffer strings.Builder // input is gathered here within a state and flushed on transition
|
||||
tagStack *util.Stack[*tag] // keeps track of nested HTML tags
|
||||
counters map[string]*countingRewriter // counters for times rewrites have happened
|
||||
stringRewriters []rewriter // loaded string rewriters
|
||||
wordRewriters []rewriter // loaded word rewriters
|
||||
tagRewriters []rewriter // loaded tag rewriters
|
||||
parenRewriters []rewriter // loaded parenthesis rewriters
|
||||
outputFilters []outputFilter // loaded standard output filters
|
||||
rawOutputFilters []outputFilter // loaded "raw" output filters
|
||||
contextData map[string]any // holds context values
|
||||
externalReferences map[*url.URL]bool // saved external references
|
||||
internalReferences map[string]bool // saved internal references
|
||||
tagSet *bitset.BitSet // set of valid tags from configuration
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* Construction helpers
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// copyRewriters looks up all rewriters in the source array and builds a target array.
|
||||
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
||||
for i := range source {
|
||||
rw, ok := rewriterRegistry[source[i]]
|
||||
@@ -109,6 +129,7 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// copyOutputFilters looks up all output filters in the source array and builds a target array.
|
||||
func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
|
||||
for i := range source {
|
||||
f, ok := outputFilterRegistry[source[i]]
|
||||
@@ -120,6 +141,18 @@ func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []strin
|
||||
}
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* The construction function
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* AmNewHTMLChecker creates a new HTML Checker object.
|
||||
* Parametrers:
|
||||
* configName - Name of the configuration to use.
|
||||
* Returns:
|
||||
* New HTML checker reference.
|
||||
* Standard Go error status.
|
||||
*/
|
||||
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
config, ok := configsRegistry[configName]
|
||||
if !ok {
|
||||
@@ -161,6 +194,12 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
return &rc, nil
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* Implementations from htmlCheckerBackend and rewriterServices
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// getCheckerAttrValue returns the value of an HTML checker attribute.
|
||||
func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
|
||||
if name == "ANCHORTAIL" {
|
||||
return ht.config.AnchorTail
|
||||
@@ -168,6 +207,7 @@ func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// sendTagMessage offers specific HTML tags a way to send messages to affect the HTML checker's state.
|
||||
func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
|
||||
switch msg {
|
||||
case "NOBR":
|
||||
@@ -179,26 +219,37 @@ func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
|
||||
}
|
||||
}
|
||||
|
||||
// getCheckerContextValue returns a context value set on the HTML checker.
|
||||
func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
// addExternalRef adds an external reference to the checker's logs.
|
||||
func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
|
||||
ht.externalReferences[ref] = true
|
||||
}
|
||||
|
||||
// addInternalRef adds an internal reference to the checker's logs.
|
||||
func (ht *htmlCheckerImpl) addInternalRef(ref string) {
|
||||
ht.internalReferences[ref] = true
|
||||
}
|
||||
|
||||
// rewriterAttrValue returns the value of an HTML checker attribute.
|
||||
func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
|
||||
return ht.getCheckerAttrValue(name)
|
||||
}
|
||||
|
||||
// rewriterContextValue returns a context value set on the HTML checker.
|
||||
func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* Internal functions forming the meat of the parser
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
// emitRune emits a rune to the output buffer, respecting the specified output filters.
|
||||
func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
|
||||
handled := false
|
||||
if len(filters) > 0 {
|
||||
@@ -209,75 +260,76 @@ func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols b
|
||||
break // found a filter to handle it, done
|
||||
}
|
||||
}
|
||||
if !handled { // output the raw character
|
||||
ht.outputBuffer.WriteRune(ch)
|
||||
}
|
||||
if countCols && ht.config.WordWrap > 0 {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
if !handled { // output the raw character
|
||||
ht.outputBuffer.WriteRune(ch)
|
||||
}
|
||||
if countCols && ht.config.WordWrap > 0 {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
|
||||
// emitString emits an entire string to the output buffer, respecting the specified output filters.
|
||||
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
|
||||
if str == "" {
|
||||
return
|
||||
}
|
||||
realCountCols := countCols && (ht.config.WordWrap > 0)
|
||||
if len(filters) == 0 {
|
||||
// if there are no filters, just output the whole thing
|
||||
ht.outputBuffer.WriteString(str)
|
||||
if realCountCols {
|
||||
ht.columns += utf8.RuneCountInString(str)
|
||||
}
|
||||
return
|
||||
}
|
||||
temp := str
|
||||
for len(temp) > 0 {
|
||||
// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
|
||||
outputLen := len(temp)
|
||||
if str != "" {
|
||||
realCountCols := countCols && ht.config.WordWrap > 0
|
||||
if len(filters) == 0 {
|
||||
// if there are no filters, just output the whole thing
|
||||
ht.outputBuffer.WriteString(str)
|
||||
if realCountCols {
|
||||
ht.columns += utf8.RuneCountInString(str)
|
||||
}
|
||||
} else {
|
||||
temp := str
|
||||
for len(temp) > 0 {
|
||||
// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
|
||||
outputLen := len(temp)
|
||||
|
||||
// Now look at each of the output filters to see if we should try outputting a lesser amount
|
||||
// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
|
||||
var stopper outputFilter = nil
|
||||
for _, of := range filters {
|
||||
// find the length of characters that DOESN'T match this filter
|
||||
lnm := of.lengthNoMatch(temp)
|
||||
if lnm >= 0 && lnm < outputLen {
|
||||
// we've found a new stopper - record the length and the filter
|
||||
outputLen = lnm
|
||||
stopper = of
|
||||
// Now look at each of the output filters to see if we should try outputting a lesser amount
|
||||
// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
|
||||
var stopper outputFilter = nil
|
||||
for _, of := range filters {
|
||||
// find the length of characters that DOESN'T match this filter
|
||||
lnm := of.lengthNoMatch(temp)
|
||||
if lnm >= 0 && lnm < outputLen {
|
||||
// we've found a new stopper - record the length and the filter
|
||||
outputLen = lnm
|
||||
stopper = of
|
||||
}
|
||||
if outputLen <= 0 {
|
||||
break // nothing left to do here
|
||||
}
|
||||
}
|
||||
if outputLen > 0 {
|
||||
// move over the unaltered characters first
|
||||
ht.outputBuffer.WriteString(temp[:outputLen])
|
||||
if realCountCols {
|
||||
ht.columns += utf8.RuneCountInString(temp[:outputLen])
|
||||
}
|
||||
}
|
||||
if stopper != nil {
|
||||
// one of the output filters stopped us, try invoking it
|
||||
tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
|
||||
outputLen += bsiz
|
||||
if !stopper.tryOutputRune(ht.outputBuffer, tmpch) {
|
||||
ht.outputBuffer.WriteRune(tmpch)
|
||||
}
|
||||
if realCountCols {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
// Chop the string and go around again.
|
||||
if outputLen == len(temp) {
|
||||
temp = ""
|
||||
} else if outputLen > 0 {
|
||||
temp = temp[outputLen:]
|
||||
}
|
||||
}
|
||||
if outputLen <= 0 {
|
||||
break // nothing left to do here
|
||||
}
|
||||
}
|
||||
if outputLen > 0 {
|
||||
// move over the unaltered characters first
|
||||
ht.outputBuffer.WriteString(temp[:outputLen])
|
||||
if realCountCols {
|
||||
ht.columns += utf8.RuneCountInString(temp[:outputLen])
|
||||
}
|
||||
}
|
||||
if stopper != nil {
|
||||
// one of the output filters stopped us, try invoking it
|
||||
tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
|
||||
outputLen += bsiz
|
||||
if !stopper.tryOutputRune(ht.outputBuffer, tmpch) {
|
||||
ht.outputBuffer.WriteRune(tmpch)
|
||||
}
|
||||
if realCountCols {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
// Chop the string and go around again.
|
||||
if outputLen == len(temp) {
|
||||
temp = ""
|
||||
} else if outputLen > 0 {
|
||||
temp = temp[outputLen:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// emitLineBreak emits a line break to the output.
|
||||
func (ht *htmlCheckerImpl) emitLineBreak() {
|
||||
ht.emitString("\r\n", ht.rawOutputFilters, false)
|
||||
if ht.config.WordWrap > 0 {
|
||||
@@ -286,34 +338,38 @@ func (ht *htmlCheckerImpl) emitLineBreak() {
|
||||
ht.lines++
|
||||
}
|
||||
|
||||
// emitPossibleLineBreak emits a line break to the output, if it's warranted.
|
||||
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) {
|
||||
// ensureSpaceOnLine makes sure we have enough space on the current line for a certain number of runes, adding a line break if needed.
|
||||
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nrunes int) {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
||||
// add a line break if needed here
|
||||
remainSpace := ht.config.WordWrap - ht.columns
|
||||
if remainSpace < nchars {
|
||||
if remainSpace < nrunes {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// emitMarkupData emits the markup data in the specified data structure.
|
||||
func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
|
||||
if !md.rescan {
|
||||
ht.ensureSpaceOnLine(len(md.text))
|
||||
ht.ensureSpaceOnLine(utf8.RuneCountInString(md.text))
|
||||
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
|
||||
ht.emitString(md.text, ht.outputFilters, true)
|
||||
ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
|
||||
}
|
||||
}
|
||||
|
||||
// emitBrackedtedMarkupData emits the marketed data in the specified data structure, with prefix and suffix runes.
|
||||
func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
|
||||
if !md.rescan {
|
||||
l := len(md.text)
|
||||
l := utf8.RuneCountInString(md.text)
|
||||
if l > 0 {
|
||||
l += 2
|
||||
}
|
||||
@@ -330,6 +386,7 @@ func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune,
|
||||
}
|
||||
}
|
||||
|
||||
// doFlushWhitespace flushes out all the whitespace in the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) doFlushWhitespace() {
|
||||
outputLen := ht.tempBuffer.Len()
|
||||
if outputLen > 0 {
|
||||
@@ -356,6 +413,7 @@ func (ht *htmlCheckerImpl) doFlushWhitespace() {
|
||||
}
|
||||
}
|
||||
|
||||
// doFlushNewlines flushes all the newlines that are in the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) doFlushNewlines() {
|
||||
// Measure the number of line breaks we have.
|
||||
lineBreaks, crs := 0, 0
|
||||
@@ -385,6 +443,7 @@ func (ht *htmlCheckerImpl) doFlushNewlines() {
|
||||
}
|
||||
}
|
||||
|
||||
// emit line breaks
|
||||
for lineBreaks > 0 {
|
||||
ht.emitLineBreak()
|
||||
lineBreaks--
|
||||
@@ -393,6 +452,7 @@ func (ht *htmlCheckerImpl) doFlushNewlines() {
|
||||
ht.state = stateWhitespace
|
||||
}
|
||||
|
||||
// emitFromStartOfTempBuffer emits a certain number of runes from the start of the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
|
||||
if nrunes > 0 {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
||||
@@ -420,6 +480,7 @@ func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
|
||||
}
|
||||
}
|
||||
|
||||
// attemptRewrite attempts to apply a list of rewriters on the text, returning the first one that matches.
|
||||
func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
|
||||
for _, r := range rewriters {
|
||||
rc := r.Rewrite(data, ht)
|
||||
@@ -430,6 +491,7 @@ func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *ma
|
||||
return nil
|
||||
}
|
||||
|
||||
// doFlushString attempts to flush a string from the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) doFlushString() bool {
|
||||
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
|
||||
if md != nil {
|
||||
@@ -519,9 +581,11 @@ func (ht *htmlCheckerImpl) doFlushString() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// handleAsHTML attempts to handle the contents of the tag in the temporary buffer as HTML.
|
||||
func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
||||
ht.triggerWBR = false
|
||||
tempString := ht.tempBuffer.String()
|
||||
|
||||
// Figure out where the start of the command word is.
|
||||
startCmd := 0
|
||||
closingTag := false
|
||||
@@ -543,8 +607,7 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
||||
// command word is empty or is too long to be an HTML tag
|
||||
return false
|
||||
}
|
||||
possTagName := tempString[startCmd:endCmd]
|
||||
tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)]
|
||||
tagIndex, ok := tagNameToIndex[strings.ToUpper(tempString[startCmd:endCmd])]
|
||||
if !ok {
|
||||
// not a known HTML tag
|
||||
return false
|
||||
@@ -584,6 +647,7 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
||||
ht.emitString(realTagData, ht.rawOutputFilters, false)
|
||||
ht.emitRune('>', ht.rawOutputFilters, false)
|
||||
|
||||
// Determine whether this tag causes a "logical line break."
|
||||
logicalLineBreak := false
|
||||
if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
|
||||
// word break is logical line break, but only within no-break tags
|
||||
@@ -597,10 +661,12 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// containsHTMLComment returns true if the temporary buffer contains (the start of) an HTML comment.
|
||||
func (ht *htmlCheckerImpl) containsHTMLComment() bool {
|
||||
return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
|
||||
}
|
||||
|
||||
// containsCompleteHTMLComment returns true if the temporary buffer contains a complete HTML comment.
|
||||
func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
|
||||
if ht.tempBuffer.Len() >= 5 {
|
||||
s := ht.tempBuffer.String()
|
||||
@@ -609,6 +675,7 @@ func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// containsXMLConstruct returns true if the temporary buffer contains an XML-style namespaced tag.
|
||||
func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
|
||||
tempString := ht.tempBuffer.String()
|
||||
ptr := 0
|
||||
@@ -626,18 +693,17 @@ func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// finishTag processes and outputs the tag in the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) finishTag() {
|
||||
if ht.containsHTMLComment() {
|
||||
if ht.containsCompleteHTMLComment() {
|
||||
if !ht.config.DiscardComments {
|
||||
// output the comment in the raw
|
||||
ht.emitRune('<', ht.rawOutputFilters, false)
|
||||
ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
|
||||
ht.emitRune('>', ht.rawOutputFilters, false)
|
||||
// clear state and retun to parsing
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
}
|
||||
if ht.containsCompleteHTMLComment() && !ht.config.DiscardComments {
|
||||
// output the comment in the raw
|
||||
ht.emitRune('<', ht.rawOutputFilters, false)
|
||||
ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
|
||||
ht.emitRune('>', ht.rawOutputFilters, false)
|
||||
// clear state and return to parsing
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -680,6 +746,7 @@ func (ht *htmlCheckerImpl) finishTag() {
|
||||
ht.parse(">")
|
||||
}
|
||||
|
||||
// finishParen processes and outputs the parenthesized construct in the temporary buffer.
|
||||
func (ht *htmlCheckerImpl) finishParen() {
|
||||
// Try to handle the element using a paren rewriter
|
||||
md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
|
||||
@@ -708,6 +775,7 @@ func (ht *htmlCheckerImpl) finishParen() {
|
||||
ht.parse(")")
|
||||
}
|
||||
|
||||
// parse handles the meat of parsing an input string; it runs the state machine on the input.
|
||||
func (ht *htmlCheckerImpl) parse(str string) {
|
||||
i := 0
|
||||
for i < len(str) {
|
||||
@@ -785,7 +853,7 @@ func (ht *htmlCheckerImpl) parse(str string) {
|
||||
ht.tempBuffer.WriteByte('\\')
|
||||
}
|
||||
} else {
|
||||
// just append the backslash notrmally
|
||||
// just append the backslash normally
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
@@ -801,7 +869,7 @@ func (ht *htmlCheckerImpl) parse(str string) {
|
||||
case '<': // output < and stay in this state
|
||||
ht.emitRune('<', ht.outputFilters, true)
|
||||
i++
|
||||
default:
|
||||
default: // begin processing tag
|
||||
ht.state = stateTag
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
@@ -822,14 +890,15 @@ func (ht *htmlCheckerImpl) parse(str string) {
|
||||
}
|
||||
case stateParen:
|
||||
switch ch {
|
||||
case '(':
|
||||
case '(': // nest parentheses one level deeper
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
ht.parenLevel++
|
||||
i++
|
||||
case ')':
|
||||
if ht.parenLevel == 0 {
|
||||
ht.finishParen()
|
||||
ht.finishParen() // finish paren, changing state and recursively parsing if necessary
|
||||
} else {
|
||||
// nest parentheses one LESS level deeper
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
ht.parenLevel--
|
||||
}
|
||||
@@ -851,10 +920,23 @@ func (ht *htmlCheckerImpl) parse(str string) {
|
||||
} else {
|
||||
ht.doFlushNewlines()
|
||||
}
|
||||
default:
|
||||
log.Fatalf("invalid parser state: %d", ht.state)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
* Implementations from the HTMLChecker interface
|
||||
*----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* Append adds an additional string to the HTML checker data.
|
||||
* Parameters:
|
||||
* str - The string to be added and parsed.
|
||||
* Returns:
|
||||
* Standard Go error status.
|
||||
*/
|
||||
func (ht *htmlCheckerImpl) Append(str string) error {
|
||||
if ht.finished {
|
||||
return ErrAlreadyFinished
|
||||
@@ -868,6 +950,10 @@ func (ht *htmlCheckerImpl) Append(str string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
/* Finish completes the HTML checker parsing and makes the result available.
|
||||
* Returns:
|
||||
* Standard Go error status.
|
||||
*/
|
||||
func (ht *htmlCheckerImpl) Finish() error {
|
||||
if ht.finished {
|
||||
return ErrAlreadyFinished
|
||||
@@ -900,6 +986,7 @@ func (ht *htmlCheckerImpl) Finish() error {
|
||||
}
|
||||
running = true
|
||||
case stateParen:
|
||||
// we won't finish this, so it's automatically rejected
|
||||
rejection := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte('(')
|
||||
@@ -923,6 +1010,7 @@ func (ht *htmlCheckerImpl) Finish() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Reset clears the internal state of the HTML Checker.
|
||||
func (ht *htmlCheckerImpl) Reset() {
|
||||
ht.started = false
|
||||
ht.finished = false
|
||||
@@ -933,17 +1021,20 @@ func (ht *htmlCheckerImpl) Reset() {
|
||||
ht.lines = 0
|
||||
ht.parenLevel = 0
|
||||
ht.outputBuffer.Reset()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tagStack.Clear()
|
||||
for u := range ht.externalReferences {
|
||||
delete(ht.externalReferences, u)
|
||||
}
|
||||
for k := range ht.internalReferences {
|
||||
delete(ht.internalReferences, k)
|
||||
}
|
||||
for c := range maps.Values(ht.counters) {
|
||||
for _, c := range ht.counters {
|
||||
c.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
// Value returns the value of the output from the HTML Checker.
|
||||
func (ht *htmlCheckerImpl) Value() (string, error) {
|
||||
if ht.finished {
|
||||
return ht.outputBuffer.String(), nil
|
||||
@@ -951,6 +1042,7 @@ func (ht *htmlCheckerImpl) Value() (string, error) {
|
||||
return "", ErrNotYetFinished
|
||||
}
|
||||
|
||||
// Length returns the length in bytes of the HTML Checker result.
|
||||
func (ht *htmlCheckerImpl) Length() (int, error) {
|
||||
if ht.finished {
|
||||
return ht.outputBuffer.Len(), nil
|
||||
@@ -958,6 +1050,7 @@ func (ht *htmlCheckerImpl) Length() (int, error) {
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
// Lines returns the number of lines of text in the HTML Checker result.
|
||||
func (ht *htmlCheckerImpl) Lines() (int, error) {
|
||||
if ht.finished {
|
||||
return ht.lines, nil
|
||||
@@ -965,6 +1058,7 @@ func (ht *htmlCheckerImpl) Lines() (int, error) {
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
// Counter returns the value of a counter maintained by the HTML Checker (corresponding to a rewriter).
|
||||
func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
|
||||
if ht.finished {
|
||||
cr, ok := ht.counters[name]
|
||||
@@ -976,19 +1070,22 @@ func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
// GetContext returns an HTML checker context value.
|
||||
func (ht *htmlCheckerImpl) GetContext(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
// SetContext sets an HTML checker context value.
|
||||
func (ht *htmlCheckerImpl) SetContext(name string, value any) {
|
||||
ht.contextData[name] = value
|
||||
}
|
||||
|
||||
// ExternalRefs returns a list of URLs as external references in the parsed text.
|
||||
func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
|
||||
if ht.finished {
|
||||
rc := make([]*url.URL, len(ht.externalReferences))
|
||||
p := 0
|
||||
for url := range maps.Keys(ht.externalReferences) {
|
||||
for url := range ht.externalReferences {
|
||||
rc[p] = url
|
||||
p++
|
||||
}
|
||||
@@ -997,11 +1094,12 @@ func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
|
||||
return nil, ErrNotYetFinished
|
||||
}
|
||||
|
||||
// InternalRefs returns a list of internal references in the parsed text.
|
||||
func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
|
||||
if ht.finished {
|
||||
rc := make([]string, len(ht.internalReferences))
|
||||
p := 0
|
||||
for s := range maps.Keys(ht.internalReferences) {
|
||||
for s := range ht.internalReferences {
|
||||
rc[p] = s
|
||||
p++
|
||||
}
|
||||
|
||||
@@ -42,6 +42,7 @@ type HTMLCheckerConfigFile struct {
|
||||
Configs []HTMLCheckerConfig `yaml:"configs"`
|
||||
}
|
||||
|
||||
// defaultAnchorTail is the default value of the anchor tail.
|
||||
const defaultAnchorTail = "TARGET=\"Wander\""
|
||||
|
||||
//go:embed configs.yaml
|
||||
|
||||
@@ -32,7 +32,7 @@ type EmoticonConfig struct {
|
||||
emos map[string]*EmoticonDef
|
||||
}
|
||||
|
||||
// emoticonRewriter is the implementation of rewriter in this file
|
||||
// emoticonRewriter is the implementation of rewriter in this file.
|
||||
type emoticonRewriter struct {
|
||||
config *EmoticonConfig
|
||||
prefixChars []byte
|
||||
|
||||
+1
-1
@@ -33,7 +33,7 @@ type htmlEncodingFilter struct{}
|
||||
// htmlEscapedChars is a list of HTML characters that are escaped.
|
||||
const htmlEscapedChars = "<>&"
|
||||
|
||||
// tryOutputCharacter outputs a character that needs to be escaped.
|
||||
// tryOutputRune outputs a rune that needs to be escaped.
|
||||
func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool {
|
||||
switch ch {
|
||||
case '<':
|
||||
|
||||
+12
-12
@@ -38,10 +38,10 @@ const (
|
||||
tagSetNSCPForms = 16 // Netscape form tags
|
||||
tagSetNSCPBlockFormat = 17 // Netscape block-formatting tags
|
||||
tagSetNSCPServer = 18 // the Netscape <SERVER> tag
|
||||
tagSetMSFTDocFormat = 19 // Microsoft-specific document formatting
|
||||
tagSetMSFTInlineFormat = 20 // Microsoft-specific inline formatting
|
||||
tagSetMSFTBlockFormat = 21 // Microsoft-specific block formatting
|
||||
tagSetMSFTActiveContent = 22 // Microsoft-specific active content
|
||||
tagSetMSFTDocFormat = 19 // Micro$oft-specific document formatting
|
||||
tagSetMSFTInlineFormat = 20 // Micro$oft-specific inline formatting
|
||||
tagSetMSFTBlockFormat = 21 // Micro$oft-specific block formatting
|
||||
tagSetMSFTActiveContent = 22 // Micro$oft-specific active content
|
||||
tagSetServerPage = 23 // server-side page use
|
||||
tagSetJavaServer = 24 // Java server page use
|
||||
tagSetComment = 25 // HTML comments
|
||||
@@ -54,14 +54,14 @@ type rewriteContentsFunc func(*tag, string, bool, htmlCheckerBackend) string
|
||||
|
||||
// tag is a structure describing a particular HTML tag.
|
||||
type tag struct {
|
||||
name string
|
||||
index int
|
||||
lineBreak bool
|
||||
allowClose bool
|
||||
balanceTags bool
|
||||
clb causeLineBreakFunc
|
||||
ct closingTagFunc
|
||||
rwc rewriteContentsFunc
|
||||
name string // tag name, upper case
|
||||
index int // index in the array
|
||||
lineBreak bool // does the tag cause line breaks?
|
||||
allowClose bool // is a close form of the tag allowed?
|
||||
balanceTags bool // do we need to balance open and close tags?
|
||||
clb causeLineBreakFunc // does this tag cause line breaks?
|
||||
ct closingTagFunc // generate closing tag
|
||||
rwc rewriteContentsFunc // rewrite the contents if necessary
|
||||
}
|
||||
|
||||
// causeLineBreak returns true if the tag causes a line break.
|
||||
|
||||
Reference in New Issue
Block a user