documentation pass done, ready to use the HTML Checker in other code

This commit is contained in:
2025-11-03 15:20:41 -07:00
parent 4f9cdde1f2
commit 1ba02f37e9
7 changed files with 255 additions and 132 deletions
+158 -60
View File
@@ -12,7 +12,6 @@ package htmlcheck
import ( import (
"errors" "errors"
"fmt" "fmt"
"maps"
"net/url" "net/url"
"strings" "strings"
"unicode" "unicode"
@@ -23,24 +22,38 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
/*----------------------------------------------------------------------------
* External definitions
*----------------------------------------------------------------------------
*/
// HTMLChecker is a component that checks HTML and reformats it as needed. // HTMLChecker is a component that checks HTML and reformats it as needed.
type HTMLChecker interface { type HTMLChecker interface {
Append(string) error Append(string) error // add additional string to the checker state
Finish() error Finish() error // finish parsing HTML
Reset() Reset() // clear state
Value() (string, error) Value() (string, error) // return value
Length() (int, error) Length() (int, error) // return text length
Lines() (int, error) Lines() (int, error) // return number of lines
Counter(string) (int, error) Counter(string) (int, error) // return value of a counter
GetContext(string) any GetContext(string) any // get a context value
SetContext(string, any) SetContext(string, any) // set a context value
ExternalRefs() ([]*url.URL, error) ExternalRefs() ([]*url.URL, error) // return a list of external references
InternalRefs() ([]string, error) InternalRefs() ([]string, error) // return a list of internal references
} }
// ErrAlreadyFinished is a common error that's returned if the checker has been finished when it shouldn't be.
var ErrAlreadyFinished = errors.New("the HTML checker has already finished") var ErrAlreadyFinished = errors.New("the HTML checker has already finished")
// ErrNotYetFinished is a common error that's returned if the checker has not been finished when it should be.
var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished") var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
/*----------------------------------------------------------------------------
* Internal definitions
*----------------------------------------------------------------------------
*/
// htmlCheckerBackend is an interface used by subcomponents to communicate back to the HTML checker.
type htmlCheckerBackend interface { type htmlCheckerBackend interface {
getCheckerAttrValue(string) string getCheckerAttrValue(string) string
sendTagMessage(string) sendTagMessage(string)
@@ -66,33 +79,40 @@ const htmlMarginSlop = 5
// hyphApos is used to find hyphens and apostrophes. // hyphApos is used to find hyphens and apostrophes.
const hyphApos = "-'" const hyphApos = "-'"
// htmlCheckerImpl is the implementation of the HTML checker.
type htmlCheckerImpl struct { type htmlCheckerImpl struct {
config *HTMLCheckerConfig config *HTMLCheckerConfig // pointer to configuration
started bool started bool // has checker been started?
finished bool finished bool // has checker been finished?
state int state int // current state
quoteChar byte quoteChar byte // quote character to match in stateTagQuote
parenLevel int parenLevel int // parenthesis level in stateParen
columns int columns int // current column position - runes, not bytes!
lines int lines int // lines of text
noBreakCount int noBreakCount int // current NOBR nesting count
triggerWBR bool triggerWBR bool // do we need to trigger a word break?
outputBuffer strings.Builder outputBuffer strings.Builder // output is gathered here
tempBuffer strings.Builder tempBuffer strings.Builder // input is gathered here within a state and flushed on transition
tagStack *util.Stack[*tag] tagStack *util.Stack[*tag] // keeps track of nested HTML tags
counters map[string]*countingRewriter counters map[string]*countingRewriter // counters for times rewrites have happened
stringRewriters []rewriter stringRewriters []rewriter // loaded string rewriters
wordRewriters []rewriter wordRewriters []rewriter // loaded word rewriters
tagRewriters []rewriter tagRewriters []rewriter // loaded tag rewriters
parenRewriters []rewriter parenRewriters []rewriter // loaded parenthesis rewriters
outputFilters []outputFilter outputFilters []outputFilter // loaded standard output filters
rawOutputFilters []outputFilter rawOutputFilters []outputFilter // loaded "raw" output filters
contextData map[string]any contextData map[string]any // holds context values
externalReferences map[*url.URL]bool externalReferences map[*url.URL]bool // saved external references
internalReferences map[string]bool internalReferences map[string]bool // saved internal references
tagSet *bitset.BitSet tagSet *bitset.BitSet // set of valid tags from configuration
} }
/*----------------------------------------------------------------------------
* Construction helpers
*----------------------------------------------------------------------------
*/
// copyRewriters looks up all rewriters in the source array and builds a target array.
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) { func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
for i := range source { for i := range source {
rw, ok := rewriterRegistry[source[i]] rw, ok := rewriterRegistry[source[i]]
@@ -109,6 +129,7 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
} }
} }
// copyOutputFilters looks up all output filters in the source array and builds a target array.
func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) { func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
for i := range source { for i := range source {
f, ok := outputFilterRegistry[source[i]] f, ok := outputFilterRegistry[source[i]]
@@ -120,6 +141,18 @@ func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []strin
} }
} }
/*----------------------------------------------------------------------------
* The construction function
*----------------------------------------------------------------------------
*/
/* AmNewHTMLChecker creates a new HTML Checker object.
* Parameters:
* configName - Name of the configuration to use.
* Returns:
* New HTML checker reference.
* Standard Go error status.
*/
func AmNewHTMLChecker(configName string) (HTMLChecker, error) { func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
config, ok := configsRegistry[configName] config, ok := configsRegistry[configName]
if !ok { if !ok {
@@ -161,6 +194,12 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
return &rc, nil return &rc, nil
} }
/*----------------------------------------------------------------------------
* Implementations from htmlCheckerBackend and rewriterServices
*----------------------------------------------------------------------------
*/
// getCheckerAttrValue returns the value of an HTML checker attribute.
func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string { func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
if name == "ANCHORTAIL" { if name == "ANCHORTAIL" {
return ht.config.AnchorTail return ht.config.AnchorTail
@@ -168,6 +207,7 @@ func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
return "" return ""
} }
// sendTagMessage offers specific HTML tags a way to send messages to affect the HTML checker's state.
func (ht *htmlCheckerImpl) sendTagMessage(msg string) { func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
switch msg { switch msg {
case "NOBR": case "NOBR":
@@ -179,26 +219,37 @@ func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
} }
} }
// getCheckerContextValue returns a context value set on the HTML checker.
func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any { func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
return ht.contextData[name] return ht.contextData[name]
} }
// addExternalRef adds an external reference to the checker's logs.
func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) { func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
ht.externalReferences[ref] = true ht.externalReferences[ref] = true
} }
// addInternalRef adds an internal reference to the checker's logs.
func (ht *htmlCheckerImpl) addInternalRef(ref string) { func (ht *htmlCheckerImpl) addInternalRef(ref string) {
ht.internalReferences[ref] = true ht.internalReferences[ref] = true
} }
// rewriterAttrValue returns the value of an HTML checker attribute.
func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string { func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
return ht.getCheckerAttrValue(name) return ht.getCheckerAttrValue(name)
} }
// rewriterContextValue returns a context value set on the HTML checker.
func (ht *htmlCheckerImpl) rewriterContextValue(name string) any { func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
return ht.contextData[name] return ht.contextData[name]
} }
/*----------------------------------------------------------------------------
* Internal functions forming the meat of the parser
*----------------------------------------------------------------------------
*/
// emitRune emits a rune to the output buffer, respecting the specified output filters.
func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) { func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
handled := false handled := false
if len(filters) > 0 { if len(filters) > 0 {
@@ -209,28 +260,26 @@ func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols b
break // found a filter to handle it, done break // found a filter to handle it, done
} }
} }
}
if !handled { // output the raw character if !handled { // output the raw character
ht.outputBuffer.WriteRune(ch) ht.outputBuffer.WriteRune(ch)
} }
if countCols && ht.config.WordWrap > 0 { if countCols && ht.config.WordWrap > 0 {
ht.columns++ ht.columns++
} }
}
} }
// emitString emits an entire string to the output buffer, respecting the specified output filters.
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) { func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
if str == "" { if str != "" {
return realCountCols := countCols && ht.config.WordWrap > 0
}
realCountCols := countCols && (ht.config.WordWrap > 0)
if len(filters) == 0 { if len(filters) == 0 {
// if there are no filters, just output the whole thing // if there are no filters, just output the whole thing
ht.outputBuffer.WriteString(str) ht.outputBuffer.WriteString(str)
if realCountCols { if realCountCols {
ht.columns += utf8.RuneCountInString(str) ht.columns += utf8.RuneCountInString(str)
} }
return } else {
}
temp := str temp := str
for len(temp) > 0 { for len(temp) > 0 {
// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing. // We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
@@ -276,8 +325,11 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
temp = temp[outputLen:] temp = temp[outputLen:]
} }
} }
}
}
} }
// emitLineBreak emits a line break to the output.
func (ht *htmlCheckerImpl) emitLineBreak() { func (ht *htmlCheckerImpl) emitLineBreak() {
ht.emitString("\r\n", ht.rawOutputFilters, false) ht.emitString("\r\n", ht.rawOutputFilters, false)
if ht.config.WordWrap > 0 { if ht.config.WordWrap > 0 {
@@ -286,34 +338,38 @@ func (ht *htmlCheckerImpl) emitLineBreak() {
ht.lines++ ht.lines++
} }
// emitPossibleLineBreak emits a line break to the output, if it's warranted.
func (ht *htmlCheckerImpl) emitPossibleLineBreak() { func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap { if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
ht.emitLineBreak() ht.emitLineBreak()
} }
} }
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) { // ensureSpaceOnLine makes sure we have enough space on the current line for a certain number of runes, adding a line break if needed.
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nrunes int) {
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 { if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
// add a line break if needed here // add a line break if needed here
remainSpace := ht.config.WordWrap - ht.columns remainSpace := ht.config.WordWrap - ht.columns
if remainSpace < nchars { if remainSpace < nrunes {
ht.emitLineBreak() ht.emitLineBreak()
} }
} }
} }
// emitMarkupData emits the markup data in the specified data structure.
func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) { func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
if !md.rescan { if !md.rescan {
ht.ensureSpaceOnLine(len(md.text)) ht.ensureSpaceOnLine(utf8.RuneCountInString(md.text))
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false) ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
ht.emitString(md.text, ht.outputFilters, true) ht.emitString(md.text, ht.outputFilters, true)
ht.emitString(md.endMarkup, ht.rawOutputFilters, false) ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
} }
} }
// emitBracketedMarkupData emits the markup data in the specified data structure, with prefix and suffix runes.
func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) { func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
if !md.rescan { if !md.rescan {
l := len(md.text) l := utf8.RuneCountInString(md.text)
if l > 0 { if l > 0 {
l += 2 l += 2
} }
@@ -330,6 +386,7 @@ func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune,
} }
} }
// doFlushWhitespace flushes out all the whitespace in the temporary buffer.
func (ht *htmlCheckerImpl) doFlushWhitespace() { func (ht *htmlCheckerImpl) doFlushWhitespace() {
outputLen := ht.tempBuffer.Len() outputLen := ht.tempBuffer.Len()
if outputLen > 0 { if outputLen > 0 {
@@ -356,6 +413,7 @@ func (ht *htmlCheckerImpl) doFlushWhitespace() {
} }
} }
// doFlushNewlines flushes all the newlines that are in the temporary buffer.
func (ht *htmlCheckerImpl) doFlushNewlines() { func (ht *htmlCheckerImpl) doFlushNewlines() {
// Measure the number of line breaks we have. // Measure the number of line breaks we have.
lineBreaks, crs := 0, 0 lineBreaks, crs := 0, 0
@@ -385,6 +443,7 @@ func (ht *htmlCheckerImpl) doFlushNewlines() {
} }
} }
// emit line breaks
for lineBreaks > 0 { for lineBreaks > 0 {
ht.emitLineBreak() ht.emitLineBreak()
lineBreaks-- lineBreaks--
@@ -393,6 +452,7 @@ func (ht *htmlCheckerImpl) doFlushNewlines() {
ht.state = stateWhitespace ht.state = stateWhitespace
} }
// emitFromStartOfTempBuffer emits a certain number of runes from the start of the temporary buffer.
func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) { func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
if nrunes > 0 { if nrunes > 0 {
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 { if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
@@ -420,6 +480,7 @@ func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
} }
} }
// attemptRewrite attempts to apply a list of rewriters on the text, returning the first one that matches.
func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData { func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
for _, r := range rewriters { for _, r := range rewriters {
rc := r.Rewrite(data, ht) rc := r.Rewrite(data, ht)
@@ -430,6 +491,7 @@ func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *ma
return nil return nil
} }
// doFlushString attempts to flush a string from the temporary buffer.
func (ht *htmlCheckerImpl) doFlushString() bool { func (ht *htmlCheckerImpl) doFlushString() bool {
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String()) md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
if md != nil { if md != nil {
@@ -519,9 +581,11 @@ func (ht *htmlCheckerImpl) doFlushString() bool {
return false return false
} }
// handleAsHTML attempts to handle the contents of the tag in the temporary buffer as HTML.
func (ht *htmlCheckerImpl) handleAsHTML() bool { func (ht *htmlCheckerImpl) handleAsHTML() bool {
ht.triggerWBR = false ht.triggerWBR = false
tempString := ht.tempBuffer.String() tempString := ht.tempBuffer.String()
// Figure out where the start of the command word is. // Figure out where the start of the command word is.
startCmd := 0 startCmd := 0
closingTag := false closingTag := false
@@ -543,8 +607,7 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
// command word is empty or is too long to be an HTML tag // command word is empty or is too long to be an HTML tag
return false return false
} }
possTagName := tempString[startCmd:endCmd] tagIndex, ok := tagNameToIndex[strings.ToUpper(tempString[startCmd:endCmd])]
tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)]
if !ok { if !ok {
// not a known HTML tag // not a known HTML tag
return false return false
@@ -584,6 +647,7 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
ht.emitString(realTagData, ht.rawOutputFilters, false) ht.emitString(realTagData, ht.rawOutputFilters, false)
ht.emitRune('>', ht.rawOutputFilters, false) ht.emitRune('>', ht.rawOutputFilters, false)
// Determine whether this tag causes a "logical line break."
logicalLineBreak := false logicalLineBreak := false
if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 { if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
// word break is logical line break, but only within no-break tags // word break is logical line break, but only within no-break tags
@@ -597,10 +661,12 @@ func (ht *htmlCheckerImpl) handleAsHTML() bool {
return true return true
} }
// containsHTMLComment returns true if the temporary buffer contains (the start of) an HTML comment.
func (ht *htmlCheckerImpl) containsHTMLComment() bool { func (ht *htmlCheckerImpl) containsHTMLComment() bool {
return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--") return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
} }
// containsCompleteHTMLComment returns true if the temporary buffer contains a complete HTML comment.
func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool { func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
if ht.tempBuffer.Len() >= 5 { if ht.tempBuffer.Len() >= 5 {
s := ht.tempBuffer.String() s := ht.tempBuffer.String()
@@ -609,6 +675,7 @@ func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
return false return false
} }
// containsXMLConstruct returns true if the temporary buffer contains an XML-style namespaced tag.
func (ht *htmlCheckerImpl) containsXMLConstruct() bool { func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
tempString := ht.tempBuffer.String() tempString := ht.tempBuffer.String()
ptr := 0 ptr := 0
@@ -626,19 +693,18 @@ func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
return false return false
} }
// finishTag processes and outputs the tag in the temporary buffer.
func (ht *htmlCheckerImpl) finishTag() { func (ht *htmlCheckerImpl) finishTag() {
if ht.containsHTMLComment() { if ht.containsHTMLComment() {
if ht.containsCompleteHTMLComment() { if ht.containsCompleteHTMLComment() && !ht.config.DiscardComments {
if !ht.config.DiscardComments {
// output the comment in the raw // output the comment in the raw
ht.emitRune('<', ht.rawOutputFilters, false) ht.emitRune('<', ht.rawOutputFilters, false)
ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false) ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
ht.emitRune('>', ht.rawOutputFilters, false) ht.emitRune('>', ht.rawOutputFilters, false)
// clear state and retun to parsing // clear state and return to parsing
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
} }
}
return return
} }
if ht.handleAsHTML() { if ht.handleAsHTML() {
@@ -680,6 +746,7 @@ func (ht *htmlCheckerImpl) finishTag() {
ht.parse(">") ht.parse(">")
} }
// finishParen processes and outputs the parenthesized construct in the temporary buffer.
func (ht *htmlCheckerImpl) finishParen() { func (ht *htmlCheckerImpl) finishParen() {
// Try to handle the element using a paren rewriter // Try to handle the element using a paren rewriter
md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String()) md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
@@ -708,6 +775,7 @@ func (ht *htmlCheckerImpl) finishParen() {
ht.parse(")") ht.parse(")")
} }
// parse handles the meat of parsing an input string; it runs the state machine on the input.
func (ht *htmlCheckerImpl) parse(str string) { func (ht *htmlCheckerImpl) parse(str string) {
i := 0 i := 0
for i < len(str) { for i < len(str) {
@@ -785,7 +853,7 @@ func (ht *htmlCheckerImpl) parse(str string) {
ht.tempBuffer.WriteByte('\\') ht.tempBuffer.WriteByte('\\')
} }
} else { } else {
// just append the backslash notrmally // just append the backslash normally
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
i++ i++
} }
@@ -801,7 +869,7 @@ func (ht *htmlCheckerImpl) parse(str string) {
case '<': // output < and stay in this state case '<': // output < and stay in this state
ht.emitRune('<', ht.outputFilters, true) ht.emitRune('<', ht.outputFilters, true)
i++ i++
default: default: // begin processing tag
ht.state = stateTag ht.state = stateTag
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
i++ i++
@@ -822,14 +890,15 @@ func (ht *htmlCheckerImpl) parse(str string) {
} }
case stateParen: case stateParen:
switch ch { switch ch {
case '(': case '(': // nest parentheses one level deeper
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
ht.parenLevel++ ht.parenLevel++
i++ i++
case ')': case ')':
if ht.parenLevel == 0 { if ht.parenLevel == 0 {
ht.finishParen() ht.finishParen() // finish paren, changing state and recursively parsing if necessary
} else { } else {
// nest parentheses one LESS level deeper
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
ht.parenLevel-- ht.parenLevel--
} }
@@ -851,10 +920,23 @@ func (ht *htmlCheckerImpl) parse(str string) {
} else { } else {
ht.doFlushNewlines() ht.doFlushNewlines()
} }
default:
log.Fatalf("invalid parser state: %d", ht.state)
} }
} }
} }
/*----------------------------------------------------------------------------
* Implementations from the HTMLChecker interface
*----------------------------------------------------------------------------
*/
/* Append adds an additional string to the HTML checker data.
* Parameters:
* str - The string to be added and parsed.
* Returns:
* Standard Go error status.
*/
func (ht *htmlCheckerImpl) Append(str string) error { func (ht *htmlCheckerImpl) Append(str string) error {
if ht.finished { if ht.finished {
return ErrAlreadyFinished return ErrAlreadyFinished
@@ -868,6 +950,10 @@ func (ht *htmlCheckerImpl) Append(str string) error {
return nil return nil
} }
/* Finish completes the HTML checker parsing and makes the result available.
* Returns:
* Standard Go error status.
*/
func (ht *htmlCheckerImpl) Finish() error { func (ht *htmlCheckerImpl) Finish() error {
if ht.finished { if ht.finished {
return ErrAlreadyFinished return ErrAlreadyFinished
@@ -900,6 +986,7 @@ func (ht *htmlCheckerImpl) Finish() error {
} }
running = true running = true
case stateParen: case stateParen:
// we won't finish this, so it's automatically rejected
rejection := ht.tempBuffer.String() rejection := ht.tempBuffer.String()
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.tempBuffer.WriteByte('(') ht.tempBuffer.WriteByte('(')
@@ -923,6 +1010,7 @@ func (ht *htmlCheckerImpl) Finish() error {
return nil return nil
} }
// Reset clears the internal state of the HTML Checker.
func (ht *htmlCheckerImpl) Reset() { func (ht *htmlCheckerImpl) Reset() {
ht.started = false ht.started = false
ht.finished = false ht.finished = false
@@ -933,17 +1021,20 @@ func (ht *htmlCheckerImpl) Reset() {
ht.lines = 0 ht.lines = 0
ht.parenLevel = 0 ht.parenLevel = 0
ht.outputBuffer.Reset() ht.outputBuffer.Reset()
ht.tempBuffer.Reset()
ht.tagStack.Clear()
for u := range ht.externalReferences { for u := range ht.externalReferences {
delete(ht.externalReferences, u) delete(ht.externalReferences, u)
} }
for k := range ht.internalReferences { for k := range ht.internalReferences {
delete(ht.internalReferences, k) delete(ht.internalReferences, k)
} }
for c := range maps.Values(ht.counters) { for _, c := range ht.counters {
c.Reset() c.Reset()
} }
} }
// Value returns the value of the output from the HTML Checker.
func (ht *htmlCheckerImpl) Value() (string, error) { func (ht *htmlCheckerImpl) Value() (string, error) {
if ht.finished { if ht.finished {
return ht.outputBuffer.String(), nil return ht.outputBuffer.String(), nil
@@ -951,6 +1042,7 @@ func (ht *htmlCheckerImpl) Value() (string, error) {
return "", ErrNotYetFinished return "", ErrNotYetFinished
} }
// Length returns the length in bytes of the HTML Checker result.
func (ht *htmlCheckerImpl) Length() (int, error) { func (ht *htmlCheckerImpl) Length() (int, error) {
if ht.finished { if ht.finished {
return ht.outputBuffer.Len(), nil return ht.outputBuffer.Len(), nil
@@ -958,6 +1050,7 @@ func (ht *htmlCheckerImpl) Length() (int, error) {
return 0, ErrNotYetFinished return 0, ErrNotYetFinished
} }
// Lines returns the number of lines of text in the HTML Checker result.
func (ht *htmlCheckerImpl) Lines() (int, error) { func (ht *htmlCheckerImpl) Lines() (int, error) {
if ht.finished { if ht.finished {
return ht.lines, nil return ht.lines, nil
@@ -965,6 +1058,7 @@ func (ht *htmlCheckerImpl) Lines() (int, error) {
return 0, ErrNotYetFinished return 0, ErrNotYetFinished
} }
// Counter returns the value of a counter maintained by the HTML Checker (corresponding to a rewriter).
func (ht *htmlCheckerImpl) Counter(name string) (int, error) { func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
if ht.finished { if ht.finished {
cr, ok := ht.counters[name] cr, ok := ht.counters[name]
@@ -976,19 +1070,22 @@ func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
return 0, ErrNotYetFinished return 0, ErrNotYetFinished
} }
// GetContext returns an HTML checker context value.
func (ht *htmlCheckerImpl) GetContext(name string) any { func (ht *htmlCheckerImpl) GetContext(name string) any {
return ht.contextData[name] return ht.contextData[name]
} }
// SetContext sets an HTML checker context value.
func (ht *htmlCheckerImpl) SetContext(name string, value any) { func (ht *htmlCheckerImpl) SetContext(name string, value any) {
ht.contextData[name] = value ht.contextData[name] = value
} }
// ExternalRefs returns a list of URLs as external references in the parsed text.
func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) { func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
if ht.finished { if ht.finished {
rc := make([]*url.URL, len(ht.externalReferences)) rc := make([]*url.URL, len(ht.externalReferences))
p := 0 p := 0
for url := range maps.Keys(ht.externalReferences) { for url := range ht.externalReferences {
rc[p] = url rc[p] = url
p++ p++
} }
@@ -997,11 +1094,12 @@ func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
return nil, ErrNotYetFinished return nil, ErrNotYetFinished
} }
// InternalRefs returns a list of internal references in the parsed text.
func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) { func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
if ht.finished { if ht.finished {
rc := make([]string, len(ht.internalReferences)) rc := make([]string, len(ht.internalReferences))
p := 0 p := 0
for s := range maps.Keys(ht.internalReferences) { for s := range ht.internalReferences {
rc[p] = s rc[p] = s
p++ p++
} }
+1
View File
@@ -42,6 +42,7 @@ type HTMLCheckerConfigFile struct {
Configs []HTMLCheckerConfig `yaml:"configs"` Configs []HTMLCheckerConfig `yaml:"configs"`
} }
// defaultAnchorTail is the default value of the anchor tail.
const defaultAnchorTail = "TARGET=\"Wander\"" const defaultAnchorTail = "TARGET=\"Wander\""
//go:embed configs.yaml //go:embed configs.yaml
+1 -1
View File
@@ -32,7 +32,7 @@ type EmoticonConfig struct {
emos map[string]*EmoticonDef emos map[string]*EmoticonDef
} }
// emoticonRewriter is the implementation of rewriter in this file // emoticonRewriter is the implementation of rewriter in this file.
type emoticonRewriter struct { type emoticonRewriter struct {
config *EmoticonConfig config *EmoticonConfig
prefixChars []byte prefixChars []byte
+1 -1
View File
@@ -33,7 +33,7 @@ type htmlEncodingFilter struct{}
// htmlEscapedChars is a list of HTML characters that are escaped. // htmlEscapedChars is a list of HTML characters that are escaped.
const htmlEscapedChars = "<>&" const htmlEscapedChars = "<>&"
// tryOutputCharacter outputs a character that needs to be escaped. // tryOutputRune outputs a rune that needs to be escaped.
func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool { func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool {
switch ch { switch ch {
case '<': case '<':
+12 -12
View File
@@ -38,10 +38,10 @@ const (
tagSetNSCPForms = 16 // Netscape form tags tagSetNSCPForms = 16 // Netscape form tags
tagSetNSCPBlockFormat = 17 // Netscape block-formatting tags tagSetNSCPBlockFormat = 17 // Netscape block-formatting tags
tagSetNSCPServer = 18 // the Netscape <SERVER> tag tagSetNSCPServer = 18 // the Netscape <SERVER> tag
tagSetMSFTDocFormat = 19 // Microsoft-specific document formatting tagSetMSFTDocFormat = 19 // Micro$oft-specific document formatting
tagSetMSFTInlineFormat = 20 // Microsoft-specific inline formatting tagSetMSFTInlineFormat = 20 // Micro$oft-specific inline formatting
tagSetMSFTBlockFormat = 21 // Microsoft-specific block formatting tagSetMSFTBlockFormat = 21 // Micro$oft-specific block formatting
tagSetMSFTActiveContent = 22 // Microsoft-specific active content tagSetMSFTActiveContent = 22 // Micro$oft-specific active content
tagSetServerPage = 23 // server-side page use tagSetServerPage = 23 // server-side page use
tagSetJavaServer = 24 // Java server page use tagSetJavaServer = 24 // Java server page use
tagSetComment = 25 // HTML comments tagSetComment = 25 // HTML comments
@@ -54,14 +54,14 @@ type rewriteContentsFunc func(*tag, string, bool, htmlCheckerBackend) string
// tag is a structure describing a particular HTML tag. // tag is a structure describing a particular HTML tag.
type tag struct { type tag struct {
name string name string // tag name, upper case
index int index int // index in the array
lineBreak bool lineBreak bool // does the tag cause line breaks?
allowClose bool allowClose bool // is a close form of the tag allowed?
balanceTags bool balanceTags bool // do we need to balance open and close tags?
clb causeLineBreakFunc clb causeLineBreakFunc // does this tag cause line breaks?
ct closingTagFunc ct closingTagFunc // generate closing tag
rwc rewriteContentsFunc rwc rewriteContentsFunc // rewrite the contents if necessary
} }
// causeLineBreak returns true if the tag causes a line break. // causeLineBreak returns true if the tag causes a line break.
+7
View File
@@ -43,6 +43,7 @@ func (stk *Stack[T]) Peek() (T, bool) {
return stk.elements[len(stk.elements)-1], true return stk.elements[len(stk.elements)-1], true
} }
// RemoveMostRecent looks for the most recent particular data element on the stack, and removes that.
func (stk *Stack[T]) RemoveMostRecent(data T) bool { func (stk *Stack[T]) RemoveMostRecent(data T) bool {
i := len(stk.elements) - 1 i := len(stk.elements) - 1
for i >= 0 { for i >= 0 {
@@ -58,10 +59,16 @@ func (stk *Stack[T]) RemoveMostRecent(data T) bool {
} }
return true return true
} }
i--
} }
return false return false
} }
// Clear clears out the stack.
func (stk *Stack[T]) Clear() {
stk.elements = make([]T, 0)
}
// NewStack creates and returns a new stack. // NewStack creates and returns a new stack.
func NewStack[T comparable]() *Stack[T] { func NewStack[T comparable]() *Stack[T] {
return &Stack[T]{ return &Stack[T]{
+17
View File
@@ -94,10 +94,18 @@ func RunesToBytes(s string, runeCount int) int {
return bp return bp
} }
// IsRuneWord returns true if the given rune is part of a word.
func IsRuneWord(ch rune) bool { func IsRuneWord(ch rune) bool {
return unicode.IsLetter(ch) || ch == '-' || ch == '\'' return unicode.IsLetter(ch) || ch == '-' || ch == '\''
} }
/* WordRunLength calculates the number of runes at the start of the string that are either word or non-word characters.
* Parameters:
* s - The string under test.
* Returns:
* The run length in runes.
* true if the run is a length of word characters, false if it's a run of non-word characters.
*/
func WordRunLength(s string) (int, bool) { func WordRunLength(s string) (int, bool) {
c1, initLen := utf8.DecodeRuneInString(s) c1, initLen := utf8.DecodeRuneInString(s)
wordChar := IsRuneWord(c1) wordChar := IsRuneWord(c1)
@@ -111,6 +119,15 @@ func WordRunLength(s string) (int, bool) {
return rlen, wordChar return rlen, wordChar
} }
/* WordRunLengthAfterPrefix calculates the number of runes after a certain number in the string
* that are either word or non-word characters.
* Parameters:
* s - The string under test.
* nrunes - The number of runes to skip at the start of the string.
* Returns:
* The run length in runes.
* true if the run is a length of word characters, false if it's a run of non-word characters.
*/
func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) { func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) {
ofs := 0 ofs := 0
for _, ch := range s { for _, ch := range s {