HTML checker is code complete, needs a pass for documentation
This commit is contained in:
+779
-22
@@ -12,8 +12,11 @@ package htmlcheck
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"net/url"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"git.erbosoft.com/amy/amsterdam/util"
|
||||
"github.com/bits-and-blooms/bitset"
|
||||
@@ -35,8 +38,8 @@ type HTMLChecker interface {
|
||||
InternalRefs() ([]string, error)
|
||||
}
|
||||
|
||||
var AlreadyFinished = errors.New("the HTML checker has already finished")
|
||||
var NotYetFinished = errors.New("the HTML checker has not yet been finished")
|
||||
var ErrAlreadyFinished = errors.New("the HTML checker has already finished")
|
||||
var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished")
|
||||
|
||||
type htmlCheckerBackend interface {
|
||||
getCheckerAttrValue(string) string
|
||||
@@ -60,6 +63,9 @@ const (
|
||||
// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
|
||||
const htmlMarginSlop = 5
|
||||
|
||||
// hyphApos is used to find hyphens and apostrophes.
|
||||
const hyphApos = "-'"
|
||||
|
||||
type htmlCheckerImpl struct {
|
||||
config *HTMLCheckerConfig
|
||||
started bool
|
||||
@@ -80,6 +86,7 @@ type htmlCheckerImpl struct {
|
||||
tagRewriters []rewriter
|
||||
parenRewriters []rewriter
|
||||
outputFilters []outputFilter
|
||||
rawOutputFilters []outputFilter
|
||||
contextData map[string]any
|
||||
externalReferences map[*url.URL]bool
|
||||
internalReferences map[string]bool
|
||||
@@ -102,6 +109,17 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) {
|
||||
for i := range source {
|
||||
f, ok := outputFilterRegistry[source[i]]
|
||||
if ok {
|
||||
dest[i] = f
|
||||
} else {
|
||||
log.Errorf("filter %s is not found", source[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
config, ok := configsRegistry[configName]
|
||||
if !ok {
|
||||
@@ -128,6 +146,7 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
tagRewriters: make([]rewriter, len(config.TagRewriters)),
|
||||
parenRewriters: make([]rewriter, len(config.ParenRewriters)),
|
||||
outputFilters: make([]outputFilter, len(config.OutputFilters)),
|
||||
rawOutputFilters: make([]outputFilter, len(config.RawOutputFilters)),
|
||||
contextData: make(map[string]any),
|
||||
externalReferences: make(map[*url.URL]bool),
|
||||
internalReferences: make(map[string]bool),
|
||||
@@ -137,15 +156,66 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
|
||||
rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
|
||||
rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
|
||||
for i := range config.OutputFilters {
|
||||
f, ok := outputFilterRegistry[config.OutputFilters[i]]
|
||||
if ok {
|
||||
rc.outputFilters[i] = f
|
||||
} else {
|
||||
log.Errorf("filter %s is not found", config.OutputFilters[i])
|
||||
rc.copyOutputFilters(rc.outputFilters, config.OutputFilters)
|
||||
rc.copyOutputFilters(rc.rawOutputFilters, config.RawOutputFilters)
|
||||
return &rc, nil
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string {
|
||||
if name == "ANCHORTAIL" {
|
||||
return ht.config.AnchorTail
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) sendTagMessage(msg string) {
|
||||
switch msg {
|
||||
case "NOBR":
|
||||
ht.noBreakCount++
|
||||
case "/NOBR":
|
||||
ht.noBreakCount--
|
||||
case "WBR":
|
||||
ht.triggerWBR = true
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) {
|
||||
ht.externalReferences[ref] = true
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) addInternalRef(ref string) {
|
||||
ht.internalReferences[ref] = true
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string {
|
||||
return ht.getCheckerAttrValue(name)
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) rewriterContextValue(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) {
|
||||
handled := false
|
||||
if len(filters) > 0 {
|
||||
// try each output filter to see what we can do
|
||||
for _, of := range filters {
|
||||
handled = of.tryOutputRune(ht.outputBuffer, ch)
|
||||
if handled {
|
||||
break // found a filter to handle it, done
|
||||
}
|
||||
}
|
||||
if !handled { // output the raw character
|
||||
ht.outputBuffer.WriteRune(ch)
|
||||
}
|
||||
if countCols && ht.config.WordWrap > 0 {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
return &rc
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
|
||||
@@ -154,42 +224,52 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
|
||||
}
|
||||
realCountCols := countCols && (ht.config.WordWrap > 0)
|
||||
if len(filters) == 0 {
|
||||
// if there are no filters, just output the whole thing
|
||||
ht.outputBuffer.WriteString(str)
|
||||
if realCountCols {
|
||||
ht.columns += len(str)
|
||||
ht.columns += utf8.RuneCountInString(str)
|
||||
}
|
||||
return
|
||||
}
|
||||
temp := str
|
||||
for len(temp) > 0 {
|
||||
// We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing.
|
||||
outputLen := len(temp)
|
||||
|
||||
// Now look at each of the output filters to see if we should try outputting a lesser amount
|
||||
// (i.e. does the string contain a "stopper" that one of the filters would like to mogrify?)
|
||||
var stopper outputFilter = nil
|
||||
for _, of := range filters {
|
||||
// find the length of characters that DOESN'T match this filter
|
||||
lnm := of.lengthNoMatch(temp)
|
||||
if lnm >= 0 && lnm < outputLen {
|
||||
// we've found a new stopper - record the length and the filter
|
||||
outputLen = lnm
|
||||
stopper = of
|
||||
}
|
||||
if outputLen <= 0 {
|
||||
break
|
||||
break // nothing left to do here
|
||||
}
|
||||
}
|
||||
if outputLen > 0 {
|
||||
// move over the unaltered characters first
|
||||
ht.outputBuffer.WriteString(temp[:outputLen])
|
||||
if realCountCols {
|
||||
ht.columns += outputLen
|
||||
ht.columns += utf8.RuneCountInString(temp[:outputLen])
|
||||
}
|
||||
}
|
||||
if stopper != nil {
|
||||
tmpch := temp[outputLen]
|
||||
outputLen++
|
||||
if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
|
||||
ht.outputBuffer.WriteByte(tmpch)
|
||||
// one of the output filters stopped us, try invoking it
|
||||
tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:])
|
||||
outputLen += bsiz
|
||||
if !stopper.tryOutputRune(ht.outputBuffer, tmpch) {
|
||||
ht.outputBuffer.WriteRune(tmpch)
|
||||
}
|
||||
if realCountCols {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
// Chop the string and go around again.
|
||||
if outputLen == len(temp) {
|
||||
temp = ""
|
||||
} else if outputLen > 0 {
|
||||
@@ -199,7 +279,11 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitLineBreak() {
|
||||
|
||||
ht.emitString("\r\n", ht.rawOutputFilters, false)
|
||||
if ht.config.WordWrap > 0 {
|
||||
ht.columns = 0
|
||||
}
|
||||
ht.lines++
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
||||
@@ -208,17 +292,572 @@ func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
||||
// add a line break if needed here
|
||||
remainSpace := ht.config.WordWrap - ht.columns
|
||||
if remainSpace < nchars {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) {
|
||||
if !md.rescan {
|
||||
ht.ensureSpaceOnLine(len(md.text))
|
||||
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
|
||||
ht.emitString(md.text, ht.outputFilters, true)
|
||||
ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) {
|
||||
if !md.rescan {
|
||||
l := len(md.text)
|
||||
if l > 0 {
|
||||
l += 2
|
||||
}
|
||||
ht.ensureSpaceOnLine(l)
|
||||
if len(md.text) > 0 {
|
||||
ht.emitRune(prefix, ht.outputFilters, true)
|
||||
}
|
||||
ht.emitString(md.beginMarkup, ht.rawOutputFilters, false)
|
||||
ht.emitString(md.text, ht.outputFilters, true)
|
||||
ht.emitString(md.endMarkup, ht.rawOutputFilters, false)
|
||||
if len(md.text) > 0 {
|
||||
ht.emitRune(suffix, ht.outputFilters, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) doFlushWhitespace() {
|
||||
outputLen := ht.tempBuffer.Len()
|
||||
if outputLen > 0 {
|
||||
forceLineBreak := false
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
||||
// adjust output if necessary for wordwrapping
|
||||
remainSpace := ht.config.WordWrap - ht.columns
|
||||
if remainSpace < outputLen {
|
||||
outputLen = remainSpace
|
||||
}
|
||||
if outputLen <= 0 {
|
||||
// this means that NONE of the whitespace would fit on this line...add a line break
|
||||
forceLineBreak = true
|
||||
outputLen = 0
|
||||
}
|
||||
}
|
||||
if forceLineBreak {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
if outputLen > 0 {
|
||||
ht.emitString(ht.tempBuffer.String()[:outputLen], ht.outputFilters, true)
|
||||
}
|
||||
ht.tempBuffer.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) doFlushNewlines() {
|
||||
// Measure the number of line breaks we have.
|
||||
lineBreaks, crs := 0, 0
|
||||
for ch := range []byte(ht.tempBuffer.String()) {
|
||||
switch ch {
|
||||
case '\r':
|
||||
crs++
|
||||
case '\n':
|
||||
crs = 0
|
||||
lineBreaks++
|
||||
}
|
||||
}
|
||||
if crs > 0 {
|
||||
lineBreaks++
|
||||
}
|
||||
|
||||
// Adjust the number of line breaks if rewrap is in effect.
|
||||
if ht.config.Rewrap {
|
||||
if lineBreaks < 2 {
|
||||
// convert a single line break to whitespace
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte(' ')
|
||||
ht.state = stateWhitespace
|
||||
return
|
||||
} else {
|
||||
lineBreaks = 2 // compress out multiple blank lines
|
||||
}
|
||||
}
|
||||
|
||||
for lineBreaks > 0 {
|
||||
ht.emitLineBreak()
|
||||
lineBreaks--
|
||||
}
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) {
|
||||
if nrunes > 0 {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 {
|
||||
for nrunes > 0 {
|
||||
curlen := min(nrunes, ht.config.WordWrap-ht.columns)
|
||||
if curlen > 0 {
|
||||
s := ht.tempBuffer.String()
|
||||
bcurlen := util.RunesToBytes(s, curlen)
|
||||
ht.emitString(s[:bcurlen], ht.outputFilters, true)
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteString(s[bcurlen:])
|
||||
nrunes -= curlen
|
||||
}
|
||||
if ht.columns >= ht.config.WordWrap {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
s := ht.tempBuffer.String()
|
||||
bnrunes := util.RunesToBytes(s, nrunes)
|
||||
ht.emitString(s[:bnrunes], ht.outputFilters, true)
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteString(s[bnrunes:])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData {
|
||||
for _, r := range rewriters {
|
||||
rc := r.Rewrite(data, ht)
|
||||
if rc != nil {
|
||||
return rc
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) doFlushString() bool {
|
||||
return false // TODO
|
||||
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
|
||||
if md != nil {
|
||||
ht.emitMarkupData(md)
|
||||
ht.tempBuffer.Reset()
|
||||
if md.rescan {
|
||||
ht.parse(md.all())
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
first := true
|
||||
for ht.tempBuffer.Len() > 0 {
|
||||
sublen, isWord := util.WordRunLength(ht.tempBuffer.String())
|
||||
if isWord {
|
||||
// we want to check the word, but first we must eliminate leading hyphens and apostrophes
|
||||
hyphCount := 0
|
||||
for _, ch := range ht.tempBuffer.String() {
|
||||
if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) {
|
||||
break
|
||||
}
|
||||
hyphCount++
|
||||
}
|
||||
ht.emitFromStartOfTempBuffer(hyphCount)
|
||||
sublen -= hyphCount
|
||||
|
||||
// now determine how many hyphens/apostrophes there are at the end of the word
|
||||
runeArray := []rune(ht.tempBuffer.String())
|
||||
wordLen := sublen
|
||||
hyphCount = 0
|
||||
for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) {
|
||||
hyphCount++
|
||||
wordLen--
|
||||
}
|
||||
|
||||
if wordLen > 0 {
|
||||
// extract the word and remove it from the start of the buffer
|
||||
word := string(runeArray[:wordLen])
|
||||
lw := len(word)
|
||||
s := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteString(s[lw:])
|
||||
|
||||
// try to rewrite this word
|
||||
md := ht.attemptRewrite(ht.wordRewriters, word)
|
||||
if md != nil {
|
||||
// emit and/or reparse
|
||||
ht.emitMarkupData(md)
|
||||
if md.rescan {
|
||||
ht.parse(md.all())
|
||||
}
|
||||
} else {
|
||||
// just output the word normally
|
||||
ht.ensureSpaceOnLine(wordLen)
|
||||
ht.emitString(word, ht.outputFilters, true)
|
||||
}
|
||||
}
|
||||
|
||||
// now emit the rest of the hyphens/apostrophes
|
||||
ht.emitFromStartOfTempBuffer(hyphCount)
|
||||
|
||||
} else {
|
||||
// emit this many characters, line-breaking where required
|
||||
totalRunes := utf8.RuneCountInString(ht.tempBuffer.String())
|
||||
if sublen == totalRunes && !first && sublen <= htmlMarginSlop {
|
||||
// This is intended to handle a small run of non-word characters at the end of a string (i.e.
|
||||
// followed by whitespace) that should stay on the same line with its preceding word, to
|
||||
// eliminate "funnies" in punctuation formatting.
|
||||
ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true)
|
||||
ht.tempBuffer.Reset()
|
||||
break
|
||||
}
|
||||
|
||||
// This is kind of the inverse of the above check; if we have a small run of non-word
|
||||
// characters at the START of a word (preceded by whitespace and followed by at least
|
||||
// one word character), then ensure that we can keep that word and its prefixing non-word
|
||||
// characters on the same line (again, avoiding "funnies" in formatting).
|
||||
if sublen < totalRunes && first && sublen <= htmlMarginSlop {
|
||||
fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen)
|
||||
ht.ensureSpaceOnLine(sublen + fwLen)
|
||||
}
|
||||
ht.emitFromStartOfTempBuffer(sublen)
|
||||
}
|
||||
first = false
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) handleAsHTML() bool {
|
||||
ht.triggerWBR = false
|
||||
tempString := ht.tempBuffer.String()
|
||||
// Figure out where the start of the command word is.
|
||||
startCmd := 0
|
||||
closingTag := false
|
||||
if startCmd < len(tempString) && tempString[startCmd] == '/' {
|
||||
startCmd++
|
||||
closingTag = true
|
||||
}
|
||||
|
||||
// now figure out where it ends
|
||||
endCmd := startCmd
|
||||
for endCmd < len(tempString) {
|
||||
if unicode.IsSpace(rune(tempString[endCmd])) {
|
||||
break
|
||||
}
|
||||
endCmd++
|
||||
}
|
||||
|
||||
if endCmd == startCmd || (endCmd-startCmd) > tagMaxLength {
|
||||
// command word is empty or is too long to be an HTML tag
|
||||
return false
|
||||
}
|
||||
possTagName := tempString[startCmd:endCmd]
|
||||
tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)]
|
||||
if !ok {
|
||||
// not a known HTML tag
|
||||
return false
|
||||
}
|
||||
tag := tagIndexToObject[tagIndex]
|
||||
if closingTag && !tag.allowClose {
|
||||
// it's a closing tag and this tag doesn't permit the "close" form
|
||||
return false
|
||||
}
|
||||
tagSetID := tagIndexToSetId[tagIndex]
|
||||
if !ht.tagSet.Test(uint(tagSetID)) {
|
||||
// the tag is not allowed - discard it, if one of the flags is set in the config
|
||||
return ht.config.DiscardHTML || ht.config.DiscardRejected
|
||||
}
|
||||
if !ht.config.DiscardHTML && tag.balanceTags {
|
||||
// this tag needs to be balanced - here's where we manipulate the stack
|
||||
var valid bool
|
||||
if closingTag {
|
||||
valid = ht.tagStack.RemoveMostRecent(tag)
|
||||
} else {
|
||||
ht.tagStack.Push(tag)
|
||||
valid = true
|
||||
}
|
||||
if !valid {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Give the tag object one last chance to dictate what we do with the tag.
|
||||
realTagData := tag.rewriteContents(tempString, closingTag, ht)
|
||||
if realTagData == "" || ht.config.DiscardHTML {
|
||||
return true
|
||||
}
|
||||
|
||||
// Emit the tag to the output.
|
||||
ht.emitRune('<', ht.rawOutputFilters, false)
|
||||
ht.emitString(realTagData, ht.rawOutputFilters, false)
|
||||
ht.emitRune('>', ht.rawOutputFilters, false)
|
||||
|
||||
logicalLineBreak := false
|
||||
if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 {
|
||||
// word break is logical line break, but only within no-break tags
|
||||
logicalLineBreak = true
|
||||
} else {
|
||||
logicalLineBreak = tag.causeLineBreak(closingTag)
|
||||
}
|
||||
if logicalLineBreak {
|
||||
ht.columns = 0
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) containsHTMLComment() bool {
|
||||
return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--")
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool {
|
||||
if ht.tempBuffer.Len() >= 5 {
|
||||
s := ht.tempBuffer.String()
|
||||
return strings.HasPrefix(s, "!--") && strings.HasSuffix(s, "--")
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
|
||||
tempString := ht.tempBuffer.String()
|
||||
ptr := 0
|
||||
if len(tempString) > 1 && tempString[0] == '/' {
|
||||
ptr++
|
||||
}
|
||||
for ptr < len(tempString) {
|
||||
if tempString[ptr] == ':' {
|
||||
return true
|
||||
} else if unicode.IsSpace(rune(tempString[ptr])) {
|
||||
break
|
||||
}
|
||||
ptr++
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) finishTag() {
|
||||
if ht.containsHTMLComment() {
|
||||
if ht.containsCompleteHTMLComment() {
|
||||
if !ht.config.DiscardComments {
|
||||
// output the comment in the raw
|
||||
ht.emitRune('<', ht.rawOutputFilters, false)
|
||||
ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false)
|
||||
ht.emitRune('>', ht.rawOutputFilters, false)
|
||||
// clear state and retun to parsing
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
if ht.handleAsHTML() {
|
||||
// this was valid HTML, we're done
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
return
|
||||
}
|
||||
|
||||
// try to handle it with a tag rewriter
|
||||
md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String())
|
||||
if md != nil {
|
||||
ht.emitBracketedMarkupData(md, '<', '>')
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
if md.rescan {
|
||||
ht.tempBuffer.WriteByte('<')
|
||||
ht.state = stateChars
|
||||
ht.parse(md.all() + ">")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if ht.config.DiscardXML && ht.containsXMLConstruct() {
|
||||
// this tag is an XML construct, and needs to be discarded
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
return
|
||||
}
|
||||
|
||||
// This tag has been rejected! process it normally as character data
|
||||
rejection := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte('<')
|
||||
ht.state = stateChars
|
||||
if len(rejection) > 0 {
|
||||
ht.parse(rejection)
|
||||
}
|
||||
ht.parse(">")
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) finishParen() {
|
||||
// Try to handle the element using a paren rewriter
|
||||
md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
|
||||
if md != nil {
|
||||
ht.emitBracketedMarkupData(md, '(', ')')
|
||||
ht.tempBuffer.Reset()
|
||||
ht.state = stateWhitespace
|
||||
ht.parenLevel = 0
|
||||
if md.rescan {
|
||||
ht.tempBuffer.WriteByte('(')
|
||||
ht.state = stateChars
|
||||
ht.parse(md.all() + ")")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Tag rejected! Process it normally as character data.
|
||||
rejection := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte('(')
|
||||
ht.state = stateChars
|
||||
ht.parenLevel = 0
|
||||
if len(rejection) > 0 {
|
||||
ht.parse(rejection)
|
||||
}
|
||||
ht.parse(")")
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) parse(str string) {
|
||||
|
||||
i := 0
|
||||
for i < len(str) {
|
||||
ch := str[i]
|
||||
switch ht.state {
|
||||
case stateWhitespace:
|
||||
switch ch {
|
||||
case ' ', '\t': // append space and tab verbatim
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
case '\r', '\n': // flush and go to Newline state
|
||||
ht.doFlushWhitespace()
|
||||
ht.state = stateNewline
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
case '<':
|
||||
ht.doFlushWhitespace()
|
||||
if ht.config.Angles {
|
||||
ht.state = stateLeftAngle
|
||||
} else {
|
||||
// process < as ordinary character
|
||||
ht.state = stateChars
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
}
|
||||
i++
|
||||
case '(':
|
||||
ht.doFlushWhitespace()
|
||||
if ht.config.Parens {
|
||||
ht.state = stateParen
|
||||
} else {
|
||||
// process ( as ordinary character)
|
||||
ht.state = stateChars
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
}
|
||||
i++
|
||||
case '\\': // backslash processing is tricky - go to Chars state to handle it
|
||||
ht.doFlushWhitespace()
|
||||
ht.state = stateChars
|
||||
default:
|
||||
ht.doFlushWhitespace()
|
||||
ht.state = stateChars
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
case stateChars:
|
||||
switch ch {
|
||||
case ' ', '\t': // go to Whitespace state
|
||||
ht.doFlushString()
|
||||
ht.state = stateWhitespace
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
case '\r', '\n': // go to Newline state
|
||||
ht.doFlushString()
|
||||
ht.state = stateNewline
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
case '<': // may be a start of tag
|
||||
if ht.config.Angles {
|
||||
ht.doFlushString()
|
||||
ht.state = stateLeftAngle
|
||||
} else {
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
}
|
||||
i++
|
||||
case '\\':
|
||||
if i < (len(str) - 1) {
|
||||
i++
|
||||
ch = str[i]
|
||||
if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) {
|
||||
// append the escaped character, omitting the backslash
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
} else {
|
||||
// append the backslash and hit the new character
|
||||
ht.tempBuffer.WriteByte('\\')
|
||||
}
|
||||
} else {
|
||||
// just append the backslash notrmally
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
default: // just append the next character
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
case stateLeftAngle:
|
||||
switch ch {
|
||||
case ' ', '\t', '\r', '\n': // output <, go to Whitespace state
|
||||
ht.emitRune('<', ht.outputFilters, true)
|
||||
ht.state = stateWhitespace
|
||||
case '<': // output < and stay in this state
|
||||
ht.emitRune('<', ht.outputFilters, true)
|
||||
i++
|
||||
default:
|
||||
ht.state = stateTag
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
case stateTag:
|
||||
switch ch {
|
||||
case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
|
||||
ht.finishTag()
|
||||
i++
|
||||
case '\'', '"': // go into "quote string" state inside the tag
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
ht.state = stateTagQuote
|
||||
ht.quoteChar = ch
|
||||
i++
|
||||
default: // just append the character
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
case stateParen:
|
||||
switch ch {
|
||||
case '(':
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
ht.parenLevel++
|
||||
i++
|
||||
case ')':
|
||||
if ht.parenLevel == 0 {
|
||||
ht.finishParen()
|
||||
} else {
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
ht.parenLevel--
|
||||
}
|
||||
i++
|
||||
default:
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
}
|
||||
case stateTagQuote:
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
if ch == ht.quoteChar {
|
||||
ht.state = stateTag
|
||||
}
|
||||
i++
|
||||
case stateNewline:
|
||||
if ch == '\r' || ch == '\n' {
|
||||
ht.tempBuffer.WriteByte(ch)
|
||||
i++
|
||||
} else {
|
||||
ht.doFlushNewlines()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Append(str string) error {
|
||||
if ht.finished {
|
||||
return AlreadyFinished
|
||||
return ErrAlreadyFinished
|
||||
}
|
||||
if !ht.started {
|
||||
ht.started = true
|
||||
@@ -231,7 +870,7 @@ func (ht *htmlCheckerImpl) Append(str string) error {
|
||||
|
||||
func (ht *htmlCheckerImpl) Finish() error {
|
||||
if ht.finished {
|
||||
return AlreadyFinished
|
||||
return ErrAlreadyFinished
|
||||
}
|
||||
if !ht.started {
|
||||
ht.started = true
|
||||
@@ -247,7 +886,125 @@ func (ht *htmlCheckerImpl) Finish() error {
|
||||
case stateChars:
|
||||
running = ht.doFlushString() // flush the temporary buffer
|
||||
case stateLeftAngle:
|
||||
|
||||
// just emit a left angle character
|
||||
ht.emitPossibleLineBreak()
|
||||
ht.emitRune('<', ht.outputFilters, true)
|
||||
case stateTag, stateTagQuote:
|
||||
// we won't finish this tag, so it's automagically rejected
|
||||
rejection := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte('<')
|
||||
ht.state = stateChars
|
||||
if len(rejection) > 0 {
|
||||
ht.parse(rejection)
|
||||
}
|
||||
running = true
|
||||
case stateParen:
|
||||
rejection := ht.tempBuffer.String()
|
||||
ht.tempBuffer.Reset()
|
||||
ht.tempBuffer.WriteByte('(')
|
||||
ht.state = stateChars
|
||||
ht.parenLevel = 0
|
||||
if len(rejection) > 0 {
|
||||
ht.parse(rejection)
|
||||
}
|
||||
running = true
|
||||
}
|
||||
}
|
||||
|
||||
// Now close all the HTML tags that were left open.
|
||||
for !ht.tagStack.IsEmpty() {
|
||||
tag, _ := ht.tagStack.Pop()
|
||||
ht.outputBuffer.WriteString(tag.makeClosingTag())
|
||||
}
|
||||
|
||||
ht.lines++
|
||||
ht.finished = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Reset() {
|
||||
ht.started = false
|
||||
ht.finished = false
|
||||
ht.triggerWBR = false
|
||||
ht.state = stateWhitespace
|
||||
ht.quoteChar = byte(0)
|
||||
ht.columns = 0
|
||||
ht.lines = 0
|
||||
ht.parenLevel = 0
|
||||
ht.outputBuffer.Reset()
|
||||
for u := range ht.externalReferences {
|
||||
delete(ht.externalReferences, u)
|
||||
}
|
||||
for k := range ht.internalReferences {
|
||||
delete(ht.internalReferences, k)
|
||||
}
|
||||
for c := range maps.Values(ht.counters) {
|
||||
c.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Value() (string, error) {
|
||||
if ht.finished {
|
||||
return ht.outputBuffer.String(), nil
|
||||
}
|
||||
return "", ErrNotYetFinished
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Length() (int, error) {
|
||||
if ht.finished {
|
||||
return ht.outputBuffer.Len(), nil
|
||||
}
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Lines() (int, error) {
|
||||
if ht.finished {
|
||||
return ht.lines, nil
|
||||
}
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Counter(name string) (int, error) {
|
||||
if ht.finished {
|
||||
cr, ok := ht.counters[name]
|
||||
if ok {
|
||||
return cr.GetCount(), nil
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
return 0, ErrNotYetFinished
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) GetContext(name string) any {
|
||||
return ht.contextData[name]
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) SetContext(name string, value any) {
|
||||
ht.contextData[name] = value
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) {
|
||||
if ht.finished {
|
||||
rc := make([]*url.URL, len(ht.externalReferences))
|
||||
p := 0
|
||||
for url := range maps.Keys(ht.externalReferences) {
|
||||
rc[p] = url
|
||||
p++
|
||||
}
|
||||
return rc, nil
|
||||
}
|
||||
return nil, ErrNotYetFinished
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) {
|
||||
if ht.finished {
|
||||
rc := make([]string, len(ht.internalReferences))
|
||||
p := 0
|
||||
for s := range maps.Keys(ht.internalReferences) {
|
||||
rc[p] = s
|
||||
p++
|
||||
}
|
||||
}
|
||||
return nil, ErrNotYetFinished
|
||||
}
|
||||
|
||||
+23
-16
@@ -17,22 +17,24 @@ import (
|
||||
|
||||
// HTMLCheckerConfig is a configuration that may be used with the HTML Checker.
// Configurations are loaded from YAML and registered under their Name.
type HTMLCheckerConfig struct {
	// Name is the key under which the configuration is registered.
	Name string `yaml:"name"`
	// WordWrap is the column at which output is wrapped; zero or less disables wrapping.
	WordWrap int `yaml:"wordWrap"`
	// Rewrap turns a single line break into a space and compresses longer
	// runs of line breaks down to two.
	Rewrap bool `yaml:"rewrap"`
	// Angles makes '<' start a tag; when false it is ordinary text.
	Angles bool `yaml:"angles"`
	// Parens makes '(' start a parenthesized element; when false it is ordinary text.
	Parens bool `yaml:"parens"`
	// DiscardHTML drops recognized HTML tags instead of writing them to the output.
	DiscardHTML bool `yaml:"discardHTML"`
	// DiscardRejected drops known tags that are not in the allowed tag set.
	DiscardRejected bool `yaml:"discardRejected"`
	// DiscardComments drops HTML comments instead of writing them to the output.
	DiscardComments bool `yaml:"discardComments"`
	// DiscardXML drops otherwise unhandled tags whose first word contains a ':'.
	DiscardXML bool `yaml:"discardXML"`
	// OutputFilters names the registered filters applied to text output.
	OutputFilters []string `yaml:"outputFilters"`
	// RawOutputFilters names the registered filters applied to raw markup output.
	RawOutputFilters []string `yaml:"rawOutputFilters"`
	// StringRewriters names the rewriters tried on each whole non-whitespace string.
	StringRewriters []string `yaml:"stringRewriters"`
	// WordRewriters names the rewriters tried on each individual word.
	WordRewriters []string `yaml:"wordRewriters"`
	// TagRewriters names the rewriters tried on angle-bracketed text that is not valid HTML.
	TagRewriters []string `yaml:"tagRewriters"`
	// ParenRewriters names the rewriters tried on parenthesized elements.
	ParenRewriters []string `yaml:"parenRewriters"`
	// TagSet presumably names the set of allowed HTML tags - TODO confirm against the loader.
	TagSet string `yaml:"tagSet"`
	// DisallowTags presumably lists tags removed from TagSet - TODO confirm against the loader.
	DisallowTags []string `yaml:"disallowTags"`
	// AnchorTail is the value handed out for the "ANCHORTAIL" checker
	// attribute; an empty value is replaced by a default when configurations load.
	AnchorTail string `yaml:"anchorTail"`
}
|
||||
|
||||
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
|
||||
@@ -40,6 +42,8 @@ type HTMLCheckerConfigFile struct {
|
||||
Configs []HTMLCheckerConfig `yaml:"configs"`
|
||||
}
|
||||
|
||||
const defaultAnchorTail = "TARGET=\"Wander\""
|
||||
|
||||
//go:embed configs.yaml
|
||||
var configData []byte
|
||||
|
||||
@@ -55,5 +59,8 @@ func init() {
|
||||
}
|
||||
for i := range cfgdata.Configs {
|
||||
configsRegistry[cfgdata.Configs[i].Name] = &(cfgdata.Configs[i])
|
||||
if cfgdata.Configs[i].AnchorTail == "" {
|
||||
cfgdata.Configs[i].AnchorTail = defaultAnchorTail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+7
-7
@@ -13,8 +13,8 @@ import "strings"
|
||||
|
||||
// outputFilter is the interface for an HTML checker output filter.
//
// NOTE(review): strings.Builder is passed by value here. The strings package
// documents that a non-zero Builder must not be copied, so if callers pass a
// Builder value rather than something that shares state, writes made by a
// filter would panic or be lost - confirm the buffer type at the call sites
// and consider taking *strings.Builder.
type outputFilter interface {
	// tryOutputRune writes the filter's replacement for the rune to the
	// buffer and reports true, or reports false if it does not handle the rune.
	tryOutputRune(strings.Builder, rune) bool
	// matchRune reports whether the filter would handle the rune.
	matchRune(rune) bool
	// lengthNoMatch returns the length of the leading part of the string
	// that contains nothing the filter handles.
	lengthNoMatch(string) int
}
|
||||
|
||||
@@ -34,7 +34,7 @@ type htmlEncodingFilter struct{}
|
||||
const htmlEscapedChars = "<>&"
|
||||
|
||||
// tryOutputRune outputs a rune that needs to be escaped.
|
||||
func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bool {
|
||||
func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool {
|
||||
switch ch {
|
||||
case '<':
|
||||
buf.WriteString("<")
|
||||
@@ -49,15 +49,15 @@ func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bo
|
||||
}
|
||||
|
||||
// matchRune returns true if this rune needs to be escaped.
|
||||
func (f *htmlEncodingFilter) matchCharacter(ch byte) bool {
|
||||
return strings.IndexByte(htmlEscapedChars, ch) >= 0
|
||||
func (f *htmlEncodingFilter) matchRune(ch rune) bool {
|
||||
return strings.ContainsRune(htmlEscapedChars, ch)
|
||||
}
|
||||
|
||||
// lengthNoMatch returns the maximum length of unmatched characters at the start of the string.
|
||||
func (f *htmlEncodingFilter) lengthNoMatch(s string) int {
|
||||
rc := len(s)
|
||||
for _, c := range []byte(htmlEscapedChars) {
|
||||
tmp := strings.IndexByte(s, c)
|
||||
for _, c := range htmlEscapedChars {
|
||||
tmp := strings.IndexRune(s, c)
|
||||
if tmp >= 0 && tmp < rc {
|
||||
rc = tmp
|
||||
if rc == 0 {
|
||||
|
||||
@@ -26,6 +26,10 @@ type markupData struct {
|
||||
rescan bool
|
||||
}
|
||||
|
||||
func (md *markupData) all() string {
|
||||
return md.beginMarkup + md.text + md.endMarkup
|
||||
}
|
||||
|
||||
// rewriterServices is an interface that provides services to rewriters.
|
||||
type rewriterServices interface {
|
||||
rewriterAttrValue(string) string
|
||||
|
||||
+21
-2
@@ -11,7 +11,7 @@
|
||||
package util
|
||||
|
||||
// Stack[T] is a simple generic array-based stack implementation.
|
||||
type Stack[T any] struct {
|
||||
type Stack[T comparable] struct {
|
||||
elements []T
|
||||
}
|
||||
|
||||
@@ -43,8 +43,27 @@ func (stk *Stack[T]) Peek() (T, bool) {
|
||||
return stk.elements[len(stk.elements)-1], true
|
||||
}
|
||||
|
||||
func (stk *Stack[T]) RemoveMostRecent(data T) bool {
|
||||
i := len(stk.elements) - 1
|
||||
for i >= 0 {
|
||||
if stk.elements[i] == data {
|
||||
if i == 0 {
|
||||
stk.elements = stk.elements[1:]
|
||||
} else if (i + 1) == len(stk.elements) {
|
||||
stk.elements = stk.elements[:i]
|
||||
} else {
|
||||
high := stk.elements[i+1:]
|
||||
stk.elements = stk.elements[:i]
|
||||
stk.elements = append(stk.elements, high...)
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// NewStack creates and returns a new stack.
|
||||
func NewStack[T any]() *Stack[T] {
|
||||
func NewStack[T comparable]() *Stack[T] {
|
||||
return &Stack[T]{
|
||||
elements: make([]T, 0),
|
||||
}
|
||||
|
||||
+51
-9
@@ -14,17 +14,10 @@ import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var numeric *regexp.Regexp
|
||||
|
||||
func init() {
|
||||
re, err := regexp.Compile("^[0-9]+$")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
numeric = re
|
||||
}
|
||||
// numeric matches strings that consist entirely of one or more ASCII digits.
var numeric = regexp.MustCompile(`^[0-9]+$`)
|
||||
|
||||
/* CapitalizeString changes the first character of the string to a capital.
|
||||
* Parameters:
|
||||
@@ -80,3 +73,52 @@ func SqlEscape(s string, wildcards bool) string {
|
||||
func IsNumeric(s string) bool {
|
||||
return numeric.MatchString(s)
|
||||
}
|
||||
|
||||
/* RunesToBytes converts a rune count, measured from the start of a string, into a byte count.
 * Parameters:
 *     s - The string to work with.
 *     runeCount - The number of runes to count from the start of the string.
 * Returns:
 *     The number of bytes occupied by the first runeCount runes of s. If s holds fewer
 *     runes than that, the full byte length of s is returned; a count of zero or less yields 0.
 */
func RunesToBytes(s string, runeCount int) int {
	offset := 0
	for remaining := runeCount; remaining > 0 && offset < len(s); remaining-- {
		_, width := utf8.DecodeRuneInString(s[offset:])
		offset += width
	}
	return offset
}
|
||||
|
||||
/* IsRuneWord tells whether a rune is treated as part of a word.
 * Parameters:
 *     ch - The rune to be tested.
 * Returns:
 *     true if the rune is a letter, a hyphen, or an apostrophe; false otherwise.
 */
func IsRuneWord(ch rune) bool {
	switch ch {
	case '-', '\'':
		return true
	default:
		return unicode.IsLetter(ch)
	}
}
|
||||
|
||||
func WordRunLength(s string) (int, bool) {
|
||||
c1, initLen := utf8.DecodeRuneInString(s)
|
||||
wordChar := IsRuneWord(c1)
|
||||
rlen := 1
|
||||
for _, mch := range s[initLen:] {
|
||||
if IsRuneWord(mch) != wordChar {
|
||||
break
|
||||
}
|
||||
rlen++
|
||||
}
|
||||
return rlen, wordChar
|
||||
}
|
||||
|
||||
func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) {
|
||||
ofs := 0
|
||||
for _, ch := range s {
|
||||
if nrunes == 0 {
|
||||
break
|
||||
}
|
||||
ofs += utf8.RuneLen(ch)
|
||||
nrunes--
|
||||
}
|
||||
return WordRunLength(s[ofs:])
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user