beginning to implement the HTML Checker itself - incomplete
This commit is contained in:
+218
-2
@@ -9,7 +9,16 @@
|
||||
// The htmlcheck package contains the HTML Checker.
|
||||
package htmlcheck
|
||||
|
||||
import "net/url"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"git.erbosoft.com/amy/amsterdam/util"
|
||||
"github.com/bits-and-blooms/bitset"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// HTMLChecker is a component that checks HTML and reformats it as needed.
|
||||
type HTMLChecker interface {
|
||||
@@ -26,7 +35,8 @@ type HTMLChecker interface {
|
||||
InternalRefs() ([]string, error)
|
||||
}
|
||||
|
||||
// var NotYetFinished = errors.New("the HTML checker has not yet been finished")
|
||||
var AlreadyFinished = errors.New("the HTML checker has already finished")
|
||||
var NotYetFinished = errors.New("the HTML checker has not yet been finished")
|
||||
|
||||
type htmlCheckerBackend interface {
|
||||
getCheckerAttrValue(string) string
|
||||
@@ -35,3 +45,209 @@ type htmlCheckerBackend interface {
|
||||
addExternalRef(*url.URL)
|
||||
addInternalRef(string)
|
||||
}
|
||||
|
||||
// State constants for the state machine.
|
||||
const (
|
||||
stateWhitespace = 0
|
||||
stateChars = 1
|
||||
stateLeftAngle = 2
|
||||
stateTag = 3
|
||||
stateParen = 4
|
||||
stateTagQuote = 5
|
||||
stateNewline = 6
|
||||
)
|
||||
|
||||
// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
|
||||
const htmlMarginSlop = 5
|
||||
|
||||
type htmlCheckerImpl struct {
|
||||
config *HTMLCheckerConfig
|
||||
started bool
|
||||
finished bool
|
||||
state int
|
||||
quoteChar byte
|
||||
parenLevel int
|
||||
columns int
|
||||
lines int
|
||||
noBreakCount int
|
||||
triggerWBR bool
|
||||
outputBuffer strings.Builder
|
||||
tempBuffer strings.Builder
|
||||
tagStack *util.Stack[*tag]
|
||||
counters map[string]*countingRewriter
|
||||
stringRewriters []rewriter
|
||||
wordRewriters []rewriter
|
||||
tagRewriters []rewriter
|
||||
parenRewriters []rewriter
|
||||
outputFilters []outputFilter
|
||||
contextData map[string]any
|
||||
externalReferences map[*url.URL]bool
|
||||
internalReferences map[string]bool
|
||||
tagSet *bitset.BitSet
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
||||
for i := range source {
|
||||
rw, ok := rewriterRegistry[source[i]]
|
||||
if ok {
|
||||
if rw.Name() != "" {
|
||||
crw := MakeCountingRewriter(rw)
|
||||
ht.counters[rw.Name()] = crw
|
||||
rw = crw
|
||||
}
|
||||
dest[i] = rw
|
||||
} else {
|
||||
log.Errorf("rewriter %s is not found", source[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||
config, ok := configsRegistry[configName]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("configuration %s not found", configName)
|
||||
}
|
||||
tset, ok := tagSetNameToSet[config.TagSet]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("tag set %s not found", config.TagSet)
|
||||
}
|
||||
rc := htmlCheckerImpl{
|
||||
config: config,
|
||||
started: false,
|
||||
finished: false,
|
||||
state: stateWhitespace,
|
||||
parenLevel: 0,
|
||||
columns: 0,
|
||||
lines: 0,
|
||||
noBreakCount: 0,
|
||||
triggerWBR: false,
|
||||
tagStack: util.NewStack[*tag](),
|
||||
counters: make(map[string]*countingRewriter),
|
||||
stringRewriters: make([]rewriter, len(config.StringRewriters)),
|
||||
wordRewriters: make([]rewriter, len(config.WordRewriters)),
|
||||
tagRewriters: make([]rewriter, len(config.TagRewriters)),
|
||||
parenRewriters: make([]rewriter, len(config.ParenRewriters)),
|
||||
outputFilters: make([]outputFilter, len(config.OutputFilters)),
|
||||
contextData: make(map[string]any),
|
||||
externalReferences: make(map[*url.URL]bool),
|
||||
internalReferences: make(map[string]bool),
|
||||
tagSet: tset,
|
||||
}
|
||||
rc.copyRewriters(rc.stringRewriters, config.StringRewriters)
|
||||
rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
|
||||
rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
|
||||
rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
|
||||
for i := range config.OutputFilters {
|
||||
f, ok := outputFilterRegistry[config.OutputFilters[i]]
|
||||
if ok {
|
||||
rc.outputFilters[i] = f
|
||||
} else {
|
||||
log.Errorf("filter %s is not found", config.OutputFilters[i])
|
||||
}
|
||||
}
|
||||
return &rc
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
|
||||
if str == "" {
|
||||
return
|
||||
}
|
||||
realCountCols := countCols && (ht.config.WordWrap > 0)
|
||||
if len(filters) == 0 {
|
||||
ht.outputBuffer.WriteString(str)
|
||||
if realCountCols {
|
||||
ht.columns += len(str)
|
||||
}
|
||||
return
|
||||
}
|
||||
temp := str
|
||||
for len(temp) > 0 {
|
||||
outputLen := len(temp)
|
||||
var stopper outputFilter = nil
|
||||
for _, of := range filters {
|
||||
lnm := of.lengthNoMatch(temp)
|
||||
if lnm >= 0 && lnm < outputLen {
|
||||
outputLen = lnm
|
||||
stopper = of
|
||||
}
|
||||
if outputLen <= 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
if outputLen > 0 {
|
||||
ht.outputBuffer.WriteString(temp[:outputLen])
|
||||
if realCountCols {
|
||||
ht.columns += outputLen
|
||||
}
|
||||
}
|
||||
if stopper != nil {
|
||||
tmpch := temp[outputLen]
|
||||
outputLen++
|
||||
if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
|
||||
ht.outputBuffer.WriteByte(tmpch)
|
||||
}
|
||||
if realCountCols {
|
||||
ht.columns++
|
||||
}
|
||||
}
|
||||
if outputLen == len(temp) {
|
||||
temp = ""
|
||||
} else if outputLen > 0 {
|
||||
temp = temp[outputLen:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitLineBreak() {
|
||||
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
||||
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
|
||||
ht.emitLineBreak()
|
||||
}
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) doFlushString() bool {
|
||||
return false // TODO
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) parse(str string) {
|
||||
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Append(str string) error {
|
||||
if ht.finished {
|
||||
return AlreadyFinished
|
||||
}
|
||||
if !ht.started {
|
||||
ht.started = true
|
||||
}
|
||||
if str != "" {
|
||||
ht.parse(str)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ht *htmlCheckerImpl) Finish() error {
|
||||
if ht.finished {
|
||||
return AlreadyFinished
|
||||
}
|
||||
if !ht.started {
|
||||
ht.started = true
|
||||
}
|
||||
// This is the "end parse" loop, in which we resolve any funny state the parser has
|
||||
// found itself in and clear out the internal buffers.
|
||||
running := true
|
||||
for running {
|
||||
running = false // make sure we stop unless this is set to true
|
||||
switch ht.state {
|
||||
case stateWhitespace, stateNewline:
|
||||
// do nothing - discard whitespace or newlines at end
|
||||
case stateChars:
|
||||
running = ht.doFlushString() // flush the temporary buffer
|
||||
case stateLeftAngle:
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ package htmlcheck
|
||||
import (
|
||||
_ "embed"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
@@ -36,48 +35,6 @@ type HTMLCheckerConfig struct {
|
||||
DisallowTags []string `yaml:"disallowTags"`
|
||||
}
|
||||
|
||||
func (cfg *HTMLCheckerConfig) rezOutputFilters() []outputFilter {
|
||||
rc := make([]outputFilter, 0, len(cfg.OutputFilters))
|
||||
for i := range cfg.OutputFilters {
|
||||
f, ok := outputFilterRegistry[cfg.OutputFilters[i]]
|
||||
if ok {
|
||||
rc = append(rc, f)
|
||||
} else {
|
||||
log.Errorf("filter %s is not found", cfg.OutputFilters[i])
|
||||
}
|
||||
}
|
||||
return rc
|
||||
}
|
||||
|
||||
func rezRewriters(desired []string) []rewriter {
|
||||
rc := make([]rewriter, 0, len(desired))
|
||||
for i := range desired {
|
||||
r, ok := rewriterRegistry[desired[i]]
|
||||
if ok {
|
||||
rc = append(rc, r)
|
||||
} else {
|
||||
log.Errorf("rewriter %s is not found", desired[i])
|
||||
}
|
||||
}
|
||||
return rc
|
||||
}
|
||||
|
||||
func (cfg *HTMLCheckerConfig) rezStringRewriters() []rewriter {
|
||||
return rezRewriters(cfg.StringRewriters)
|
||||
}
|
||||
|
||||
func (cfg *HTMLCheckerConfig) rezWordRewriters() []rewriter {
|
||||
return rezRewriters(cfg.WordRewriters)
|
||||
}
|
||||
|
||||
func (cfg *HTMLCheckerConfig) rezTagRewriters() []rewriter {
|
||||
return rezRewriters(cfg.TagRewriters)
|
||||
}
|
||||
|
||||
func (cfg *HTMLCheckerConfig) rezParenRewriters() []rewriter {
|
||||
return rezRewriters(cfg.ParenRewriters)
|
||||
}
|
||||
|
||||
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
|
||||
type HTMLCheckerConfigFile struct {
|
||||
Configs []HTMLCheckerConfig `yaml:"configs"`
|
||||
|
||||
@@ -294,3 +294,47 @@ func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupDa
|
||||
rescan: false,
|
||||
}
|
||||
}
|
||||
|
||||
// countingRewriter is a wrapper around rewriter that counts the number of rewrites.
|
||||
type countingRewriter struct {
|
||||
inner rewriter
|
||||
count int
|
||||
}
|
||||
|
||||
// Name returns the rewriter's name.
|
||||
func (rw *countingRewriter) Name() string {
|
||||
return rw.inner.Name()
|
||||
}
|
||||
|
||||
/* Rewrite rewrites the given string data and adds markup before and after if needed.
|
||||
* Parameters:
|
||||
* data - The data to be rewritten.
|
||||
* svc - Services interface we can use.
|
||||
* Returns:
|
||||
* Pointer to markup data, or nil.
|
||||
*/
|
||||
func (rw *countingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
|
||||
rc := rw.inner.Rewrite(data, svc)
|
||||
if rc != nil && !rc.rescan {
|
||||
rw.count++
|
||||
}
|
||||
return rc
|
||||
}
|
||||
|
||||
// GetCount returns the rewriter's count.
|
||||
func (rw *countingRewriter) GetCount() int {
|
||||
return rw.count
|
||||
}
|
||||
|
||||
// Reset resets the rewriter.
|
||||
func (rw *countingRewriter) Reset() {
|
||||
rw.count = 0
|
||||
}
|
||||
|
||||
// MakeCountingRewriter wraps the rewriter in a countingRewriter.
|
||||
func MakeCountingRewriter(rw rewriter) *countingRewriter {
|
||||
return &countingRewriter{
|
||||
inner: rw,
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user