beginning to implement the HTML Checker itself - incomplete
This commit is contained in:
+218
-2
@@ -9,7 +9,16 @@
|
|||||||
// The htmlcheck package contains the HTML Checker.
|
// The htmlcheck package contains the HTML Checker.
|
||||||
package htmlcheck
|
package htmlcheck
|
||||||
|
|
||||||
import "net/url"
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.erbosoft.com/amy/amsterdam/util"
|
||||||
|
"github.com/bits-and-blooms/bitset"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
// HTMLChecker is a component that checks HTML and reformats it as needed.
|
// HTMLChecker is a component that checks HTML and reformats it as needed.
|
||||||
type HTMLChecker interface {
|
type HTMLChecker interface {
|
||||||
@@ -26,7 +35,8 @@ type HTMLChecker interface {
|
|||||||
InternalRefs() ([]string, error)
|
InternalRefs() ([]string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// var NotYetFinished = errors.New("the HTML checker has not yet been finished")
|
// Sentinel errors describing the lifecycle state of an HTML checker.
var (
	// AlreadyFinished is returned by operations that are not permitted once
	// the checker has been finished.
	AlreadyFinished = errors.New("the HTML checker has already finished")

	// NotYetFinished is presumably returned by operations that need the checker
	// to have been finished first - TODO confirm against the callers.
	NotYetFinished = errors.New("the HTML checker has not yet been finished")
)
|
||||||
|
|
||||||
type htmlCheckerBackend interface {
|
type htmlCheckerBackend interface {
|
||||||
getCheckerAttrValue(string) string
|
getCheckerAttrValue(string) string
|
||||||
@@ -35,3 +45,209 @@ type htmlCheckerBackend interface {
|
|||||||
addExternalRef(*url.URL)
|
addExternalRef(*url.URL)
|
||||||
addInternalRef(string)
|
addInternalRef(string)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// States of the parser state machine. The numeric values run from 0 upward
// in declaration order.
const (
	stateWhitespace = iota
	stateChars
	stateLeftAngle
	stateTag
	stateParen
	stateTagQuote
	stateNewline
)

// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
const htmlMarginSlop = 5
|
||||||
|
|
||||||
|
type htmlCheckerImpl struct {
|
||||||
|
config *HTMLCheckerConfig
|
||||||
|
started bool
|
||||||
|
finished bool
|
||||||
|
state int
|
||||||
|
quoteChar byte
|
||||||
|
parenLevel int
|
||||||
|
columns int
|
||||||
|
lines int
|
||||||
|
noBreakCount int
|
||||||
|
triggerWBR bool
|
||||||
|
outputBuffer strings.Builder
|
||||||
|
tempBuffer strings.Builder
|
||||||
|
tagStack *util.Stack[*tag]
|
||||||
|
counters map[string]*countingRewriter
|
||||||
|
stringRewriters []rewriter
|
||||||
|
wordRewriters []rewriter
|
||||||
|
tagRewriters []rewriter
|
||||||
|
parenRewriters []rewriter
|
||||||
|
outputFilters []outputFilter
|
||||||
|
contextData map[string]any
|
||||||
|
externalReferences map[*url.URL]bool
|
||||||
|
internalReferences map[string]bool
|
||||||
|
tagSet *bitset.BitSet
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
|
||||||
|
for i := range source {
|
||||||
|
rw, ok := rewriterRegistry[source[i]]
|
||||||
|
if ok {
|
||||||
|
if rw.Name() != "" {
|
||||||
|
crw := MakeCountingRewriter(rw)
|
||||||
|
ht.counters[rw.Name()] = crw
|
||||||
|
rw = crw
|
||||||
|
}
|
||||||
|
dest[i] = rw
|
||||||
|
} else {
|
||||||
|
log.Errorf("rewriter %s is not found", source[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
|
||||||
|
config, ok := configsRegistry[configName]
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("configuration %s not found", configName)
|
||||||
|
}
|
||||||
|
tset, ok := tagSetNameToSet[config.TagSet]
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("tag set %s not found", config.TagSet)
|
||||||
|
}
|
||||||
|
rc := htmlCheckerImpl{
|
||||||
|
config: config,
|
||||||
|
started: false,
|
||||||
|
finished: false,
|
||||||
|
state: stateWhitespace,
|
||||||
|
parenLevel: 0,
|
||||||
|
columns: 0,
|
||||||
|
lines: 0,
|
||||||
|
noBreakCount: 0,
|
||||||
|
triggerWBR: false,
|
||||||
|
tagStack: util.NewStack[*tag](),
|
||||||
|
counters: make(map[string]*countingRewriter),
|
||||||
|
stringRewriters: make([]rewriter, len(config.StringRewriters)),
|
||||||
|
wordRewriters: make([]rewriter, len(config.WordRewriters)),
|
||||||
|
tagRewriters: make([]rewriter, len(config.TagRewriters)),
|
||||||
|
parenRewriters: make([]rewriter, len(config.ParenRewriters)),
|
||||||
|
outputFilters: make([]outputFilter, len(config.OutputFilters)),
|
||||||
|
contextData: make(map[string]any),
|
||||||
|
externalReferences: make(map[*url.URL]bool),
|
||||||
|
internalReferences: make(map[string]bool),
|
||||||
|
tagSet: tset,
|
||||||
|
}
|
||||||
|
rc.copyRewriters(rc.stringRewriters, config.StringRewriters)
|
||||||
|
rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
|
||||||
|
rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
|
||||||
|
rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
|
||||||
|
for i := range config.OutputFilters {
|
||||||
|
f, ok := outputFilterRegistry[config.OutputFilters[i]]
|
||||||
|
if ok {
|
||||||
|
rc.outputFilters[i] = f
|
||||||
|
} else {
|
||||||
|
log.Errorf("filter %s is not found", config.OutputFilters[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return &rc
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
|
||||||
|
if str == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
realCountCols := countCols && (ht.config.WordWrap > 0)
|
||||||
|
if len(filters) == 0 {
|
||||||
|
ht.outputBuffer.WriteString(str)
|
||||||
|
if realCountCols {
|
||||||
|
ht.columns += len(str)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
temp := str
|
||||||
|
for len(temp) > 0 {
|
||||||
|
outputLen := len(temp)
|
||||||
|
var stopper outputFilter = nil
|
||||||
|
for _, of := range filters {
|
||||||
|
lnm := of.lengthNoMatch(temp)
|
||||||
|
if lnm >= 0 && lnm < outputLen {
|
||||||
|
outputLen = lnm
|
||||||
|
stopper = of
|
||||||
|
}
|
||||||
|
if outputLen <= 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if outputLen > 0 {
|
||||||
|
ht.outputBuffer.WriteString(temp[:outputLen])
|
||||||
|
if realCountCols {
|
||||||
|
ht.columns += outputLen
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if stopper != nil {
|
||||||
|
tmpch := temp[outputLen]
|
||||||
|
outputLen++
|
||||||
|
if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
|
||||||
|
ht.outputBuffer.WriteByte(tmpch)
|
||||||
|
}
|
||||||
|
if realCountCols {
|
||||||
|
ht.columns++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if outputLen == len(temp) {
|
||||||
|
temp = ""
|
||||||
|
} else if outputLen > 0 {
|
||||||
|
temp = temp[outputLen:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) emitLineBreak() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
|
||||||
|
if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
|
||||||
|
ht.emitLineBreak()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) doFlushString() bool {
|
||||||
|
return false // TODO
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) parse(str string) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) Append(str string) error {
|
||||||
|
if ht.finished {
|
||||||
|
return AlreadyFinished
|
||||||
|
}
|
||||||
|
if !ht.started {
|
||||||
|
ht.started = true
|
||||||
|
}
|
||||||
|
if str != "" {
|
||||||
|
ht.parse(str)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ht *htmlCheckerImpl) Finish() error {
|
||||||
|
if ht.finished {
|
||||||
|
return AlreadyFinished
|
||||||
|
}
|
||||||
|
if !ht.started {
|
||||||
|
ht.started = true
|
||||||
|
}
|
||||||
|
// This is the "end parse" loop, in which we resolve any funny state the parser has
|
||||||
|
// found itself in and clear out the internal buffers.
|
||||||
|
running := true
|
||||||
|
for running {
|
||||||
|
running = false // make sure we stop unless this is set to true
|
||||||
|
switch ht.state {
|
||||||
|
case stateWhitespace, stateNewline:
|
||||||
|
// do nothing - discard whitespace or newlines at end
|
||||||
|
case stateChars:
|
||||||
|
running = ht.doFlushString() // flush the temporary buffer
|
||||||
|
case stateLeftAngle:
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ package htmlcheck
|
|||||||
import (
|
import (
|
||||||
_ "embed"
|
_ "embed"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
|
||||||
"gopkg.in/yaml.v3"
|
"gopkg.in/yaml.v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -36,48 +35,6 @@ type HTMLCheckerConfig struct {
|
|||||||
DisallowTags []string `yaml:"disallowTags"`
|
DisallowTags []string `yaml:"disallowTags"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cfg *HTMLCheckerConfig) rezOutputFilters() []outputFilter {
|
|
||||||
rc := make([]outputFilter, 0, len(cfg.OutputFilters))
|
|
||||||
for i := range cfg.OutputFilters {
|
|
||||||
f, ok := outputFilterRegistry[cfg.OutputFilters[i]]
|
|
||||||
if ok {
|
|
||||||
rc = append(rc, f)
|
|
||||||
} else {
|
|
||||||
log.Errorf("filter %s is not found", cfg.OutputFilters[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rc
|
|
||||||
}
|
|
||||||
|
|
||||||
func rezRewriters(desired []string) []rewriter {
|
|
||||||
rc := make([]rewriter, 0, len(desired))
|
|
||||||
for i := range desired {
|
|
||||||
r, ok := rewriterRegistry[desired[i]]
|
|
||||||
if ok {
|
|
||||||
rc = append(rc, r)
|
|
||||||
} else {
|
|
||||||
log.Errorf("rewriter %s is not found", desired[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rc
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cfg *HTMLCheckerConfig) rezStringRewriters() []rewriter {
|
|
||||||
return rezRewriters(cfg.StringRewriters)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cfg *HTMLCheckerConfig) rezWordRewriters() []rewriter {
|
|
||||||
return rezRewriters(cfg.WordRewriters)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cfg *HTMLCheckerConfig) rezTagRewriters() []rewriter {
|
|
||||||
return rezRewriters(cfg.TagRewriters)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cfg *HTMLCheckerConfig) rezParenRewriters() []rewriter {
|
|
||||||
return rezRewriters(cfg.ParenRewriters)
|
|
||||||
}
|
|
||||||
|
|
||||||
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
|
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
|
||||||
type HTMLCheckerConfigFile struct {
|
type HTMLCheckerConfigFile struct {
|
||||||
Configs []HTMLCheckerConfig `yaml:"configs"`
|
Configs []HTMLCheckerConfig `yaml:"configs"`
|
||||||
|
|||||||
@@ -294,3 +294,47 @@ func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupDa
|
|||||||
rescan: false,
|
rescan: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// countingRewriter is a wrapper around rewriter that counts the number of rewrites.
|
||||||
|
type countingRewriter struct {
|
||||||
|
inner rewriter
|
||||||
|
count int
|
||||||
|
}
|
||||||
|
|
||||||
|
// Name returns the rewriter's name.
|
||||||
|
func (rw *countingRewriter) Name() string {
|
||||||
|
return rw.inner.Name()
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Rewrite rewrites the given string data and adds markup before and after if needed.
|
||||||
|
* Parameters:
|
||||||
|
* data - The data to be rewritten.
|
||||||
|
* svc - Services interface we can use.
|
||||||
|
* Returns:
|
||||||
|
* Pointer to markup data, or nil.
|
||||||
|
*/
|
||||||
|
func (rw *countingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
|
||||||
|
rc := rw.inner.Rewrite(data, svc)
|
||||||
|
if rc != nil && !rc.rescan {
|
||||||
|
rw.count++
|
||||||
|
}
|
||||||
|
return rc
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetCount returns the rewriter's count.
|
||||||
|
func (rw *countingRewriter) GetCount() int {
|
||||||
|
return rw.count
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset resets the rewriter.
|
||||||
|
func (rw *countingRewriter) Reset() {
|
||||||
|
rw.count = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// MakeCountingRewriter wraps the rewriter in a countingRewriter.
|
||||||
|
func MakeCountingRewriter(rw rewriter) *countingRewriter {
|
||||||
|
return &countingRewriter{
|
||||||
|
inner: rw,
|
||||||
|
count: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* Amsterdam Web Communities System
|
||||||
|
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
|
||||||
|
*
|
||||||
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Package util contains utility definitions.
|
||||||
|
package util
|
||||||
|
|
||||||
|
// Stack is a simple generic slice-backed stack implementation.
// The zero value is an empty stack that is ready to use.
type Stack[T any] struct {
	elements []T
}

// IsEmpty returns true if the stack is empty.
func (stk *Stack[T]) IsEmpty() bool {
	return len(stk.elements) == 0
}

// Push adds a value to the top of the stack.
func (stk *Stack[T]) Push(data T) {
	stk.elements = append(stk.elements, data)
}

// Pop removes and returns the value on the top of the stack. The second
// result is false, and the first is the zero value of T, if the stack is empty.
func (stk *Stack[T]) Pop() (T, bool) {
	var zero T
	last := len(stk.elements) - 1
	if last < 0 {
		return zero, false
	}
	top := stk.elements[last]
	// Clear the vacated slot so the backing array does not keep the popped
	// value (and anything it points to) reachable.
	stk.elements[last] = zero
	stk.elements = stk.elements[:last]
	return top, true
}

// Peek returns the value on the top of the stack without removing it. The
// second result is false, and the first is the zero value of T, if the stack
// is empty.
func (stk *Stack[T]) Peek() (T, bool) {
	var zero T
	last := len(stk.elements) - 1
	if last < 0 {
		return zero, false
	}
	return stk.elements[last], true
}

// NewStack creates and returns a new, empty stack.
func NewStack[T any]() *Stack[T] {
	return &Stack[T]{}
}
|
||||||
Reference in New Issue
Block a user