beginning to implement the HTML Checker itself - incomplete

This commit is contained in:
2025-10-31 23:48:46 -06:00
parent 8a2185e912
commit f6ed77923c
4 changed files with 313 additions and 45 deletions
+218 -2
View File
@@ -9,7 +9,16 @@
// Package htmlcheck contains the HTML Checker.
package htmlcheck
import "net/url"
import (
"errors"
"fmt"
"net/url"
"strings"
"git.erbosoft.com/amy/amsterdam/util"
"github.com/bits-and-blooms/bitset"
log "github.com/sirupsen/logrus"
)
// HTMLChecker is a component that checks HTML and reformats it as needed.
type HTMLChecker interface {
@@ -26,7 +35,8 @@ type HTMLChecker interface {
InternalRefs() ([]string, error)
}
// var NotYetFinished = errors.New("the HTML checker has not yet been finished")
// AlreadyFinished is the error returned by the checker's Append and Finish
// methods when the checker's finished flag is already set.
var AlreadyFinished = errors.New("the HTML checker has already finished")
// NotYetFinished is the error for operations that need a finished checker.
// NOTE(review): no use of it is visible in this section - presumably returned
// by result accessors called before Finish; confirm against the callers.
var NotYetFinished = errors.New("the HTML checker has not yet been finished")
type htmlCheckerBackend interface {
getCheckerAttrValue(string) string
@@ -35,3 +45,209 @@ type htmlCheckerBackend interface {
addExternalRef(*url.URL)
addInternalRef(string)
}
// Parser states used by the HTML checker's state machine. The values are
// consecutive integers starting at zero, in the order listed.
const (
	stateWhitespace = iota // 0
	stateChars             // 1
	stateLeftAngle         // 2
	stateTag               // 3
	stateParen             // 4
	stateTagQuote          // 5
	stateNewline           // 6
)
// htmlMarginSlop is the number of characters at the end of a line that are
// treated as slack when controlling word-wrapping.
// NOTE(review): nothing in the code visible here references this constant yet;
// confirm its exact use once the line-breaking logic is implemented.
const htmlMarginSlop = 5
// htmlCheckerImpl holds the working state of a single HTML checker instance:
// its configuration, parser state, output buffers, and the rewriters and
// output filters resolved from the configuration by AmNewHTMLChecker.
type htmlCheckerImpl struct {
// config is the configuration looked up by name when the checker is created.
config *HTMLCheckerConfig
// started is set to true by the first call to Append or Finish.
started bool
// finished is checked by Append and Finish, which return AlreadyFinished when it is set.
finished bool
// state is the current state-machine state (one of the state* constants); it starts as stateWhitespace.
state int
// quoteChar - presumably the quote character being matched while in stateTagQuote; TODO confirm.
quoteChar byte
// parenLevel - presumably the current parenthesis nesting depth; TODO confirm.
parenLevel int
// columns is advanced by emitString when column counting is requested and word wrap is enabled.
columns int
// lines - presumably a count of output lines; not used in the code visible here.
lines int
// noBreakCount suppresses emitPossibleLineBreak while it is greater than zero.
noBreakCount int
// triggerWBR - not used in the code visible here; TODO document once the parser is implemented.
triggerWBR bool
// outputBuffer receives everything written by emitString.
outputBuffer strings.Builder
// tempBuffer is the temporary buffer that Finish expects doFlushString to flush.
tempBuffer strings.Builder
// tagStack - presumably tracks currently open tags; not used in the code visible here.
tagStack *util.Stack[*tag]
// counters maps a rewriter name to the counting wrapper created for it by copyRewriters.
counters map[string]*countingRewriter
// stringRewriters is resolved from config.StringRewriters.
stringRewriters []rewriter
// wordRewriters is resolved from config.WordRewriters.
wordRewriters []rewriter
// tagRewriters is resolved from config.TagRewriters.
tagRewriters []rewriter
// parenRewriters is resolved from config.ParenRewriters.
parenRewriters []rewriter
// outputFilters is resolved from config.OutputFilters.
outputFilters []outputFilter
// contextData is an initially empty key/value store; its users are not visible here.
contextData map[string]any
// externalReferences is a set of external URLs.
// NOTE(review): the key is a *url.URL, so membership is by pointer identity -
// two equal URLs held in distinct url.URL values are distinct keys. Confirm
// this is intended, or key by the URL string instead.
externalReferences map[*url.URL]bool
// internalReferences is a set of internal reference strings.
internalReferences map[string]bool
// tagSet is the bit set looked up from tagSetNameToSet using config.TagSet.
tagSet *bitset.BitSet
}
// copyRewriters resolves each rewriter name in source through rewriterRegistry
// and stores the results, in order, at the front of dest. A rewriter with a
// non-empty name is wrapped in a countingRewriter, which is also recorded in
// ht.counters under that name. Names with no registry entry are logged and
// skipped, so the result never contains nil entries.
//
// Parameters:
//
//	dest   - Slice whose backing storage is reused for the result.
//	source - Names of the rewriters to resolve.
//
// Returns:
//
//	The slice of resolved rewriters; it may be shorter than source.
func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) []rewriter {
	resolved := dest[:0] // filter in place, reusing dest's storage
	for _, name := range source {
		rw, ok := rewriterRegistry[name]
		if !ok {
			log.Errorf("rewriter %s is not found", name)
			continue
		}
		if rw.Name() != "" {
			crw := MakeCountingRewriter(rw)
			ht.counters[rw.Name()] = crw
			rw = crw
		}
		resolved = append(resolved, rw)
	}
	return resolved
}

// AmNewHTMLChecker creates a new HTML checker using the named configuration.
//
// Parameters:
//
//	configName - Name of the configuration to look up in configsRegistry.
//
// Returns:
//
//	The new checker, or an error if the configuration or the tag set it names
//	cannot be found. Unknown rewriter and output filter names are logged and
//	left out rather than treated as fatal.
func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
	config, ok := configsRegistry[configName]
	if !ok {
		return nil, fmt.Errorf("configuration %s not found", configName)
	}
	tset, ok := tagSetNameToSet[config.TagSet]
	if !ok {
		return nil, fmt.Errorf("tag set %s not found", config.TagSet)
	}

	// Flags, counters and buffers not listed here start at their zero values.
	rc := &htmlCheckerImpl{
		config:             config,
		state:              stateWhitespace,
		tagStack:           util.NewStack[*tag](),
		counters:           make(map[string]*countingRewriter),
		outputFilters:      make([]outputFilter, 0, len(config.OutputFilters)),
		contextData:        make(map[string]any),
		externalReferences: make(map[*url.URL]bool),
		internalReferences: make(map[string]bool),
		tagSet:             tset,
	}

	rc.stringRewriters = rc.copyRewriters(make([]rewriter, len(config.StringRewriters)), config.StringRewriters)
	rc.wordRewriters = rc.copyRewriters(make([]rewriter, len(config.WordRewriters)), config.WordRewriters)
	rc.tagRewriters = rc.copyRewriters(make([]rewriter, len(config.TagRewriters)), config.TagRewriters)
	rc.parenRewriters = rc.copyRewriters(make([]rewriter, len(config.ParenRewriters)), config.ParenRewriters)

	// Append only the filters that exist, so users of outputFilters never see a nil entry.
	for _, name := range config.OutputFilters {
		f, ok := outputFilterRegistry[name]
		if !ok {
			log.Errorf("filter %s is not found", name)
			continue
		}
		rc.outputFilters = append(rc.outputFilters, f)
	}
	return rc, nil
}
// emitString writes str to the output buffer, giving each of the supplied
// output filters the chance to replace individual characters.
//
// Parameters:
//
//	str       - The text to write; an empty string is a no-op.
//	filters   - Output filters to consult; with none, str is written verbatim.
//	countCols - Whether the written bytes should advance the column counter
//	            (only honoured when the configured WordWrap is positive).
func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
	if str == "" {
		return
	}
	tracking := countCols && ht.config.WordWrap > 0

	// Fast path: nothing can intercept characters, so write everything at once.
	if len(filters) == 0 {
		ht.outputBuffer.WriteString(str)
		if tracking {
			ht.columns += len(str)
		}
		return
	}

	for rest := str; len(rest) > 0; {
		// Find the shortest prefix that no filter objects to, and remember
		// the filter that stopped it.
		cut := len(rest)
		var hit outputFilter
		for _, f := range filters {
			if n := f.lengthNoMatch(rest); n >= 0 && n < cut {
				cut, hit = n, f
			}
			if cut <= 0 {
				break
			}
		}

		if cut > 0 {
			ht.outputBuffer.WriteString(rest[:cut])
			if tracking {
				ht.columns += cut
			}
		}
		if hit == nil {
			// No filter matched, so the prefix was the whole remainder.
			break
		}

		// Let the stopping filter emit the offending byte; fall back to a raw write.
		// NOTE(review): outputBuffer is passed by value here. strings.Builder must
		// not be copied once it is non-zero - confirm tryOutputCharacter's
		// parameter type and pass a pointer if that is what it expects.
		ch := rest[cut]
		if !hit.tryOutputCharacter(ht.outputBuffer, ch) {
			ht.outputBuffer.WriteByte(ch)
		}
		if tracking {
			ht.columns++
		}
		rest = rest[cut+1:]
	}
}
// emitLineBreak is called by emitPossibleLineBreak when a line break is due.
// The body is not implemented yet, so the call currently has no effect.
func (ht *htmlCheckerImpl) emitLineBreak() {
}
// emitPossibleLineBreak emits a line break only when word wrapping is enabled,
// breaking is not currently suppressed, and the current column has reached the
// configured wrap width.
func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
	wrapAt := ht.config.WordWrap
	if wrapAt <= 0 {
		return // word wrapping is disabled
	}
	if ht.noBreakCount > 0 {
		return // line breaks are currently suppressed
	}
	if ht.columns < wrapAt {
		return // still inside the wrap width
	}
	ht.emitLineBreak()
}
// doFlushString is a placeholder that is not implemented yet and always
// returns false. Finish uses the return value to decide whether its end-parse
// loop should run another iteration.
func (ht *htmlCheckerImpl) doFlushString() bool {
return false // TODO
}
// parse is called by Append with each non-empty string it receives. The body
// is not implemented yet, so the input is currently ignored.
func (ht *htmlCheckerImpl) parse(str string) {
}
// Append feeds another piece of text to the checker.
//
// Parameters:
//
//	str - The text to parse; an empty string only marks the checker as started.
//
// Returns:
//
//	AlreadyFinished if the checker has already finished, otherwise nil.
func (ht *htmlCheckerImpl) Append(str string) error {
	if ht.finished {
		return AlreadyFinished
	}
	ht.started = true // the first Append or Finish marks the checker as started
	if len(str) > 0 {
		ht.parse(str)
	}
	return nil
}
// Finish ends the parse, resolving whatever state the parser was left in, and
// marks the checker as finished so that later Append or Finish calls fail.
//
// Returns:
//
//	AlreadyFinished if the checker has already finished, otherwise nil.
func (ht *htmlCheckerImpl) Finish() error {
	if ht.finished {
		return AlreadyFinished
	}
	ht.started = true

	// This is the "end parse" loop, in which we resolve any funny state the parser has
	// found itself in and clear out the internal buffers.
	for running := true; running; {
		running = false // make sure we stop unless this is set to true
		switch ht.state {
		case stateWhitespace, stateNewline:
			// do nothing - discard whitespace or newlines at end
		case stateChars:
			running = ht.doFlushString() // flush the temporary buffer
		case stateLeftAngle:
			// TODO: not yet implemented
		}
		// TODO: stateTag, stateParen and stateTagQuote are not handled yet.
	}

	// Record completion; without this the AlreadyFinished guards never trigger.
	ht.finished = true
	return nil
}
-43
View File
@@ -12,7 +12,6 @@ package htmlcheck
import (
_ "embed"
log "github.com/sirupsen/logrus"
"gopkg.in/yaml.v3"
)
@@ -36,48 +35,6 @@ type HTMLCheckerConfig struct {
DisallowTags []string `yaml:"disallowTags"`
}
// rezOutputFilters resolves the names in cfg.OutputFilters to output filters
// via outputFilterRegistry. Unknown names are logged and skipped, so the
// result may be shorter than the configured list.
func (cfg *HTMLCheckerConfig) rezOutputFilters() []outputFilter {
	resolved := make([]outputFilter, 0, len(cfg.OutputFilters))
	for _, name := range cfg.OutputFilters {
		filter, found := outputFilterRegistry[name]
		if !found {
			log.Errorf("filter %s is not found", name)
			continue
		}
		resolved = append(resolved, filter)
	}
	return resolved
}
// rezRewriters resolves each name in desired to a rewriter via
// rewriterRegistry. Unknown names are logged and skipped, so the result may be
// shorter than the input.
func rezRewriters(desired []string) []rewriter {
	resolved := make([]rewriter, 0, len(desired))
	for _, name := range desired {
		found, known := rewriterRegistry[name]
		if !known {
			log.Errorf("rewriter %s is not found", name)
			continue
		}
		resolved = append(resolved, found)
	}
	return resolved
}
// rezStringRewriters resolves the names in cfg.StringRewriters to rewriters
// by delegating to rezRewriters.
func (cfg *HTMLCheckerConfig) rezStringRewriters() []rewriter {
return rezRewriters(cfg.StringRewriters)
}
// rezWordRewriters resolves the names in cfg.WordRewriters to rewriters
// by delegating to rezRewriters.
func (cfg *HTMLCheckerConfig) rezWordRewriters() []rewriter {
return rezRewriters(cfg.WordRewriters)
}
// rezTagRewriters resolves the names in cfg.TagRewriters to rewriters
// by delegating to rezRewriters.
func (cfg *HTMLCheckerConfig) rezTagRewriters() []rewriter {
return rezRewriters(cfg.TagRewriters)
}
// rezParenRewriters resolves the names in cfg.ParenRewriters to rewriters
// by delegating to rezRewriters.
func (cfg *HTMLCheckerConfig) rezParenRewriters() []rewriter {
return rezRewriters(cfg.ParenRewriters)
}
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
type HTMLCheckerConfigFile struct {
Configs []HTMLCheckerConfig `yaml:"configs"`
+44
View File
@@ -294,3 +294,47 @@ func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupDa
rescan: false,
}
}
// countingRewriter decorates another rewriter and keeps a tally of the
// rewrites it performs.
type countingRewriter struct {
	inner rewriter // the wrapped rewriter that does the actual work
	count int      // rewrites counted since creation or the last Reset
}

// Name returns the name of the wrapped rewriter.
func (rw *countingRewriter) Name() string {
	return rw.inner.Name()
}

/* Rewrite passes the data to the wrapped rewriter and counts the call when it
 * yields markup that is not flagged for rescanning.
 * Parameters:
 *     data - The data to be rewritten.
 *     svc - Services interface handed through to the wrapped rewriter.
 * Returns:
 *     Whatever the wrapped rewriter returned: pointer to markup data, or nil.
 */
func (rw *countingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
	result := rw.inner.Rewrite(data, svc)
	if result == nil || result.rescan {
		// Nothing was rewritten, or the result is to be rescanned: don't count it.
		return result
	}
	rw.count++
	return result
}

// GetCount returns the number of rewrites counted so far.
func (rw *countingRewriter) GetCount() int {
	return rw.count
}

// Reset sets the rewrite count back to zero.
func (rw *countingRewriter) Reset() {
	rw.count = 0
}

// MakeCountingRewriter wraps the given rewriter in a countingRewriter whose
// count starts at zero.
func MakeCountingRewriter(rw rewriter) *countingRewriter {
	wrapped := new(countingRewriter)
	wrapped.inner = rw
	return wrapped
}
+51
View File
@@ -0,0 +1,51 @@
/*
* Amsterdam Web Communities System
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
// Package util contains utility definitions.
package util
// Stack is a simple generic slice-backed LIFO stack. The zero value is an
// empty stack that is ready to use.
type Stack[T any] struct {
	elements []T
}

// IsEmpty returns true if the stack is empty.
func (stk *Stack[T]) IsEmpty() bool {
	return len(stk.elements) == 0
}

// Push adds a value to the top of the stack.
func (stk *Stack[T]) Push(data T) {
	stk.elements = append(stk.elements, data)
}

// Pop removes and returns the value on the top of the stack. The boolean is
// false, and the value is the zero value of T, when the stack is empty.
func (stk *Stack[T]) Pop() (T, bool) {
	var zero T
	last := len(stk.elements) - 1
	if last < 0 {
		return zero, false
	}
	top := stk.elements[last]
	// Clear the vacated slot so the backing array does not keep the popped
	// value reachable (matters when T is or contains a pointer).
	stk.elements[last] = zero
	stk.elements = stk.elements[:last]
	return top, true
}

// Peek returns the value on the top of the stack without removing it. The
// boolean is false, and the value is the zero value of T, when the stack is
// empty.
func (stk *Stack[T]) Peek() (T, bool) {
	if stk.IsEmpty() {
		var zero T
		return zero, false
	}
	return stk.elements[len(stk.elements)-1], true
}

// NewStack creates and returns a new, empty stack.
func NewStack[T any]() *Stack[T] {
	return &Stack[T]{}
}