From f6ed77923cfacb27970a9b001d41e1857b3c3987 Mon Sep 17 00:00:00 2001
From: Amy Gale Ruth Bowersox
Date: Fri, 31 Oct 2025 23:48:46 -0600
Subject: [PATCH] beginning to implement the HTML Checker itself - incomplete

---
 htmlcheck/checker.go        | 242 +++++++++++++++++++++++++++++++++++-
 htmlcheck/checker_config.go |  43 -------
 htmlcheck/rewriter.go       |  44 ++++++++
 util/stack.go               |  51 +++++++++
 4 files changed, 335 insertions(+), 45 deletions(-)
 create mode 100644 util/stack.go

diff --git a/htmlcheck/checker.go b/htmlcheck/checker.go
index 330f195..885234c 100644
--- a/htmlcheck/checker.go
+++ b/htmlcheck/checker.go
@@ -9,7 +9,16 @@
 // The htmlcheck package contains the HTML Checker.
 package htmlcheck
 
-import "net/url"
+import (
+	"errors"
+	"fmt"
+	"net/url"
+	"strings"
+
+	"git.erbosoft.com/amy/amsterdam/util"
+	"github.com/bits-and-blooms/bitset"
+	log "github.com/sirupsen/logrus"
+)
 
 // HTMLChecker is a component that checks HTML and reformats it as needed.
 type HTMLChecker interface {
@@ -26,7 +35,11 @@ type HTMLChecker interface {
 	InternalRefs() ([]string, error)
 }
 
-// var NotYetFinished = errors.New("the HTML checker has not yet been finished")
+// AlreadyFinished is returned by Append and Finish once the checker has finished.
+var AlreadyFinished = errors.New("the HTML checker has already finished")
+
+// NotYetFinished reports that the HTML checker has not yet been finished.
+var NotYetFinished = errors.New("the HTML checker has not yet been finished")
 
 type htmlCheckerBackend interface {
 	getCheckerAttrValue(string) string
@@ -35,3 +48,228 @@ type htmlCheckerBackend interface {
 	addExternalRef(*url.URL)
 	addInternalRef(string)
 }
+
+// State constants for the state machine.
+const (
+	stateWhitespace = 0
+	stateChars      = 1
+	stateLeftAngle  = 2
+	stateTag        = 3
+	stateParen      = 4
+	stateTagQuote   = 5
+	stateNewline    = 6
+)
+
+// htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping.
+const htmlMarginSlop = 5
+
+// htmlCheckerImpl is the checker implementation built by AmNewHTMLChecker: parser state,
+// buffers, and the rewriters and output filters resolved from the configuration.
+type htmlCheckerImpl struct {
+	config             *HTMLCheckerConfig
+	started            bool
+	finished           bool
+	state              int
+	quoteChar          byte
+	parenLevel         int
+	columns            int
+	lines              int
+	noBreakCount       int
+	triggerWBR         bool
+	outputBuffer       strings.Builder
+	tempBuffer         strings.Builder
+	tagStack           *util.Stack[*tag]
+	counters           map[string]*countingRewriter
+	stringRewriters    []rewriter
+	wordRewriters      []rewriter
+	tagRewriters       []rewriter
+	parenRewriters     []rewriter
+	outputFilters      []outputFilter
+	contextData        map[string]any
+	externalReferences map[*url.URL]bool // NOTE(review): keyed by pointer identity, so equal URLs parsed separately are distinct keys - confirm this is intended
+	internalReferences map[string]bool
+	tagSet             *bitset.BitSet
+}
+
+// copyRewriters looks up each name in rewriterRegistry and returns the rewriters found, in
+// order. A rewriter whose Name is non-empty is wrapped in a countingRewriter, which is also
+// stored in ht.counters under that name. Unknown names are logged and skipped, so the
+// returned slice never contains nil entries.
+func (ht *htmlCheckerImpl) copyRewriters(source []string) []rewriter {
+	dest := make([]rewriter, 0, len(source))
+	for _, name := range source {
+		rw, ok := rewriterRegistry[name]
+		if !ok {
+			log.Errorf("rewriter %s is not found", name)
+			continue
+		}
+		if rw.Name() != "" {
+			crw := MakeCountingRewriter(rw)
+			ht.counters[rw.Name()] = crw
+			rw = crw
+		}
+		dest = append(dest, rw)
+	}
+	return dest
+}
+
+// AmNewHTMLChecker creates an HTML checker that uses the named configuration. It returns an
+// error if the configuration, or the tag set the configuration names, is not registered.
+func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
+	config, ok := configsRegistry[configName]
+	if !ok {
+		return nil, fmt.Errorf("configuration %s not found", configName)
+	}
+	tset, ok := tagSetNameToSet[config.TagSet]
+	if !ok {
+		return nil, fmt.Errorf("tag set %s not found", config.TagSet)
+	}
+	rc := htmlCheckerImpl{
+		config:             config,
+		started:            false,
+		finished:           false,
+		state:              stateWhitespace,
+		parenLevel:         0,
+		columns:            0,
+		lines:              0,
+		noBreakCount:       0,
+		triggerWBR:         false,
+		tagStack:           util.NewStack[*tag](),
+		counters:           make(map[string]*countingRewriter),
+		outputFilters:      make([]outputFilter, 0, len(config.OutputFilters)),
+		contextData:        make(map[string]any),
+		externalReferences: make(map[*url.URL]bool),
+		internalReferences: make(map[string]bool),
+		tagSet:             tset,
+	}
+	rc.stringRewriters = rc.copyRewriters(config.StringRewriters)
+	rc.wordRewriters = rc.copyRewriters(config.WordRewriters)
+	rc.tagRewriters = rc.copyRewriters(config.TagRewriters)
+	rc.parenRewriters = rc.copyRewriters(config.ParenRewriters)
+	for _, name := range config.OutputFilters {
+		f, ok := outputFilterRegistry[name]
+		if !ok {
+			log.Errorf("filter %s is not found", name)
+			continue
+		}
+		rc.outputFilters = append(rc.outputFilters, f)
+	}
+	return &rc, nil
+}
+
+// emitString writes str to the output buffer. With filters, the filter reporting the smallest
+// lengthNoMatch is offered the next byte via tryOutputCharacter; if it declines, the byte is
+// written unchanged. Columns are counted only if countCols is set and word wrap is enabled.
+func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
+	if str == "" {
+		return
+	}
+	realCountCols := countCols && (ht.config.WordWrap > 0)
+	if len(filters) == 0 {
+		ht.outputBuffer.WriteString(str)
+		if realCountCols {
+			ht.columns += len(str)
+		}
+		return
+	}
+	temp := str
+	for len(temp) > 0 {
+		outputLen := len(temp)
+		var stopper outputFilter = nil
+		for _, of := range filters {
+			lnm := of.lengthNoMatch(temp)
+			if lnm >= 0 && lnm < outputLen {
+				outputLen = lnm
+				stopper = of
+			}
+			if outputLen <= 0 {
+				break
+			}
+		}
+		if outputLen > 0 {
+			ht.outputBuffer.WriteString(temp[:outputLen])
+			if realCountCols {
+				ht.columns += outputLen
+			}
+		}
+		if stopper != nil {
+			tmpch := temp[outputLen]
+			outputLen++
+			// NOTE(review): ht.outputBuffer is a strings.Builder value, not a pointer; if
+			// tryOutputCharacter takes it by value, writes to the copy panic - confirm its signature.
+			if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
+				ht.outputBuffer.WriteByte(tmpch)
+			}
+			if realCountCols {
+				ht.columns++
+			}
+		}
+		if outputLen == len(temp) {
+			temp = ""
+		} else if outputLen > 0 {
+			temp = temp[outputLen:]
+		}
+	}
+}
+
+// emitLineBreak is a stub; it currently does nothing.
+func (ht *htmlCheckerImpl) emitLineBreak() {
+	// TODO: not yet implemented
+}
+
+// emitPossibleLineBreak calls emitLineBreak when word wrap is enabled, noBreakCount is not
+// positive, and the column counter has reached the configured wrap width.
+func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
+	if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 && ht.columns >= ht.config.WordWrap {
+		ht.emitLineBreak()
+	}
+}
+
+// doFlushString is a stub that currently always returns false. Finish uses the result to
+// decide whether its end-parse loop should run again.
+func (ht *htmlCheckerImpl) doFlushString() bool {
+	return false // TODO
+}
+
+// parse is a stub; it currently discards its input.
+func (ht *htmlCheckerImpl) parse(str string) {
+	// TODO: not yet implemented
+}
+
+// Append marks the checker as started and passes a non-empty str to the parser.
+// It returns AlreadyFinished if the checker has already finished.
+func (ht *htmlCheckerImpl) Append(str string) error {
+	if ht.finished {
+		return AlreadyFinished
+	}
+	ht.started = true
+	if str != "" {
+		ht.parse(str)
+	}
+	return nil
+}
+
+// Finish resolves any pending parser state and marks the checker as finished.
+// It returns AlreadyFinished if the checker has already finished.
+func (ht *htmlCheckerImpl) Finish() error {
+	if ht.finished {
+		return AlreadyFinished
+	}
+	ht.started = true
+	// This is the "end parse" loop, in which we resolve any funny state the parser has
+	// found itself in and clear out the internal buffers.
+	running := true
+	for running {
+		running = false // make sure we stop unless this is set to true
+		switch ht.state {
+		case stateWhitespace, stateNewline:
+			// do nothing - discard whitespace or newlines at end
+		case stateChars:
+			running = ht.doFlushString() // flush the temporary buffer
+		case stateLeftAngle:
+			// TODO: not yet handled; stateTag, stateParen and stateTagQuote have no case either
+		}
+	}
+	ht.finished = true
+	return nil
+}
diff --git a/htmlcheck/checker_config.go b/htmlcheck/checker_config.go
index d2ad6ce..3d72674 100644
--- a/htmlcheck/checker_config.go
+++ b/htmlcheck/checker_config.go
@@ -12,7 +12,6 @@ package htmlcheck
 import (
 	_ "embed"
 
-	log "github.com/sirupsen/logrus"
 	"gopkg.in/yaml.v3"
 )
 
@@ -36,48 +35,6 @@ type HTMLCheckerConfig struct {
 	DisallowTags []string `yaml:"disallowTags"`
 }
 
-func (cfg *HTMLCheckerConfig) rezOutputFilters() []outputFilter {
-	rc := make([]outputFilter, 0, len(cfg.OutputFilters))
-	for i := range cfg.OutputFilters {
-		f, ok := outputFilterRegistry[cfg.OutputFilters[i]]
-		if ok {
-			rc = append(rc, f)
-		} else {
-			log.Errorf("filter %s is not found", cfg.OutputFilters[i])
-		}
-	}
-	return rc
-}
-
-func rezRewriters(desired []string) []rewriter {
-	rc := make([]rewriter, 0, len(desired))
-	for i := range desired {
-		r, ok := rewriterRegistry[desired[i]]
-		if ok {
-			rc = append(rc, r)
-		} else {
-			log.Errorf("rewriter %s is not found", desired[i])
-		}
-	}
-	return rc
-}
-
-func (cfg *HTMLCheckerConfig) rezStringRewriters() []rewriter {
-	return rezRewriters(cfg.StringRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezWordRewriters() []rewriter {
-	return rezRewriters(cfg.WordRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezTagRewriters() []rewriter {
-	return rezRewriters(cfg.TagRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezParenRewriters() []rewriter {
-	return rezRewriters(cfg.ParenRewriters)
-}
-
 // HTMLCheckerConfigFile represents all the configs as they exist in the file.
 type HTMLCheckerConfigFile struct {
 	Configs []HTMLCheckerConfig `yaml:"configs"`
diff --git a/htmlcheck/rewriter.go b/htmlcheck/rewriter.go
index 80b4ecd..c9c1374 100644
--- a/htmlcheck/rewriter.go
+++ b/htmlcheck/rewriter.go
@@ -294,3 +294,47 @@ func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupDa
 		rescan: false,
 	}
 }
+
+// countingRewriter is a wrapper around rewriter that counts the number of rewrites.
+type countingRewriter struct {
+	inner rewriter
+	count int
+}
+
+// Name returns the rewriter's name.
+func (rw *countingRewriter) Name() string {
+	return rw.inner.Name()
+}
+
+/* Rewrite rewrites the given string data and adds markup before and after if needed.
+ * Parameters:
+ *     data - The data to be rewritten.
+ *     svc - Services interface we can use.
+ * Returns:
+ *     Pointer to markup data, or nil.
+ */
+func (rw *countingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	rc := rw.inner.Rewrite(data, svc)
+	if rc != nil && !rc.rescan { // count only results that are non-nil and not flagged for rescan
+		rw.count++
+	}
+	return rc
+}
+
+// GetCount returns the rewriter's count.
+func (rw *countingRewriter) GetCount() int {
+	return rw.count
+}
+
+// Reset resets the rewriter.
+func (rw *countingRewriter) Reset() {
+	rw.count = 0
+}
+
+// MakeCountingRewriter wraps the rewriter in a countingRewriter.
+func MakeCountingRewriter(rw rewriter) *countingRewriter {
+	return &countingRewriter{
+		inner: rw,
+		count: 0,
+	}
+}
diff --git a/util/stack.go b/util/stack.go
new file mode 100644
index 0000000..60eaf3b
--- /dev/null
+++ b/util/stack.go
@@ -0,0 +1,51 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+// Package util contains utility definitions.
+package util
+
+// Stack is a simple generic slice-backed stack. The zero value is an empty stack ready to use.
+type Stack[T any] struct {
+	elements []T
+}
+
+// IsEmpty returns true if the stack is empty.
+func (stk *Stack[T]) IsEmpty() bool {
+	return len(stk.elements) == 0
+}
+
+// Push adds a value to the top of the stack.
+func (stk *Stack[T]) Push(data T) {
+	stk.elements = append(stk.elements, data)
+}
+
+// Pop removes and returns the top value; it returns the zero value and false if the stack is empty.
+func (stk *Stack[T]) Pop() (T, bool) {
+	var zero T
+	top, ok := stk.Peek()
+	if ok {
+		last := len(stk.elements) - 1
+		stk.elements[last] = zero // clear the slot so the backing array does not keep the popped value alive
+		stk.elements = stk.elements[:last]
+	}
+	return top, ok
+}
+
+// Peek returns the top value without removing it; it returns the zero value and false if the stack is empty.
+func (stk *Stack[T]) Peek() (T, bool) {
+	if stk.IsEmpty() {
+		return *new(T), false
+	}
+	return stk.elements[len(stk.elements)-1], true
+}
+
+// NewStack creates and returns a new, empty stack.
+func NewStack[T any]() *Stack[T] {
+	return &Stack[T]{}
+}