diff --git a/htmlcheck/checker.go b/htmlcheck/checker.go
index 330f195..885234c 100644
--- a/htmlcheck/checker.go
+++ b/htmlcheck/checker.go
@@ -9,7 +9,16 @@
// The htmlcheck package contains the HTML Checker.
package htmlcheck
-import "net/url"
+import (
+ "errors"
+ "fmt"
+ "net/url"
+ "strings"
+
+ "git.erbosoft.com/amy/amsterdam/util"
+ "github.com/bits-and-blooms/bitset"
+ log "github.com/sirupsen/logrus"
+)
// HTMLChecker is a component that checks HTML and reformats it as needed.
type HTMLChecker interface {
@@ -26,7 +35,8 @@ type HTMLChecker interface {
InternalRefs() ([]string, error)
}
-// var NotYetFinished = errors.New("the HTML checker has not yet been finished")
+// AlreadyFinished is the error returned by Append and Finish when the HTML checker has
+// already been finished.
+var AlreadyFinished = errors.New("the HTML checker has already finished")
+// NotYetFinished is the error for operations that need a finished HTML checker.
+// NOTE(review): not referenced in the code visible here; presumably returned by the
+// result accessors when they are called before Finish - confirm against the callers.
+var NotYetFinished = errors.New("the HTML checker has not yet been finished")
type htmlCheckerBackend interface {
getCheckerAttrValue(string) string
@@ -35,3 +45,209 @@ type htmlCheckerBackend interface {
addExternalRef(*url.URL)
addInternalRef(string)
}
+
+// Parser states of the HTML checker's state machine, numbered consecutively from zero.
+const (
+	stateWhitespace = iota // 0
+	stateChars             // 1
+	stateLeftAngle         // 2
+	stateTag               // 3
+	stateParen             // 4
+	stateTagQuote          // 5
+	stateNewline           // 6
+)
+
+// htmlMarginSlop is the number of characters at the end of a line that are used to
+// control word-wrapping.
+const htmlMarginSlop = 5
+
+// htmlCheckerImpl holds the working state of a single HTML checker instance, as built
+// by AmNewHTMLChecker.
+type htmlCheckerImpl struct {
+ // config is the configuration that was looked up by name when the checker was created.
+ config *HTMLCheckerConfig
+ // started is set on the first call to Append or Finish.
+ started bool
+ // finished is tested by Append and Finish; both return AlreadyFinished when it is set.
+ finished bool
+ // state is the current parser state (one of the state* constants); it starts out as
+ // stateWhitespace.
+ state int
+ // NOTE(review): quoteChar and parenLevel are not used in the code visible here;
+ // presumably the open quote character inside a tag and the parenthesis nesting depth.
+ quoteChar byte
+ parenLevel int
+ // columns is advanced by emitString (in bytes) when column counting is on, and is
+ // compared against config.WordWrap by emitPossibleLineBreak.
+ columns int
+ // NOTE(review): lines is not used in the code visible here; presumably a line counter.
+ lines int
+ // noBreakCount suppresses emitPossibleLineBreak while it is greater than zero.
+ noBreakCount int
+ // NOTE(review): triggerWBR is not used in the code visible here - confirm its meaning.
+ triggerWBR bool
+ // outputBuffer receives everything written by emitString.
+ outputBuffer strings.Builder
+ // tempBuffer is the temporary buffer that Finish flushes via doFlushString when the
+ // parser is in stateChars (per the comment in Finish).
+ tempBuffer strings.Builder
+ // tagStack is a stack of tags; it is created empty and not otherwise used in the code visible here.
+ tagStack *util.Stack[*tag]
+ // counters maps a rewriter name to the counting wrapper installed by copyRewriters.
+ counters map[string]*countingRewriter
+ // The four rewriter slices are filled by copyRewriters from the matching name lists in
+ // the config. A slot stays nil when its name is not found in rewriterRegistry.
+ stringRewriters []rewriter
+ wordRewriters []rewriter
+ tagRewriters []rewriter
+ parenRewriters []rewriter
+ // outputFilters is resolved from config.OutputFilters through outputFilterRegistry.
+ outputFilters []outputFilter
+ // NOTE(review): contextData is not used in the code visible here.
+ contextData map[string]any
+ // NOTE(review): externalReferences is keyed by *url.URL, so two distinct pointers to
+ // equal URLs count as different keys - confirm that identity semantics are intended.
+ externalReferences map[*url.URL]bool
+ internalReferences map[string]bool
+ // tagSet is the tag set registered under config.TagSet in tagSetNameToSet.
+ tagSet *bitset.BitSet
+}
+
+/* copyRewriters resolves rewriter names through rewriterRegistry and stores each result
+ * in dest at the same index as its name in source. A rewriter with a non-empty name is
+ * wrapped in a countingRewriter, which is also recorded in ht.counters under that name.
+ * Parameters:
+ *     dest - Destination slice; must be at least as long as source.
+ *     source - Names of the rewriters to look up.
+ * NOTE(review): a name that is not registered is logged and its slot in dest is left
+ * untouched (nil for a freshly made slice), so consumers of dest must tolerate nil entries.
+ */
+func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) {
+	for idx, name := range source {
+		found, known := rewriterRegistry[name]
+		if !known {
+			log.Errorf("rewriter %s is not found", name)
+			continue
+		}
+		if found.Name() != "" {
+			counting := MakeCountingRewriter(found)
+			ht.counters[found.Name()] = counting
+			found = counting
+		}
+		dest[idx] = found
+	}
+}
+
+/* AmNewHTMLChecker creates a new HTML checker from a named configuration.
+ * Parameters:
+ *     configName - Name of the configuration to look up in configsRegistry.
+ * Returns:
+ *     The new HTML checker, or nil if the configuration or its tag set is unknown.
+ *     Standard Go error status.
+ */
+func AmNewHTMLChecker(configName string) (HTMLChecker, error) {
+	config, ok := configsRegistry[configName]
+	if !ok {
+		return nil, fmt.Errorf("configuration %s not found", configName)
+	}
+	tset, ok := tagSetNameToSet[config.TagSet]
+	if !ok {
+		return nil, fmt.Errorf("tag set %s not found", config.TagSet)
+	}
+	// Flags, counters and the two string builders start at their zero values.
+	rc := &htmlCheckerImpl{
+		config:             config,
+		state:              stateWhitespace,
+		tagStack:           util.NewStack[*tag](),
+		counters:           make(map[string]*countingRewriter),
+		stringRewriters:    make([]rewriter, len(config.StringRewriters)),
+		wordRewriters:      make([]rewriter, len(config.WordRewriters)),
+		tagRewriters:       make([]rewriter, len(config.TagRewriters)),
+		parenRewriters:     make([]rewriter, len(config.ParenRewriters)),
+		outputFilters:      make([]outputFilter, 0, len(config.OutputFilters)),
+		contextData:        make(map[string]any),
+		externalReferences: make(map[*url.URL]bool),
+		internalReferences: make(map[string]bool),
+		tagSet:             tset,
+	}
+	rc.copyRewriters(rc.stringRewriters, config.StringRewriters)
+	rc.copyRewriters(rc.wordRewriters, config.WordRewriters)
+	rc.copyRewriters(rc.tagRewriters, config.TagRewriters)
+	rc.copyRewriters(rc.parenRewriters, config.ParenRewriters)
+	for _, name := range config.OutputFilters {
+		f, ok := outputFilterRegistry[name]
+		if !ok {
+			// Skip unknown filters instead of leaving a nil entry: emitString calls
+			// methods on every filter it is handed.
+			log.Errorf("filter %s is not found", name)
+			continue
+		}
+		rc.outputFilters = append(rc.outputFilters, f)
+	}
+	return rc, nil
+}
+
+/* emitString appends a string to the output buffer, letting the output filters take over
+ * individual characters on the way out.
+ * Parameters:
+ *     str - The string to be emitted; an empty string does nothing.
+ *     filters - Output filters to consult; if there are none, str is written unchanged.
+ *               Every entry is called, so the slice must not contain nil entries.
+ *     countCols - If true, and word wrap is enabled in the configuration, the number of
+ *                 bytes consumed from str is added to the column counter.
+ */
+func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) {
+ if str == "" {
+ return
+ }
+ // Columns are only tracked when the configuration has a positive word-wrap width.
+ realCountCols := countCols && (ht.config.WordWrap > 0)
+ if len(filters) == 0 {
+ ht.outputBuffer.WriteString(str)
+ if realCountCols {
+ ht.columns += len(str)
+ }
+ return
+ }
+ temp := str
+ for len(temp) > 0 {
+ // Find the shortest non-negative lengthNoMatch result among the filters; that prefix
+ // is written unfiltered and the filter that reported it becomes the stopper.
+ // NOTE(review): presumably lengthNoMatch returns the length of the leading run the
+ // filter does not care about, or a negative value for "no match" - confirm.
+ outputLen := len(temp)
+ var stopper outputFilter = nil
+ for _, of := range filters {
+ lnm := of.lengthNoMatch(temp)
+ if lnm >= 0 && lnm < outputLen {
+ outputLen = lnm
+ stopper = of
+ }
+ if outputLen <= 0 {
+ break
+ }
+ }
+ if outputLen > 0 {
+ ht.outputBuffer.WriteString(temp[:outputLen])
+ if realCountCols {
+ ht.columns += outputLen
+ }
+ }
+ if stopper != nil {
+ // The byte right after the unfiltered prefix is offered to the stopper; if the
+ // stopper declines it, the byte is written as-is. Either way it counts as one column.
+ tmpch := temp[outputLen]
+ outputLen++
+ // NOTE(review): ht.outputBuffer is a strings.Builder passed by value here. A
+ // Builder must not be copied once it has been written to, and its write methods
+ // have pointer receivers - this call most likely needs &ht.outputBuffer. Confirm
+ // against the outputFilter interface, which is not visible in this change.
+ if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) {
+ ht.outputBuffer.WriteByte(tmpch)
+ }
+ if realCountCols {
+ ht.columns++
+ }
+ }
+ // Drop what was consumed in this pass and go around again for the remainder.
+ if outputLen == len(temp) {
+ temp = ""
+ } else if outputLen > 0 {
+ temp = temp[outputLen:]
+ }
+ }
+}
+
+// emitLineBreak is called by emitPossibleLineBreak when the word-wrap width has been reached.
+// TODO: the body is still empty, so no line break is actually emitted yet.
+func (ht *htmlCheckerImpl) emitLineBreak() {
+
+}
+
+// emitPossibleLineBreak calls emitLineBreak when word wrap is enabled in the
+// configuration, noBreakCount is not positive, and the column counter has reached the
+// configured wrap width. Otherwise it does nothing.
+func (ht *htmlCheckerImpl) emitPossibleLineBreak() {
+	wrapAt := ht.config.WordWrap
+	if wrapAt <= 0 || ht.noBreakCount > 0 {
+		return
+	}
+	if ht.columns < wrapAt {
+		return
+	}
+	ht.emitLineBreak()
+}
+
+// doFlushString is called by Finish to flush the temporary buffer; Finish runs another
+// pass of its end-parse loop when this returns true.
+// It is currently a stub that always returns false.
+func (ht *htmlCheckerImpl) doFlushString() bool {
+ return false // TODO
+}
+
+// parse is handed each non-empty string passed to Append.
+// TODO: the body is still empty, so appended text is currently discarded.
+func (ht *htmlCheckerImpl) parse(str string) {
+
+}
+
+/* Append feeds more text into the HTML checker.
+ * Parameters:
+ *     str - The text to be parsed; an empty string is accepted and ignored.
+ * Returns:
+ *     AlreadyFinished if the checker has already been finished, otherwise nil.
+ */
+func (ht *htmlCheckerImpl) Append(str string) error {
+	if ht.finished {
+		return AlreadyFinished
+	}
+	// The first call marks the checker as started; later calls leave the flag set.
+	ht.started = true
+	if len(str) > 0 {
+		ht.parse(str)
+	}
+	return nil
+}
+
+/* Finish ends parsing, resolves whatever state the parser was left in, and marks the
+ * checker as finished.
+ * Returns:
+ *     AlreadyFinished if the checker has already been finished, otherwise nil.
+ */
+func (ht *htmlCheckerImpl) Finish() error {
+	if ht.finished {
+		return AlreadyFinished
+	}
+	ht.started = true
+	// This is the "end parse" loop, in which we resolve any funny state the parser has
+	// found itself in and clear out the internal buffers.
+	for running := true; running; {
+		running = false // make sure we stop unless this is set to true
+		switch ht.state {
+		case stateWhitespace, stateNewline:
+			// do nothing - discard whitespace or newlines at end
+		case stateChars:
+			running = ht.doFlushString() // flush the temporary buffer
+		case stateLeftAngle:
+			// TODO: the remaining states are not resolved yet.
+		}
+	}
+	// Record completion so that later Append and Finish calls report AlreadyFinished.
+	ht.finished = true
+	return nil
+}
diff --git a/htmlcheck/checker_config.go b/htmlcheck/checker_config.go
index d2ad6ce..3d72674 100644
--- a/htmlcheck/checker_config.go
+++ b/htmlcheck/checker_config.go
@@ -12,7 +12,6 @@ package htmlcheck
import (
_ "embed"
- log "github.com/sirupsen/logrus"
"gopkg.in/yaml.v3"
)
@@ -36,48 +35,6 @@ type HTMLCheckerConfig struct {
DisallowTags []string `yaml:"disallowTags"`
}
-func (cfg *HTMLCheckerConfig) rezOutputFilters() []outputFilter {
- rc := make([]outputFilter, 0, len(cfg.OutputFilters))
- for i := range cfg.OutputFilters {
- f, ok := outputFilterRegistry[cfg.OutputFilters[i]]
- if ok {
- rc = append(rc, f)
- } else {
- log.Errorf("filter %s is not found", cfg.OutputFilters[i])
- }
- }
- return rc
-}
-
-func rezRewriters(desired []string) []rewriter {
- rc := make([]rewriter, 0, len(desired))
- for i := range desired {
- r, ok := rewriterRegistry[desired[i]]
- if ok {
- rc = append(rc, r)
- } else {
- log.Errorf("rewriter %s is not found", desired[i])
- }
- }
- return rc
-}
-
-func (cfg *HTMLCheckerConfig) rezStringRewriters() []rewriter {
- return rezRewriters(cfg.StringRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezWordRewriters() []rewriter {
- return rezRewriters(cfg.WordRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezTagRewriters() []rewriter {
- return rezRewriters(cfg.TagRewriters)
-}
-
-func (cfg *HTMLCheckerConfig) rezParenRewriters() []rewriter {
- return rezRewriters(cfg.ParenRewriters)
-}
-
// HTMLCheckerConfigFile represents all the configs as they exist in the file.
type HTMLCheckerConfigFile struct {
Configs []HTMLCheckerConfig `yaml:"configs"`
diff --git a/htmlcheck/rewriter.go b/htmlcheck/rewriter.go
index 80b4ecd..c9c1374 100644
--- a/htmlcheck/rewriter.go
+++ b/htmlcheck/rewriter.go
@@ -294,3 +294,47 @@ func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupDa
rescan: false,
}
}
+
+// countingRewriter decorates another rewriter and keeps a tally of its rewrites.
+type countingRewriter struct {
+	inner rewriter // the rewriter being wrapped
+	count int      // rewrites tallied since creation or the last Reset
+}
+
+// Name reports the name of the wrapped rewriter.
+func (rw *countingRewriter) Name() string {
+	return rw.inner.Name()
+}
+
+/* Rewrite passes the data to the wrapped rewriter and tallies the call if it produced
+ * markup that is not flagged for rescan.
+ * Parameters:
+ *     data - The data to be rewritten.
+ *     svc - Services interface handed through to the wrapped rewriter.
+ * Returns:
+ *     Whatever the wrapped rewriter returned: a pointer to markup data, or nil.
+ */
+func (rw *countingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	markup := rw.inner.Rewrite(data, svc)
+	if markup == nil || markup.rescan {
+		return markup
+	}
+	rw.count++
+	return markup
+}
+
+// GetCount reports how many rewrites have been tallied.
+func (rw *countingRewriter) GetCount() int {
+	return rw.count
+}
+
+// Reset sets the tally back to zero.
+func (rw *countingRewriter) Reset() {
+	rw.count = 0
+}
+
+// MakeCountingRewriter returns a countingRewriter around rw with a tally of zero.
+func MakeCountingRewriter(rw rewriter) *countingRewriter {
+	return &countingRewriter{inner: rw}
+}
diff --git a/util/stack.go b/util/stack.go
new file mode 100644
index 0000000..60eaf3b
--- /dev/null
+++ b/util/stack.go
@@ -0,0 +1,51 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+// Package util contains utility definitions.
+package util
+
+// Stack is a simple generic slice-backed stack. The zero value is an empty stack that
+// is ready to use.
+type Stack[T any] struct {
+	elements []T // bottom of the stack is at index 0, top is the last element
+}
+
+// IsEmpty returns true if the stack is empty.
+func (stk *Stack[T]) IsEmpty() bool {
+	return len(stk.elements) == 0
+}
+
+// Push adds a value to the top of the stack.
+func (stk *Stack[T]) Push(data T) {
+	stk.elements = append(stk.elements, data)
+}
+
+// Pop removes and returns the value on the top of the stack. The boolean result is
+// false, and the value is the zero value of T, if the stack is empty.
+func (stk *Stack[T]) Pop() (T, bool) {
+	var zero T
+	last := len(stk.elements) - 1
+	if last < 0 {
+		return zero, false
+	}
+	top := stk.elements[last]
+	// Clear the vacated slot so the backing array does not keep the popped value
+	// (and anything it points to) reachable after it has left the stack.
+	stk.elements[last] = zero
+	stk.elements = stk.elements[:last]
+	return top, true
+}
+
+// Peek returns the value on the top of the stack without removing it. The boolean
+// result is false, and the value is the zero value of T, if the stack is empty.
+func (stk *Stack[T]) Peek() (T, bool) {
+	var zero T
+	if stk.IsEmpty() {
+		return zero, false
+	}
+	return stk.elements[len(stk.elements)-1], true
+}
+
+// NewStack creates and returns a new, empty stack.
+func NewStack[T any]() *Stack[T] {
+	return &Stack[T]{
+		elements: make([]T, 0),
+	}
+}