From eb47b001bbc401993557fca078ee6413a4811f9e Mon Sep 17 00:00:00 2001
From: Amy Gale Ruth Bowersox
Date: Wed, 29 Oct 2025 22:50:25 -0600
Subject: [PATCH] landed enough rewriters and filters to begin building configurations for the HTML checker

---
 database/post_link.go          | 275 +++++++++++++++++++++++++++++++++
 htmlcheck/configs.yaml         |  48 ++++++
 htmlcheck/emoticon_rewriter.go |  72 +++++++--
 htmlcheck/filter.go            |  69 +++++++++
 htmlcheck/rewriter.go          | 169 +++++++++++++++++++-
 5 files changed, 614 insertions(+), 19 deletions(-)
 create mode 100644 database/post_link.go
 create mode 100644 htmlcheck/configs.yaml
 create mode 100644 htmlcheck/filter.go

diff --git a/database/post_link.go b/database/post_link.go
new file mode 100644
index 0000000..ce93d0e
--- /dev/null
+++ b/database/post_link.go
@@ -0,0 +1,275 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+// The database package contains database management and storage logic.
+package database
+
+import (
+	"errors"
+	"math"
+	"strconv"
+	"strings"
+)
+
+// PostLinkData is the structure holding the decoded parts of the post link.
+type PostLinkData struct {
+	Community  string // community alias, or "" when the link did not name one
+	Conference string // conference alias, or "" when the link did not name one
+	Topic      int16  // topic number, or -1 when the link did not name one
+	FirstPost  int32  // first post of the range, or -1 when no post range was given
+	LastPost   int32  // last post of the range, or -1 when open-ended or not given
+}
+
+// NeedsDBVerification returns true if the post link data needs to be verified against the database.
+func (d *PostLinkData) NeedsDBVerification() bool {
+	return d.Community != "" || d.Conference != ""
+}
+
+// VerifyNames verifies the post link data against the database.
+func (d *PostLinkData) VerifyNames() error {
+	if d.Community != "" {
+		comm, err := AmGetCommunityByAlias(d.Community)
+		if err != nil {
+			return err
+		}
+		if comm == nil {
+			return errors.New("community alias not found")
+		}
+	}
+	if d.Conference != "" {
+		conf, err := AmGetConferenceByAlias(d.Conference)
+		if err != nil {
+			return err
+		}
+		if conf == nil {
+			return errors.New("conference alias not found")
+		}
+	}
+	return nil
+}
+
+// Maximum lengths (in bytes) of a whole post link and of its alias components.
+const (
+	maxLinkLength       = 130
+	maxCommunityLength  = 32
+	maxConferenceLength = 64
+)
+
+// validateCommunity checks the length and syntax of a community alias and, if valid, stores it in rc.
+func validateCommunity(name string, rc *PostLinkData) error {
+	if len(name) > maxCommunityLength {
+		return errors.New("community alias is too long")
+	}
+	if !AmIsValidAmsterdamID(name) {
+		return errors.New("community alias is not a valid identifier")
+	}
+	rc.Community = name
+	return nil
+}
+
+// validateConference checks the length and syntax of a conference alias and, if valid, stores it in rc.
+func validateConference(name string, rc *PostLinkData) error {
+	if len(name) > maxConferenceLength {
+		return errors.New("conference alias is too long")
+	}
+	if !AmIsValidAmsterdamID(name) {
+		return errors.New("conference alias is not a valid identifier")
+	}
+	rc.Conference = name
+	return nil
+}
+
+// decodeTopicNumber parses a topic number in 0..math.MaxInt16 and stores it in rc; rc is untouched on error.
+func decodeTopicNumber(data string, rc *PostLinkData) error {
+	v, err := strconv.Atoi(data)
+	if err != nil {
+		return errors.New("invalid topic number reference")
+	}
+	if v < 0 || v > math.MaxInt16 { // negatives would collide with the -1 "no topic" sentinel or wrap in int16
+		return errors.New("topic number out of range")
+	}
+	rc.Topic = int16(v)
+	return nil
+}
+
+// decodePostRange decodes the post ranges (first and last post) and saves them.
+func decodePostRange(data string, rc *PostLinkData) error {
+	pos := strings.IndexByte(data, '-')
+	var tempVal int32 = -1
+	if pos > 0 {
+		temp := data[:pos]
+		data = data[pos+1:]
+		v, err := strconv.ParseInt(temp, 10, 32) // bitSize 32: reject values that would not fit in int32
+		if err != nil {
+			return errors.New("invalid post number reference")
+		}
+		tempVal = int32(v)
+
+		if len(data) == 0 {
+			// range is open-ended (number-)
+			rc.FirstPost = tempVal
+			rc.LastPost = -1
+			return nil
+		}
+	} else if pos == 0 {
+		return errors.New("cannot have - at beginning of post range")
+	}
+
+	v2, err := strconv.ParseInt(data, 10, 32)
+	if err != nil || v2 < 0 { // a second '-' (as in "5--3") would otherwise parse as a negative post number
+		return errors.New("invalid post number reference")
+	}
+	rc.FirstPost = int32(v2)
+	if tempVal >= 0 {
+		if tempVal < rc.FirstPost {
+			// "frontwards" range (low-high) - the low end was parsed first, so put it in FirstPost
+			rc.LastPost = rc.FirstPost
+			rc.FirstPost = tempVal
+		} else {
+			// "backwards" range (high-low) - FirstPost already holds the low end
+			rc.LastPost = tempVal
+		}
+	} else {
+		// a "range" of a single post
+		rc.LastPost = rc.FirstPost
+	}
+	return nil
+}
+
+/* AmDecodePostLink decodes a post link of the form [community!][conference.][topic.][posts] into its components.
+ * Parameters:
+ *     data - The post link to be decoded.
+ * Returns:
+ *     Pointer to structure containing post link data (unspecified parts are "" or -1), or nil on error.
+ *     Standard Go error status.
+ */
+func AmDecodePostLink(data string) (*PostLinkData, error) {
+	if data == "" {
+		return nil, errors.New("empty string")
+	}
+	if len(data) > maxLinkLength {
+		return nil, errors.New("post link string too long")
+	}
+	rc := PostLinkData{
+		Topic:     -1,
+		FirstPost: -1,
+		LastPost:  -1,
+	}
+
+	work := data
+	// First test: Bang
+	pos := strings.IndexByte(work, '!')
+	if pos > 0 {
+		err := validateCommunity(work[:pos], &rc)
+		if err != nil {
+			return nil, err
+		}
+		work = work[pos+1:]
+		if len(work) == 0 {
+			return &rc, nil // community link
+		}
+	} else if pos == 0 {
+		return nil, errors.New("cannot have ! at beginning")
+	}
+
+	// Second test: Dot #1
+	pos = strings.IndexByte(work, '.')
+	if pos < 0 {
+		// no dots in here, must be either "postlink" or "community!conference"
+		var err error
+		if rc.Community == "" {
+			err = decodePostRange(work, &rc)
+		} else {
+			err = validateConference(work, &rc)
+		}
+		if err != nil {
+			return nil, err
+		}
+		return &rc, nil // fully decoded; falling through would slice work[:pos] with pos == -1 and panic
+	}
+
+	// Peel off the initial substring before the dot.
+	confOrTopic := work[:pos]
+	work = work[pos+1:]
+	if len(work) == 0 {
+		// we had "conference." or "topic." or maybe "community!conference."
+		if rc.Community == "" {
+			// it's either "conference." or "topic." - try the latter first
+			err := decodeTopicNumber(confOrTopic, &rc)
+			if err != nil {
+				// it's not a topic number, try it as a conference name
+				err = validateConference(confOrTopic, &rc)
+			}
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			// it was "community!conference."
+			err := validateConference(confOrTopic, &rc)
+			if err != nil {
+				return nil, err
+			}
+		}
+		return &rc, nil // trailing dot: nothing is left for the later stages to parse
+	}
+
+	// Third test: Dot #2
+	pos = strings.IndexByte(work, '.')
+	if pos < 0 {
+		// we had "conference.topic" or "topic.posts" or maybe "community!conference.topic"
+		var err error
+		if rc.Community == "" {
+			// either "conference.topic" or "topic.posts"
+			isTopic := false
+			err = decodeTopicNumber(confOrTopic, &rc)
+			if err != nil {
+				// it's "conference.topic"
+				err = validateConference(confOrTopic, &rc)
+				isTopic = true
+			}
+			if err == nil {
+				if isTopic {
+					err = decodeTopicNumber(work, &rc)
+				} else {
+					err = decodePostRange(work, &rc)
+				}
+			}
+		} else {
+			// we have "community!conference.topic"
+			err = validateConference(confOrTopic, &rc)
+			if err == nil {
+				err = decodeTopicNumber(work, &rc)
+			}
+		}
+		if err != nil {
+			return nil, err
+		}
+		return &rc, nil
+	} else if pos == 0 {
+		return nil, errors.New("cannot have . at beginning of string")
+	}
+
+	// We definitely have "conference.topic.something" or "community!conference.topic.something"
+	err := validateConference(confOrTopic, &rc)
+	if err == nil {
+		err = decodeTopicNumber(work[:pos], &rc)
+	}
+	if err != nil {
+		return nil, err
+	}
+	work = work[pos+1:]
+	if len(work) == 0 {
+		// we had "conference.topic." or "community!conference.topic.", those are both valid
+		return &rc, nil
+	}
+	err = decodePostRange(work, &rc) // the rest must be the post range
+	if err != nil {
+		return nil, err
+	}
+	return &rc, nil
+}
diff --git a/htmlcheck/configs.yaml b/htmlcheck/configs.yaml
new file mode 100644
index 0000000..8ba6e66
--- /dev/null
+++ b/htmlcheck/configs.yaml
@@ -0,0 +1,48 @@
+#
+# Amsterdam Web Communities System
+# Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+#
+configs:
+  - name: "post-body"
+    wordWrap: 55
+    angles: true
+    parens: true
+    discardHTML: false
+    outputFilters:
+      - html
+    stringRewriters:
+      - emoticon
+      - email
+      - url
+    tagRewriters:
+      - emoticon_tag
+      - postlink
+      - userlink
+      - email
+      - url
+    parenRewriters:
+      - userlink
+    tagSet: normal
+  - name: "post-pseud"
+    wordWrap: 0
+    angles: true
+    parens: false
+    discardHTML: false
+    outputFilters:
+      - html
+    tagSet: restricted
+  - name: "preview"
+    wordWrap: 55
+    angles: true
+    parens: true
+    discardHTML: false
+    outputFilters:
+      - html
+    stringRewriters:
+      - emoticon
+      - email
+      - url
diff --git a/htmlcheck/emoticon_rewriter.go b/htmlcheck/emoticon_rewriter.go
index cbd2ff2..83778a4 100644
--- a/htmlcheck/emoticon_rewriter.go
+++ b/htmlcheck/emoticon_rewriter.go
@@ -12,6 +12,7 @@ package htmlcheck
 import (
 	_ "embed"
 	"math"
+	"regexp"
 	"strings"
 
 	"gopkg.in/yaml.v3"
@@ -28,13 +29,13 @@ type EmoticonDef struct {
 type EmoticonConfig struct {
 	PrefixChars string        `yaml:"prefixChars"`
 	Emoticons   []EmoticonDef `yaml:"emoticons"`
+	emos        map[string]*EmoticonDef
 }
 
 // emoticonRewriter is the implementation of rewriter in this file
 type emoticonRewriter struct {
 	config      *EmoticonConfig
 	prefixChars []byte
-	emos        map[string]*EmoticonDef
 	patterns    map[string]string
 	minLength   int
 }
@@ -42,28 +43,27 @@ type emoticonRewriter struct {
 //go:embed emoticons.yaml
 var rawEmoConfig []byte
 
-// EmoticonRewriter is the singleton instance of the emoticon rewriter.
-var EmoticonRewriter rewriter
-
-// init loads the configuration and creates the singleton instance.
+// init loads the embedded emoticon configuration and registers the emoticon rewriters in rewriterRegistry.
 func init() {
 	var cfg EmoticonConfig
 	if err := yaml.Unmarshal(rawEmoConfig, &cfg); err != nil {
 		panic(err)
 	}
+	cfg.emos = make(map[string]*EmoticonDef)
+	for i, def := range cfg.Emoticons {
+		cfg.emos[def.Name] = &(cfg.Emoticons[i])
+	}
 	rw := emoticonRewriter{
 		config:      &cfg,
 		prefixChars: []byte(cfg.PrefixChars),
-		emos:        make(map[string]*EmoticonDef),
 		patterns:    make(map[string]string),
 		minLength:   math.MaxInt,
 	}
-	for i, def := range rw.config.Emoticons {
-		rw.emos[def.Name] = &(rw.config.Emoticons[i])
+	for _, def := range rw.config.Emoticons {
 		for _, p := range def.Patterns {
 			f := false
-			for k := range rw.prefixChars {
-				if p[0] == rw.prefixChars[k] {
+			for i := range rw.prefixChars {
+				if p[0] == rw.prefixChars[i] {
 					f = true
 					break
 				}
@@ -74,7 +74,13 @@ func init() {
 			}
 		}
 	}
-	EmoticonRewriter = &rw
+	rewriterRegistry[rw.Name()] = &rw
+
+	rw2 := emoticonTagRewriter{
+		config: &cfg,
+		re:     regexp.MustCompile(`^ei:\s*(\w+)(\s*/)?\s*$`),
+	}
+	rewriterRegistry[rw2.Name()] = &rw2
 }
 
 // Name returns the rewriter's name.
@@ -113,7 +119,7 @@ func (rw *emoticonRewriter) Rewrite(data string, svc rewriterServices) *markupDa
 			for k, v := range rw.patterns {
 				if strings.HasPrefix(work, k) {
 					looking = false
-					output.WriteString(rw.emos[v].Replace)
+					output.WriteString(rw.config.emos[v].Replace)
 					work = work[len(k):]
 					didReplace = true
 					break
@@ -136,5 +142,45 @@ func (rw *emoticonRewriter) Rewrite(data string, svc rewriterServices) *markupDa
 		return nil
 	}
 	output.WriteString(work)
-	return &markupData{beginMarkup: "", text: output.String(), endMarkup: "", rescan: true}
+	return &markupData{
+		beginMarkup: "",
+		text:        output.String(),
+		endMarkup:   "",
+		rescan:      true,
+	}
+}
+
+// emoticonTagRewriter rewrites tag contents of the form "ei:name" (optional trailing "/") using the named emoticon.
+type emoticonTagRewriter struct {
+	config *EmoticonConfig
+	re     *regexp.Regexp
+}
+
+// Name returns the rewriter's name.
+func (rw *emoticonTagRewriter) Name() string {
+	return "emoticon_tag"
+}
+
+/* Rewrite replaces an emoticon tag with the replacement text configured for the emoticon it names.
+ * Parameters:
+ *     data - The data to be rewritten.
+ *     svc - Services interface we can use.
+ * Returns:
+ *     Pointer to markup data, or nil if data is not an emoticon tag or names an unknown emoticon.
+ */
+func (rw *emoticonTagRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	m := rw.re.FindStringSubmatch(data)
+	if m == nil {
+		return nil
+	}
+	d, ok := rw.config.emos[m[1]]
+	if !ok {
+		return nil
+	}
+	return &markupData{
+		beginMarkup: "",
+		text:        d.Replace,
+		endMarkup:   "",
+		rescan:      false,
+	}
 }
diff --git a/htmlcheck/filter.go b/htmlcheck/filter.go
new file mode 100644
index 0000000..73e5648
--- /dev/null
+++ b/htmlcheck/filter.go
@@ -0,0 +1,69 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+// The htmlcheck package contains the HTML Checker.
+package htmlcheck
+
+import "strings"
+
+// outputFilter is the interface for an HTML checker output filter.
+type outputFilter interface {
+	tryOutputCharacter(*strings.Builder, byte) bool // pointer: a Builder must not be copied, and writes must reach the caller
+	matchCharacter(byte) bool
+	lengthNoMatch(string) int
+}
+
+// outputFilterRegistry contains a listing of all defined output filters.
+var outputFilterRegistry = make(map[string]outputFilter)
+
+// init registers all known filters.
+func init() {
+	f := htmlEncodingFilter{}
+	outputFilterRegistry["html"] = &f
+}
+
+// htmlEncodingFilter is a filter that escapes certain characters in HTML.
+type htmlEncodingFilter struct{}
+
+// htmlEscapedChars is a list of HTML characters that are escaped.
+const htmlEscapedChars = "<>&"
+
+// tryOutputCharacter writes the HTML entity for ch to buf and returns true, or returns false if ch needs no escaping.
+func (f *htmlEncodingFilter) tryOutputCharacter(buf *strings.Builder, ch byte) bool {
+	switch ch {
+	case '<':
+		buf.WriteString("&lt;")
+	case '>':
+		buf.WriteString("&gt;")
+	case '&':
+		buf.WriteString("&amp;")
+	default:
+		return false
+	}
+	return true
+}
+
+// matchCharacter returns true if this character needs to be escaped.
+func (f *htmlEncodingFilter) matchCharacter(ch byte) bool {
+	return strings.IndexByte(htmlEscapedChars, ch) >= 0
+}
+
+// lengthNoMatch returns the length of the prefix of s containing no escaped characters (len(s) if there are none).
+func (f *htmlEncodingFilter) lengthNoMatch(s string) int {
+	rc := len(s)
+	for _, c := range []byte(htmlEscapedChars) {
+		tmp := strings.IndexByte(s, c)
+		if tmp >= 0 && tmp < rc {
+			rc = tmp
+			if rc == 0 {
+				return 0
+			}
+		}
+	}
+	return rc
+}
diff --git a/htmlcheck/rewriter.go b/htmlcheck/rewriter.go
index 6d03b7a..80b4ecd 100644
--- a/htmlcheck/rewriter.go
+++ b/htmlcheck/rewriter.go
@@ -10,9 +10,12 @@
 package htmlcheck
 
 import (
+	"fmt"
 	"net/mail"
 	"net/url"
 	"strings"
+
+	"git.erbosoft.com/amy/amsterdam/database"
 )
 
 // markupData holds the return from rewriters.
@@ -37,12 +40,24 @@ type rewriter interface {
 	Rewrite(string, rewriterServices) *markupData
 }
 
+// rewriterRegistry maps each rewriter's Name() to its instance.
+var rewriterRegistry = make(map[string]rewriter)
+
+// init registers our rewriters with the registry.
+func init() {
+	r1 := emailRewriter{}
+	rewriterRegistry[r1.Name()] = &r1
+	r2 := urlRewriter{}
+	rewriterRegistry[r2.Name()] = &r2
+	r3 := postLinkRewriter{}
+	rewriterRegistry[r3.Name()] = &r3
+	r4 := userLinkRewriter{}
+	rewriterRegistry[r4.Name()] = &r4
+}
+
 // emailRewriter is an implementation of Rewriter that recognizes E-mail addresses.
 type emailRewriter struct{}
 
-// EmailRewriter is a singleton implementration of rewriter for E-mail addresses.
-var EmailRewriter = emailRewriter{}
-
 // Name returns the rewriter's name.
func (rw *emailRewriter) Name() string { return "email" @@ -81,9 +96,6 @@ func (rw *emailRewriter) Rewrite(data string, svc rewriterServices) *markupData // urlRewriter is an implementation of Rewriter that recognizes URLs. type urlRewriter struct{} -// URLRewriter is a singleton implementration of rewriter for URLs. -var URLRewriter = urlRewriter{} - // Name returns the rewriter's name. func (rw *urlRewriter) Name() string { return "url" @@ -137,3 +149,148 @@ func (rw *urlRewriter) Rewrite(data string, svc rewriterServices) *markupData { rescan: false, } } + +// postLinkRewriter is the rewriter that handles links to conference posts. +type postLinkRewriter struct{} + +// postLinkURLPrefix is the default URL prefix for post links. +const postLinkURLPrefix = "x-postlink:" + +// Name returns the rewriter's name. +func (rw *postLinkRewriter) Name() string { + return "postlink" +} + +// buildPostLink constructs a full post link from decoded data and context. +func buildPostLink(decoded, context *database.PostLinkData) string { + var b strings.Builder + started := false + if decoded.Community == "" { + b.WriteString(context.Community) + } else { + b.WriteString(decoded.Community) + started = true + } + b.WriteString("!") + if decoded.Conference == "" { + if started { + return b.String() + } + b.WriteString(context.Conference) + } else { + b.WriteString(decoded.Conference) + } + b.WriteString(".") + if decoded.Topic == -1 { + if started { + return b.String() + } + b.WriteString(fmt.Sprintf("%d", context.Topic)) + } else { + b.WriteString(fmt.Sprintf("%d", decoded.Topic)) + } + b.WriteString(".") + if decoded.FirstPost != -1 { + b.WriteString(fmt.Sprintf("%d", decoded.FirstPost)) + if decoded.FirstPost != decoded.LastPost { + b.WriteString("-") + if decoded.LastPost != -1 { + b.WriteString(fmt.Sprintf("%d", decoded.LastPost)) + } + } + } + return b.String() +} + +/* Rewrite rewrites the given string data and adds markup before and after if needed. 
+ * Parameters:
+ *     data - The data to be rewritten.
+ *     svc - Services interface we can use.
+ * Returns:
+ *     Pointer to markup data, or nil if there is no usable context or data is not a valid, known post link.
+ */
+func (rw *postLinkRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	q := svc.rewriterContextValue("PostLinkDecoderContext")
+	ctxt, ok := q.(*database.PostLinkData) // comma-ok: a missing or wrongly-typed context must not panic
+	if !ok || ctxt == nil {
+		return nil
+	}
+
+	mydata, err := database.AmDecodePostLink(data)
+	if err != nil {
+		return nil
+	}
+	err = mydata.VerifyNames()
+	if err != nil {
+		return nil
+	}
+	// build post link, add it as an internal reference
+	link := buildPostLink(mydata, ctxt)
+	svc.addInternalRef(link)
+	// build the necessary markup and return it - NOTE(review): the markup literals below look stripped; restore from the commit
+	var openA strings.Builder
+	openA.WriteString("")
+	return &markupData{
+		beginMarkup: openA.String(),
+		text:        data,
+		endMarkup:   "",
+		rescan:      false,
+	}
+}
+
+// userLinkRewriter is the rewriter that handles links to user names.
+type userLinkRewriter struct{}
+
+// userLinkURIPrefix is the default URL prefix for user links.
+const userLinkURIPRefix = "x-userlink:"
+
+// Name returns the rewriter's name.
+func (rw *userLinkRewriter) Name() string {
+	return "userlink"
+}
+
+/* Rewrite rewrites the given string data and adds markup before and after if needed.
+ * Parameters:
+ *     data - The data to be rewritten.
+ *     svc - Services interface we can use.
+ * Returns:
+ *     Pointer to markup data, or nil if data is not a valid identifier of at most 64 bytes naming a known user.
+ */
+func (rw *userLinkRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	if data == "" || len(data) > 64 || !database.AmIsValidAmsterdamID(data) {
+		return nil
+	}
+
+	user, err := database.AmGetUserByName(data)
+	if err != nil || user == nil {
+		return nil
+	}
+
+	// build the necessary markup and return it - NOTE(review): the markup literals below look stripped; restore from the commit
+	var openA strings.Builder
+	openA.WriteString("")
+	return &markupData{
+		beginMarkup: openA.String(),
+		text:        data,
+		endMarkup:   "",
+		rescan:      false,
+	}
+}