diff --git a/config/config.go b/config/config.go index e1ffaf7..33fd351 100644 --- a/config/config.go +++ b/config/config.go @@ -16,7 +16,7 @@ import ( "os" argparse "github.com/alexflint/go-arg" - "github.com/labstack/gommon/log" + log "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" ) @@ -86,6 +86,9 @@ type AmConfig struct { Prioritize string `yaml:"prioritize"` } `yaml:"countryList"` } `yaml:"rendering"` + Posting struct { + ExternalDictionary string `yaml:"externalDictionary"` + } `yaml:"posting"` } //go:embed default.yaml @@ -164,6 +167,7 @@ func overlayConfig(dest *AmConfig, loaded *AmConfig, defaults *AmConfig) { dest.Rendering.TemplateDir = overlayString(loaded.Rendering.TemplateDir, defaults.Rendering.TemplateDir) dest.Rendering.CookieKey = overlayString(loaded.Rendering.CookieKey, defaults.Rendering.CookieKey) dest.Rendering.CountryList.Prioritize = overlayString(loaded.Rendering.CountryList.Prioritize, defaults.Rendering.CountryList.Prioritize) + dest.Posting.ExternalDictionary = overlayString(loaded.Posting.ExternalDictionary, defaults.Posting.ExternalDictionary) } // SetupConfig loads the command line arguments, loads the config file, and prepares GlobalConfig. diff --git a/config/default.yaml b/config/default.yaml index 79704d3..d9c1fff 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -43,3 +43,5 @@ rendering: cookiekey: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz countryList: prioritize: US +posting: + externalDictionary: "" diff --git a/database/audit.go b/database/audit.go index d7c4a98..a6b148b 100644 --- a/database/audit.go +++ b/database/audit.go @@ -13,7 +13,7 @@ import ( "fmt" "time" - "github.com/labstack/gommon/log" + log "github.com/sirupsen/logrus" ) // AuditRecord holds an audit record instance. 
diff --git a/go.mod b/go.mod
index ffcdb4a..a7def4e 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/alexflint/go-arg v1.6.0
 	github.com/biter777/countries v1.7.5
 	github.com/bits-and-blooms/bitset v1.24.0
+	github.com/derekparker/trie v0.0.0-20230829180723-39f4de51ef7d
 	github.com/disintegration/imaging v1.6.2
 	github.com/go-sql-driver/mysql v1.9.3
 	github.com/gorilla/sessions v1.4.0
diff --git a/go.sum b/go.sum
index c475121..ba283a6 100644
--- a/go.sum
+++ b/go.sum
@@ -15,6 +15,8 @@ github.com/bits-and-blooms/bitset v1.24.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/derekparker/trie v0.0.0-20230829180723-39f4de51ef7d h1:hUWoLdw5kvo2xCsqlsIBMvWUc1QCSsCYD2J2+Fg6YoU=
+github.com/derekparker/trie v0.0.0-20230829180723-39f4de51ef7d/go.mod h1:C7Es+DLenIpPc9J6IYw4jrK0h7S9bKj4DNl8+KxGEXU=
 github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c=
 github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
 github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
diff --git a/htmlcheck/configs.yaml b/htmlcheck/configs.yaml
index 8ba6e66..093bd64 100644
--- a/htmlcheck/configs.yaml
+++ b/htmlcheck/configs.yaml
@@ -46,3 +46,54 @@ configs:
       - emoticon
       - email
       - url
+    wordRewriters:
+      - spelling
+    tagRewriters:
+      - emoticon_tag
+      - postlink
+      - userlink
+      - email
+      - url
+    parenRewriters:
+      - userlink
+    tagSet: normal
+  - name: "escaper"
+    wordWrap: 0
+    angles: false
+    parens: false
+    discardHTML: false
+    outputFilters:
+      - html
+  - name: "mail-post"
+    wordWrap: 55
+    angles: true
+    parens: false
+    discardHTML: true
+    discardRejected: true
+    tagSet: normal
+  - name: "post-from-email"
+    wordWrap: 55
+    rewrap: true
+    angles: true
+    parens: true
+    discardHTML: false
+    discardRejected: true
+    discardComments: true
+    discardXML: true
+    outputFilters:
+      - html
+    stringRewriters:
+      - emoticon
+      - email
+      - url
+    tagRewriters:
+      - emoticon_tag
+      - postlink
+      - userlink
+      - email
+      - url
+    parenRewriters:
+      - userlink
+    tagSet: normal
+    disallowTags:
+      - font
diff --git a/htmlcheck/dict_composite.go b/htmlcheck/dict_composite.go
new file mode 100644
index 0000000..71b7e63
--- /dev/null
+++ b/htmlcheck/dict_composite.go
@@ -0,0 +1,90 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+// The htmlcheck package contains the HTML Checker.
+package htmlcheck
+
+import "strings"
+
+// CompositeDictionary is a dictionary that wraps several base dictionaries, and adds some extra behavior.
+type CompositeDictionary struct {
+	dicts []SpellingDictionary
+}
+
+// Ready returns true only if every wrapped dictionary reports that it has been fully loaded.
+func (d *CompositeDictionary) Ready() bool {
+	for _, sd := range d.dicts {
+		if !sd.Ready() {
+			return false
+		}
+	}
+	return true
+}
+
+// Size returns the summed sizes of the wrapped dictionaries (a word held by several is counted once per dictionary).
+func (d *CompositeDictionary) Size() int {
+	rc := 0
+	for _, sd := range d.dicts {
+		rc += sd.Size()
+	}
+	return rc
+}
+
+// checkSimple reports whether any wrapped dictionary accepts the word exactly as given.
+func (d *CompositeDictionary) checkSimple(word string) bool {
+	for _, sd := range d.dicts {
+		if sd.CheckWord(word) {
+			return true
+		}
+	}
+	return false
+}
+
+// checkHyphenates breaks a hyphenated word up into parts and checks each one.
+func (d *CompositeDictionary) checkHyphenates(word string) bool {
+	if !strings.Contains(word, "-") {
+		return false // no hyphens, so nothing to check here
+	}
+	for _, piece := range strings.Split(word, "-") {
+		// pieces of one byte or less are waved through; longer ones must be known
+		if len(piece) <= 1 {
+			continue
+		}
+		if !d.checkSimple(piece) {
+			return false
+		}
+	}
+	return true
+}
+
+// CheckWord returns true if a word appears in the dictionary. The word is lower-cased first, and a
+// lookup that fails is retried without a trailing "'s" and then as a hyphenated compound; words of
+// one byte or less are always accepted.
+func (d *CompositeDictionary) CheckWord(word string) bool {
+	if len(word) <= 1 {
+		return true // too short to be worth flagging
+	}
+	lower := strings.ToLower(word)
+	if d.checkSimple(lower) {
+		return true
+	}
+	// Drop a possessive "'s" if there is one; stem == lower when there is none.
+	stem := strings.TrimSuffix(lower, "'s")
+	if stem != lower && d.checkSimple(stem) {
+		return true
+	}
+	// Last resort: accept the word if every hyphen-separated part is known.
+	return d.checkHyphenates(stem)
+}
+
+// NewCompositeDict builds a CompositeDictionary that consults the given dictionaries in slice order.
+func NewCompositeDict(dicts []SpellingDictionary) *CompositeDictionary {
+	composite := new(CompositeDictionary)
+	composite.dicts = dicts // the slice is stored as passed, not copied
+	return composite
+}
diff --git a/htmlcheck/dict_trie.go b/htmlcheck/dict_trie.go
new file mode 100644
index 0000000..36478da
--- /dev/null
+++ b/htmlcheck/dict_trie.go
@@ -0,0 +1,98 @@
+/*
+ * Amsterdam Web Communities System
+ * Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+// The htmlcheck package contains the HTML Checker.
+package htmlcheck
+
+import (
+	"bufio"
+	"strings"
+	"sync"
+	"sync/atomic"
+
+	"github.com/derekparker/trie"
+	log "github.com/sirupsen/logrus"
+)
+
+// TrieDictionary is a ModSpellingDictionary implemented using a trie.
+type TrieDictionary struct {
+	mutex  sync.Mutex  // held by every method that touches trie or count
+	loaded atomic.Bool // set to true by loadDict once the word list has been read
+	trie   *trie.Trie
+	count  int // number of words added since creation or the last Clear
+}
+
+// Ready lets us know if the dictionary is fully loaded.
+func (d *TrieDictionary) Ready() bool {
+	return d.loaded.Load()
+}
+
+// Size returns the number of words added to the dictionary (by loading or by AddWord).
+func (d *TrieDictionary) Size() int {
+	d.mutex.Lock()
+	defer d.mutex.Unlock()
+	return d.count
+}
+
+// CheckWord returns true if a word is in the dictionary, false if not. The lookup is case-insensitive.
+func (d *TrieDictionary) CheckWord(word string) bool {
+	d.mutex.Lock()
+	defer d.mutex.Unlock()
+	_, rc := d.trie.Find(strings.ToLower(word))
+	return rc
+}
+
+// AddWord adds a new word to the dictionary, stored in lower case.
+func (d *TrieDictionary) AddWord(word string) {
+	d.mutex.Lock()
+	defer d.mutex.Unlock()
+	d.trie.Add(strings.ToLower(word), true)
+	d.count++
+}
+
+// DelWord deletes a word from the dictionary. It is currently a no-op for TrieDictionary.
+func (d *TrieDictionary) DelWord(word string) {
+	// not implemented for this type
+}
+
+// Clear removes all words from the dictionary.
+func (d *TrieDictionary) Clear() {
+	d.mutex.Lock()
+	defer d.mutex.Unlock()
+	d.trie = trie.New()
+	d.count = 0
+}
+
+// loadDict is a goroutine that loads the dictionary in the background, counting each word it adds.
+func loadDict(d *TrieDictionary, words []byte) {
+	d.mutex.Lock()
+	defer d.mutex.Unlock()
+	scanner := bufio.NewScanner(strings.NewReader(string(words)))
+	for scanner.Scan() {
+		if word := strings.TrimSpace(scanner.Text()); word != "" {
+			d.trie.Add(strings.ToLower(word), true)
+			d.count++ // keep Size() in step with the trie, as AddWord does
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		log.Fatalf("failed to load dictionary: %v", err)
+	}
+	d.loaded.Store(true)
+}
+
+// LoadTrieDict creates a TrieDictionary from a byte array that represents a word list (one word per line).
+func LoadTrieDict(words []byte) *TrieDictionary {
+	// The zero values of loaded (false) and count (0) are the intended
+	// starting state, so only the trie itself needs to be allocated.
+	dict := &TrieDictionary{
+		trie: trie.New(),
+	}
+	// Fill the trie in the background; Ready() reports when that has finished.
+	go loadDict(dict, words)
+	return dict
+}
diff --git a/htmlcheck/dictionary.go b/htmlcheck/dictionary.go
index 4c537dd..07d95e6 100644
--- a/htmlcheck/dictionary.go
+++ b/htmlcheck/dictionary.go
@@ -9,15 +9,86 @@
 // The htmlcheck package contains the HTML Checker.
 package htmlcheck
 
+import (
+	_ "embed"
+	"os"
+
+	"git.erbosoft.com/amy/amsterdam/config"
+	log "github.com/sirupsen/logrus"
+)
+
+// SpellingDictionary is the read-only view of a dictionary: readiness, size, and word lookup.
 type SpellingDictionary interface {
 	Ready() bool
 	Size() int
 	CheckWord(string) bool
 }
 
+// ModSpellingDictionary is an interface to a modifiable spelling dictionary.
 type ModSpellingDictionary interface {
 	SpellingDictionary
 	AddWord(string)
 	DelWord(string)
 	Clear()
 }
+
+//go:embed en-us.dict
+var mainDict []byte
+
+//go:embed supplement.dict
+var supplementaryDict []byte
+
+// SetupDicts loads the embedded dictionaries (plus the optional external one) and registers the spelling rewriter.
+func SetupDicts() {
+	dicts := []SpellingDictionary{
+		LoadTrieDict(mainDict),
+		LoadTrieDict(supplementaryDict),
+	}
+	if path := config.GlobalConfig.Posting.ExternalDictionary; path != "" {
+		// an unreadable external dictionary is logged and skipped rather than treated as fatal
+		if data, err := os.ReadFile(path); err != nil {
+			log.Errorf("failed to load external dictionary %s: %v", path, err)
+		} else {
+			dicts = append(dicts, LoadTrieDict(data))
+		}
+	}
+	rw := &spellingRewriter{
+		dict: NewCompositeDict(dicts),
+	}
+	rewriterRegistry[rw.Name()] = rw
+}
+
+// spellingRewriter is a rewriter that wraps words its dictionary does not recognize in error markup.
+type spellingRewriter struct {
+	dict SpellingDictionary
+}
+
+// defaultBeginError is the markup emitted immediately before a flagged word.
+const defaultBeginError = ""
+
+// defaultEndError is the markup that indicates the end of an error.
+const defaultEndError = ""
+
+// Name returns the key under which this rewriter is registered.
+func (rw *spellingRewriter) Name() string {
+	return "spelling"
+}
+
+/* Rewrite checks a single word against the dictionary and wraps it in error markup if it is unknown.
+ * Parameters:
+ *     data - The word to be checked.
+ *     svc - Services interface we can use (not consulted by this rewriter).
+ * Returns:
+ *     Pointer to markup data for a flagged word, or nil if the dictionary accepts it.
+ */
+func (rw *spellingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
+	if rw.dict.CheckWord(data) {
+		return nil // spelled correctly: nothing to mark up
+	}
+	// rescan is deliberately left at false, its zero value
+	flagged := new(markupData)
+	flagged.beginMarkup = defaultBeginError
+	flagged.text = data
+	flagged.endMarkup = defaultEndError
+	return flagged
+}
diff --git a/main.go b/main.go
index a30bea4..246750c 100644
--- a/main.go
+++ b/main.go
@@ -21,6 +21,7 @@ import (
 	"git.erbosoft.com/amy/amsterdam/config"
 	"git.erbosoft.com/amy/amsterdam/database"
 	"git.erbosoft.com/amy/amsterdam/email"
+	"git.erbosoft.com/amy/amsterdam/htmlcheck"
 	"git.erbosoft.com/amy/amsterdam/ui"
 	"github.com/labstack/echo-contrib/session"
 	"github.com/labstack/echo/v4"
@@ -101,6 +102,7 @@ func main() {
 	defer closer()
 	closer = email.SetupMailSender()
 	defer closer()
+	htmlcheck.SetupDicts()
 	ui.SetupTemplates()
 	closer = ui.SetupSessionManager()
 	defer closer()