landed the dictionary objects and spelling rewriter, which also allowed me to complete the configuration YAML file

This commit is contained in:
2025-10-30 22:33:39 -06:00
parent 05a43bcd47
commit 596d7de7de
10 changed files with 323 additions and 2 deletions
+51
View File
@@ -46,3 +46,54 @@ configs:
- emoticon
- email
- url
wordRewriters:
- spelling
tagRewriters:
- emoticon_tag
- postlink
- userlink
- email
- url
parenRewriters:
- userlink
tagSet: normal
- name: "escaper"
wordWrap: 0
angles: false
parens: false
discardHTML: false
outputFilters:
- html
- name: "mail-post"
wordWrap: 55
angles: true
parens: false
discardHTML: true
discardRejected: true
tagSet: normal
- name: "post-from-email"
wordWrap: 55
rewrap: true
angles: true
parens: true
discardHTML: false
discardRejected: true
discardComments: true
discardXML: true
outputFilters:
- html
stringRewriters:
- emoticon
- email
- url
tagRewriters:
- emoticon_tag
- postlink
- userlink
- email
- url
parenRewriters:
- userlink
tagSet: normal
disallowTags:
- font
+90
View File
@@ -0,0 +1,90 @@
/*
* Amsterdam Web Communities System
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
// Package htmlcheck contains the HTML Checker.
package htmlcheck
import "strings"
// CompositeDictionary is a dictionary that wraps several base dictionaries, and adds some extra behavior.
// Its CheckWord lowercases the word, gives words of length 1 or less a free pass, and, when a direct
// lookup fails, retries with a trailing "'s" removed and with the word split on hyphens.
type CompositeDictionary struct {
// dicts holds the wrapped dictionaries; lookups consult them in slice order.
dicts []SpellingDictionary
}
// Ready reports whether every wrapped dictionary has been fully loaded.
// The scan stops at the first subdictionary that is not ready; a composite
// with no subdictionaries is reported as ready.
func (d *CompositeDictionary) Ready() bool {
	ready := true
	for i := 0; ready && i < len(d.dicts); i++ {
		ready = d.dicts[i].Ready()
	}
	return ready
}
// Size returns the sum of the sizes reported by the wrapped dictionaries.
// A word that appears in more than one subdictionary is counted once per subdictionary.
func (d *CompositeDictionary) Size() int {
	var total int
	for i := range d.dicts {
		total += d.dicts[i].Size()
	}
	return total
}
// checkSimple asks each subdictionary in turn whether it contains word, exactly as given
// (no case folding or suffix handling happens here). It stops at the first match.
func (d *CompositeDictionary) checkSimple(word string) bool {
	found := false
	for i := 0; !found && i < len(d.dicts); i++ {
		found = d.dicts[i].CheckWord(word)
	}
	return found
}
// checkHyphenates breaks a hyphenated word up into parts and checks each one.
// It returns false when word contains no hyphen at all. Otherwise every fragment
// longer than one byte must pass checkSimple; shorter fragments (including the
// empty fragments produced by leading, trailing or doubled hyphens) are ignored.
func (d *CompositeDictionary) checkHyphenates(word string) bool {
	if !strings.Contains(word, "-") {
		return false // no hyphens
	}
	for _, piece := range strings.Split(word, "-") {
		if len(piece) <= 1 {
			continue // too short to be worth checking
		}
		if !d.checkSimple(piece) {
			return false
		}
	}
	return true
}
// CheckWord returns true if a word appears in the dictionary.
// Words of length 1 or less (in bytes) are always accepted. Otherwise the word is
// lowercased and accepted if any of the following holds:
//   - a subdictionary contains it directly;
//   - it ends in "'s" and the remainder is contained directly or passes the hyphenation check;
//   - it passes the hyphenation check as a whole.
func (d *CompositeDictionary) CheckWord(word string) bool {
	if len(word) <= 1 {
		return true // words of length 1 get a free pass
	}
	lowered := strings.ToLower(word)
	if d.checkSimple(lowered) {
		return true
	}
	// TrimSuffix only shortens the string when the possessive suffix is present.
	if base := strings.TrimSuffix(lowered, "'s"); base != lowered {
		return d.checkSimple(base) || d.checkHyphenates(base)
	}
	return d.checkHyphenates(lowered)
}
// NewCompositeDict wraps a slice of SpellingDictionary objects up in a composite.
// The slice is stored as passed, not copied, so later changes the caller makes to
// its elements are visible through the returned composite.
func NewCompositeDict(dicts []SpellingDictionary) *CompositeDictionary {
	composite := new(CompositeDictionary)
	composite.dicts = dicts
	return composite
}
+98
View File
@@ -0,0 +1,98 @@
/*
* Amsterdam Web Communities System
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
// Package htmlcheck contains the HTML Checker.
package htmlcheck
import (
"bufio"
"strings"
"sync"
"sync/atomic"
"github.com/derekparker/trie"
log "github.com/sirupsen/logrus"
)
// TrieDictionary is a ModSpellingDictionary implemented using a trie.
// Words are lowercased before being stored or looked up. Instances are created by
// LoadTrieDict, which fills the trie from a word list on a background goroutine.
type TrieDictionary struct {
// mutex is held by Size, CheckWord, AddWord, Clear and loadDict while they touch trie or count.
mutex sync.Mutex
// loaded is set to true by loadDict once the word list has been read; Ready reports it.
loaded atomic.Bool
// trie stores the words; Clear replaces it with a fresh, empty trie.
trie *trie.Trie
// count is the word counter reported by Size.
count int
}
// Ready reports whether the background load of the dictionary has completed.
// It only reads the atomic loaded flag and does not take the mutex.
func (d *TrieDictionary) Ready() bool {
	isLoaded := d.loaded.Load()
	return isLoaded
}
// Size returns the dictionary's word counter, read while holding the mutex.
func (d *TrieDictionary) Size() int {
	d.mutex.Lock()
	n := d.count
	d.mutex.Unlock()
	return n
}
// CheckWord reports whether word, after lowercasing, is stored in the trie.
// The lookup is performed while holding the mutex.
func (d *TrieDictionary) CheckWord(word string) bool {
	key := strings.ToLower(word)
	d.mutex.Lock()
	defer d.mutex.Unlock()
	_, found := d.trie.Find(key)
	return found
}
// AddWord adds a new word to the dictionary.
// The word is lowercased before it is stored. If the lowercased word is already
// present the call is a no-op, so the counter reported by Size is not inflated by
// repeated adds of the same word.
func (d *TrieDictionary) AddWord(word string) {
	key := strings.ToLower(word)
	d.mutex.Lock()
	defer d.mutex.Unlock()
	if _, present := d.trie.Find(key); present {
		return // already stored; adding again must not bump the count
	}
	d.trie.Add(key, true)
	d.count++
}
// DelWord deletes a word from the dictionary.
// For TrieDictionary this is currently a no-op: the body is empty, so the word
// stays in the trie and the count is unchanged.
func (d *TrieDictionary) DelWord(word string) {
// not implemented for this type
}
// Clear removes all words from the dictionary by swapping in a fresh, empty trie
// and resetting the word counter to zero. The loaded flag is left as it was.
func (d *TrieDictionary) Clear() {
	empty := trie.New()
	d.mutex.Lock()
	d.trie, d.count = empty, 0
	d.mutex.Unlock()
}
// loadDict is a goroutine that loads the dictionary in the background.
// It holds d's mutex for the whole load, reads words one line at a time, trims
// surrounding whitespace, lowercases each word and adds it to the trie. Blank lines
// and words already present are skipped, and the word counter is advanced for every
// word actually added so that Size reflects the loaded contents. When the scan
// finishes cleanly the loaded flag is set; a scanner error is logged as fatal.
// Parameters:
//   d - The dictionary to be filled.
//   words - The word list, one word per line.
func loadDict(d *TrieDictionary, words []byte) {
	d.mutex.Lock()
	defer d.mutex.Unlock()
	scanner := bufio.NewScanner(strings.NewReader(string(words)))
	for scanner.Scan() {
		word := strings.ToLower(strings.TrimSpace(scanner.Text()))
		if word == "" {
			continue
		}
		if _, dup := d.trie.Find(word); dup {
			continue // keep the count equal to the number of distinct words
		}
		d.trie.Add(word, true)
		d.count++
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("failed to load dictionary: %v", err)
	}
	d.loaded.Store(true)
}
// LoadTrieDict creates a TrieDictionary from a byte slice that represents a word list (one word per line).
// It returns immediately with an empty dictionary and starts loadDict on a new goroutine
// to fill it; Ready reports true once that goroutine has finished.
func LoadTrieDict(words []byte) *TrieDictionary {
	// The zero values of the mutex, loaded flag and count are already the desired initial state.
	dict := &TrieDictionary{trie: trie.New()}
	go loadDict(dict, words)
	return dict
}
+71
View File
@@ -9,15 +9,86 @@
// Package htmlcheck contains the HTML Checker.
package htmlcheck
import (
_ "embed"
"os"
"git.erbosoft.com/amy/amsterdam/config"
log "github.com/sirupsen/logrus"
)
// SpellingDictionary is a simple dictionary interface offering lookups only; it has no
// methods for changing the dictionary's contents.
type SpellingDictionary interface {
// Ready returns true if the dictionary has been fully loaded.
Ready() bool
// Size returns the number of words in the dictionary.
Size() int
// CheckWord returns true if the given word is in the dictionary.
CheckWord(string) bool
}
// ModSpellingDictionary is an interface to a modifiable spelling dictionary.
// It extends SpellingDictionary with operations that change the dictionary's contents.
type ModSpellingDictionary interface {
SpellingDictionary
// AddWord adds a word to the dictionary.
AddWord(string)
// DelWord deletes a word from the dictionary.
DelWord(string)
// Clear removes all words from the dictionary.
Clear()
}
// mainDict holds the contents of en-us.dict, embedded at build time.
// SetupDicts loads it as the first dictionary.
//
//go:embed en-us.dict
var mainDict []byte
// supplementaryDict holds the contents of supplement.dict, embedded at build time.
// SetupDicts loads it as the second dictionary.
//
//go:embed supplement.dict
var supplementaryDict []byte
// SetupDicts sets up the dictionaries and the spelling rewriter.
// The two embedded word lists are always loaded. If the configuration names an
// external dictionary file, that file is read and loaded as a third dictionary; a
// read failure is logged and otherwise ignored. The dictionaries are wrapped in a
// composite, which backs a spellingRewriter registered in rewriterRegistry under
// the rewriter's own name.
func SetupDicts() {
	dicts := []SpellingDictionary{
		LoadTrieDict(mainDict),
		LoadTrieDict(supplementaryDict),
	}
	if config.GlobalConfig.Posting.ExternalDictionary != "" {
		data, err := os.ReadFile(config.GlobalConfig.Posting.ExternalDictionary)
		if err != nil {
			log.Errorf("failed to load external dictionary %s: %v", config.GlobalConfig.Posting.ExternalDictionary, err)
		} else {
			dicts = append(dicts, LoadTrieDict(data))
		}
	}
	rw := &spellingRewriter{dict: NewCompositeDict(dicts)}
	rewriterRegistry[rw.Name()] = rw
}
// spellingRewriter is a rewriter that flags spelling errors.
// Its Rewrite method wraps any word the dictionary rejects in error markup.
type spellingRewriter struct {
// dict is the dictionary consulted for each word; SetupDicts supplies a composite dictionary.
dict SpellingDictionary
}
// Markup wrapped around a word that fails the spelling check.
const (
	// defaultBeginError is the markup that indicates the start of an error.
	defaultBeginError = "<span class=\"text-red-600 font-bold\">"
	// defaultEndError is the markup that indicates the end of an error.
	defaultEndError = "</span>"
)
// Name returns the rewriter's name, "spelling". SetupDicts uses this value as the
// key under which the rewriter is registered.
func (rw *spellingRewriter) Name() string {
	const registryName = "spelling"
	return registryName
}
/* Rewrite checks the given word against the dictionary and, if the dictionary rejects it,
 * returns markup that wraps the unchanged word in the default error markup.
 * Parameters:
 *     data - The word to be checked.
 *     svc - Services interface we can use (not used by this rewriter).
 * Returns:
 *     Pointer to markup data with rescan disabled, or nil if the word is accepted.
 */
func (rw *spellingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
	var flagged *markupData
	if misspelled := !rw.dict.CheckWord(data); misspelled {
		flagged = &markupData{
			beginMarkup: defaultBeginError,
			text:        data,
			endMarkup:   defaultEndError,
			rescan:      false,
		}
	}
	return flagged
}