landed the dictionary objects and spelling rewriter, which also allowed me to complete the configuration YAML file
This commit is contained in:
@@ -46,3 +46,54 @@ configs:
|
||||
- emoticon
|
||||
- email
|
||||
- url
|
||||
wordRewriters:
|
||||
- spelling
|
||||
tagRewriters:
|
||||
- emoticon_tag
|
||||
- postlink
|
||||
- userlink
|
||||
- email
|
||||
- url
|
||||
parenRewriters:
|
||||
- userlink
|
||||
tagSet: normal
|
||||
- name: "escaper"
|
||||
wordWrap: 0
|
||||
angles: false
|
||||
parens: false
|
||||
discardHTML: false
|
||||
outputFilters:
|
||||
- html
|
||||
- name: "mail-post"
|
||||
wordWrap: 55
|
||||
angles: true
|
||||
parens: false
|
||||
discardHTML: true
|
||||
discardRejected: true
|
||||
tagSet: normal
|
||||
- name: "post-from-email"
|
||||
wordWrap: 55
|
||||
rewrap: true
|
||||
angles: true
|
||||
parens: true
|
||||
discardHTML: false
|
||||
discardRejected: true
|
||||
discardComments: true
|
||||
discardXML: true
|
||||
outputFilters:
|
||||
- html
|
||||
stringRewriters:
|
||||
- emoticon
|
||||
- email
|
||||
- url
|
||||
tagRewriters:
|
||||
- emoticon_tag
|
||||
- postlink
|
||||
- userlink
|
||||
- email
|
||||
- url
|
||||
parenRewriters:
|
||||
- userlink
|
||||
tagSet: normal
|
||||
disallowTags:
|
||||
- font
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
* Amsterdam Web Communities System
|
||||
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
// The htmlcheck package contains the HTML Checker.
|
||||
package htmlcheck
|
||||
|
||||
import "strings"
|
||||
|
||||
// CompositeDictionary is a dictionary that wraps several base dictionaries, and adds some extra behavior.
|
||||
type CompositeDictionary struct {
|
||||
dicts []SpellingDictionary
|
||||
}
|
||||
|
||||
// Ready returns true if the dictionary has been fully loaded.
|
||||
func (d *CompositeDictionary) Ready() bool {
|
||||
for _, sd := range d.dicts {
|
||||
if !sd.Ready() {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Size returns the number of words in the dictionary.
|
||||
func (d *CompositeDictionary) Size() int {
|
||||
rc := 0
|
||||
for _, sd := range d.dicts {
|
||||
rc += sd.Size()
|
||||
}
|
||||
return rc
|
||||
}
|
||||
|
||||
// checkSimple passes a word to the subdictionaries to check it.
|
||||
func (d *CompositeDictionary) checkSimple(word string) bool {
|
||||
for _, sd := range d.dicts {
|
||||
if sd.CheckWord(word) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// checkHyphenates breaks a hyphenatewd work up into parts and checks each one.
|
||||
func (d *CompositeDictionary) checkHyphenates(word string) bool {
|
||||
parts := strings.Split(word, "-")
|
||||
if len(parts) == 1 {
|
||||
return false // no hyphens
|
||||
}
|
||||
for _, frag := range parts {
|
||||
// each fragment greater than 1 character must be in dictionary
|
||||
if len(frag) > 1 {
|
||||
if !d.checkSimple(frag) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// CheckWord returns true if a word appears in the dictionary.
|
||||
func (d *CompositeDictionary) CheckWord(word string) bool {
|
||||
if len(word) <= 1 {
|
||||
return true // words of length 1 get a free pass
|
||||
}
|
||||
realWord := strings.ToLower(word)
|
||||
if d.checkSimple(realWord) {
|
||||
return true
|
||||
}
|
||||
if strings.HasSuffix(realWord, "'s") {
|
||||
l := len(realWord)
|
||||
base := realWord[:l-2]
|
||||
if d.checkSimple(base) {
|
||||
return true
|
||||
}
|
||||
return d.checkHyphenates(base)
|
||||
}
|
||||
return d.checkHyphenates(realWord)
|
||||
}
|
||||
|
||||
// NewCompositeDict wraps an array of SpellingDictionary objects up in a composite.
|
||||
func NewCompositeDict(dicts []SpellingDictionary) *CompositeDictionary {
|
||||
return &CompositeDictionary{
|
||||
dicts: dicts,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Amsterdam Web Communities System
|
||||
* Copyright (c) 2025 Erbosoft Metaverse Design Solutions, All Rights Reserved
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*/
|
||||
// The htmlcheck package contains the HTML Checker.
|
||||
package htmlcheck
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/derekparker/trie"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// TrieDictionary is a ModSpellingDictionary implemented using a trie.
|
||||
type TrieDictionary struct {
|
||||
mutex sync.Mutex
|
||||
loaded atomic.Bool
|
||||
trie *trie.Trie
|
||||
count int
|
||||
}
|
||||
|
||||
// Ready lets us know if the dictionary is fully loaded.
|
||||
func (d *TrieDictionary) Ready() bool {
|
||||
return d.loaded.Load()
|
||||
}
|
||||
|
||||
// Size returns the number of words in the dictionary.
|
||||
func (d *TrieDictionary) Size() int {
|
||||
d.mutex.Lock()
|
||||
defer d.mutex.Unlock()
|
||||
return d.count
|
||||
}
|
||||
|
||||
// CheckWord returns true if a word is in the dictionary, false if not.
|
||||
func (d *TrieDictionary) CheckWord(word string) bool {
|
||||
d.mutex.Lock()
|
||||
defer d.mutex.Unlock()
|
||||
_, rc := d.trie.Find(strings.ToLower(word))
|
||||
return rc
|
||||
}
|
||||
|
||||
// AddWord adds a new word to the dictionary.
|
||||
func (d *TrieDictionary) AddWord(word string) {
|
||||
d.mutex.Lock()
|
||||
defer d.mutex.Unlock()
|
||||
d.trie.Add(strings.ToLower(word), true)
|
||||
d.count++
|
||||
}
|
||||
|
||||
// DelWord deletes a word from the dictionary.
|
||||
func (d *TrieDictionary) DelWord(word string) {
|
||||
// not implemented for this type
|
||||
}
|
||||
|
||||
// Clear removes all words from the dictionary.
|
||||
func (d *TrieDictionary) Clear() {
|
||||
d.mutex.Lock()
|
||||
defer d.mutex.Unlock()
|
||||
d.trie = trie.New()
|
||||
d.count = 0
|
||||
}
|
||||
|
||||
// loadDict is a goroutine that loads the dictionary in the background.
|
||||
func loadDict(d *TrieDictionary, words []byte) {
|
||||
d.mutex.Lock()
|
||||
defer d.mutex.Unlock()
|
||||
scanner := bufio.NewScanner(strings.NewReader(string(words)))
|
||||
for scanner.Scan() {
|
||||
word := strings.TrimSpace(scanner.Text())
|
||||
if word != "" {
|
||||
d.trie.Add(strings.ToLower(word), true)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("failed to load dictionary: %v", err)
|
||||
}
|
||||
d.loaded.Store(true)
|
||||
}
|
||||
|
||||
// LoadTrieDict creates a TrieDictionary from a byte array that represents a word list (one word per line).
|
||||
func LoadTrieDict(words []byte) *TrieDictionary {
|
||||
rc := TrieDictionary{
|
||||
loaded: atomic.Bool{},
|
||||
trie: trie.New(),
|
||||
count: 0,
|
||||
}
|
||||
rc.loaded.Store(false)
|
||||
go loadDict(&rc, words)
|
||||
return &rc
|
||||
}
|
||||
@@ -9,15 +9,86 @@
|
||||
// The htmlcheck package contains the HTML Checker.
|
||||
package htmlcheck
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
"os"
|
||||
|
||||
"git.erbosoft.com/amy/amsterdam/config"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// SpellingDictionary is the read-only view of a word list used for spell checking.
type SpellingDictionary interface {
	// Ready reports whether the dictionary has been fully loaded.
	Ready() bool
	// Size reports the number of words in the dictionary.
	Size() int
	// CheckWord reports whether the given word is in the dictionary.
	CheckWord(string) bool
}
|
||||
|
||||
// ModSpellingDictionary is an intrerface to a modifiable spelling dictionary.
|
||||
type ModSpellingDictionary interface {
|
||||
SpellingDictionary
|
||||
AddWord(string)
|
||||
DelWord(string)
|
||||
Clear()
|
||||
}
|
||||
|
||||
//go:embed en-us.dict
|
||||
var mainDict []byte
|
||||
|
||||
//go:embed supplement.dict
|
||||
var supplementaryDict []byte
|
||||
|
||||
// SetupDicts sets up the dictionaries and the spelling rewriter.
|
||||
func SetupDicts() {
|
||||
dicts := make([]SpellingDictionary, 2, 3)
|
||||
dicts[0] = LoadTrieDict(mainDict)
|
||||
dicts[1] = LoadTrieDict(supplementaryDict)
|
||||
if config.GlobalConfig.Posting.ExternalDictionary != "" {
|
||||
data, err := os.ReadFile(config.GlobalConfig.Posting.ExternalDictionary)
|
||||
if err == nil {
|
||||
ndict := LoadTrieDict(data)
|
||||
dicts = append(dicts, ndict)
|
||||
} else {
|
||||
log.Errorf("failed to load external dictionary %s: %v", config.GlobalConfig.Posting.ExternalDictionary, err)
|
||||
}
|
||||
}
|
||||
rw := spellingRewriter{
|
||||
dict: NewCompositeDict(dicts),
|
||||
}
|
||||
rewriterRegistry[rw.Name()] = &rw
|
||||
}
|
||||
|
||||
// spellingRewriter is a rewriter that flags spelling errors.
|
||||
type spellingRewriter struct {
|
||||
dict SpellingDictionary
|
||||
}
|
||||
|
||||
// Markup placed around a word that fails the spelling check.
const (
	// defaultBeginError is the markup that indicates the start of an error.
	defaultBeginError = "<span class=\"text-red-600 font-bold\">"
	// defaultEndError is the markup that indicates the end of an error.
	defaultEndError = "</span>"
)
|
||||
|
||||
// Name returns the rewriter's name.
|
||||
func (rw *spellingRewriter) Name() string {
|
||||
return "spelling"
|
||||
}
|
||||
|
||||
/* Rewrite rewrites the given string data and adds markup before and after if needed.
|
||||
* Parameters:
|
||||
* data - The data to be rewritten.
|
||||
* svc - Services interface we can use.
|
||||
* Returns:
|
||||
* Pointer to markup data, or nil.
|
||||
*/
|
||||
func (rw *spellingRewriter) Rewrite(data string, svc rewriterServices) *markupData {
|
||||
if rw.dict.CheckWord(data) {
|
||||
return nil
|
||||
}
|
||||
return &markupData{
|
||||
beginMarkup: defaultBeginError,
|
||||
text: data,
|
||||
endMarkup: defaultEndError,
|
||||
rescan: false,
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user