From 4f9cdde1f2733df823491ac81189cbe07ae3eae1 Mon Sep 17 00:00:00 2001 From: Amy Gale Ruth Bowersox Date: Sun, 2 Nov 2025 23:08:40 -0700 Subject: [PATCH] HTML checker is code complete, needs a pass for documentation --- htmlcheck/checker.go | 801 +++++++++++++++++++++++++++++++++++- htmlcheck/checker_config.go | 39 +- htmlcheck/filter.go | 14 +- htmlcheck/rewriter.go | 4 + util/stack.go | 23 +- util/util.go | 60 ++- 6 files changed, 885 insertions(+), 56 deletions(-) diff --git a/htmlcheck/checker.go b/htmlcheck/checker.go index 885234c..32aa830 100644 --- a/htmlcheck/checker.go +++ b/htmlcheck/checker.go @@ -12,8 +12,11 @@ package htmlcheck import ( "errors" "fmt" + "maps" "net/url" "strings" + "unicode" + "unicode/utf8" "git.erbosoft.com/amy/amsterdam/util" "github.com/bits-and-blooms/bitset" @@ -35,8 +38,8 @@ type HTMLChecker interface { InternalRefs() ([]string, error) } -var AlreadyFinished = errors.New("the HTML checker has already finished") -var NotYetFinished = errors.New("the HTML checker has not yet been finished") +var ErrAlreadyFinished = errors.New("the HTML checker has already finished") +var ErrNotYetFinished = errors.New("the HTML checker has not yet been finished") type htmlCheckerBackend interface { getCheckerAttrValue(string) string @@ -60,6 +63,9 @@ const ( // htmlMarginSlop is a number of characters at the end of the line used to control word-wrapping. const htmlMarginSlop = 5 +// hyphApos is used to find hyphens and apostrophes. 
+const hyphApos = "-'" + type htmlCheckerImpl struct { config *HTMLCheckerConfig started bool @@ -80,6 +86,7 @@ type htmlCheckerImpl struct { tagRewriters []rewriter parenRewriters []rewriter outputFilters []outputFilter + rawOutputFilters []outputFilter contextData map[string]any externalReferences map[*url.URL]bool internalReferences map[string]bool @@ -102,6 +109,17 @@ func (ht *htmlCheckerImpl) copyRewriters(dest []rewriter, source []string) { } } +func (ht *htmlCheckerImpl) copyOutputFilters(dest []outputFilter, source []string) { + for i := range source { + f, ok := outputFilterRegistry[source[i]] + if ok { + dest[i] = f + } else { + log.Errorf("filter %s is not found", source[i]) + } + } +} + func AmNewHTMLChecker(configName string) (HTMLChecker, error) { config, ok := configsRegistry[configName] if !ok { @@ -128,6 +146,7 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) { tagRewriters: make([]rewriter, len(config.TagRewriters)), parenRewriters: make([]rewriter, len(config.ParenRewriters)), outputFilters: make([]outputFilter, len(config.OutputFilters)), + rawOutputFilters: make([]outputFilter, len(config.RawOutputFilters)), contextData: make(map[string]any), externalReferences: make(map[*url.URL]bool), internalReferences: make(map[string]bool), @@ -137,15 +156,66 @@ func AmNewHTMLChecker(configName string) (HTMLChecker, error) { rc.copyRewriters(rc.wordRewriters, config.WordRewriters) rc.copyRewriters(rc.tagRewriters, config.TagRewriters) rc.copyRewriters(rc.parenRewriters, config.ParenRewriters) - for i := range config.OutputFilters { - f, ok := outputFilterRegistry[config.OutputFilters[i]] - if ok { - rc.outputFilters[i] = f - } else { - log.Errorf("filter %s is not found", config.OutputFilters[i]) + rc.copyOutputFilters(rc.outputFilters, config.OutputFilters) + rc.copyOutputFilters(rc.rawOutputFilters, config.RawOutputFilters) + return &rc, nil +} + +func (ht *htmlCheckerImpl) getCheckerAttrValue(name string) string { + if name == 
"ANCHORTAIL" { + return ht.config.AnchorTail + } + return "" +} + +func (ht *htmlCheckerImpl) sendTagMessage(msg string) { + switch msg { + case "NOBR": + ht.noBreakCount++ + case "/NOBR": + ht.noBreakCount-- + case "WBR": + ht.triggerWBR = true + } +} + +func (ht *htmlCheckerImpl) getCheckerContextValue(name string) any { + return ht.contextData[name] +} + +func (ht *htmlCheckerImpl) addExternalRef(ref *url.URL) { + ht.externalReferences[ref] = true +} + +func (ht *htmlCheckerImpl) addInternalRef(ref string) { + ht.internalReferences[ref] = true +} + +func (ht *htmlCheckerImpl) rewriterAttrValue(name string) string { + return ht.getCheckerAttrValue(name) +} + +func (ht *htmlCheckerImpl) rewriterContextValue(name string) any { + return ht.contextData[name] +} + +func (ht *htmlCheckerImpl) emitRune(ch rune, filters []outputFilter, countCols bool) { + handled := false + // try each output filter to see what we can do; with no filters the loop is skipped and the rune is written raw + for _, of := range filters { + handled = of.tryOutputRune(ht.outputBuffer, ch) + if handled { + break // found a filter to handle it, done + } + } + if !handled { // output the raw character + ht.outputBuffer.WriteRune(ch) + } + if countCols { + if ht.config.WordWrap > 0 { // columns are only tracked when word wrap is enabled + ht.columns++ } } - return &rc } func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countCols bool) { @@ -154,42 +224,52 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC } realCountCols := countCols && (ht.config.WordWrap > 0) if len(filters) == 0 { + // if there are no filters, just output the whole thing ht.outputBuffer.WriteString(str) if realCountCols { - ht.columns += len(str) + ht.columns += utf8.RuneCountInString(str) } return } temp := str for len(temp) > 0 { + // We output as much of the string as we possibly can at once. Assume, for now, we'll output the whole thing. outputLen := len(temp) + + // Now look at each of the output filters to see if we should try outputting a lesser amount + // (i.e. 
does the string contain a "stopper" that one of the filters would like to mogrify?) var stopper outputFilter = nil for _, of := range filters { + // find the length of characters that DOESN'T match this filter lnm := of.lengthNoMatch(temp) if lnm >= 0 && lnm < outputLen { + // we've found a new stopper - record the length and the filter outputLen = lnm stopper = of } if outputLen <= 0 { - break + break // nothing left to do here } } if outputLen > 0 { + // move over the unaltered characters first ht.outputBuffer.WriteString(temp[:outputLen]) if realCountCols { - ht.columns += outputLen + ht.columns += utf8.RuneCountInString(temp[:outputLen]) } } if stopper != nil { - tmpch := temp[outputLen] - outputLen++ - if !stopper.tryOutputCharacter(ht.outputBuffer, tmpch) { - ht.outputBuffer.WriteByte(tmpch) + // one of the output filters stopped us, try invoking it + tmpch, bsiz := utf8.DecodeRuneInString(temp[outputLen:]) + outputLen += bsiz + if !stopper.tryOutputRune(ht.outputBuffer, tmpch) { + ht.outputBuffer.WriteRune(tmpch) } if realCountCols { ht.columns++ } } + // Chop the string and go around again. 
if outputLen == len(temp) { temp = "" } else if outputLen > 0 { @@ -199,7 +279,11 @@ func (ht *htmlCheckerImpl) emitString(str string, filters []outputFilter, countC } func (ht *htmlCheckerImpl) emitLineBreak() { - + ht.emitString("\r\n", ht.rawOutputFilters, false) + if ht.config.WordWrap > 0 { + ht.columns = 0 + } + ht.lines++ } func (ht *htmlCheckerImpl) emitPossibleLineBreak() { @@ -208,17 +292,572 @@ func (ht *htmlCheckerImpl) emitPossibleLineBreak() { } } +func (ht *htmlCheckerImpl) ensureSpaceOnLine(nchars int) { + if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 { + // add a line break if needed here + remainSpace := ht.config.WordWrap - ht.columns + if remainSpace < nchars { + ht.emitLineBreak() + } + } +} + +func (ht *htmlCheckerImpl) emitMarkupData(md *markupData) { + if !md.rescan { + ht.ensureSpaceOnLine(utf8.RuneCountInString(md.text)) // columns are counted in runes, not bytes + ht.emitString(md.beginMarkup, ht.rawOutputFilters, false) + ht.emitString(md.text, ht.outputFilters, true) + ht.emitString(md.endMarkup, ht.rawOutputFilters, false) + } +} + +func (ht *htmlCheckerImpl) emitBracketedMarkupData(md *markupData, prefix rune, suffix rune) { + if !md.rescan { + l := utf8.RuneCountInString(md.text) // columns are counted in runes, not bytes + if l > 0 { + l += 2 + } + ht.ensureSpaceOnLine(l) + if len(md.text) > 0 { + ht.emitRune(prefix, ht.outputFilters, true) + } + ht.emitString(md.beginMarkup, ht.rawOutputFilters, false) + ht.emitString(md.text, ht.outputFilters, true) + ht.emitString(md.endMarkup, ht.rawOutputFilters, false) + if len(md.text) > 0 { + ht.emitRune(suffix, ht.outputFilters, true) + } + } +} + +func (ht *htmlCheckerImpl) doFlushWhitespace() { + outputLen := ht.tempBuffer.Len() + if outputLen > 0 { + forceLineBreak := false + if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 { + // adjust output if necessary for wordwrapping + remainSpace := ht.config.WordWrap - ht.columns + if remainSpace < outputLen { + outputLen = remainSpace + } + if outputLen <= 0 { + // this means that NONE of the whitespace would fit on this line...add a line break + forceLineBreak 
= true + outputLen = 0 + } + } + if forceLineBreak { + ht.emitLineBreak() + } + if outputLen > 0 { + ht.emitString(ht.tempBuffer.String()[:outputLen], ht.outputFilters, true) + } + ht.tempBuffer.Reset() + } +} + +func (ht *htmlCheckerImpl) doFlushNewlines() { + // Measure the number of line breaks we have. + lineBreaks, crs := 0, 0 + for _, ch := range []byte(ht.tempBuffer.String()) { + switch ch { + case '\r': + crs++ + case '\n': + crs = 0 + lineBreaks++ + } + } + if crs > 0 { + lineBreaks++ + } + + // Adjust the number of line breaks if rewrap is in effect. + if ht.config.Rewrap { + if lineBreaks < 2 { + // convert a single line break to whitespace + ht.tempBuffer.Reset() + ht.tempBuffer.WriteByte(' ') + ht.state = stateWhitespace + return + } else { + lineBreaks = 2 // compress out multiple blank lines + } + } + + for lineBreaks > 0 { + ht.emitLineBreak() + lineBreaks-- + } + ht.tempBuffer.Reset() + ht.state = stateWhitespace +} + +func (ht *htmlCheckerImpl) emitFromStartOfTempBuffer(nrunes int) { + if nrunes > 0 { + if ht.config.WordWrap > 0 && ht.noBreakCount <= 0 { + for nrunes > 0 { + curlen := min(nrunes, ht.config.WordWrap-ht.columns) + if curlen > 0 { + s := ht.tempBuffer.String() + bcurlen := util.RunesToBytes(s, curlen) + ht.emitString(s[:bcurlen], ht.outputFilters, true) + ht.tempBuffer.Reset() + ht.tempBuffer.WriteString(s[bcurlen:]) + nrunes -= curlen + } + if ht.columns >= ht.config.WordWrap { + ht.emitLineBreak() + } + } + } else { + s := ht.tempBuffer.String() + bnrunes := util.RunesToBytes(s, nrunes) + ht.emitString(s[:bnrunes], ht.outputFilters, true) + ht.tempBuffer.Reset() + ht.tempBuffer.WriteString(s[bnrunes:]) + } + } +} + +func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *markupData { + for _, r := range rewriters { + rc := r.Rewrite(data, ht) + if rc != nil { + return rc + } + } + return nil +} + func (ht *htmlCheckerImpl) doFlushString() bool { - return false // TODO + md := ht.attemptRewrite(ht.stringRewriters, 
ht.tempBuffer.String()) + if md != nil { + ht.emitMarkupData(md) + ht.tempBuffer.Reset() + if md.rescan { + ht.parse(md.all()) + return true + } + return false + } + + first := true + for ht.tempBuffer.Len() > 0 { + sublen, isWord := util.WordRunLength(ht.tempBuffer.String()) + if isWord { + // we want to check the word, but first we must eliminate leading hyphens and apostrophes + hyphCount := 0 + for _, ch := range ht.tempBuffer.String() { + if hyphCount == sublen || !strings.ContainsRune(hyphApos, ch) { + break + } + hyphCount++ + } + ht.emitFromStartOfTempBuffer(hyphCount) + sublen -= hyphCount + + // now determine how many hyphens/apostrophes there are at the end of the word + runeArray := []rune(ht.tempBuffer.String()) + wordLen := sublen + hyphCount = 0 + for wordLen > 0 && strings.ContainsRune(hyphApos, runeArray[wordLen-1]) { + hyphCount++ + wordLen-- + } + + if wordLen > 0 { + // extract the word and remove it from the start of the buffer + word := string(runeArray[:wordLen]) + lw := len(word) + s := ht.tempBuffer.String() + ht.tempBuffer.Reset() + ht.tempBuffer.WriteString(s[lw:]) + + // try to rewrite this word + md := ht.attemptRewrite(ht.wordRewriters, word) + if md != nil { + // emit and/or reparse + ht.emitMarkupData(md) + if md.rescan { + ht.parse(md.all()) + } + } else { + // just output the word normally + ht.ensureSpaceOnLine(wordLen) + ht.emitString(word, ht.outputFilters, true) + } + } + + // now emit the rest of the hyphens/apostrophes + ht.emitFromStartOfTempBuffer(hyphCount) + + } else { + // emit this many characters, line-breaking where required + totalRunes := utf8.RuneCountInString(ht.tempBuffer.String()) + if sublen == totalRunes && !first && sublen <= htmlMarginSlop { + // This is intended to handle a small run of non-word characters at the end of a string (i.e. + // followed by whitespace) that should stay on the same line with its preceding word, to + // eliminate "funnies" in punctuation formatting. 
+ ht.emitString(ht.tempBuffer.String(), ht.outputFilters, true) + ht.tempBuffer.Reset() + break + } + + // This is kind of the inverse of the above check; if we have a small run of non-word + // characters at the START of a word (preceded by whitespace and followed by at least + // one word character), then ensure that we can keep that word and its prefixing non-word + // characters on the same line (again, avoiding "funnies" in formatting). + if sublen < totalRunes && first && sublen <= htmlMarginSlop { + fwLen, _ := util.WordRunLengthAfterPrefix(ht.tempBuffer.String(), sublen) + ht.ensureSpaceOnLine(sublen + fwLen) + } + ht.emitFromStartOfTempBuffer(sublen) + } + first = false + } + return false +} + +func (ht *htmlCheckerImpl) handleAsHTML() bool { + ht.triggerWBR = false + tempString := ht.tempBuffer.String() + // Figure out where the start of the command word is. + startCmd := 0 + closingTag := false + if startCmd < len(tempString) && tempString[startCmd] == '/' { + startCmd++ + closingTag = true + } + + // now figure out where it ends + endCmd := startCmd + for endCmd < len(tempString) { + if unicode.IsSpace(rune(tempString[endCmd])) { + break + } + endCmd++ + } + + if endCmd == startCmd || (endCmd-startCmd) > tagMaxLength { + // command word is empty or is too long to be an HTML tag + return false + } + possTagName := tempString[startCmd:endCmd] + tagIndex, ok := tagNameToIndex[strings.ToUpper(possTagName)] + if !ok { + // not a known HTML tag + return false + } + tag := tagIndexToObject[tagIndex] + if closingTag && !tag.allowClose { + // it's a closing tag and this tag doesn't permit the "close" form + return false + } + tagSetID := tagIndexToSetId[tagIndex] + if !ht.tagSet.Test(uint(tagSetID)) { + // the tag is not allowed - discard it, if one of the flags is set in the config + return ht.config.DiscardHTML || ht.config.DiscardRejected + } + if !ht.config.DiscardHTML && tag.balanceTags { + // this tag needs to be balanced - here's where we manipulate the 
stack + var valid bool + if closingTag { + valid = ht.tagStack.RemoveMostRecent(tag) + } else { + ht.tagStack.Push(tag) + valid = true + } + if !valid { + return false + } + } + + // Give the tag object one last chance to dictate what we do with the tag. + realTagData := tag.rewriteContents(tempString, closingTag, ht) + if realTagData == "" || ht.config.DiscardHTML { + return true + } + + // Emit the tag to the output. + ht.emitRune('<', ht.rawOutputFilters, false) + ht.emitString(realTagData, ht.rawOutputFilters, false) + ht.emitRune('>', ht.rawOutputFilters, false) + + logicalLineBreak := false + if ht.triggerWBR && !closingTag && ht.noBreakCount > 0 { + // word break is logical line break, but only within no-break tags + logicalLineBreak = true + } else { + logicalLineBreak = tag.causeLineBreak(closingTag) + } + if logicalLineBreak { + ht.columns = 0 + } + return true +} + +func (ht *htmlCheckerImpl) containsHTMLComment() bool { + return ht.tempBuffer.Len() >= 3 && strings.HasPrefix(ht.tempBuffer.String(), "!--") +} + +func (ht *htmlCheckerImpl) containsCompleteHTMLComment() bool { + if ht.tempBuffer.Len() >= 5 { + s := ht.tempBuffer.String() + return strings.HasPrefix(s, "!--") && strings.HasSuffix(s, "--") + } + return false +} + +func (ht *htmlCheckerImpl) containsXMLConstruct() bool { + tempString := ht.tempBuffer.String() + ptr := 0 + if len(tempString) > 1 && tempString[0] == '/' { + ptr++ + } + for ptr < len(tempString) { + if tempString[ptr] == ':' { + return true + } else if unicode.IsSpace(rune(tempString[ptr])) { + break + } + ptr++ + } + return false +} + +func (ht *htmlCheckerImpl) finishTag() { + if ht.containsHTMLComment() { + if !ht.containsCompleteHTMLComment() { // comment still open: this '>' belongs to its text + ht.tempBuffer.WriteByte('>') + return + } + if !ht.config.DiscardComments { // output the comment in the raw + ht.emitRune('<', ht.rawOutputFilters, false) + ht.emitString(ht.tempBuffer.String(), ht.rawOutputFilters, false) + ht.emitRune('>', ht.rawOutputFilters, false) + } 
+ ht.tempBuffer.Reset() // comment consumed (emitted or discarded): clear state and return to parsing + ht.state = stateWhitespace + return + } + if ht.handleAsHTML() { + // this was valid HTML, we're done + ht.tempBuffer.Reset() + ht.state = stateWhitespace + return + } + + // try to handle it with a tag rewriter + md := ht.attemptRewrite(ht.tagRewriters, ht.tempBuffer.String()) + if md != nil { + ht.emitBracketedMarkupData(md, '<', '>') + ht.tempBuffer.Reset() + ht.state = stateWhitespace + if md.rescan { + ht.tempBuffer.WriteByte('<') + ht.state = stateChars + ht.parse(md.all() + ">") + } + return + } + + if ht.config.DiscardXML && ht.containsXMLConstruct() { + // this tag is an XML construct, and needs to be discarded + ht.tempBuffer.Reset() + ht.state = stateWhitespace + return + } + + // This tag has been rejected! process it normally as character data + rejection := ht.tempBuffer.String() + ht.tempBuffer.Reset() + ht.tempBuffer.WriteByte('<') + ht.state = stateChars + if len(rejection) > 0 { + ht.parse(rejection) + } + ht.parse(">") +} + +func (ht *htmlCheckerImpl) finishParen() { + // Try to handle the element using a paren rewriter + md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String()) + if md != nil { + ht.emitBracketedMarkupData(md, '(', ')') + ht.tempBuffer.Reset() + ht.state = stateWhitespace + ht.parenLevel = 0 + if md.rescan { + ht.tempBuffer.WriteByte('(') + ht.state = stateChars + ht.parse(md.all() + ")") + } + return + } + + // Tag rejected! Process it normally as character data. 
+ rejection := ht.tempBuffer.String() + ht.tempBuffer.Reset() + ht.tempBuffer.WriteByte('(') + ht.state = stateChars + ht.parenLevel = 0 + if len(rejection) > 0 { + ht.parse(rejection) + } + ht.parse(")") } func (ht *htmlCheckerImpl) parse(str string) { - + i := 0 + for i < len(str) { + ch := str[i] + switch ht.state { + case stateWhitespace: + switch ch { + case ' ', '\t': // append space and tab verbatim + ht.tempBuffer.WriteByte(ch) + i++ + case '\r', '\n': // flush and go to Newline state + ht.doFlushWhitespace() + ht.state = stateNewline + ht.tempBuffer.WriteByte(ch) + i++ + case '<': + ht.doFlushWhitespace() + if ht.config.Angles { + ht.state = stateLeftAngle + } else { + // process < as ordinary character + ht.state = stateChars + ht.tempBuffer.WriteByte(ch) + } + i++ + case '(': + ht.doFlushWhitespace() + if ht.config.Parens { + ht.state = stateParen + } else { + // process ( as ordinary character) + ht.state = stateChars + ht.tempBuffer.WriteByte(ch) + } + i++ + case '\\': // backslash processing is tricky - go to Chars state to handle it + ht.doFlushWhitespace() + ht.state = stateChars + default: + ht.doFlushWhitespace() + ht.state = stateChars + ht.tempBuffer.WriteByte(ch) + i++ + } + case stateChars: + switch ch { + case ' ', '\t': // go to Whitespace state + ht.doFlushString() + ht.state = stateWhitespace + ht.tempBuffer.WriteByte(ch) + i++ + case '\r', '\n': // go to Newline state + ht.doFlushString() + ht.state = stateNewline + ht.tempBuffer.WriteByte(ch) + i++ + case '<': // may be a start of tag + if ht.config.Angles { + ht.doFlushString() + ht.state = stateLeftAngle + } else { + ht.tempBuffer.WriteByte(ch) + } + i++ + case '\\': + if i < (len(str) - 1) { + i++ + ch = str[i] + if (ch == '(' && ht.config.Parens) || (ch == '<' && ht.config.Angles) { + // append the escaped character, omitting the backslash + ht.tempBuffer.WriteByte(ch) + i++ + } else { + // append the backslash and hit the new character + ht.tempBuffer.WriteByte('\\') + } + } else { 
+ // just append the backslash normally + ht.tempBuffer.WriteByte(ch) + i++ + } + default: // just append the next character + ht.tempBuffer.WriteByte(ch) + i++ + } + case stateLeftAngle: + switch ch { + case ' ', '\t', '\r', '\n': // output <, go to Whitespace state + ht.emitRune('<', ht.outputFilters, true) + ht.state = stateWhitespace + case '<': // output < and stay in this state + ht.emitRune('<', ht.outputFilters, true) + i++ + default: + ht.state = stateTag + ht.tempBuffer.WriteByte(ch) + i++ + } + case stateTag: + switch ch { + case '>': // finish the tag - this changes the state, and possibly calls parse() recursively + ht.finishTag() + i++ + case '\'', '"': // go into "quote string" state inside the tag + ht.tempBuffer.WriteByte(ch) + ht.state = stateTagQuote + ht.quoteChar = ch + i++ + default: // just append the character + ht.tempBuffer.WriteByte(ch) + i++ + } + case stateParen: + switch ch { + case '(': + ht.tempBuffer.WriteByte(ch) + ht.parenLevel++ + i++ + case ')': + if ht.parenLevel == 0 { + ht.finishParen() + } else { + ht.tempBuffer.WriteByte(ch) + ht.parenLevel-- + } + i++ + default: + ht.tempBuffer.WriteByte(ch) + i++ + } + case stateTagQuote: + ht.tempBuffer.WriteByte(ch) + if ch == ht.quoteChar { + ht.state = stateTag + } + i++ + case stateNewline: + if ch == '\r' || ch == '\n' { + ht.tempBuffer.WriteByte(ch) + i++ + } else { + ht.doFlushNewlines() + } + } + } } func (ht *htmlCheckerImpl) Append(str string) error { if ht.finished { - return AlreadyFinished + return ErrAlreadyFinished } if !ht.started { ht.started = true @@ -231,7 +870,7 @@ func (ht *htmlCheckerImpl) Append(str string) error { func (ht *htmlCheckerImpl) Finish() error { if ht.finished { - return AlreadyFinished + return ErrAlreadyFinished } if !ht.started { ht.started = true @@ -247,7 +886,125 @@ func (ht *htmlCheckerImpl) Finish() error { case stateChars: running = ht.doFlushString() // flush the temporary buffer case stateLeftAngle: - + // just emit a left angle character 
+ ht.emitPossibleLineBreak() + ht.emitRune('<', ht.outputFilters, true) + case stateTag, stateTagQuote: + // we won't finish this tag, so it's automagically rejected + rejection := ht.tempBuffer.String() + ht.tempBuffer.Reset() + ht.tempBuffer.WriteByte('<') + ht.state = stateChars + if len(rejection) > 0 { + ht.parse(rejection) + } + running = true + case stateParen: + rejection := ht.tempBuffer.String() + ht.tempBuffer.Reset() + ht.tempBuffer.WriteByte('(') + ht.state = stateChars + ht.parenLevel = 0 + if len(rejection) > 0 { + ht.parse(rejection) + } + running = true } } + + // Now close all the HTML tags that were left open. + for !ht.tagStack.IsEmpty() { + tag, _ := ht.tagStack.Pop() + ht.outputBuffer.WriteString(tag.makeClosingTag()) + } + + ht.lines++ + ht.finished = true + return nil +} + +func (ht *htmlCheckerImpl) Reset() { + ht.started = false + ht.finished = false + ht.triggerWBR = false + ht.state = stateWhitespace + ht.quoteChar = byte(0) + ht.columns = 0 + ht.lines = 0 + ht.parenLevel = 0 + ht.outputBuffer.Reset() + for u := range ht.externalReferences { + delete(ht.externalReferences, u) + } + for k := range ht.internalReferences { + delete(ht.internalReferences, k) + } + for c := range maps.Values(ht.counters) { + c.Reset() + } +} + +func (ht *htmlCheckerImpl) Value() (string, error) { + if ht.finished { + return ht.outputBuffer.String(), nil + } + return "", ErrNotYetFinished +} + +func (ht *htmlCheckerImpl) Length() (int, error) { + if ht.finished { + return ht.outputBuffer.Len(), nil + } + return 0, ErrNotYetFinished +} + +func (ht *htmlCheckerImpl) Lines() (int, error) { + if ht.finished { + return ht.lines, nil + } + return 0, ErrNotYetFinished +} + +func (ht *htmlCheckerImpl) Counter(name string) (int, error) { + if ht.finished { + cr, ok := ht.counters[name] + if ok { + return cr.GetCount(), nil + } + return 0, nil + } + return 0, ErrNotYetFinished +} + +func (ht *htmlCheckerImpl) GetContext(name string) any { + return ht.contextData[name] 
+} + +func (ht *htmlCheckerImpl) SetContext(name string, value any) { + ht.contextData[name] = value +} + +func (ht *htmlCheckerImpl) ExternalRefs() ([]*url.URL, error) { + if ht.finished { + rc := make([]*url.URL, len(ht.externalReferences)) + p := 0 + for url := range maps.Keys(ht.externalReferences) { + rc[p] = url + p++ + } + return rc, nil + } + return nil, ErrNotYetFinished +} + +func (ht *htmlCheckerImpl) InternalRefs() ([]string, error) { + if !ht.finished { + return nil, ErrNotYetFinished + } + // collect the recorded reference names; map iteration order is unspecified, so the result is unordered + rc := make([]string, 0, len(ht.internalReferences)) + for s := range maps.Keys(ht.internalReferences) { + rc = append(rc, s) + } + return rc, nil } diff --git a/htmlcheck/checker_config.go b/htmlcheck/checker_config.go index 3d72674..310c138 100644 --- a/htmlcheck/checker_config.go +++ b/htmlcheck/checker_config.go @@ -17,22 +17,24 @@ import ( // HTMLCheckerConfig is a configuration that may be used with the HTML Checker. type HTMLCheckerConfig struct { - Name string `yaml:"name"` - WordWrap int `yaml:"wordWrap"` - Rewrap bool `yaml:"rewrap"` - Angles bool `yaml:"angles"` - Parens bool `yaml:"parens"` - DiscardHTML bool `yaml:"discardHTML"` - DiscardRejected bool `yaml:"discardRejected"` - DiscardComments bool `yaml:"discardComments"` - DiscardXML bool `yaml:"discardXML"` - OutputFilters []string `yaml:"outputFilters"` - StringRewriters []string `yaml:"stringRewriters"` - WordRewriters []string `yaml:"wordRewriters"` - TagRewriters []string `yaml:"tagRewriters"` - ParenRewriters []string `yaml:"parenRewriters"` - TagSet string `yaml:"tagSet"` - DisallowTags []string `yaml:"disallowTags"` + Name string `yaml:"name"` + WordWrap int `yaml:"wordWrap"` + Rewrap bool `yaml:"rewrap"` + Angles bool `yaml:"angles"` + Parens bool `yaml:"parens"` + DiscardHTML bool `yaml:"discardHTML"` + DiscardRejected bool `yaml:"discardRejected"` + DiscardComments bool `yaml:"discardComments"` + DiscardXML bool `yaml:"discardXML"` + OutputFilters []string `yaml:"outputFilters"` + RawOutputFilters []string 
`yaml:"rawOutputFilters"` + StringRewriters []string `yaml:"stringRewriters"` + WordRewriters []string `yaml:"wordRewriters"` + TagRewriters []string `yaml:"tagRewriters"` + ParenRewriters []string `yaml:"parenRewriters"` + TagSet string `yaml:"tagSet"` + DisallowTags []string `yaml:"disallowTags"` + AnchorTail string `yaml:"anchorTail"` } // HTMLCheckerConfigFile represents all the configs as they exist in the file. @@ -40,6 +42,8 @@ type HTMLCheckerConfigFile struct { Configs []HTMLCheckerConfig `yaml:"configs"` } +const defaultAnchorTail = "TARGET=\"Wander\"" + //go:embed configs.yaml var configData []byte @@ -55,5 +59,8 @@ func init() { } for i := range cfgdata.Configs { configsRegistry[cfgdata.Configs[i].Name] = &(cfgdata.Configs[i]) + if cfgdata.Configs[i].AnchorTail == "" { + cfgdata.Configs[i].AnchorTail = defaultAnchorTail + } } } diff --git a/htmlcheck/filter.go b/htmlcheck/filter.go index 73e5648..96af31f 100644 --- a/htmlcheck/filter.go +++ b/htmlcheck/filter.go @@ -13,8 +13,8 @@ import "strings" // outputFilter is the interface for an HTML checker output filter. type outputFilter interface { - tryOutputCharacter(strings.Builder, byte) bool - matchCharacter(byte) bool + tryOutputRune(strings.Builder, rune) bool + matchRune(rune) bool lengthNoMatch(string) int } @@ -34,7 +34,7 @@ type htmlEncodingFilter struct{} const htmlEscapedChars = "<>&" // tryOutputCharacter outputs a character that needs to be escaped. -func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bool { +func (f *htmlEncodingFilter) tryOutputRune(buf strings.Builder, ch rune) bool { switch ch { case '<': buf.WriteString("<") @@ -49,15 +49,15 @@ func (f *htmlEncodingFilter) tryOutputCharacter(buf strings.Builder, ch byte) bo } // matchCharacter returns true if this character needs to be escaped. 
-func (f *htmlEncodingFilter) matchCharacter(ch byte) bool { - return strings.IndexByte(htmlEscapedChars, ch) >= 0 +func (f *htmlEncodingFilter) matchRune(ch rune) bool { + return strings.ContainsRune(htmlEscapedChars, ch) } // lengthNoMatch returns the maximum length of unmatched characters at the start of the string. func (f *htmlEncodingFilter) lengthNoMatch(s string) int { rc := len(s) - for _, c := range []byte(htmlEscapedChars) { - tmp := strings.IndexByte(s, c) + for _, c := range htmlEscapedChars { + tmp := strings.IndexRune(s, c) if tmp >= 0 && tmp < rc { rc = tmp if rc == 0 { diff --git a/htmlcheck/rewriter.go b/htmlcheck/rewriter.go index c9c1374..3dd7353 100644 --- a/htmlcheck/rewriter.go +++ b/htmlcheck/rewriter.go @@ -26,6 +26,10 @@ type markupData struct { rescan bool } +func (md *markupData) all() string { + return md.beginMarkup + md.text + md.endMarkup +} + // rewriterServices is an interface that provides services to rewriters. type rewriterServices interface { rewriterAttrValue(string) string diff --git a/util/stack.go b/util/stack.go index 60eaf3b..d041487 100644 --- a/util/stack.go +++ b/util/stack.go @@ -11,7 +11,7 @@ package util // Stack[T] is a simple generic array-based stack implementation. -type Stack[T any] struct { +type Stack[T comparable] struct { elements []T } @@ -43,8 +43,27 @@ func (stk *Stack[T]) Peek() (T, bool) { return stk.elements[len(stk.elements)-1], true } +// RemoveMostRecent removes the topmost element equal to data and reports whether one was found. +func (stk *Stack[T]) RemoveMostRecent(data T) bool { + for i := len(stk.elements) - 1; i >= 0; i-- { + if stk.elements[i] == data { + if i == 0 { + stk.elements = stk.elements[1:] + } else if (i + 1) == len(stk.elements) { + stk.elements = stk.elements[:i] + } else { + high := stk.elements[i+1:] + stk.elements = stk.elements[:i] + stk.elements = append(stk.elements, high...) + } + return true + } + } + return false +} + // NewStack creates and returns a new stack. 
-func NewStack[T any]() *Stack[T] { +func NewStack[T comparable]() *Stack[T] { return &Stack[T]{ elements: make([]T, 0), } diff --git a/util/util.go b/util/util.go index 7d50053..14de49b 100644 --- a/util/util.go +++ b/util/util.go @@ -14,17 +14,10 @@ import ( "regexp" "strings" "unicode" + "unicode/utf8" ) -var numeric *regexp.Regexp - -func init() { - re, err := regexp.Compile("^[0-9]+$") - if err != nil { - panic(err) - } - numeric = re -} +var numeric *regexp.Regexp = regexp.MustCompile(`^[0-9]+$`) /* CapitalizeString changes the first character of the string to a capital. * Parameters: @@ -80,3 +73,52 @@ func SqlEscape(s string, wildcards bool) string { func IsNumeric(s string) bool { return numeric.MatchString(s) } + +/* RunesToBytes returns the number of bytes in a string counting the number of runes from the beginning. + * Parameters: + * s - The string to work with. + * runeCount - The number of runes to count from the start of the string. + * Returns: + * The corresponding number of bytes. + */ +func RunesToBytes(s string, runeCount int) int { + bp := 0 + for runeCount > 0 { + if bp >= len(s) { + return len(s) + } + _, c := utf8.DecodeRuneInString(s[bp:]) + bp += c + runeCount-- + } + return bp +} + +func IsRuneWord(ch rune) bool { + return unicode.IsLetter(ch) || ch == '-' || ch == '\'' +} + +func WordRunLength(s string) (int, bool) { + c1, initLen := utf8.DecodeRuneInString(s) + wordChar := IsRuneWord(c1) + rlen := 1 + for _, mch := range s[initLen:] { + if IsRuneWord(mch) != wordChar { + break + } + rlen++ + } + return rlen, wordChar +} + +func WordRunLengthAfterPrefix(s string, nrunes int) (int, bool) { + ofs := 0 + for _, ch := range s { + if nrunes == 0 { + break + } + ofs += utf8.RuneLen(ch) + nrunes-- + } + return WordRunLength(s[ofs:]) +}