landed the tag repository

This commit is contained in:
2025-10-31 22:33:56 -06:00
parent d69715557e
commit 8a2185e912
+211
View File
@@ -14,12 +14,45 @@ import (
"net/url"
"regexp"
"strings"
"github.com/bits-and-blooms/bitset"
)
// Identifiers for the groups ("tag sets") that individual HTML tags are
// assigned to. The values are sequential, starting at 1 and ending at
// tagSetComment (25); they are used as bit positions when tag sets are built.
const (
	tagSetInlineFormat      = iota + 1 // inline formatting
	tagSetAnchor                       // the <A> tag
	tagSetBlockFormat                  // block-level formatting
	tagSetActiveContent                // active content like objects and scripts
	tagSetImageMaps                    // image map tags
	tagSetDocFormat                    // document-level formatting
	tagSetFontFormat                   // the <FONT> tag
	tagSetForms                        // form tags
	tagSetTables                       // table tags
	tagSetChangeMarkup                 // change markup (<DEL> and <INS>)
	tagSetFrames                       // frame tags
	tagSetImages                       // the <IMG> tag
	tagSetPreformat                    // the <PRE> tag and similar
	tagSetNSCPInlineFormat             // Netscape-specific inline formatting
	tagSetNSCPLayers                   // Netscape layer tags
	tagSetNSCPForms                    // Netscape form tags
	tagSetNSCPBlockFormat              // Netscape block-formatting tags
	tagSetNSCPServer                   // the Netscape <SERVER> tag
	tagSetMSFTDocFormat                // Microsoft-specific document formatting
	tagSetMSFTInlineFormat             // Microsoft-specific inline formatting
	tagSetMSFTBlockFormat              // Microsoft-specific block formatting
	tagSetMSFTActiveContent            // Microsoft-specific active content
	tagSetServerPage                   // server-side page use
	tagSetJavaServer                   // Java server page use
	tagSetComment                      // HTML comments
)
// Hook signatures a tag can carry to replace ("override") its default behavior.
type (
	// causeLineBreakFunc reports whether a tag causes a line break. It
	// receives the tag and a boolean flag.
	// NOTE(review): the flag is presumably "is closing tag" - confirm against
	// the tag struct's field declarations.
	causeLineBreakFunc func(*tag, bool) bool

	// closingTagFunc produces a string for the given tag.
	// NOTE(review): presumably the closing-tag text - confirm against the
	// tag struct's field declarations.
	closingTagFunc func(*tag) string

	// rewriteContentsFunc returns the (possibly rewritten) contents of a tag,
	// given the tag, its current contents, whether it is a closing tag, and
	// the checker backend context.
	rewriteContentsFunc func(*tag, string, bool, htmlCheckerBackend) string
)
// tag is a structure describing a particular HTML tag.
type tag struct {
name string
index int
@@ -31,6 +64,7 @@ type tag struct {
rwc rewriteContentsFunc
}
// causeLineBreak returns true if the tag causes a line break.
func (t *tag) causeLineBreak(isClosing bool) bool {
if t.clb == nil {
return t.lineBreak
@@ -38,6 +72,7 @@ func (t *tag) causeLineBreak(isClosing bool) bool {
return t.clb(t, isClosing)
}
// makeClosingTag creates a closing tag for this one.
func (t *tag) makeClosingTag() string {
if t.ct == nil {
return ""
@@ -45,6 +80,7 @@ func (t *tag) makeClosingTag() string {
return t.ct(t)
}
// rewriteContents is a hook used to rewrite the contents of the tag.
func (t *tag) rewriteContents(contents string, isClosing bool, ctxt htmlCheckerBackend) string {
if t.rwc == nil {
return contents
@@ -52,6 +88,7 @@ func (t *tag) rewriteContents(contents string, isClosing bool, ctxt htmlCheckerB
return t.rwc(t, contents, isClosing, ctxt)
}
// createSimpleTag creates a structure for a simple tag.
func createSimpleTag(name string, brk bool) *tag {
return &tag{
name: strings.ToUpper(name),
@@ -65,6 +102,7 @@ func createSimpleTag(name string, brk bool) *tag {
}
}
// createWBRTag creates a structure for a WBR (word break) tag.
func createWBRTag() *tag {
return &tag{
name: "WBR",
@@ -81,10 +119,12 @@ func createWBRTag() *tag {
}
}
// stdClosingTag builds the standard closing form of a tag: the tag's name
// wrapped in "</" and ">".
func stdClosingTag(tag *tag) string {
	// All operands are strings, so Sprint joins them without inserting spaces.
	return fmt.Sprint("</", tag.name, ">")
}
// createOpenCloseTag creates a tag that has a specific open and close form.
func createOpenCloseTag(name string, brk bool) *tag {
return &tag{
name: strings.ToUpper(name),
@@ -98,6 +138,7 @@ func createOpenCloseTag(name string, brk bool) *tag {
}
}
// createListElementTag creates a tag that is part of a list.
func createListElementTag(name string) *tag {
return &tag{
name: strings.ToUpper(name),
@@ -113,6 +154,7 @@ func createListElementTag(name string) *tag {
}
}
// createBalancedTag creates a tag that should have opens and closes inherently balanced.
func createBalancedTag(name string, brk bool) *tag {
return &tag{
name: strings.ToUpper(name),
@@ -126,6 +168,7 @@ func createBalancedTag(name string, brk bool) *tag {
}
}
// createNOBRTag creates a NOBR (no break) tag.
func createNOBRTag() *tag {
return &tag{
name: "NOBR",
@@ -146,11 +189,16 @@ func createNOBRTag() *tag {
}
}
// Case-insensitive patterns that locate the start of an attribute assignment
// ("name", optional whitespace, then "=") inside the contents of an <A> tag.
var (
	// hrefPattern finds "href=".
	hrefPattern = regexp.MustCompile(`(?i:href\s*=)`)
	// targetPattern finds "target=".
	targetPattern = regexp.MustCompile(`(?i:target\s*=)`)
)
// extractAttribute extracts an attribute value from the contents of an <A> tag.
func extractAttribute(s string) string {
s = strings.TrimSpace(s)
if len(s) == 0 {
return ""
}
if s[0] == '\'' || s[0] == '"' {
p := strings.IndexByte(s[1:], s[0])
if p < 0 {
@@ -161,6 +209,7 @@ func extractAttribute(s string) string {
return strings.Fields(s)[0]
}
// rewriteATagContents rewrites the contents of an <A> tag.
func rewriteATagContents(t *tag, contents string, isClosing bool, ctxt htmlCheckerBackend) string {
if isClosing {
return contents // don't bother checking close tag
@@ -191,6 +240,7 @@ func rewriteATagContents(t *tag, contents string, isClosing bool, ctxt htmlCheck
return contents + " " + tail
}
// createATag creates an <A> tag.
func createATag() *tag {
return &tag{
name: "A",
@@ -203,3 +253,164 @@ func createATag() *tag {
rwc: rewriteATagContents,
}
}
// Package-level storage for the tag repository. These start out empty and are
// filled in by enshrineTag and init.
var (
	// tagNameToIndex maps a tag name to its index in the slices below.
	tagNameToIndex = map[string]int{}

	// tagIndexToObject holds the tag objects themselves, by index.
	tagIndexToObject = make([]*tag, 0, 50)

	// tagIndexToSetId holds, for each tag index, the ID of the set the tag
	// belongs to.
	tagIndexToSetId = make([]int, 0, 50)

	// tagMaxLength is the length of the longest registered tag name.
	tagMaxLength int

	// tagSetNameToSet maps the set names used in configuration to the bit
	// sets of tag-set IDs they stand for.
	tagSetNameToSet = map[string]*bitset.BitSet{}
)
// enshrineTag registers a tag in the repository: it assigns the tag the next
// free index, records its name and set ID, and updates the longest-name
// tracker.
func enshrineTag(tag *tag, set int) {
	// The next free index is the slot the tag is about to occupy.
	tag.index = len(tagIndexToObject)
	tagNameToIndex[tag.name] = tag.index

	// Both slices grow together so they stay index-aligned.
	tagIndexToObject = append(tagIndexToObject, tag)
	tagIndexToSetId = append(tagIndexToSetId, set)

	if n := len(tag.name); n > tagMaxLength {
		tagMaxLength = n
	}
}
// init actually sets up the tag repository.
func init() {
enshrineTag(createSimpleTag("!DOCTYPE", false), tagSetDocFormat)
enshrineTag(createSimpleTag("%", false), tagSetServerPage)
enshrineTag(createSimpleTag("%=", false), tagSetServerPage)
enshrineTag(createSimpleTag("%@", false), tagSetServerPage)
enshrineTag(createATag(), tagSetAnchor)
enshrineTag(createBalancedTag("ABBR", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("ACRONYM", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("ADDRESS", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("APPLET", false), tagSetActiveContent)
enshrineTag(createSimpleTag("AREA", false), tagSetImageMaps)
enshrineTag(createBalancedTag("B", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("BASE", false), tagSetDocFormat)
enshrineTag(createSimpleTag("BASEFONT", false), tagSetDocFormat)
enshrineTag(createBalancedTag("BDO", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("BEAN", false), tagSetJavaServer)
enshrineTag(createSimpleTag("BGSOUND", false), tagSetMSFTDocFormat)
enshrineTag(createBalancedTag("BIG", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("BLINK", false), tagSetNSCPInlineFormat)
enshrineTag(createBalancedTag("BLOCKQUOTE", true), tagSetBlockFormat)
enshrineTag(createOpenCloseTag("BODY", false), tagSetDocFormat)
enshrineTag(createSimpleTag("BR", true), tagSetBlockFormat)
enshrineTag(createOpenCloseTag("BUTTON", false), tagSetForms)
enshrineTag(createBalancedTag("CAPTION", true), tagSetTables)
enshrineTag(createBalancedTag("CENTER", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("CITE", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("CODE", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("COL", true), tagSetTables)
enshrineTag(createOpenCloseTag("COLGROUP", true), tagSetTables)
enshrineTag(createBalancedTag("COMMENT", false), tagSetMSFTInlineFormat)
enshrineTag(createListElementTag("DD"), tagSetBlockFormat)
enshrineTag(createBalancedTag("DEL", false), tagSetChangeMarkup)
enshrineTag(createBalancedTag("DFN", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("DIR", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("DIV", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("DL", true), tagSetBlockFormat)
enshrineTag(createListElementTag("DT"), tagSetBlockFormat)
enshrineTag(createBalancedTag("EM", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("EMBED", false), tagSetActiveContent)
enshrineTag(createBalancedTag("FIELDSET", false), tagSetForms)
enshrineTag(createBalancedTag("FONT", false), tagSetFontFormat)
enshrineTag(createBalancedTag("FORM", false), tagSetForms)
enshrineTag(createSimpleTag("FRAME", true), tagSetFrames)
enshrineTag(createBalancedTag("FRAMESET", false), tagSetFrames)
enshrineTag(createBalancedTag("H1", true), tagSetFontFormat)
enshrineTag(createBalancedTag("H2", true), tagSetFontFormat)
enshrineTag(createBalancedTag("H3", true), tagSetFontFormat)
enshrineTag(createBalancedTag("H4", true), tagSetFontFormat)
enshrineTag(createBalancedTag("H5", true), tagSetFontFormat)
enshrineTag(createBalancedTag("H6", true), tagSetFontFormat)
enshrineTag(createOpenCloseTag("HEAD", false), tagSetDocFormat)
enshrineTag(createSimpleTag("HR", true), tagSetBlockFormat)
enshrineTag(createOpenCloseTag("HTML", false), tagSetDocFormat)
enshrineTag(createBalancedTag("I", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("IFRAME", true), tagSetFrames)
enshrineTag(createBalancedTag("ILAYER", true), tagSetNSCPLayers)
enshrineTag(createSimpleTag("IMG", false), tagSetImages)
enshrineTag(createSimpleTag("INPUT", false), tagSetForms)
enshrineTag(createBalancedTag("INS", false), tagSetChangeMarkup)
enshrineTag(createSimpleTag("ISINDEX", false), tagSetForms)
enshrineTag(createBalancedTag("KBD", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("KEYGEN", false), tagSetNSCPForms)
enshrineTag(createBalancedTag("LABEL", false), tagSetForms)
enshrineTag(createBalancedTag("LAYER", true), tagSetNSCPLayers)
enshrineTag(createBalancedTag("LEGEND", false), tagSetForms)
enshrineTag(createListElementTag("LI"), tagSetBlockFormat)
enshrineTag(createSimpleTag("LINK", false), tagSetDocFormat)
enshrineTag(createBalancedTag("LISTING", false), tagSetMSFTInlineFormat)
enshrineTag(createBalancedTag("MAP", false), tagSetImageMaps)
enshrineTag(createBalancedTag("MARQUEE", true), tagSetMSFTBlockFormat)
enshrineTag(createBalancedTag("MENU", true), tagSetBlockFormat)
enshrineTag(createSimpleTag("META", false), tagSetDocFormat)
enshrineTag(createBalancedTag("MULTICOL", false), tagSetNSCPBlockFormat)
enshrineTag(createNOBRTag(), tagSetBlockFormat)
enshrineTag(createBalancedTag("NOEMBED", false), tagSetActiveContent)
enshrineTag(createBalancedTag("NOFRAMES", false), tagSetFrames)
enshrineTag(createBalancedTag("NOLAYER", false), tagSetNSCPLayers)
enshrineTag(createBalancedTag("NOSCRIPT", false), tagSetActiveContent)
enshrineTag(createBalancedTag("OBJECT", false), tagSetActiveContent)
enshrineTag(createBalancedTag("OL", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("OPTGROUP", false), tagSetForms)
enshrineTag(createListElementTag("OPTION"), tagSetForms)
enshrineTag(createOpenCloseTag("P", true), tagSetBlockFormat)
enshrineTag(createSimpleTag("PARAM", false), tagSetActiveContent)
enshrineTag(createSimpleTag("PLAINTEXT", false), tagSetPreformat)
enshrineTag(createBalancedTag("PRE", false), tagSetPreformat)
enshrineTag(createBalancedTag("Q", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("RT", false), tagSetMSFTActiveContent)
enshrineTag(createBalancedTag("RUBY", false), tagSetMSFTActiveContent)
enshrineTag(createBalancedTag("S", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("SAMP", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("SCRIPT", false), tagSetActiveContent)
enshrineTag(createBalancedTag("SELECT", false), tagSetForms)
enshrineTag(createBalancedTag("SERVER", false), tagSetNSCPServer)
enshrineTag(createBalancedTag("SERVLET", false), tagSetJavaServer)
enshrineTag(createBalancedTag("SMALL", false), tagSetInlineFormat)
enshrineTag(createSimpleTag("SPACER", false), tagSetNSCPInlineFormat)
enshrineTag(createBalancedTag("SPAN", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("STRIKE", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("STRONG", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("STYLE", false), tagSetDocFormat)
enshrineTag(createBalancedTag("SUB", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("SUP", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("TABLE", true), tagSetTables)
enshrineTag(createOpenCloseTag("TBODY", true), tagSetTables)
enshrineTag(createBalancedTag("TD", true), tagSetTables)
enshrineTag(createBalancedTag("TEXTAREA", true), tagSetForms)
enshrineTag(createOpenCloseTag("TFOOT", true), tagSetTables)
enshrineTag(createBalancedTag("TH", true), tagSetTables)
enshrineTag(createOpenCloseTag("THEAD", true), tagSetTables)
enshrineTag(createBalancedTag("TITLE", false), tagSetDocFormat)
enshrineTag(createBalancedTag("TR", true), tagSetTables)
enshrineTag(createBalancedTag("TT", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("U", false), tagSetInlineFormat)
enshrineTag(createBalancedTag("UL", true), tagSetBlockFormat)
enshrineTag(createBalancedTag("VAR", false), tagSetInlineFormat)
enshrineTag(createWBRTag(), tagSetBlockFormat)
enshrineTag(createBalancedTag("XML", false), tagSetMSFTActiveContent)
enshrineTag(createBalancedTag("XMP", false), tagSetNSCPInlineFormat)
// Create the tag sets.
bs := bitset.New(tagSetComment + 1)
bs.Set(tagSetInlineFormat)
bs.Set(tagSetAnchor)
bs.Set(tagSetBlockFormat)
bs.Set(tagSetFontFormat)
bs.Set(tagSetImages)
tagSetNameToSet["normal"] = bs
bs = bitset.New(tagSetComment + 1)
bs.Set(tagSetInlineFormat)
tagSetNameToSet["restricted"] = bs
}