diff --git a/htmlcheck/tags.go b/htmlcheck/tags.go
index 6c37524..f2a392a 100644
--- a/htmlcheck/tags.go
+++ b/htmlcheck/tags.go
@@ -14,12 +14,45 @@ import (
 	"net/url"
 	"regexp"
 	"strings"
+
+	"github.com/bits-and-blooms/bitset"
 )
 
+// Constants used to group individual HTML tags.
+const (
+	tagSetInlineFormat      = 1  // inline formatting
+	tagSetAnchor            = 2  // the <A> tag
+	tagSetBlockFormat       = 3  // block-level formatting
+	tagSetActiveContent     = 4  // active content like objects and scripts
+	tagSetImageMaps         = 5  // image map tags
+	tagSetDocFormat         = 6  // document-level formatting
+	tagSetFontFormat        = 7  // the <FONT> tag
+	tagSetForms             = 8  // form tags
+	tagSetTables            = 9  // table tags
+	tagSetChangeMarkup      = 10 // change markup (<INS> and <DEL>)
+	tagSetFrames            = 11 // frame tags
+	tagSetImages            = 12 // the <IMG> tag
+	tagSetPreformat         = 13 // the <PRE> tag and similar
+	tagSetNSCPInlineFormat  = 14 // Netscape-specific inline formatting
+	tagSetNSCPLayers        = 15 // Netscape layer tags
+	tagSetNSCPForms         = 16 // Netscape form tags
+	tagSetNSCPBlockFormat   = 17 // Netscape block-formatting tags
+	tagSetNSCPServer        = 18 // the Netscape <SERVER> tag
+	tagSetMSFTDocFormat     = 19 // Microsoft-specific document formatting
+	tagSetMSFTInlineFormat  = 20 // Microsoft-specific inline formatting
+	tagSetMSFTBlockFormat   = 21 // Microsoft-specific block formatting
+	tagSetMSFTActiveContent = 22 // Microsoft-specific active content
+	tagSetServerPage        = 23 // server-side page use
+	tagSetJavaServer        = 24 // Java server page use
+	tagSetComment           = 25 // HTML comments
+)
+
+// Functions used inside the tag to implement "overridden" behavior.
 type causeLineBreakFunc func(*tag, bool) bool
 type closingTagFunc func(*tag) string
 type rewriteContentsFunc func(*tag, string, bool, htmlCheckerBackend) string
 
+// tag is a structure describing a particular HTML tag.
 type tag struct {
 	name        string
 	index       int
@@ -31,6 +64,7 @@ type tag struct {
 	rwc         rewriteContentsFunc
 }
 
+// causeLineBreak returns true if the tag causes a line break.
 func (t *tag) causeLineBreak(isClosing bool) bool {
 	if t.clb == nil {
 		return t.lineBreak
@@ -38,6 +72,7 @@ func (t *tag) causeLineBreak(isClosing bool) bool {
 	return t.clb(t, isClosing)
 }
 
+// makeClosingTag creates a closing tag for this one.
 func (t *tag) makeClosingTag() string {
 	if t.ct == nil {
 		return ""
@@ -45,6 +80,7 @@ func (t *tag) makeClosingTag() string {
 	return t.ct(t)
 }
 
+// rewriteContents is a hook used to rewrite the contents of the tag.
 func (t *tag) rewriteContents(contents string, isClosing bool, ctxt htmlCheckerBackend) string {
 	if t.rwc == nil {
 		return contents
@@ -52,6 +88,7 @@ func (t *tag) rewriteContents(contents string, isClosing bool, ctxt htmlCheckerB
 	return t.rwc(t, contents, isClosing, ctxt)
 }
 
+// createSimpleTag creates a structure for a simple tag.
 func createSimpleTag(name string, brk bool) *tag {
 	return &tag{
 		name:        strings.ToUpper(name),
@@ -65,6 +102,7 @@ func createSimpleTag(name string, brk bool) *tag {
 	}
 }
 
+// createWBRTag creates a structure for a WBR (word break) tag.
 func createWBRTag() *tag {
 	return &tag{
 		name:        "WBR",
@@ -81,10 +119,12 @@ func createWBRTag() *tag {
 	}
 }
 
+// stdClosingTag returns the standard closing form of a tag, "</NAME>", built from tag.name.
 func stdClosingTag(tag *tag) string {
 	return fmt.Sprintf("</%s>", tag.name)
 }
 
+// createOpenCloseTag creates a tag that has a specific open and close form.
 func createOpenCloseTag(name string, brk bool) *tag {
 	return &tag{
 		name:        strings.ToUpper(name),
@@ -98,6 +138,7 @@ func createOpenCloseTag(name string, brk bool) *tag {
 	}
 }
 
+// createListElementTag creates a tag that is part of a list.
 func createListElementTag(name string) *tag {
 	return &tag{
 		name:        strings.ToUpper(name),
@@ -113,6 +154,7 @@ func createListElementTag(name string) *tag {
 	}
 }
 
+// createBalancedTag creates a tag that should have opens and closes inherently balanced.
 func createBalancedTag(name string, brk bool) *tag {
 	return &tag{
 		name:        strings.ToUpper(name),
@@ -126,6 +168,7 @@ func createBalancedTag(name string, brk bool) *tag {
 	}
 }
 
+// createNOBRTag creates a NOBR (no break) tag.
 func createNOBRTag() *tag {
 	return &tag{
 		name:        "NOBR",
@@ -146,11 +189,16 @@ func createNOBRTag() *tag {
 	}
 }
 
+// Patterns to be used in recognizing attributes in an <A> tag.
 var hrefPattern = regexp.MustCompile(`(?i:href\s*=)`)
 var targetPattern = regexp.MustCompile(`(?i:target\s*=)`)
 
+// extractAttribute extracts an attribute value from the contents of an <A> tag.
 func extractAttribute(s string) string {
 	s = strings.TrimSpace(s)
+	if len(s) == 0 {
+		return ""
+	}
 	if s[0] == '\'' || s[0] == '"' {
 		p := strings.IndexByte(s[1:], s[0])
 		if p < 0 {
@@ -161,6 +209,7 @@ func extractAttribute(s string) string {
 	return strings.Fields(s)[0]
 }
 
+// rewriteATagContents rewrites the contents of an <A> tag.
 func rewriteATagContents(t *tag, contents string, isClosing bool, ctxt htmlCheckerBackend) string {
 	if isClosing {
 		return contents // don't bother checking close tag
@@ -191,6 +240,7 @@ func rewriteATagContents(t *tag, contents string, isClosing bool, ctxt htmlCheck
 	return contents + " " + tail
 }
 
+// createATag creates an <A> tag.
 func createATag() *tag {
 	return &tag{
 		name:        "A",
@@ -203,3 +253,164 @@ func createATag() *tag {
 		rwc:         rewriteATagContents,
 	}
 }
+
+// tagNameToIndex maps a tag name (as stored in tag.name) to its index in the tables below.
+var tagNameToIndex = map[string]int{}
+
+// tagIndexToObject holds the registered tags, addressed by tag index.
+var tagIndexToObject = make([]*tag, 0, 50)
+
+// tagIndexToSetId holds, for each tag index, the ID of the tag set that tag was registered under.
+var tagIndexToSetId = make([]int, 0, 50)
+
+// tagMaxLength is the length of the longest registered tag name.
+var tagMaxLength int
+
+// tagSetNameToSet maps a tag set name used in configuration to the bit set of tag set IDs it covers.
+var tagSetNameToSet = map[string]*bitset.BitSet{}
+
+// enshrineTag registers a tag in the package-level tables, giving it the next free index and recording its tag set ID.
+func enshrineTag(tag *tag, set int) {
+	pos := len(tagIndexToObject) // slot the new tag is about to occupy
+	tagIndexToObject = append(tagIndexToObject, tag)
+	tagIndexToSetId = append(tagIndexToSetId, set)
+	tag.index = pos
+	tagNameToIndex[tag.name] = pos
+	if n := len(tag.name); n > tagMaxLength {
+		tagMaxLength = n // track the longest tag name seen so far
+	}
+}
+
+// init fills the tag repository (one enshrineTag call per known tag, so indexes follow the order below) and builds the "normal" and "restricted" tag sets.
+func init() {
+	enshrineTag(createSimpleTag("!DOCTYPE", false), tagSetDocFormat)
+	enshrineTag(createSimpleTag("%", false), tagSetServerPage)
+	enshrineTag(createSimpleTag("%=", false), tagSetServerPage)
+	enshrineTag(createSimpleTag("%@", false), tagSetServerPage)
+	enshrineTag(createATag(), tagSetAnchor)
+	enshrineTag(createBalancedTag("ABBR", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("ACRONYM", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("ADDRESS", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("APPLET", false), tagSetActiveContent)
+	enshrineTag(createSimpleTag("AREA", false), tagSetImageMaps)
+	enshrineTag(createBalancedTag("B", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("BASE", false), tagSetDocFormat)
+	enshrineTag(createSimpleTag("BASEFONT", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("BDO", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("BEAN", false), tagSetJavaServer)
+	enshrineTag(createSimpleTag("BGSOUND", false), tagSetMSFTDocFormat)
+	enshrineTag(createBalancedTag("BIG", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("BLINK", false), tagSetNSCPInlineFormat)
+	enshrineTag(createBalancedTag("BLOCKQUOTE", true), tagSetBlockFormat)
+	enshrineTag(createOpenCloseTag("BODY", false), tagSetDocFormat)
+	enshrineTag(createSimpleTag("BR", true), tagSetBlockFormat)
+	enshrineTag(createOpenCloseTag("BUTTON", false), tagSetForms)
+	enshrineTag(createBalancedTag("CAPTION", true), tagSetTables)
+	enshrineTag(createBalancedTag("CENTER", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("CITE", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("CODE", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("COL", true), tagSetTables)
+	enshrineTag(createOpenCloseTag("COLGROUP", true), tagSetTables)
+	enshrineTag(createBalancedTag("COMMENT", false), tagSetMSFTInlineFormat)
+	enshrineTag(createListElementTag("DD"), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("DEL", false), tagSetChangeMarkup)
+	enshrineTag(createBalancedTag("DFN", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("DIR", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("DIV", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("DL", true), tagSetBlockFormat)
+	enshrineTag(createListElementTag("DT"), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("EM", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("EMBED", false), tagSetActiveContent)
+	enshrineTag(createBalancedTag("FIELDSET", false), tagSetForms)
+	enshrineTag(createBalancedTag("FONT", false), tagSetFontFormat)
+	enshrineTag(createBalancedTag("FORM", false), tagSetForms)
+	enshrineTag(createSimpleTag("FRAME", true), tagSetFrames)
+	enshrineTag(createBalancedTag("FRAMESET", false), tagSetFrames)
+	enshrineTag(createBalancedTag("H1", true), tagSetFontFormat)
+	enshrineTag(createBalancedTag("H2", true), tagSetFontFormat)
+	enshrineTag(createBalancedTag("H3", true), tagSetFontFormat)
+	enshrineTag(createBalancedTag("H4", true), tagSetFontFormat)
+	enshrineTag(createBalancedTag("H5", true), tagSetFontFormat)
+	enshrineTag(createBalancedTag("H6", true), tagSetFontFormat)
+	enshrineTag(createOpenCloseTag("HEAD", false), tagSetDocFormat)
+	enshrineTag(createSimpleTag("HR", true), tagSetBlockFormat)
+	enshrineTag(createOpenCloseTag("HTML", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("I", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("IFRAME", true), tagSetFrames)
+	enshrineTag(createBalancedTag("ILAYER", true), tagSetNSCPLayers)
+	enshrineTag(createSimpleTag("IMG", false), tagSetImages)
+	enshrineTag(createSimpleTag("INPUT", false), tagSetForms)
+	enshrineTag(createBalancedTag("INS", false), tagSetChangeMarkup)
+	enshrineTag(createSimpleTag("ISINDEX", false), tagSetForms)
+	enshrineTag(createBalancedTag("KBD", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("KEYGEN", false), tagSetNSCPForms)
+	enshrineTag(createBalancedTag("LABEL", false), tagSetForms)
+	enshrineTag(createBalancedTag("LAYER", true), tagSetNSCPLayers)
+	enshrineTag(createBalancedTag("LEGEND", false), tagSetForms)
+	enshrineTag(createListElementTag("LI"), tagSetBlockFormat)
+	enshrineTag(createSimpleTag("LINK", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("LISTING", false), tagSetMSFTInlineFormat)
+	enshrineTag(createBalancedTag("MAP", false), tagSetImageMaps)
+	enshrineTag(createBalancedTag("MARQUEE", true), tagSetMSFTBlockFormat)
+	enshrineTag(createBalancedTag("MENU", true), tagSetBlockFormat)
+	enshrineTag(createSimpleTag("META", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("MULTICOL", false), tagSetNSCPBlockFormat)
+	enshrineTag(createNOBRTag(), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("NOEMBED", false), tagSetActiveContent)
+	enshrineTag(createBalancedTag("NOFRAMES", false), tagSetFrames)
+	enshrineTag(createBalancedTag("NOLAYER", false), tagSetNSCPLayers)
+	enshrineTag(createBalancedTag("NOSCRIPT", false), tagSetActiveContent)
+	enshrineTag(createBalancedTag("OBJECT", false), tagSetActiveContent)
+	enshrineTag(createBalancedTag("OL", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("OPTGROUP", false), tagSetForms)
+	enshrineTag(createListElementTag("OPTION"), tagSetForms)
+	enshrineTag(createOpenCloseTag("P", true), tagSetBlockFormat)
+	enshrineTag(createSimpleTag("PARAM", false), tagSetActiveContent)
+	enshrineTag(createSimpleTag("PLAINTEXT", false), tagSetPreformat)
+	enshrineTag(createBalancedTag("PRE", false), tagSetPreformat)
+	enshrineTag(createBalancedTag("Q", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("RT", false), tagSetMSFTActiveContent)
+	enshrineTag(createBalancedTag("RUBY", false), tagSetMSFTActiveContent)
+	enshrineTag(createBalancedTag("S", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("SAMP", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("SCRIPT", false), tagSetActiveContent)
+	enshrineTag(createBalancedTag("SELECT", false), tagSetForms)
+	enshrineTag(createBalancedTag("SERVER", false), tagSetNSCPServer)
+	enshrineTag(createBalancedTag("SERVLET", false), tagSetJavaServer)
+	enshrineTag(createBalancedTag("SMALL", false), tagSetInlineFormat)
+	enshrineTag(createSimpleTag("SPACER", false), tagSetNSCPInlineFormat)
+	enshrineTag(createBalancedTag("SPAN", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("STRIKE", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("STRONG", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("STYLE", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("SUB", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("SUP", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("TABLE", true), tagSetTables)
+	enshrineTag(createOpenCloseTag("TBODY", true), tagSetTables)
+	enshrineTag(createBalancedTag("TD", true), tagSetTables)
+	enshrineTag(createBalancedTag("TEXTAREA", true), tagSetForms)
+	enshrineTag(createOpenCloseTag("TFOOT", true), tagSetTables)
+	enshrineTag(createBalancedTag("TH", true), tagSetTables)
+	enshrineTag(createOpenCloseTag("THEAD", true), tagSetTables)
+	enshrineTag(createBalancedTag("TITLE", false), tagSetDocFormat)
+	enshrineTag(createBalancedTag("TR", true), tagSetTables)
+	enshrineTag(createBalancedTag("TT", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("U", false), tagSetInlineFormat)
+	enshrineTag(createBalancedTag("UL", true), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("VAR", false), tagSetInlineFormat)
+	enshrineTag(createWBRTag(), tagSetBlockFormat)
+	enshrineTag(createBalancedTag("XML", false), tagSetMSFTActiveContent)
+	enshrineTag(createBalancedTag("XMP", false), tagSetNSCPInlineFormat)
+
+	// Build the named tag sets: each bit that is set is the ID (a tagSet* constant) of a group included in that set.
+	bs := bitset.New(tagSetComment + 1) // length covers every ID up to tagSetComment, the highest tagSet* constant
+	bs.Set(tagSetInlineFormat)
+	bs.Set(tagSetAnchor)
+	bs.Set(tagSetBlockFormat)
+	bs.Set(tagSetFontFormat)
+	bs.Set(tagSetImages)
+	tagSetNameToSet["normal"] = bs
+	bs = bitset.New(tagSetComment + 1) // fresh bit set, so the one stored under "normal" is left untouched
+	bs.Set(tagSetInlineFormat)
+	tagSetNameToSet["restricted"] = bs
+}