Have the HTML checker check for context errors in the parsing loop

This commit is contained in:
2025-12-20 23:17:07 -07:00
parent 5c8bb8dd5e
commit 813d6aa44d
+69 -31
View File
@@ -499,16 +499,16 @@ func (ht *htmlCheckerImpl) attemptRewrite(rewriters []rewriter, data string) *ma
} }
// doFlushString attempts to flush a string from the temporary buffer. // doFlushString attempts to flush a string from the temporary buffer.
func (ht *htmlCheckerImpl) doFlushString() bool { func (ht *htmlCheckerImpl) doFlushString() (bool, error) {
md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String()) md := ht.attemptRewrite(ht.stringRewriters, ht.tempBuffer.String())
if md != nil { if md != nil {
ht.emitMarkupData(md) ht.emitMarkupData(md)
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
if md.rescan { if md.rescan {
ht.parse(md.all()) err := ht.parse(md.all())
return true return true, err
} }
return false return false, nil
} }
first := true first := true
@@ -549,7 +549,10 @@ func (ht *htmlCheckerImpl) doFlushString() bool {
// emit and/or reparse // emit and/or reparse
ht.emitMarkupData(md) ht.emitMarkupData(md)
if md.rescan { if md.rescan {
ht.parse(md.all()) err := ht.parse(md.all())
if err != nil {
return false, err
}
} }
} else { } else {
// just output the word normally // just output the word normally
@@ -585,7 +588,7 @@ func (ht *htmlCheckerImpl) doFlushString() bool {
} }
first = false first = false
} }
return false return false, nil
} }
// handleAsHTML attempts to handle the contents of the tag in the temporary buffer as HTML. // handleAsHTML attempts to handle the contents of the tag in the temporary buffer as HTML.
@@ -703,7 +706,7 @@ func (ht *htmlCheckerImpl) containsXMLConstruct() bool {
} }
// finishTag processes and outputs the tag in the temporary buffer. // finishTag processes and outputs the tag in the temporary buffer.
func (ht *htmlCheckerImpl) finishTag() { func (ht *htmlCheckerImpl) finishTag() error {
if ht.containsHTMLComment() { if ht.containsHTMLComment() {
if ht.containsCompleteHTMLComment() && !ht.config.DiscardComments { if ht.containsCompleteHTMLComment() && !ht.config.DiscardComments {
// output the comment in the raw // output the comment in the raw
@@ -714,13 +717,13 @@ func (ht *htmlCheckerImpl) finishTag() {
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
} }
return return nil
} }
if ht.handleAsHTML() { if ht.handleAsHTML() {
// this was valid HTML, we're done // this was valid HTML, we're done
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
return return nil
} }
// try to handle it with a tag rewriter // try to handle it with a tag rewriter
@@ -729,19 +732,20 @@ func (ht *htmlCheckerImpl) finishTag() {
ht.emitBracketedMarkupData(md, '<', '>') ht.emitBracketedMarkupData(md, '<', '>')
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
var err error = nil
if md.rescan { if md.rescan {
ht.tempBuffer.WriteByte('<') ht.tempBuffer.WriteByte('<')
ht.state = stateChars ht.state = stateChars
ht.parse(md.all() + ">") err = ht.parse(md.all() + ">")
} }
return return err
} }
if ht.config.DiscardXML && ht.containsXMLConstruct() { if ht.config.DiscardXML && ht.containsXMLConstruct() {
// this tag is an XML construct, and needs to be discarded // this tag is an XML construct, and needs to be discarded
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
return return nil
} }
// This tag has been rejected! process it normally as character data // This tag has been rejected! process it normally as character data
@@ -749,14 +753,18 @@ func (ht *htmlCheckerImpl) finishTag() {
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.tempBuffer.WriteByte('<') ht.tempBuffer.WriteByte('<')
ht.state = stateChars ht.state = stateChars
var err error = nil
if len(rejection) > 0 { if len(rejection) > 0 {
ht.parse(rejection) err = ht.parse(rejection)
} }
ht.parse(">") if err == nil {
err = ht.parse(">")
}
return err
} }
// finishParen processes and outputs the parenthesized construct in the temporary buffer. // finishParen processes and outputs the parenthesized construct in the temporary buffer.
func (ht *htmlCheckerImpl) finishParen() { func (ht *htmlCheckerImpl) finishParen() error {
// Try to handle the element using a paren rewriter // Try to handle the element using a paren rewriter
md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String()) md := ht.attemptRewrite(ht.parenRewriters, ht.tempBuffer.String())
if md != nil { if md != nil {
@@ -764,12 +772,13 @@ func (ht *htmlCheckerImpl) finishParen() {
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.state = stateWhitespace ht.state = stateWhitespace
ht.parenLevel = 0 ht.parenLevel = 0
var err error = nil
if md.rescan { if md.rescan {
ht.tempBuffer.WriteByte('(') ht.tempBuffer.WriteByte('(')
ht.state = stateChars ht.state = stateChars
ht.parse(md.all() + ")") err = ht.parse(md.all() + ")")
} }
return return err
} }
// Tag rejected! Process it normally as character data. // Tag rejected! Process it normally as character data.
@@ -778,16 +787,24 @@ func (ht *htmlCheckerImpl) finishParen() {
ht.tempBuffer.WriteByte('(') ht.tempBuffer.WriteByte('(')
ht.state = stateChars ht.state = stateChars
ht.parenLevel = 0 ht.parenLevel = 0
var err error = nil
if len(rejection) > 0 { if len(rejection) > 0 {
ht.parse(rejection) err = ht.parse(rejection)
} }
ht.parse(")") if err == nil {
err = ht.parse(")")
}
return err
} }
// parse handles the meat of parsing an input string; it runs the state machine on the input. // parse handles the meat of parsing an input string; it runs the state machine on the input.
func (ht *htmlCheckerImpl) parse(str string) { func (ht *htmlCheckerImpl) parse(str string) error {
i := 0 i := 0
for i < len(str) { for i < len(str) {
err := ht.ctx.Err()
if err != nil {
return err
}
ch := str[i] ch := str[i]
switch ht.state { switch ht.state {
case stateWhitespace: case stateWhitespace:
@@ -832,18 +849,27 @@ func (ht *htmlCheckerImpl) parse(str string) {
case stateChars: case stateChars:
switch ch { switch ch {
case ' ', '\t': // go to Whitespace state case ' ', '\t': // go to Whitespace state
ht.doFlushString() _, err := ht.doFlushString()
if err != nil {
return err
}
ht.state = stateWhitespace ht.state = stateWhitespace
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
i++ i++
case '\r', '\n': // go to Newline state case '\r', '\n': // go to Newline state
ht.doFlushString() _, err := ht.doFlushString()
if err != nil {
return err
}
ht.state = stateNewline ht.state = stateNewline
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
i++ i++
case '<': // may be a start of tag case '<': // may be a start of tag
if ht.config.Angles { if ht.config.Angles {
ht.doFlushString() _, err := ht.doFlushString()
if err != nil {
return err
}
ht.state = stateLeftAngle ht.state = stateLeftAngle
} else { } else {
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
@@ -886,7 +912,10 @@ func (ht *htmlCheckerImpl) parse(str string) {
case stateTag: case stateTag:
switch ch { switch ch {
case '>': // finish the tag - this changes the state, and possibly calls parse() recursively case '>': // finish the tag - this changes the state, and possibly calls parse() recursively
ht.finishTag() err := ht.finishTag()
if err != nil {
return err
}
i++ i++
case '\'', '"': // go into "quote string" state inside the tag case '\'', '"': // go into "quote string" state inside the tag
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
@@ -905,7 +934,10 @@ func (ht *htmlCheckerImpl) parse(str string) {
i++ i++
case ')': case ')':
if ht.parenLevel == 0 { if ht.parenLevel == 0 {
ht.finishParen() // finish paren, changing state and recursively parsing if necessary err := ht.finishParen() // finish paren, changing state and recursively parsing if necessary
if err != nil {
return err
}
} else { } else {
// nest parentheses one LESS level deeper // nest parentheses one LESS level deeper
ht.tempBuffer.WriteByte(ch) ht.tempBuffer.WriteByte(ch)
@@ -933,6 +965,7 @@ func (ht *htmlCheckerImpl) parse(str string) {
log.Fatalf("invalid parser state: %d", ht.state) log.Fatalf("invalid parser state: %d", ht.state)
} }
} }
return nil
} }
/*---------------------------------------------------------------------------- /*----------------------------------------------------------------------------
@@ -953,10 +986,11 @@ func (ht *htmlCheckerImpl) Append(str string) error {
if !ht.started { if !ht.started {
ht.started = true ht.started = true
} }
var err error = nil
if str != "" { if str != "" {
ht.parse(str) err = ht.parse(str)
} }
return nil return err
} }
/* Finish completes the HTML checker parsing and makes the result available. /* Finish completes the HTML checker parsing and makes the result available.
@@ -975,11 +1009,12 @@ func (ht *htmlCheckerImpl) Finish() error {
running := true running := true
for running { for running {
running = false // make sure we stop unless this is set to true running = false // make sure we stop unless this is set to true
var err error = nil
switch ht.state { switch ht.state {
case stateWhitespace, stateNewline: case stateWhitespace, stateNewline:
// do nothing - discard whitespace or newlines at end // do nothing - discard whitespace or newlines at end
case stateChars: case stateChars:
running = ht.doFlushString() // flush the temporary buffer running, err = ht.doFlushString() // flush the temporary buffer
case stateLeftAngle: case stateLeftAngle:
// just emit a left angle character // just emit a left angle character
ht.emitPossibleLineBreak() ht.emitPossibleLineBreak()
@@ -991,21 +1026,24 @@ func (ht *htmlCheckerImpl) Finish() error {
ht.tempBuffer.WriteByte('<') ht.tempBuffer.WriteByte('<')
ht.state = stateChars ht.state = stateChars
if len(rejection) > 0 { if len(rejection) > 0 {
ht.parse(rejection) err = ht.parse(rejection)
} }
running = true running = true
case stateParen: case stateParen:
// we won't finish this, so it's automatically rejected // we won't finish this, so it's automagically rejected
rejection := ht.tempBuffer.String() rejection := ht.tempBuffer.String()
ht.tempBuffer.Reset() ht.tempBuffer.Reset()
ht.tempBuffer.WriteByte('(') ht.tempBuffer.WriteByte('(')
ht.state = stateChars ht.state = stateChars
ht.parenLevel = 0 ht.parenLevel = 0
if len(rejection) > 0 { if len(rejection) > 0 {
ht.parse(rejection) err = ht.parse(rejection)
} }
running = true running = true
} }
if err != nil {
return err
}
} }
// Now close all the HTML tags that were left open. // Now close all the HTML tags that were left open.