From 1de77b0859746157eff3a41d64079ab46f4e4813 Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Thu, 25 Jun 2026 14:12:16 +0200 Subject: [PATCH 1/2] Fix AssertionError crashes when parsing malformed HTML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two separate assertions in the parser incorrectly assumed that certain conditions can only occur during fragment parsing (innerHTML mode), but real-world malformed markup can trigger them in a full parse: 1. ``resetInsertionMode``: when a ``<select>`` element is on the stack of open elements, the assertion fires even though ``<select>`` is valid in that position during ordinary parsing; the correct mode is "inSelect". Remove ``select`` from the guarded set so it falls through to the existing ``newModes`` lookup. For ``colgroup``, ``head``, and ``html``, replace the hard assert with a ``continue`` so the loop finds a better ancestor rather than crashing. 2. ``InTablePhase.processEOF``: malformed markup can leave ``<html>`` as the current node while the parser is in "in table" mode without being in innerHTML mode. Replace the assertion with a ``parseError`` call so the parser reports the condition and stops cleanly. Reproduces crashes reported in issue #568 (oss-fuzz / Beautiful Soup test cases): * ``b'-
'`` → AssertionError --- html5lib/html5parser.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index b3c206d1..a5d44852 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -364,10 +364,16 @@ def resetInsertionMode(self): assert self.innerHTML last = True nodeName = self.innerHTML - # Check for conditions that should only happen in the innerHTML - # case - if nodeName in ("select", "colgroup", "head", "html"): - assert self.innerHTML + # "select" may appear in the stack during normal parsing (e.g. + # inside foreign content); in that case fall through to the + # newModes lookup so we end up in "inSelect". The remaining + # names ("colgroup", "head", "html") should only be reachable + # when the fragment-parsing algorithm is in use. Malformed + # markup can land us here in a full parse, so gracefully skip + # those nodes rather than crashing with an AssertionError. + if nodeName in ("colgroup", "head", "html"): + if not self.innerHTML: + continue if not last and node.namespace != self.tree.defaultNamespace: continue @@ -1696,7 +1702,11 @@ def processEOF(self): if self.tree.openElements[-1].name != "html": self.parser.parseError("eof-in-table") else: - assert self.parser.innerHTML + # The current node is ; in normal parsing this is the + # innerHTML case, but malformed markup (e.g.
) + # can reach here too. Either way, just stop parsing. + if not self.parser.innerHTML: + self.parser.parseError("eof-in-table") # Stop parsing def processSpaceCharacters(self, token): From d9ac326245220a618fbca98fc37f7abfbee992f9 Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Thu, 25 Jun 2026 17:16:27 +0200 Subject: [PATCH 2/2] Add malformed parser crash regression tests --- html5lib/tests/test_parser2.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 6b464bea..83147342 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -91,3 +91,15 @@ def test_self_closing_col(): parser = HTMLParser() parser.parseFragment('
') assert not parser.errors + + +def test_malformed_select_in_foreign_content_does_not_crash(): + parser = HTMLParser() + assert parser.parse(b'-