From 1de77b0859746157eff3a41d64079ab46f4e4813 Mon Sep 17 00:00:00 2001
From: Vincent Gao <gaobing1230@gmail.com>
Date: Thu, 25 Jun 2026 14:12:16 +0200
Subject: [PATCH 1/2] Fix AssertionError crashes when parsing malformed HTML
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two separate assertions in the parser incorrectly assumed that certain
conditions can only occur during fragment parsing (innerHTML mode), but
real-world malformed markup can trigger them in a full parse:

1. ``resetInsertionMode``: when a ``<select>`` element is on the
   open-elements stack inside foreign content (e.g. inside ``<math>``),
   a subsequent ``resetInsertionMode`` call finds the element name in
   the ("select", "colgroup", "head", "html") guard and raises
   ``AssertionError``.  Per the WHATWG spec, ``<select>`` is valid in
   that position during ordinary parsing; the correct mode is
   "inSelect".  Remove ``select`` from the guarded set so it falls
   through to the existing ``newModes`` lookup.  For ``colgroup``,
   ``head``, and ``html``, replace the hard assert with a ``continue``
   (taken only outside innerHTML mode) so the loop moves on to the next
   ancestor rather than crashing.

2. ``InTablePhase.processEOF``: malformed markup such as
   ``<table><svg><html>`` can leave ``<html>`` as the current node
   while the parser is in "in table" mode without being in innerHTML
   mode.  Replace the assertion with an "eof-in-table" ``parseError``
   call (made only outside innerHTML mode) so the parser reports the
   condition and stops cleanly.

Fixes the crashes reported in issue #568 (oss-fuzz / Beautiful Soup
test cases), which these inputs reproduce:
  * ``b'-<math><sElect><mi><sElect><sElect>'``  → AssertionError
  * ``b'\xc3\xb1<table><svg><html>'``           → AssertionError
---
 html5lib/html5parser.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b3c206d1..a5d44852 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -364,10 +364,16 @@ def resetInsertionMode(self):
                 assert self.innerHTML
                 last = True
                 nodeName = self.innerHTML
-            # Check for conditions that should only happen in the innerHTML
-            # case
-            if nodeName in ("select", "colgroup", "head", "html"):
-                assert self.innerHTML
+            # "select" may appear in the stack during normal parsing (e.g.
+            # inside foreign content); in that case fall through to the
+            # newModes lookup so we end up in "inSelect".  The remaining
+            # names ("colgroup", "head", "html") should only be reachable
+            # when the fragment-parsing algorithm is in use.  Malformed
+            # markup can land us here in a full parse, so gracefully skip
+            # those nodes rather than crashing with an AssertionError.
+            if nodeName in ("colgroup", "head", "html"):
+                if not self.innerHTML:
+                    continue
 
             if not last and node.namespace != self.tree.defaultNamespace:
                 continue
@@ -1696,7 +1702,11 @@ def processEOF(self):
         if self.tree.openElements[-1].name != "html":
             self.parser.parseError("eof-in-table")
         else:
-            assert self.parser.innerHTML
+            # The current node is <html>.  Normally that only happens when
+            # parsing a fragment (innerHTML), but malformed markup such as
+            # <table><svg><html> gets here in a full parse; report it then.
+            if not self.parser.innerHTML:
+                self.parser.parseError("eof-in-table")
         # Stop parsing
 
     def processSpaceCharacters(self, token):

From d9ac326245220a618fbca98fc37f7abfbee992f9 Mon Sep 17 00:00:00 2001
From: Vincent Gao <gaobing1230@gmail.com>
Date: Thu, 25 Jun 2026 17:16:27 +0200
Subject: [PATCH 2/2] Add malformed parser crash regression tests

---
 html5lib/tests/test_parser2.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index 6b464bea..83147342 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -91,3 +91,15 @@ def test_self_closing_col():
     parser = HTMLParser()
     parser.parseFragment('<table><colgroup><col /></colgroup></table>')
     assert not parser.errors
+
+
+def test_malformed_select_in_foreign_content_does_not_crash():
+    parser = HTMLParser()
+    assert parser.parse(b'-<math><sElect><mi><sElect><sElect>') is not None
+    assert parser.errors
+
+
+def test_malformed_table_with_html_in_foreign_content_does_not_crash():
+    parser = HTMLParser()
+    assert parser.parse(b'\xc3\xb1<table><svg><html>') is not None
+    assert parser.errors