Work around broken DOCTYPEs that confuse sgmllib.
author Adam Sampson <ats@offog.org>
Sat, 26 Jun 2010 09:57:11 +0000 (09:57 +0000)
committer Adam Sampson <ats@offog.org>
Sat, 26 Jun 2010 09:57:11 +0000 (09:57 +0000)
NEWS
rawdoglib/rawdog.py

diff --git a/NEWS b/NEWS
index 565d65f1339e9d997263f592c128bb86d4332de8..2dc275dd6e44427381d1f96424654871a90927b2 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,8 @@ mxtidy_args hook is now called tidy_args.
 Allow template variables to start with an underscore (patch from Oberon
 Faelord).
 
+Work around broken DOCTYPEs that confuse sgmllib.
+
 - rawdog 2.12
 
 Make rawdog work with Python 2.6 (reported by Roy Lanek).
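diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 212075246e5833c9b809cf1055fd3c6f0c247b7f..037abec3dd4c7de3387976d7e123a6aac0cca740 100644 (file)
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py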
@@ -110,6 +110,11 @@ def sanitise_html(html, baseurl, inline, config):
        # sgmllib handles "<br/>/" as a SHORTTAG; this workaround from
        # feedparser.
        html = re.sub(r'(\S)/>', r'\1 />', html)
+
+       # sgmllib is fragile with broken processing instructions (e.g.
+       # "<!doctype html!>"); just remove them all.
+       html = re.sub(r'<![^>]*>', '', html)
+
        html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type)
        p = feedparser._HTMLSanitizer("UTF-8", type)
        p.feed(html)