From: Adam Sampson Date: Sat, 26 Jun 2010 09:57:11 +0000 (+0000) Subject: Work around broken DOCTYPEs that confuse sgmllib. X-Git-Tag: v2.13~3 X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=dff8fdfc5a3b38a1b7931c461e4be91ec0712382;p=rawdog%2F.git Work around broken DOCTYPEs that confuse sgmllib. --- diff --git a/NEWS b/NEWS index 565d65f..2dc275d 100644 --- a/NEWS +++ b/NEWS @@ -23,6 +23,8 @@ mxtidy_args hook is now called tidy_args. Allow template variables to start with an underscore (patch from Oberon Faelord). +Work around broken DOCTYPEs that confuse sgmllib. + - rawdog 2.12 Make rawdog work with Python 2.6 (reported by Roy Lanek). diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py index 2120752..037abec 100644 --- a/rawdoglib/rawdog.py +++ b/rawdoglib/rawdog.py @@ -110,6 +110,11 @@ def sanitise_html(html, baseurl, inline, config): # sgmllib handles "<br/>/" as a SHORTTAG; this workaround from # feedparser. html = re.sub(r'(\S)/>', r'\1 />', html) + + # sgmllib is fragile with broken processing instructions (e.g. + # "<!doctype html!>"); just remove them all. + html = re.sub(r'<![^>]*>', '', html) + html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type) p = feedparser._HTMLSanitizer("UTF-8", type) p.feed(html)