From: Adam Sampson Date: Tue, 9 Oct 2018 19:24:03 +0000 (+0100) Subject: Work with feedparser's new module structure. X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=df658c3284c02b31a0e3055f61b0e4483dd8b104;p=rawdog%2F.git Work with feedparser's new module structure. rawdog needs access to some of feedparser's internals; the locations of these have changed after the 5.2 release. --- diff --git a/NEWS b/NEWS index adf75b6..5da3af2 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,10 @@ trying to change it, override the corresponding options explicitly. The meaning of tidylib's wrap option has also changed, so set a sensible default value. +Support the current development version of feedparser (which will +presumably be the 5.3 release eventually), which has been restructured +into multiple modules. + - rawdog 2.22 When handling an HTTP 301 redirect response, check whether the new diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py index 10a5535..23568ab 100644 --- a/rawdoglib/rawdog.py +++ b/rawdoglib/rawdog.py @@ -53,15 +53,15 @@ try: except: mxtidy = None -# Turn off content-cleaning, since we want to see an approximation to the -# original content for hashing. rawdog will sanitise HTML when writing. -feedparser.RESOLVE_RELATIVE_URIS = 0 -feedparser.SANITIZE_HTML = 0 - -# Disable microformat support, because it tends to return poor-quality data -# (e.g. identifying inappropriate things as enclosures), and it relies on -# BeautifulSoup which is unable to parse many feeds. -feedparser.PARSE_MICROFORMATS = 0 +# The sanitisation code was restructured in feedparser 5.3. +try: + _resolveRelativeURIs = feedparser.urls._resolveRelativeURIs +except AttributeError: + _resolveRelativeURIs = feedparser._resolveRelativeURIs +try: + _HTMLSanitizer = feedparser.sanitizer._HTMLSanitizer +except AttributeError: + _HTMLSanitizer = feedparser._HTMLSanitizer # This is initialised in main(). 
persister = None @@ -119,8 +119,8 @@ def sanitise_html(html, baseurl, inline, config): # "<!doctype html!>"); just remove them all. html = re.sub(r'<![^>]*>', '', html) - html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type) - p = feedparser._HTMLSanitizer("UTF-8", type) + html = _resolveRelativeURIs(html, baseurl, "UTF-8", type) + p = _HTMLSanitizer("UTF-8", type) p.feed(html) html = p.output() @@ -491,12 +491,29 @@ class Feed: if not ":" in url: url = "file:" + url + parse_args = { + "etag": self.etag, + "modified": self.modified, + "agent": HTTP_AGENT, + "handlers": handlers, + } + # Turn off content-cleaning, as we need the original content + # for hashing and we'll do this ourselves afterwards. + if hasattr(feedparser, "api"): + # feedparser >= 5.3 + parse_args["sanitize_html"] = False + parse_args["resolve_relative_uris"] = False + else: + # feedparser < 5.3 + feedparser.RESOLVE_RELATIVE_URIS = 0 + feedparser.SANITIZE_HTML = 0 + # Microformat support (removed in 5.3) tends to return + # poor-quality data, and relies on BeautifulSoup which + # is unable to parse many feeds. + feedparser.PARSE_MICROFORMATS = 0 + try: - result = feedparser.parse(url, - etag=self.etag, - modified=self.modified, - agent=HTTP_AGENT, - handlers=handlers) + result = feedparser.parse(url, **parse_args) except Exception, e: result = { "rawdog_exception": e,