Work with feedparser's new module structure.
author     Adam Sampson <ats@offog.org>
           Tue, 9 Oct 2018 19:24:03 +0000 (20:24 +0100)
committer  Adam Sampson <ats@offog.org>
           Tue, 9 Oct 2018 19:24:03 +0000 (20:24 +0100)
rawdog needs access to some of feedparser's internals; the locations of
these have changed after the 5.2 release.
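
(For illustration, a minimal standalone sketch of the fallback pattern this
commit uses, assuming only the submodule names that appear in the patch
below, feedparser.urls and feedparser.sanitizer; these names are private
helpers, so this is a best-effort shim rather than a supported API:)

    import feedparser

    try:
        # Post-5.2 development tree: the helpers moved into submodules.
        # On feedparser <= 5.2 the "urls" attribute does not exist, so
        # this lookup raises AttributeError and we fall back below.
        _resolveRelativeURIs = feedparser.urls._resolveRelativeURIs
        _HTMLSanitizer = feedparser.sanitizer._HTMLSanitizer
    except AttributeError:
        # feedparser 5.2 and earlier: top-level module attributes.
        _resolveRelativeURIs = feedparser._resolveRelativeURIs
        _HTMLSanitizer = feedparser._HTMLSanitizer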

NEWS
rawdoglib/rawdog.py

diff --git a/NEWS b/NEWS
index adf75b65eda6c833d28eae8eda89fcd37915fc4d..5da3af2efad19a7fc7ef6860cc8e8b0c37531b33 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,10 @@ trying to change it, override the corresponding options explicitly.
 The meaning of tidylib's wrap option has also changed, so set a sensible
 default value.
 
+Support the current development version of feedparser (which will
+presumably be the 5.3 release eventually), which has been restructured
+into multiple modules.
+
 - rawdog 2.22
 
 When handling an HTTP 301 redirect response, check whether the new
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 10a55350c3852e2afae971a23d58f4b45375d8fc..23568ab21c032ca74ab35001596e2238a2bc5e62 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -53,15 +53,15 @@ try:
 except:
        mxtidy = None
 
-# Turn off content-cleaning, since we want to see an approximation to the
-# original content for hashing. rawdog will sanitise HTML when writing.
-feedparser.RESOLVE_RELATIVE_URIS = 0
-feedparser.SANITIZE_HTML = 0
-
-# Disable microformat support, because it tends to return poor-quality data
-# (e.g. identifying inappropriate things as enclosures), and it relies on
-# BeautifulSoup which is unable to parse many feeds.
-feedparser.PARSE_MICROFORMATS = 0
+# The sanitisation code was restructured in feedparser 5.3.
+try:
+       _resolveRelativeURIs = feedparser.urls._resolveRelativeURIs
+except AttributeError:
+       _resolveRelativeURIs = feedparser._resolveRelativeURIs
+try:
+       _HTMLSanitizer = feedparser.sanitizer._HTMLSanitizer
+except AttributeError:
+       _HTMLSanitizer = feedparser._HTMLSanitizer
 
 # This is initialised in main().
 persister = None
@@ -119,8 +119,8 @@ def sanitise_html(html, baseurl, inline, config):
        # "<!doctype html!>"); just remove them all.
        html = re.sub(r'<![^>]*>', '', html)
 
-       html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type)
-       p = feedparser._HTMLSanitizer("UTF-8", type)
+       html = _resolveRelativeURIs(html, baseurl, "UTF-8", type)
+       p = _HTMLSanitizer("UTF-8", type)
        p.feed(html)
        html = p.output()
 
@@ -491,12 +491,29 @@ class Feed:
                if not ":" in url:
                        url = "file:" + url
 
+               parse_args = {
+                       "etag": self.etag,
+                       "modified": self.modified,
+                       "agent": HTTP_AGENT,
+                       "handlers": handlers,
+                       }
+               # Turn off content-cleaning, as we need the original content
+               # for hashing and we'll do this ourselves afterwards.
+               if hasattr(feedparser, "api"):
+                       # feedparser >= 5.3
+                       parse_args["sanitize_html"] = False
+                       parse_args["resolve_relative_uris"] = False
+               else:
+                       # feedparser < 5.3
+                       feedparser.RESOLVE_RELATIVE_URIS = 0
+                       feedparser.SANITIZE_HTML = 0
+                       # Microformat support (removed in 5.3) tends to return
+                       # poor-quality data, and relies on BeautifulSoup which
+                       # is unable to parse many feeds.
+                       feedparser.PARSE_MICROFORMATS = 0
+
                try:
-                       result = feedparser.parse(url,
-                               etag=self.etag,
-                               modified=self.modified,
-                               agent=HTTP_AGENT,
-                               handlers=handlers)
+                       result = feedparser.parse(url, **parse_args)
                except Exception, e:
                        result = {
                                "rawdog_exception": e,