From: Adam Sampson <ats@offog.org>
Date: Tue, 9 Oct 2018 19:24:03 +0000 (+0100)
Subject: Work with feedparser's new module structure.
X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=df658c3284c02b31a0e3055f61b0e4483dd8b104;p=rawdog%2F.git

Work with feedparser's new module structure.

rawdog needs access to some of feedparser's internals (the relative-URI
resolver and the HTML sanitiser); these moved into separate submodules
after the 5.2 release.
---

diff --git a/NEWS b/NEWS
index adf75b6..5da3af2 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,10 @@ trying to change it, override the corresponding options explicitly.
 The meaning of tidylib's wrap option has also changed, so set a sensible
 default value.
 
+Support the current development version of feedparser (which will
+presumably be the 5.3 release eventually), which has been restructured
+into multiple modules.
+
 - rawdog 2.22
 
 When handling an HTTP 301 redirect response, check whether the new
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 10a5535..23568ab 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -53,15 +53,15 @@ try:
 except:
 	mxtidy = None
 
-# Turn off content-cleaning, since we want to see an approximation to the
-# original content for hashing. rawdog will sanitise HTML when writing.
-feedparser.RESOLVE_RELATIVE_URIS = 0
-feedparser.SANITIZE_HTML = 0
-
-# Disable microformat support, because it tends to return poor-quality data
-# (e.g. identifying inappropriate things as enclosures), and it relies on
-# BeautifulSoup which is unable to parse many feeds.
-feedparser.PARSE_MICROFORMATS = 0
+# The sanitisation code was restructured in feedparser 5.3.
+try:
+	_resolveRelativeURIs = feedparser.urls._resolveRelativeURIs
+except AttributeError:
+	_resolveRelativeURIs = feedparser._resolveRelativeURIs
+try:
+	_HTMLSanitizer = feedparser.sanitizer._HTMLSanitizer
+except AttributeError:
+	_HTMLSanitizer = feedparser._HTMLSanitizer
 
 # This is initialised in main().
 persister = None
@@ -119,8 +119,8 @@ def sanitise_html(html, baseurl, inline, config):
 	# "<!doctype html!>"); just remove them all.
 	html = re.sub(r'<![^>]*>', '', html)
 
-	html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type)
-	p = feedparser._HTMLSanitizer("UTF-8", type)
+	html = _resolveRelativeURIs(html, baseurl, "UTF-8", type)
+	p = _HTMLSanitizer("UTF-8", type)
 	p.feed(html)
 	html = p.output()
 
@@ -491,12 +491,29 @@ class Feed:
 		if not ":" in url:
 			url = "file:" + url
 
+		parse_args = {
+			"etag": self.etag,
+			"modified": self.modified,
+			"agent": HTTP_AGENT,
+			"handlers": handlers,
+			}
+		# Turn off content-cleaning, as we need the original content
+		# for hashing; rawdog sanitises the HTML itself when writing.
+		if hasattr(feedparser, "api"):
+			# feedparser >= 5.3
+			parse_args["sanitize_html"] = False
+			parse_args["resolve_relative_uris"] = False
+		else:
+			# feedparser < 5.3
+			feedparser.RESOLVE_RELATIVE_URIS = 0
+			feedparser.SANITIZE_HTML = 0
+			# Microformat support (removed in 5.3) tends to return
+			# poor-quality data, and relies on BeautifulSoup which
+			# is unable to parse many feeds.
+			feedparser.PARSE_MICROFORMATS = 0
+
 		try:
-			result = feedparser.parse(url,
-				etag=self.etag,
-				modified=self.modified,
-				agent=HTTP_AGENT,
-				handlers=handlers)
+			result = feedparser.parse(url, **parse_args)
 		except Exception, e:
 			result = {
 				"rawdog_exception": e,