From: Adam Sampson Date: Sun, 4 Aug 2013 13:35:51 +0000 (+0000) Subject: Make feedscanner handle Content-Encoding: gzip. X-Git-Tag: v2.18~2 X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=2ffa6902a80c712c7261cca6ca447efcb908b2a8;p=rawdog%2F.git Make feedscanner handle Content-Encoding: gzip. Having to do this by hand (when feedparser does too) is a pain -- it really ought to be handled by urllib2 upstream. --- diff --git a/NEWS b/NEWS index 673063e..aa94a39 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,10 @@ Add some advice in PLUGINS about escaping template parameters. Willem reported that the enclosure plugin didn't do this, and having had a look at the others it seems to be a common problem. +Make feedscanner handle "Content-Encoding: gzip" in responses, as +tumblr.com's webservers will use this even if you explicitly refuse it +in the request. + - rawdog 2.17 Add a one-paragraph description of rawdog to the README file, for use by diff --git a/rawdoglib/feedscanner.py b/rawdoglib/feedscanner.py index a690655..e32aa80 100644 --- a/rawdoglib/feedscanner.py +++ b/rawdoglib/feedscanner.py @@ -32,7 +32,9 @@ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. """ +import cStringIO import feedparser +import gzip import re import urllib2 import urlparse @@ -51,11 +53,22 @@ def fetch_url(url): """Fetch the given URL and return the data from it as a Unicode string.""" request = urllib2.Request(url) + request.add_header("Accept-Encoding", "gzip") f = urllib2.urlopen(request) + headers = f.info() data = f.read() f.close() + # We have to support gzip encoding because some servers will use it + # even if you explicitly refuse it in Accept-Encoding. + encodings = headers.get("Content-Encoding", "") + encodings = [s.strip() for s in encodings.split(",")] + if "gzip" in encodings: + f = gzip.GzipFile(fileobj=cStringIO.StringIO(data)) + data = f.read() + f.close() + # Silently ignore encoding errors -- we don't need to go to the bother of # detecting the encoding properly (like feedparser does). data = data.decode("UTF-8", "ignore") diff --git a/test-rawdog b/test-rawdog index d460246..50d4770 100644 --- a/test-rawdog +++ b/test-rawdog @@ -1747,6 +1747,22 @@ EOF rune "Adding feed" -a $httpurl/page.html contains "$statedir/config" $httpurl/feed.atom +begin "add feed, gzip-encoded response" +make_rss20 $httpdir/feed.rss +make_html_head $httpdir/page.html < +EOF +rune "Adding feed" -a $httpurl/gzip/page.html +contains "$statedir/config" $httpurl/feed.rss + +begin "add feed, gzip-encoded feed" +make_rss20 $httpdir/feed.rss +make_html_head $httpdir/page.html < +EOF +rune "Adding feed" -a $httpurl/page.html +contains "$statedir/config" $httpurl/gzip/feed.rss + begin "remove feed" add "feed 3h $httpurl/0.rss" add "feed 3h $httpurl/1.rss" diff --git a/testserver.py b/testserver.py index b7d0e6b..2fe5a14 100644 --- a/testserver.py +++ b/testserver.py @@ -20,6 +20,8 @@ import BaseHTTPServer import SimpleHTTPServer import SocketServer import base64 +import cStringIO +import gzip import hashlib import os import re @@ -108,6 +110,13 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): self.end_headers() return None + encoding = None + m = re.match(r'^/(gzip)(/.*)$', self.path) + if m: + # Request for a content encoding. + encoding = m.group(1) + self.path = m.group(2) + m = re.match(r'^/([^/]+)$', self.path) if m: # Request for a file. @@ -142,6 +151,19 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): mime_type = "text/html" self.send_response(200) + + if encoding: + self.send_header("Content-Encoding", encoding) + if encoding == "gzip": + data = f.read() + f.close() + f = cStringIO.StringIO() + g = gzip.GzipFile(fileobj=f, mode="wb") + g.write(data) + g.close() + size = f.tell() + f.seek(0) + self.send_header("Content-Length", size) self.send_header("Content-Type", mime_type) self.send_header("ETag", etag)