From: Adam Sampson <ats@offog.org>
Date: Sun, 4 Aug 2013 13:35:51 +0000 (+0000)
Subject: Make feedscanner handle Content-Encoding: gzip.
X-Git-Tag: v2.18~2
X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=2ffa6902a80c712c7261cca6ca447efcb908b2a8;p=rawdog%2F.git

Make feedscanner handle Content-Encoding: gzip.

Having to do this by hand (as feedparser also has to) is a pain --
it really ought to be handled by urllib2 upstream.
---

diff --git a/NEWS b/NEWS
index 673063e..aa94a39 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,10 @@ Add some advice in PLUGINS about escaping template parameters. Willem
 reported that the enclosure plugin didn't do this, and having had a look
 at the others it seems to be a common problem.
 
+Make feedscanner handle "Content-Encoding: gzip" in responses, as
+tumblr.com's webservers will use this even if you explicitly refuse it
+in the request.
+
 - rawdog 2.17
 
 Add a one-paragraph description of rawdog to the README file, for use by
diff --git a/rawdoglib/feedscanner.py b/rawdoglib/feedscanner.py
index a690655..e32aa80 100644
--- a/rawdoglib/feedscanner.py
+++ b/rawdoglib/feedscanner.py
@@ -32,7 +32,9 @@ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 PERFORMANCE OF THIS SOFTWARE.
 """
 
+import cStringIO
 import feedparser
+import gzip
 import re
 import urllib2
 import urlparse
@@ -51,11 +53,22 @@ def fetch_url(url):
     """Fetch the given URL and return the data from it as a Unicode string."""
 
     request = urllib2.Request(url)
+    request.add_header("Accept-Encoding", "gzip")
 
     f = urllib2.urlopen(request)
+    headers = f.info()
     data = f.read()
     f.close()
 
+    # We have to support gzip encoding because some servers will use it
+    # even if you explicitly refuse it in Accept-Encoding.
+    encodings = headers.get("Content-Encoding", "")
+    encodings = [s.strip() for s in encodings.split(",")]
+    if "gzip" in encodings:
+        f = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
+        data = f.read()
+        f.close()
+
     # Silently ignore encoding errors -- we don't need to go to the bother of
     # detecting the encoding properly (like feedparser does).
     data = data.decode("UTF-8", "ignore")
diff --git a/test-rawdog b/test-rawdog
index d460246..50d4770 100644
--- a/test-rawdog
+++ b/test-rawdog
@@ -1747,6 +1747,22 @@ EOF
 rune "Adding feed" -a $httpurl/page.html
 contains "$statedir/config" $httpurl/feed.atom
 
+begin "add feed, gzip-encoded response"
+make_rss20 $httpdir/feed.rss
+make_html_head $httpdir/page.html <<EOF
+<link rel="alternate" type="application/rss+xml" title="RSS" href="$httpurl/feed.rss">
+EOF
+rune "Adding feed" -a $httpurl/gzip/page.html
+contains "$statedir/config" $httpurl/feed.rss
+
+begin "add feed, gzip-encoded feed"
+make_rss20 $httpdir/feed.rss
+make_html_head $httpdir/page.html <<EOF
+<link rel="alternate" type="application/rss+xml" title="RSS" href="$httpurl/gzip/feed.rss">
+EOF
+rune "Adding feed" -a $httpurl/page.html
+contains "$statedir/config" $httpurl/gzip/feed.rss
+
 begin "remove feed"
 add "feed 3h $httpurl/0.rss"
 add "feed 3h $httpurl/1.rss"
diff --git a/testserver.py b/testserver.py
index b7d0e6b..2fe5a14 100644
--- a/testserver.py
+++ b/testserver.py
@@ -20,6 +20,8 @@ import BaseHTTPServer
 import SimpleHTTPServer
 import SocketServer
 import base64
+import cStringIO
+import gzip
 import hashlib
 import os
 import re
@@ -108,6 +110,13 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
             self.end_headers()
             return None
 
+        encoding = None
+        m = re.match(r'^/(gzip)(/.*)$', self.path)
+        if m:
+            # Request for a content encoding.
+            encoding = m.group(1)
+            self.path = m.group(2)
+
         m = re.match(r'^/([^/]+)$', self.path)
         if m:
             # Request for a file.
@@ -142,6 +151,19 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
                 mime_type = "text/html"
 
             self.send_response(200)
+
+            if encoding:
+                self.send_header("Content-Encoding", encoding)
+                if encoding == "gzip":
+                    data = f.read()
+                    f.close()
+                    f = cStringIO.StringIO()
+                    g = gzip.GzipFile(fileobj=f, mode="wb")
+                    g.write(data)
+                    g.close()
+                    size = f.tell()
+                    f.seek(0)
+
             self.send_header("Content-Length", size)
             self.send_header("Content-Type", mime_type)
             self.send_header("ETag", etag)