Make feedscanner handle Content-Encoding: gzip.

author Adam Sampson <ats@offog.org>

Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)

committer Adam Sampson <ats@offog.org>

Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)
author Adam Sampson <ats@offog.org>
Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)
committer Adam Sampson <ats@offog.org>
Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)
diff --git a/NEWS b/NEWS

index 673063e0689f9ac2d358a9dd28b273b0cd891dd4..aa94a3986c9fb5b1c2c371be183ff63a7ca6991f 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,10 @@ Add some advice in PLUGINS about escaping template parameters. Willem
  reported that the enclosure plugin didn't do this, and having had a look
  at the others it seems to be a common problem.
  
+Make feedscanner handle "Content-Encoding: gzip" in responses, as
+tumblr.com's webservers will use this even if you explicitly refuse it
+in the request.
+
  - rawdog 2.17
  
  Add a one-paragraph description of rawdog to the README file, for use by
diff --git a/rawdoglib/feedscanner.py b/rawdoglib/feedscanner.py

index a690655a1669e2c2590ef307f15e12e4c6292a56..e32aa80b5a266b3e09e8af8c6a4f6a4214a98fd3 100644 (file)
--- a/rawdoglib/feedscanner.py
+++ b/rawdoglib/feedscanner.py
@@ -32,7 +32,9 @@ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  PERFORMANCE OF THIS SOFTWARE.
  """
  
+import cStringIO
  import feedparser
+import gzip
  import re
  import urllib2
  import urlparse
@@ -51,11 +53,22 @@ def fetch_url(url):
      """Fetch the given URL and return the data from it as a Unicode string."""
  
      request = urllib2.Request(url)
+    request.add_header("Accept-Encoding", "gzip")
  
      f = urllib2.urlopen(request)
+    headers = f.info()
      data = f.read()
      f.close()
  
+    # We have to support gzip encoding because some servers will use it
+    # even if you explicitly refuse it in Accept-Encoding.
+    encodings = headers.get("Content-Encoding", "")
+    encodings = [s.strip() for s in encodings.split(",")]
+    if "gzip" in encodings:
+        f = gzip.GzipFile(fileobj=cStringIO.StringIO(data))
+        data = f.read()
+        f.close()
+
      # Silently ignore encoding errors -- we don't need to go to the bother of
      # detecting the encoding properly (like feedparser does).
      data = data.decode("UTF-8", "ignore")
diff --git a/test-rawdog b/test-rawdog

index d460246ada9f34d0e8fd3dda0d3698470ede941c..50d477078a4a232f7ede40f776ba2f3a2e1ccc58 100644 (file)
--- a/test-rawdog
+++ b/test-rawdog
@@ -1747,6 +1747,22 @@ EOF
  rune "Adding feed" -a $httpurl/page.html
  contains "$statedir/config" $httpurl/feed.atom
  
+begin "add feed, gzip-encoded response"
+make_rss20 $httpdir/feed.rss
+make_html_head $httpdir/page.html <<EOF
+<link rel="alternate" type="application/rss+xml" title="RSS" href="$httpurl/feed.rss">
+EOF
+rune "Adding feed" -a $httpurl/gzip/page.html
+contains "$statedir/config" $httpurl/feed.rss
+
+begin "add feed, gzip-encoded feed"
+make_rss20 $httpdir/feed.rss
+make_html_head $httpdir/page.html <<EOF
+<link rel="alternate" type="application/rss+xml" title="RSS" href="$httpurl/gzip/feed.rss">
+EOF
+rune "Adding feed" -a $httpurl/page.html
+contains "$statedir/config" $httpurl/gzip/feed.rss
+
  begin "remove feed"
  add "feed 3h $httpurl/0.rss"
  add "feed 3h $httpurl/1.rss"
diff --git a/testserver.py b/testserver.py

index b7d0e6b4bb8160aac1cb72186dd2069664daf08d..2fe5a14ee3775885dd9bb6c3fa89cb569eee858c 100644 (file)
--- a/testserver.py
+++ b/testserver.py
@@ -20,6 +20,8 @@ import BaseHTTPServer
  import SimpleHTTPServer
  import SocketServer
  import base64
+import cStringIO
+import gzip
  import hashlib
  import os
  import re
@@ -108,6 +110,13 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
              self.end_headers()
              return None
  
+        encoding = None
+        m = re.match(r'^/(gzip)(/.*)$', self.path)
+        if m:
+            # Request for a content encoding.
+            encoding = m.group(1)
+            self.path = m.group(2)
+
          m = re.match(r'^/([^/]+)$', self.path)
          if m:
              # Request for a file.
@@ -142,6 +151,19 @@ class HTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
                  mime_type = "text/html"
  
              self.send_response(200)
+
+            if encoding:
+                self.send_header("Content-Encoding", encoding)
+                if encoding == "gzip":
+                    data = f.read()
+                    f.close()
+                    f = cStringIO.StringIO()
+                    g = gzip.GzipFile(fileobj=f, mode="wb")
+                    g.write(data)
+                    g.close()
+                    size = f.tell()
+                    f.seek(0)
+
              self.send_header("Content-Length", size)
              self.send_header("Content-Type", mime_type)
              self.send_header("ETag", etag)
author	Adam Sampson <ats@offog.org>
	Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)
committer	Adam Sampson <ats@offog.org>
	Sun, 4 Aug 2013 13:35:51 +0000 (13:35 +0000)
NEWS		patch | blob | history
rawdoglib/feedscanner.py		patch | blob | history
test-rawdog		patch | blob | history
testserver.py		patch | blob | history