From e0b5eca233c2743beff1cc56b9b0c256bf68bed9 Mon Sep 17 00:00:00 2001
From: Adam Sampson
Date: Tue, 29 Jul 2003 06:35:32 +0000
Subject: [PATCH] Updated to upstream version 2.5.2 and made our modifications
 again.

---
 rawdoglib/feedparser.py | 165 ++++++++++++++++++++++++----------------
 1 file changed, 99 insertions(+), 66 deletions(-)

diff --git a/rawdoglib/feedparser.py b/rawdoglib/feedparser.py
index c315ead..8953c25 100644
--- a/rawdoglib/feedparser.py
+++ b/rawdoglib/feedparser.py
@@ -3,7 +3,7 @@
 
 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
 
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, Pie feeds
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, Pie/Atom/Echo feeds
 
 RSS 0.9x/common elements:
 - title, link, guid, description, webMaster, managingEditor, language
@@ -13,7 +13,7 @@ Additional RSS 1.0/2.0 elements:
 - dc:rights, dc:language, dc:creator, dc:date, dc:subject,
   content:encoded, admin:generatorAgent, admin:errorReportsTo,
 
-Addition Pie elements:
+Addition Pie/Atom/Echo elements:
 - subtitle, created, issued, modified, summary, id, content
 
 Things it handles that choke other parsers:
@@ -24,8 +24,8 @@ Things it handles that choke other parsers:
 - guid in item element
 - fullitem in item element
 - non-standard namespaces
-- inline XML in content (Pie)
-- multiple content items per entry (Pie)
+- inline XML in content (Pie/Atom/Echo)
+- multiple content items per entry (Pie/Atom/Echo)
 
 Requires Python 2.2 or later
 
@@ -33,11 +33,12 @@ Modified for rawdog usage by Adam Sampson:
 - increased socket timeout to 30s
 """
 
-__version__ = "2.4"
-__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
+__version__ = "2.5.2"
+__author__ = "Mark Pilgrim <f8dy@diveintomark.org>"
 __copyright__ = "Copyright 2002-3, Mark Pilgrim"
-__contributors__ = ["Jason Diamond (jason@injektilo.org)"]
-__license__ = "GPL" # see full license below
+__contributors__ = ["Jason Diamond <jason@injektilo.org>",
+                    "John Beimler"]
+__license__ = "Python"
 __history__ = """
 1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
     added Simon Fell's test suite
@@ -65,33 +66,27 @@ __history__ = """
     also, make sure we send the User-Agent even if urllib2 isn't available.
     Match any variation of backend.userland.com/rss namespace.
 2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
-2.4 - 7/9/2003 - MAP - added preliminary Pie support based on Sam Ruby's
+2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
     snapshot of July 1; changed project name
+2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+    removed unnecessary urllib code -- urllib2 should always be available anyway;
+    return actual url, status, and full HTTP headers (as result['url'],
+    result['status'], and result['headers']) if parsing a remote feed over HTTP --
+    this should pass all the HTTP tests;
+    added the latest namespace-of-the-week for RSS 2.0
+2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+    User-Agent (otherwise urllib2 sends two, which confuses some servers)
+2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+    inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
 """
 
-# Copyright (C) 2003 Mark Pilgrim and Jason Diamond
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
 try:
     import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
     timeoutsocket.setDefaultSocketTimeout(30)
 except ImportError:
     pass
-import cgi, re, sgmllib, string, StringIO, urllib, gzip
+import cgi, re, sgmllib, string, StringIO, gzip, urllib2
 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 
 USER_AGENT = "UltraLiberalFeedParser/%s +http://diveintomark.org/projects/feed_parser/" % __version__
 
@@ -107,6 +102,7 @@ def decodeEntities(data):
 
 class FeedParser(sgmllib.SGMLParser):
     namespaces = {"http://backend.userland.com/rss": "",
+                  "http://blogs.law.harvard.edu/tech/rss": "",
                   "http://purl.org/rss/1.0/": "",
                   "http://example.com/newformat#": "",
                   "http://example.com/necho": "",
@@ -117,7 +113,8 @@ class FeedParser(sgmllib.SGMLParser):
                   "http://purl.org/rss/1.0/modules/company/": "co",
                   "http://purl.org/rss/1.0/modules/syndication/": "sy",
                   "http://purl.org/dc/elements/1.1/": "dc",
-                  "http://webns.net/mvcb/": "admin"}
+                  "http://webns.net/mvcb/": "admin",
+                  "http://www.w3.org/1999/xhtml": "xhtml"}
 
     def reset(self):
         self.channel = {}
@@ -349,6 +346,23 @@ class FeedParser(sgmllib.SGMLParser):
         self.contentmode = None
         self.contenttype = None
         self.contentlang = None
+
+    def start_body(self, attrs):
+        self.incontent = 1
+        self.contentmode = 'xml'
+        self.contenttype = 'application/xhtml+xml'
+        xmllang = [v for k, v in attrs if k=='xml:lang']
+        if xmllang:
+            self.contentlang = xmllang[0]
+        self.push('content', 1)
+
+    start_div = start_body
+    start_xhtml_body = start_body
+    start_xhtml_div = start_body
+    end_body = end_content
+    end_div = end_content
+    end_xhtml_body = end_content
+    end_xhtml_div = end_content
 
     def unknown_starttag(self, tag, attrs):
         if self.incontent and self.contentmode == 'xml':
@@ -393,18 +407,26 @@ def handle_charref(self, ref):
         # called for each character reference, e.g. for "&#160;", ref will be "160"
         # Reconstruct the original character reference.
         if not self.elementstack: return
-        self.elementstack[-1][2].append("&#%(ref)s;" % locals())
+        text = "&#%s;" % ref
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
+        self.elementstack[-1][2].append(text)
 
     def handle_entityref(self, ref):
         # called for each entity reference, e.g. for "&copy;", ref will be "copy"
         # Reconstruct the original entity reference.
         if not self.elementstack: return
-        self.elementstack[-1][2].append("&%(ref)s;" % locals())
+        text = "&%s;" % ref
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
+        self.elementstack[-1][2].append(text)
 
     def handle_data(self, text):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         if not self.elementstack: return
+        if self.incontent and self.contentmode == 'xml':
+            text = cgi.escape(text)
         self.elementstack[-1][2].append(text)
 
     def handle_comment(self, text):
@@ -447,6 +469,29 @@ def parse_declaration(self, i):
                 return k+3
         return sgmllib.SGMLParser.parse_declaration(self, i)
 
+class FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
+    def http_error_default(self, req, fp, code, msg, headers):
+        if ((code / 100) == 3) and (code != 304):
+            return self.http_error_302(req, fp, code, msg, headers)
+        from urllib import addinfourl
+        infourl = addinfourl(fp, headers, req.get_full_url())
+        infourl.status = code
+        return infourl
+#        raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+    def http_error_302(self, req, fp, code, msg, headers):
+        infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
+        infourl.status = code
+        return infourl
+
+    def http_error_301(self, req, fp, code, msg, headers):
+        infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
+        infourl.status = code
+        return infourl
+
+    http_error_300 = http_error_302
+    http_error_307 = http_error_302
+
 def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
     """
     URI, filename, or string --> stream
@@ -470,10 +515,6 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
 
     If the referrer argument is supplied, it will be used as the value of a
     Referer[sic] request header.
-
-    The optional arguments are only used if the source argument is an HTTP
-    URL and the urllib2 module is importable (i.e., you must be using Python
-    version 2.0 or higher).
     """
 
     if hasattr(source, "read"):
@@ -486,37 +527,23 @@ def open_resource(source, etag=None, modified=None, agent=None, referrer=None):
         agent = USER_AGENT
 
     # try to open with urllib2 (to use optional headers)
-    try:
-        import urllib2
-        request = urllib2.Request(source)
-        if etag:
-            request.add_header("If-None-Match", etag)
-        if modified:
-            request.add_header("If-Modified-Since", format_http_date(modified))
-        request.add_header("User-Agent", agent)
-        if referrer:
-            # http://www.dictionary.com/search?q=referer
-            request.add_header("Referer", referrer)
+    request = urllib2.Request(source)
+    if etag:
+        request.add_header("If-None-Match", etag)
+    if modified:
+        request.add_header("If-Modified-Since", format_http_date(modified))
+    request.add_header("User-Agent", agent)
+    if referrer:
+        request.add_header("Referer", referrer)
     request.add_header("Accept-encoding", "gzip")
-        try:
-            return urllib2.urlopen(request)
-        except urllib2.HTTPError:
-            # either the resource is not modified or some other HTTP
-            # error occurred so return an empty resource
-            return StringIO.StringIO("")
-        except:
-            # source must not be a valid URL but it might be a valid filename
-            pass
-    except ImportError:
-        # urllib2 isn't available so try to open with urllib
-        o = urllib.FancyURLopener()
-        o.addheaders = [('User-agent', o.version + ", " + agent)]
-        try:
-            return o.open(source)
-        except:
-            # source still might be a filename
-            pass
-
+    opener = urllib2.build_opener(FeedURLHandler())
+    opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
+    try:
+        return opener.open(request)
+    except:
+        # source is not a valid URL, but it might be a valid filename
+        pass
+
     # try to open with native open function (if source is a filename)
     try:
         return open(source)
@@ -632,6 +659,14 @@ def parse(uri, etag=None, modified=None, agent=None, referrer=None):
     newModified = get_modified(f)
     if newModified: result["modified"] = newModified
     elif modified: result["modified"] = modified
+    if hasattr(f, "url"):
+        result["url"] = f.url
+    if hasattr(f, "headers"):
+        result["headers"] = f.headers.dict
+    if hasattr(f, "status"):
+        result["status"] = f.status
+    elif hasattr(f, "url"):
+        result["status"] = 200
     f.close()
     return result
 
@@ -655,9 +690,7 @@ if __name__ == '__main__':
         print url
         print
         result = parse(url)
-        pprint(result['channel'])
-        if result['items']:
-            pprint(result['items'])
+        pprint(result)
         print
 """
 
-- 
2.35.1
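
A minimal usage sketch (not part of the patch itself) of the new HTTP metadata
that the 2.5 upstream changes add. The field names result['url'],
result['status'], and result['headers'] come straight from the parse() hunk
above; the feed URL is a placeholder and the surrounding logic is illustrative
only. Python 2, like the module itself:

    from rawdoglib import feedparser

    result = feedparser.parse("http://example.com/index.rss")

    # FeedURLHandler records the real HTTP status on the response object,
    # even when urllib2 follows a 301/302 redirect internally.
    if result.get("status") == 301:
        # Permanent redirect: result["url"] is the final location, so a
        # caller could update its stored feed URL here.
        print "Feed moved permanently to", result["url"]

    # result["headers"] is f.headers.dict, a plain dict of response headers
    # (keys are lowercased by the underlying mimetools Message).
    print "Status:", result.get("status")
    print "Server:", result.get("headers", {}).get("server")
    for item in result.get("items", []):
        print item.get("title")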
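
Similarly, a sketch of a conditional refetch using the modified argument,
which open_resource() turns into an If-Modified-Since header. It assumes the
caller kept result['modified'] from an earlier poll, and relies on
http_error_default() above passing a 304 through as result['status'] instead
of raising:

    from rawdoglib import feedparser

    FEED = "http://example.com/index.rss"   # placeholder feed URL

    first = feedparser.parse(FEED)
    last_modified = first.get("modified")   # parsed date, fed back in below

    second = feedparser.parse(FEED, modified=last_modified)
    if second.get("status") == 304:
        # Server says nothing changed; keep using the items from `first`.
        print "Not modified since last poll."
    else:
        print "Got %d items." % len(second.get("items", []))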