From 84d49507f69f9a95d012d672a358ef1af2c94c3e Mon Sep 17 00:00:00 2001 From: Adam Sampson Date: Sat, 20 Sep 2003 14:16:05 +0000 Subject: [PATCH] Convert all incoming text to UTF-8. --- NEWS | 5 +++++ rawdoglib/rawdog.py | 34 ++++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index fc80ddc..183c656 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,11 @@ Reverted the "retry immediately" behaviour from 1.2, since it causes denied or broken feeds to get checked every time rawdog is run. +Updated feedparser to 2.5.3, which now returns the XML encoding used. +rawdog uses this information to convert all incoming items into UTF-8, so +multiple encodings are now handled correctly (and the HTML output is now +encoded in UTF-8 rather than ISO-8859-1 as before). + - rawdog 1.2 Updated feedparser to 2.5.2, which fixes a bug that was making rawdog diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py index ab213a8..1136e24 100644 --- a/rawdoglib/rawdog.py +++ b/rawdoglib/rawdog.py @@ -112,25 +112,29 @@ class Feed: # and feed will be empty. In this case we return 0 so that # we know not to expire articles that came from this feed. 
+ self.encoding = p.get("encoding") + if self.encoding is None: + self.encoding = "utf-8" + channel = p["channel"] if channel.has_key("title"): - self.title = channel["title"] + self.title = self.decode(channel["title"]) if channel.has_key("link"): - self.link = channel["link"] + self.link = self.decode(channel["link"]) feed = self.url seen_items = 0 sequence = 0 for item in p["items"]: - title = item.get("title") - link = item.get("link") + title = self.decode(item.get("title")) + link = self.decode(item.get("link")) description = None if description is None and item.has_key("content"): - description = select_content(item["content"]) + description = self.decode(select_content(item["content"])) if description is None and item.has_key("content_encoded"): - description = item["content_encoded"] + description = self.decode(item["content_encoded"]) if description is None: - description = item.get("description") + description = self.decode(item.get("description")) article = Article(feed, title, link, description, now, sequence) @@ -144,6 +148,20 @@ class Feed: return seen_items + def decode(self, s): + """Convert a string retrieved from the feed to UTF-8.""" + if s is None: + return None + try: + us = s.decode(self.encoding) + return us.encode("utf-8") + except ValueError: + # Badly-encoded string (or misguessed encoding). + return s + except LookupError: + # Unknown encoding. + return s + def get_html_name(self): if self.title is not None: return self.title @@ -347,7 +365,7 @@ class Rawdog(Persistable): "http://www.w3.org/TR/html4/strict.dtd"> - """ + """ if config["userefresh"]: print >>f, """""" print >>f, """ -- 2.35.1