From c64b96d891ab46fcea522140dcb1069678a7f3c9 Mon Sep 17 00:00:00 2001
From: Adam Sampson <ats@offog.org>
Date: Wed, 28 Jan 2009 12:57:10 +0000
Subject: [PATCH] Rewrite encode_references to use regexps.

This is another result of profiling: in my fairly large test config,
this takes the time spent in encode_references from 148 seconds down to
2 seconds.
---
 NEWS                |  5 +++--
 rawdoglib/rawdog.py | 14 ++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/NEWS b/NEWS
index dbb839e..a81fbe3 100644
--- a/NEWS
+++ b/NEWS
@@ -1,7 +1,8 @@
 - rawdog 2.12
 
-Cache the result of locale.getpreferredencoding(). This significantly
-speeds up writing output files.
+Cache the result of locale.getpreferredencoding(), and rewrite
+encode_references() to use regexps. This significantly speeds up writing
+output files.
 
 Update feedparser to revision 291, which fixes the handling of
 <media:title> elements (reported by Darren Griffith).
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 3dba503..79ab28b 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -59,18 +59,12 @@ def format_time(secs, config):
 		format = config["timeformat"] + ", " + config["dayformat"]
 	return safe_ftime(format, t)
 
+high_char_re = re.compile(r'[^\000-\177]')
 def encode_references(s):
 	"""Encode characters in a Unicode string using HTML references."""
-	r = StringIO()
-	for c in s:
-		n = ord(c)
-		if n >= 128:
-			r.write("&#" + str(n) + ";")
-		else:
-			r.write(c)
-	v = r.getvalue()
-	r.close()
-	return v
+	def encode(m):
+		return "&#" + str(ord(m.group(0))) + ";"
+	return high_char_re.sub(encode, s)
 
 # This list of block-level elements came from the HTML 4.01 specification.
 block_level_re = re.compile(r'^\s*<(p|h1|h2|h3|h4|h5|h6|ul|ol|pre|dl|div|noscript|blockquote|form|hr|table|fieldset|address)[^a-z]', re.I)
-- 
2.35.1