Rewrite encode_references to use regexps.
author Adam Sampson <ats@offog.org>
Wed, 28 Jan 2009 12:57:10 +0000 (12:57 +0000)
committer Adam Sampson <ats@offog.org>
Wed, 28 Jan 2009 12:57:10 +0000 (12:57 +0000)
This is another result of profiling: in my fairly large test config,
this takes the time spent in encode_references from 148 seconds down to
2 seconds.

NEWS
rawdoglib/rawdog.py

diff --git a/NEWS b/NEWS
index dbb839e5c0666efe343b1c8196e0370ecdb45eab..a81fbe31abd7b3f0fff7f90af4e00d3c5ff9f0ec 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,7 +1,8 @@
 - rawdog 2.12
 
-Cache the result of locale.getpreferredencoding(). This significantly
-speeds up writing output files.
+Cache the result of locale.getpreferredencoding(), and rewrite
+encode_references() to use regexps. This significantly speeds up writing
+output files.
 
 Update feedparser to revision 291, which fixes the handling of
 <media:title> elements (reported by Darren Griffith).
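diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 3dba503bec37f28b8a28fe843986487d139553fd..79ab28b75621bb397ed4df411b2553e69b4ba2de 100644 (file)
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py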
@@ -59,18 +59,12 @@ def format_time(secs, config):
                format = config["timeformat"] + ", " + config["dayformat"]
        return safe_ftime(format, t)
 
+high_char_re = re.compile(r'[^\000-\177]')
 def encode_references(s):
        """Encode characters in a Unicode string using HTML references."""
-       r = StringIO()
-       for c in s:
-               n = ord(c)
-               if n >= 128:
-                       r.write("&#" + str(n) + ";")
-               else:
-                       r.write(c)
-       v = r.getvalue()
-       r.close()
-       return v
+       def encode(m):
+               return "&#" + str(ord(m.group(0))) + ";"
+       return high_char_re.sub(encode, s)
 
 # This list of block-level elements came from the HTML 4.01 specification.
 block_level_re = re.compile(r'^\s*<(p|h1|h2|h3|h4|h5|h6|ul|ol|pre|dl|div|noscript|blockquote|form|hr|table|fieldset|address)[^a-z]', re.I)