From c64b96d891ab46fcea522140dcb1069678a7f3c9 Mon Sep 17 00:00:00 2001
From: Adam Sampson
Date: Wed, 28 Jan 2009 12:57:10 +0000
Subject: [PATCH] Rewrite encode_references to use regexps.

This is another result of profiling: in my fairly large test config,
this takes the time spent in encode_references from 148 seconds down to
2 seconds.
---
 NEWS                |  5 +++--
 rawdoglib/rawdog.py | 14 ++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/NEWS b/NEWS
index dbb839e..a81fbe3 100644
--- a/NEWS
+++ b/NEWS
@@ -1,7 +1,8 @@
 - rawdog 2.12
 
-Cache the result of locale.getpreferredencoding(). This significantly
-speeds up writing output files.
+Cache the result of locale.getpreferredencoding(), and rewrite
+encode_references() to use regexps. This significantly speeds up writing
+output files.
 
 Update feedparser to revision 291, which fixes the handling of
 elements (reported by Darren Griffith).
 
diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py
index 3dba503..79ab28b 100644
--- a/rawdoglib/rawdog.py
+++ b/rawdoglib/rawdog.py
@@ -59,18 +59,12 @@ def format_time(secs, config):
 	format = config["timeformat"] + ", " + config["dayformat"]
 	return safe_ftime(format, t)
 
+high_char_re = re.compile(r'[^\000-\177]')
 def encode_references(s):
 	"""Encode characters in a Unicode string using HTML references."""
-	r = StringIO()
-	for c in s:
-		n = ord(c)
-		if n >= 128:
-			r.write("&#" + str(n) + ";")
-		else:
-			r.write(c)
-	v = r.getvalue()
-	r.close()
-	return v
+	def encode(m):
+		return "&#" + str(ord(m.group(0))) + ";"
+	return high_char_re.sub(encode, s)
 
 # This list of block-level elements came from the HTML 4.01 specification.
 block_level_re = re.compile(r'^\s*<(p|h1|h2|h3|h4|h5|h6|ul|ol|pre|dl|div|noscript|blockquote|form|hr|table|fieldset|address)[^a-z]', re.I)
-- 
2.35.1