From: Adam Sampson Date: Tue, 22 Dec 2009 15:55:17 +0000 (+0000) Subject: Use PyTidyLib rather than mx.Tidy when available. X-Git-Tag: v2.13~5 X-Git-Url: http://git.ozo.com/?a=commitdiff_plain;h=635be250a552c599c3428975b2441a5cac6095a7;p=rawdog%2F.git Use PyTidyLib rather than mx.Tidy when available. --- diff --git a/NEWS b/NEWS index 9662593..123cee7 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,11 @@ Add the "showtracebacks" option, which causes exceptions that occur while a feed is being fetched to be reported with a traceback in the resulting error message. +Use PyTidyLib in preference to mx.Tidy when available (suggested by +Joseph Reagle). If neither is available, "tidyhtml true" just does +nothing, so it's now turned on in the provided config file. The +mxtidy_args hook is now called tidy_args. + - rawdog 2.12 Make rawdog work with Python 2.6 (reported by Roy Lanek). diff --git a/PLUGINS b/PLUGINS index 694f0b3..033923b 100644 --- a/PLUGINS +++ b/PLUGINS @@ -247,17 +247,17 @@ normal expansion process); you can thus use this hook either for manipulating template parameters, or for replacing the template system entirely. -### mxtidy_args(config, args, baseurl, inline) +### tidy_args(config, args, baseurl, inline) -* args: a dictionary of keyword arguments for mx.Tidy.tidy +* args: a dictionary of keyword arguments for Tidy * baseurl: the URL at which the HTML was originally found * inline: a boolean indicating whether the output should be inline HTML or a block element When HTML is being sanitised by rawdog and the "tidyhtml" option is -enabled, this hook will be called just before mx.Tidy.tidy is run. It -can be used to add or modify mx.Tidy options; for example, to make it -produce XHTML output. +enabled, this hook will be called just before Tidy is run (either via +PyTidyLib or via mx.Tidy). It can be used to add or modify Tidy options; +for example, to make it produce XHTML output. 
### clean_html(config, html, baseurl, inline) diff --git a/config b/config index f111057..bd9fb6b 100644 --- a/config +++ b/config @@ -182,9 +182,9 @@ blocklevelhtml true # Whether to attempt to turn feed-provided HTML into valid HTML. # The most common problem that this solves is a non-closed element in an # article causing formatting problems for the rest of the page. -# If this option is turned on, you must have the mx.Tidy Python module +# For this option to have any effect, you need to have PyTidyLib or mx.Tidy # installed. -tidyhtml false +tidyhtml true # Whether the articles displayed should be sorted first by the date # provided in the feed (useful for "planet" pages, where you're diff --git a/rawdoglib/rawdog.py b/rawdoglib/rawdog.py index 221b365..7c9c220 100644 --- a/rawdoglib/rawdog.py +++ b/rawdoglib/rawdog.py @@ -37,6 +37,16 @@ except: hashlib = None import sha +try: + import tidylib +except: + tidylib = None + +try: + import mx.Tidy as mxtidy +except: + mxtidy = None + try: import feedfinder except: @@ -115,11 +125,22 @@ def sanitise_html(html, baseurl, inline, config): html = "

" + html if config["tidyhtml"]: - import mx.Tidy - args = { "wrap": 0, "numeric_entities": 1 } + args = {"numeric_entities": 1, + "output_html": 1, + "output_xhtml": 0, + "output_xml": 0, + "wrap": 0} plugins.call_hook("mxtidy_args", config, args, baseurl, inline) - output = mx.Tidy.tidy(html, None, None, - **args)[2] + plugins.call_hook("tidy_args", config, args, baseurl, inline) + if tidylib is not None: + # Disable PyTidyLib's somewhat unhelpful defaults. + tidylib.BASE_OPTIONS = {} + output = tidylib.tidy_document(html, args)[0] + elif mxtidy is not None: + output = mxtidy.tidy(html, None, None, **args)[2] + else: + # No Tidy bindings installed -- do nothing. + output = "<body>" + html + "</body>" html = output[output.find("<body>") + 6 : output.rfind("</body>")].strip()