#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert 'dumb' plaintext sequences to their unicode counterparts.

It is not always convenient to make full use of the unicode character set
when text is created. The most common example of this is the use of multiple
hyphens to denote en- and em-dashes, multiple full-stops to denote ellipses,
and " and ' to denote curled quotation marks. BetterType provides a simple
and flexible framework for converting such text into more robust unicode.

The module includes a wide variety of predefined translations, and provides
a wrapper for applying translations to HTML as well as plain text.

Use as a module:

    import bettertype
    x = bettertype.better_text("That girl---she's really hot!")

    # Can work with HTML fragments or whole pages:
    x = bettertype.better_html('<p>"Oh well..." she sighed.</p>')

    # Only translate ellipses and dashes:
    s = bettertype.better_text("Uh...", ['ellipses', 'dashes'])
"""

__url__ = "http://v.cx/2008/BetterType"
__author__ = "Rob Shearer"
__copyright__ = """
Copyright (c) 2008 Rob Shearer. All right reserved.

Redistribution and use of this software, in source and binary forms, with
or without modification, are permitted provided that the following
conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the author nor the names of its contributors may be
  used to endorse or promote products derived from this software without
  specific prior written permission.

This software might contain bugs. There might be bugs that stop it from
doing what it was designed to do. There might be bugs that make it do
unexpected things. Potentially catastrophic things. Anyone who uses this
software does so at their own risk; the authors take no responsibility for
the results.
"""
__version__ = "0.1.1"
__version_info__ = (0, 1, 1)
__changelog__ = {
    (0, 1, 0) : {
        'released' : (2008, 8, 16),
        'comment' : "Initial release",
    },
    (0, 1, 1) : {
        'released' : (2008, 12, 18),
        'fixed' : ["Removed fraction slash conversion"],
    }
}

import sys
import optparse
from os import path
import re
from collections import defaultdict, deque
try:
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:
    # Python 3 moved both modules into the `html` package.
    from html.parser import HTMLParser
    from html import entities as htmlentitydefs
import codecs

try:
    unichr
except NameError:
    # Python 3: chr() already covers the whole unicode range.
    unichr = chr


class Rule(object):
    """Represents a rule to translate one text sequence to another.

    Rules are only applicable within a restricted 'context' defined by
    lookahead and lookbehind regular expressions.
    """

    def __init__(self, to_replace, lookbehind_pattern, lookahead_pattern,
                 replacement):
        """Compile the context patterns for one replacement.

        `to_replace` is the literal text to be replaced (must be non-empty),
        `lookbehind_pattern` is a regular expression which must match the
        text immediately before it, `lookahead_pattern` one which must match
        the text immediately after it, and `replacement` is the text
        substituted when both match.
        """
        assert to_replace
        self.to_replace = to_replace
        # BetterType hands the lookbehind a context string terminated by a
        # sentinel "\n"; the trailing `$\n$` anchors the pattern so that it
        # has to match directly in front of that sentinel.
        self.lookbehind = re.compile(
            r'(?:.|\n)*(?:' + lookbehind_pattern + r')$\n$', re.UNICODE
        )
        self.lookahead = re.compile(lookahead_pattern, re.UNICODE)
        self.replacement = replacement


class BetterType(object):
    """Translates an entire text string according to a collection of rules."""

    def __init__(self, rules):
        """Index `rules` (an iterable of `Rule`) for use by `__call__`."""
        rules = list(rules)
        # Rules are bucketed by the first character they replace, keeping
        # their relative order so that the first matching rule wins.
        self.rules_for_char = defaultdict(list)
        for r in rules:
            self.rules_for_char[r.to_replace[0]].append(r)
        if rules:
            self.scanner = re.compile(
                '|'.join([re.escape(r.to_replace) for r in rules]),
                re.UNICODE
            )
            self.max_replacement_chars = max(
                [len(r.replacement) for r in rules]
            )
        else:
            # An empty pattern would match at every position (including the
            # end of the string), so an empty rule list means "no scanning".
            self.scanner = None
            self.max_replacement_chars = 0
        self.pre_chars = 50
        self.post_chars = 50

    def __call__(self, string, prefix='', suffix=''):
        """Return a translated version of `string`.

        The `prefix` and `suffix` strings are not translated, but translation
        is performed as though `string` were present in the context
        `prefix + string + suffix`.
        """
        if self.scanner is None:
            return string
        follow_chars = self.post_chars + self.max_replacement_chars
        pos = 0
        output = []
        for match in self.scanner.finditer(string):
            match_pos = match.start()
            if match_pos < pos:
                # This candidate lies inside text that was already replaced.
                continue
            output.append(string[pos:match_pos])
            pos = match_pos

            # Lookbehind context: the already-translated output, newest
            # piece first, terminated by the "\n" sentinel that
            # Rule.lookbehind expects.
            pre_context = "\n"
            for piece in reversed(output):
                pre_context = piece + pre_context
                if len(pre_context) >= self.pre_chars + 1:
                    break
            if len(pre_context) < self.pre_chars:
                # Not enough output yet; borrow the tail of the prefix.
                pre_context = (
                    prefix[len(pre_context) - self.pre_chars:] + pre_context
                )

            # Lookahead context: untranslated input, topped up from the
            # suffix when the string itself runs out.
            post_context = string[pos:pos + follow_chars]
            if len(post_context) < follow_chars:
                post_context += suffix[:follow_chars - len(post_context)]

            for r in self.rules_for_char[string[pos]]:
                if (string.startswith(r.to_replace, pos)
                        and r.lookbehind.match(pre_context)
                        and r.lookahead.match(
                            post_context[len(r.to_replace):])):
                    output.append(r.replacement)
                    pos += len(r.to_replace)
                    break
            # done with work at this position; move on
        output.append(string[pos:])
        return ''.join(output)


# The default rulesets roughly correspond with SmartyPants functionality.
default_rulesets = ('escapes', 'dashes', 'ellipses',
                    'educate quotes', 'symbols')


def _collect_rules(ruleset_names):
    """Return the rules of the named rulesets as one list, in order."""
    return [rule for name in ruleset_names for rule in rulesets[name]]


def better_text(text, ruleset_names=default_rulesets):
    """Convert `text` in accordance with the given rulesets.

    Valid ruleset names are keys from the `rulesets` dictionary. Note that
    the order in which rulesets are named is significant: a given character
    can only be converted by a single rule, so the first rule to match it
    wins.
    """
    bt = BetterType(_collect_rules(ruleset_names))
    return bt(text)


# HTML interface:

def must_html_escape(c):
    """Return True for the characters that can only appear escaped in HTML."""
    return c in ('&', '<')


def often_html_escaped(c):
    """Return True for the characters that are commonly escaped in HTML."""
    return c in ('&', '<', '"')


class BetterHtml(HTMLParser):
    """An HTML parser which transforms plaintext runs using a conversion
    function.

    Markup is copied to the output unchanged; text runs are passed through
    `convert` (with surrounding text as context) and then `encode`. Feed the
    document with `feed()` and collect the result with `get_output()`.
    """

    class _ConversionBuffer(object):
        """Queue of (text, noise) pairs awaiting conversion.

        `noise` is None for text which still has to be converted, otherwise
        it is the literal output for that item and `text` is only the item's
        contribution to the context of neighbouring text. Text is held back
        until enough following context has been seen.
        """

        def __init__(self, convert, encode):
            self.buf = deque()
            self.prefix = ''
            self.output = []
            self.convert = convert
            self.encode = encode
            # BetterType exposes `pre_chars`; `prefix_chars` is still
            # honoured for converters written against the older name.
            self.pre_chars = getattr(
                convert, "pre_chars", getattr(convert, "prefix_chars", 50)
            )
            self.post_chars = getattr(convert, "post_chars", 50)
            assert self.post_chars > 0

        def update_prefix(self, text):
            """Append `text` to the prefix, keeping the last `pre_chars`."""
            if len(text) >= self.pre_chars:
                self.prefix = text[-self.pre_chars:]
            else:
                self.prefix = self.prefix[len(text) - self.pre_chars:] + text

        def __call__(self, in_text, in_noise):
            """Queue one item, flushing items whose context is complete."""
            # Length of the queued text that follows the head item.
            suffix_length = sum([len(x[0]) for x in self.buf])
            if self.buf:
                suffix_length -= len(self.buf[0][0])
            while self.buf and suffix_length + len(in_text) > self.post_chars:
                # The head of the queue is always unconverted text.
                (text, noise) = self.buf.popleft()
                assert noise is None
                suffix = (''.join([x[0] for x in self.buf])
                          + in_text[:self.post_chars - suffix_length])
                assert len(suffix) <= self.post_chars
                text = self.convert(text, self.prefix, suffix)
                noise = self.encode(text)
                # Emit the converted text and any literal items behind it,
                # stopping at the next piece of unconverted text.
                while True:
                    assert noise is not None
                    self.output.append(noise)
                    self.update_prefix(text)
                    if self.buf:
                        suffix_length -= len(self.buf[0][0])
                    if not self.buf or self.buf[0][1] is None:
                        break
                    (text, noise) = self.buf.popleft()
            if in_noise is None or self.buf:
                self.buf.append((in_text, in_noise))
            else:
                # Nothing is waiting for context; emit the literal directly.
                self.output.append(in_noise)
                self.update_prefix(in_text)

        def done(self):
            """Flush everything still queued and return the whole output."""
            while self.buf:
                (text, noise) = self.buf.popleft()
                if noise is None:
                    suffix = ''.join([x[0] for x in self.buf])
                    converted = self.convert(text, self.prefix, suffix)
                    noise = self.encode(converted)
                else:
                    converted = text
                self.output.append(noise)
                self.update_prefix(converted)
            return ''.join(self.output)

    block_tags = ('p', 'div', 'blockquote', 'pre', 'table', 'dl', 'ol', 'ul',
                  'li', 'iframe', 'hr', 'br', 'h1', 'h2', 'h3', 'h4', 'h5',
                  'h6')
    nondisplayed_tags = ('script', 'style')
    dont_process_tags = ('script', 'style', 'pre', 'code', 'kbd', 'math')

    def __init__(self, convert, encode, process_escaped=must_html_escape):
        """Create a parser.

        `convert(text, prefix, suffix)` translates a text run, `encode(text)`
        turns the translated text back into HTML, and `process_escaped(char)`
        says whether a character written as an entity or character reference
        may be translated.
        """
        self.process_escaped = (
            lambda x: process_escaped(x) and not self.suppress_processing
        )
        self.suppress_processing = 0
        self.nondisplayed = 0
        self.buf = self._ConversionBuffer(convert, encode)
        HTMLParser.__init__(self)
        # Python 3 decodes references inside handle_data by default; keep
        # them flowing through handle_charref/handle_entityref instead.
        # (The attribute is unused, and harmless, on Python 2.)
        self.convert_charrefs = False

    def do_process(self, string):
        """Queue `string` for conversion unless processing is suppressed."""
        if self.suppress_processing:
            self.dont_process(string, string)
        else:
            self.buf(string, None)

    def dont_process(self, output, as_text):
        """Queue literal `output`; `as_text` is what it contributes to the
        context of neighbouring text (nothing inside non-displayed tags)."""
        self.buf('' if self.nondisplayed else as_text, output)

    def get_output(self):
        """Return the converted document."""
        return self.buf.done()

    def handle_starttag(self, tag, attrs):
        if tag in self.dont_process_tags:
            self.suppress_processing += 1
        if tag in self.nondisplayed_tags:
            self.nondisplayed += 1
        self.dont_process(self.get_starttag_text(),
                          '\n' if tag in self.block_tags else '')

    def handle_startendtag(self, tag, attrs):
        self.dont_process(self.get_starttag_text(),
                          '\n' if tag in self.block_tags else '')

    def handle_endtag(self, tag):
        # Never let a stray closing tag push a counter below zero: a
        # negative count is truthy and would disable conversion for the
        # rest of the document.
        if tag in self.dont_process_tags and self.suppress_processing > 0:
            self.suppress_processing -= 1
        if tag in self.nondisplayed_tags and self.nondisplayed > 0:
            self.nondisplayed -= 1
        self.dont_process('</' + tag + '>',
                          '\n' if tag in self.block_tags else '')

    def handle_data(self, data):
        self.do_process(data)

    def handle_charref(self, name):
        reference = "&#" + name + ";"
        try:
            if name[0] in ('x', 'X'):
                char = unichr(int(name[1:], 16))
            else:
                char = unichr(int(name, 10))
        except (ValueError, OverflowError):
            # Not a representable code point: copy the reference verbatim.
            self.dont_process(reference, '')
            return
        if self.process_escaped(char):
            self.do_process(char)
        else:
            self.dont_process(reference, char)

    def handle_entityref(self, name):
        reference = "&" + name + ";"
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: copy it verbatim rather than failing.
            self.dont_process(reference, reference)
            return
        char = unichr(codepoint)
        if self.process_escaped(char):
            self.do_process(char)
        else:
            self.dont_process(reference, char)

    def handle_comment(self, data):
        self.dont_process('<!--' + data + '-->', '')

    def handle_decl(self, decl):
        self.dont_process('<!' + decl + '>', '')

    def handle_pi(self, data):
        self.dont_process('<?' + data + '>', '')


def html_escape(string):
    """HTML escapes a unicode string.

    `<`, `&` and `>` become named entities and every non-ASCII character
    becomes a decimal character reference, so the result is pure ASCII.
    """
    pieces = []
    for c in string:
        if c == '<':
            pieces.append('&lt;')
        elif c == '&':
            pieces.append('&amp;')
        elif c == '>':
            pieces.append('&gt;')
        elif ord(c) > 127:
            pieces.append('&#%d;' % ord(c))
        else:
            pieces.append(c)
    return ''.join(pieces)


def better_html(html, ruleset_names=default_rulesets,
                allow_escaped=must_html_escape):
    """Convert the text runs in `html` in accordance with the given rulesets.

    Valid ruleset names are keys from the `rulesets` dictionary. Note that
    the order in which rulesets are named is significant: a given character
    can only be converted by a single rule, so the first rule to match it
    wins.

    Only characters for which `allow_escaped` returns True will be considered
    for replacement if they are encoded as entities in the source HTML. This
    provides a general escaping mechanism for all characters other than `<`
    and `&`, which can only appear in HTML text as entities. To allow these
    characters to appear in escaped and unescaped sequences in the same HTML
    source, place `html_escapes` at the beginning of `ruleset_names` and
    precede '<' and '&' entities with backslashes to treat them as literals.
    """
    bt = BetterType(_collect_rules(ruleset_names))
    bh = BetterHtml(bt, html_escape, allow_escaped)
    bh.feed(html)
    return bh.get_output()


def better_smartypants(html, config='1'):
    """SmartyPants compatibility mode.

    `config` is a SmartyPants-style attribute string: '1', '2' and '3' are
    shorthands for 'qbde', 'qbDe' and 'qbie', '-1' selects the 'stupefy'
    ruleset, and otherwise each recognised letter enables one ruleset.
    """
    names = ['escapes']
    process_escaped = must_html_escape
    if '1' == config:
        config = 'qbde'
    if '2' == config:
        config = 'qbDe'
    if '3' == config:
        config = 'qbie'
    if '-1' == config:
        names.append('stupefy')
    # The order the rules are added is significant; don't re-order these
    # lines:
    if 'D' in config:
        names.append('dashes')
    if 'd' in config:
        names.append('simple dashes')
    if 'i' in config:
        names.append('reverse dashes')
    if 'e' in config:
        names.append('ellipses')
    if 'B' in config:
        names.append('all backticks')
    if 'b' in config:
        names.append('double backticks')
    if 'q' in config:
        names.append('educate quotes')
    if 'w' in config:
        process_escaped = often_html_escaped
    return better_html(html, names, process_escaped)


# Tests:

# NOTE(review): the expected values of the two HTML suites contain literal
# curly quotes and dashes, whereas html_escape() emits numeric character
# references for every non-ASCII character, and the long paragraph case
# carries no markup around the text it expects to stay unconverted. The
# data looks as though it was entity-decoded at some point --- confirm
# against the upstream test suite before relying on these suites.
test_suites = (
    ((lambda x: better_text(x), "standard text"), (
        ('''"_He_ doesn't think so..."''', u'“_He_ doesn’t think so…”'),
        ('''"Are you quite sure, ma'am?--is not there a little mistake?" said Jane. "I certainly saw Mr. Darcy speaking to her."''',
         u'''“Are you quite sure, ma’am?–is not there a little mistake?” said Jane. “I certainly saw Mr. Darcy speaking to her.”'''),
    )),
    ((lambda x: better_html(x), "standard HTML"), (
        # from smartypants.pl:
        ('''"Isn't this fun?"''', "“Isn’t this fun?”"),
        # adapted from smartypants.py (version 1.5_1.6):
        ("1440-80's", "1440-80’s"),
        ("1440-'80s", "1440-’80s"),
        ("1440---'80s", "1440—’80s"),
        ("1960s", "1960s"),
        ("1960's", "1960’s"),
        ("one two '60s", "one two ’60s"),
        ("'60s", "’60s"),
        ("""""", """"""),
        ("""

He said "Let's write some code." This code here if True:\n\tprint "Okay" is python code.

""", """

He said “Let’s write some code.” This code here if True:\n\tprint "Okay" is python code.

"""),
        ("21st century", "21st century"),
        ("3rd", "3rd"),
        ('''"Isn't this fun?"''', '''“Isn’t this fun?”'''),
    )),
    ((lambda x: better_html(x, ('escapes', 'reverse dashes', 'ellipses',
                                'educate quotes'), often_html_escaped),
      "smartypants.py compatible"), (
        # adapted from smartypants.py (version 1.5_1.6):
        ("1440-80's", "1440-80’s"),
        ("1440-'80s", "1440-’80s"),
        ("1440---'80s", "1440–’80s"),
        ("1960s", "1960s"),
        ("1960's", "1960’s"),
        ("one two '60s", "one two ’60s"),
        ("'60s", "’60s"),
        ("""""", """"""),
        ("""

He said "Let's write some code." This code here if True:\n\tprint "Okay" is python code.

""", """

He said “Let’s write some code.” This code here if True:\n\tprint "Okay" is python code.

"""),
        ("21st century", "21st century"),
        ("3rd", "3rd"),
        ('''"Isn't this fun?"''', '''“Isn’t this fun?”'''),
    )),
)


def run_tests():
    """Run every case in `test_suites`.

    Returns `(num_tests, failures)`, where each failure is a
    `(suite description, input, desired output, actual output)` tuple.
    """
    num_tests = 0
    failures = []
    for ((func, desc), tests) in test_suites:
        for (test, desired_output) in tests:
            num_tests += 1
            result = func(test)
            if result != desired_output:
                failures.append((desc, test, desired_output, result))
    return (num_tests, failures)


# Built-in rule library:

# Rules have four components: the characters to be replaced, a regular
# expression which must match before those characters, a regular expression
# which must match after those characters, and the replacement text.
# - These rules will always be used to match against plaintext---when
#   processing HTML all character entities will have been transformed to
#   their unicode counterparts.
# - All regular expressions will be interpreted under unicode semantics.
#   This means, for example, that \s will match any unicode whitespace,
#   such as the non-breaking spaces used in many HTML documents.
# - A character can only be replaced once, so if one rule matches at a
#   point in the input then no other rule will be matched against any
#   of the replaced (or replacement) characters.
# - Matching is done from the beginning to the end of input by position
#   of the characters to be replaced. If Rule 1 replaces 'foo', Rule 2
#   replaces 'oo', and Rule 3 replaces 'one', then for the string 'foone'
#   Rule 1 will have the first chance to match, Rule 2 the second chance,
#   and Rule 3 the last chance, regardless of the order in which they occur
#   in the rule sets. If any rule does match then the later rules will not---
#   after a Rule 1 match the only characters eligible for replacement would
#   be 'ne'.
# - Lookbehind and lookahead do not affect matching order, only whether or
#   not a match occurs.
# - At any given position, rules are matched in order, so the first matching
#   rule wins.
# - The string matched by lookbehind will already have had all possible rules # applied to it. The string matched by lookahead will not have been matched # against any rules. rulesets = { 'simple dashes' : [ Rule('--', '[^-]','[^-]', unichr(0x2014)), # EM DASH ], 'dashes' : [ Rule('--', '[^-]','[^-]', unichr(0x2013)), # EN DASH Rule('---', '[^-]','[^-]', unichr(0x2014)), # EM DASH Rule('----','[^-]','[^-]', unichr(0x2015)), # HORIZONTAL BAR ], 'reverse dashes' : [ # for compatibility---strongly discouraged Rule('--', '[^-]','[^-]', unichr(0x2014)), # EM DASH Rule('---', '[^-]','[^-]', unichr(0x2013)), # EN DASH ], 'figure dashes' : [ Rule('--', '[^-\D]','[^-\D]', unichr(0x2012)), # FIGURE DASH Rule('--', '[^-]','[^-]', unichr(0x2013)), # EN DASH Rule('---', '[^-]','[^-]', unichr(0x2014)), # EM DASH ], 'stupefy' : [ Rule(unichr(0x2012), '','', '--'), Rule(unichr(0x2013), '','', '--'), Rule(unichr(0x2014), '','', '---'), Rule(unichr(0x2015), '','', '----'), Rule(unichr(0x2026), '','', '...'), Rule(unichr(0x2018), '','', "'"), Rule(unichr(0x2019), '','', "'"), Rule(unichr(0x201C), '','', '"'), Rule(unichr(0x201D), '','', '"'), ], 'escapes' : [ Rule(r'\\', '','', '\\'), Rule(r'\"', '','', '"'), Rule(r"\'", '','', "'"), Rule(r'\`', '','', '`'), Rule(r'\-', '','', '-'), Rule(r'\.', '','', '.'), ], 'html_escapes' : [ Rule(r'\\', '','', '\\'), Rule(r'\<', '','', '<'), Rule(r'\&', '','', '&'), ], 'ellipses' : [ Rule('...', '[^.]','[^.]', unichr(0x2026)), # HORIZONTAL ELLIPSIS Rule('. . 
.', '[^.]','[^.]', unichr(0x2026)), # HORIZONTAL ELLIPSIS ], 'double backticks' : [ Rule('``', '','', unichr(0x201C)), # LEFT DOUBLE QUOTATION MARK Rule("''", '','', unichr(0x201D)), # RIGHT DOUBLE QUOTATION MARK ], 'single backticks' : [ Rule('`', '','', unichr(0x2018)), # LEFT SINGLE QUOTATION MARK Rule("'", '','', unichr(0x2019)), # RIGHT SINGLE QUOTATION MARK ], 'all backticks' : [ Rule('``', '','', unichr(0x201C)), # LEFT DOUBLE QUOTATION MARK Rule("''", '','', unichr(0x201D)), # RIGHT DOUBLE QUOTATION MARK Rule('`', '','', unichr(0x2018)), # LEFT SINGLE QUOTATION MARK Rule("'", '','', unichr(0x2019)), # RIGHT SINGLE QUOTATION MARK ], # 'smartypants simple quotes' : [ # # Rules used by SmartyPants.pl for single-character tokens # # We always have better context for guessing quotes, so there's little place for these rules. # Rule("'", r'\S','', unichr(0x2019)), # Rule("'", '','', unichr(0x2018)), # Rule('"', r'\S','', unichr(0x201D)), # Rule('"', '','', unichr(0x201C)), # ], 'educate quotes' : [ # Both these rules are used in SmartyPants.pl to deal with lack of context. # We should always have reasonable context; these rules would seldom help and occasionally hurt. # Rule("'", '^','''[!"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\\]\\^_`{|}~]\B''' u'’'), # Rule('"', '^','''[!"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\\]\\^_`{|}~]\B''', u'”'), Rule('"\'', '',r'\w', u'“‘'), # nested quotes before a word are open quotes Rule('\'"', '',r'\w', u'‘“'), Rule("'", '',r'\d0s', u'’'), # decades : "the '80s" Rule("'", '',r'\d\ds', u'’'), # SmartyPants.pl also allows this for non-zero years. 
Rule("'", ur'\s|(--)|\u2013|\u2014',r'\w', u'‘'), # whitespace or dashes + quote + word -> open quote Rule('"', ur'\s|(--)|\u2013|\u2014',r'\w', u'“'), Rule("'", r'[^\s\[\{\(\-]','', u'’'), # quotes following anything other than whitespace or "open"-style marks are close quotes Rule('"', r'[^\s\[\{\(\-]','', u'”'), Rule("'", '',r'\s|(s\b)', u'’'), # quotes followed by whitespace (or a single quote followed by s) are close quotes Rule('"', '',r'\s', u'”'), Rule("'", '','$', u'’'), # a quote at the end of a string is probably a close quote Rule('"', '','$', u'”'), Rule("'", '','', u'‘'), # anything else is probably an open quote Rule('"', '','', u'“'), ], 'symbols' : [ Rule('SS', '\n','\n', unichr(0x00A7)), # SECTION SIGN Rule('PTE', r'\b',r'\b', unichr(0x3250)), # PARTNERSHIP SIGN Rule('LTD', r'\b',r'\b', unichr(0x32CF)), # LIMITED LIABILITY SIGN Rule('(c)', '','', unichr(0x00A9)), # COPYRIGHT SIGN Rule('(C)', '','', unichr(0x00A9)), # COPYRIGHT SIGN Rule('(R)', '','', unichr(0x00AE)), # REGISTERED SIGN # Rule('No', '','', unichr(0x2116)), # NUMERO SIGN Rule('(TM)', '',r'\b', unichr(0x2122)), # TRADE MARK SIGN Rule('u', r'\d|\b','[AFglmsVW]\b', unichr(0x00B5)), # MICRO SIGN --- must go after units! Rule('...', '[^.]','[^.]', unichr(0x2026)), # HORIZONTAL ELLIPSIS Rule('. . .', '[^.]','[^.]', unichr(0x2026)), # HORIZONTAL ELLIPSIS Rule('..', '[^.]','[^.]', unichr(0x2025)), # TWO DOT LEADER Rule('o/oo', r'\d|\b','\b', unichr(0x2030)), # PER MILLE SIGN Rule('o/ooo', r'\d|\b','\b', unichr(0x2030)), # PER TEN THOUSAND SIGN Rule('!!', '[^?!]','[^?!]', unichr(0x203C)), # DOUBLE EXCLAMATION MARK # Rule('/', r'\d',r'\d', unichr(0x2044)), # FRACTION SLASH (needs numbers to be super/subs for spacing to work...) 
Rule('??', '[^?!]','[^?!]', unichr(0x2047)), # DOUBLE QUESTION MARK Rule('?!', '[^?!]','[^?!]', unichr(0x2048)), # QUESTION EXCLAMATION MARK Rule('!?', '[^?!]','[^?!]', unichr(0x2049)), # EXCLAMATION QUESTION MARK Rule('a/c', r'\b',r'\b', unichr(0x2100)), # ACCOUNT OF Rule('a/s', r'\b',r'\b', unichr(0x2101)), # ADDRESSED TO THE SUBJECT Rule('c/o', r'\b',r'\b', unichr(0x2105)), # CARE OF Rule('c/u', r'\b',r'\b', unichr(0x2106)), # CADA UNA ], # ('´', ('',''), unichr(0x2032)) # PRIME # ('´´', ('',''), unichr(0x2033)) # DOUBLE PRIME # ('´´´', ('',''), unichr(0x2034)) # TRIPLE PRIME # ('‵‵', ('',''), unichr(0x2036)) # REVERSED DOUBLE PRIME # ('‵‵‵', ('',''), unichr(0x2037)) # REVERSED TRIPLE PRIME # ('´´´´', ('',''), unichr(0x2057)) # QUADRUPLE PRIME 'superscripts' : [ Rule('^0', '',r'\D', unichr(0x2070)), # SUPERSCRIPT ZERO Rule('^1', '',r'\D', unichr(0x00B9)), # SUPERSCRIPT ONE Rule('^2', '',r'\D', unichr(0x00B2)), # SUPERSCRIPT TWO Rule('^3', '',r'\D', unichr(0x00B3)), # SUPERSCRIPT THREE Rule('^4', '',r'\D', unichr(0x2074)), # SUPERSCRIPT FOUR Rule('^5', '',r'\D', unichr(0x2075)), # SUPERSCRIPT FIVE Rule('^6', '',r'\D', unichr(0x2076)), # SUPERSCRIPT SIX Rule('^7', '',r'\D', unichr(0x2077)), # SUPERSCRIPT SEVEN Rule('^8', '',r'\D', unichr(0x2078)), # SUPERSCRIPT EIGHT Rule('^9', '',r'\D', unichr(0x2079)), # SUPERSCRIPT NINE ], 'subscripts': [ Rule('_0', '',r'\D', unichr(0x2080)), # SUBSCRIPT ZERO Rule('_1', '',r'\D', unichr(0x2081)), # SUBSCRIPT ONE Rule('_2', '',r'\D', unichr(0x2082)), # SUBSCRIPT TWO Rule('_3', '',r'\D', unichr(0x2083)), # SUBSCRIPT THREE Rule('_4', '',r'\D', unichr(0x2084)), # SUBSCRIPT FOUR Rule('_5', '',r'\D', unichr(0x2085)), # SUBSCRIPT FIVE Rule('_6', '',r'\D', unichr(0x2086)), # SUBSCRIPT SIX Rule('_7', '',r'\D', unichr(0x2087)), # SUBSCRIPT SEVEN Rule('_8', '',r'\D', unichr(0x2088)), # SUBSCRIPT EIGHT Rule('_9', '',r'\D', unichr(0x2089)), # SUBSCRIPT NINE ], 'currencies' : [ Rule('GBP ', r'\b',r'\d', unichr(0x00A3)), # POUND SIGN 
Rule('GBP', r'\b',r'\d', unichr(0x00A3)), # POUND SIGN Rule('JPY ', r'\b',r'\d', unichr(0x00A5)), # YEN SIGN Rule('JPY', r'\b',r'\d', unichr(0x00A5)), # YEN SIGN Rule('Rs ', r'\b',r'\d', unichr(0x20A8)), # RUPEE SIGN Rule('Rs', r'\b',r'\d', unichr(0x20A8)), # RUPEE SIGN Rule('INR ', r'\b',r'\d', unichr(0x20A8)), # RUPEE SIGN Rule('INR', r'\b',r'\d', unichr(0x20A8)), # RUPEE SIGN Rule('EUR ', r'\b',r'\d', unichr(0x20AC)), # EURO SIGN Rule('EUR', r'\b',r'\d', unichr(0x20AC)), # EURO SIGN ], 'fractions' : [ Rule('1⁄4', r'\D',r'\D', unichr(0x00BC)), # VULGAR FRACTION ONE QUARTER Rule('1⁄2', r'\D',r'\D', unichr(0x00BD)), # VULGAR FRACTION ONE HALF Rule('3⁄4', r'\D',r'\D', unichr(0x00BE)), # VULGAR FRACTION THREE QUARTERS Rule('1⁄3', r'\D',r'\D', unichr(0x2153)), # VULGAR FRACTION ONE THIRD Rule('2⁄3', r'\D',r'\D', unichr(0x2154)), # VULGAR FRACTION TWO THIRDS Rule('1⁄5', r'\D',r'\D', unichr(0x2155)), # VULGAR FRACTION ONE FIFTH Rule('2⁄5', r'\D',r'\D', unichr(0x2156)), # VULGAR FRACTION TWO FIFTHS Rule('3⁄5', r'\D',r'\D', unichr(0x2157)), # VULGAR FRACTION THREE FIFTHS Rule('4⁄5', r'\D',r'\D', unichr(0x2158)), # VULGAR FRACTION FOUR FIFTHS Rule('1⁄6', r'\D',r'\D', unichr(0x2159)), # VULGAR FRACTION ONE SIXTH Rule('5⁄6', r'\D',r'\D', unichr(0x215A)), # VULGAR FRACTION FIVE SIXTHS Rule('1⁄8', r'\D',r'\D', unichr(0x215B)), # VULGAR FRACTION ONE EIGHTH Rule('3⁄8', r'\D',r'\D', unichr(0x215C)), # VULGAR FRACTION THREE EIGHTHS Rule('5⁄8', r'\D',r'\D', unichr(0x215D)), # VULGAR FRACTION FIVE EIGHTHS Rule('7⁄8', r'\D',r'\D', unichr(0x215E)), # VULGAR FRACTION SEVEN EIGHTHS Rule('1⁄', r'\D',r'\s', unichr(0x215F)), # FRACTION NUMERATOR ONE ], # ('I', ('',''), unichr(0x2160)) # ROMAN NUMERAL ONE # ('II', ('',''), unichr(0x2161)) # ROMAN NUMERAL TWO # ('III', ('',''), unichr(0x2162)) # ROMAN NUMERAL THREE # ('IV', ('',''), unichr(0x2163)) # ROMAN NUMERAL FOUR # ('V', ('',''), unichr(0x2164)) # ROMAN NUMERAL FIVE # ('VI', ('',''), unichr(0x2165)) # ROMAN NUMERAL SIX # ('VII', 
('',''), unichr(0x2166)) # ROMAN NUMERAL SEVEN # ('VIII', ('',''), unichr(0x2167)) # ROMAN NUMERAL EIGHT # ('IX', ('',''), unichr(0x2168)) # ROMAN NUMERAL NINE # ('X', ('',''), unichr(0x2169)) # ROMAN NUMERAL TEN # ('XI', ('',''), unichr(0x216A)) # ROMAN NUMERAL ELEVEN # ('XII', ('',''), unichr(0x216B)) # ROMAN NUMERAL TWELVE # ('L', ('',''), unichr(0x216C)) # ROMAN NUMERAL FIFTY # ('C', ('',''), unichr(0x216D)) # ROMAN NUMERAL ONE HUNDRED # ('D', ('',''), unichr(0x216E)) # ROMAN NUMERAL FIVE HUNDRED # ('M', ('',''), unichr(0x216F)) # ROMAN NUMERAL ONE THOUSAND # ('i', ('',''), unichr(0x2170)) # SMALL ROMAN NUMERAL ONE # ('ii', ('',''), unichr(0x2171)) # SMALL ROMAN NUMERAL TWO # ('iii', ('',''), unichr(0x2172)) # SMALL ROMAN NUMERAL THREE # ('iv', ('',''), unichr(0x2173)) # SMALL ROMAN NUMERAL FOUR # ('v', ('',''), unichr(0x2174)) # SMALL ROMAN NUMERAL FIVE # ('vi', ('',''), unichr(0x2175)) # SMALL ROMAN NUMERAL SIX # ('vii', ('',''), unichr(0x2176)) # SMALL ROMAN NUMERAL SEVEN # ('viii', ('',''), unichr(0x2177)) # SMALL ROMAN NUMERAL EIGHT # ('ix', ('',''), unichr(0x2178)) # SMALL ROMAN NUMERAL NINE # ('x', ('',''), unichr(0x2179)) # SMALL ROMAN NUMERAL TEN # ('xi', ('',''), unichr(0x217A)) # SMALL ROMAN NUMERAL ELEVEN # ('xii', ('',''), unichr(0x217B)) # SMALL ROMAN NUMERAL TWELVE # ('l', ('',''), unichr(0x217C)) # SMALL ROMAN NUMERAL FIFTY # ('c', ('',''), unichr(0x217D)) # SMALL ROMAN NUMERAL ONE HUNDRED # ('d', ('',''), unichr(0x217E)) # SMALL ROMAN NUMERAL FIVE HUNDRED # ('m', ('',''), unichr(0x217F)) # SMALL ROMAN NUMERAL ONE THOUSAND 'arrows' : [ Rule('<->', '','', unichr(0x2194)), # LEFT RIGHT ARROW Rule('<-', '','', unichr(0x2190)), # LEFTWARDS ARROW Rule('->', '','', unichr(0x2192)), # RIGHTWARDS ARROW Rule('<=>', '','', unichr(0x21D4)), # LEFT RIGHT DOUBLE ARROW Rule('<=', '','', unichr(0x21D0)), # LEFTWARDS DOUBLE ARROW Rule('=>', '','', unichr(0x21D2)), # RIGHTWARDS DOUBLE ARROW ], 'mathematics' : [ Rule('+/-', '','', unichr(0x00B1)), # PLUS-MINUS 
SIGN Rule('x', '((\d\d)|[1-9]) ?',r' ?\d', unichr(0x00D7)), # MULTIPLICATION SIGN Rule('-', r'\d ?',r' ?\d', unichr(0x2212)), # MINUS SIGN # Rule('/', '','', unichr(0x2215)), # DIVISION SLASH # Rule('\\', '','', unichr(0x2216)), # SET MINUS Rule('/=', '','', unichr(0x2260)), # NOT EQUAL TO Rule('!=', '','', unichr(0x2260)), # NOT EQUAL TO Rule('<>', '','', unichr(0x2260)), # NOT EQUAL TO Rule('<=', '','', unichr(0x2264)), # LESS-THAN OR EQUAL TO Rule('>=', '','', unichr(0x2265)), # GREATER-THAN OR EQUAL TO Rule('<<', '[^<]','[^<]', unichr(0x226A)), # MUCH LESS-THAN Rule('>>', '[^>]','[^>]', unichr(0x226B)), # MUCH GREATER-THAN Rule('<<<', '[^<]','[^<]', unichr(0x22D8)), # VERY MUCH LESS-THAN Rule('>>>', '[^>]','[^>]', unichr(0x22D9)), # VERY MUCH GREATER-THAN Rule('::=', '','', unichr(0x2A74)), # DOUBLE COLON EQUAL Rule('==', '[^=]','[^=]', unichr(0x2A75)), # TWO CONSECUTIVE EQUALS SIGNS Rule('===', '[^=]','[^=]', unichr(0x2A76)), # THREE CONSECUTIVE EQUALS SIGNS ], 'character literals' : [ Rule('[NUL]', '','', unichr(0x2400)), # SYMBOL FOR NULL Rule('[SOH]', '','', unichr(0x2401)), # SYMBOL FOR START OF HEADING Rule('[STX]', '','', unichr(0x2402)), # SYMBOL FOR START OF TEXT Rule('[ETX]', '','', unichr(0x2403)), # SYMBOL FOR END OF TEXT Rule('[EOT]', '','', unichr(0x2404)), # SYMBOL FOR END OF TRANSMISSION Rule('[ENQ]', '','', unichr(0x2405)), # SYMBOL FOR ENQUIRY Rule('[ACK]', '','', unichr(0x2406)), # SYMBOL FOR ACKNOWLEDGE Rule('[BEL]', '','', unichr(0x2407)), # SYMBOL FOR BELL Rule('[BS]', '','', unichr(0x2408)), # SYMBOL FOR BACKSPACE Rule('[HT]', '','', unichr(0x2409)), # SYMBOL FOR HORIZONTAL TABULATION Rule('[LF]', '','', unichr(0x240A)), # SYMBOL FOR LINE FEED Rule('[VT]', '','', unichr(0x240B)), # SYMBOL FOR VERTICAL TABULATION Rule('[FF]', '','', unichr(0x240C)), # SYMBOL FOR FORM FEED Rule('[CR]', '','', unichr(0x240D)), # SYMBOL FOR CARRIAGE RETURN Rule('[SO]', '','', unichr(0x240E)), # SYMBOL FOR SHIFT OUT Rule('[SI]', '','', unichr(0x240F)), # 
SYMBOL FOR SHIFT IN Rule('[DLE]', '','', unichr(0x2410)), # SYMBOL FOR DATA LINK ESCAPE Rule('[DC1]', '','', unichr(0x2411)), # SYMBOL FOR DEVICE CONTROL ONE Rule('[DC2]', '','', unichr(0x2412)), # SYMBOL FOR DEVICE CONTROL TWO Rule('[DC3]', '','', unichr(0x2413)), # SYMBOL FOR DEVICE CONTROL THREE Rule('[DC4]', '','', unichr(0x2414)), # SYMBOL FOR DEVICE CONTROL FOUR Rule('[NAK]', '','', unichr(0x2415)), # SYMBOL FOR NEGATIVE ACKNOWLEDGE Rule('[SYN]', '','', unichr(0x2416)), # SYMBOL FOR SYNCHRONOUS IDLE Rule('[ETB]', '','', unichr(0x2417)), # SYMBOL FOR END OF TRANSMISSION BLOCK Rule('[CAN]', '','', unichr(0x2418)), # SYMBOL FOR CANCEL Rule('[EM]', '','', unichr(0x2419)), # SYMBOL FOR END OF MEDIUM Rule('[SUB]', '','', unichr(0x241A)), # SYMBOL FOR SUBSTITUTE Rule('[ESC]', '','', unichr(0x241B)), # SYMBOL FOR ESCAPE Rule('[FS]', '','', unichr(0x241C)), # SYMBOL FOR FILE SEPARATOR Rule('[GS]', '','', unichr(0x241D)), # SYMBOL FOR GROUP SEPARATOR Rule('[RS]', '','', unichr(0x241E)), # SYMBOL FOR RECORD SEPARATOR Rule('[US]', '','', unichr(0x241F)), # SYMBOL FOR UNIT SEPARATOR Rule('[SP]', '','', unichr(0x2420)), # SYMBOL FOR SPACE Rule('[DEL]', '','', unichr(0x2421)), # SYMBOL FOR DELETE Rule('[NL]', '','', unichr(0x2424)), # SYMBOL FOR NEWLINE ], 'units' : [ Rule('°C', '',r'\b', unichr(0x2103)), # DEGREE CELSIUS Rule('°F', '',r'\b', unichr(0x2109)), # DEGREE FAHRENHEIT Rule('Ohm', r'\d|\b',r'\b', unichr(0x2126)), # OHM SIGN Rule('Hg', r'\d|\b',r'\b', unichr(0x32CC)), # SQUARE HG Rule('erg', r'\d|\b',r'\b', unichr(0x32CD)), # SQUARE ERG Rule('eV', r'\d|\b',r'\b', unichr(0x32CE)), # SQUARE EV Rule('hPa', r'\d|\b',r'\b', unichr(0x3371)), # SQUARE HPA Rule('da', r'\d|\b',r'\b', unichr(0x3372)), # SQUARE DA Rule('AU', r'\d|\b',r'\b', unichr(0x3373)), # SQUARE AU Rule('bar', r'\d|\b',r'\b', unichr(0x3374)), # SQUARE BAR Rule('oV', r'\d|\b',r'\b', unichr(0x3375)), # SQUARE OV Rule('pc', r'\d|\b',r'\b', unichr(0x3376)), # SQUARE PC Rule('dm', r'\d|\b',r'\b', 
unichr(0x3377)), # SQUARE DM Rule('dm^2', r'\d|\b',r'\b', unichr(0x3378)), # SQUARE DM SQUARED Rule('dm^3', r'\d|\b',r'\b', unichr(0x3379)), # SQUARE DM CUBED Rule('IU', r'\d|\b',r'\b', unichr(0x337A)), # SQUARE IU Rule('pA', r'\d|\b',r'\b', unichr(0x3380)), # SQUARE PA AMPS Rule('nA', r'\d|\b',r'\b', unichr(0x3381)), # SQUARE NA Rule('μA', r'\d|\b',r'\b', unichr(0x3382)), # SQUARE MU A Rule('uA', r'\d|\b',r'\b', unichr(0x3382)), # SQUARE MU A Rule('mA', r'\d|\b',r'\b', unichr(0x3383)), # SQUARE MA Rule('kA', r'\d|\b',r'\b', unichr(0x3384)), # SQUARE KA Rule('KB', r'\d|\b',r'\b', unichr(0x3385)), # SQUARE KB Rule('MB', r'\d|\b',r'\b', unichr(0x3386)), # SQUARE MB Rule('GB', r'\d|\b',r'\b', unichr(0x3387)), # SQUARE GB Rule('cal', r'\d|\b',r'\b', unichr(0x3388)), # SQUARE CAL Rule('kcal', r'\d|\b',r'\b', unichr(0x3389)), # SQUARE KCAL Rule('pF', r'\d|\b',r'\b', unichr(0x338A)), # SQUARE PF Rule('nF', r'\d|\b',r'\b', unichr(0x338B)), # SQUARE NF Rule('μF', r'\d|\b',r'\b', unichr(0x338C)), # SQUARE MU F Rule('uF', r'\d|\b',r'\b', unichr(0x338C)), # SQUARE MU F Rule('μg', r'\d|\b',r'\b', unichr(0x338D)), # SQUARE MU G Rule('ug', r'\d|\b',r'\b', unichr(0x338D)), # SQUARE MU G Rule('mg', r'\d|\b',r'\b', unichr(0x338E)), # SQUARE MG Rule('kg', r'\d|\b',r'\b', unichr(0x338F)), # SQUARE KG Rule('Hz', r'\d|\b',r'\b', unichr(0x3390)), # SQUARE HZ Rule('kHz', r'\d|\b',r'\b', unichr(0x3391)), # SQUARE KHZ Rule('MHz', r'\d|\b',r'\b', unichr(0x3392)), # SQUARE MHZ Rule('GHz', r'\d|\b',r'\b', unichr(0x3393)), # SQUARE GHZ Rule('THz', r'\d|\b',r'\b', unichr(0x3394)), # SQUARE THZ Rule('μl', r'\d|\b',r'\b', unichr(0x3395)), # SQUARE MU L Rule('ul', r'\d|\b',r'\b', unichr(0x3395)), # SQUARE MU L Rule('ml', r'\d|\b',r'\b', unichr(0x3396)), # SQUARE ML Rule('dl', r'\d|\b',r'\b', unichr(0x3397)), # SQUARE DL Rule('kl', r'\d|\b',r'\b', unichr(0x3398)), # SQUARE KL Rule('fm', r'\d|\b',r'\b', unichr(0x3399)), # SQUARE FM Rule('nm', r'\d|\b',r'\b', unichr(0x339A)), # SQUARE NM Rule('μm', 
r'\d|\b',r'\b', unichr(0x339B)), # SQUARE MU M Rule('um', r'\d|\b',r'\b', unichr(0x339B)), # SQUARE MU M Rule('mm', r'\d|\b',r'\b', unichr(0x339C)), # SQUARE MM Rule('cm', r'\d|\b',r'\b', unichr(0x339D)), # SQUARE CM Rule('km', r'\d|\b',r'\b', unichr(0x339E)), # SQUARE KM Rule('mm^2', r'\d|\b',r'\b', unichr(0x339F)), # SQUARE MM SQUARED Rule('cm^2', r'\d|\b',r'\b', unichr(0x33A0)), # SQUARE CM SQUARED Rule('m^2', r'\d|\b',r'\b', unichr(0x33A1)), # SQUARE M SQUARED Rule('km^2', r'\d|\b',r'\b', unichr(0x33A2)), # SQUARE KM SQUARED Rule('mm^3', r'\d|\b',r'\b', unichr(0x33A3)), # SQUARE MM CUBED Rule('cm^3', r'\d|\b',r'\b', unichr(0x33A4)), # SQUARE CM CUBED Rule('m^3', r'\d|\b',r'\b', unichr(0x33A5)), # SQUARE M CUBED Rule('km^3', r'\d|\b',r'\b', unichr(0x33A6)), # SQUARE KM CUBED Rule('m/s', r'\d|\b',r'\b', unichr(0x33A7)), # SQUARE M OVER S Rule('m/s^2', r'\d|\b',r'\b', unichr(0x33A8)), # SQUARE M OVER S SQUARED Rule('Pa', r'\d|\b',r'\b', unichr(0x33A9)), # SQUARE PA Rule('kPa', r'\d|\b',r'\b', unichr(0x33AA)), # SQUARE KPA Rule('MPa', r'\d|\b',r'\b', unichr(0x33AB)), # SQUARE MPA Rule('GPa', r'\d|\b',r'\b', unichr(0x33AC)), # SQUARE GPA Rule('rad', r'\d|\b',r'\b', unichr(0x33AD)), # SQUARE RAD Rule('rad/s', r'\d|\b',r'\b', unichr(0x33AE)), # SQUARE RAD OVER S Rule('rad/s^2', r'\d|\b',r'\b', unichr(0x33AF)), # SQUARE RAD OVER S SQUARED Rule('ps', r'\d|\b',r'\b', unichr(0x33B0)), # SQUARE PS Rule('ns', r'\d|\b',r'\b', unichr(0x33B1)), # SQUARE NS Rule('μs', r'\d|\b',r'\b', unichr(0x33B2)), # SQUARE MU S Rule('us', r'\d|\b',r'\b', unichr(0x33B2)), # SQUARE MU S Rule('ms', r'\d|\b',r'\b', unichr(0x33B3)), # SQUARE MS Rule('pV', r'\d|\b',r'\b', unichr(0x33B4)), # SQUARE PV Rule('nV', r'\d|\b',r'\b', unichr(0x33B5)), # SQUARE NV Rule('μV', r'\d|\b',r'\b', unichr(0x33B6)), # SQUARE MU V Rule('uV', r'\d|\b',r'\b', unichr(0x33B6)), # SQUARE MU V Rule('mV', r'\d|\b',r'\b', unichr(0x33B7)), # SQUARE MV Rule('kV', r'\d|\b',r'\b', unichr(0x33B8)), # SQUARE KV Rule('MV', 
r'\d|\b',r'\b', unichr(0x33B9)), # SQUARE MV MEGA Rule('pW', r'\d|\b',r'\b', unichr(0x33BA)), # SQUARE PW Rule('nW', r'\d|\b',r'\b', unichr(0x33BB)), # SQUARE NW Rule('μW', r'\d|\b',r'\b', unichr(0x33BC)), # SQUARE MU W Rule('uW', r'\d|\b',r'\b', unichr(0x33BC)), # SQUARE MU W Rule('mW', r'\d|\b',r'\b', unichr(0x33BD)), # SQUARE MW Rule('kW', r'\d|\b',r'\b', unichr(0x33BE)), # SQUARE KW Rule('MW', r'\d|\b',r'\b', unichr(0x33BF)), # SQUARE MW MEGA Rule('kΩ', r'\d|\b',r'\b', unichr(0x33C0)), # SQUARE K OHM Rule('MΩ', r'\d|\b',r'\b', unichr(0x33C1)), # SQUARE M OHM Rule('a.m.', r'\d|\b',r'\b', unichr(0x33C2)), # SQUARE AM Rule('Bq', r'\d|\b',r'\b', unichr(0x33C3)), # SQUARE BQ Rule('cc', r'\d|\b',r'\b', unichr(0x33C4)), # SQUARE CC Rule('cd', r'\d|\b',r'\b', unichr(0x33C5)), # SQUARE CD Rule('C/kg', r'\d|\b',r'\b', unichr(0x33C6)), # SQUARE C OVER KG Rule('Co.', r'\d|\b',r'\b', unichr(0x33C7)), # SQUARE CO Rule('dB', r'\d|\b',r'\b', unichr(0x33C8)), # SQUARE DB Rule('Gy', r'\d|\b',r'\b', unichr(0x33C9)), # SQUARE GY Rule('ha', r'\d|\b',r'\b', unichr(0x33CA)), # SQUARE HA Rule('HP', r'\d|\b',r'\b', unichr(0x33CB)), # SQUARE HP Rule('in', r'\d|\b',r'\b', unichr(0x33CC)), # SQUARE IN Rule('KK', r'\d|\b',r'\b', unichr(0x33CD)), # SQUARE KK Rule('KM', r'\d|\b',r'\b', unichr(0x33CE)), # SQUARE KM CAPITAL Rule('kt', r'\d|\b',r'\b', unichr(0x33CF)), # SQUARE KT Rule('lm', r'\d|\b',r'\b', unichr(0x33D0)), # SQUARE LM Rule('ln', r'\d|\b',r'\b', unichr(0x33D1)), # SQUARE LN Rule('log', r'\d|\b',r'\b', unichr(0x33D2)), # SQUARE LOG Rule('lx', r'\d|\b',r'\b', unichr(0x33D3)), # SQUARE LX Rule('mb', r'\d|\b',r'\b', unichr(0x33D4)), # SQUARE MB SMALL Rule('mil', r'\d|\b',r'\b', unichr(0x33D5)), # SQUARE MIL Rule('mol', r'\d|\b',r'\b', unichr(0x33D6)), # SQUARE MOL Rule('PH', r'\d|\b',r'\b', unichr(0x33D7)), # SQUARE PH Rule('p.m.', r'\d|\b',r'\b', unichr(0x33D8)), # SQUARE PM Rule('PPM', r'\d|\b',r'\b', unichr(0x33D9)), # SQUARE PPM Rule('PR', r'\d|\b',r'\b', unichr(0x33DA)), # 
# Command-line interface:

def main(argv=None):
    """Run the BetterType command-line interface.

    argv -- list of command-line arguments, *excluding* the program name.
            When None (the default) optparse falls back to sys.argv[1:].

    Input comes from the FILE arguments and/or -a/--data expressions, or
    from standard input when neither is given.  Output is written to the
    file named by -o/--output (UTF-8 encoded) or to standard output.

    Returns a process exit status: 0 on success (or after --names), and 2
    if an exception was raised or, with --test, if any internal test
    failed.  optparse itself raises SystemExit for --help, --version and
    malformed command lines; that is deliberately not intercepted.
    """
    version = (
        "BetterType version " + __version__ + " by " + __author__ +
        "\n\nFor more information visit <" + __url__ + ">.\n" +
        __copyright__
    )
    usage = "%prog [options] [FILE...]"
    # optparse re-wraps the description, so only the wording matters here.
    description = (
        "BetterType reads the named input FILEs or an expression specified "
        "with the -a option (or standard input if neither is given) and "
        "converts character sequences into their unicode counterparts, "
        "e.g. 'educating' typewriter quotes into left and right quotation "
        "marks. A wide range of translation rulesets is available."
    )

    def report(text=""):
        """Write one line of text to stderr.

        Unicode objects are encoded explicitly: a redirected stderr has no
        encoding under Python 2, so an implicit conversion would raise
        UnicodeEncodeError on the curly quotes and dashes that appear in
        test reports.
        """
        if not isinstance(text, str):
            encoding = getattr(sys.stderr, "encoding", None) or "utf-8"
            text = text.encode(encoding, "replace")
        sys.stderr.write(text + "\n")

    try:
        parser = optparse.OptionParser(
            description=description, version=version, usage=usage
        )
        parser.add_option("-o", "--output",
            help="write output to FILE", metavar="FILE")
        parser.add_option("--test", action="store_true",
            help="show results of internal tests and exit")
        parser.add_option("--text",
            action="store_const", dest="format", const="text",
            help="treat input as plaintext")
        parser.add_option("--html",
            action="store_const", dest="format", const="html",
            help="treat input as HTML")
        parser.add_option("-r", "--ruleset",
            action="append", dest="rulesets",
            # optparse insists on a real list/tuple of choices.
            choices=sorted(rulesets.keys()),
            help="apply the rule set named NAME", metavar="NAME")
        parser.add_option("--names",
            action="store_true", dest="show_names",
            help="show loaded ruleset names and exit")
        parser.add_option("-q", "--quotes",
            action="append_const", dest="rulesets", const="educate quotes",
            help="translate \" and \' characters into curly quotes "
                 "based on context")
        parser.add_option("-b", "--double-backticks",
            action="append_const", dest="rulesets", const="double backticks",
            help="translate `` and '' sequences into double quotes")
        parser.add_option("-B", "--all-backticks",
            action="append_const", dest="rulesets", const="all backticks",
            help="translate `` and '' sequences into double quotes "
                 "and ` and ' into single quotes")
        parser.add_option("-d", "--simple-dashes",
            action="append_const", dest="rulesets", const="simple dashes",
            help="translate -- into em dash")
        parser.add_option("-D", "--dashes",
            action="append_const", dest="rulesets", const="dashes",
            help="translate -- and --- into en- and em-dashes")
        parser.add_option("-i", "--inverted-dashes",
            action="append_const", dest="rulesets", const="reverse dashes",
            help="translate -- into em dash and --- into en dash")
        parser.add_option("-e", "--ellipses",
            action="append_const", dest="rulesets", const="ellipses",
            help="translate ... sequences")
        parser.add_option("-w", "--process-escaped-quotes",
            action="store_true", dest="process_escaped_quotes",
            help="allow replacements for &quot; entities (HTML mode only)")
        parser.add_option("--escapes",
            action="append_const", dest="rulesets", const="escapes",
            help="enable backslash escapes for dashes, dots and quotes")
        parser.add_option("-a", "--data", action="append",
            help="perform transformation on EXPR", metavar="EXPR")

        # Honour the caller-supplied argument list instead of always
        # reading sys.argv.
        (options, args) = parser.parse_args(argv)

        if options.show_names:
            report("The following rulesets are available:")
            for name in sorted(rulesets.keys()):
                report("\t" + name)
            return 0

        if options.rulesets is None:
            options.rulesets = default_rulesets

        def call_func(string, filename=None):
            """Translate string as HTML or plain text.

            An explicit --text/--html flag wins; otherwise the input is
            treated as HTML only when it came from a .htm/.html file.
            """
            if options.format == "text":
                as_html = False
            elif options.format == "html":
                as_html = True
            else:
                as_html = bool(filename) and \
                    path.splitext(filename)[1] in ('.htm', '.html')
            if not as_html:
                return better_text(string, options.rulesets)
            if options.process_escaped_quotes:
                return better_html(string, options.rulesets,
                                   often_html_escaped)
            return better_html(string, options.rulesets)

        if options.test:
            (num_tests, failures) = run_tests()
            report("%s failures in %s tests\n" % (len(failures), num_tests))
            for (desc, test, desired, actual) in failures:
                report("In '" + desc + "' mode, this:\n")
                report(test)
                report("\nshould become this:\n")
                report(desired)
                report("\nbut instead became:\n")
                report(actual + "\n")
            if failures:
                return 2
            return 0

        if not args and options.data is None:
            options.data = (sys.stdin.read(),)

        if options.output:
            outfile = codecs.open(options.output, 'w', encoding='utf-8')
        else:
            enc = sys.stdout.encoding or 'utf-8'
            # Encode on the way out; use the underlying byte stream where
            # the interpreter exposes one.
            raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)
            outfile = codecs.getwriter(enc)(raw_stdout)

        try:
            if options.data is not None:
                for expr in options.data:
                    outfile.write(call_func(expr))
            for filename in args:
                # codecs.open always opens in binary mode when an encoding
                # is given, so plain 'r' is equivalent to the old 'U' mode.
                infile = codecs.open(filename, 'r', encoding='utf-8')
                try:
                    contents = infile.read()
                finally:
                    infile.close()
                outfile.write(call_func(contents, filename))
        finally:
            # Never close stdout itself; just make sure it is flushed.
            if options.output:
                outfile.close()
            else:
                outfile.flush()
        return 0
    except Exception:
        # sys.exc_info() keeps this handler valid on old and new Pythons.
        report(str(sys.exc_info()[1]))
        return 2


if __name__ == "__main__":
    sys.exit(main())