#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert 'dumb' plaintext sequences to their unicode counterparts.

It is not always convenient to make full use of the unicode character
set when text is created. The most common example of this is the use
of multiple hyphens to denote en- and em-dashes, multiple full-stops
to denote ellipses, and " and ' to denote curled quotation marks.

BetterType provides a simple and flexible framework for converting such
text into more robust unicode. The module includes a wide variety of
predefined translations, and provides a wrapper for applying
translations to HTML as well as plain text.

Use as a module:

    import bettertype

    x = bettertype.better_text("That girl---she's really hot!")

    # Can work with HTML fragments or whole pages:
    x = bettertype.better_html('<div>"Oh well..." she sighed.</div>')
    
    # Only translate ellipses and dashes:
    s = bettertype.better_text("Uh...", ['ellipses', 'dashes'])
"""

__url__ = "http://v.cx/2008/BetterType"
__author__ = "Rob Shearer"
__copyright__ = """
Copyright (c) 2008 Rob Shearer. All right reserved.

Redistribution and use of this software, in source and binary forms, with or
without modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

  * Neither the name of the author nor the names of its contributors may be
    used to endorse or promote products derived from this software without
    specific prior written permission.

This software might contain bugs. There might be bugs that stop it from doing
what it was designed to do. There might be bugs that make it do unexpected
things. Potentially catastrophic things. Anyone who uses this software does so
at their own risk; the authors take no responsibility for the results.
"""

# Release metadata. `__version_info__` is the tuple form of `__version__`.
# `__changelog__` maps each released version tuple to a dict holding the
# release date as (year, month, day) plus free-form notes for that release.
__version__ = "0.1.1"
__version_info__ = (0, 1, 1)
__changelog__ = {
    (0, 1, 0) : {
        'released' : (2008, 8, 16),
        'comment' : "Initial release",
    },
    (0, 1, 1) : {
        'released' : (2008, 12, 18),
        'fixed' : ["Removed fraction slash conversion"],
    }
}

import sys
import optparse
from os import path
import re
from collections import defaultdict, deque
from HTMLParser import HTMLParser
import htmlentitydefs
import codecs

class Rule(object):
    """Represents a rule to translate one text sequence to another.

    Rules are only applicable within a restricted 'context' defined by
    lookahead and lookbehind regular expressions.

    Attributes:
        to_replace: the literal, non-empty text sequence the rule replaces.
        lookbehind: compiled pattern applied with `match()` to the text
            preceding `to_replace`. The preceding text must be terminated
            by a single sentinel newline (BetterType appends one), and
            `lookbehind_pattern` must match immediately before it.
        lookahead: compiled pattern applied with `match()` to the text
            which immediately follows `to_replace`.
        replacement: the text substituted for `to_replace`.

    Raises:
        ValueError: if `to_replace` is empty.
    """
    def __init__(self, to_replace,
                 lookbehind_pattern, lookahead_pattern, replacement):
        # An explicit check rather than an assert: asserts vanish under
        # `python -O`, and an empty sequence cannot be indexed by its first
        # character when the rules are grouped for scanning.
        if not to_replace:
            raise ValueError("Rule requires a non-empty sequence to replace")
        self.to_replace = to_replace
        # `(?:.|\n)*` skips arbitrary leading context so that the
        # caller's pattern is anchored to the END of the preceding text;
        # the trailing `$\n$` consumes the sentinel newline.
        self.lookbehind = re.compile(
            r'(?:.|\n)*(?:' + lookbehind_pattern + r')$\n$', re.UNICODE
        )
        self.lookahead = re.compile(lookahead_pattern, re.UNICODE)
        self.replacement = replacement

class BetterType(object):
    """Translates an entire text string according to a collection of rules.

    `rules` is a sequence of rule objects, each providing `to_replace`,
    `lookbehind`, `lookahead` and `replacement` attributes. An instance is
    callable: `bt(string, prefix, suffix)` returns the translated string.
    An empty rule collection is allowed and leaves all text unchanged.
    """
    def __init__(self, rules):
        rules = list(rules)
        self._has_rules = bool(rules)
        # Group the rules by first character so that only plausible
        # candidates are tried at each scan position; order is preserved,
        # so the first matching rule at a position wins.
        self.rules_for_char = defaultdict(list)
        for r in rules: self.rules_for_char[r.to_replace[0]].append(r)
        # Locates the next position where any rule's literal text occurs.
        self.scanner = re.compile(
            '|'.join([re.escape(r.to_replace) for r in rules]),
            re.UNICODE
        )
        # `or [0]` keeps max() defined when there are no rules at all.
        self.max_replacement_chars = max(
            [len(r.replacement) for r in rules] or [0]
        )
        self.pre_chars = 50
        self.post_chars = 50
    def __call__(self, string, prefix='', suffix=''):
        """Return a translated version of `string`.

        The `prefix` and `suffix` strings are not translated, but translation
        is performed as though `string` were present in the context
        `prefix + string + suffix`.

        Every position at which some rule's text occurs is considered, in
        order, unless it lies inside text already replaced by an earlier
        rule. If no rule applies at a position, scanning resumes at the
        very next character.
        """
        if not self._has_rules:
            return string
        follow_chars = self.post_chars + self.max_replacement_chars
        pos = 0         # start of the text not yet copied to `output`
        scan_from = 0   # where the search for the next candidate resumes
        output = []
        while True:
            found = self.scanner.search(string, scan_from)
            if found is None: break
            match_pos = found.start()
            output.append(string[pos:match_pos])
            pos = match_pos
            # Lookbehind context: the already-translated output, ending
            # with the sentinel newline the lookbehind patterns expect.
            pre_context = "\n"
            for i in reversed(output):
                pre_context = i + pre_context
                if len(pre_context) >= self.pre_chars + 1: break
            if len(pre_context) < self.pre_chars:
                pre_context = (
                    prefix[len(pre_context) - self.pre_chars:] + pre_context
                )
            # Lookahead context: untranslated text, topped up from `suffix`
            # when `string` itself runs out.
            post_context = string[pos:pos + follow_chars]
            if len(post_context) < follow_chars:
                post_context += suffix[:follow_chars - len(post_context)]
            for r in self.rules_for_char[string[pos]]:
                if (string.startswith(r.to_replace, pos) and
                    r.lookbehind.match(pre_context) and
                    r.lookahead.match(post_context[len(r.to_replace):])):
                    output.append(r.replacement)
                    pos += len(r.to_replace)
                    break # done with work at this position; move on
            # After a replacement, resume after the replaced text; otherwise
            # step one character so that candidates overlapping the failed
            # one (e.g. 'oo' inside 'foo') still get their chance.
            scan_from = max(pos, match_pos + 1)
        output.append(string[pos:])
        return ''.join(output)
        
# Names of the rulesets applied when a caller does not pass its own list to
# better_text() or better_html(). The order is significant: at any position
# the first rule to match wins.
default_rulesets = ('escapes', 'dashes', 'ellipses', 'educate quotes', 'symbols')
"""The default rulesets roughly correspond with SmartyPants functionality."""

def better_text(text, ruleset_names=default_rulesets):
    """Translate plain `text` using the named rulesets and return the result.

    Each name must be a key of the `rulesets` dictionary. The rules are
    concatenated in the order the names are given, and that order matters:
    a character can be converted by one rule only, so whichever rule
    matches it first wins.
    """
    combined_rules = []
    for name in ruleset_names:
        combined_rules = combined_rules + rulesets[name]
    translate = BetterType(combined_rules)
    return translate(text)

# HTML interface:

def must_html_escape(c):
    """Return True if `c` is '&' or '<'."""
    always_escaped = ('&', '<')
    return c in always_escaped
def often_html_escaped(c):
    """Return True if `c` is '&', '<' or '"'."""
    commonly_escaped = ('&', '<', '"')
    return c in commonly_escaped

class BetterHtml(HTMLParser):
    """An HTML parser which transforms plaintext runs using a conversion function.

    Markup (tags, comments, declarations, processing instructions) is
    re-emitted as-is, while runs of text are passed through `convert` and
    then through `encode` before being emitted. Feed HTML with `feed()`
    and collect the result with `get_output()`.

    Arguments:
        convert: callable `convert(text, prefix, suffix)` returning the
            converted text, where `prefix` and `suffix` are the plain-text
            context surrounding `text`.
        encode: callable turning converted text back into HTML source.
        process_escaped: predicate deciding whether a character written as
            an entity or character reference is eligible for conversion.
    """
    class _ConversionBuffer(object):
        """Queues text runs until enough following context is available.

        Every item is a `(text, noise)` pair. `noise` is the literal output
        for the item, or None when `text` still has to be converted; in
        both cases `text` is what the item contributes to the plain-text
        context of its neighbours. The first queued item is always one
        awaiting conversion.
        """
        def __init__(self, convert, encode):
            self.buf = deque()
            self.prefix = ''
            self.output = []
            self.convert = convert
            self.encode = encode
            # BetterType publishes its context sizes as `pre_chars` and
            # `post_chars`; `prefix_chars` is still honoured as a fallback
            # because that is the name this class used to look up.
            self.pre_chars = getattr(
                convert, "pre_chars", getattr(convert, "prefix_chars", 50)
            )
            self.post_chars = getattr(convert, "post_chars", 50)
            assert self.post_chars > 0
        def update_prefix(self, text):
            """Append `text` to the context, keeping the last `pre_chars` characters."""
            if len(text) >= self.pre_chars:
                self.prefix = text[-self.pre_chars:]
            else:
                self.prefix = self.prefix[len(text) - self.pre_chars:] + text
        def __call__(self, in_text, in_noise):
            """Add an item, first flushing whatever now has enough lookahead.

            `in_text` is the item's plain-text view; `in_noise` is its
            literal output, or None if `in_text` must be converted.
            """
            # Characters of context queued AFTER the first pending item.
            suffix_length = sum([len(x[0]) for x in self.buf])
            if self.buf: suffix_length -= len(self.buf[0][0])
            while self.buf and suffix_length + len(in_text) > self.post_chars:
                (text, noise) = self.buf.popleft()
                assert noise is None
                suffix = (''.join([x[0] for x in self.buf]) +
                            in_text[:self.post_chars - suffix_length])
                assert len(suffix) <= self.post_chars
                text = self.convert(text, self.prefix, suffix)
                noise = self.encode(text)
                # Emit the converted text, then every literal item queued
                # behind it, stopping at the next item awaiting conversion.
                while True:
                    assert noise is not None
                    self.output.append(noise)
                    self.update_prefix(text)
                    if self.buf: suffix_length -= len(self.buf[0][0])
                    if not self.buf or self.buf[0][1] is None: break
                    (text, noise) = self.buf.popleft()
            if in_noise is None or self.buf:
                self.buf.append((in_text,in_noise))
            else:
                # Nothing is waiting on context, so literal output can be
                # written straight away.
                self.output.append(in_noise)
                self.update_prefix(in_text)

        def done(self):
            """Flush every queued item and return the complete output string."""
            while self.buf:
                (text, noise) = self.buf.popleft()
                if noise is None:
                    suffix = ''.join([x[0] for x in self.buf])
                    converted = self.convert(text, self.prefix, suffix)
                    noise = self.encode(converted)
                else: converted = text
                self.output.append(noise)
                self.update_prefix(converted)
            return ''.join(self.output)

    # Tags which contribute a newline to the plain-text context.
    block_tags = ('p', 'div', 'blockquote', 'pre', 'table',
                  'dl', 'ol', 'ul', 'li', 'iframe', 'hr', 'br',
                  'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # Tags whose content contributes nothing to the plain-text context.
    nondisplayed_tags = ('script', 'style')
    # Tags whose content is emitted without conversion.
    dont_process_tags = ('script', 'style',
                         'pre', 'code', 'kbd', 'math')
    def __init__(self, convert, encode, process_escaped=must_html_escape):
        self.process_escaped = (
            lambda x : process_escaped(x) and not self.suppress_processing
        )
        self.suppress_processing = 0    # nesting depth of dont_process_tags
        self.nondisplayed = 0           # nesting depth of nondisplayed_tags
        self.buf = self._ConversionBuffer(convert, encode)
        HTMLParser.__init__(self)

    def do_process(self, string):
        """Queue `string` for conversion, unless conversion is suppressed."""
        if self.suppress_processing: self.dont_process(string, string)
        else: self.buf(string, None)
    def dont_process(self, output, as_text):
        """Emit `output` literally; `as_text` is its plain-text equivalent."""
        self.buf('' if self.nondisplayed else as_text, output)
    def get_output(self):
        """Flush the buffer and return everything produced so far."""
        return self.buf.done()

    def handle_starttag(self, tag, attrs):
        if tag in self.dont_process_tags: self.suppress_processing += 1
        if tag in self.nondisplayed_tags: self.nondisplayed += 1
        self.dont_process(self.get_starttag_text(),
                          '\n' if tag in self.block_tags else '')
    def handle_startendtag(self, tag, attrs):
        self.dont_process(self.get_starttag_text(),
                          '\n' if tag in self.block_tags else '')
    def handle_endtag(self, tag):
        # Never count below zero: a stray end tag would otherwise leave a
        # negative (hence truthy) counter and switch conversion off.
        if tag in self.dont_process_tags and self.suppress_processing > 0:
            self.suppress_processing -= 1
        if tag in self.nondisplayed_tags and self.nondisplayed > 0:
            self.nondisplayed -= 1
        self.dont_process('</'+tag+'>',
                          '\n' if tag in self.block_tags else '')
    def handle_data(self, data): self.do_process(data)
    def handle_charref(self, name):
        try:
            # Hexadecimal references may use either 'x' or 'X'.
            if name[0] in ('x', 'X'): codepoint = int(name[1:], 16)
            else: codepoint = int(name, 10)
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            # Not representable as a character: pass the reference through.
            self.dont_process("&#"+name+";", '')
            return
        if self.process_escaped(char): self.do_process(char)
        else: self.dont_process("&#"+name+";", char)
    def handle_entityref(self, name):
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity name: pass the reference through untouched.
            self.dont_process("&"+name+";", '')
            return
        char = unichr(codepoint)
        if self.process_escaped(char): self.do_process(char)
        else: self.dont_process("&"+name+";", char)
    def handle_comment(self, data): self.dont_process('<!--'+data+'-->','')
    def handle_decl(self, decl): self.dont_process('<!'+decl+'>','')
    def handle_pi(self, data): self.dont_process('<?'+data+'>','')

def html_escape(string):
    """Return `string` with '<', '&' and '>' replaced by named entities.

    Every character whose ordinal is above 127 becomes a decimal character
    reference; all other characters are copied through unchanged.
    """
    named = {'<': '&lt;', '&': '&amp;', '>': '&gt;'}
    pieces = []
    for ch in string:
        if ch in named:
            pieces.append(named[ch])
        elif ord(ch) > 127:
            pieces.append('&#%d;' % ord(ch))
        else:
            pieces.append(ch)
    return ''.join(pieces)

def better_html(html,
    ruleset_names=default_rulesets, allow_escaped=must_html_escape):
    """Convert the text runs in `html` in accordance with the given rulesets.

    Valid ruleset names are keys from the `rulesets` dictionary. Note that the
    order in which rulesets are named is significant: a given character can
    only be converted by a single rule, so the first rule to match it wins.

    Only characters for which `allow_escaped` returns True will be considered
    for replacement if they are encoded as entities in the source HTML. This
    provides a general escaping mechanism for all characters other than `<`
    and `&`, which can only appear in HTML text as entities. To allow these
    characters to appear in escaped and unescaped sequences in the same HTML
    source, place `html_escapes` at the beginning of `ruleset_names` and
    precede '<' and '&' entities with backslashes to treat them as literals.

    Returns the converted HTML source as a string.
    """
    rules = []
    for name in ruleset_names:
        rules = rules + rulesets[name]
    bt = BetterType(rules)
    bh = BetterHtml(bt, html_escape, allow_escaped)
    bh.feed(html)
    # feed() alone may hold back trailing input that looks like the start
    # of an unfinished construct (for example text ending in "&D");
    # close() forces the parser to process whatever is still buffered so
    # that it reaches the output instead of being dropped.
    bh.close()
    return bh.get_output()

def better_smartypants(html, config='1'):
    """SmartyPants compatibility mode.

    `config` is either one of the presets '1', '2', '3' or '-1', or a
    collection of single-letter flags, each selecting one ruleset. The
    'escapes' ruleset is always applied first. The 'w' flag additionally
    allows characters accepted by `often_html_escaped` to be converted
    when they appear as entities.
    """
    # Numeric presets are shorthand for a fixed combination of flags.
    presets = (('1', 'qbde'), ('2', 'qbDe'), ('3', 'qbie'))
    for preset, flags in presets:
        if preset == config:
            config = flags
            break

    selected = ['escapes']
    if '-1' == config:
        selected.append('stupefy')

    # The order the rules are added is significant; don't re-order this table:
    flag_rulesets = (
        ('D', 'dashes'),
        ('d', 'simple dashes'),
        ('i', 'reverse dashes'),
        ('e', 'ellipses'),
        ('B', 'all backticks'),
        ('b', 'double backticks'),
        ('q', 'educate quotes'),
    )
    for flag, ruleset_name in flag_rulesets:
        if flag in config:
            selected.append(ruleset_name)

    if 'w' in config:
        process_escaped = often_html_escaped
    else:
        process_escaped = must_html_escape
    return better_html(html, selected, process_escaped)

# Tests:

# Each suite is a pair `((function, description), cases)`, where `cases` is
# a tuple of `(input, expected_output)` pairs. run_tests() calls `function`
# on every input and compares the result with the expected output.
# The HTML suites expect non-ASCII characters as decimal character
# references, which is how html_escape() encodes them.
test_suites = (
    ((lambda x: better_text(x), "standard text"), (
        ('''"_He_ doesn't think so..."''', u'“_He_ doesn’t think so…”'),
        ('''"Are you quite sure, ma'am?--is not there a little mistake?"
said Jane.  "I certainly saw Mr. Darcy speaking to her."''',
         u'''“Are you quite sure, ma’am?–is not there a little mistake?”
said Jane.  “I certainly saw Mr. Darcy speaking to her.”'''),
    )),
    ((lambda x: better_html(x), "standard HTML"), (
        # from smartypants.pl:
        ('''"Isn't this fun?"''', "&#8220;Isn&#8217;t this fun?&#8221;"),
        # adapted from smartypants.py (version 1.5_1.6):
        ("1440-80's", "1440-80&#8217;s"),
        ("1440-'80s", "1440-&#8217;80s"),
        ("1440---'80s", "1440&#8212;&#8217;80s"),
        ("1960s", "1960s"),
        ("1960's", "1960&#8217;s"),
        ("one two '60s", "one two &#8217;60s"),
        ("'60s", "&#8217;60s"),
        # Script content must come through untouched.
        ("""<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>""",
         """<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>"""),
        # Text inside <code> is left alone while the surrounding text is converted.
        ("""<p>He said "Let's write some code." This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>""",
         """<p>He said &#8220;Let&#8217;s write some code.&#8221; This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>"""),
        ("21st century", "21st century"),
        ("3rd", "3rd"),
        # (same case as the first one in this suite)
        ('''"Isn't this fun?"''', '''&#8220;Isn&#8217;t this fun?&#8221;'''),
    )),
    # Uses 'reverse dashes' and also converts characters written as &quot;.
    ((lambda x: better_html(x, ('escapes', 'reverse dashes', 'ellipses', 'educate quotes'), often_html_escaped),
      "smartypants.py compatible"), (
        # adapted from smartypants.py (version 1.5_1.6):
        ("1440-80's", "1440-80&#8217;s"),
        ("1440-'80s", "1440-&#8217;80s"),
        ("1440---'80s", "1440&#8211;&#8217;80s"),
        ("1960s", "1960s"),
        ("1960's", "1960&#8217;s"),
        ("one two '60s", "one two &#8217;60s"),
        ("'60s", "&#8217;60s"),
        ("""<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>""",
         """<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>"""),
        ("""<p>He said &quot;Let's write some code.&quot; This code here <code>if True:\n\tprint &quot;Okay&quot;</code> is python code.</p>""",
         """<p>He said &#8220;Let&#8217;s write some code.&#8221; This code here <code>if True:\n\tprint &quot;Okay&quot;</code> is python code.</p>"""),
        ("21st century", "21st century"),
        ("3rd", "3rd"),
        ('''"Isn't this fun?"''', '''&#8220;Isn&#8217;t this fun?&#8221;'''),
    )),
)

def run_tests():
    """Run every case in `test_suites`.

    Returns a pair `(num_tests, failures)`: the number of cases executed,
    and a list with one `(description, input, expected, actual)` tuple for
    each case whose result differed from the expected output.
    """
    executed = 0
    mismatches = []
    for (convert, label), cases in test_suites:
        for source, expected in cases:
            executed += 1
            actual = convert(source)
            if actual != expected:
                mismatches.append((label, source, expected, actual))
    return (executed, mismatches)

# Built-in rule library:

# Rules have four components: the characters to be replaced, a regular
# expression which must match before those characters, a regular expression
# which must match after those characters, and the replacement text.
#  - These rules will always be used to match against plaintext---when
#    processing HTML all character entities will have been transformed to
#    their unicode counterparts.
#  - All regular expressions will be interpreted under unicode semantics.
#    This means, for example, that \s will match any unicode whitespace,
#    such as the non-breaking spaces used in many HTML documents.
#  - A character can only be replaced once, so if one rule matches at a
#    point in the input then no other rule will be matched against any
#    of the replaced (or replacement) characters.
#  - Matching is done from the beginning to the end of input by position
#    of the characters to be replaced. If Rule 1 replaces 'foo', Rule 2
#    replaces 'oo', and Rule 3 replaces 'one', then for the string 'foone'
#    Rule 1 will have the first chance to match, Rule 2 the second chance,
#    and Rule 3 the last chance, regardless of the order in which they occur
#    in the rule sets. If any rule does match then the later rules will not---
#    after a Rule 1 match the only characters eligible for replacement would
#    be 'ne'.
#  - Lookbehind and lookahead do not affect matching order, only whether or
#    not a match occurs.
#  - At any given position, rules are matched in order, so the first matching
#    rule wins.
#  - The string matched by lookbehind will already have had all possible rules
#    applied to it. The string matched by lookahead will not have been matched
#    against any rules.

rulesets = {
    'simple dashes' : [
        Rule('--', '[^-]','[^-]', unichr(0x2014)), # EM DASH
    ],
    'dashes' : [
        Rule('--',  '[^-]','[^-]', unichr(0x2013)), # EN DASH
        Rule('---', '[^-]','[^-]', unichr(0x2014)), # EM DASH
        Rule('----','[^-]','[^-]', unichr(0x2015)), # HORIZONTAL BAR
    ],
    'reverse dashes' : [ # for compatibility---strongly discouraged
        Rule('--',  '[^-]','[^-]', unichr(0x2014)), # EM DASH
        Rule('---', '[^-]','[^-]', unichr(0x2013)), # EN DASH
    ],
    'figure dashes' : [
        Rule('--',  '[^-\D]','[^-\D]', unichr(0x2012)), # FIGURE DASH
        Rule('--',  '[^-]','[^-]', unichr(0x2013)), # EN DASH
        Rule('---', '[^-]','[^-]', unichr(0x2014)), # EM DASH
    ],
    
    'stupefy' : [
        Rule(unichr(0x2012), '','', '--'),
        Rule(unichr(0x2013), '','', '--'),
        Rule(unichr(0x2014), '','', '---'),
        Rule(unichr(0x2015), '','', '----'),
        Rule(unichr(0x2026), '','', '...'),
        Rule(unichr(0x2018), '','', "'"),
        Rule(unichr(0x2019), '','', "'"),
        Rule(unichr(0x201C), '','', '"'),
        Rule(unichr(0x201D), '','', '"'),
    ],
    
    'escapes' : [
        Rule(r'\\', '','',    '\\'),
        Rule(r'\"', '','',    '"'),
        Rule(r"\'", '','',    "'"),
        Rule(r'\`', '','',    '`'),
        Rule(r'\-', '','',    '-'),
        Rule(r'\.', '','',    '.'),
    ],
    
    'html_escapes' : [
        Rule(r'\\', '','',    '\\'),
        Rule(r'\<', '','',    '<'),
        Rule(r'\&', '','',    '&'),
    ],
    
    'ellipses' : [
        Rule('...',     '[^.]','[^.]',	unichr(0x2026)),	# HORIZONTAL ELLIPSIS
        Rule('. . .',	'[^.]','[^.]',	unichr(0x2026)),	# HORIZONTAL ELLIPSIS
    ],
    
    'double backticks' : [
        Rule('``',  '','', unichr(0x201C)),   # LEFT DOUBLE QUOTATION MARK
        Rule("''",  '','', unichr(0x201D)),   # RIGHT DOUBLE QUOTATION MARK
    ],
    
    'single backticks' : [
        Rule('`',   '','', unichr(0x2018)),   # LEFT SINGLE QUOTATION MARK
        Rule("'",   '','', unichr(0x2019)),   # RIGHT SINGLE QUOTATION MARK
    ],
    
    'all backticks' : [
        Rule('``',  '','', unichr(0x201C)),   # LEFT DOUBLE QUOTATION MARK
        Rule("''",  '','', unichr(0x201D)),   # RIGHT DOUBLE QUOTATION MARK
        Rule('`',   '','', unichr(0x2018)),   # LEFT SINGLE QUOTATION MARK
        Rule("'",   '','', unichr(0x2019)),   # RIGHT SINGLE QUOTATION MARK
    ],
    
    # 'smartypants simple quotes' : [
    #     # Rules used by SmartyPants.pl for single-character tokens
    #     # We always have better context for guessing quotes, so there's little place for these rules.
    #     Rule("'", r'\S','', unichr(0x2019)),
    #     Rule("'", '','', unichr(0x2018)),
    #     Rule('"', r'\S','', unichr(0x201D)),
    #     Rule('"', '','', unichr(0x201C)),
    # ],
    
    'educate quotes' : [
        # Both these rules are used in SmartyPants.pl to deal with lack of context.
        # We should always have reasonable context; these rules would seldom help and occasionally hurt.
        # Rule("'",   '^','''[!"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\\]\\^_`{|}~]\B''' u'’'),
        # Rule('"',   '^','''[!"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\\]\\^_`{|}~]\B''', u'”'),
        
        Rule('"\'', '',r'\w', u'“‘'),   # nested quotes before a word are open quotes
        Rule('\'"', '',r'\w', u'‘“'),
        Rule("'",   '',r'\d0s', u'’'),  # decades : "the '80s"
        Rule("'",   '',r'\d\ds', u'’'), # SmartyPants.pl also allows this for non-zero years.
        Rule("'",   ur'\s|(--)|\u2013|\u2014',r'\w',  u'‘'), # whitespace or dashes + quote + word -> open quote
        Rule('"',   ur'\s|(--)|\u2013|\u2014',r'\w',  u'“'),
        Rule("'",   r'[^\s\[\{\(\-]','',  u'’'), # quotes following anything other than whitespace or "open"-style marks are close quotes
        Rule('"',   r'[^\s\[\{\(\-]','',  u'”'),
        Rule("'",   '',r'\s|(s\b)', u'’'), # quotes followed by whitespace (or a single quote followed by s) are close quotes
        Rule('"',   '',r'\s', u'”'),
        Rule("'", '','$', u'’'), # a quote at the end of a string is probably a close quote
        Rule('"', '','$', u'”'),
        Rule("'",   '','',  u'‘'), # anything else is probably an open quote
        Rule('"',   '','',  u'“'),
    ],

    'symbols' : [
        Rule('SS',	'\n','\n',	unichr(0x00A7)),	# SECTION SIGN
        Rule('PTE',	r'\b',r'\b',	unichr(0x3250)),	# PARTNERSHIP SIGN
        Rule('LTD',	r'\b',r'\b',	unichr(0x32CF)),	# LIMITED LIABILITY SIGN
        Rule('(c)',	'','',	unichr(0x00A9)),		# COPYRIGHT SIGN
        Rule('(C)',	'','',	unichr(0x00A9)),		# COPYRIGHT SIGN
        Rule('(R)',	'','',	unichr(0x00AE)),		# REGISTERED SIGN
        # Rule('No',    '','', unichr(0x2116)),   # NUMERO SIGN
        Rule('(TM)',	'',r'\b',	unichr(0x2122)),	    # TRADE MARK SIGN
        Rule('u',	r'\d|\b','[AFglmsVW]\b',	unichr(0x00B5)), # MICRO SIGN --- must go after units!
        Rule('...',	'[^.]','[^.]',	unichr(0x2026)),		# HORIZONTAL ELLIPSIS
        Rule('. . .',	'[^.]','[^.]',	unichr(0x2026)),	# HORIZONTAL ELLIPSIS
        Rule('..',	'[^.]','[^.]',	unichr(0x2025)),		# TWO DOT LEADER
        Rule('o/oo',	r'\d|\b','\b',	unichr(0x2030)),	# PER MILLE SIGN
        Rule('o/ooo',	r'\d|\b','\b',	unichr(0x2030)),	# PER TEN THOUSAND SIGN
        Rule('!!',	'[^?!]','[^?!]',	unichr(0x203C)),	# DOUBLE EXCLAMATION MARK
        # Rule('/', r'\d',r'\d',    unichr(0x2044)),        # FRACTION SLASH (needs numbers to be super/subs for spacing to work...)
        Rule('??',	'[^?!]','[^?!]',	unichr(0x2047)),	# DOUBLE QUESTION MARK
        Rule('?!',	'[^?!]','[^?!]',	unichr(0x2048)),	# QUESTION EXCLAMATION MARK
        Rule('!?',	'[^?!]','[^?!]',	unichr(0x2049)),	# EXCLAMATION QUESTION MARK
        Rule('a/c',	r'\b',r'\b',	unichr(0x2100)),		# ACCOUNT OF
        Rule('a/s',	r'\b',r'\b',	unichr(0x2101)),		# ADDRESSED TO THE SUBJECT
        Rule('c/o',	r'\b',r'\b',	unichr(0x2105)),		# CARE OF
        Rule('c/u',	r'\b',r'\b',	unichr(0x2106)),		# CADA UNA
    ],

# ('´', ('',''), unichr(0x2032))        # PRIME
# ('´´',    ('',''), unichr(0x2033))    # DOUBLE PRIME
# ('´´´',   ('',''), unichr(0x2034))    # TRIPLE PRIME
# ('‵‵',    ('',''), unichr(0x2036))    # REVERSED DOUBLE PRIME
# ('‵‵‵',   ('',''), unichr(0x2037))    # REVERSED TRIPLE PRIME
# ('´´´´',  ('',''), unichr(0x2057))    # QUADRUPLE PRIME

    'superscripts' : [
        Rule('^0',	'',r'\D',	unichr(0x2070)),	# SUPERSCRIPT ZERO
        Rule('^1',	'',r'\D',	unichr(0x00B9)),    # SUPERSCRIPT ONE
        Rule('^2',	'',r'\D',	unichr(0x00B2)),    # SUPERSCRIPT TWO
        Rule('^3',	'',r'\D',	unichr(0x00B3)),    # SUPERSCRIPT THREE
        Rule('^4',	'',r'\D',	unichr(0x2074)),	# SUPERSCRIPT FOUR
        Rule('^5',	'',r'\D',	unichr(0x2075)),	# SUPERSCRIPT FIVE
        Rule('^6',	'',r'\D',	unichr(0x2076)),    # SUPERSCRIPT SIX
        Rule('^7',	'',r'\D',	unichr(0x2077)),	# SUPERSCRIPT SEVEN
        Rule('^8',	'',r'\D',	unichr(0x2078)),	# SUPERSCRIPT EIGHT
        Rule('^9',	'',r'\D',	unichr(0x2079)),	# SUPERSCRIPT NINE
    ],
    'subscripts': [
        Rule('_0',	'',r'\D',	unichr(0x2080)),	# SUBSCRIPT ZERO
        Rule('_1',	'',r'\D',	unichr(0x2081)),    # SUBSCRIPT ONE
        Rule('_2',	'',r'\D',	unichr(0x2082)),    # SUBSCRIPT TWO
        Rule('_3',	'',r'\D',	unichr(0x2083)),    # SUBSCRIPT THREE
        Rule('_4',	'',r'\D',	unichr(0x2084)),	# SUBSCRIPT FOUR
        Rule('_5',	'',r'\D',	unichr(0x2085)),	# SUBSCRIPT FIVE
        Rule('_6',	'',r'\D',	unichr(0x2086)),    # SUBSCRIPT SIX
        Rule('_7',	'',r'\D',	unichr(0x2087)),	# SUBSCRIPT SEVEN
        Rule('_8',	'',r'\D',	unichr(0x2088)),	# SUBSCRIPT EIGHT
        Rule('_9',	'',r'\D',	unichr(0x2089)),	# SUBSCRIPT NINE
    ],

    'currencies' : [
        Rule('GBP ',	r'\b',r'\d',	unichr(0x00A3)),	# POUND SIGN
        Rule('GBP',     r'\b',r'\d',	unichr(0x00A3)),	# POUND SIGN
        Rule('JPY ',	r'\b',r'\d',	unichr(0x00A5)),	# YEN SIGN
        Rule('JPY', 	r'\b',r'\d',	unichr(0x00A5)),	# YEN SIGN
        Rule('Rs ', 	r'\b',r'\d',	unichr(0x20A8)),	# RUPEE SIGN
        Rule('Rs',  	r'\b',r'\d',	unichr(0x20A8)),	# RUPEE SIGN
        Rule('INR ',	r'\b',r'\d',	unichr(0x20A8)),	# RUPEE SIGN
        Rule('INR',	    r'\b',r'\d',	unichr(0x20A8)),	# RUPEE SIGN
        Rule('EUR ',	r'\b',r'\d',	unichr(0x20AC)),	# EURO SIGN
        Rule('EUR',	    r'\b',r'\d',	unichr(0x20AC)),	# EURO SIGN
    ],

    'fractions' : [
        Rule('1⁄4',	r'\D',r'\D',	unichr(0x00BC)),    # VULGAR FRACTION ONE QUARTER
        Rule('1⁄2',	r'\D',r'\D',	unichr(0x00BD)),	# VULGAR FRACTION ONE HALF
        Rule('3⁄4',	r'\D',r'\D',	unichr(0x00BE)),	# VULGAR FRACTION THREE QUARTERS
        Rule('1⁄3',	r'\D',r'\D',	unichr(0x2153)),	# VULGAR FRACTION ONE THIRD
        Rule('2⁄3',	r'\D',r'\D',	unichr(0x2154)),	# VULGAR FRACTION TWO THIRDS
        Rule('1⁄5',	r'\D',r'\D',	unichr(0x2155)),	# VULGAR FRACTION ONE FIFTH
        Rule('2⁄5',	r'\D',r'\D',	unichr(0x2156)),	# VULGAR FRACTION TWO FIFTHS
        Rule('3⁄5',	r'\D',r'\D',	unichr(0x2157)),	# VULGAR FRACTION THREE FIFTHS
        Rule('4⁄5',	r'\D',r'\D',	unichr(0x2158)),	# VULGAR FRACTION FOUR FIFTHS
        Rule('1⁄6',	r'\D',r'\D',	unichr(0x2159)),	# VULGAR FRACTION ONE SIXTH
        Rule('5⁄6',	r'\D',r'\D',	unichr(0x215A)),	# VULGAR FRACTION FIVE SIXTHS
        Rule('1⁄8',	r'\D',r'\D',	unichr(0x215B)),	# VULGAR FRACTION ONE EIGHTH
        Rule('3⁄8',	r'\D',r'\D',	unichr(0x215C)),	# VULGAR FRACTION THREE EIGHTHS
        Rule('5⁄8',	r'\D',r'\D',	unichr(0x215D)),	# VULGAR FRACTION FIVE EIGHTHS
        Rule('7⁄8',	r'\D',r'\D',	unichr(0x215E)),	# VULGAR FRACTION SEVEN EIGHTHS
        Rule('1⁄',	r'\D',r'\s',	unichr(0x215F)),	# FRACTION NUMERATOR ONE
    ],

# ('I', ('',''), unichr(0x2160))        # ROMAN NUMERAL ONE
# ('II',    ('',''), unichr(0x2161))        # ROMAN NUMERAL TWO
# ('III',   ('',''), unichr(0x2162))        # ROMAN NUMERAL THREE
# ('IV',    ('',''), unichr(0x2163))        # ROMAN NUMERAL FOUR
# ('V', ('',''), unichr(0x2164))        # ROMAN NUMERAL FIVE
# ('VI',    ('',''), unichr(0x2165))        # ROMAN NUMERAL SIX
# ('VII',   ('',''), unichr(0x2166))        # ROMAN NUMERAL SEVEN
# ('VIII',  ('',''), unichr(0x2167))        # ROMAN NUMERAL EIGHT
# ('IX',    ('',''), unichr(0x2168))        # ROMAN NUMERAL NINE
# ('X', ('',''), unichr(0x2169))        # ROMAN NUMERAL TEN
# ('XI',    ('',''), unichr(0x216A))        # ROMAN NUMERAL ELEVEN
# ('XII',   ('',''), unichr(0x216B))        # ROMAN NUMERAL TWELVE
# ('L', ('',''), unichr(0x216C))        # ROMAN NUMERAL FIFTY
# ('C', ('',''), unichr(0x216D))        # ROMAN NUMERAL ONE HUNDRED
# ('D', ('',''), unichr(0x216E))        # ROMAN NUMERAL FIVE HUNDRED
# ('M', ('',''), unichr(0x216F))        # ROMAN NUMERAL ONE THOUSAND
# ('i', ('',''), unichr(0x2170))        # SMALL ROMAN NUMERAL ONE
# ('ii',    ('',''), unichr(0x2171))        # SMALL ROMAN NUMERAL TWO
# ('iii',   ('',''), unichr(0x2172))        # SMALL ROMAN NUMERAL THREE
# ('iv',    ('',''), unichr(0x2173))        # SMALL ROMAN NUMERAL FOUR
# ('v', ('',''), unichr(0x2174))        # SMALL ROMAN NUMERAL FIVE
# ('vi',    ('',''), unichr(0x2175))        # SMALL ROMAN NUMERAL SIX
# ('vii',   ('',''), unichr(0x2176))        # SMALL ROMAN NUMERAL SEVEN
# ('viii',  ('',''), unichr(0x2177))        # SMALL ROMAN NUMERAL EIGHT
# ('ix',    ('',''), unichr(0x2178))        # SMALL ROMAN NUMERAL NINE
# ('x', ('',''), unichr(0x2179))        # SMALL ROMAN NUMERAL TEN
# ('xi',    ('',''), unichr(0x217A))        # SMALL ROMAN NUMERAL ELEVEN
# ('xii',   ('',''), unichr(0x217B))        # SMALL ROMAN NUMERAL TWELVE
# ('l', ('',''), unichr(0x217C))        # SMALL ROMAN NUMERAL FIFTY
# ('c', ('',''), unichr(0x217D))        # SMALL ROMAN NUMERAL ONE HUNDRED
# ('d', ('',''), unichr(0x217E))        # SMALL ROMAN NUMERAL FIVE HUNDRED
# ('m', ('',''), unichr(0x217F))        # SMALL ROMAN NUMERAL ONE THOUSAND

    'arrows' : [
        Rule('<->',	'','',	unichr(0x2194)),		# LEFT RIGHT ARROW
        Rule('<-',	'','',	unichr(0x2190)),		# LEFTWARDS ARROW
        Rule('->',	'','',	unichr(0x2192)),		# RIGHTWARDS ARROW
        Rule('<=>',	'','',	unichr(0x21D4)),		# LEFT RIGHT DOUBLE ARROW
        Rule('<=',	'','',	unichr(0x21D0)),		# LEFTWARDS DOUBLE ARROW
        Rule('=>',	'','',	unichr(0x21D2)),		# RIGHTWARDS DOUBLE ARROW
    ],

    'mathematics' : [
        Rule('+/-',	'','',	unichr(0x00B1)),		    # PLUS-MINUS SIGN
        Rule('x',	'((\d\d)|[1-9]) ?',r' ?\d',   unichr(0x00D7)),  # MULTIPLICATION SIGN
        Rule('-',	r'\d ?',r' ?\d',	unichr(0x2212)),	# MINUS SIGN
        # Rule('/', '','', unichr(0x2215)),           # DIVISION SLASH
        # Rule('\\',    '','', unichr(0x2216)),       # SET MINUS
        Rule('/=',	'','',	unichr(0x2260)),		    # NOT EQUAL TO
        Rule('!=',	'','',	unichr(0x2260)),		    # NOT EQUAL TO
        Rule('<>',	'','',	unichr(0x2260)),		    # NOT EQUAL TO
        Rule('<=',	'','',	unichr(0x2264)),		    # LESS-THAN OR EQUAL TO
        Rule('>=',	'','',	unichr(0x2265)),		    # GREATER-THAN OR EQUAL TO
        Rule('<<',	'[^<]','[^<]',	unichr(0x226A)),	# MUCH LESS-THAN
        Rule('>>',	'[^>]','[^>]',	unichr(0x226B)),	# MUCH GREATER-THAN
        Rule('<<<',	'[^<]','[^<]',	unichr(0x22D8)),	# VERY MUCH LESS-THAN
        Rule('>>>',	'[^>]','[^>]',	unichr(0x22D9)),	# VERY MUCH GREATER-THAN
        Rule('::=',	'','',	        unichr(0x2A74)),	# DOUBLE COLON EQUAL
        Rule('==',	'[^=]','[^=]',	unichr(0x2A75)),	# TWO CONSECUTIVE EQUALS SIGNS
        Rule('===',	'[^=]','[^=]',	unichr(0x2A76)),	# THREE CONSECUTIVE EQUALS SIGNS
    ],

    'character literals' : [
        Rule('[NUL]',   '','',    unichr(0x2400)),		# SYMBOL FOR NULL
        Rule('[SOH]',	'','',	unichr(0x2401)),		# SYMBOL FOR START OF HEADING
        Rule('[STX]',	'','',	unichr(0x2402)),		# SYMBOL FOR START OF TEXT
        Rule('[ETX]',	'','',	unichr(0x2403)),		# SYMBOL FOR END OF TEXT
        Rule('[EOT]',	'','',	unichr(0x2404)),		# SYMBOL FOR END OF TRANSMISSION
        Rule('[ENQ]',	'','',	unichr(0x2405)),		# SYMBOL FOR ENQUIRY
        Rule('[ACK]',	'','',	unichr(0x2406)),		# SYMBOL FOR ACKNOWLEDGE
        Rule('[BEL]',	'','',	unichr(0x2407)),		# SYMBOL FOR BELL
        Rule('[BS]',	'','',	unichr(0x2408)),		# SYMBOL FOR BACKSPACE
        Rule('[HT]',	'','',	unichr(0x2409)),		# SYMBOL FOR HORIZONTAL TABULATION
        Rule('[LF]',	'','',	unichr(0x240A)),		# SYMBOL FOR LINE FEED
        Rule('[VT]',	'','',	unichr(0x240B)),		# SYMBOL FOR VERTICAL TABULATION
        Rule('[FF]',	'','',	unichr(0x240C)),		# SYMBOL FOR FORM FEED
        Rule('[CR]',	'','',	unichr(0x240D)),		# SYMBOL FOR CARRIAGE RETURN
        Rule('[SO]',	'','',	unichr(0x240E)),		# SYMBOL FOR SHIFT OUT
        Rule('[SI]',	'','',	unichr(0x240F)),		# SYMBOL FOR SHIFT IN
        Rule('[DLE]',	'','',	unichr(0x2410)),		# SYMBOL FOR DATA LINK ESCAPE
        Rule('[DC1]',	'','',	unichr(0x2411)),		# SYMBOL FOR DEVICE CONTROL ONE
        Rule('[DC2]',	'','',	unichr(0x2412)),		# SYMBOL FOR DEVICE CONTROL TWO
        Rule('[DC3]',	'','',	unichr(0x2413)),		# SYMBOL FOR DEVICE CONTROL THREE
        Rule('[DC4]',	'','',	unichr(0x2414)),		# SYMBOL FOR DEVICE CONTROL FOUR
        Rule('[NAK]',	'','',	unichr(0x2415)),		# SYMBOL FOR NEGATIVE ACKNOWLEDGE
        Rule('[SYN]',	'','',	unichr(0x2416)),		# SYMBOL FOR SYNCHRONOUS IDLE
        Rule('[ETB]',	'','',	unichr(0x2417)),		# SYMBOL FOR END OF TRANSMISSION BLOCK
        Rule('[CAN]',	'','',	unichr(0x2418)),		# SYMBOL FOR CANCEL
        Rule('[EM]',	'','',	unichr(0x2419)),		# SYMBOL FOR END OF MEDIUM
        Rule('[SUB]',	'','',	unichr(0x241A)),		# SYMBOL FOR SUBSTITUTE
        Rule('[ESC]',	'','',	unichr(0x241B)),		# SYMBOL FOR ESCAPE
        Rule('[FS]',	'','',	unichr(0x241C)),		# SYMBOL FOR FILE SEPARATOR
        Rule('[GS]',	'','',	unichr(0x241D)),		# SYMBOL FOR GROUP SEPARATOR
        Rule('[RS]',	'','',	unichr(0x241E)),		# SYMBOL FOR RECORD SEPARATOR
        Rule('[US]',	'','',	unichr(0x241F)),		# SYMBOL FOR UNIT SEPARATOR
        Rule('[SP]',	'','',	unichr(0x2420)),		# SYMBOL FOR SPACE
        Rule('[DEL]',	'','',	unichr(0x2421)),		# SYMBOL FOR DELETE
        Rule('[NL]',	'','',	unichr(0x2424)),		# SYMBOL FOR NEWLINE
    ],

    'units' : [
        Rule('°C',	'',r'\b',	unichr(0x2103)),	            # DEGREE CELSIUS
        Rule('°F',	'',r'\b',	unichr(0x2109)),		        # DEGREE FAHRENHEIT
        Rule('Ohm',	r'\d|\b',r'\b',	unichr(0x2126)),		# OHM SIGN
        Rule('Hg',	r'\d|\b',r'\b',	unichr(0x32CC)),		# SQUARE HG
        Rule('erg',	r'\d|\b',r'\b',	unichr(0x32CD)),		# SQUARE ERG
        Rule('eV',	r'\d|\b',r'\b',	unichr(0x32CE)),		# SQUARE EV
        Rule('hPa',	r'\d|\b',r'\b',	unichr(0x3371)),		# SQUARE HPA
        Rule('da',	r'\d|\b',r'\b',	unichr(0x3372)),		# SQUARE DA
        Rule('AU',	r'\d|\b',r'\b',	unichr(0x3373)),		# SQUARE AU
        Rule('bar',	r'\d|\b',r'\b',	unichr(0x3374)),		# SQUARE BAR
        Rule('oV',	r'\d|\b',r'\b',	unichr(0x3375)),		# SQUARE OV
        Rule('pc',	r'\d|\b',r'\b',	unichr(0x3376)),		# SQUARE PC
        Rule('dm',	r'\d|\b',r'\b',	unichr(0x3377)),		# SQUARE DM
        Rule('dm^2',	r'\d|\b',r'\b',	unichr(0x3378)),	# SQUARE DM SQUARED
        Rule('dm^3',	r'\d|\b',r'\b',	unichr(0x3379)),	# SQUARE DM CUBED
        Rule('IU',	r'\d|\b',r'\b',	unichr(0x337A)),		# SQUARE IU
        Rule('pA',	r'\d|\b',r'\b',	unichr(0x3380)),		# SQUARE PA AMPS
        Rule('nA',	r'\d|\b',r'\b',	unichr(0x3381)),		# SQUARE NA
        Rule('μA',	r'\d|\b',r'\b',	unichr(0x3382)),		# SQUARE MU A
        Rule('uA',	r'\d|\b',r'\b',	unichr(0x3382)),		# SQUARE MU A
        Rule('mA',	r'\d|\b',r'\b',	unichr(0x3383)),		# SQUARE MA
        Rule('kA',	r'\d|\b',r'\b',	unichr(0x3384)),		# SQUARE KA
        Rule('KB',	r'\d|\b',r'\b',	unichr(0x3385)),		# SQUARE KB
        Rule('MB',	r'\d|\b',r'\b',	unichr(0x3386)),		# SQUARE MB
        Rule('GB',	r'\d|\b',r'\b',	unichr(0x3387)),		# SQUARE GB
        Rule('cal',	r'\d|\b',r'\b',	unichr(0x3388)),		# SQUARE CAL
        Rule('kcal',	r'\d|\b',r'\b',	unichr(0x3389)),	# SQUARE KCAL
        Rule('pF',	r'\d|\b',r'\b',	unichr(0x338A)),		# SQUARE PF
        Rule('nF',	r'\d|\b',r'\b',	unichr(0x338B)),		# SQUARE NF
        Rule('μF',	r'\d|\b',r'\b',	unichr(0x338C)),		# SQUARE MU F
        Rule('uF',	r'\d|\b',r'\b',	unichr(0x338C)),		# SQUARE MU F
        Rule('μg',	r'\d|\b',r'\b',	unichr(0x338D)),		# SQUARE MU G
        Rule('ug',	r'\d|\b',r'\b',	unichr(0x338D)),		# SQUARE MU G
        Rule('mg',	r'\d|\b',r'\b',	unichr(0x338E)),		# SQUARE MG
        Rule('kg',	r'\d|\b',r'\b',	unichr(0x338F)),		# SQUARE KG
        Rule('Hz',	r'\d|\b',r'\b',	unichr(0x3390)),		# SQUARE HZ
        Rule('kHz',	r'\d|\b',r'\b',	unichr(0x3391)),		# SQUARE KHZ
        Rule('MHz',	r'\d|\b',r'\b',	unichr(0x3392)),		# SQUARE MHZ
        Rule('GHz',	r'\d|\b',r'\b',	unichr(0x3393)),		# SQUARE GHZ
        Rule('THz',	r'\d|\b',r'\b',	unichr(0x3394)),		# SQUARE THZ
        Rule('μl',	r'\d|\b',r'\b',	unichr(0x3395)),		# SQUARE MU L
        Rule('ul',	r'\d|\b',r'\b',	unichr(0x3395)),		# SQUARE MU L
        Rule('ml',	r'\d|\b',r'\b',	unichr(0x3396)),		# SQUARE ML
        Rule('dl',	r'\d|\b',r'\b',	unichr(0x3397)),		# SQUARE DL
        Rule('kl',	r'\d|\b',r'\b',	unichr(0x3398)),		# SQUARE KL
        Rule('fm',	r'\d|\b',r'\b',	unichr(0x3399)),		# SQUARE FM
        Rule('nm',	r'\d|\b',r'\b',	unichr(0x339A)),		# SQUARE NM
        Rule('μm',	r'\d|\b',r'\b',	unichr(0x339B)),		# SQUARE MU M
        Rule('um',	r'\d|\b',r'\b',	unichr(0x339B)),		# SQUARE MU M
        Rule('mm',	r'\d|\b',r'\b',	unichr(0x339C)),		# SQUARE MM
        Rule('cm',	r'\d|\b',r'\b',	unichr(0x339D)),		# SQUARE CM
        Rule('km',	r'\d|\b',r'\b',	unichr(0x339E)),		# SQUARE KM
        Rule('mm^2',	r'\d|\b',r'\b',	unichr(0x339F)),	# SQUARE MM SQUARED
        Rule('cm^2',	r'\d|\b',r'\b',	unichr(0x33A0)),	# SQUARE CM SQUARED
        Rule('m^2',	r'\d|\b',r'\b',	unichr(0x33A1)),		# SQUARE M SQUARED
        Rule('km^2',	r'\d|\b',r'\b',	unichr(0x33A2)),	# SQUARE KM SQUARED
        Rule('mm^3',	r'\d|\b',r'\b',	unichr(0x33A3)),	# SQUARE MM CUBED
        Rule('cm^3',	r'\d|\b',r'\b',	unichr(0x33A4)),	# SQUARE CM CUBED
        Rule('m^3',	r'\d|\b',r'\b',	unichr(0x33A5)),		# SQUARE M CUBED
        Rule('km^3',	r'\d|\b',r'\b',	unichr(0x33A6)),	# SQUARE KM CUBED
        Rule('m/s',	r'\d|\b',r'\b',	unichr(0x33A7)),		# SQUARE M OVER S
        Rule('m/s^2',	r'\d|\b',r'\b',	unichr(0x33A8)),	# SQUARE M OVER S SQUARED
        Rule('Pa',	r'\d|\b',r'\b',	unichr(0x33A9)),		# SQUARE PA
        Rule('kPa',	r'\d|\b',r'\b',	unichr(0x33AA)),		# SQUARE KPA
        Rule('MPa',	r'\d|\b',r'\b',	unichr(0x33AB)),		# SQUARE MPA
        Rule('GPa',	r'\d|\b',r'\b',	unichr(0x33AC)),		# SQUARE GPA
        Rule('rad',	r'\d|\b',r'\b',	unichr(0x33AD)),		# SQUARE RAD
        Rule('rad/s',	r'\d|\b',r'\b',	unichr(0x33AE)),	# SQUARE RAD OVER S
        Rule('rad/s^2',	r'\d|\b',r'\b',	unichr(0x33AF)),	# SQUARE RAD OVER S SQUARED
        Rule('ps',	r'\d|\b',r'\b',	unichr(0x33B0)),		# SQUARE PS
        Rule('ns',	r'\d|\b',r'\b',	unichr(0x33B1)),		# SQUARE NS
        Rule('μs',	r'\d|\b',r'\b',	unichr(0x33B2)),		# SQUARE MU S
        Rule('us',	r'\d|\b',r'\b',	unichr(0x33B2)),		# SQUARE MU S
        Rule('ms',	r'\d|\b',r'\b',	unichr(0x33B3)),		# SQUARE MS
        Rule('pV',	r'\d|\b',r'\b',	unichr(0x33B4)),		# SQUARE PV
        Rule('nV',	r'\d|\b',r'\b',	unichr(0x33B5)),		# SQUARE NV
        Rule('μV',	r'\d|\b',r'\b',	unichr(0x33B6)),		# SQUARE MU V
        Rule('uV',	r'\d|\b',r'\b',	unichr(0x33B6)),		# SQUARE MU V
        Rule('mV',	r'\d|\b',r'\b',	unichr(0x33B7)),		# SQUARE MV
        Rule('kV',	r'\d|\b',r'\b',	unichr(0x33B8)),		# SQUARE KV
        Rule('MV',	r'\d|\b',r'\b',	unichr(0x33B9)),		# SQUARE MV MEGA
        Rule('pW',	r'\d|\b',r'\b',	unichr(0x33BA)),		# SQUARE PW
        Rule('nW',	r'\d|\b',r'\b',	unichr(0x33BB)),		# SQUARE NW
        Rule('μW',	r'\d|\b',r'\b',	unichr(0x33BC)),		# SQUARE MU W
        Rule('uW',	r'\d|\b',r'\b',	unichr(0x33BC)),		# SQUARE MU W
        Rule('mW',	r'\d|\b',r'\b',	unichr(0x33BD)),		# SQUARE MW
        Rule('kW',	r'\d|\b',r'\b',	unichr(0x33BE)),		# SQUARE KW
        Rule('MW',	r'\d|\b',r'\b',	unichr(0x33BF)),		# SQUARE MW MEGA
        Rule('kΩ',	r'\d|\b',r'\b',	unichr(0x33C0)),		# SQUARE K OHM
        Rule('MΩ',	r'\d|\b',r'\b',	unichr(0x33C1)),		# SQUARE M OHM
        Rule('a.m.',	r'\d|\b',r'\b',	unichr(0x33C2)),	# SQUARE AM
        Rule('Bq',	r'\d|\b',r'\b',	unichr(0x33C3)),		# SQUARE BQ
        Rule('cc',	r'\d|\b',r'\b',	unichr(0x33C4)),		# SQUARE CC
        Rule('cd',	r'\d|\b',r'\b',	unichr(0x33C5)),		# SQUARE CD
        Rule('C/kg',	r'\d|\b',r'\b',	unichr(0x33C6)),	# SQUARE C OVER KG
        Rule('Co.',	r'\d|\b',r'\b',	unichr(0x33C7)),		# SQUARE CO
        Rule('dB',	r'\d|\b',r'\b',	unichr(0x33C8)),		# SQUARE DB
        Rule('Gy',	r'\d|\b',r'\b',	unichr(0x33C9)),		# SQUARE GY
        Rule('ha',	r'\d|\b',r'\b',	unichr(0x33CA)),		# SQUARE HA
        Rule('HP',	r'\d|\b',r'\b',	unichr(0x33CB)),		# SQUARE HP
        Rule('in',	r'\d|\b',r'\b',	unichr(0x33CC)),		# SQUARE IN
        Rule('KK',	r'\d|\b',r'\b',	unichr(0x33CD)),		# SQUARE KK
        Rule('KM',	r'\d|\b',r'\b',	unichr(0x33CE)),		# SQUARE KM CAPITAL
        Rule('kt',	r'\d|\b',r'\b',	unichr(0x33CF)),		# SQUARE KT
        Rule('lm',	r'\d|\b',r'\b',	unichr(0x33D0)),		# SQUARE LM
        Rule('ln',	r'\d|\b',r'\b',	unichr(0x33D1)),		# SQUARE LN
        Rule('log',	r'\d|\b',r'\b',	unichr(0x33D2)),		# SQUARE LOG
        Rule('lx',	r'\d|\b',r'\b',	unichr(0x33D3)),		# SQUARE LX
        Rule('mb',	r'\d|\b',r'\b',	unichr(0x33D4)),		# SQUARE MB SMALL
        Rule('mil',	r'\d|\b',r'\b',	unichr(0x33D5)),		# SQUARE MIL
        Rule('mol',	r'\d|\b',r'\b',	unichr(0x33D6)),		# SQUARE MOL
        Rule('PH',	r'\d|\b',r'\b',	unichr(0x33D7)),		# SQUARE PH
        Rule('p.m.',	r'\d|\b',r'\b',	unichr(0x33D8)),	# SQUARE PM
        Rule('PPM',	r'\d|\b',r'\b',	unichr(0x33D9)),		# SQUARE PPM
        Rule('PR',	r'\d|\b',r'\b',	unichr(0x33DA)),		# SQUARE PR
        Rule('sr',	r'\d|\b',r'\b',	unichr(0x33DB)),		# SQUARE SR
        Rule('Sv',	r'\d|\b',r'\b',	unichr(0x33DC)),		# SQUARE SV
        Rule('Wb',	r'\d|\b',r'\b',	unichr(0x33DD)),		# SQUARE WB
        Rule('V/m',	r'\d|\b',r'\b',	unichr(0x33DE)),		# SQUARE V OVER M
        Rule('A/m',	r'\d|\b',r'\b',	unichr(0x33DF)),		# SQUARE A OVER M
        Rule('gal',	r'\d|\b',r'\b',	unichr(0x33FF)),		# SQUARE GAL
    ],
}


# Command-line interface:

def main(argv=None):
    """Run the BetterType command-line interface.

    Parses the command line and then does one of three things: lists the
    available ruleset names (--names), runs the internal tests (--test),
    or translates the requested input and writes the result to the -o
    file, or to standard output when no -o is given.

    Input is taken from every -a/--data expression followed by every named
    FILE; standard input is read only when neither is supplied.

    argv -- list of command-line arguments *without* the program name.
            When None (the default) optparse falls back to sys.argv[1:].

    Returns 0 on success, or 2 when an internal test fails or an exception
    is raised while processing (the exception is printed to standard error
    instead of being propagated). optparse itself leaves via SystemExit
    for --help, --version and usage errors; that is not intercepted here.
    """
    version = ( "BetterType version " + __version__ + " by " + __author__ +
                "\n\nFor more information visit <" + __url__ + ">.\n" +
                __copyright__ )
    usage = "%prog [options] [FILE...]"
    # NOTE: expressions are supplied with -a/--data; -e is --ellipses.
    description = \
"""BetterType reads the named input FILEs or an expression specified with the
-a option (or standard input if neither is given) and converts character
sequences into their unicode counterparts, e.g. 'educating' typewriter quotes
into left and right quotation marks. A wide range of translation rulesets is
available."""
    try:
        parser = optparse.OptionParser(
            description=description, version=version, usage=usage
        )
        parser.add_option("-o", "--output",
            help="write output to FILE", metavar="FILE")
        parser.add_option("--test", action="store_true",
            help="show results of internal tests and exit")
        parser.add_option("--text",
            action="store_const", dest="format", const="text",
            help="treat input as plaintext")
        parser.add_option("--html",
            action="store_const", dest="format", const="html",
            help="treat input as HTML")
        # -r and all the shortcut flags below accumulate into the same
        # 'rulesets' list, so they can be freely combined.
        parser.add_option("-r", "--ruleset",
            action="append", dest="rulesets", choices=rulesets.keys(),
            help="apply the rule set named NAME", metavar="NAME")
        parser.add_option("--names",
            action="store_true", dest="show_names",
            help="show loaded ruleset names and exit")
        parser.add_option("-q", "--quotes",
            action="append_const", dest="rulesets", const="educate quotes",
            help="translate \" and \' characters into curly quotes based on context")
        parser.add_option("-b", "--double-backticks",
            action="append_const", dest="rulesets", const="double backticks",
            help="translate `` and '' sequences into double quotes")
        parser.add_option("-B", "--all-backticks",
            action="append_const", dest="rulesets", const="all backticks",
            help="translate `` and '' sequences into double quotes and ` and ' into single quotes")
        parser.add_option("-d", "--simple-dashes",
            action="append_const", dest="rulesets", const="simple dashes",
            help="translate -- into em dash")
        parser.add_option("-D", "--dashes",
            action="append_const", dest="rulesets", const="dashes",
            help="translate -- and --- into en- and em-dashes")
        parser.add_option("-i", "--inverted-dashes",
            action="append_const", dest="rulesets", const="reverse dashes",
            help="translate -- into em dash and --- into en dash")
        parser.add_option("-e", "--ellipses",
            action="append_const", dest="rulesets", const="ellipses",
            help="translate ... sequences")
        parser.add_option("-w", "--process-escaped-quotes",
            action="store_true", dest="process_escaped_quotes",
            help="allow replacements for &quot; entities (HTML mode only)")
        parser.add_option("--escapes",
            action="append_const", dest="rulesets", const="escapes",
            help="enable backslash escapes for dashes, dots and quotes")
        parser.add_option("-a", "--data", action="append",
            help="perform transformation on EXPR", metavar="EXPR")
        # Honour an explicitly supplied argument list; optparse substitutes
        # sys.argv[1:] by itself when argv is None.
        (options, args) = parser.parse_args(argv)

        if options.show_names:
            print >>sys.stderr, "The following rulesets are available:"
            for name in sorted(rulesets.keys()):
                print >>sys.stderr, "\t", name
            return 0

        # No ruleset option given at all: fall back to the module defaults.
        if options.rulesets is None: options.rulesets = default_rulesets

        def call_func(string, filename=None):
            # An explicit --text/--html wins. Without either, input is
            # treated as HTML only when it comes from a .htm/.html file.
            if options.format == "text" or (options.format != "html" and not
                (filename and path.splitext(filename)[1] in ('.htm','.html'))):
                return better_text(string, options.rulesets)
            if options.process_escaped_quotes:
                return better_html(string, options.rulesets, often_html_escaped)
            return better_html(string, options.rulesets)

        if options.test:
            (num_tests, failures) = run_tests()
            print >>sys.stderr,\
                len(failures), "failures in", num_tests, "tests\n"
            for (desc, test, desired, actual) in failures:
                print >>sys.stderr, "In '" + desc + "' mode, this:\n"
                print >>sys.stderr, test
                print >>sys.stderr, "\nshould become this:\n"
                print >>sys.stderr, desired
                print >>sys.stderr, "\nbut instead became:\n"
                print >>sys.stderr, actual + "\n"
            return 2 if failures else 0

        # Neither files nor -a expressions: act as a filter on stdin.
        if not args and options.data is None:
            options.data = (sys.stdin.read(),)

        if options.output:
            outfile = codecs.open(options.output, 'w', encoding='utf-8')
        else:
            if sys.stdout.encoding: enc = sys.stdout.encoding
            else: enc = 'utf-8'
            outfile = codecs.EncodedFile(sys.stdout, 'unicode_internal', enc)

        try:
            if options.data is not None:
                for expr in options.data: outfile.write(call_func(expr))
            for filename in args:
                f = codecs.open(filename, 'U', encoding='utf-8')
                try:
                    contents = f.read()
                finally:
                    # Release the input handle even if reading/decoding fails.
                    f.close()
                outfile.write(call_func(contents, filename))
        finally:
            # Only close a file this function opened itself; standard
            # output has to stay usable for the caller.
            if options.output:
                outfile.close()
        return 0
    except Exception:
        # sys.exc_info() instead of "except Exception, e" so that the clause
        # is valid syntax on every Python 2 release and on Python 3 alike.
        print >>sys.stderr, sys.exc_info()[1]
        return 2

if __name__ == "__main__":
    # Executed as a script: run the command-line interface and hand its
    # return value to the shell as the process exit status.
    sys.exit(main())
