#!/usr/bin/env python """ An Encoding Normaliser. License: GPL 2; share and enjoy! Author: Sean B. Palmer, inamidst.com Service: http://inamidst.com/encnorm Source: http://inamidst.com/inside/encnorm heh, heh. that reminds me that I want to implement a utf-8/general charset munging CGI, and I don't know why... sbp: as a temporary fix for sites that have the wrong setting? deltab: yes, and for when people use utf-8 in IRC and you wonder wearily whilst wandering what they're saying, or for when a program outputs something in a character encoding and you don't want to mess about with guessing and iconv @@ http://swtch.com/usr/local/plan9/src/cmd/dict/oed.c """ import cgitb; cgitb.enable() import sys, os, re, cgi, urllib, codecs method = os.environ.get('REQUEST_METHOD') form = cgi.FieldStorage() form.__call__ = lambda s: form[s].value # sys.stdout = codecs.getwriter('utf-8')(sys.stdout) # print "Content-Type: text/plain; charset=utf-8" # print def serve(status, body, mime=None, charset=None): mime = mime or 'text/html' charset = charset or 'utf-8' sys.stdout.write("Status: %s\r\n" % status) sys.stdout.write("Content-Type: %s; charset=%s\r\n\r\n" % (mime, charset)) sys.stdout.write(body) sys.exit() def convert(text, here, there): if (there == 'pyraw'): if (here != '-'): raise Exception, "Can only convert - to pyraw." text = ('%r' % str(text))[1:-1] return cgi.escape(text) # @@ escape? if (here == 'u-cp1252') and (there == 'utf-8'): text = text.decode('utf-8') here, there = 'unicode', 'cp1252' if (here == 'unicode') and (there == 'cp1252'): # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT map = {0x20AC: 0x80, 0x201A: 0x82, 0x0192: 0x83, 0x201E: 0x84, 0x2026: 0x85, 0x2020: 0x86, 0x2021: 0x87, 0x02C6: 0x88, 0x2030: 0x89, 0x0160: 0x8A, 0x2039: 0x8B, 0x0152: 0x8C, 0x017D: 0x8E, 0x2018: 0x91, 0x2019: 0x92, 0x201C: 0x93, 0x201D: 0x94, 0x2022: 0x95, 0x2013: 0x96, 0x2014: 0x97, 0x02DC: 0x98, 0x2122: 0x99, 0x0161: 0x9A, 0x203A: 0x9B, 0x0153: 0x9C, 0x017E: 0x9E, 0x0178: 0x9F} error = (0x81, 0x8D, 0x8F, 0x90, 0x9D) result = '' for char in text: char = ord(char) if map.has_key(char): result += chr(map[char]) elif char in error: result += chr(char) # raise Exception, "Can't convert byte %r to cp1252." % char elif char <= 0xFF: result += chr(char) else: raise Exception, "Can't encode character %r to cp1252." % char return result elif (here == 'html-bytes') and (there == 'utf-8'): #
utf-8: é
#
iso-8859-1: é
#
utf-8 and iso-8859-1: é é
r_hbyte = re.compile(r'&#(\d+);') text = r_hbyte.sub(lambda m: chr(int(m.group(1))), str(text)) result = [] for nugget in text.split(' '): try: result.append(unicode(nugget, 'utf-8')) except: try: result.append(unicode(nugget, 'iso-8859-1')) except: result.append(unicode(nugget)) return ' '.join([n.encode('utf-8') for n in result]) elif (here == 'double-utf-8') and (there == 'utf-8'): result = [] for chunk in text.split(' '): # print `chunk` try: chunk = chunk.decode('utf-8').encode('iso-8859-1') test = unicode(chunk, 'utf-8') # print 'WHOO' result.append(chunk) except Exception, e: # print 'BOO', e result.append(chunk) return ' '.join(result) elif (here == 'utf-8') and (there == 'punycode'): return unicode(text, 'utf-8').encode('punycode') elif (here == 'utf-8') and (there == 'html-entities'): result = [] for character in unicode(text, 'utf-8'): if ord(character) > 0x80: result.append('&#x%04X;' % ord(character)) else: result.append(character) result = ''.join(result) return result + '

' + cgi.escape(result) else: try: return unicode(text, here).encode(there) except: raise Exception, "Can't process from %s to %s." % (here, there) def post(): # @@ from charset, to charset # @@ header munging--remove html:meta if it sets the wrong charset, etc. if form.has_key('uri'): uri = form('uri') u = urllib.urlopen(uri) body = u.read() mime = u.info().get('Content-Type', 'text/html').split(';')[0] u.close() lines = body.splitlines() here, there = form('from'), form('to') result = [] for line in lines: result.append(convert(line, here, there)) serve(200, '\n'.join(result), mime=mime) elif form.has_key('text'): text = form('text') lines = text.splitlines() here, there = form('from'), form('to') result = [('

Converted %s to ' + '%s:

') % (here, there)] for line in lines: result.append('
' + convert(line, here, there) + '
') serve(200, '\n'.join(result)) else: serve(500, "

Form requires either a uri or text value.

\n") def homepage(): if form.has_key('from'): here = '' % form('from') else: here = """From: """ if form.has_key('to'): there = '' % form('to') else: there = """To: """ serve(200, """ Fix Encoding - A Character Encodings Normaliser

Encoding Normaliser

Text:
%s %s
URI:
%s %s

The source is the best place to get an idea of this service's features at the moment.

Sean B. Palmer
""" % (here, there, here, there)) def main(): if form.has_key('uri') or form.has_key('text'): post() elif method == 'GET': homepage() elif method == 'POST': post() else: serve(501, "

Method must be either GET or POST

\n") if __name__=="__main__": main()