#!/usr/bin/env python
"""
charguess.py - Guess Character Encoding
Author: Sean B. Palmer, inamidst.com

Guesses between: ascii, iso-8859-1, cp1252, and utf-8
"""

import os.path
import re
import sys

# Bytes patterns: files are read in binary mode, so every line is a byte
# string. The br'' prefix is accepted by Python 2.6+ and Python 3 alike.

# Any byte outside the 7-bit ASCII range.
r_hibyte = re.compile(br'[\x80-\xff]')

# Bytes on which cp1252 and iso-8859-1 disagree. The two encodings are
# identical from 0xA0 upwards, so only 0x80-0x9F (C1 controls in
# iso-8859-1, printable characters such as curly quotes in cp1252) can
# tell them apart.
r_winbyte = re.compile(br'[\x80-\x9f]')


def charguess(fn):
    """Guess the character encoding of the file at path *fn*.

    The file is scanned line by line in binary mode:

    * no byte >= 0x80 anywhere                         -> 'ascii'
    * every line with high bytes decodes as UTF-8      -> 'utf-8'
    * a non-UTF-8 line contains a byte in 0x80-0x9F    -> 'cp1252'
    * otherwise                                        -> 'iso-8859-1'

    An empty file is reported as 'ascii'.

    Returns the encoding name as a string, or None (after writing an
    "Error! ..." message to stderr) when *fn* does not exist, is a
    directory, or is not a regular file.
    """
    if not os.path.exists(fn):
        sys.stderr.write("Error! Not found: %s\n" % fn)
        return None

    if not os.path.isfile(fn):
        if os.path.isdir(fn):
            sys.stderr.write("Error! Directory: %s\n" % fn)
        else:
            sys.stderr.write("Error! Not a regular file: %s\n" % fn)
        return None

    is_ascii = True
    is_utf8 = True
    is_cp1252 = False

    # The with-block guarantees the handle is closed even if reading fails.
    with open(fn, 'rb') as f:
        for line in f:
            if not r_hibyte.search(line):
                # Pure 7-bit line: tells us nothing about the encoding.
                continue
            is_ascii = False
            try:
                line.decode('utf-8')
            except UnicodeDecodeError:
                is_utf8 = False
                if r_winbyte.search(line):
                    # Nothing later in the file can change this verdict.
                    is_cp1252 = True
                    break

    if is_ascii:
        return 'ascii'
    if is_utf8:
        return 'utf-8'
    if is_cp1252:
        return 'cp1252'
    return 'iso-8859-1'


def main():
    """Print '<encoding>: <path>' for every path given on the command line.

    Paths for which charguess() returns None are skipped; charguess() has
    already reported the problem on stderr.
    """
    for fn in sys.argv[1:]:
        result = charguess(fn)
        if result:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s: %s" % (result, fn))


if __name__ == '__main__':
    main()