"""Build a compact regex character class matching every BMP letter.

Module-level results:

* ``letter``  -- the Unicode general categories treated as letters.
* ``letters`` -- set of every BMP character whose category is in ``letter``.
* ``regexp``  -- uncompressed class: ``'['``, one item per letter, ``']'``.
* ``output``  -- compressed class: runs of consecutive code points are
  collapsed to ``first``, ``'-'``, ``last``; ``''.join(output)`` is the
  pattern text.

Changes relative to the earlier interactive session:

* ``regexp`` is initialised with the opening ``'['`` (it was appended to
  without ever being created).
* The compression no longer reads from an undefined ``input`` name that
  shadowed the builtin.
* Isolated letters are kept.  The earlier loop only emitted a character at
  the start or end of a run, so lone letters vanished and neighbouring ranges
  fused -- the recorded result contained ``\\xaa-\\xd6``, which also matches
  non-letters between U+00AA and U+00C0.
* The scan covers the whole BMP, including U+FFFF.
"""

import unicodedata

try:  # Python 2 spellings, as used by the original session
    _unichr, _range = unichr, xrange
except NameError:  # Python 3
    _unichr, _range = chr, range


def _run_tokens(first, last):
    """Return the class tokens for the inclusive code point run first..last."""
    if first == last:
        return [_unichr(first)]
    # Two-element runs are still written as a range, e.g. ``\u02d0-\u02d1``.
    return [_unichr(first), '-', _unichr(last)]


def _compress_runs(chars):
    """Collapse ascending characters into single items and ``a-b`` ranges.

    ``chars`` must yield single characters in strictly ascending code point
    order.  Returns a flat list of tokens; an empty input gives ``[]``.
    """
    tokens = []
    first = last = None
    for char in chars:
        code = ord(char)
        if last is not None and code == last + 1:
            # Still inside the current run: just extend its end.
            last = code
            continue
        if first is not None:
            tokens.extend(_run_tokens(first, last))
        first = last = code
    if first is not None:
        # Flush the final run; the loop only flushes when a new run starts.
        tokens.extend(_run_tokens(first, last))
    return tokens


letter = frozenset(['Lu', 'Ll', 'Lt', 'Lm', 'Lo'])

# Every BMP letter in ascending code point order (0x10000 is exclusive).
_bmp_letters = [
    char
    for char in (_unichr(codepoint) for codepoint in _range(0x10000))
    if unicodedata.category(char) in letter
]

letters = set(_bmp_letters)

# No escaping is needed inside the class: ``]``, ``\``, ``^`` and ``-`` are
# punctuation/symbols, so they never appear among the letters themselves.
regexp = ['['] + _bmp_letters + [']']
output = ['['] + _compress_runs(_bmp_letters) + [']']


if __name__ == '__main__':
    print(len(regexp))
    print(len(output))
    # Escaped form keeps the printout ASCII-only on any terminal encoding.
    print(''.join(output).encode('unicode_escape').decode('ascii'))