klenwell information services : PythonUnicode

Wiki source for PythonUnicode


Show raw source

=====Python Unicode=====
return to DevPython

Part of my ongoing struggle to understand Unicode.

"It does not make sense to have a string without knowing what encoding it uses."
-- Joel Spolsky

"Python speaks in strings but thinks in unicode. I think."
-- Tom at klenwell.com

====Code====
===Unit Test example showing string manipulation using unicode===
%%(python)
def testDecodingUtf16Errors(self):
    """Walk through encoding and decoding between unicode objects and strings.

    Python 2 only: relies on the ``unicode`` and ``unichr`` builtins and on
    ``str`` being a byte string.  The method expects ``self`` to provide the
    usual ``unittest.TestCase`` assertion helpers.

    Steps exercised:
      1. build a unicode object containing a non-ascii character;
      2. show that a strict ascii encode raises ``UnicodeEncodeError``;
      3. ENCODE the unicode object to byte strings (utf-16 and ascii);
      4. DECODE those byte strings back to unicode objects;
      5. ENCODE the decoded objects again and check the round trip.
    """
    # utf-16 BOM (little-endian), deliberately held as a unicode object
    bom_16_le = u'\xff\xfe'

    # show two kinds of syntax for unicode
    ua1 = unicode('a')
    ua2 = u'a'
    self.assertEqual(ua1, ua2)
    self.assertEqual(type(ua1), type(ua2))

    # build unicode object: formatting a str template with unicode
    # arguments yields a unicode result
    non_ascii_char = unichr(40960)
    utf_16_obj = 'abc%sdef%sghi' % (non_ascii_char, non_ascii_char)
    self.assertEqual(type(utf_16_obj), unicode)

    # trying to encode a bad ascii char: this should raise an error
    partial_e = "'ascii' codec can't encode character"
    try:
        ascii_encoded = utf_16_obj.encode('ascii')
    except UnicodeEncodeError as e:
        self.assertEqual(type(e), UnicodeEncodeError)
        self.assertTrue(str(e).startswith(partial_e))
    else:
        # only reached if the strict encode unexpectedly succeeded
        self.fail('expected error ascii-encoding "%s"' % (ascii_encoded,))

    # TO SUMMARIZE: encoding, decoding, and reencoding
    # ENCODE unicode objects to strings
    utf_16_str = utf_16_encoded = utf_16_obj.encode('utf-16', 'replace')
    ascii_str = utf_16_obj.encode('ascii', 'replace')
    self.assertEqual(type(utf_16_encoded), type(utf_16_str))
    self.assertEqual(type(utf_16_str), str)
    self.assertEqual(type(ascii_str), str)
    self.assertEqual(ascii_str, 'abc?def?ghi')
    # comparing the encoded byte string against a unicode prefix is
    # expected to blow up with a decode error rather than return a bool
    self.assertRaises(UnicodeDecodeError, utf_16_str.startswith, bom_16_le)

    # DECODE strings back to unicode objects
    utf_16_unicode_obj = utf_16_str.decode('utf-16')
    ascii_unicode_obj = ascii_str.decode('ascii')
    ascii_unicode_obj2 = unicode(ascii_str)
    self.assertEqual(type(utf_16_unicode_obj), unicode)
    self.assertEqual(type(ascii_unicode_obj), unicode)
    # both decoding spellings must produce the same type
    self.assertEqual(type(ascii_unicode_obj), type(ascii_unicode_obj2))
    self.assertFalse(utf_16_unicode_obj.startswith(bom_16_le))

    # and ENCODE back to strings (from the *decoded* objects, so this is
    # a genuine round trip)
    utf_16_obj_to_str = utf_16_unicode_obj.encode('utf-16', 'replace')
    ascii_obj_to_str = ascii_unicode_obj.encode('ascii', 'replace')
    self.assertEqual(type(utf_16_obj_to_str), str)
    self.assertEqual(type(ascii_obj_to_str), str)
    self.assertEqual(utf_16_obj_to_str, utf_16_str)
    self.assertEqual(ascii_obj_to_str, ascii_str)
%%

====References====
http://code.alexreisner.com/articles/character-encoding.html
http://www.joelonsoftware.com/articles/Unicode.html
http://docs.python.org/howto/unicode.html