
Python Unicode


Part of my ongoing struggle to understand unicode.

"It does not make sense to have a string without knowing what encoding it uses."
-- Joel Spolsky
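
The point is easy to demonstrate. A minimal sketch (Python 2; the byte value is just an illustration): the same byte decodes to different results, or fails outright, depending on which encoding you assume.

    # '\xe9' is e-acute in latin-1 but an invalid lone byte in utf-8
    byte_str = '\xe9'
    assert byte_str.decode('latin-1') == u'\xe9'
    try:
        byte_str.decode('utf-8')
    except UnicodeDecodeError:
        print "same bytes, different encoding, no meaning"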

"Python speaks in strings but thinks in unicode. I think."
-- Tom at klenwell.com
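
In Python 2 terms, that means str and unicode mix freely only as long as an implicit ascii decode succeeds. A minimal sketch of the promotion behavior the test below relies on:

    # concatenating str and unicode promotes the result to unicode
    mixed = 'abc' + u'def'
    assert type(mixed) == unicode

    # but the promotion implicitly decodes the str side as ascii,
    # so non-ascii bytes blow up
    try:
        '\xe9' + u'def'
    except UnicodeDecodeError:
        pass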

Code

Unit test example (Python 2) showing string manipulation using unicode:

    def testDecodingUtf16Errors(self):
        # utf-16 BOM (little-endian), written as a two-character unicode string
        bom_16_le = u'\xff\xfe'
       
        # show two kinds of syntax for unicode
        ua1 = unicode('a')
        ua2 = u'a'        
        self.assertEqual(ua1, ua2)
        self.assertEqual(type(ua1), type(ua2))        
       
        # build unicode object        
        non_ascii_char = unichr(40960)
        utf_16_obj = 'abc%sdef%sghi' % (non_ascii_char, non_ascii_char)
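        # mixing a str template with unicode args promotes the result to
        # unicode (via an implicit ascii decode of the str template)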
        self.assertEqual(type(utf_16_obj), unicode)
       
        # trying to ascii-encode a non-ascii character: this should raise an error
        partial_e = "'ascii' codec can't encode character"
        try:
            utf_16_obj.encode('ascii')
            self.fail('expected UnicodeEncodeError when ascii-encoding %r' %
                utf_16_obj)
        except UnicodeEncodeError, e:
            self.assertEqual(type(e), UnicodeEncodeError)
            self.assertTrue(str(e).startswith(partial_e))
       
        # TO SUMMARIZE: encoding, decoding, and reencoding            
        # ENCODE unicode objects to strings
        utf_16_str = utf_16_encoded = utf_16_obj.encode('utf-16', 'replace')  
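        # note: encode('utf-16') prepends a byte-order mark; the 'replace'
        # handler substitutes '?' for unencodable characters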
        ascii_str = utf_16_obj.encode('ascii', 'replace')
        self.assertEqual(type(utf_16_encoded), type(utf_16_str))
        self.assertEqual(type(utf_16_str), str)
        self.assertEqual(type(ascii_str), str)
        self.assertEqual(ascii_str, 'abc?def?ghi')
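        # comparing a byte string to a unicode prefix forces an implicit
        # ascii decode of the byte string, which fails on the BOM bytes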
        self.assertRaises(UnicodeDecodeError, utf_16_str.startswith, bom_16_le)
       
        # DECODE strings back to unicode objects
        utf_16_unicode_obj = utf_16_str.decode('utf-16')
        ascii_unicode_obj = ascii_str.decode('ascii')
        ascii_unicode_obj2 = unicode(ascii_str)
        self.assertEqual(type(utf_16_unicode_obj), unicode)
        self.assertEqual(type(ascii_unicode_obj), unicode)
        self.assertEqual(type(ascii_unicode_obj), type(ascii_unicode_obj2))
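        # decode('utf-16') consumes the BOM, so the decoded unicode
        # object no longer starts with it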
        self.assertFalse(utf_16_unicode_obj.startswith(bom_16_le))
       
        # and ENCODE back to strings
        utf_16_obj_to_str = utf_16_unicode_obj.encode('utf-16', 'replace')
        ascii_obj_to_str = ascii_unicode_obj.encode('ascii', 'replace')
        self.assertEqual(type(utf_16_obj_to_str), str)
        self.assertEqual(type(ascii_obj_to_str), str)
        self.assertEqual(utf_16_obj_to_str, utf_16_str)
        self.assertEqual(ascii_obj_to_str, ascii_str)
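
To run the method above on its own, it needs a TestCase around it. A minimal harness sketch (the class name here is illustrative, not from the original):

    import unittest

    class UnicodeTest(unittest.TestCase):

        def testDecodingUtf16Errors(self):
            # ... method body as above ...
            pass

    if __name__ == '__main__':
        unittest.main()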


References

http://code.alexreisner.com/articles/character-encoding.html
http://www.joelonsoftware.com/articles/Unicode.html
http://docs.python.org/howto/unicode.html