|
1 -- |
|
2 -- | |
|
3 -- Module : Codec.Binary.UTF8.String |
|
4 -- Copyright : (c) Eric Mertens 2007 |
|
5 -- License : BSD3-style (see LICENSE) |
|
6 -- |
|
7 -- Maintainer: emertens@galois.com |
|
8 -- Stability : experimental |
|
9 -- Portability : portable |
|
10 -- |
|
11 -- Support for encoding UTF8 Strings to and from @[Word8]@ |
|
12 -- |
|
13 |
|
14 module Codec.Binary.UTF8.String ( |
|
15 encode |
|
16 , decode |
|
17 , encodeString |
|
18 , decodeString |
|
19 ) where |
|
20 |
|
21 import Data.Word (Word8) |
|
22 import Data.Bits ((.|.),(.&.),shiftL,shiftR) |
|
23 import Data.Char (chr,ord) |
|
24 |
|
25 default(Int) |
|
26 |
|
27 -- | Encode a string using 'encode' and store the result in a 'String'. |
|
28 encodeString :: String -> String |
|
29 encodeString xs = map (toEnum . fromEnum) (encode xs) |
|
30 |
|
31 -- | Decode a string using 'decode' using a 'String' as input. |
|
32 -- | This is not safe but it is necessary if UTF-8 encoded text |
|
33 -- | has been loaded into a 'String' prior to being decoded. |
|
34 decodeString :: String -> String |
|
35 decodeString xs = decode (map (toEnum . fromEnum) xs) |
|
36 |
|
37 replacement_character :: Char |
|
38 replacement_character = '\xfffd' |
|
39 |
|
40 -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. |
|
41 encode :: String -> [Word8] |
|
42 encode = concatMap (map fromIntegral . go . ord) |
|
43 where |
|
44 go oc |
|
45 | oc <= 0x7f = [oc] |
|
46 |
|
47 | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) |
|
48 , 0x80 + oc .&. 0x3f |
|
49 ] |
|
50 |
|
51 | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) |
|
52 , 0x80 + ((oc `shiftR` 6) .&. 0x3f) |
|
53 , 0x80 + oc .&. 0x3f |
|
54 ] |
|
55 | otherwise = [ 0xf0 + (oc `shiftR` 18) |
|
56 , 0x80 + ((oc `shiftR` 12) .&. 0x3f) |
|
57 , 0x80 + ((oc `shiftR` 6) .&. 0x3f) |
|
58 , 0x80 + oc .&. 0x3f |
|
59 ] |
|
60 |
|
61 -- |
|
62 -- | Decode a UTF8 string packed into a list of Word8 values, directly to String |
|
63 -- |
|
64 decode :: [Word8] -> String |
|
65 decode [ ] = "" |
|
66 decode (c:cs) |
|
67 | c < 0x80 = chr (fromEnum c) : decode cs |
|
68 | c < 0xc0 = replacement_character : decode cs |
|
69 | c < 0xe0 = multi1 |
|
70 | c < 0xf0 = multi_byte 2 0xf 0x800 |
|
71 | c < 0xf8 = multi_byte 3 0x7 0x10000 |
|
72 | c < 0xfc = multi_byte 4 0x3 0x200000 |
|
73 | c < 0xfe = multi_byte 5 0x1 0x4000000 |
|
74 | otherwise = replacement_character : decode cs |
|
75 where |
|
76 multi1 = case cs of |
|
77 c1 : ds | c1 .&. 0xc0 == 0x80 -> |
|
78 let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) |
|
79 in if d >= 0x000080 then toEnum d : decode ds |
|
80 else replacement_character : decode ds |
|
81 _ -> replacement_character : decode cs |
|
82 |
|
83 multi_byte :: Int -> Word8 -> Int -> [Char] |
|
84 multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) |
|
85 where |
|
86 aux 0 rs acc |
|
87 | overlong <= acc && acc <= 0x10ffff && |
|
88 (acc < 0xd800 || 0xdfff < acc) && |
|
89 (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs |
|
90 | otherwise = replacement_character : decode rs |
|
91 |
|
92 aux n (r:rs) acc |
|
93 | r .&. 0xc0 == 0x80 = aux (n-1) rs |
|
94 $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) |
|
95 |
|
96 aux _ rs _ = replacement_character : decode rs |
|
97 |