1804
|
1 |
--
|
|
2 |
-- |
|
|
3 |
-- Module : Codec.Binary.UTF8.String
|
|
4 |
-- Copyright : (c) Eric Mertens 2007
|
|
5 |
-- License : BSD3-style (see LICENSE)
|
|
6 |
--
|
|
7 |
-- Maintainer: emertens@galois.com
|
|
8 |
-- Stability : experimental
|
|
9 |
-- Portability : portable
|
|
10 |
--
|
|
11 |
-- Support for encoding UTF8 Strings to and from @[Word8]@
|
|
12 |
--
|
|
13 |
|
|
14 |
module Codec.Binary.UTF8.String (
|
|
15 |
encode
|
|
16 |
, decode
|
|
17 |
, encodeString
|
|
18 |
, decodeString
|
|
19 |
) where
|
|
20 |
|
|
21 |
import Data.Word (Word8)
|
|
22 |
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
|
|
23 |
import Data.Char (chr,ord)
|
|
24 |
|
|
25 |
default(Int)
|
|
26 |
|
|
27 |
-- | Encode a string using 'encode' and store the result in a 'String'.
|
|
28 |
encodeString :: String -> String
|
|
29 |
encodeString xs = map (toEnum . fromEnum) (encode xs)
|
|
30 |
|
|
31 |
-- | Decode a string using 'decode' using a 'String' as input.
|
|
32 |
-- | This is not safe but it is necessary if UTF-8 encoded text
|
|
33 |
-- | has been loaded into a 'String' prior to being decoded.
|
|
34 |
decodeString :: String -> String
|
|
35 |
decodeString xs = decode (map (toEnum . fromEnum) xs)
|
|
36 |
|
|
37 |
replacement_character :: Char
|
|
38 |
replacement_character = '\xfffd'
|
|
39 |
|
|
40 |
-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
|
|
41 |
encode :: String -> [Word8]
|
|
42 |
encode = concatMap (map fromIntegral . go . ord)
|
|
43 |
where
|
|
44 |
go oc
|
|
45 |
| oc <= 0x7f = [oc]
|
|
46 |
|
|
47 |
| oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
|
|
48 |
, 0x80 + oc .&. 0x3f
|
|
49 |
]
|
|
50 |
|
|
51 |
| oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
|
|
52 |
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
|
|
53 |
, 0x80 + oc .&. 0x3f
|
|
54 |
]
|
|
55 |
| otherwise = [ 0xf0 + (oc `shiftR` 18)
|
|
56 |
, 0x80 + ((oc `shiftR` 12) .&. 0x3f)
|
|
57 |
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
|
|
58 |
, 0x80 + oc .&. 0x3f
|
|
59 |
]
|
|
60 |
|
|
61 |
--
|
|
62 |
-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
|
|
63 |
--
|
|
64 |
decode :: [Word8] -> String
|
|
65 |
decode [ ] = ""
|
|
66 |
decode (c:cs)
|
|
67 |
| c < 0x80 = chr (fromEnum c) : decode cs
|
|
68 |
| c < 0xc0 = replacement_character : decode cs
|
|
69 |
| c < 0xe0 = multi1
|
|
70 |
| c < 0xf0 = multi_byte 2 0xf 0x800
|
|
71 |
| c < 0xf8 = multi_byte 3 0x7 0x10000
|
|
72 |
| c < 0xfc = multi_byte 4 0x3 0x200000
|
|
73 |
| c < 0xfe = multi_byte 5 0x1 0x4000000
|
|
74 |
| otherwise = replacement_character : decode cs
|
|
75 |
where
|
|
76 |
multi1 = case cs of
|
|
77 |
c1 : ds | c1 .&. 0xc0 == 0x80 ->
|
|
78 |
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
|
|
79 |
in if d >= 0x000080 then toEnum d : decode ds
|
|
80 |
else replacement_character : decode ds
|
|
81 |
_ -> replacement_character : decode cs
|
|
82 |
|
|
83 |
multi_byte :: Int -> Word8 -> Int -> [Char]
|
|
84 |
multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
|
|
85 |
where
|
|
86 |
aux 0 rs acc
|
|
87 |
| overlong <= acc && acc <= 0x10ffff &&
|
|
88 |
(acc < 0xd800 || 0xdfff < acc) &&
|
|
89 |
(acc < 0xfffe || 0xffff < acc) = chr acc : decode rs
|
|
90 |
| otherwise = replacement_character : decode rs
|
|
91 |
|
|
92 |
aux n (r:rs) acc
|
|
93 |
| r .&. 0xc0 == 0x80 = aux (n-1) rs
|
|
94 |
$ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
|
|
95 |
|
|
96 |
aux _ rs _ = replacement_character : decode rs
|
|
97 |
|