tools/confuse.hs
author Grigory Ustinov <grenka@altlinux.org>
Wed, 28 Nov 2018 17:20:22 +0300
changeset 14351 6b56a0a0afb4
parent 10075 dbaf90a0fbe0
permissions -rw-r--r--
sed -i -E "s/ёж$/ёжик/g" *ru*; sed -i -E "s/еж([аиу])$/ёжик\1/g" *ru* grep -E "(ёж|еж[аиу])$" *ru*
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     1
{-# LANGUAGE OverloadedStrings #-}
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     2
module Confuse where
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     3
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     4
import Numeric
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     5
import Data.Char
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     6
import Control.Monad
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     7
import qualified Data.ByteString as B
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
     8
import qualified Data.ByteString.UTF8 as UTF8
10073
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
     9
import qualified Data.Map as Map
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    10
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    11
hx :: [Char] -> String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    12
hx cs = let ch = (chr . fst . last . readHex $ cs) in
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    13
            case ch of
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    14
                 '\'' -> "''"
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    15
                 '\\' -> "\\\\"
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    16
                 c -> c : []
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    17
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    18
conv :: String -> B.ByteString
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    19
conv s = B.concat ["('", UTF8.fromString i, "', '", UTF8.fromString r, "')"]
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    20
    where
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    21
        i :: String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    22
        i = hx s
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    23
        r :: String
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    24
        r = concatMap hx . words . takeWhile ((/=) ';') . tail $ dropWhile ((/=) '\t') s
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    25
10073
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    26
convRules :: (B.ByteString, [B.ByteString]) -> B.ByteString
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    27
convRules (a, b) = B.concat ["<reset>", u a, "</reset>\n<s>", B.concat $ map u b, "</s>"]
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    28
    where
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    29
        u a = B.concat ["\\","u",a]
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    30
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    31
toPair :: String -> (B.ByteString, [B.ByteString])
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    32
toPair s = (UTF8.fromString $ takeWhile isHexDigit s, map UTF8.fromString . words . takeWhile ((/=) ';') . tail $ dropWhile ((/=) '\t') s)
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    33
865a4089278d Now convert to xml rules
unc0rr
parents: 10064
diff changeset
    34
10064
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    35
main = do
bf1a5ef4ef14 confusables.txt to insert query converter
unc0rr
parents:
diff changeset
    36
    ll <- liftM (filter (isHexDigit . head) . filter (not . null) . lines) $ readFile "confusables.txt"
10075
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    37
    B.writeFile "rules.txt" . B.intercalate "\n" . map convRules . Map.toList . Map.fromList . filter notTooLong . filter fits16bit . map toPair $ ll
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    38
    where
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    39
        notTooLong = (>) 6 . length . snd
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    40
        fits16bit (a, b) = let f = (>) 5 . B.length in all f $ a:b
dbaf90a0fbe0 Filter confusables to some limits
unc0rr
parents: 10073
diff changeset
    41