суббота, 27 марта 2010 г.

russian.lsp

Обновил модуль "russian.lsp".

1. Нашёл баг в функции cyr-translit: "explode" неправильно работал из-под Windows с русскими буквами. Заменил конструкцию (explode linea) на (cyr-explode linea) и всё заработало. В функции cyr-explode пришлось выяснять текущую кодировку и использовать для препарирования строки или explode или find-all ".", смотря какой из вариантов работает.

2. Нашёл мелкий баг в функции cyr-utf-win: в конце строки возвращался (char nil), то есть, "\000". На печати он был не виден, поэтому баг я раньше не замечал.

; Russian encoding support by Anon. 27 Mar 2010.
; Varning: this module and codepages inside are far not complete. Some special
; symbols are not included.
;
; Functions:
;
; Usage: (cyr-win-utf "text in windows-1251 encoding")
; Decodes text from windows-1251 to utf-8
;
; Usage: (cyr-koi-utf "text in KOI8R encoding")
; Decodes text from KOI8R to utf-8
;
; Usage: (cyr-utf-win "text in utf-8 encoding")
; Decodes text from utf-8 to win-1251
;
; Usage: (url-encode-utf "В. Пупкин")
; Encodes strings for urls (utf)
; Output: "%D0%92.%20%D0%9F%D1%83%D0%BF%D0%BA%D0%B8%D0%BD"
;
; Usage: (url-encode-win "Василий Пупкин")
; Encodes strings for urls (windows-1251)
; Output: "%C2.%20%CF%F3%EF%EA%E8%ED"
;
; Usage: (cyr-rtf "\\u1055\\'3F\\u1086\\'3F\\u1089\\'3F\\u1083\\'3F")
; Decodes russian letters sequences in the .rtf file.
; Example: (cyr-rtf (read-file "report.rtf"))
;
; Usage: (cyr-rtf-r "спецификация")
; Encodes a string to the rtf russian letters format (useful for searching
; in raw-rtf). Expect problems with spaces and other \u special symbols.
;
; Usage: (cyr-translit "Щёлочь, кислота, алюминий") => "SCHyolochy, kislota, alyuminiy"
;
; Usage: (cyr-translit_ "Василий Пупкин") => "Vasiliy_Pupkin"

; Russian alphabet (33 small and 33 big letters)
(set 'cyr-alphabet (list "а" "б" "в" "г" "д" "е" "ё" "ж" "з" "и" "й" "к" "л" "м" "н" "о" "п" "р" "с" "т" "у" "ф" "х" "ц" "ч" "ш" "щ" "ъ" "ы" "ь" "э" "ю" "я" "А" "Б" "В" "Г" "Д" "Е" "Ё" "Ж" "З" "И" "Й" "К" "Л" "М" "Н" "О" "П" "Р" "С" "Т" "У" "Ф" "Х" "Ц" "Ч" "Ш" "Щ" "Ъ" "Ы" "Ь" "Э" "Ю" "Я"))

; Url-encoding for russian letters UTF
(set 'en-url-utf '(("а" "%D0%B0") ("б" "%D0%B1") ("в" "%D0%B2") ("г" "%D0%B3")
("д" "%D0%B4") ("е" "%D0%B5") ("ё" "%D1%91") ("ж" "%D0%B6") ("з" "%D0%B7")
("и" "%D0%B8") ("й" "%D0%B9") ("к" "%D0%BA") ("л" "%D0%BB") ("м" "%D0%BC")
("н" "%D0%BD") ("о" "%D0%BE") ("п" "%D0%BF") ("р" "%D1%80") ("с" "%D1%81")
("т" "%D1%82") ("у" "%D1%83") ("ф" "%D1%84") ("х" "%D1%85") ("ц" "%D1%86")
("ч" "%D1%87") ("ш" "%D1%88") ("щ" "%D1%89") ("ъ" "%D1%8A") ("ы" "%D1%8B")
("ь" "%D1%8C") ("э" "%D1%8D") ("ю" "%D1%8E") ("я" "%D1%8F") ("А" "%D0%90")
("Б" "%D0%91") ("В" "%D0%92") ("Г" "%D0%93") ("Д" "%D0%94") ("Е" "%D0%95")
("Ё" "%D0%81") ("Ж" "%D0%96") ("З" "%D0%97") ("И" "%D0%98") ("Й" "%D0%99")
("К" "%D0%9A") ("Л" "%D0%9B") ("М" "%D0%9C") ("Н" "%D0%9D") ("О" "%D0%9E")
("П" "%D0%9F") ("Р" "%D0%A0") ("С" "%D0%A1") ("Т" "%D0%A2") ("У" "%D0%A3")
("Ф" "%D0%A4") ("Х" "%D0%A5") ("Ц" "%D0%A6") ("Ч" "%D0%A7") ("Ш" "%D0%A8")
("Щ" "%D0%A9") ("Ъ" "%D0%AA") ("Ы" "%D0%AB") ("Ь" "%D0%AC") ("Э" "%D0%AD")
("Ю" "%D0%AE") ("Я" "%D0%AF")))

; Url-encoding for russian letters windows-1251
(set 'en-url-win '(("я" "%FF") ("ю" "%FE") ("э" "%FD") ("ь" "%FC") ("ы" "%FB")
("ъ" "%FA") ("щ" "%F9") ("ш" "%F8") ("ч" "%F7") ("ц" "%F6") ("х" "%F5")
("ф" "%F4") ("у" "%F3") ("т" "%F2") ("с" "%F1") ("р" "%F0") ("п" "%EF")
("о" "%EE") ("н" "%ED") ("м" "%EC") ("л" "%EB") ("к" "%EA") ("й" "%E9")
("и" "%E8") ("з" "%E7") ("ж" "%E6") ("ё" "%B8") ("е" "%E5") ("д" "%E4")
("г" "%E3") ("в" "%E2") ("б" "%E1") ("а" "%E0") ("Я" "%DF") ("Ю" "%DE")
("Э" "%DD") ("Ь" "%DC") ("Ы" "%DB") ("Ъ" "%DA") ("Щ" "%D9") ("Ш" "%D8")
("Ч" "%D7") ("Ц" "%D6") ("Х" "%D5") ("Ф" "%D4") ("У" "%D3") ("Т" "%D2")
("С" "%D1") ("Р" "%D0") ("П" "%CF") ("О" "%CE") ("Н" "%CD") ("М" "%CC")
("Л" "%CB") ("К" "%CA") ("Й" "%C9") ("И" "%C8") ("З" "%C7") ("Ж" "%C6")
("Ё" "%A8") ("Е" "%C5") ("Д" "%C4") ("Г" "%C3") ("В" "%C2") ("Б" "%C1")
("А" "%C0")))

; Url-encoding for special symbols
(set 'en-url-sym '((" " "%20") ("$" "%24") ("&" "%26") ("+" "%2B")
("," "%2C") ("/" "%2F") (":" "%3A") (";" "%3B") ("=" "%3D") ("?" "%3F") ("@" "%40")
(" " "%20") ("\"" "%22") ("<" "%3C") (">" "%3E") ("#" "%23") ("%" "%25") ("{" "%7B")
("}" "%7D") ("|" "%7C") ("\\" "%5C") ("^" "%5E") ("~" "%7E") ("[" "%5B") ("]" "%5D")
("`" "%60")))

; Encoding table utf-8
(set 'en-utf-8 '((53424 "а") (53425 "б") (53426 "в") (53427 "г") (53428 "д")
(53429 "е") (53649 "ё") (53430 "ж") (53431 "з") (53432 "и") (53433 "й")
(53434 "к") (53435 "л") (53436 "м") (53437 "н") (53438 "о") (53439 "п")
(53632 "р") (53633 "с") (53634 "т") (53635 "у") (53636 "ф") (53637 "х")
(53638 "ц") (53639 "ч") (53640 "ш") (53641 "щ") (53642 "ъ") (53643 "ы")
(53644 "ь") (53645 "э") (53646 "ю") (53647 "я") (53392 "А") (53393 "Б")
(53394 "В") (53395 "Г") (53396 "Д") (53397 "Е") (53377 "Ё") (53398 "Ж")
(53399 "З") (53400 "И") (53401 "Й") (53402 "К") (53403 "Л") (53404 "М")
(53405 "Н") (53406 "О") (53407 "П") (53408 "Р") (53409 "С") (53410 "Т")
(53411 "У") (53412 "Ф") (53413 "Х") (53414 "Ц") (53415 "Ч") (53416 "Ш")
(53417 "Щ") (53418 "Ъ") (53419 "Ы") (53420 "Ь") (53421 "Э") (53422 "Ю")
(53423 "Я") (14844052 "—") (14845078 "№")))

; Encoding table windows-1251
(set 'en-win-1251 '((255 "я") (254 "ю") (253 "э") (252 "ь") (251 "ы")
(250 "ъ") (249 "щ") (248 "ш") (247 "ч") (246 "ц") (245 "х") (244 "ф")
(243 "у") (242 "т") (241 "с") (240 "р") (239 "п") (238 "о") (237 "н")
(236 "м") (235 "л") (234 "к") (233 "й") (232 "и") (231 "з") (230 "ж")
(184 "ё") (229 "е") (228 "д") (227 "г") (226 "в") (225 "б") (224 "а")
(223 "Я") (222 "Ю") (221 "Э") (220 "Ь") (219 "Ы") (218 "Ъ") (217 "Щ")
(216 "Ш") (215 "Ч") (214 "Ц") (213 "Х") (212 "Ф") (211 "У") (210 "Т")
(209 "С") (208 "Р") (207 "П") (206 "О") (205 "Н") (204 "М") (203 "Л")
(202 "К") (201 "Й") (200 "И") (199 "З") (198 "Ж") (168 "Ё") (197 "Е")
(196 "Д") (195 "Г") (194 "В") (193 "Б") (192 "А") (185 "№")))

; Encoding table KOI8R
(set 'en-koi8r '((241 "Я") (224 "Ю") (252 "Э") (248 "Ь") (249 "Ы")
(255 "Ъ") (253 "Щ") (251 "Ш") (254 "Ч") (227 "Ц") (232 "Х") (230 "Ф")
(245 "У") (244 "Т") (243 "С") (242 "Р") (240 "П") (239 "О") (238 "Н")
(237 "М") (236 "Л") (235 "К") (234 "Й") (233 "И") (250 "З") (246 "Ж")
(179 "Ё") (229 "Е") (228 "Д") (231 "Г") (247 "В") (226 "Б") (225 "А")
(209 "я") (192 "ю") (220 "э") (216 "ь") (217 "ы") (223 "ъ") (221 "щ")
(219 "ш") (222 "ч") (195 "ц") (200 "х") (198 "ф") (213 "у") (212 "т")
(211 "с") (210 "р") (208 "п") (207 "о") (206 "н") (205 "м") (204 "л")
(203 "к") (202 "й") (201 "и") (218 "з") (214 "ж") (163 "ё") (197 "е")
(196 "д") (199 "г") (215 "в") (194 "б") (193 "а")))

; RTF-encoding (4-digit one). Warning: instead of "3f" sometimes you can see "3F".
(set 'en-rtf '(({\u1072\'3f} {а}) ({\u1073\'3f} {б}) ({\u1074\'3f} {в})
({\u1075\'3f} {г}) ({\u1076\'3f} {д}) ({\u1077\'3f} {е}) ({\u1105\'3f} {ё})
({\u1078\'3f} {ж}) ({\u1079\'3f} {з}) ({\u1080\'3f} {и}) ({\u1081\'3f} {й})
({\u1082\'3f} {к}) ({\u1083\'3f} {л}) ({\u1084\'3f} {м}) ({\u1085\'3f} {н})
({\u1086\'3f} {о}) ({\u1087\'3f} {п}) ({\u1088\'3f} {р}) ({\u1089\'3f} {с})
({\u1090\'3f} {т}) ({\u1091\'3f} {у}) ({\u1092\'3f} {ф}) ({\u1093\'3f} {х})
({\u1094\'3f} {ц}) ({\u1095\'3f} {ч}) ({\u1096\'3f} {ш}) ({\u1097\'3f} {щ})
({\u1098\'3f} {ъ}) ({\u1099\'3f} {ы}) ({\u1100\'3f} {ь}) ({\u1101\'3f} {э})
({\u1102\'3f} {ю}) ({\u1103\'3f} {я}) ({\u1040\'3f} {А}) ({\u1041\'3f} {Б})
({\u1042\'3f} {В}) ({\u1043\'3f} {Г}) ({\u1044\'3f} {Д}) ({\u1045\'3f} {Е})
({\u1025\'3f} {Ё}) ({\u1046\'3f} {Ж}) ({\u1047\'3f} {З}) ({\u1048\'3f} {И})
({\u1049\'3f} {Й}) ({\u1050\'3f} {К}) ({\u1051\'3f} {Л}) ({\u1052\'3f} {М})
({\u1053\'3f} {Н}) ({\u1054\'3f} {О}) ({\u1055\'3f} {П}) ({\u1056\'3f} {Р})
({\u1057\'3f} {С}) ({\u1058\'3f} {Т}) ({\u1059\'3f} {У}) ({\u1060\'3f} {Ф})
({\u1061\'3f} {Х}) ({\u1062\'3f} {Ц}) ({\u1063\'3f} {Ч}) ({\u1064\'3f} {Ш})
({\u1065\'3f} {Щ}) ({\u1066\'3f} {Ъ}) ({\u1067\'3f} {Ы}) ({\u1068\'3f} {Ь})
({\u1069\'3f} {Э}) ({\u1070\'3f} {Ю}) ({\u1071\'3f} {Я})
({\u160\'3f} { }) ({\u8470\'3f} {№})))

; Translit encoding
(set 'translit '(("а" "a") ("б" "b") ("в" "v") ("г" "g") ("д" "d") ("е" "e") ("ё" "yo")
("ж" "zh") ("з" "z") ("и" "i") ("й" "y") ("к" "k") ("л" "l") ("м" "m") ("н" "n")
("о" "o") ("п" "p") ("р" "r") ("с" "s") ("т" "t") ("у" "u") ("ф" "f") ("х" "h")
("ц" "c") ("ч" "ch") ("ш" "sh") ("щ" "sch") ("ъ" "y") ("ы" "y") ("ь" "y") ("э" "e")
("ю" "yu") ("я" "ya") ("А" "A") ("Б" "B") ("В" "V") ("Г" "G") ("Д" "D") ("Е" "E")
("Ё" "YO") ("Ж" "ZH") ("З" "Z") ("И" "I") ("Й" "Y") ("К" "K") ("Л" "L") ("М" "M")
("Н" "N") ("О" "O") ("П" "P") ("Р" "R") ("С" "S") ("Т" "T") ("У" "U") ("Ф" "F")
("Х" "H") ("Ц" "C") ("Ч" "CH") ("Ш" "SH") ("Щ" "SCH") ("Ъ" "Y") ("Ы" "Y") ("Ь" "Y")
("Э" "E") ("Ю" "YU") ("Я" "YA")))

; Internal function. Explodes string to bytes
(define (cortar linea)
(unpack (dup "b" (length linea)) linea))

; Internal function. Applies codetable to the list of bytes
(define (en-de-byte linea tabla (o-linea ""))
(dolist (x (cortar linea))
(push (or (lookup x tabla) (char x)) o-linea -1))
o-linea)

; Internal function. Explodes string to letters
; There can be surprises, becouse both find-all and explode
; should work (in theory) with any encoding.
(define (cyr-explode linea)
(if (= (last (parse ((set-locale) 0) ".")) "1251")
(find-all "." linea)
(explode linea)))

; Usage: (cyr-translit "Щёлочь, кислота, аллюминий") => "SCHyolochy, kislota, allyuminiy"
(define (cyr-translit linea)
(join (map (fn (x) (or (lookup x translit) x)) (cyr-explode linea))))

; Usage: (cyr-translit_ "Василий Пупкин") => "Vasiliy_Pupkin"
(define (cyr-translit_ linea)
(replace " " (cyr-translit linea) "_"))

; Usage: (cyr-rtf "\\u1055\\'3F\\u1086\\'3F\\u1089\\'3F\\u1083\\'3F")
; Decodes russian letters in the .rtf file.
(define (cyr-rtf linea)
(replace {\\u[18][0-9]+\\'3f} linea (lookup (lower-case $it) en-rtf) 1))

; Usage: (cyr-rtf-r "спецификация")
; Encodes a string to the rtf russian letters format (useful for searching
; in raw-rtf). Expect problems with spaces and other \u special symbols.
(define (cyr-rtf-r linea)
(let (letra "" efecto "")
(while (!= (set 'letra (pop linea)) "")
(push (or (lookup letra (map reverse en-rtf)) letra) efecto -1))
(replace "\\" efecto "\\\\")
(replace "'3f" efecto "'3F")))

; Usage: (url-encode-utf "Василий Пупкин")
; Encodes strings for urls (utf)
(define (url-encode-utf t-linea)
(set 't-out "")
(dostring (t-char t-linea)
(push (or (lookup (char t-char) en-url-utf)
(lookup (char t-char) en-url-sym)
(char t-char)) t-out -1))
t-out)

; Usage: (url-encode-win "Василий Пупкин")
; Encodes strings for urls (windows-1251)
(define (url-encode-win t-linea)
(set 't-out "")
(dostring (t-char t-linea)
(push (or (lookup (char t-char) en-url-win)
(lookup (char t-char) en-url-sym)
(char t-char)) t-out -1))
t-out)

; Usage: (cyr-win-utf "text in windows-1251 encoding")
; Decodes text from windows-1251 to utf-8
(define (cyr-win-utf linea)
(en-de-byte linea en-win-1251))

; Usage: (cyr-koi-utf "text in KOI8R encoding")
; Decodes text from KOI8R to utf-8
(define (cyr-koi-utf linea)
(en-de-byte linea en-koi8r))

; Usage: (cyr-utf-win "text in utf-8 encoding")
; Decodes text from utf-8 to win-1251
(define (cyr-utf-win linea)
(let (linea (cortar linea) o-linea "" f-byte nil)
(while (set 'f-byte (pop linea))
(if (or (= f-byte 208) (= f-byte 209))
(push (or (lookup (+ (mul f-byte 256) (or (pop linea) 0)) en-utf-8) "?") o-linea -1)
(if (= f-byte 226)
(push (or
(lookup (+ 14811136 (mul (or (pop linea) 0) 256) (or (pop linea) 0)) en-utf-8)
"?") o-linea -1)
(push (char f-byte) o-linea -1))))
o-linea))

; Functions commented below will be useful for adding another charsets,
; like CP866 or ISO-8859-5 or whatever
;
; Encodes one russian utf char to url-ready form
;(define (url-encode-char t-char)
; (append
; "%" (format "%X" ((unpack "b b" t-char) 0))
; "%" (format "%X" ((unpack "b b" t-char) 1))))
;
;(set 'url-charset '())
;(dolist (x cyr-alpabet)
; (push (append (list x) (list (url-encode-char x))) url-charset -1))
;
; Types list ready to insert to the code
;(set 'counter 2)
;(dolist (x en-win-1251)
; (print "(\"" (x 1) "\" \"%" (format "%X" (x 0)) "\") ")
; (if (= counter 6)
; (begin (print "\n ") (set 'counter 0)))
; (set 'counter (+ counter 1)))

Комментариев нет:

Отправить комментарий

Архив блога