Paste: io.encodings.korean

Author: ageldama
Mode: factor
Date: Wed, 11 Feb 2009 14:46:20
Plain Text |
! Copyright (C) 2009 Yun, Jonhyouk.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays assocs byte-arrays io io.encodings io.encodings.ascii
io.encodings.iana io.files io.streams.string kernel locals math
math.order math.parser memoize multiline namespaces sequences
splitting ;
IN: io.encodings.korean


SINGLETON: cp949

ALIAS: ms949 cp949
ALIAS: euc-kr cp949

cp949 "EUC-KR" register-encoding



<PRIVATE

! parse cp949.txt -> table

: cp949.txt-lines ( -- seq )
    ! "cp949.txt" from ...
    ! <http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT>
    "resource:work/io/encodings/korean/data/cp949.txt"
    ascii file-lines ;

: (PCL-drop-comments) ( seq -- newseq )
    [ "#" split1 drop ] map harvest ;

: (PCL-split-column) ( line -- columns )
    "\t" split 2 head ;

: (PCL-parse-hex) ( s -- n )
    2 short tail hex> ;

: (PCL-parse-line) ( line -- code-unicode )
    (PCL-split-column)
    [ (PCL-parse-hex) ] map ;

: process-codetable-lines ( lines -- assoc )
    (PCL-drop-comments)
    [ (PCL-parse-line) ] map ;


! convert cp949 <-> unicode

MEMO: cp949.txt>alist ( -- alist )
    cp949.txt-lines process-codetable-lines ;

cp949.txt>alist drop ! parse time hack

MEMO: cp949.txt>alist* ( -- alist )
    cp949.txt>alist [ reverse ] map ;

cp949.txt>alist* drop ! parse time hack (2)


MEMO: cp949->unicode ( b -- u )
    cp949.txt>alist at ;

MEMO: unicode->cp949 ( u -- b )
    cp949.txt>alist* at ;

:: (2b->1mb) ( c1 c2 -- mb )
    c1 8 shift c2 + ;

:: (1mb->1st) ( mb -- c1 )
    mb HEX: ff00 bitand -8 shift ;

:: (1mb->2nd) ( mb -- c2 )
    mb HEX: ff bitand ;

:: (1mb->2b) ( mb -- c1 c2 )
    mb (1mb->1st)
    mb (1mb->2nd) ;

: cp949-1st? ( n -- ? )
    dup f = not
    [ HEX: 81 HEX: fe between? ] when ;

: (1byte-unicode?) ( n -- ? )
    0 HEX: ff between? ;



M: cp949 encode-char ( char stream encoding -- )
    [let | encoding [ ]
           stream [ ]
           char [ ] |
        char unicode->cp949 (1byte-unicode?)
        [ char 1byte-array
          stream stream-write ]
        [ char unicode->cp949
          (1mb->2b) 2byte-array
          stream stream-write ]
        if ] ;

    
: (replace-eof) ( -- n ) HEX: fffd ;

:: (decode-char) ( stream -- char/f )
    stream stream-read1
    dup cp949-1st?
    [ ]
    [ stream stream-read1 (2b->1mb) cp949->unicode ]
    if ;

M: cp949 decode-char ( stream encoding -- char/f )
    drop (decode-char)
    dup f =
    [ drop (replace-eof) ]
    [ ] if ;


! TODO: <encoder>

! TODO: <decoder>




! EOF

New Annotation

Summary:
Author:
Mode:
Body: