########################################################################
# $Id: CJK.perl,v 1.8 2002/04/26 16:06:52 RRM Exp $
# CJK.perl
#   Jens Lippmann,
#   Boy Yang,
#   Werner Lemberg
#
# Extension to LaTeX2HTML V 96.2 to supply support for the
# "CJK" LaTeX package.
#
########################################################################
# Change Log:
# ===========
#  jcl = Jens Lippmann
#
# $Log: CJK.perl,v $
# Revision 1.8  2002/04/26 16:06:52  RRM
#  --  JIS is EUC-JP, not ISO-2022-JP.
#
# Revision 1.7  2002/04/26 14:17:31  RRM
#  --  fixed MIME names for the encodings; thanks to Jungshik Shin for
#      the correct names
#
# Revision 1.6  2002/04/24 22:27:00  RRM
#  --  automatic recognition of document charset, based upon the
#      encoding in the first {CJK} or {CJK*} environment.
#
# Revision 1.5  1999/06/06 14:24:59  MRO
#
#
# -- many cleanups wrt. to TeXlive
# -- changed $* to /m as far as possible. $* is deprecated in perl5, all
#    occurrences should be removed.
#
# Revision 1.4  1999/04/09 18:11:27  JCL
# changed my e-Mail address
#
# Revision 1.3  1998/02/19 22:24:26  latex2html
# th-darmstadt -> tu-darmstadt
#
# Revision 1.2  1996/12/17 17:11:41  JCL
# typo
#
# Revision 1.1  1996/12/17 17:07:32  JCL
# - introduced to CVS repository
# - adjusted technical notes according to Werner's proposal
# - added support for CJK* environment
#
# jcl 16-DEC-96 - Created
#
########################################################################
# Notes:
#  You may view the results only with a browser configured for the
#  specific language.
#  To configure the browser, use e.g. the "document encoding" menu
#  of Netscape.
#
# Technical Notes:
#  We use the pre_process hook to change any text coming in to
#  LaTeX2HTML such that we convert from the outer representation
#  of double byte characters to an inner, LaTeX2HTML specific
#  representation.
#  The two outer representations recognized are described as follows:
#   o standard CJK encodings (GB, KS, Big5, SJIS, etc.)
#     Each symbol is formed by two characters, the first in the range
#     [\201-\237\241-\376] (octal) or 0x81-0x9F, 0xA1-0xFE (hexadecimal),
#     the second in the range
#     [\100-\176\200-\377] (octal) or 0x40-0x7E, 0x80-0xFF (hexadecimal).
#   o CJK internal encoding (to conveniently use CJK processed files)
#     Each symbol is a sequence with a leading character in the range
#     [\201-\237\241-\376] or 0x81-0x9F, 0xA1-0xFE,
#     a sequence of digits forming the decimal representation of the
#     second character from standard encoded form (e.g. "65", "128"),
#     and a trailing 0xFF.
#  The internal LaTeX2HTML representation is the same as the CJK
#  encoded form.
#  Additionally, we handle TeX's normalized representation of special
#  characters (e.g. ^^e4), which is helpful when LaTeX2HTML processes
#  the .aux file.
#
#  The post_process hook will convert the LaTeX2HTML internal coding
#  into standard Big5/SJIS encoding, which then remains in the
#  HTML text.
#
#  The revert_to_raw_tex hook will convert the internal encoding
#  back to standard encoding to help with image creation.
#
########################################################################

package main;

# Possible values for the encoding argument of \begin{CJK} (the brace
# group that follows the optional font-encoding argument, see do_env_CJK)
# and the corresponding document charset.
%CJK_charset = (
    'Bg5'    => 'Big5',
    'Bg5+'   => 'Big5Plus',
    'Bg5hk'  => 'Big5-HKSCS',
    'GB'     => 'gb2312',
    'GBt'    => 'gbt_12345',
    'GBK'    => 'GBK',
#   'JIS'    => 'ISO-2022-JP',      # superseded, see revision 1.8
    'JIS'    => 'EUC-JP',
    'SJIS'   => 'Shift_JIS',
    'KS'     => 'EUC-KR',
    'UTF8'   => 'UTF-8',
    'EUC-TW' => 'X-EUC-TW',
    'EUC-JP' => 'EUC-JP',
    'EUC-KR' => 'EUC-KR',
    'CP949'  => 'X-Windows-949',
);

# Use 'Bg5' => 'Big5' as default charset, for both input and output,
# unless it is set already with a value for $CJK_AUTO_CHARSET.
# These are deliberately package globals: do_env_CJK below updates them.
$CJK_AUTO_CHARSET = '' unless (defined $CJK_AUTO_CHARSET);
$charset = $CHARSET = $CJK_AUTO_CHARSET || $CJK_charset{'Bg5'};

# Convert incoming text (in $_) from the outer representations to the
# l2h internal form.  Works in place on $_.
sub pre_pre_process {
    # Handle TeX's normalized special character encoding.
    # This *might* be done by LaTeX2HTML, too, but yet we don't
    # rely on it.
    # ^^X with a non-hex-digit X: character code shifted by 64, modulo 128.
    s/\^\^([^0-9a-f])/chr((64+ord($1))&127)/gem;
    # ^^xx with two lowercase hex digits: the character with that code.
    s/\^\^([0-9a-f][0-9a-f])/chr(hex($1))/gem;
    # Care for standard CJK encoding -> l2h internal form:
    # lead byte, decimal code of the second byte, then \377.
    # Note that the second byte is matched up to \376 only, so \377
    # (used as the terminator) is never taken as a second byte.
    s/([\201-\237\241-\376])([\100-\176\200-\376])/"$1" . ord($2) . "\377"/gem;
}

# Convert $_ in place: l2h internal form -> standard CJK encoding.
sub post_post_process {
    s/([\201-\237\241-\376])(\d+)\377/"$1" . chr($2)/ge;
}

# Same conversion as post_post_process, applied to $_ in place:
# l2h internal form -> standard CJK encoding.
sub revert_to_raw_tex_hook {
    s/([\201-\237\241-\376])(\d+)\377/"$1" . chr($2)/ge;
}

# Handle \CJKchar[..]{first}{second}.
# Takes the text following the command and returns it with the two
# brace groups replaced by the l2h internal form of the symbol.
sub do_cmd_CJKchar {
    local($_) = @_;
    # the optional argument is read and discarded
    &get_next_optional_argument;
    # first group: numeric code -> the lead character itself
    s/$next_pair_rx/chr($2)/eo;
    # second group: numeric code kept as digits, terminated by \377
    # NOTE(review): if $next_pair_rx is anchored at the start of the text,
    # this substitution cannot match once the lead character has been put
    # in front by the line above -- confirm against LaTeX2HTML's definition.
    s/$next_pair_rx/$2\377/o;
    $_;
}

# Handle CJK environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
#
# Takes the environment text (starting with its arguments) and returns
# it with the arguments removed.  The first known encoding seen in a
# document sets $CJK_AUTO_CHARSET, $charset and $CHARSET; a later,
# different encoding only produces warnings.
sub do_env_CJK {
    local($_) = @_;
    my ($cjk_enc);

    # skip font encoding
    &get_next_optional_argument;

    # handle CJK encoding
    $cjk_enc = &missing_braces unless (
        (s/$next_pair_pr_rx/$cjk_enc = $2; ''/eo)
        ||(s/$next_pair_rx/$cjk_enc = $2; ''/eo));
    $cjk_enc =~ s/^\s+|\s+$//g;
    if ($cjk_enc) {
        if (!defined $CJK_charset{$cjk_enc}) {
            &write_warning ( "unknown charset code: $cjk_enc in CJK environment.");
        } elsif (!$CJK_AUTO_CHARSET) {
            # first encoding seen: it decides the document charset
            $CJK_AUTO_CHARSET = $charset = $CHARSET = $CJK_charset{$cjk_enc};
        } elsif ($CHARSET eq $CJK_charset{$cjk_enc}) {
            # compatible; do nothing.
        } else {
            &write_warning ( "Only one charset allowed per document: $CHARSET");
            &write_warning ( "Ignoring request for ".$CJK_charset{$cjk_enc});
        }
    }

    # skip CJK font family
    s/$next_pair_rx//o;
    $_;
}

# Handle CJK* environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
# We won't catch single newlines following CJK symbols, because
# this would require to suppress the newlines in the HTML output,
# leading to overly long lines.
#
sub do_env_CJKstar {
    # "&do_env_CJK;" without parentheses hands our own @_ on unchanged.
    local($_) = &do_env_CJK;
    # CJK symbols eat ensuing white space (blanks and tabs only)
    s/([\201-\237\241-\376]\d+\377)[ \t]+/$1/g;
    $_;
}

# most of the commands here need some action which is not implemented yet.
# Register the CJK commands that are accepted but not acted upon.
# Each heredoc line names one command; the "# {}" / "# []" entries after
# a name presumably tell ignore_commands which mandatory / optional
# arguments to skip -- see ignore_commands in LaTeX2HTML to confirm.
# The heredoc terminator has to stand on a line of its own.
&ignore_commands(<<_IGNORED_CMDS_);
CJKCJKchar
CJKboldshift
CJKcaption # {}
CJKenc # {}
CJKencfamily # [] # {} # {}
CJKfamily # {}
CJKfontenc # {} # {}
CJKglue
CJKhangul
CJKhangulchar
CJKhanja
CJKkern
CJKlatinchar
CJKnospace
CJKspace
CJKtilde
CJKtolerance
CJKuppercase
Unicode # {} # {}
nbs
standardtilde
_IGNORED_CMDS_

# we need \AtBeginDocument and \AtEndDocument
&ignore_commands(<<_IGNORED_CMDS_);
AtBeginDocument # {}
AtEndDocument # {}
_IGNORED_CMDS_

# This must be the last line.
1;