man/man3/Tcl_ExternalToUtfDString.3 - toolchains/skids - Git at Google

 '\"
 '\" Copyright (c) 1997-1998 Sun Microsystems, Inc.
 '\"
 '\" See the file "license.terms" for information on usage and redistribution
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 '\"
 '\" RCS: @(#) $Id: Encoding.3,v 1.11 2002/07/01 18:24:39 jenglish Exp $
 '\"
 '\" The definitions below are for supplemental macros used in Tcl/Tk
 '\" manual entries.
 '\"
 '\" .AP type name in/out ?indent?
 '\"	Start paragraph describing an argument to a library procedure.
 '\"	type is type of argument (int, etc.), in/out is either "in", "out",
 '\"	or "in/out" to describe whether procedure reads or modifies arg,
 '\"	and indent is equivalent to second arg of .IP (shouldn't ever be
 '\"	needed;  use .AS below instead)
 '\"
 '\" .AS ?type? ?name?
 '\"	Give maximum sizes of arguments for setting tab stops.  Type and
 '\"	name are examples of largest possible arguments that will be passed
 '\"	to .AP later.  If args are omitted, default tab stops are used.
 '\"
 '\" .BS
 '\"	Start box enclosure.  From here until next .BE, everything will be
 '\"	enclosed in one large box.
 '\"
 '\" .BE
 '\"	End of box enclosure.
 '\"
 '\" .CS
 '\"	Begin code excerpt.
 '\"
 '\" .CE
 '\"	End code excerpt.
 '\"
 '\" .VS ?version? ?br?
 '\"	Begin vertical sidebar, for use in marking newly-changed parts
 '\"	of man pages.  The first argument is ignored and used for recording
 '\"	the version when the .VS was added, so that the sidebars can be
 '\"	found and removed when they reach a certain age.  If another argument
 '\"	is present, then a line break is forced before starting the sidebar.
 '\"
 '\" .VE
 '\"	End of vertical sidebar.
 '\"
 '\" .DS
 '\"	Begin an indented unfilled display.
 '\"
 '\" .DE
 '\"	End of indented unfilled display.
 '\"
 '\" .SO
 '\"	Start of list of standard options for a Tk widget.  The
 '\"	options follow on successive lines, in four columns separated
 '\"	by tabs.
 '\"
 '\" .SE
 '\"	End of list of standard options for a Tk widget.
 '\"
 '\" .OP cmdName dbName dbClass
 '\"	Start of description of a specific option.  cmdName gives the
 '\"	option's name as specified in the class command, dbName gives
 '\"	the option's name in the option database, and dbClass gives
 '\"	the option's class in the option database.
 '\"
 '\" .UL arg1 arg2
 '\"	Print arg1 underlined, then print arg2 normally.
 '\"
 '\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $
 '\"
 '\"	# Set up traps and other miscellaneous stuff for Tcl/Tk man pages.
 '\"	# In troff mode only, plant a trap 1.3i above the bottom of the page
 '\"	# that springs the ^B macro (defined further down in this file).
 .if t .wh -1.3i ^B
 '\"	# Remember the line length in effect at this point in register ^l;
 '\"	# the BE, VE and ^B macros read it back when drawing.
 .nr ^l \n(.l
 '\"	# Adjust text to both margins.
 .ad b
 '\"	# AP - start an argument description
 '\"	#   $1 = type, $2 = name, $3 = in/out, $4 = optional indent
 '\"	# Opens a .TP paragraph: indented by $4 when given, otherwise by
 '\"	# register )C when a name is present, otherwise by 15.  Tab stops are
 '\"	# taken from registers )A and )B, which the AS macro computes.
 '\"	# Output is "type<tab>name<tab>(in/out)" with the name in italics; when
 '\"	# $3 is empty a break is forced first and only "type<tab>name" is
 '\"	# printed, or just $1 in italics when $2 is empty as well.
 .de AP
 .ie !"\\$4"" .TP \\$4
 .el \{\
 .   ie !"\\$2"" .TP \\n()Cu
 .   el          .TP 15
 .\}
 .ta \\n()Au \\n()Bu
 .ie !"\\$3"" \{\
 \&\\$1	\\fI\\$2\\fP	(\\$3)
 .\".b
 .\}
 .el \{\
 .br
 .ie !"\\$2"" \{\
 \&\\$1	\\fI\\$2\\fP
 .\}
 .el \{\
 \&\\fI\\$1\\fP
 .\}
 .\}
 ..
 '\"	# AS - define tabbing values for .AP
 '\"	#   $1 = widest type, $2 = widest name (both optional)
 '\"	# Registers set here:
 '\"	#   )A = width of $1 plus 3n, or 10n when $1 is empty
 '\"	#   )B = )A plus width of $2 plus 3n, or )A plus 15n when $2 is empty
 '\"	#   )C = )B plus the width of "(in/out)" plus 2n
 .de AS
 .nr )A 10n
 .if !"\\$1"" .nr )A \\w'\\$1'u+3n
 .nr )B \\n()Au+15n
 .\"
 .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
 .nr )C \\n()Bu+\\w'(in/out)'u+2n
 ..
 '\"	# Establish default tab values once; the macro body reads only $1 and $2.
 .AS Tcl_Interp Tcl_CreateInterp in/out
 '\"	# BS - start boxed text
 '\"	# ^y = starting y location
 '\"	# ^b = 1
 '\"	# Forces a break, records the current vertical position in ^y and sets
 '\"	# ^b to 1.  In nroff mode it additionally draws a rule of underscores
 '\"	# as long as the current line length: no-fill, zero temporary indent,
 '\"	# the rule itself, then fill mode back on.
 .de BS
 .br
 .mk ^y
 .nr ^b 1u
 .if n .nf
 .if n .ti 0
 .if n \l'\\n(.lu\(ul'
 .if n .fi
 ..
 '\"	# BE - end boxed text (draw box now)
 '\"	# Switches to no-fill with zero temporary indent and records the bottom
 '\"	# position in ^t.  nroff gets a single rule of length ^l.  troff draws
 '\"	# the box between ^y and ^t: all four sides when ^b is 1, and without
 '\"	# the top edge otherwise (^B sets ^b to 2 once a box has crossed a page
 '\"	# boundary).  Fill mode is then restored, a break is forced and ^b is
 '\"	# cleared.
 '\"	# Both arms of the inner .ie/.el open their own \{ ... \} block so that
 '\"	# every brace escape in this macro is paired.
 .de BE
 .nf
 .ti 0
 .mk ^t
 .ie n \l'\\n(^lu\(ul'
 .el \{\
 .\"	Draw four-sided box normally, but don't draw top of
 .\"	box if the box started on an earlier page.
 .ie !\\n(^b-1 \{\
 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
 .\}
 .el \{\
 \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
 .\}
 .\}
 .fi
 .br
 .nr ^b 0
 ..
 '\"	# VS - start vertical sidebar
 '\"	# ^Y = starting y location
 '\"	# ^v = 1 (for troff;  for nroff this doesn't matter)
 '\"	#   $1 = version (not read by the macro), $2 = if present, break first
 '\"	# Records the current vertical position in ^Y.  nroff turns on a
 '\"	# margin character (a 12-point \(br); troff only sets ^v to 1, and the
 '\"	# bar itself is drawn later by VE or ^B.
 .de VS
 .if !"\\$2"" .br
 .mk ^Y
 .ie n 'mc \s12\(br\s0
 .el .nr ^v 1u
 ..
 '\"	# VE - end of vertical sidebar
 '\"	# nroff: switch the margin character off again.
 '\"	# troff: in environment 2, with no-fill and zero temporary indent, mark
 '\"	# the current position in ^t and draw the bar at ^l+3n from the left
 '\"	# margin, reaching from ^Y to ^t; then back up one line, restore fill
 '\"	# mode and return to the previous environment.
 '\"	# ^v is reset to 0 in both cases.
 .de VE
 .ie n 'mc
 .el \{\
 .ev 2
 .nf
 .ti 0
 .mk ^t
 \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
 .sp -1
 .fi
 .ev
 .\}
 .nr ^v 0
 ..
 '\"	# Special macro to handle page bottom:  finish off current
 '\"	# box/sidebar if in box/sidebar mode, then invoke standard
 '\"	# page bottom macro.
 '\"	# Drawing is done in environment 2.  When ^b is non-zero the open box
 '\"	# is drawn down to the current position (with its top edge only if ^b
 '\"	# is 1); when ^v is non-zero the open sidebar is drawn likewise.  After
 '\"	# .bp, ^y / ^Y are marked again on the new page and ^b becomes 2 so
 '\"	# that the top edge of a continued box is not drawn a second time.
 .de ^B
 .ev 2
 'ti 0
 'nf
 .mk ^t
 .if \\n(^b \{\
 .\"	Draw three-sided box if this is the box's first page,
 .\"	draw two sides but no top otherwise.
 .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
 .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
 .\}
 .if \\n(^v \{\
 .nr ^x \\n(^tu+1v-\\n(^Yu
 \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
 .\}
 .bp
 'fi
 .ev
 .if \\n(^b \{\
 .mk ^y
 .nr ^b 2
 .\}
 .if \\n(^v \{\
 .mk ^Y
 .\}
 ..
 '\"	# DS - begin display
 '\"	# Indents via .RS, turns filling off and leaves one blank line.
 '\"	# Paired with DE, which undoes the indent and fill changes.
 .de DS
 .RS
 .nf
 .sp
 ..
 '\"	# DE - end display
 '\"	# Turns filling back on, ends the .RS indent opened by DS and leaves
 '\"	# one blank line.
 .de DE
 .fi
 .RE
 .sp
 ..
 '\"	# SO - start of list of standard options
 '\"	# Emits the "STANDARD OPTIONS" section heading, then switches to
 '\"	# no-fill with tab stops at 5.5c and 11c and selects the bold font for
 '\"	# the option names that follow.  SE restores fill mode and roman font.
 .de SO
 .SH "STANDARD OPTIONS"
 .LP
 .nf
 .ta 5.5c 11c
 .ft B
 ..
 '\"	# SE - end of list of standard options
 '\"	# Restores fill mode and the roman font, starts a new paragraph and
 '\"	# prints a pointer to the "options" manual entry.
 .de SE
 .fi
 .ft R
 .LP
 See the \\fBoptions\\fR manual entry for details on the standard options.
 ..
 '\"	# OP - start of full description for a single option
 '\"	#   $1 = command-line name, $2 = database name, $3 = database class
 '\"	# Prints the three names in bold, one per line in no-fill mode, aligned
 '\"	# at a 4c tab stop, then turns filling back on and opens an indented
 '\"	# paragraph (.IP) for the description text that follows.
 .de OP
 .LP
 .nf
 .ta 4c
 Command-Line Name:	\\fB\\$1\\fR
 Database Name:	\\fB\\$2\\fR
 Database Class:	\\fB\\$3\\fR
 .fi
 .IP
 ..
 '\"	# CS - begin code excerpt
 '\"	# Indents via .RS, turns filling off and sets tab stops at every
 '\"	# quarter inch up to 1i.  Paired with CE.
 .de CS
 .RS
 .nf
 .ta .25i .5i .75i 1i
 ..
 '\"	# CE - end code excerpt
 '\"	# Turns filling back on and ends the .RS indent opened by CS.
 .de CE
 .fi
 .RE
 ..
 '\"	# UL - print $1 underlined, then print $2 normally
 '\"	# After $1 is set, \l draws an underrule back to horizontal position 0,
 '\"	# and $2 follows immediately.
 .de UL
 \\$1\l'|0\(ul'\\$2
 ..
 .TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
 .BS
 .SH NAME
 Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings.
 .SH SYNOPSIS
 .nf
 \fB#include <tcl.h>\fR
 .sp
 Tcl_Encoding
 \fBTcl_GetEncoding\fR(\fIinterp, name\fR)
 .sp
 void
 \fBTcl_FreeEncoding\fR(\fIencoding\fR)
 .sp
 char *
 \fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
 .sp
 int
 \fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
 	dstCharsPtr\fR)
 .sp
 char *
 \fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
 .sp
 int
 \fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
 	dstCharsPtr\fR)
 .sp
 char *
 \fBTcl_WinTCharToUtf\fR(\fItsrc, srcLen, dstPtr\fR)
 .sp
 TCHAR *
 \fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR)
 .sp
 CONST char *
 \fBTcl_GetEncodingName\fR(\fIencoding\fR)
 .sp
 int
 \fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
 .sp
 void
 \fBTcl_GetEncodingNames\fR(\fIinterp\fR)
 .sp
 Tcl_Encoding
 \fBTcl_CreateEncoding\fR(\fItypePtr\fR)
 .sp
 CONST char *
 \fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
 .sp
 void
 \fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)


 .SH ARGUMENTS
 .AS Tcl_EncodingState *dstWrotePtr
 .AP Tcl_Interp *interp in
 Interpreter to use for error reporting, or NULL if no error reporting is
 desired.
 .AP "CONST char" *name in
 Name of encoding to load.
 .AP Tcl_Encoding encoding in
 The encoding to query, free, or use for converting text.  If \fIencoding\fR is
 NULL, the current system encoding is used.
 .AP "CONST char" *src in
 For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
 specified encoding that are to be converted to UTF-8.  For the
 \fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of
 UTF-8 characters to be converted to the specified encoding.
 .AP "CONST TCHAR" *tsrc in
 An array of Windows TCHAR characters to convert to UTF-8.
 .AP int srcLen in
 Length of \fIsrc\fR or \fItsrc\fR in bytes.  If the length is negative, the
 encoding-specific length of the string is used.
 .AP Tcl_DString *dstPtr out
 Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted
 result will be stored.
 .AP int flags in
 Various flag bits OR-ed together.
 TCL_ENCODING_START signifies that the
 source buffer is the first block in a (potentially multi-block) input
 stream, telling the conversion routine to reset to an initial state and
 perform any initialization that needs to occur before the first byte is
 converted.  TCL_ENCODING_END signifies that the source buffer is the last
 block in a (potentially multi-block) input stream, telling the conversion
 routine to perform any finalization that needs to occur after the last
 byte is converted and then to reset to an initial state.
 TCL_ENCODING_STOPONERROR signifies that the conversion routine should
 return immediately upon reading a source character that doesn't exist in
 the target encoding; otherwise a default fallback character will
 automatically be substituted.
 .AP Tcl_EncodingState *statePtr in/out
 Used when converting a (generally long or indefinite length) byte stream
 in a piece by piece fashion.  The conversion routine stores its current
 state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the
 current piece) has been converted; that state information must be passed
 back when converting the next piece of the stream so the conversion
 routine knows what state it was in when it left off at the end of the
 last piece.  May be NULL, in which case the value specified for \fIflags\fR
 is ignored and the source buffer is assumed to contain the complete string to
 convert.
 .AP char *dst out
 Buffer in which the converted result will be stored.  No more than
 \fIdstLen\fR bytes will be stored in \fIdst\fR.
 .AP int dstLen in
 The maximum length of the output buffer \fIdst\fR in bytes.
 .AP int *srcReadPtr out
 Filled with the number of bytes from \fIsrc\fR that were actually
 converted.  This may be less than the original source length if there was
 a problem converting some source characters.  May be NULL.
 .AP int *dstWrotePtr out
 Filled with the number of bytes that were actually stored in the output
 buffer as a result of the conversion.  May be NULL.
 .AP int *dstCharsPtr out
 Filled with the number of characters that correspond to the number of bytes
 stored in the output buffer.  May be NULL.
 .AP Tcl_EncodingType *typePtr in
 Structure that defines a new type of encoding.
 .AP "CONST char" *path in
 A path to the location of the encoding file.
 .BE
 .SH INTRODUCTION
 .PP
 These routines convert between Tcl's internal character representation,
 UTF-8, and character representations used by various operating systems or
 file systems, such as Unicode, ASCII, or Shift-JIS.  When operating on
 strings, such as obtaining the names of files or displaying
 characters using international fonts, the strings must be translated into
 one or possibly multiple formats that the various system calls can use.  For
 instance, on a Japanese Unix workstation, a user might obtain a filename
 represented in the EUC-JP file encoding and then translate the characters to
 the jisx0208 font encoding in order to display the filename in a Tk widget.
 The purpose of the encoding package is to help bridge the translation gap.
 UTF-8 provides an intermediate staging ground for all the various
 encodings.  In the example above, text would be translated into UTF-8 from
 whatever file encoding the operating system is using.  Then it would be
 translated from UTF-8 into whatever font encoding the display routines
 require.
 .PP
 Some basic encodings are compiled into Tcl.  Others can be defined by the
 user or dynamically loaded from encoding files in a
 platform-independent manner.
 .SH DESCRIPTION
 .PP
 \fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR.  The name may
 refer to a builtin Tcl encoding, a user-defined encoding registered by
 calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding
 file.  The return value is a token that represents the encoding and can be
 used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR,
 \fBTcl_FreeEncoding\fR, and \fBTcl_UtfToExternal\fR.  If the name did not
 refer to any known or loadable encoding, NULL is returned and an error
 message is returned in \fIinterp\fR.
 .PP
 The encoding package maintains a database of all encodings currently in use.
 The first time \fIname\fR is seen, \fBTcl_GetEncoding\fR returns an
 encoding with a reference count of 1.  If the same \fIname\fR is requested
 further times, then the reference count for that encoding is incremented
 without the overhead of allocating a new encoding and all its associated
 data structures.
 .PP
 When an \fIencoding\fR is no longer needed, \fBTcl_FreeEncoding\fR
 should be called to release it.  When an \fIencoding\fR is no longer in use
 anywhere (i.e., it has been freed as many times as it has been gotten)
 \fBTcl_FreeEncoding\fR will release all storage the encoding was using
 and delete it from the database.
 .PP
 \fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
 specified \fIencoding\fR into UTF-8.  The converted bytes are stored in
 \fIdstPtr\fR, which is then NULL terminated.  The caller should eventually
 call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR.
 When converting, if any of the characters in the source buffer cannot be
 represented in the target encoding, a default fallback character will be
 used.  The return value is a pointer to the value stored in the DString.
 .PP
 \fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified
 \fIencoding\fR into UTF-8.  Up to \fIsrcLen\fR bytes are converted from the
 source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR.
 In all cases, \fI*srcReadPtr\fR is filled with the number of bytes that were
 successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR is filled with
 the corresponding number of bytes that were stored in \fIdst\fR.  The return
 value is one of the following:
 .RS
 .IP \fBTCL_OK\fR 29
 All bytes of \fIsrc\fR were converted.
 .IP \fBTCL_CONVERT_NOSPACE\fR 29
 The destination buffer was not large enough for all of the converted data; as
 many characters as could fit were converted though.
 .IP \fBTCL_CONVERT_MULTIBYTE\fR 29
 The last few bytes in the source buffer were the beginning of a multibyte
 sequence, but more bytes were needed to complete this sequence.  A
 subsequent call to the conversion routine should pass a buffer containing
 the unconverted bytes that remained in \fIsrc\fR plus some further bytes
 from the source stream to properly convert the formerly split-up multibyte
 sequence.
 .IP \fBTCL_CONVERT_SYNTAX\fR 29
 The source buffer contained an invalid character sequence.  This may occur
 if the input stream has been damaged or if the input encoding method was
 misidentified.
 .IP \fBTCL_CONVERT_UNKNOWN\fR 29
 The source buffer contained a character that could not be represented in
 the target encoding and TCL_ENCODING_STOPONERROR was specified.
 .RE
 .LP
 \fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8
 into the specified \fIencoding\fR.  The converted bytes are stored in
 \fIdstPtr\fR, which is then terminated with the appropriate encoding-specific
 NULL.  The caller should eventually call \fBTcl_DStringFree\fR to free any
 information stored in \fIdstPtr\fR.  When converting, if any of the
 characters in the source buffer cannot be represented in the target
 encoding, a default fallback character will be used.  The return value is
 a pointer to the value stored in the DString.
 .PP
 \fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into
 the specified \fIencoding\fR.  Up to \fIsrcLen\fR bytes are converted from
 the source buffer and up to \fIdstLen\fR converted bytes are stored in
 \fIdst\fR.  In all cases, \fI*srcReadPtr\fR is filled with the number of
 bytes that were successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR
 is filled with the corresponding number of bytes that were stored in
 \fIdst\fR.  The return values are the same as the return values for
 \fBTcl_ExternalToUtf\fR.
 .PP
 \fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are
 Windows-only convenience
 functions for converting between UTF-8 and Windows strings.  On Windows 95
 (as with the Macintosh and Unix operating systems),
 all strings exchanged between Tcl and the operating system are "char"
 based.  On Windows NT, some strings exchanged between Tcl and the
 operating system are "char" oriented while others are in Unicode.  By
 convention, in Windows a TCHAR is a character in the ANSI code page
 on Windows 95 and a Unicode character on Windows NT.
 .PP
 If you planned to use the same "char" based interfaces on both Windows
 95 and Windows NT, you could use \fBTcl_UtfToExternal\fR and
 \fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an
 encoding of NULL (the current system encoding).  On the other hand,
 if you planned to use the Unicode interface when running on Windows NT
 and the "char" interfaces when running on Windows 95, you would have
 to perform the following type of test over and over in your program
 (as represented in pseudo-code):
 .CS
 if (running NT) {
     encoding <- Tcl_GetEncoding("unicode");
     nativeBuffer <- Tcl_UtfToExternal(encoding, utfBuffer);
     Tcl_FreeEncoding(encoding);
 } else {
     nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
 }
 .CE
 \fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically
 handle this test and use the proper encoding based on the current
 operating system.  \fBTcl_WinUtfToTChar\fR returns a pointer to
 a TCHAR string, and \fBTcl_WinTCharToUtf\fR expects a TCHAR string
 pointer as the \fIsrc\fR string.  Otherwise, these functions
 behave identically to \fBTcl_UtfToExternalDString\fR and
 \fBTcl_ExternalToUtfDString\fR.
 .PP
 \fBTcl_GetEncodingName\fR is roughly the inverse of \fBTcl_GetEncoding\fR.
 Given an \fIencoding\fR, the return value is the \fIname\fR argument that
 was used to create the encoding.  The string returned by
 \fBTcl_GetEncodingName\fR is only guaranteed to persist until the
 \fIencoding\fR is deleted.  The caller must not modify this string.
 .PP
 \fBTcl_SetSystemEncoding\fR sets the default encoding that should be used
 whenever the user passes a NULL value for the \fIencoding\fR argument to
 any of the other encoding functions.  If \fIname\fR is NULL, the system
 encoding is reset to the default system encoding, \fBbinary\fR.  If the
 name did not refer to any known or loadable encoding, TCL_ERROR is
 returned and an error message is left in \fIinterp\fR.  Otherwise, this
 procedure increments the reference count of the new system encoding,
 decrements the reference count of the old system encoding, and returns
 TCL_OK.
 .PP
 \fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
 consisting of the names of all the encodings that are currently defined
 or can be dynamically loaded, searching the encoding path specified by
 \fBTcl_SetDefaultEncodingDir\fR.  This procedure does not ensure that the
 dynamically-loadable encoding files contain valid data, but merely that they
 exist.
 .PP
 \fBTcl_CreateEncoding\fR defines a new encoding and registers the C
 procedures that are called back to convert between the encoding and
 UTF-8.  Encodings created by \fBTcl_CreateEncoding\fR are thereafter
 visible in the database used by \fBTcl_GetEncoding\fR.  Just as with the
 \fBTcl_GetEncoding\fR procedure, the return value is a token that
 represents the encoding and can be used in subsequent calls to other
 encoding functions.  \fBTcl_CreateEncoding\fR returns an encoding with a
 reference count of 1. If an encoding with the specified \fIname\fR
 already exists, then its entry in the database is replaced with the new
 encoding; the token for the old encoding will remain valid and continue
 to behave as before, but users of the new token will now call the new
 encoding procedures.
 .PP
 The \fItypePtr\fR argument to \fBTcl_CreateEncoding\fR contains information
 about the name of the encoding and the procedures that will be called to
 convert between this encoding and UTF-8.  It is defined as follows:
 .PP
 .CS
 typedef struct Tcl_EncodingType {
 	CONST char *\fIencodingName\fR;
 	Tcl_EncodingConvertProc *\fItoUtfProc\fR;
 	Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
 	Tcl_EncodingFreeProc *\fIfreeProc\fR;
 	ClientData \fIclientData\fR;
 	int \fInullSize\fR;
 } Tcl_EncodingType;
 .CE
 .PP
 The \fIencodingName\fR provides a string name for the encoding, by
 which it can be referred to in other procedures such as
 \fBTcl_GetEncoding\fR.  The \fItoUtfProc\fR refers to a callback
 procedure to invoke to convert text from this encoding into UTF-8.
 The \fIfromUtfProc\fR refers to a callback procedure to invoke to
 convert text from UTF-8 into this encoding.  The \fIfreeProc\fR refers
 to a callback procedure to invoke when this encoding is deleted.  The
 \fIfreeProc\fR field may be NULL.  The \fIclientData\fR contains an
 arbitrary one-word value passed to \fItoUtfProc\fR, \fIfromUtfProc\fR,
 and \fIfreeProc\fR whenever they are called.  Typically, this is a
 pointer to a data structure containing encoding-specific information
 that can be used by the callback procedures.  For instance, two very
 similar encodings such as \fBascii\fR and \fBmacRoman\fR may use the
 same callback procedure, but use different values of \fIclientData\fR
 to control its behavior.  The \fInullSize\fR specifies the number of
 zero bytes that signify end-of-string in this encoding.  It must be
 \fB1\fR (for single-byte or multi-byte encodings like ASCII or
 Shift-JIS) or \fB2\fR (for double-byte encodings like Unicode).
 Constant-sized encodings with 3 or more bytes per character (such as
 CNS11643) are not accepted.
 .PP
 The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
 type \fBTcl_EncodingConvertProc\fR:
 .PP
 .CS
 typedef int Tcl_EncodingConvertProc(
 	ClientData \fIclientData\fR,
 	CONST char *\fIsrc\fR,
 	int \fIsrcLen\fR,
 	int \fIflags\fR,
 	Tcl_EncodingState *\fIstatePtr\fR,
 	char *\fIdst\fR,
 	int \fIdstLen\fR,
 	int *\fIsrcReadPtr\fR,
 	int *\fIdstWrotePtr\fR,
 	int *\fIdstCharsPtr\fR);
 .CE
 .PP
 The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the
 \fBTcl_ExternalToUtf\fR or \fBTcl_UtfToExternal\fR family of functions to
 perform the actual conversion.  The \fIclientData\fR parameter to these
 procedures is the same as the \fIclientData\fR field specified to
 \fBTcl_CreateEncoding\fR when the encoding was created.  The remaining
 arguments to the callback procedures are the same as the arguments,
 documented at the top, to \fBTcl_ExternalToUtf\fR or
 \fBTcl_UtfToExternal\fR, with the following exceptions.  If the
 \fIsrcLen\fR argument to one of those high-level functions is negative,
 the value passed to the callback procedure will be the appropriate
 encoding-specific string length of \fIsrc\fR.  If any of the \fIsrcReadPtr\fR,
 \fIdstWrotePtr\fR, or \fIdstCharsPtr\fR arguments to one of the high-level
 functions is NULL, the corresponding value passed to the callback
 procedure will be a non-NULL location.
 .PP
 The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
 \fBTcl_EncodingFreeProc\fR:
 .CS
 typedef void Tcl_EncodingFreeProc(
 	ClientData \fIclientData\fR);
 .CE
 .PP
 This \fIfreeProc\fR function is called when the encoding is deleted.  The
 \fIclientData\fR parameter is the same as the \fIclientData\fR field
 specified to \fBTcl_CreateEncoding\fR when the encoding was created.
 .PP

 \fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
 access and set the directory to use when locating the default encoding
 files.  If this value is not NULL, the \fBTclpInitLibraryPath\fR routine
 adds the path to the head of the search path, and uses this path as
 the first place to look when trying to locate the encoding file.

 .SH "ENCODING FILES"
 Space would prohibit precompiling into Tcl every possible encoding
 algorithm, so many encodings are stored on disk as dynamically-loadable
 encoding files.  This behavior also allows the user to create additional
 encoding files that can be loaded using the same mechanism.  These
 encoding files contain information about the tables and/or escape
 sequences used to map between an external encoding and Unicode.  The
 external encoding may consist of single-byte, multi-byte, or double-byte
 characters.
 .PP
 Each dynamically-loadable encoding is represented as a text file.  The
 initial line of the file, beginning with a ``#'' symbol, is a comment
 that provides a human-readable description of the file.  The next line
 identifies the type of encoding file.  It can be one of the following
 letters:
 .IP "[1]   \fBS\fR"
 A single-byte encoding, where one character is always one byte long in the
 encoding.  An example is \fBiso8859-1\fR, used by many European languages.
 .IP "[2]   \fBD\fR"
 A double-byte encoding, where one character is always two bytes long in the
 encoding.  An example is \fBbig5\fR, used for Chinese text.
 .IP "[3]   \fBM\fR"
 A multi-byte encoding, where one character may be either one or two bytes long.
 Certain bytes are lead bytes, indicating that another byte must follow
 and that together the two bytes represent one character.  Other bytes are not
 lead bytes and represent themselves.  An example is \fBshiftjis\fR, used by
 many Japanese computers.
 .IP "[4]   \fBE\fR"
 An escape-sequence encoding, specifying that certain sequences of bytes
 do not represent characters, but commands that describe how following bytes
 should be interpreted.
 .PP
 The rest of the lines in the file depend on the type.
 .PP
 Cases [1], [2], and [3] are collectively referred to as table-based encoding
 files.  The lines in a table-based encoding file are in the same
 format as this example taken from the \fBshiftjis\fR encoding (this is not
 the complete file):
 .CS
 # Encoding file: shiftjis, multi-byte
 M
 003F 0 40
 00
 0000000100020003000400050006000700080009000A000B000C000D000E000F
 0010001100120013001400150016001700180019001A001B001C001D001E001F
 0020002100220023002400250026002700280029002A002B002C002D002E002F
 0030003100320033003400350036003700380039003A003B003C003D003E003F
 0040004100420043004400450046004700480049004A004B004C004D004E004F
 0050005100520053005400550056005700580059005A005B005C005D005E005F
 0060006100620063006400650066006700680069006A006B006C006D006E006F
 0070007100720073007400750076007700780079007A007B007C007D203E007F
 0080000000000000000000000000000000000000000000000000000000000000
 0000000000000000000000000000000000000000000000000000000000000000
 0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
 FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
 FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
 FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
 0000000000000000000000000000000000000000000000000000000000000000
 0000000000000000000000000000000000000000000000000000000000000000
 81
 0000000000000000000000000000000000000000000000000000000000000000
 0000000000000000000000000000000000000000000000000000000000000000
 0000000000000000000000000000000000000000000000000000000000000000
 0000000000000000000000000000000000000000000000000000000000000000
 300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
 FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
 301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
 FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
 00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
 FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
 25A125A025B325B225BD25BC203B301221922190219121933013000000000000
 000000000000000000000000000000002208220B2286228722822283222A2229
 000000000000000000000000000000002227222800AC21D221D4220022030000
 0000000000000000000000000000000000000000222022A52312220222072261
 2252226A226B221A223D221D2235222B222C0000000000000000000000000000
 212B2030266F266D266A2020202100B6000000000000000025EF000000000000
 .CE
 .PP
 The third line of the file is three numbers.  The first number is the
 fallback character (in base 16) to use when converting from UTF-8 to this
 encoding.  The second number is a \fB1\fR if this file represents the
 encoding for a symbol font, or \fB0\fR otherwise.  The last number (in base
 10) is how many pages of data follow.
 .PP
 Subsequent lines in the example above are pages that describe how to map
 from the encoding into 2-byte Unicode.  The first line in a page identifies
 the page number.  Following it are 256 double-byte numbers, arranged as 16
 rows of 16 numbers.  Given a character in the encoding, the high byte of
 that character is used to select which page, and the low byte of that
 character is used as an index to select one of the double-byte numbers in
 that page \- the value obtained being the corresponding Unicode character.
 By examination of the example above, one can see that the characters 0x7E
 and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
 .PP
 Following the first page will be all the other pages, each in the same
 format as the first: one number identifying the page followed by 256
 double-byte Unicode characters.  If a character in the encoding maps to the
 Unicode character 0000, it means that the character doesn't actually exist.
 If all characters on a page would map to 0000, that page can be omitted.
 .PP
 Case [4] is the escape-sequence encoding file.  The lines in this type of
 file are in the same format as this example taken from the \fBiso2022-jp\fR
 encoding:
 .CS
 .ta 1.5i
 # Encoding file: iso2022-jp, escape-driven
 E
 init		{}
 final		{}
 iso8859-1	\\x1b(B
 jis0201		\\x1b(J
 jis0208		\\x1b$@
 jis0208		\\x1b$B
 jis0212		\\x1b$(D
 gb2312		\\x1b$A
 ksc5601		\\x1b$(C
 .CE
 .PP
 In the file, the first column represents an option and the second column
 is the associated value.  \fBinit\fR is a string to emit or expect before
 the first character is converted, while \fBfinal\fR is a string to emit
 or expect after the last character.  All other options are names of
 table-based encodings; the associated value is the escape-sequence that
 marks that encoding.  Tcl syntax is used for the values; in the above
 example, for instance, ``\fB{}\fR'' represents the empty string and
 ``\fB\\x1b\fR'' represents character 27.
 .PP
 When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
 been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
 from the \fBencoding\fR subdirectory of each directory specified in the
 library path \fB$tcl_libPath\fR.  If the encoding file exists, but is
 malformed, an error message will be left in \fIinterp\fR.
 .SH KEYWORDS
 utf, encoding, convert