| '\" |
| '\" Copyright (c) 1997 Sun Microsystems, Inc. |
| '\" |
| '\" See the file "license.terms" for information on usage and redistribution |
| '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
| '\" |
| '\" RCS: @(#) $Id: Utf.3,v 1.13 2002/07/01 18:24:39 jenglish Exp $ |
| '\" |
| '\" The definitions below are for supplemental macros used in Tcl/Tk |
| '\" manual entries. |
| '\" |
| '\" .AP type name in/out ?indent? |
| '\" Start paragraph describing an argument to a library procedure. |
| '\" type is type of argument (int, etc.), in/out is either "in", "out", |
| '\" or "in/out" to describe whether procedure reads or modifies arg, |
| '\" and indent is equivalent to second arg of .IP (shouldn't ever be |
| '\" needed; use .AS below instead) |
| '\" |
| '\" .AS ?type? ?name? |
| '\" Give maximum sizes of arguments for setting tab stops. Type and |
| '\" name are examples of largest possible arguments that will be passed |
| '\" to .AP later. If args are omitted, default tab stops are used. |
| '\" |
| '\" .BS |
| '\" Start box enclosure. From here until next .BE, everything will be |
| '\" enclosed in one large box. |
| '\" |
| '\" .BE |
| '\" End of box enclosure. |
| '\" |
| '\" .CS |
| '\" Begin code excerpt. |
| '\" |
| '\" .CE |
| '\" End code excerpt. |
| '\" |
| '\" .VS ?version? ?br? |
| '\" Begin vertical sidebar, for use in marking newly-changed parts |
| '\" of man pages. The first argument is ignored and used for recording |
| '\" the version when the .VS was added, so that the sidebars can be |
| '\" found and removed when they reach a certain age. If another argument |
| '\" is present, then a line break is forced before starting the sidebar. |
| '\" |
| '\" .VE |
| '\" End of vertical sidebar. |
| '\" |
| '\" .DS |
| '\" Begin an indented unfilled display. |
| '\" |
| '\" .DE |
| '\" End of indented unfilled display. |
| '\" |
| '\" .SO |
| '\" Start of list of standard options for a Tk widget. The |
| '\" options follow on successive lines, in four columns separated |
| '\" by tabs. |
| '\" |
| '\" .SE |
| '\" End of list of standard options for a Tk widget. |
| '\" |
| '\" .OP cmdName dbName dbClass |
| '\" Start of description of a specific option. cmdName gives the |
| '\" option's name as specified in the class command, dbName gives |
| '\" the option's name in the option database, and dbClass gives |
| '\" the option's class in the option database. |
| '\" |
| '\" .UL arg1 arg2 |
| '\" Print arg1 underlined, then print arg2 normally. |
| '\" |
| '\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $ |
| '\" |
| '\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages. |
| '\" # In troff mode only, plant a trap 1.3i above the bottom of the page that |
| '\" # runs the ^B macro defined later in this file. |
| .if t .wh -1.3i ^B |
| '\" # Save the current line length in register ^l; BE, VE and ^B read it when |
| '\" # sizing and positioning the rules they draw. |
| .nr ^l \n(.l |
| '\" # Adjust filled text at both margins. |
| .ad b |
| '\" # Start an argument description |
| '\" # Usage: .AP type name in/out ?indent? |
| '\" # Opens a .TP tagged paragraph.  The indent is $4 when given; otherwise it is |
| '\" # register )C when a name ($2) is given, and 15 when it is not.  Tab stops are |
| '\" # then taken from registers )A and )B (both are set by .AS). |
| '\" # The tag line is "type name (in/out)" when $3 is non-empty.  When $3 is empty |
| '\" # a break is forced first and the tag is "type name", or just $1 in italics |
| '\" # when $2 is empty as well. |
| '\" # NOTE(review): the fields on the tag lines are presumably meant to be separated |
| '\" # by tabs so that the .ta stops apply -- confirm against the upstream macros. |
| .de AP |
| .ie !"\\$4"" .TP \\$4 |
| .el \{\ |
| . ie !"\\$2"" .TP \\n()Cu |
| . el .TP 15 |
| .\} |
| .ta \\n()Au \\n()Bu |
| .ie !"\\$3"" \{\ |
| \&\\$1 \\fI\\$2\\fP (\\$3) |
| .\".b |
| .\} |
| .el \{\ |
| .br |
| .ie !"\\$2"" \{\ |
| \&\\$1 \\fI\\$2\\fP |
| .\} |
| .el \{\ |
| \&\\fI\\$1\\fP |
| .\} |
| .\} |
| .. |
| '\" # define tabbing values for .AP |
| '\" # Usage: .AS ?type? ?name? |
| '\" # Register )A = width of $1 plus 3n, or 10n when $1 is omitted. |
| '\" # Register )B = )A plus the width of $2 plus 3n, or )A plus 15n when $2 is omitted. |
| '\" # Register )C = )B plus the width of "(in/out)" plus 2n; .AP uses it as the |
| '\" # paragraph indent when an argument name is given. |
| .de AS |
| .nr )A 10n |
| .if !"\\$1"" .nr )A \\w'\\$1'u+3n |
| .nr )B \\n()Au+15n |
| .\" |
| .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n |
| .nr )C \\n()Bu+\\w'(in/out)'u+2n |
| .. |
| '\" # Establish default tab values.  .AS reads only its first two arguments, so |
| '\" # the trailing "in/out" has no effect here. |
| .AS Tcl_Interp Tcl_CreateInterp in/out |
| '\" # BS - start boxed text |
| '\" # ^y = starting y location |
| '\" # ^b = 1 |
| '\" # Forces a line break, records the current vertical position in ^y and sets |
| '\" # ^b to 1 (BE and ^B test ^b to decide what to draw). |
| '\" # In nroff mode it additionally prints an underrule as long as the current |
| '\" # line length, in no-fill mode with a zero temporary indent, then returns |
| '\" # to fill mode. |
| .de BS |
| .br |
| .mk ^y |
| .nr ^b 1u |
| .if n .nf |
| .if n .ti 0 |
| .if n \l'\\n(.lu\(ul' |
| .if n .fi |
| .. |
| '\" # BE - end boxed text (draw box now) |
| '\" # Switches to no-fill mode with a zero temporary indent and records the |
| '\" # current vertical position in ^t, then draws the box: |
| '\" #   nroff: a single underrule of length ^l. |
| '\" #   troff: all four sides when ^b is 1; for any other value of ^b (the ^B |
| '\" #          page-bottom macro sets it to 2) the top rule is replaced by a |
| '\" #          plain horizontal motion, so no top edge is drawn. |
| '\" # Afterwards fill mode is restored, a break is forced and ^b is reset to 0. |
| '\" # The inner else branch opens its own conditional block so that every |
| '\" # block opener in this macro is matched by exactly one closing line. |
| .de BE |
| .nf |
| .ti 0 |
| .mk ^t |
| .ie n \l'\\n(^lu\(ul' |
| .el \{\ |
| .\" Draw four-sided box normally, but don't draw top of |
| .\" box if the box started on an earlier page. |
| .ie !\\n(^b-1 \{\ |
| \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' |
| .\} |
| .el \{\ |
| \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' |
| .\} |
| .\} |
| .fi |
| .br |
| .nr ^b 0 |
| .. |
| '\" # VS - start vertical sidebar |
| '\" # ^Y = starting y location |
| '\" # ^v = 1 (for troff; for nroff this doesn't matter) |
| '\" # Usage: .VS ?version? ?br? |
| '\" # $1 is never read.  A non-empty $2 forces a line break first.  The current |
| '\" # vertical position is then recorded in ^Y. |
| '\" # nroff: turns on a margin character (the box-rule glyph at point size 12). |
| '\" # troff: sets ^v to 1, which the ^B page-bottom macro tests. |
| .de VS |
| .if !"\\$2"" .br |
| .mk ^Y |
| .ie n 'mc \s12\(br\s0 |
| .el .nr ^v 1u |
| .. |
| '\" # VE - end of vertical sidebar |
| '\" # nroff: issues the margin-character request with no argument, which turns |
| '\" #        the margin character off. |
| '\" # troff: works in environment 2 with no-fill and a zero temporary indent. |
| '\" #        It records the current vertical position in ^t, moves to the |
| '\" #        horizontal position ^l plus 3n, draws a vertical line to one line |
| '\" #        above ^Y, moves back, and backs up one line so the drawing takes |
| '\" #        no vertical space.  Fill mode and the previous environment are |
| '\" #        then restored. |
| '\" # In both modes ^v is reset to 0 at the end. |
| .de VE |
| .ie n 'mc |
| .el \{\ |
| .ev 2 |
| .nf |
| .ti 0 |
| .mk ^t |
| \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n' |
| .sp -1 |
| .fi |
| .ev |
| .\} |
| .nr ^v 0 |
| .. |
| '\" # Special macro to handle page bottom: finish off current |
| '\" # box/sidebar if in box/sidebar mode, then invoke standard |
| '\" # page bottom macro. |
| '\" # Steps, all performed in environment 2 with no-fill and zero indent: |
| '\" #   1. Record the current vertical position in ^t. |
| '\" #   2. If ^b is non-zero, draw the sides of the open box (with the top rule |
| '\" #      only when ^b is 1). |
| '\" #   3. If ^v is non-zero, store the sidebar height in ^x and draw the bar |
| '\" #      at horizontal position ^l plus 3n. |
| '\" #   4. Begin a new page, restore fill mode and the previous environment. |
| '\" #   5. For a box that is still open, re-mark ^y on the new page and set ^b |
| '\" #      to 2; for an open sidebar, re-mark ^Y. |
| .de ^B |
| .ev 2 |
| 'ti 0 |
| 'nf |
| .mk ^t |
| .if \\n(^b \{\ |
| .\" Draw three-sided box if this is the box's first page, |
| .\" draw two sides but no top otherwise. |
| .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c |
| .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c |
| .\} |
| .if \\n(^v \{\ |
| .nr ^x \\n(^tu+1v-\\n(^Yu |
| \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c |
| .\} |
| .bp |
| 'fi |
| .ev |
| .if \\n(^b \{\ |
| .mk ^y |
| .nr ^b 2 |
| .\} |
| .if \\n(^v \{\ |
| .mk ^Y |
| .\} |
| .. |
| '\" # DS - begin display |
| '\" # Calls .RS (not defined in this file; presumably the man package's |
| '\" # relative-indent macro), switches to no-fill mode and leaves one blank line. |
| '\" # Paired with .DE. |
| .de DS |
| .RS |
| .nf |
| .sp |
| .. |
| '\" # DE - end display |
| '\" # Undoes .DS in reverse order: restores fill mode, calls .RE (not defined in |
| '\" # this file; presumably the man package's counterpart of .RS) and leaves |
| '\" # one blank line. |
| .de DE |
| .fi |
| .RE |
| .sp |
| .. |
| '\" # SO - start of list of standard options |
| '\" # Emits a "STANDARD OPTIONS" section heading, starts a paragraph with .LP, |
| '\" # switches to no-fill mode, sets tab stops at 5.5c and 11c and selects the |
| '\" # bold font.  The option names follow, and .SE ends the list. |
| '\" # NOTE(review): the macro summary at the top of this file speaks of four |
| '\" # columns, but only two tab stops are set here -- confirm the intended count. |
| .de SO |
| .SH "STANDARD OPTIONS" |
| .LP |
| .nf |
| .ta 5.5c 11c |
| .ft B |
| .. |
| '\" # SE - end of list of standard options |
| '\" # Restores fill mode and the roman font, starts a new paragraph with .LP and |
| '\" # prints a fixed sentence pointing the reader to the "options" manual entry. |
| .de SE |
| .fi |
| .ft R |
| .LP |
| See the \\fBoptions\\fR manual entry for details on the standard options. |
| .. |
| '\" # OP - start of full description for a single option |
| '\" # Usage: .OP cmdName dbName dbClass |
| '\" # Starts a paragraph with .LP and prints three unfilled lines giving the |
| '\" # command-line name ($1), database name ($2) and database class ($3), each |
| '\" # value in bold, with a tab stop set at 4c.  It then restores fill mode and |
| '\" # opens an indented paragraph (.IP) for the description that follows. |
| '\" # NOTE(review): the label and value on each line are presumably meant to be |
| '\" # separated by a tab so that the 4c stop applies -- confirm against upstream. |
| .de OP |
| .LP |
| .nf |
| .ta 4c |
| Command-Line Name: \\fB\\$1\\fR |
| Database Name: \\fB\\$2\\fR |
| Database Class: \\fB\\$3\\fR |
| .fi |
| .IP |
| .. |
| '\" # CS - begin code excerpt |
| '\" # Calls .RS (not defined in this file; presumably the man package's |
| '\" # relative-indent macro), switches to no-fill mode and sets tab stops every |
| '\" # quarter inch up to 1i.  Paired with .CE. |
| .de CS |
| .RS |
| .nf |
| .ta .25i .5i .75i 1i |
| .. |
| '\" # CE - end code excerpt |
| '\" # Restores fill mode and calls .RE (not defined in this file; presumably the |
| '\" # man package's counterpart of .RS).  The tab stops set by .CS are left |
| '\" # unchanged. |
| .de CE |
| .fi |
| .RE |
| .. |
| '\" # UL - underline a word |
| '\" # Usage: .UL arg1 arg2 |
| '\" # Prints arg1, draws an underrule from the current position back to the |
| '\" # left margin beneath it, then prints arg2 directly after with no underline. |
| .de UL |
| \\$1\l'|0\(ul'\\$2 |
| .. |
| .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" |
| .BS |
| .SH NAME |
| Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings. |
| .SH SYNOPSIS |
| .nf |
| \fB#include <tcl.h>\fR |
| .sp |
| typedef ... Tcl_UniChar; |
| .sp |
| int |
| \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) |
| .sp |
| int |
| \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR) |
| .VS 8.4 |
| .sp |
| char * |
| \fBTcl_UniCharToUtfDString\fR(\fIuniStr, numChars, dstPtr\fR) |
| .sp |
| Tcl_UniChar * |
| \fBTcl_UtfToUniCharDString\fR(\fIsrc, len, dstPtr\fR) |
| .VE 8.4 |
| .sp |
| int |
| \fBTcl_UniCharLen\fR(\fIuniStr\fR) |
| .sp |
| int |
| \fBTcl_UniCharNcmp\fR(\fIuniStr, uniStr, num\fR) |
| .VS 8.4 |
| .sp |
| int |
| \fBTcl_UniCharNcasecmp\fR(\fIuniStr, uniStr, num\fR) |
| .sp |
| int |
| \fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR) |
| .VE 8.4 |
| .sp |
| int |
| \fBTcl_UtfNcmp\fR(\fIsrc, src, num\fR) |
| .sp |
| int |
| \fBTcl_UtfNcasecmp\fR(\fIsrc, src, num\fR) |
| .sp |
| int |
| \fBTcl_UtfCharComplete\fR(\fIsrc, len\fR) |
| .sp |
| int |
| \fBTcl_NumUtfChars\fR(\fIsrc, len\fR) |
| .VS 8.4 |
| .sp |
| CONST char * |
| \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR) |
| .sp |
| CONST char * |
| \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR) |
| .sp |
| CONST char * |
| \fBTcl_UtfNext\fR(\fIsrc\fR) |
| .sp |
| CONST char * |
| \fBTcl_UtfPrev\fR(\fIsrc, start\fR) |
| .VE 8.4 |
| .sp |
| Tcl_UniChar |
| \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) |
| .VS 8.4 |
| .sp |
| CONST char * |
| \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR) |
| .VE 8.4 |
| .sp |
| int |
| \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR) |
| .SH ARGUMENTS |
| .AS "CONST Tcl_UniChar" numChars in/out |
| .AP char *buf out |
| Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most |
| TCL_UTF_MAX bytes are stored in the buffer. |
| .AP int ch in |
| The Tcl_UniChar to be converted or examined. |
| .AP Tcl_UniChar *chPtr out |
| Filled with the Tcl_UniChar represented by the head of the UTF-8 string. |
| .AP "CONST char" *src in |
| Pointer to a UTF-8 string. |
| .AP "CONST Tcl_UniChar" *uniStr in |
| A NULL-terminated Unicode string. |
| .AP "CONST Tcl_UniChar" *uniPattern in |
| A NULL-terminated Unicode string. |
| .AP int len in |
| The length of the UTF-8 string in bytes (not UTF-8 characters). If |
| negative, all bytes up to the first null byte are used. |
| .AP int numChars in |
| The length of the Unicode string in characters. Must be greater than or |
| equal to 0. |
| .AP "Tcl_DString" *dstPtr in/out |
| A pointer to a previously-initialized \fBTcl_DString\fR. |
| .AP "unsigned long" num in |
| The number of characters to compare. |
| .AP "CONST char" *start in |
| Pointer to the beginning of a UTF-8 string. |
| .AP int index in |
| The index of a character (not byte) in the UTF-8 string. |
| .AP int *readPtr out |
| If non-NULL, filled with the number of bytes in the backslash sequence, |
| including the backslash character. |
| .AP char *dst out |
| Buffer in which the bytes represented by the backslash sequence are stored. |
| At most TCL_UTF_MAX bytes are stored in the buffer. |
| .VS 8.4 |
| .AP int nocase in |
| Specifies whether the match should be done case-sensitive (0) or |
| case-insensitive (1). |
| .VE 8.4 |
| .BE |
| |
| .SH DESCRIPTION |
| .PP |
| These routines convert between UTF-8 strings and Tcl_UniChars. A |
| Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size |
| quantity. A UTF-8 character is a Unicode character represented as |
| a varying-length sequence of up to TCL_UTF_MAX bytes. A multibyte UTF-8 |
| sequence consists of a lead byte followed by some number of trail bytes. |
| .PP |
| \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to |
| represent one Unicode character in the UTF-8 representation. |
| .PP |
| \fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string |
| in the buffer starting at \fIbuf\fR. The return value is the number of bytes stored |
| in \fIbuf\fR. |
| .PP |
| \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR |
| and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the |
| number of bytes read from \fIsrc\fR. The caller must ensure that the |
| source buffer is long enough such that this routine does not run off the |
| end and dereference non-existent or random memory; if the source buffer |
| is known to be null terminated, this will not happen. If the input is |
| not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first |
| byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and |
| 0x00ff and return 1. |
| .PP |
| \fBTcl_UniCharToUtfDString\fR converts the given Unicode string |
| to UTF-8, storing the result in a previously-initialized \fBTcl_DString\fR. |
| You must specify the length of the given Unicode string. |
| The return value is a pointer to the UTF-8 representation of the |
| Unicode string. Storage for the return value is appended to the |
| end of the \fBTcl_DString\fR. |
| .PP |
| \fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode, |
| storing the result in the previously-initialized \fBTcl_DString\fR. |
| You may either specify the length of the given UTF-8 string or "-1", |
| in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to |
| calculate the length. The return value is a pointer to the Unicode |
| representation of the UTF-8 string. Storage for the return value |
| is appended to the end of the \fBTcl_DString\fR. The Unicode string |
| is terminated with a Unicode NULL character. |
| .PP |
| \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode |
| characters. It accepts a NULL-terminated Unicode string and returns |
| the number of Unicode characters (not bytes) in that string. |
| .PP |
| \fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to |
| \fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters. |
| They accept two NULL-terminated Unicode strings and the number of characters |
| to compare. Both strings are assumed to be at least \fInum\fR characters |
| long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character |
| according to the Unicode character ordering. It returns an integer greater |
| than, equal to, or less than 0 if the first string is greater than, equal |
| to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR |
| is the Unicode case insensitive version. |
| .PP |
| .VS 8.4 |
| \fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to |
| \fBTcl_StringCaseMatch\fR. It accepts a NULL-terminated Unicode string, |
| a Unicode pattern, and a boolean value specifying whether the match should |
| be case insensitive, and returns whether the string matches the pattern. |
| .VE 8.4 |
| .PP |
| \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It |
| accepts two NULL-terminated UTF-8 strings and the number of characters |
| to compare. (Both strings are assumed to be at least \fInum\fR |
| characters long.) \fBTcl_UtfNcmp\fR compares the two strings |
| character-by-character according to the Unicode character ordering. |
| It returns an integer greater than, equal to, or less than 0 if the |
| first string is greater than, equal to, or less than the second string |
| respectively. |
| .PP |
| \fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8 |
| strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore |
| differences in case when comparing upper, lower or title case |
| characters. |
| .PP |
| \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR |
| of length \fIlen\fR bytes is long enough to be decoded by |
| \fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee |
| that the UTF-8 string is properly formed. This routine is used by |
| procedures that are operating on a byte at a time and need to know if a |
| full Tcl_UniChar has been seen. |
| .PP |
| \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It |
| returns the number of Tcl_UniChars that are represented by the UTF-8 string |
| \fIsrc\fR. The length of the source string is \fIlen\fR bytes. If the |
| length is negative, all bytes up to the first NULL byte are used. |
| .PP |
| \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It |
| returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR |
| in the NULL-terminated UTF-8 string \fIsrc\fR. The NULL terminator is |
| considered part of the UTF-8 string. |
| .PP |
| \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It |
| returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR |
| in the NULL-terminated UTF-8 string \fIsrc\fR. The NULL terminator is |
| considered part of the UTF-8 string. |
| .PP |
| Given \fIsrc\fR, a pointer to some location in a UTF-8 string, |
| \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the |
| string. The caller must not ask for the next character after the last |
| character in the string. |
| .PP |
| Given \fIsrc\fR, a pointer to some location in a UTF-8 string, |
| \fBTcl_UtfPrev\fR returns a pointer to the previous UTF-8 character in the |
| string. This function will not back up to a position before \fIstart\fR, |
| the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the |
| return value will be \fIstart\fR. |
| .PP |
| \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the |
| Pascal Ord() function. It returns the Tcl_UniChar represented at the |
| specified character (not byte) \fIindex\fR in the UTF-8 string |
| \fIsrc\fR. The source string must contain at least \fIindex\fR |
| characters. Behavior is undefined if a negative \fIindex\fR is given. |
| .PP |
| \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not |
| byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must |
| contain at least \fIindex\fR characters. This is equivalent to calling |
| \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, |
| the return pointer points to the first character in the source string. |
| .PP |
| \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl |
| commands. It parses a backslash sequence and stores the properly formed |
| UTF-8 character represented by the backslash sequence in the output |
| buffer \fIdst\fR. At most TCL_UTF_MAX bytes are stored in the buffer. |
| \fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number |
| of bytes in the backslash sequence, including the backslash character. |
| The return value is the number of bytes stored in the output buffer. |
| .PP |
| See the \fBTcl\fR manual entry for information on the valid backslash |
| sequences. All of the sequences described in the Tcl manual entry are |
| supported by \fBTcl_UtfBackslash\fR. |
| |
| .SH KEYWORDS |
| utf, unicode, backslash |