| '\" |
| '\" Copyright (c) 1998 Sun Microsystems, Inc. |
| '\" Copyright (c) 1999 Scriptics Corporation |
| '\" |
| '\" See the file "license.terms" for information on usage and redistribution |
| '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
| '\" |
| '\" RCS: @(#) $Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp $ |
| '\" |
| '\" The definitions below are for supplemental macros used in Tcl/Tk |
| '\" manual entries. |
| '\" |
| '\" .AP type name in/out ?indent? |
| '\" Start paragraph describing an argument to a library procedure. |
| '\" type is type of argument (int, etc.), in/out is either "in", "out", |
| '\" or "in/out" to describe whether procedure reads or modifies arg, |
| '\" and indent is equivalent to second arg of .IP (shouldn't ever be |
| '\" needed; use .AS below instead) |
| '\" |
| '\" .AS ?type? ?name? |
| '\" Give maximum sizes of arguments for setting tab stops. Type and |
| '\" name are examples of largest possible arguments that will be passed |
| '\" to .AP later. If args are omitted, default tab stops are used. |
| '\" |
| '\" .BS |
| '\" Start box enclosure. From here until next .BE, everything will be |
| '\" enclosed in one large box. |
| '\" |
| '\" .BE |
| '\" End of box enclosure. |
| '\" |
| '\" .CS |
| '\" Begin code excerpt. |
| '\" |
| '\" .CE |
| '\" End code excerpt. |
| '\" |
| '\" .VS ?version? ?br? |
| '\" Begin vertical sidebar, for use in marking newly-changed parts |
| '\" of man pages. The first argument is ignored and used for recording |
| '\" the version when the .VS was added, so that the sidebars can be |
| '\" found and removed when they reach a certain age. If another argument |
| '\" is present, then a line break is forced before starting the sidebar. |
| '\" |
| '\" .VE |
| '\" End of vertical sidebar. |
| '\" |
| '\" .DS |
| '\" Begin an indented unfilled display. |
| '\" |
| '\" .DE |
| '\" End of indented unfilled display. |
| '\" |
| '\" .SO |
| '\" Start of list of standard options for a Tk widget. The |
| '\" options follow on successive lines, in three columns separated |
| '\" by tabs. |
| '\" |
| '\" .SE |
| '\" End of list of standard options for a Tk widget. |
| '\" |
| '\" .OP cmdName dbName dbClass |
| '\" Start of description of a specific option. cmdName gives the |
| '\" option's name as specified in the class command, dbName gives |
| '\" the option's name in the option database, and dbClass gives |
| '\" the option's class in the option database. |
| '\" |
| '\" .UL arg1 arg2 |
| '\" Print arg1 underlined, then print arg2 normally. |
| '\" |
| '\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $ |
| '\" |
| '\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages. |
| '\" # In troff only, plant a trap 1.3i above the page bottom that runs ^B. |
| .if t .wh -1.3i ^B |
| '\" # Save the current line length in ^l; BE, VE and ^B read it when drawing. |
| .nr ^l \n(.l |
| '\" # Adjust text to both margins. |
| .ad b |
| '\" # Start an argument description |
| '\" # Usage: .AP type name in/out ?indent? |
| '\" # The tag indent is the fourth argument when given, else register )C |
| '\" # when a name is given, else 15. Tab stops come from registers )A and )B |
| '\" # (all three registers are set by .AS). |
| '\" # With three arguments the line reads: type, name in italics, (in/out). |
| '\" # With two it reads: type, name in italics. With one, that argument |
| '\" # alone is printed in italics. |
| .de AP |
| .ie !"\\$4"" .TP \\$4 |
| .el \{\ |
| . ie !"\\$2"" .TP \\n()Cu |
| . el .TP 15 |
| .\} |
| .ta \\n()Au \\n()Bu |
| .ie !"\\$3"" \{\ |
| \&\\$1 \\fI\\$2\\fP (\\$3) |
| .\".b |
| .\} |
| .el \{\ |
| .br |
| .ie !"\\$2"" \{\ |
| \&\\$1 \\fI\\$2\\fP |
| .\} |
| .el \{\ |
| \&\\fI\\$1\\fP |
| .\} |
| .\} |
| .. |
| '\" # define tabbing values for .AP |
| '\" # Usage: .AS ?type? ?name? |
| '\" # )A = first tab stop: 10n, or the width of the first argument plus 3n. |
| '\" # )B = second tab stop: )A plus 15n, or )A plus the width of the second |
| '\" #      argument plus 3n. |
| '\" # )C = tag indent used by .AP: )B plus the width of "(in/out)" plus 2n. |
| .de AS |
| .nr )A 10n |
| .if !"\\$1"" .nr )A \\w'\\$1'u+3n |
| .nr )B \\n()Au+15n |
| .\" |
| .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n |
| .nr )C \\n()Bu+\\w'(in/out)'u+2n |
| .. |
| '\" # Establish default widths. .AS reads only its first two arguments, |
| '\" # so the third one given here has no effect on the registers. |
| .AS Tcl_Interp Tcl_CreateInterp in/out |
| '\" # BS - start boxed text |
| '\" # ^y = starting y location |
| '\" # ^b = 1 |
| '\" # Breaks the line, records the current vertical position in ^y and |
| '\" # sets ^b to 1. In nroff it also draws a rule as wide as the current |
| '\" # line length straight away, in no-fill mode with no temporary indent. |
| .de BS |
| .br |
| .mk ^y |
| .nr ^b 1u |
| .if n .nf |
| .if n .ti 0 |
| .if n \l'\\n(.lu\(ul' |
| .if n .fi |
| .. |
| '\" # BE - end boxed text (draw box now) |
| '\" # BE reads ^y (top of the box, marked by .BS, or re-marked by ^B after |
| '\" # a page break) and ^b (1 = box started on this page, 2 = box continued |
| '\" # from an earlier page). It marks the current position in ^t, draws the |
| '\" # box, and resets ^b to 0 so that ^B stops drawing box sides. |
| '\" # In nroff only a rule of width ^l is drawn. |
| .de BE |
| .nf |
| .ti 0 |
| .mk ^t |
| .ie n \l'\\n(^lu\(ul' |
| .el \{\ |
| .\" Draw four-sided box normally, but don't draw top of |
| .\" box if the box started on an earlier page. |
| .ie !\\n(^b-1 \{\ |
| \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' |
| .\} |
| .\" The else branch must open its own block so that the two closing |
| .\" braces below pair up with this .el and with the outer .el above. |
| .el \{\ |
| \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' |
| .\} |
| .\} |
| .fi |
| .br |
| .nr ^b 0 |
| .. |
| '\" # VS - start vertical sidebar |
| '\" # ^Y = starting y location |
| '\" # ^v = 1 (for troff; for nroff this doesn't matter) |
| '\" # Usage: .VS ?version? ?br? |
| '\" # A non-empty second argument forces a line break first. The starting |
| '\" # position is recorded in ^Y. nroff turns on a margin character; |
| '\" # troff sets ^v to 1, which ^B tests at the bottom of each page. |
| .de VS |
| .if !"\\$2"" .br |
| .mk ^Y |
| .ie n 'mc \s12\(br\s0 |
| .el .nr ^v 1u |
| .. |
| '\" # VE - end of vertical sidebar |
| '\" # nroff: turn the margin character off. |
| '\" # troff: switch to environment 2, mark the current position in ^t and |
| '\" # draw a vertical bar 3n beyond the saved line length ^l, running |
| '\" # between the position saved in ^Y and the current position; then |
| '\" # back up one line and restore the previous environment. |
| '\" # In both cases ^v is reset to 0. |
| .de VE |
| .ie n 'mc |
| .el \{\ |
| .ev 2 |
| .nf |
| .ti 0 |
| .mk ^t |
| \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n' |
| .sp -1 |
| .fi |
| .ev |
| .\} |
| .nr ^v 0 |
| .. |
| '\" # Special macro to handle page bottom: finish off current |
| '\" # box/sidebar if in box/sidebar mode, then invoke the standard |
| '\" # page bottom macro. |
| '\" # Works in environment 2 and marks the current position in ^t. |
| '\" # If ^b is non-zero the box sides are drawn (with the top as well when |
| '\" # ^b is 1); if ^v is non-zero the sidebar is drawn. After .bp the |
| '\" # previous environment is restored, and for an open box ^y is re-marked |
| '\" # and ^b set to 2; for an open sidebar ^Y is re-marked. |
| .de ^B |
| .ev 2 |
| 'ti 0 |
| 'nf |
| .mk ^t |
| .if \\n(^b \{\ |
| .\" Draw three-sided box if this is the box's first page, |
| .\" draw two sides but no top otherwise. |
| .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c |
| .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c |
| .\} |
| .if \\n(^v \{\ |
| .nr ^x \\n(^tu+1v-\\n(^Yu |
| \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c |
| .\} |
| .bp |
| 'fi |
| .ev |
| .if \\n(^b \{\ |
| .mk ^y |
| .nr ^b 2 |
| .\} |
| .if \\n(^v \{\ |
| .mk ^Y |
| .\} |
| .. |
| '\" # DS - begin display |
| '\" # Opens a relative indent (.RS), switches to no-fill mode and |
| '\" # leaves one blank line. Closed by .DE. |
| .de DS |
| .RS |
| .nf |
| .sp |
| .. |
| '\" # DE - end display |
| '\" # Restores fill mode, closes the relative indent (.RE) and |
| '\" # leaves one blank line. |
| .de DE |
| .fi |
| .RE |
| .sp |
| .. |
| '\" # SO - start of list of standard options |
| '\" # Emits the "STANDARD OPTIONS" section heading, then sets no-fill mode, |
| '\" # tab stops at 5.5c and 11c, and bold font for the lines that follow. |
| .de SO |
| .SH "STANDARD OPTIONS" |
| .LP |
| .nf |
| .ta 5.5c 11c |
| .ft B |
| .. |
| '\" # SE - end of list of standard options |
| '\" # Restores fill mode and roman font, starts a new paragraph and prints |
| '\" # a pointer to the options manual entry. |
| .de SE |
| .fi |
| .ft R |
| .LP |
| See the \\fBoptions\\fR manual entry for details on the standard options. |
| .. |
| '\" # OP - start of full description for a single option |
| '\" # Usage: .OP cmdName dbName dbClass |
| '\" # Prints one labelled line per argument, with the argument in bold, in |
| '\" # no-fill mode with a tab stop at 4c. The closing .IP starts the |
| '\" # indented paragraph that holds the option's description. |
| .de OP |
| .LP |
| .nf |
| .ta 4c |
| Command-Line Name: \\fB\\$1\\fR |
| Database Name: \\fB\\$2\\fR |
| Database Class: \\fB\\$3\\fR |
| .fi |
| .IP |
| .. |
| '\" # CS - begin code excerpt |
| '\" # Opens a relative indent (.RS), switches to no-fill mode and sets |
| '\" # tab stops every quarter inch up to 1i. Closed by .CE. |
| .de CS |
| .RS |
| .nf |
| .ta .25i .5i .75i 1i |
| .. |
| '\" # CE - end code excerpt |
| '\" # Restores fill mode and closes the relative indent (.RE). |
| .de CE |
| .fi |
| .RE |
| .. |
| '\" # UL - print arg1 underlined, then arg2 |
| '\" # Usage: .UL arg1 arg2 |
| '\" # After arg1 is output, a rule of underrule characters is drawn back |
| '\" # to horizontal position 0; arg2 then follows on the same line. |
| .de UL |
| \\$1\l'|0\(ul'\\$2 |
| .. |
| .TH re_syntax n "8.1" Tcl "Tcl Built-In Commands" |
| .BS |
| .SH NAME |
| re_syntax \- Syntax of Tcl regular expressions. |
| .BE |
| |
| .SH DESCRIPTION |
| .PP |
| A \fIregular expression\fR describes strings of characters. |
| It's a pattern that matches certain strings and doesn't match others. |
| |
| .SH "DIFFERENT FLAVORS OF REs" |
| Regular expressions (``RE''s), as defined by POSIX, come in two |
| flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs''). |
| EREs are roughly those of the traditional \fIegrep\fR, while BREs are |
| roughly those of the traditional \fIed\fR. This implementation adds |
| a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with |
| some significant extensions. |
| .PP |
| This manual page primarily describes AREs. BREs mostly exist for |
| backward compatibility in some old programs; they will be discussed at |
| the end. POSIX EREs are almost an exact subset of AREs. Features of |
| AREs that are not present in EREs will be indicated. |
| |
| .SH "REGULAR EXPRESSION SYNTAX" |
| .PP |
| Tcl regular expressions are implemented using the package written by |
| Henry Spencer, based on the 1003.2 spec and some (not quite all) of |
| the Perl5 extensions (thanks, Henry!). Much of the description of |
| regular expressions below is copied verbatim from his manual entry. |
| .PP |
| An ARE is one or more \fIbranches\fR, |
| separated by `\fB|\fR', |
| matching anything that matches any of the branches. |
| .PP |
| A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR, |
| concatenated. |
| It matches a match for the first, followed by a match for the second, etc.; |
| an empty branch matches the empty string. |
| .PP |
| A quantified atom is an \fIatom\fR possibly followed |
| by a single \fIquantifier\fR. |
| Without a quantifier, it matches a match for the atom. |
| The quantifiers, |
| and what a so-quantified atom matches, are: |
| .RS 2 |
| .TP 6 |
| \fB*\fR |
| a sequence of 0 or more matches of the atom |
| .TP |
| \fB+\fR |
| a sequence of 1 or more matches of the atom |
| .TP |
| \fB?\fR |
| a sequence of 0 or 1 matches of the atom |
| .TP |
| \fB{\fIm\fB}\fR |
| a sequence of exactly \fIm\fR matches of the atom |
| .TP |
| \fB{\fIm\fB,}\fR |
| a sequence of \fIm\fR or more matches of the atom |
| .TP |
| \fB{\fIm\fB,\fIn\fB}\fR |
| a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom; |
| \fIm\fR may not exceed \fIn\fR |
| .TP |
| \fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR |
| \fInon-greedy\fR quantifiers, |
| which match the same possibilities, |
| but prefer the smallest number rather than the largest number |
| of matches (see MATCHING) |
| .RE |
| .PP |
| The forms using |
| \fB{\fR and \fB}\fR |
| are known as \fIbound\fRs. |
| The numbers |
| \fIm\fR and \fIn\fR are unsigned decimal integers |
| with permissible values from 0 to 255 inclusive. |
| .PP |
| An atom is one of: |
| .RS 2 |
| .TP 6 |
| \fB(\fIre\fB)\fR |
| (where \fIre\fR is any regular expression) |
| matches a match for |
| \fIre\fR, with the match noted for possible reporting |
| .TP |
| \fB(?:\fIre\fB)\fR |
| as previous, |
| but does no reporting |
| (a ``non-capturing'' set of parentheses) |
| .TP |
| \fB()\fR |
| matches an empty string, |
| noted for possible reporting |
| .TP |
| \fB(?:)\fR |
| matches an empty string, |
| without reporting |
| .TP |
| \fB[\fIchars\fB]\fR |
| a \fIbracket expression\fR, |
| matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail) |
| .TP |
| \fB.\fR |
| matches any single character |
| .TP |
| \fB\e\fIk\fR |
| (where \fIk\fR is a non-alphanumeric character) |
| matches that character taken as an ordinary character, |
| e.g. \e\e matches a backslash character |
| .TP |
| \fB\e\fIc\fR |
| where \fIc\fR is alphanumeric |
| (possibly followed by other characters), |
| an \fIescape\fR (AREs only), |
| see ESCAPES below |
| .TP |
| \fB{\fR |
| when followed by a character other than a digit, |
| matches the left-brace character `\fB{\fR'; |
| when followed by a digit, it is the beginning of a |
| \fIbound\fR (see above) |
| .TP |
| \fIx\fR |
| where \fIx\fR is |
| a single character with no other significance, matches that character. |
| .RE |
| .PP |
| A \fIconstraint\fR matches an empty string when specific conditions |
| are met. |
| A constraint may not be followed by a quantifier. |
| The simple constraints are as follows; some more constraints are |
| described later, under ESCAPES. |
| .RS 2 |
| .TP 8 |
| \fB^\fR |
| matches at the beginning of a line |
| .TP |
| \fB$\fR |
| matches at the end of a line |
| .TP |
| \fB(?=\fIre\fB)\fR |
| \fIpositive lookahead\fR (AREs only), matches at any point |
| where a substring matching \fIre\fR begins |
| .TP |
| \fB(?!\fIre\fB)\fR |
| \fInegative lookahead\fR (AREs only), matches at any point |
| where no substring matching \fIre\fR begins |
| .RE |
| .PP |
| The lookahead constraints may not contain back references (see later), |
| and all parentheses within them are considered non-capturing. |
| .PP |
| An RE may not end with `\fB\e\fR'. |
| |
| .SH "BRACKET EXPRESSIONS" |
| A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'. |
| It normally matches any single character from the list (but see below). |
| If the list begins with `\fB^\fR', |
| it matches any single character |
| (but see below) \fInot\fR from the rest of the list. |
| .PP |
| If two characters in the list are separated by `\fB\-\fR', |
| this is shorthand |
| for the full \fIrange\fR of characters between those two (inclusive) in the |
| collating sequence, |
| e.g. |
| \fB[0\-9]\fR |
| in ASCII matches any decimal digit. |
| Two ranges may not share an |
| endpoint, so e.g. |
| \fBa\-c\-e\fR |
| is illegal. |
| Ranges are very collating-sequence-dependent, |
| and portable programs should avoid relying on them. |
| .PP |
| To include a literal |
| \fB]\fR |
| or |
| \fB\-\fR |
| in the list, |
| the simplest method is to |
| enclose it in |
| \fB[.\fR and \fB.]\fR |
| to make it a collating element (see below). |
| Alternatively, |
| make it the first character |
| (following a possible `\fB^\fR'), |
| or (AREs only) precede it with `\fB\e\fR'. |
| Alternatively, for `\fB\-\fR', |
| make it the last character, |
| or the second endpoint of a range. |
| To use a literal |
| \fB\-\fR |
| as the first endpoint of a range, |
| make it a collating element |
| or (AREs only) precede it with `\fB\e\fR'. |
| With the exception of these, some combinations using |
| \fB[\fR |
| (see next |
| paragraphs), and escapes, |
| all other special characters lose their |
| special significance within a bracket expression. |
| .PP |
| Within a bracket expression, a collating element (a character, |
| a multi-character sequence that collates as if it were a single character, |
| or a collating-sequence name for either) |
| enclosed in |
| \fB[.\fR and \fB.]\fR |
| stands for the |
| sequence of characters of that collating element. |
| The sequence is a single element of the bracket expression's list. |
| A bracket expression in a locale that has |
| multi-character collating elements |
| can thus match more than one character. |
| .VS 8.2 |
| So (insidiously), a bracket expression that starts with \fB^\fR |
| can match multi-character collating elements even if none of them |
| appear in the bracket expression! |
| (\fINote:\fR Tcl currently has no multi-character collating elements. |
| This information is only for illustration.) |
| .PP |
| For example, assume the collating sequence includes a \fBch\fR |
| multi-character collating element. |
| Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP) |
| matches the first five characters of `\fBchchcc\fR'. |
| Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR' |
| (because \fB[^c]\fR matches the multi-character \fBch\fR). |
| .VE 8.2 |
| .PP |
| Within a bracket expression, a collating element enclosed in |
| \fB[=\fR |
| and |
| \fB=]\fR |
| is an equivalence class, standing for the sequences of characters |
| of all collating elements equivalent to that one, including itself. |
| (If there are no other equivalent collating elements, |
| the treatment is as if the enclosing delimiters were `\fB[.\fR'\& |
| and `\fB.]\fR'.) |
| For example, if |
| \fBo\fR |
| and |
| \fB\o'o^'\fR |
| are the members of an equivalence class, |
| then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR', |
| and `\fB[o\o'o^']\fR'\& |
| are all synonymous. |
| An equivalence class may not be an endpoint |
| of a range. |
| .VS 8.2 |
| (\fINote:\fR |
| Tcl currently implements only the Unicode locale. |
| It doesn't define any equivalence classes. |
| The examples above are just illustrations.) |
| .VE 8.2 |
| .PP |
| Within a bracket expression, the name of a \fIcharacter class\fR enclosed |
| in |
| \fB[:\fR |
| and |
| \fB:]\fR |
| stands for the list of all characters |
| (not all collating elements!) |
| belonging to that |
| class. |
| Standard character classes are: |
| .PP |
| .RS |
| .ne 5 |
| .nf |
| .ta 3c |
| \fBalpha\fR A letter. |
| \fBupper\fR An upper-case letter. |
| \fBlower\fR A lower-case letter. |
| \fBdigit\fR A decimal digit. |
| \fBxdigit\fR A hexadecimal digit. |
| \fBalnum\fR An alphanumeric (letter or digit). |
| \fBprint\fR An alphanumeric (same as alnum). |
| \fBblank\fR A space or tab character. |
| \fBspace\fR A character producing white space in displayed text. |
| \fBpunct\fR A punctuation character. |
| \fBgraph\fR A character with a visible representation. |
| \fBcntrl\fR A control character. |
| .fi |
| .RE |
| .PP |
| A locale may provide others. |
| .VS 8.2 |
| (Note that the current Tcl implementation has only one locale: |
| the Unicode locale.) |
| .VE 8.2 |
| A character class may not be used as an endpoint of a range. |
| .PP |
| There are two special cases of bracket expressions: |
| the bracket expressions |
| \fB[[:<:]]\fR |
| and |
| \fB[[:>:]]\fR |
| are constraints, matching empty strings at |
| the beginning and end of a word respectively. |
| '\" note, discussion of escapes below references this definition of word |
| A word is defined as a sequence of |
| word characters |
| that is neither preceded nor followed by |
| word characters. |
| A word character is an |
| \fIalnum\fR |
| character |
| or an underscore |
| (\fB_\fR). |
| These special bracket expressions are deprecated; |
| users of AREs should use constraint escapes instead (see below). |
| .SH ESCAPES |
| Escapes (AREs only), which begin with a |
| \fB\e\fR |
| followed by an alphanumeric character, |
| come in several varieties: |
| character entry, class shorthands, constraint escapes, and back references. |
| A |
| \fB\e\fR |
| followed by an alphanumeric character but not constituting |
| a valid escape is illegal in AREs. |
| In EREs, there are no escapes: |
| outside a bracket expression, |
| a |
| \fB\e\fR |
| followed by an alphanumeric character merely stands for that |
| character as an ordinary character, |
| and inside a bracket expression, |
| \fB\e\fR |
| is an ordinary character. |
| (The latter is the one actual incompatibility between EREs and AREs.) |
| .PP |
| Character-entry escapes (AREs only) exist to make it easier to specify |
| non-printing and otherwise inconvenient characters in REs: |
| .RS 2 |
| .TP 5 |
| \fB\ea\fR |
| alert (bell) character, as in C |
| .TP |
| \fB\eb\fR |
| backspace, as in C |
| .TP |
| \fB\eB\fR |
| synonym for |
| \fB\e\fR |
| to help reduce backslash doubling in some |
| applications where there are multiple levels of backslash processing |
| .TP |
| \fB\ec\fIX\fR |
| (where \fIX\fR is any character) the character whose |
| low-order 5 bits are the same as those of |
| \fIX\fR, |
| and whose other bits are all zero |
| .TP |
| \fB\ee\fR |
| the character whose collating-sequence name |
| is `\fBESC\fR', |
| or failing that, the character with octal value 033 |
| .TP |
| \fB\ef\fR |
| formfeed, as in C |
| .TP |
| \fB\en\fR |
| newline, as in C |
| .TP |
| \fB\er\fR |
| carriage return, as in C |
| .TP |
| \fB\et\fR |
| horizontal tab, as in C |
| .TP |
| \fB\eu\fIwxyz\fR |
| (where |
| \fIwxyz\fR |
| is exactly four hexadecimal digits) |
| the Unicode character |
| \fBU+\fIwxyz\fR |
| in the local byte ordering |
| .TP |
| \fB\eU\fIstuvwxyz\fR |
| (where |
| \fIstuvwxyz\fR |
| is exactly eight hexadecimal digits) |
| reserved for a somewhat-hypothetical Unicode extension to 32 bits |
| .TP |
| \fB\ev\fR |
| vertical tab, as in C |
| .TP |
| \fB\ex\fIhhh\fR |
| (where |
| \fIhhh\fR |
| is any sequence of hexadecimal digits) |
| the character whose hexadecimal value is |
| \fB0x\fIhhh\fR |
| (a single character no matter how many hexadecimal digits are used). |
| .TP |
| \fB\e0\fR |
| the character whose value is |
| \fB0\fR |
| .TP |
| \fB\e\fIxy\fR |
| (where |
| \fIxy\fR |
| is exactly two octal digits, |
| and is not a |
| \fIback reference\fR (see below)) |
| the character whose octal value is |
| \fB0\fIxy\fR |
| .TP |
| \fB\e\fIxyz\fR |
| (where |
| \fIxyz\fR |
| is exactly three octal digits, |
| and is not a |
| back reference (see below)) |
| the character whose octal value is |
| \fB0\fIxyz\fR |
| .RE |
| .PP |
| Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR', |
| and `\fBA\fR'-`\fBF\fR'. |
| Octal digits are `\fB0\fR'-`\fB7\fR'. |
| .PP |
| The character-entry escapes are always taken as ordinary characters. |
| For example, |
| \fB\e135\fR |
| is |
| \fB]\fR |
| in ASCII, |
| but |
| \fB\e135\fR |
| does not terminate a bracket expression. |
| Beware, however, that some applications (e.g., C compilers) interpret |
| such sequences themselves before the regular-expression package |
| gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'. |
| .PP |
| Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used |
| character classes: |
| .RS 2 |
| .TP 10 |
| \fB\ed\fR |
| \fB[[:digit:]]\fR |
| .TP |
| \fB\es\fR |
| \fB[[:space:]]\fR |
| .TP |
| \fB\ew\fR |
| \fB[[:alnum:]_]\fR |
| (note underscore) |
| .TP |
| \fB\eD\fR |
| \fB[^[:digit:]]\fR |
| .TP |
| \fB\eS\fR |
| \fB[^[:space:]]\fR |
| .TP |
| \fB\eW\fR |
| \fB[^[:alnum:]_]\fR |
| (note underscore) |
| .RE |
| .PP |
| Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', |
| and `\fB\ew\fR'\& |
| lose their outer brackets, |
| and `\fB\eD\fR', `\fB\eS\fR', |
| and `\fB\eW\fR'\& |
| are illegal. |
| .VS 8.2 |
| (So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. |
| Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.) |
| .VE 8.2 |
| .PP |
| A constraint escape (AREs only) is a constraint, |
| matching the empty string if specific conditions are met, |
| written as an escape: |
| .RS 2 |
| .TP 6 |
| \fB\eA\fR |
| matches only at the beginning of the string |
| (see MATCHING, below, for how this differs from `\fB^\fR') |
| .TP |
| \fB\em\fR |
| matches only at the beginning of a word |
| .TP |
| \fB\eM\fR |
| matches only at the end of a word |
| .TP |
| \fB\ey\fR |
| matches only at the beginning or end of a word |
| .TP |
| \fB\eY\fR |
| matches only at a point that is not the beginning or end of a word |
| .TP |
| \fB\eZ\fR |
| matches only at the end of the string |
| (see MATCHING, below, for how this differs from `\fB$\fR') |
| .TP |
| \fB\e\fIm\fR |
| (where |
| \fIm\fR |
| is a nonzero digit) a \fIback reference\fR, see below |
| .TP |
| \fB\e\fImnn\fR |
| (where |
| \fIm\fR |
| is a nonzero digit, and |
| \fInn\fR |
| is some more digits, |
| and the decimal value |
| \fImnn\fR |
| is not greater than the number of closing capturing parentheses seen so far) |
| a \fIback reference\fR, see below |
| .RE |
| .PP |
| A word is defined as in the specification of |
| \fB[[:<:]]\fR |
| and |
| \fB[[:>:]]\fR |
| above. |
| Constraint escapes are illegal within bracket expressions. |
| .PP |
| A back reference (AREs only) matches the same string matched by the parenthesized |
| subexpression specified by the number, |
| so that (e.g.) |
| \fB([bc])\e1\fR |
| matches |
| \fBbb\fR |
| or |
| \fBcc\fR |
| but not `\fBbc\fR'. |
| The subexpression must entirely precede the back reference in the RE. |
| Subexpressions are numbered in the order of their leading parentheses. |
| Non-capturing parentheses do not define subexpressions. |
| .PP |
| There is an inherent historical ambiguity between octal character-entry |
| escapes and back references, which is resolved by heuristics, |
| as hinted at above. |
| A leading zero always indicates an octal escape. |
| A single non-zero digit, not followed by another digit, |
| is always taken as a back reference. |
| A multi-digit sequence not starting with a zero is taken as a back |
| reference if it comes after a suitable subexpression |
| (i.e. the number is in the legal range for a back reference), |
| and otherwise is taken as octal. |
| .SH "METASYNTAX" |
| In addition to the main syntax described above, there are some special |
| forms and miscellaneous syntactic facilities available. |
| .PP |
| Normally the flavor of RE being used is specified by |
| application-dependent means. |
| However, this can be overridden by a \fIdirector\fR. |
| If an RE of any flavor begins with `\fB***:\fR', |
| the rest of the RE is an ARE. |
| If an RE of any flavor begins with `\fB***=\fR', |
| the rest of the RE is taken to be a literal string, |
| with all characters considered ordinary characters. |
| .PP |
| An ARE may begin with \fIembedded options\fR: |
| a sequence |
| \fB(?\fIxyz\fB)\fR |
| (where |
| \fIxyz\fR |
| is one or more alphabetic characters) |
| specifies options affecting the rest of the RE. |
| These supplement, and can override, |
| any options specified by the application. |
| The available option letters are: |
| .RS 2 |
| .TP 3 |
| \fBb\fR |
| rest of RE is a BRE |
| .TP 3 |
| \fBc\fR |
| case-sensitive matching (usual default) |
| .TP 3 |
| \fBe\fR |
| rest of RE is an ERE |
| .TP 3 |
| \fBi\fR |
| case-insensitive matching (see MATCHING, below) |
| .TP 3 |
| \fBm\fR |
| historical synonym for |
| \fBn\fR |
| .TP 3 |
| \fBn\fR |
| newline-sensitive matching (see MATCHING, below) |
| .TP 3 |
| \fBp\fR |
| partial newline-sensitive matching (see MATCHING, below) |
| .TP 3 |
| \fBq\fR |
| rest of RE is a literal (``quoted'') string, all ordinary characters |
| .TP 3 |
| \fBs\fR |
| non-newline-sensitive matching (usual default) |
| .TP 3 |
| \fBt\fR |
| tight syntax (usual default; see below) |
| .TP 3 |
| \fBw\fR |
| inverse partial newline-sensitive (``weird'') matching (see MATCHING, below) |
| .TP 3 |
| \fBx\fR |
| expanded syntax (see below) |
| .RE |
| .PP |
| Embedded options take effect at the |
| \fB)\fR |
| terminating the sequence. |
| They are available only at the start of an ARE, |
| and may not be used later within it. |
| .PP |
| In addition to the usual (\fItight\fR) RE syntax, in which all characters are |
| significant, there is an \fIexpanded\fR syntax, |
| available in all flavors of RE |
| with the \fB\-expanded\fR switch, or in AREs with the embedded \fBx\fR option. |
| In the expanded syntax, |
| white-space characters are ignored |
| and all characters between a |
| \fB#\fR |
| and the following newline (or the end of the RE) are ignored, |
| permitting paragraphing and commenting a complex RE. |
| There are three exceptions to that basic rule: |
| .RS 2 |
| .PP |
| a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained |
| .PP |
| white space or `\fB#\fR' within a bracket expression is retained |
| .PP |
| white space and comments are illegal within multi-character symbols |
| like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR' |
| .RE |
| .PP |
| Expanded-syntax white-space characters are blank, tab, newline, and |
| .VS 8.2 |
| any character that belongs to the \fIspace\fR character class. |
| .VE 8.2 |
| .PP |
| Finally, in an ARE, |
| outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR' |
| (where |
| \fIttt\fR |
| is any text not containing a `\fB)\fR') |
| is a comment, |
| completely ignored. |
| Again, this is not allowed between the characters of |
| multi-character symbols like `\fB(?:\fR'. |
| Such comments are more a historical artifact than a useful facility, |
| and their use is deprecated; |
| use the expanded syntax instead. |
| .PP |
| \fINone\fR of these metasyntax extensions is available if the application |
| (or an initial |
| \fB***=\fR |
| director) |
| has specified that the user's input be treated as a literal string |
| rather than as an RE. |
| .SH MATCHING |
| In the event that an RE could match more than one substring of a given |
| string, |
| the RE matches the one starting earliest in the string. |
| If the RE could match more than one substring starting at that point, |
| its choice is determined by its \fIpreference\fR: |
| either the longest substring, or the shortest. |
| .PP |
| Most atoms, and all constraints, have no preference. |
| A parenthesized RE has the same preference (possibly none) as the RE. |
| A quantified atom with quantifier |
| \fB{\fIm\fB}\fR |
| or |
| \fB{\fIm\fB}?\fR |
| has the same preference (possibly none) as the atom itself. |
| A quantified atom with other normal quantifiers (including |
| \fB{\fIm\fB,\fIn\fB}\fR |
| with |
| \fIm\fR |
| equal to |
| \fIn\fR) |
| prefers longest match. |
| A quantified atom with other non-greedy quantifiers (including |
| \fB{\fIm\fB,\fIn\fB}?\fR |
| with |
| \fIm\fR |
| equal to |
| \fIn\fR) |
| prefers shortest match. |
| A branch has the same preference as the first quantified atom in it |
| which has a preference. |
| An RE consisting of two or more branches connected by the |
| \fB|\fR |
| operator prefers longest match. |
| .PP |
| Subject to the constraints imposed by the rules for matching the whole RE, |
| subexpressions also match the longest or shortest possible substrings, |
| based on their preferences, |
| with subexpressions starting earlier in the RE taking priority over |
| ones starting later. |
| Note that outer subexpressions thus take priority over |
| their component subexpressions. |
| .PP |
| Note that the quantifiers |
| \fB{1,1}\fR |
| and |
| \fB{1,1}?\fR |
| can be used to force longest and shortest preference, respectively, |
| on a subexpression or a whole RE. |
| .PP |
| Match lengths are measured in characters, not collating elements. |
| An empty string is considered longer than no match at all. |
| For example, |
| \fBbb*\fR |
| matches the three middle characters of `\fBabbbc\fR', |
| \fB(week|wee)(night|knights)\fR |
| matches all ten characters of `\fBweeknights\fR', |
| when |
| \fB(.*).*\fR |
| is matched against |
| \fBabc\fR |
| the parenthesized subexpression |
| matches all three characters, and |
| when |
| \fB(a*)*\fR |
| is matched against |
| \fBbc\fR |
| both the whole RE and the parenthesized |
| subexpression match an empty string. |
| .PP |
| If case-independent matching is specified, |
| the effect is much as if all case distinctions had vanished from the |
| alphabet. |
| When an alphabetic that exists in multiple cases appears as an |
| ordinary character outside a bracket expression, it is effectively |
| transformed into a bracket expression containing both cases, |
| so that |
| \fBx\fR |
| becomes `\fB[xX]\fR'. |
| When it appears inside a bracket expression, all case counterparts |
| of it are added to the bracket expression, so that |
| \fB[x]\fR |
| becomes |
| \fB[xX]\fR |
| and |
| \fB[^x]\fR |
| becomes `\fB[^xX]\fR'. |
| .PP |
| If newline-sensitive matching is specified, \fB.\fR |
| and bracket expressions using |
| \fB^\fR |
| will never match the newline character |
| (so that matches will never cross newlines unless the RE |
| explicitly arranges it) |
| and |
| \fB^\fR |
| and |
| \fB$\fR |
| will match the empty string after and before a newline |
| respectively, in addition to matching at beginning and end of string |
| respectively. |
| ARE |
| \fB\eA\fR |
| and |
| \fB\eZ\fR |
| continue to match beginning or end of string \fIonly\fR. |
| .PP |
| If partial newline-sensitive matching is specified, |
| this affects \fB.\fR |
| and bracket expressions |
| as with newline-sensitive matching, but not |
| \fB^\fR |
| and `\fB$\fR'. |
| .PP |
| If inverse partial newline-sensitive matching is specified, |
| this affects |
| \fB^\fR |
| and |
| \fB$\fR |
| as with |
| newline-sensitive matching, |
| but not \fB.\fR |
| and bracket expressions. |
| This isn't very useful but is provided for symmetry. |
| .SH "LIMITS AND COMPATIBILITY" |
| No particular limit is imposed on the length of REs. |
| Programs intended to be highly portable should not employ REs longer |
| than 256 bytes, |
| as a POSIX-compliant implementation can refuse to accept such REs. |
| .PP |
| The only feature of AREs that is actually incompatible with |
| POSIX EREs is that |
| \fB\e\fR |
| does not lose its special |
| significance inside bracket expressions. |
| All other ARE features use syntax which is illegal or has |
| undefined or unspecified effects in POSIX EREs; |
| the |
| \fB***\fR |
| syntax of directors likewise is outside the POSIX |
| syntax for both BREs and EREs. |
| .PP |
| Many of the ARE extensions are borrowed from Perl, but some have |
| been changed to clean them up, and a few Perl extensions are not present. |
| Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', |
| the lack of special treatment for a trailing newline, |
| the addition of complemented bracket expressions to the things |
| affected by newline-sensitive matching, |
| the restrictions on parentheses and back references in lookahead constraints, |
| and the longest/shortest-match (rather than first-match) matching semantics. |
| .PP |
| The matching rules for REs containing both normal and non-greedy quantifiers |
| have changed since early beta-test versions of this package. |
| (The new rules are much simpler and cleaner, |
| but don't work as hard at guessing the user's real intentions.) |
| .PP |
| Henry Spencer's original 1986 \fIregexp\fR package, |
| still in widespread use (e.g., in pre-8.1 releases of Tcl), |
| implemented an early version of today's EREs. |
| There are four incompatibilities between \fIregexp\fR's near-EREs |
| (`RREs' for short) and AREs. |
| In roughly increasing order of significance: |
| .PP |
| .RS |
| In AREs, |
| \fB\e\fR |
| followed by an alphanumeric character is either an |
| escape or an error, |
| while in RREs, it was just another way of writing the |
| alphanumeric. |
| This should not be a problem because there was no reason to write |
| such a sequence in RREs. |
| .PP |
| \fB{\fR |
| followed by a digit in an ARE is the beginning of a bound, |
| while in RREs, |
| \fB{\fR |
| was always an ordinary character. |
| Such sequences should be rare, |
| and will often result in an error because following characters |
| will not look like a valid bound. |
| .PP |
| In AREs, |
| \fB\e\fR |
| remains a special character within `\fB[\|]\fR', |
| so a literal |
| \fB\e\fR |
| within |
| \fB[\|]\fR |
| must be written `\fB\e\e\fR'. |
| \fB\e\e\fR |
| also gives a literal |
| \fB\e\fR |
| within |
| \fB[\|]\fR |
| in RREs, |
| but only truly paranoid programmers routinely doubled the backslash. |
| .PP |
| AREs report the longest/shortest match for the RE, |
| rather than the first found in a specified search order. |
| This may affect some RREs which were written in the expectation that |
| the first match would be reported. |
| (The careful crafting of RREs to optimize the search order for fast |
| matching is obsolete (AREs examine all possible matches |
| in parallel, and their performance is largely insensitive to their |
| complexity) but cases where the search order was exploited to deliberately |
| find a match which was \fInot\fR the longest/shortest will need rewriting.) |
| .RE |
| |
| .SH "BASIC REGULAR EXPRESSIONS" |
| BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR', |
| and |
| \fB?\fR |
| are ordinary characters and there is no equivalent |
| for their functionality. |
| The delimiters for bounds are |
| \fB\e{\fR |
| and `\fB\e}\fR', |
| with |
| \fB{\fR |
| and |
| \fB}\fR |
| by themselves ordinary characters. |
| The parentheses for nested subexpressions are |
| \fB\e(\fR |
| and `\fB\e)\fR', |
| with |
| \fB(\fR |
| and |
| \fB)\fR |
| by themselves ordinary characters. |
| \fB^\fR |
| is an ordinary character except at the beginning of the |
| RE or the beginning of a parenthesized subexpression, |
| \fB$\fR |
| is an ordinary character except at the end of the |
| RE or the end of a parenthesized subexpression, |
| and |
| \fB*\fR |
| is an ordinary character if it appears at the beginning of the |
| RE or the beginning of a parenthesized subexpression |
| (after a possible leading `\fB^\fR'). |
| Finally, |
| single-digit back references are available, |
| and |
| \fB\e<\fR |
| and |
| \fB\e>\fR |
| are synonyms for |
| \fB[[:<:]]\fR |
| and |
| \fB[[:>:]]\fR |
| respectively; |
| no other escapes are available. |
| |
| .SH "SEE ALSO" |
| RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n) |
| |
| .SH KEYWORDS |
| match, regular expression, string |