1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #include "pcre_internal.h"
50 /*************************************************
51 * Code parameters and static tables *
52 *************************************************/
54 /* Maximum number of items on the nested bracket stacks at compile time. This
55 applies to the nesting of all kinds of parentheses. It does not limit
56 un-nested, non-capturing parentheses. This number can be made bigger if
57 necessary - it is used to dimension one int and one unsigned char vector at compile time. */
60 #define BRASTACK_SIZE 200
63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
64 are simple data values; negative values are for special things like \d and so
65 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
68 #if !EBCDIC /* This is the "normal" table for ASCII systems */
69 static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
82 #else /* This is the "abnormal" table for EBCDIC systems */
83 static const short int escapes[] = {
84 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
111 /* Tables of names of POSIX character classes and their lengths. The list is
112 terminated by a zero length entry. The first three must be alpha, lower, upper,
113 as this is assumed for handling case independence. */
/* Names of the POSIX character classes. Entry i here corresponds to entry i of
posix_name_lengths[] and to the i-th triple of posix_class_maps[]. */
115 static const char *const posix_names[] = {
116 "alpha", "lower", "upper", /* the three case-related classes lead the list */
117 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
118 "print", "punct", "space", "word", "xdigit" };
/* Length in characters of each posix_names[] entry, in the same order. The
trailing 0 is the end-of-list marker and has no corresponding name. */
120 static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
123 /* Table of class bit maps for each POSIX class; up to three may be combined
124 to form the class. The table for [:blank:] is dynamically modified to remove
125 the vertical space characters. */
127 static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
145 /* The texts of compile-time error messages. These are "char *" because they
146 are passed to the outside world. */
148 static const char *error_texts[] = {
150 "\\ at end of pattern",
151 "\\c at end of pattern",
152 "unrecognized character follows \\",
153 "numbers out of order in {} quantifier",
155 "number too big in {} quantifier",
156 "missing terminating ] for character class",
157 "invalid escape sequence in character class",
158 "range out of order in character class",
161 "operand of unlimited repeat could match the empty string",
162 "internal error: unexpected repeat",
163 "unrecognized character after (?",
164 "POSIX named classes are supported only within a class",
167 "reference to non-existent subpattern",
168 "erroffset passed as NULL",
169 "unknown option bit(s) set",
170 "missing ) after comment",
171 "parentheses nested too deeply",
173 "regular expression too large",
174 "failed to get memory",
175 "unmatched parentheses",
176 "internal error: code overflow",
177 "unrecognized character after (?<",
179 "lookbehind assertion is not fixed length",
180 "malformed number after (?(",
181 "conditional group contains more than two branches",
182 "assertion expected after (?(",
183 "(?R or (?digits must be followed by )",
185 "unknown POSIX class name",
186 "POSIX collating elements are not supported",
187 "this version of PCRE is not compiled with PCRE_UTF8 support",
189 "character value in \\x{...} sequence is too large",
191 "invalid condition (?(0)",
192 "\\C not allowed in lookbehind assertion",
193 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
194 "number after (?C is > 255",
195 "closing ) for (?C expected",
197 "recursive call could loop indefinitely",
198 "unrecognized character after (?P",
199 "syntax error after (?P",
200 "two named groups have the same name",
201 "invalid UTF-8 string",
203 "support for \\P, \\p, and \\X has not been compiled",
204 "malformed \\P or \\p sequence",
205 "unknown property name after \\P or \\p"
209 /* Table to identify digits and hex digits. This is used when compiling
210 patterns. Note that the tables in chartables are dependent on the locale, and
211 may mark arbitrary characters as digits - but the PCRE compiling code expects
212 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
213 a private table here. It costs 256 bytes, but it is a lot faster than doing
214 character value tests (at least in some simple cases I timed), and in some
215 applications one wants PCRE to compile efficiently as well as match efficiently.
218 For convenience, we use the same bit definitions as in chartables:
221 0x04 decimal digit; 0x08 hexadecimal digit
223 Then we can use ctype_digit and ctype_xdigit in the code. */
225 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
226 static const unsigned char digitab[] =
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
234 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
235 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
236 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
237 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
240 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
261 #else /* This is the "abnormal" case, for EBCDIC systems */
262 static const unsigned char digitab[] =
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
280 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
288 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
294 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
295 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
/* Character-type table for the EBCDIC build only, indexed by the EBCDIC code
of a character (256 entries, eight per row; the trailing comment on each row
gives the code range it covers). check_escape() tests an entry against the mask
0x0E to decide whether the character after a backslash is alphameric.
NOTE(review): the individual bit meanings presumably follow the chartables
ctype bits (0x08 is "hexadecimal digit" per the digitab comment) - confirm
against pcre_internal.h. */
297 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
298 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
299 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
300 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
302 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
306 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
307 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
309 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
311 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
314 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
315 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
316 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
317 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
318 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
319 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
320 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
321 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
322 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
323 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
324 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
325 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
326 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
327 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
328 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
329 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
333 /* Definition to allow mutual recursion */
336 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337 int *, int *, branch_chain *, compile_data *);
341 /*************************************************
343 *************************************************/
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
349 the \. On exit, it is on the final character of the escape sequence.
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorcodeptr is set
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
367 const uschar *ptr = *ptrptr;
370 /* If backslash is at the end of the pattern, it's an error. */
373 if (c == 0) *errorcodeptr = ERR1;
375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376 a table. A non-zero result is something that can be returned immediately.
377 Otherwise further processing may be required. */
379 #if !EBCDIC /* ASCII coding */
380 else if (c < '0' || c > 'z') {} /* Not alphameric */
381 else if ((i = escapes[c - '0']) != 0) c = i;
383 #else /* EBCDIC coding */
384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385 else if ((i = escapes[c - 0x48]) != 0) c = i;
388 /* Escapes that need further processing, or are illegal. */
392 const uschar *oldptr;
395 /* A number of Perl escapes are not handled by PCRE. We give an explicit
403 *errorcodeptr = ERR37;
406 /* The handling of escape sequences consisting of a string of digits
407 starting with one that is not zero is not straightforward. By experiment,
408 the way Perl works seems to be as follows:
410 Outside a character class, the digits are read as a decimal number. If the
411 number is less than 10, or if there are that many previous extracting
412 left brackets, then it is a back reference. Otherwise, up to three octal
413 digits are read to form an escaped byte. Thus \123 is likely to be octal
414 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415 value is greater than 377, the least significant 8 bits are taken. Inside a
416 character class, \ followed by a digit is always an octal number. */
418 case '1': case '2': case '3': case '4': case '5':
419 case '6': case '7': case '8': case '9':
425 while ((digitab[ptr[1]] & ctype_digit) != 0)
426 c = c * 10 + *(++ptr) - '0';
427 if (c < 10 || c <= bracount)
432 ptr = oldptr; /* Put the pointer back and fall through */
435 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
436 generates a binary zero byte and treats the digit as a following literal.
437 Thus we have to pull back the pointer by one. */
439 if ((c = *ptr) >= '8')
446 /* \0 always starts an octal number, but we may drop through to here with a
447 larger first octal digit. */
451 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452 c = c * 8 + *(++ptr) - '0';
453 c &= 255; /* Take least significant 8 bits */
456 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457 which can be greater than 0xff, but only if the ddd are hex digits. */
461 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
463 const uschar *pt = ptr + 2;
464 register int count = 0;
466 while ((digitab[*pt] & ctype_xdigit) != 0)
470 #if !EBCDIC /* ASCII coding */
471 if (cc >= 'a') cc -= 32; /* Convert to upper case */
472 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473 #else /* EBCDIC coding */
474 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
480 if (c < 0 || count > 8) *errorcodeptr = ERR34;
484 /* If the sequence of hex digits does not end with '}', then we don't
485 recognize this construct; fall through to the normal \x handling. */
489 /* Read a single byte value given as up to two hex digits */
492 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
494 int cc; /* Some compilers don't like ++ */
495 cc = *(++ptr); /* in initializers */
496 #if !EBCDIC /* ASCII coding */
497 if (cc >= 'a') cc -= 32; /* Convert to upper case */
498 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499 #else /* EBCDIC coding */
500 if (cc <= 'z') cc += 64; /* Convert to upper case */
501 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
506 /* Other special escapes not starting with a digit are straightforward */
512 *errorcodeptr = ERR2;
516 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518 (However, an EBCDIC equivalent has now been added.) */
520 #if !EBCDIC /* ASCII coding */
521 if (c >= 'a' && c <= 'z') c -= 32;
523 #else /* EBCDIC coding */
524 if (c >= 'a' && c <= 'z') c += 64;
529 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531 for Perl compatibility, it is a literal. This code looks a bit odd, but
532 there used to be some cases other than the default, and there may be again
533 in future, so I haven't "optimized" it. */
536 if ((options & PCRE_EXTRA) != 0) switch(c)
539 *errorcodeptr = ERR3;
553 /*************************************************
555 *************************************************/
557 /* This function is called after \P or \p has been encountered, provided that
558 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
559 pointing at the P or p. On exit, it is pointing at the final character of the
563 ptrptr points to the pattern position pointer
564 negptr points to a boolean that is set TRUE for negation else FALSE
565 errorcodeptr points to the error code variable
567 Returns: value from ucp_type_table, or -1 for an invalid type
571 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
574 const uschar *ptr = *ptrptr;
578 if (c == 0) goto ERROR_RETURN;
582 /* \P or \p can be followed by a one- or two-character name in {}, optionally
583 preceded by ^ for negation. */
592 for (i = 0; i <= 2; i++)
595 if (c == 0) goto ERROR_RETURN;
599 if (c !='}') /* Try to distinguish error cases */
601 while (*(++ptr) != 0 && *ptr != '}');
602 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
607 /* Otherwise there is just one following character */
617 /* Search for a recognized property name using binary chop */
620 top = _pcre_utt_size;
625 c = strcmp(name, _pcre_utt[i].name);
626 if (c == 0) return _pcre_utt[i].value;
627 if (c > 0) bot = i + 1; else top = i;
631 *errorcodeptr = ERR47;
636 *errorcodeptr = ERR46;
645 /*************************************************
646 * Check for counted repeat *
647 *************************************************/
649 /* This function is called when a '{' is encountered in a place where it might
650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652 where the ddds are digits.
655 p pointer to the first char after '{'
657 Returns: TRUE or FALSE
661 is_counted_repeat(const uschar *p)
663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664 while ((digitab[*p] & ctype_digit) != 0) p++;
665 if (*p == '}') return TRUE;
667 if (*p++ != ',') return FALSE;
668 if (*p == '}') return TRUE;
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
678 /*************************************************
679 * Read repeat counts *
680 *************************************************/
682 /* Read an item of the form {n,m} and return the values. This is called only
683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684 so the syntax is guaranteed to be correct, but we need to check the values.
687 p pointer to first char after '{'
688 minp pointer to int for min
689 maxp pointer to int for max
690 returned as -1 if no max
691 errorcodeptr points to error code variable
693 Returns: pointer to '}' on success;
694 current ptr on error, with errorcodeptr set non-zero
697 static const uschar *
698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
703 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
705 if (*p == '}') max = min; else
710 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
713 *errorcodeptr = ERR4;
719 /* Do paranoid checks, then fill in the required variables, and pass back the
720 pointer to the terminating '}'. */
722 if (min > 65535 || max > 65535)
723 *errorcodeptr = ERR5;
734 /*************************************************
735 * Find first significant op code *
736 *************************************************/
738 /* This is called by several functions that scan a compiled expression looking
739 for a fixed first character, or an anchoring op code etc. It skips over things
740 that do not influence this. For some calls, a change of option is important.
741 For some calls, it makes sense to skip negative forward and all backward
742 assertions, and also the \b assertion; for others it does not.
745 code pointer to the start of the group
746 options pointer to external options
747 optbit the option bit whose changing is significant, or
749 skipassert TRUE if certain assertions are to be skipped
751 Returns: pointer to the first significant opcode
755 first_significant_code(const uschar *code, int *options, int optbit,
763 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
764 *options = (int)code[1];
770 case OP_ASSERTBACK_NOT:
771 if (!skipassert) return code;
772 do code += GET(code, 1); while (*code == OP_ALT);
773 code += _pcre_OP_lengths[*code];
776 case OP_WORD_BOUNDARY:
777 case OP_NOT_WORD_BOUNDARY:
778 if (!skipassert) return code;
784 code += _pcre_OP_lengths[*code];
791 /* Control never reaches here */
797 /*************************************************
798 * Find the fixed length of a pattern *
799 *************************************************/
801 /* Scan a pattern and compute the fixed length of subject that will match it,
802 if the length is fixed. This is needed for dealing with backward assertions.
803 In UTF8 mode, the result is in characters rather than bytes.
806 code points to the start of the pattern (the bracket)
807 options the compiling options
809 Returns: the fixed length, or -1 if there is no fixed length,
810 or -2 if \C was encountered
814 find_fixedlength(uschar *code, int options)
818 register int branchlength = 0;
819 register uschar *cc = code + 1 + LINK_SIZE;
821 /* Scan along the opcodes for this branch. If we get to the end of the
822 branch, check the length against that of the other branches. */
827 register int op = *cc;
828 if (op >= OP_BRA) op = OP_BRA;
835 d = find_fixedlength(cc, options);
838 do cc += GET(cc, 1); while (*cc == OP_ALT);
842 /* Reached end of a branch; if it's a ket it is the end of a nested
843 call. If it's ALT it is an alternation in a nested call. If it is
844 END it's the end of the outer call. All can be handled by the same code. */
851 if (length < 0) length = branchlength;
852 else if (length != branchlength) return -1;
853 if (*cc != OP_ALT) return length;
858 /* Skip over assertive subpatterns */
863 case OP_ASSERTBACK_NOT:
864 do cc += GET(cc, 1); while (*cc == OP_ALT);
867 /* Skip over things that don't match chars */
880 case OP_NOT_WORD_BOUNDARY:
881 case OP_WORD_BOUNDARY:
882 cc += _pcre_OP_lengths[*cc];
885 /* Handle literal characters */
892 if ((options & PCRE_UTF8) != 0)
894 while ((*cc & 0xc0) == 0x80) cc++;
899 /* Handle exact repetitions. The count is already in characters, but we
900 need to skip over a multibyte character in UTF8 mode. */
903 branchlength += GET2(cc,1);
906 if ((options & PCRE_UTF8) != 0)
908 while((*cc & 0x80) == 0x80) cc++;
914 branchlength += GET2(cc,1);
918 /* Handle single-char matchers */
927 case OP_NOT_WHITESPACE:
929 case OP_NOT_WORDCHAR:
936 /* The single-byte matcher isn't allowed */
941 /* Check a class for variable quantification */
945 cc += GET(cc, 1) - 33;
963 if (GET2(cc,1) != GET2(cc,3)) return -1;
964 branchlength += GET2(cc,1);
973 /* Anything else is variable length */
979 /* Control never gets here */
985 /*************************************************
986 * Scan compiled regex for numbered bracket *
987 *************************************************/
989 /* This little function scans through a compiled pattern until it finds a
990 capturing bracket with the given number.
993 code points to start of expression
994 utf8 TRUE in UTF-8 mode
995 number the required bracket number
997 Returns: pointer to the opcode for the bracket, or NULL if not found
1000 static const uschar *
1001 find_bracket(const uschar *code, BOOL utf8, int number)
1003 #ifndef SUPPORT_UTF8
1004 utf8 = utf8; /* Stop pedantic compilers complaining */
1009 register int c = *code;
1010 if (c == OP_END) return NULL;
1011 else if (c > OP_BRA)
1014 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1015 if (n == number) return (uschar *)code;
1016 code += _pcre_OP_lengths[OP_BRA];
1020 code += _pcre_OP_lengths[c];
1024 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1025 by a multi-byte character. The length in the table is a minimum, so we have
1026 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1027 can use relatively efficient code. */
1042 while ((*code & 0xc0) == 0x80) code++;
1045 /* XCLASS is used for classes that cannot be represented just by a bit
1046 map. This includes negated single high-valued characters. The length in
1047 the table is zero; the actual length is stored in the compiled code. */
1050 code += GET(code, 1) + 1;
1060 /*************************************************
1061 * Scan compiled regex for recursion reference *
1062 *************************************************/
1064 /* This little function scans through a compiled pattern until it finds an
1065 instance of OP_RECURSE.
1068 code points to start of expression
1069 utf8 TRUE in UTF-8 mode
1071 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1074 static const uschar *
1075 find_recurse(const uschar *code, BOOL utf8)
1077 #ifndef SUPPORT_UTF8
1078 utf8 = utf8; /* Stop pedantic compilers complaining */
1083 register int c = *code;
1084 if (c == OP_END) return NULL;
1085 else if (c == OP_RECURSE) return code;
1086 else if (c > OP_BRA)
1088 code += _pcre_OP_lengths[OP_BRA];
1092 code += _pcre_OP_lengths[c];
1096 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1097 by a multi-byte character. The length in the table is a minimum, so we have
1098 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1099 can use relatively efficient code. */
1114 while ((*code & 0xc0) == 0x80) code++;
1117 /* XCLASS is used for classes that cannot be represented just by a bit
1118 map. This includes negated single high-valued characters. The length in
1119 the table is zero; the actual length is stored in the compiled code. */
1122 code += GET(code, 1) + 1;
1132 /*************************************************
1133 * Scan compiled branch for non-emptiness *
1134 *************************************************/
1136 /* This function scans through a branch of a compiled pattern to see whether it
1137 can match the empty string or not. It is called from could_be_empty() below
1138 and from itself. Note that first_significant_code() skips over assertions. If we hit an
1139 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1140 whose current branch will already have been scanned.
1143 code points to start of search
1144 endcode points to where to stop
1145 utf8 TRUE if in UTF8 mode
1147 Returns: TRUE if what is matched could be empty
1151 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
/* Walks the items of one compiled branch and reports whether the branch can
match the empty string. Scanning starts 1 + LINK_SIZE bytes past `code`, and
every step goes through first_significant_code(), advancing by
_pcre_OP_lengths[c]. NOTE(review): this listing omits several lines of the
function (loop condition, switch/case labels, braces), so the comments below
describe only the statements that are visible. */
1154 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1156 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1158 const uschar *ccode;
/* A zero link at offset 1 is treated as an unclosed bracket, which counts as
"could be empty". */
1165 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1167 /* Scan a closed bracket */
1169 empty_branch = FALSE;
/* Test each alternative recursively; the recursive call is skipped once one
alternative has been found that can be empty. */
1172 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1173 empty_branch = TRUE;
/* Follow the link at offset 1 to the next item; the loop repeats while that
item is another OP_ALT. */
1174 code += GET(code, 1);
1176 while (*code == OP_ALT);
1177 if (!empty_branch) return FALSE; /* All branches are non-empty */
/* Step over the opcode and link of the item that ended the alternatives
(presumably the closing KET - confirm against the full source). */
1178 code += 1 + LINK_SIZE;
1184 /* Check for quantifiers after a class */
/* ccode is `code` plus the value stored at offset 1 of the item; it is the
position that CHECK_CLASS_REPEAT examines for a repeat opcode. The case label
selecting this path is not visible in this listing. */
1188 ccode = code + GET(code, 1);
1189 goto CHECK_CLASS_REPEAT;
1202 case OP_CRSTAR: /* These could be empty; continue */
1208 default: /* Non-repeat => class must match */
1209 case OP_CRPLUS: /* These repeats aren't empty */
/* The value read with GET2 at offset 1 from ccode is the repeat minimum; a
non-zero minimum means the class must match at least once. */
1215 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1220 /* Opcodes that must match a character */
1227 case OP_NOT_WHITESPACE:
1229 case OP_NOT_WORDCHAR:
1243 case OP_TYPEMINPLUS:
1255 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1256 followed by a multibyte character */
/* Advance code once for every byte at code[2] whose top two bits are 10 (the
pattern tested by the 0xc0/0x80 mask), so the extra bytes of the character are
skipped. */
1265 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1276 /*************************************************
1277 * Scan compiled regex for non-emptiness *
1278 *************************************************/
1280 /* This function is called to check for left recursive calls. We want to check
1281 the current branch of the current pattern to see if it could match the empty
1282 string. If it could, we must look outwards for branches at other levels,
1283 stopping when we pass beyond the bracket which is the subject of the recursion.
1286 code points to start of the recursion
1287 endcode points to where to stop (current RECURSE item)
1288 bcptr points to the chain of current (unclosed) branch starts
1289 utf8 TRUE if in UTF-8 mode
1291 Returns: TRUE if what is matched could be empty
1295 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
/* Walks the chain of unclosed branch starts outwards (via ->outer) for as long
as the recorded branch start is at or after `code`. The walk stops with FALSE
as soon as could_be_empty_branch() reports a branch that cannot match the empty
string. NOTE(review): the rest of the parameter list and the final return are
not visible in this listing. */
1298 while (bcptr != NULL && bcptr->current >= code)
1300 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1301 bcptr = bcptr->outer;
1308 /*************************************************
1309 * Check for POSIX class syntax *
1310 *************************************************/
1312 /* This function is called when the sequence "[:" or "[." or "[=" is
1313 encountered in a character class. It checks whether this is followed by an
1314 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1318 ptr pointer to the initial [
1319 endptr where to return the end pointer
1320 cd pointer to compile data
1322 Returns: TRUE or FALSE
1326 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
/* Checks the text after an opening '[' for the shape
<terminator>[^]letters<terminator>']'. The terminator is whatever character
immediately follows the position `ptr` points at on entry. NOTE(review): the
statements that store *endptr and return the result are not visible in this
listing. */
1328 int terminator; /* Don't combine these lines; the Solaris cc */
1329 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
/* Skip an optional '^' that follows the terminator character. */
1330 if (*(++ptr) == '^') ptr++;
/* Skip every character that the ctypes table flags as a letter. */
1331 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
/* The name is well formed only if it is closed by the same terminator
character followed directly by ']'. */
1332 if (*ptr == terminator && ptr[1] == ']')
1343 /*************************************************
1344 * Check POSIX class name *
1345 *************************************************/
1347 /* This function is called to check the name given in a POSIX-style class entry
1351 ptr points to the first letter
1352 len the length of the name
1354 Returns: a value representing the name, or -1 if unknown
1358 check_posix_name(const uschar *ptr, int len)
/* Looks the name up by scanning posix_name_lengths from index 0 until an entry
of zero is reached. An entry matches when its length equals `len` and the first
`len` bytes at `ptr` compare equal to posix_names[yield]; the matching index is
returned. NOTE(review): the index increment and the not-found return are not
visible in this listing. */
1360 register int yield = 0;
1361 while (posix_name_lengths[yield] != 0)
/* The length is compared first, so strncmp is only called for names of the
same length as the candidate. */
1363 if (len == posix_name_lengths[yield] &&
1364 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1371 /*************************************************
1372 * Adjust OP_RECURSE items in repeated group *
1373 *************************************************/
1375 /* OP_RECURSE items contain an offset from the start of the regex to the group
1376 that is referenced. This means that groups can be replicated for fixed
1377 repetition simply by copying (because the recursion is allowed to refer to
1378 earlier groups that are outside the current group). However, when a group is
1379 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1380 it, after it has been compiled. This means that any OP_RECURSE items within it
1381 that refer to the group itself or any contained groups have to have their
1382 offsets adjusted. That is the job of this function. Before it is called, the
1383 partially compiled regex must be temporarily terminated with OP_END.
1386 group points to the start of the group
1387 adjust the amount by which the group is to be moved
1388 utf8 TRUE in UTF-8 mode
1389 cd contains pointers to tables etc.
1395 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
/* Visits every item that find_recurse() locates from `group` onwards and
rewrites the offset stored at byte 1 of the item when that offset, taken
relative to cd->start_code, points at or beyond `group`. */
1397 uschar *ptr = group;
1398 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1400 int offset = GET(ptr, 1);
/* Offsets that resolve to an address before `group` are left unchanged. */
1401 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
/* Step past the opcode and its LINK_SIZE-byte offset before searching on. */
1402 ptr += 1 + LINK_SIZE;
1408 /*************************************************
1409 * Insert an automatic callout point *
1410 *************************************************/
1412 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1413 callout points before each pattern item.
1416 code current code pointer
1417 ptr current pattern pointer
1418 cd pointers to tables etc
1420 Returns: new code pointer
1424 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
/* Writes an OP_CALLOUT item at `code` and returns the pointer just past the
two LINK_SIZE-wide fields it fills in. NOTE(review): complete_callout() reads
the pattern offset at byte 2 of the item, which suggests one more byte is
emitted after the opcode; that statement is not visible in this listing. */
1426 *code++ = OP_CALLOUT;
/* First field: how far `ptr` is from the start of the pattern. */
1428 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
/* Second field: written as zero here; complete_callout() writes a real length
into the corresponding position later. */
1429 PUT(code, LINK_SIZE, 0); /* Default length */
1430 return code + 2*LINK_SIZE;
1435 /*************************************************
1436 * Complete a callout item *
1437 *************************************************/
1439 /* A callout item contains the length of the next item in the pattern, which
1440 we can't fill in till after we have reached the relevant point. This is used
1441 for both automatic and manual callouts.
1444 previous_callout points to previous callout item
1445 ptr current pattern pointer
1446 cd pointers to tables etc
1452 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
/* Fills in the length field of an earlier callout item. The length is the
current pattern offset (ptr - cd->start_pattern) minus the pattern offset that
was stored at byte 2 of the item; the result is written at byte
2 + LINK_SIZE. */
1454 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1455 PUT(previous_callout, 2 + LINK_SIZE, length);
1461 /*************************************************
1462 * Get othercase range *
1463 *************************************************/
1465 /* This function is passed the start and end of a class range, in UTF-8 mode
1466 with UCP support. It searches up the characters, looking for internal ranges of
1467 characters in the "other" case. Each call returns the next one, updating the
1471 cptr points to starting character value; updated
1473 ocptr where to put start of othercase range
1474 odptr where to put end of othercase range
1476 Yield: TRUE when range returned; FALSE when no more
1480 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
/* Searches the characters from *cptr up to d for a run whose other-case
values can be reported as a range. NOTE(review): this listing omits the loop
bodies, the stores through cptr/ocptr/odptr and the final return, so only the
visible statements are described below. */
1482 int c, chartype, othercase, next;
/* First scan: look for a character that _pcre_ucp_findchar() classifies as
ucp_L and for which it reports a non-zero other case. */
1484 for (c = *cptr; c <= d; c++)
1486 if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
/* Running past d means no such character exists in the remaining range. */
1490 if (c > d) return FALSE;
/* `next` is one more than the other-case value just found; presumably it is
the value the following character must map to for the run to continue
(the comparison itself is cut off in this listing). */
1493 next = othercase + 1;
/* Second scan: continue from the character after the one found. */
1495 for (++c; c <= d; c++)
1497 if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1508 #endif /* SUPPORT_UCP */
1511 /*************************************************
1512 * Compile one branch *
1513 *************************************************/
1515 /* Scan the pattern, compiling it into the code vector. If the options are
1516 changed during the branch, the pointer is used to change the external options
1520 optionsptr pointer to the option bits
1521 brackets points to number of extracting brackets used
1522 codeptr points to the pointer to the current code point
1523 ptrptr points to the current pattern pointer
1524 errorcodeptr points to error code variable
1525 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1526 reqbyteptr set to the last literal character required, else < 0
1527 bcptr points to current branch chain
1528 cd contains pointers to tables etc.
1530 Returns: TRUE on success
1531 FALSE, with *errorcodeptr set non-zero on error
1535 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1536 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1537 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1539 int repeat_type, op_type;
1540 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1542 int greedy_default, greedy_non_default;
1543 int firstbyte, reqbyte;
1544 int zeroreqbyte, zerofirstbyte;
1545 int req_caseopt, reqvary, tempreqvary;
1547 int options = *optionsptr;
1548 int after_manual_callout = 0;
1550 register uschar *code = *codeptr;
1552 BOOL inescq = FALSE;
1553 BOOL groupsetfirstbyte = FALSE;
1554 const uschar *ptr = *ptrptr;
1555 const uschar *tempptr;
1556 uschar *previous = NULL;
1557 uschar *previous_callout = NULL;
1558 uschar classbits[32];
1562 BOOL utf8 = (options & PCRE_UTF8) != 0;
1563 uschar *class_utf8data;
1564 uschar utf8_char[6];
1569 /* Set up the default and non-default settings for greediness */
1571 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1572 greedy_non_default = greedy_default ^ 1;
1574 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1575 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1576 matches a non-fixed char first char; reqbyte just remains unset if we never
1579 When we hit a repeat whose minimum is zero, we may have to adjust these values
1580 to take the zero repeat into account. This is implemented by setting them to
1581 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1582 item types that can be repeated set these backoff variables appropriately. */
1584 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1586 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1587 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1588 value > 255. It is added into the firstbyte or reqbyte variables to record the
1589 case status of the value. This is used only for ASCII characters. */
1591 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1593 /* Switch on next character until the end of the branch */
1598 BOOL possessive_quantifier;
1600 int class_charcount;
1610 /* Next byte in the pattern */
1614 /* If in \Q...\E, check for the end; if not, we have a literal */
1616 if (inescq && c != 0)
1618 if (c == '\\' && ptr[1] == 'E')
1626 if (previous_callout != NULL)
1628 complete_callout(previous_callout, ptr, cd);
1629 previous_callout = NULL;
1631 if ((options & PCRE_AUTO_CALLOUT) != 0)
1633 previous_callout = code;
1634 code = auto_callout(code, ptr, cd);
1640 /* Fill in length of a previous callout, except when the next thing is
1643 is_quantifier = c == '*' || c == '+' || c == '?' ||
1644 (c == '{' && is_counted_repeat(ptr+1));
1646 if (!is_quantifier && previous_callout != NULL &&
1647 after_manual_callout-- <= 0)
1649 complete_callout(previous_callout, ptr, cd);
1650 previous_callout = NULL;
1653 /* In extended mode, skip white space and comments */
1655 if ((options & PCRE_EXTENDED) != 0)
1657 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1660 /* The space before the ; is to avoid a warning on a silly compiler
1661 on the Macintosh. */
1662 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1663 if (c != 0) continue; /* Else fall through to handle end of string */
1667 /* No auto callout for quantifiers. */
1669 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1671 previous_callout = code;
1672 code = auto_callout(code, ptr, cd);
1677 /* The branch terminates at end of string, |, or ). */
1682 *firstbyteptr = firstbyte;
1683 *reqbyteptr = reqbyte;
1688 /* Handle single-character metacharacters. In multiline mode, ^ disables
1689 the setting of any following char as a first character. */
1692 if ((options & PCRE_MULTILINE) != 0)
1694 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1705 /* There can never be a first char if '.' is first, whatever happens about
1706 repeats. The value of reqbyte doesn't change either. */
1709 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1710 zerofirstbyte = firstbyte;
1711 zeroreqbyte = reqbyte;
1716 /* Character classes. If the included characters are all < 256 in value, we
1717 build a 32-byte bitmap of the permitted characters, except in the special
1718 case where there is only one such character. For negated classes, we build
1719 the map as usual, then invert it at the end. However, we use a different
1720 opcode so that data characters > 255 can be handled correctly.
1722 If the class contains characters outside the 0-255 range, a different
1723 opcode is compiled. It may optionally have a bit map for characters < 256,
1724 but those above are explicitly listed afterwards. A flag byte tells
1725 whether the bitmap is present, and whether this is a negated class or not.
1731 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1732 they are encountered at the top level, so we'll do that too. */
1734 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1735 check_posix_syntax(ptr, &tempptr, cd))
1737 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1741 /* If the first character is '^', set the negation flag and skip it. */
1743 if ((c = *(++ptr)) == '^')
1745 negate_class = TRUE;
1750 negate_class = FALSE;
1753 /* Keep a count of chars with values < 256 so that we can optimize the case
1754 of just a single character (as long as it's < 256). For higher valued UTF-8
1755 characters, we don't yet do any optimization. */
1757 class_charcount = 0;
1758 class_lastchar = -1;
1761 class_utf8 = FALSE; /* No chars >= 256 */
1762 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1765 /* Initialize the 32-char bit map to all zeros. We have to build the
1766 map in a temporary bit of store, in case the class contains only 1
1767 character (< 256), because in that case the compiled code doesn't use the
1770 memset(classbits, 0, 32 * sizeof(uschar));
1772 /* Process characters until ] is reached. By writing this as a "do" it
1773 means that an initial ] is taken as a data character. The first pass
1774 through the regex checked the overall syntax, so we don't need to be very
1775 strict here. At the start of the loop, c contains the first byte of the
1781 if (utf8 && c > 127)
1782 { /* Braces are required because the */
1783 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1787 /* Inside \Q...\E everything is literal except \E */
1791 if (c == '\\' && ptr[1] == 'E')
1797 else goto LONE_SINGLE_CHARACTER;
1800 /* Handle POSIX class names. Perl allows a negation extension of the
1801 form [:^name:]. A square bracket that doesn't match the syntax is
1802 treated as a literal. We also recognize the POSIX constructions
1803 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1807 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1808 check_posix_syntax(ptr, &tempptr, cd))
1810 BOOL local_negate = FALSE;
1812 register const uschar *cbits = cd->cbits;
1816 *errorcodeptr = ERR31;
1823 local_negate = TRUE;
1827 posix_class = check_posix_name(ptr, tempptr - ptr);
1828 if (posix_class < 0)
1830 *errorcodeptr = ERR30;
1834 /* If matching is caseless, upper and lower are converted to
1835 alpha. This relies on the fact that the class table starts with
1836 alpha, lower, upper as the first 3 entries. */
1838 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1841 /* Or into the map we are building up to 3 of the static class
1842 tables, or their negations. The [:blank:] class sets up the same
1843 chars as the [:space:] class (all white space). We remove the vertical
1844 white space chars afterwards. */
1847 for (i = 0; i < 3; i++)
1849 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1850 int taboffset = posix_class_maps[posix_class + i];
1851 if (taboffset < 0) break;
1855 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1857 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1858 if (blankclass) classbits[1] |= 0x3c;
1862 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1863 if (blankclass) classbits[1] &= ~0x3c;
1868 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1869 continue; /* End of POSIX syntax handling */
1872 /* Backslash may introduce a single character, or it may introduce one
1873 of the specials, which just set a flag. Escaped items are checked for
1874 validity in the pre-compiling pass. The sequence \b is a special case.
1875 Inside a class (and only there) it is treated as backspace. Elsewhere
1876 it marks a word boundary. Other escapes have preset maps ready to
1877 or into the one we are building. We assume they have more than one
1878 character in them, so set class_charcount bigger than one. */
1882 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1884 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1885 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1886 else if (-c == ESC_Q) /* Handle start of quoted string */
1888 if (ptr[1] == '\\' && ptr[2] == 'E')
1890 ptr += 2; /* avoid empty string */
1898 register const uschar *cbits = cd->cbits;
1899 class_charcount += 2; /* Greater than 1 is what matters */
1903 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1907 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1911 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1915 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1919 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1920 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1924 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1925 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1933 int property = get_ucp(&ptr, &negated, errorcodeptr);
1934 if (property < 0) goto FAILED;
1936 *class_utf8data++ = ((-c == ESC_p) != negated)?
1937 XCL_PROP : XCL_NOTPROP;
1938 *class_utf8data++ = property;
1939 class_charcount -= 2; /* Not a < 256 character */
1944 /* Unrecognized escapes are faulted if PCRE is running in its
1945 strict mode. By default, for compatibility with Perl, they are
1946 treated as literals. */
1949 if ((options & PCRE_EXTRA) != 0)
1951 *errorcodeptr = ERR7;
1954 c = *ptr; /* The final character */
1955 class_charcount -= 2; /* Undo the default count from above */
1959 /* Fall through if we have a single character (c >= 0). This may be
1960 > 256 in UTF-8 mode. */
1962 } /* End of backslash handling */
1964 /* A single character may be followed by '-' to form a range. However,
1965 Perl does not permit ']' to be the end of the range. A '-' character
1966 here is treated as a literal. */
1968 if (ptr[1] == '-' && ptr[2] != ']')
1975 { /* Braces are required because the */
1976 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1980 d = *ptr; /* Not UTF-8 mode */
1982 /* The second part of a range can be a single-character escape, but
1983 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1984 in such circumstances. */
1988 const uschar *oldptr = ptr;
1989 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1991 /* \b is backspace; \X is literal X; any other special means the '-'
1996 if (d == -ESC_b) d = '\b';
1997 else if (d == -ESC_X) d = 'X'; else
2000 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2005 /* The check that the two values are in the correct order happens in
2006 the pre-pass. Optimize one-character ranges */
2008 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2010 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2011 matching, we have to use an XCLASS with extra data items. Caseless
2012 matching for characters > 127 is available only if UCP support is
2016 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2020 /* With UCP support, we can find the other case equivalents of
2021 the relevant characters. There may be several ranges. Optimize how
2022 they fit with the basic range. */
2025 if ((options & PCRE_CASELESS) != 0)
2030 while (get_othercase_range(&cc, origd, &occ, &ocd))
2032 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2034 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2035 { /* if there is overlap, */
2036 c = occ; /* noting that if occ < c */
2037 continue; /* we can't have ocd > d */
2038 } /* because a subrange is */
2039 if (ocd > d && occ <= d + 1) /* always shorter than */
2040 { /* the basic range. */
2047 *class_utf8data++ = XCL_SINGLE;
2051 *class_utf8data++ = XCL_RANGE;
2052 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2054 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2057 #endif /* SUPPORT_UCP */
2059 /* Now record the original range, possibly modified for UCP caseless
2060 overlapping ranges. */
2062 *class_utf8data++ = XCL_RANGE;
2063 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2064 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2066 /* With UCP support, we are done. Without UCP support, there is no
2067 caseless matching for UTF-8 characters > 127; we can use the bit map
2068 for the smaller ones. */
2071 continue; /* With next character in the class */
2073 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2075 /* Adjust upper limit and fall through to set up the map */
2079 #endif /* SUPPORT_UCP */
2081 #endif /* SUPPORT_UTF8 */
2083 /* We use the bit map for all cases when not in UTF-8 mode; else
2084 ranges that lie entirely within 0-127 when there is UCP support; else
2085 for partial ranges without UCP support. */
2089 classbits[c/8] |= (1 << (c&7));
2090 if ((options & PCRE_CASELESS) != 0)
2092 int uc = cd->fcc[c]; /* flip case */
2093 classbits[uc/8] |= (1 << (uc&7));
2095 class_charcount++; /* in case a one-char range */
2099 continue; /* Go get the next char in the class */
2102 /* Handle a lone single character - we can get here for a normal
2103 non-escape char, or after \ that introduces a single character or for an
2104 apparent range that isn't. */
2106 LONE_SINGLE_CHARACTER:
2108 /* Handle a character that cannot go in the bit map */
2111 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2114 *class_utf8data++ = XCL_SINGLE;
2115 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2118 if ((options & PCRE_CASELESS) != 0)
2122 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2125 *class_utf8data++ = XCL_SINGLE;
2126 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2129 #endif /* SUPPORT_UCP */
2133 #endif /* SUPPORT_UTF8 */
2135 /* Handle a single-byte character */
2137 classbits[c/8] |= (1 << (c&7));
2138 if ((options & PCRE_CASELESS) != 0)
2140 c = cd->fcc[c]; /* flip case */
2141 classbits[c/8] |= (1 << (c&7));
2148 /* Loop until ']' reached; the check for end of string happens inside the
2149 loop. This "while" is the end of the "do" above. */
2151 while ((c = *(++ptr)) != ']' || inescq);
2153 /* If class_charcount is 1, we saw precisely one character whose value is
2154 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2155 can optimize the negative case only if there were no characters >= 128
2156 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2157 single-bytes only. This is an historical hangover. Maybe one day we can
2158 tidy these opcodes to handle multi-byte characters.
2160 The optimization throws away the bit map. We turn the item into a
2161 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2162 that OP_NOT does not support multibyte characters. In the positive case, it
2163 can cause firstbyte to be set. Otherwise, there can be no first char if
2164 this item is first, whatever repeat count may follow. In the case of
2165 reqbyte, save the previous value for reinstating. */
2168 if (class_charcount == 1 &&
2170 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2173 if (class_charcount == 1)
2176 zeroreqbyte = reqbyte;
2178 /* The OP_NOT opcode works on one-byte characters only. */
2182 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2183 zerofirstbyte = firstbyte;
2185 *code++ = class_lastchar;
2189 /* For a single, positive character, get the value into mcbuffer, and
2190 then we can handle this with the normal one-character code. */
2193 if (utf8 && class_lastchar > 127)
2194 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2198 mcbuffer[0] = class_lastchar;
2202 } /* End of 1-char optimization */
2204 /* The general case - not the one-char optimization. If this is the first
2205 thing in the branch, there can be no first char setting, whatever the
2206 repeat count. Any reqbyte setting must remain unchanged after any kind of
2209 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2210 zerofirstbyte = firstbyte;
2211 zeroreqbyte = reqbyte;
2213 /* If there are characters with values > 255, we have to compile an
2214 extended class, with its own opcode. If there are no characters < 256,
2215 we can omit the bitmap. */
2220 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2221 *code++ = OP_XCLASS;
2223 *code = negate_class? XCL_NOT : 0;
2225 /* If the map is required, install it, and move on to the end of
2228 if (class_charcount > 0)
2231 memcpy(code, classbits, 32);
2232 code = class_utf8data;
2235 /* If the map is not required, slide down the extra data. */
2239 int len = class_utf8data - (code + 33);
2240 memmove(code + 1, code + 33, len);
2244 /* Now fill in the complete length of the item */
2246 PUT(previous, 1, code - previous);
2247 break; /* End of class handling */
2251 /* If there are no characters > 255, negate the 32-byte map if necessary,
2252 and copy it into the code vector. If this is the first thing in the branch,
2253 there can be no first char setting, whatever the repeat count. Any reqbyte
2254 setting must remain unchanged after any kind of repeat. */
2258 *code++ = OP_NCLASS;
2259 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2264 memcpy(code, classbits, 32);
2269 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2270 has been tested above. */
2273 if (!is_quantifier) goto NORMAL_CHAR;
2274 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2275 if (*errorcodeptr != 0) goto FAILED;
2293 if (previous == NULL)
2295 *errorcodeptr = ERR9;
2299 if (repeat_min == 0)
2301 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2302 reqbyte = zeroreqbyte; /* Ditto */
2305 /* Remember whether this is a variable length repeat */
2307 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2309 op_type = 0; /* Default single-char op codes */
2310 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2312 /* Save start of previous item, in case we have to move it up to make space
2313 for an inserted OP_ONCE for the additional '+' extension. */
2315 tempcode = previous;
2317 /* If the next character is '+', we have a possessive quantifier. This
2318 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2319 If the next character is '?' this is a minimizing repeat, by default,
2320 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2321 repeat type to the non-default. */
2325 repeat_type = 0; /* Force greedy */
2326 possessive_quantifier = TRUE;
2329 else if (ptr[1] == '?')
2331 repeat_type = greedy_non_default;
2334 else repeat_type = greedy_default;
2336 /* If previous was a recursion, we need to wrap it inside brackets so that
2337 it can be replicated if necessary. */
2339 if (*previous == OP_RECURSE)
2341 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2342 code += 1 + LINK_SIZE;
2344 PUT(previous, 1, code - previous);
2346 PUT(code, 1, code - previous);
2347 code += 1 + LINK_SIZE;
2350 /* If previous was a character match, abolish the item and generate a
1351 repeat item instead. If a char item has a minimum of more than one, ensure
2352 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2353 the first thing in a branch because the x will have gone into firstbyte
2356 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2358 /* Deal with UTF-8 characters that take up more than one byte. It's
2359 easier to write this out separately than try to macrify it. Use c to
2360 hold the length of the character in bytes, plus 0x80 to flag that it's a
2361 length rather than a small character. */
2364 if (utf8 && (code[-1] & 0x80) != 0)
2366 uschar *lastchar = code - 1;
2367 while((*lastchar & 0xc0) == 0x80) lastchar--;
2368 c = code - lastchar; /* Length of UTF-8 character */
2369 memcpy(utf8_char, lastchar, c); /* Save the char */
2370 c |= 0x80; /* Flag c as a length */
2375 /* Handle the case of a single byte - either with no UTF8 support, or
2376 with UTF-8 disabled, or for a UTF-8 character < 128. */
2380 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2383 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2386 /* If previous was a single negated character ([^a] or similar), we use
2387 one of the special opcodes, replacing it. The code is shared with single-
1388 character repeats by setting op_type to add a suitable offset into
2389 repeat_type. OP_NOT is currently used only for single-byte chars. */
2391 else if (*previous == OP_NOT)
2393 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2395 goto OUTPUT_SINGLE_REPEAT;
2398 /* If previous was a character type match (\d or similar), abolish it and
2399 create a suitable repeat item. The code is shared with single-character
2400 repeats by setting op_type to add a suitable offset into repeat_type. Note
1401 that the Unicode property types will be present only when SUPPORT_UCP is
2402 defined, but we don't wrap the little bits of code here because it just
2403 makes it horribly messy. */
2405 else if (*previous < OP_EODN)
2409 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2412 OUTPUT_SINGLE_REPEAT:
2413 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2417 code = previous; /* Usually overwrite previous item */
2419 /* If the maximum is zero then the minimum must also be zero; Perl allows
2420 this case, so we do too - by simply omitting the item altogether. */
2422 if (repeat_max == 0) goto END_REPEAT;
2424 /* All real repeats make it impossible to handle partial matching (maybe
2425 one day we will be able to remove this restriction). */
2427 if (repeat_max != 1) cd->nopartial = TRUE;
2429 /* Combine the op_type with the repeat_type */
2431 repeat_type += op_type;
2433 /* A minimum of zero is handled either as the special case * or ?, or as
2434 an UPTO, with the maximum given. */
2436 if (repeat_min == 0)
2438 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2439 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2442 *code++ = OP_UPTO + repeat_type;
2443 PUT2INC(code, 0, repeat_max);
2447 /* A repeat minimum of 1 is optimized into some special cases. If the
1448 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2449 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2450 one less than the maximum. */
2452 else if (repeat_min == 1)
2454 if (repeat_max == -1)
2455 *code++ = OP_PLUS + repeat_type;
2458 code = oldcode; /* leave previous item in place */
2459 if (repeat_max == 1) goto END_REPEAT;
2460 *code++ = OP_UPTO + repeat_type;
2461 PUT2INC(code, 0, repeat_max - 1);
2465 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2466 handled as an EXACT followed by an UPTO. */
2470 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2471 PUT2INC(code, 0, repeat_min);
2473 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2474 we have to insert the character for the previous code. For a repeated
2475 Unicode property match, there is an extra byte that defines the
2476 required property. In UTF-8 mode, long characters have their length in
2477 c, with the 0x80 bit as a flag. */
2482 if (utf8 && c >= 128)
2484 memcpy(code, utf8_char, c & 7);
2491 if (prop_type >= 0) *code++ = prop_type;
2493 *code++ = OP_STAR + repeat_type;
2496 /* Else insert an UPTO if the max is greater than the min, again
2497 preceded by the character, for the previously inserted code. */
2499 else if (repeat_max != repeat_min)
2502 if (utf8 && c >= 128)
2504 memcpy(code, utf8_char, c & 7);
2510 if (prop_type >= 0) *code++ = prop_type;
2511 repeat_max -= repeat_min;
2512 *code++ = OP_UPTO + repeat_type;
2513 PUT2INC(code, 0, repeat_max);
2517 /* The character or character type itself comes last in all cases. */
2520 if (utf8 && c >= 128)
2522 memcpy(code, utf8_char, c & 7);
2529 /* For a repeated Unicode property match, there is an extra byte that
2530 defines the required property. */
2533 if (prop_type >= 0) *code++ = prop_type;
2537 /* If previous was a character class or a back reference, we put the repeat
2538 stuff after it, but just skip the item if the repeat was {0,0}. */
2540 else if (*previous == OP_CLASS ||
2541 *previous == OP_NCLASS ||
2543 *previous == OP_XCLASS ||
2545 *previous == OP_REF)
2547 if (repeat_max == 0)
2553 /* All real repeats make it impossible to handle partial matching (maybe
2554 one day we will be able to remove this restriction). */
2556 if (repeat_max != 1) cd->nopartial = TRUE;
2558 if (repeat_min == 0 && repeat_max == -1)
2559 *code++ = OP_CRSTAR + repeat_type;
2560 else if (repeat_min == 1 && repeat_max == -1)
2561 *code++ = OP_CRPLUS + repeat_type;
2562 else if (repeat_min == 0 && repeat_max == 1)
2563 *code++ = OP_CRQUERY + repeat_type;
2566 *code++ = OP_CRRANGE + repeat_type;
2567 PUT2INC(code, 0, repeat_min);
2568 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2569 PUT2INC(code, 0, repeat_max);
2573 /* If previous was a bracket group, we may have to replicate it in certain cases. */
2576 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2577 *previous == OP_COND)
2581 int len = code - previous;
2582 uschar *bralink = NULL;
2584 /* If the maximum repeat count is unlimited, find the end of the bracket
2585 by scanning through from the start, and compute the offset back to it
2586 from the current code pointer. There may be an OP_OPT setting following
2587 the final KET, so we can't find the end just by going back from the code pointer. */
2590 if (repeat_max == -1)
2592 register uschar *ket = previous;
2593 do ket += GET(ket, 1); while (*ket != OP_KET);
2594 ketoffset = code - ket;
2597 /* The case of a zero minimum is special because of the need to stick
2598 OP_BRAZERO in front of it, and because the group appears once in the
2599 data, whereas in other cases it appears the minimum number of times. For
2600 this reason, it is simplest to treat this case separately, as otherwise
2601 the code gets far too messy. There are several special subcases when the minimum is zero. */
2604 if (repeat_min == 0)
2606 /* If the maximum is also zero, we just omit the group from the output. */
2609 if (repeat_max == 0)
2615 /* If the maximum is 1 or unlimited, we just have to stick in the
2616 BRAZERO and do no more at this point. However, we do need to adjust
2617 any OP_RECURSE calls inside the group that refer to the group itself or
2618 any internal group, because the offset is from the start of the whole
2619 regex. Temporarily terminate the pattern while doing this. */
2621 if (repeat_max <= 1)
2624 adjust_recurse(previous, 1, utf8, cd);
2625 memmove(previous+1, previous, len);
2627 *previous++ = OP_BRAZERO + repeat_type;
2630 /* If the maximum is greater than 1 and limited, we have to replicate
2631 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2632 The first one has to be handled carefully because it's the original
2633 copy, which has to be moved up. The remainder can be handled by code
2634 that is common with the non-zero minimum case below. We have to
2635 adjust the value of repeat_max, since one less copy is required. Once
2636 again, we may have to adjust any OP_RECURSE calls inside the group. */
2642 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2643 memmove(previous + 2 + LINK_SIZE, previous, len);
2644 code += 2 + LINK_SIZE;
2645 *previous++ = OP_BRAZERO + repeat_type;
2646 *previous++ = OP_BRA;
2648 /* We chain together the bracket offset fields that have to be
2649 filled in later when the ends of the brackets are reached. */
2651 offset = (bralink == NULL)? 0 : previous - bralink;
2653 PUTINC(previous, 0, offset);
2659 /* If the minimum is greater than zero, replicate the group as many
2660 times as necessary, and adjust the maximum to the number of subsequent
2661 copies that we need. If we set a first char from the group, and didn't
2662 set a required char, copy the latter from the former. */
2668 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2669 for (i = 1; i < repeat_min; i++)
2671 memcpy(code, previous, len);
2675 if (repeat_max > 0) repeat_max -= repeat_min;
2678 /* This code is common to both the zero and non-zero minimum cases. If
2679 the maximum is limited, it replicates the group in a nested fashion,
2680 remembering the bracket starts on a stack. In the case of a zero minimum,
2681 the first one was set up above. In all cases the repeat_max now specifies
2682 the number of additional copies needed. */
2684 if (repeat_max >= 0)
2686 for (i = repeat_max - 1; i >= 0; i--)
2688 *code++ = OP_BRAZERO + repeat_type;
2690 /* All but the final copy start a new nesting, maintaining the
2691 chain of brackets outstanding. */
2697 offset = (bralink == NULL)? 0 : code - bralink;
2699 PUTINC(code, 0, offset);
2702 memcpy(code, previous, len);
2706 /* Now chain through the pending brackets, and fill in their length
2707 fields (which are holding the chain links pro tem). */
2709 while (bralink != NULL)
2712 int offset = code - bralink + 1;
2713 uschar *bra = code - offset;
2714 oldlinkoffset = GET(bra, 1);
2715 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2717 PUTINC(code, 0, offset);
2718 PUT(bra, 1, offset);
2722 /* If the maximum is unlimited, set a repeater in the final copy. We
2723 can't just offset backwards from the current code point, because we
2724 don't know if there's been an options resetting after the ket. The
2725 correct offset was computed above. */
2727 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2730 /* Else there's some kind of shambles */
2734 *errorcodeptr = ERR11;
2738 /* If the character following a repeat is '+', we wrap the entire repeated
2739 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2740 Sun's Java package. The repeated item starts at tempcode, not at previous,
2741 which might be the first part of a string whose (former) last char we
2742 repeated. However, we don't support '+' after a greediness '?'. */
2744 if (possessive_quantifier)
2746 int len = code - tempcode;
2747 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2748 code += 1 + LINK_SIZE;
2749 len += 1 + LINK_SIZE;
2750 tempcode[0] = OP_ONCE;
2752 PUTINC(code, 0, len);
2753 PUT(tempcode, 1, len);
2756 /* In all cases we no longer have a previous item. We also set the
2757 "follows varying string" flag for subsequently encountered reqbytes if
2758 it isn't already set and we have just passed a varying length item. */
2762 cd->req_varyopt |= reqvary;
2766 /* Start of nested bracket sub-expression, or comment or lookahead or
2767 lookbehind or option setting or condition. First deal with special things
2768 that can come after a bracket; all are introduced by ?, and the appearance
2769 of any of them means that this is not a referencing group. They were
2770 checked for validity in the first pass over the string, so we don't have to
2771 check for syntax errors here. */
2774 newoptions = options;
2777 if (*(++ptr) == '?')
2784 case '#': /* Comment; skip to ket */
2786 while (*ptr != ')') ptr++;
2789 case ':': /* Non-extracting bracket */
2795 bravalue = OP_COND; /* Conditional group */
2797 /* Condition to test for recursion */
2801 code[1+LINK_SIZE] = OP_CREF;
2802 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
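2807 /* Condition to test for a numbered subpattern match. We know that
2808 if a digit follows ( then there will just be digits until ) because
2809 the syntax was checked in the first pass. NOTE(review): the test below
combines digitab[] and ctype_digit with "&&" (logical AND), so the mask is not
applied; the other digitab tests in this file use "&" - confirm and correct. */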
2811 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2813 int condref; /* Don't amalgamate; some compilers */
2814 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2815 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2818 *errorcodeptr = ERR35;
2822 code[1+LINK_SIZE] = OP_CREF;
2823 PUT2(code, 2+LINK_SIZE, condref);
2826 /* For conditions that are assertions, we just fall through, having
2827 set bravalue above. */
2830 case '=': /* Positive lookahead */
2831 bravalue = OP_ASSERT;
2835 case '!': /* Negative lookahead */
2836 bravalue = OP_ASSERT_NOT;
2840 case '<': /* Lookbehinds */
2843 case '=': /* Positive lookbehind */
2844 bravalue = OP_ASSERTBACK;
2848 case '!': /* Negative lookbehind */
2849 bravalue = OP_ASSERTBACK_NOT;
2855 case '>': /* One-time brackets */
2860 case 'C': /* Callout - may be followed by digits; */
2861 previous_callout = code; /* Save for later completion */
2862 after_manual_callout = 1; /* Skip one item before completing */
2863 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2864 { /* closing parenthesis is present. */
2866 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2867 n = n * 10 + *ptr - '0';
2870 *errorcodeptr = ERR38;
2874 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2875 PUT(code, LINK_SIZE, 0); /* Default length */
2876 code += 2 * LINK_SIZE;
2881 case 'P': /* Named subpattern handling */
2882 if (*(++ptr) == '<') /* Definition */
2885 uschar *slot = cd->name_table;
2886 const uschar *name; /* Don't amalgamate; some compilers */
2887 name = ++ptr; /* grumble at autoincrement in declaration */
2889 while (*ptr++ != '>');
2890 namelen = ptr - name - 1;
2892 for (i = 0; i < cd->names_found; i++)
2894 int crc = memcmp(name, slot+2, namelen);
2897 if (slot[2+namelen] == 0)
2899 *errorcodeptr = ERR43;
2902 crc = -1; /* Current name is substring */
2906 memmove(slot + cd->name_entry_size, slot,
2907 (cd->names_found - i) * cd->name_entry_size);
2910 slot += cd->name_entry_size;
2913 PUT2(slot, 0, *brackets + 1);
2914 memcpy(slot + 2, name, namelen);
2915 slot[2+namelen] = 0;
2917 goto NUMBERED_GROUP;
2920 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2924 const uschar *name = ptr;
2925 uschar *slot = cd->name_table;
2927 while (*ptr != ')') ptr++;
2928 namelen = ptr - name;
2930 for (i = 0; i < cd->names_found; i++)
2932 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2933 slot += cd->name_entry_size;
2935 if (i >= cd->names_found)
2937 *errorcodeptr = ERR15;
2941 recno = GET2(slot, 0);
2943 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2945 /* Back reference */
2949 PUT2INC(code, 0, recno);
2950 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2951 if (recno > cd->top_backref) cd->top_backref = recno;
2955 /* Should never happen */
2958 case 'R': /* Pattern recursion */
2959 ptr++; /* Same as (?0) */
2962 /* Recursion or "subroutine" call */
2964 case '0': case '1': case '2': case '3': case '4':
2965 case '5': case '6': case '7': case '8': case '9':
2967 const uschar *called;
2969 while((digitab[*ptr] & ctype_digit) != 0)
2970 recno = recno * 10 + *ptr++ - '0';
2972 /* Come here from code above that handles a named recursion */
2978 /* Find the bracket that is being referenced. Temporarily end the
2979 regex in case it doesn't exist. */
2982 called = (recno == 0)?
2983 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2987 *errorcodeptr = ERR15;
2991 /* If the subpattern is still open, this is a recursive call. We
2992 check to see if this is a left recursion that could loop for ever,
2993 and diagnose that case. */
2995 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2997 *errorcodeptr = ERR40;
3001 /* Insert the recursion/subroutine item */
3004 PUT(code, 1, called - cd->start_code);
3005 code += 1 + LINK_SIZE;
3009 /* Character after (? not specially recognized */
3011 default: /* Option setting */
3015 while (*ptr != ')' && *ptr != ':')
3019 case '-': optset = &unset; break;
3021 case 'i': *optset |= PCRE_CASELESS; break;
3022 case 'm': *optset |= PCRE_MULTILINE; break;
3023 case 's': *optset |= PCRE_DOTALL; break;
3024 case 'x': *optset |= PCRE_EXTENDED; break;
3025 case 'U': *optset |= PCRE_UNGREEDY; break;
3026 case 'X': *optset |= PCRE_EXTRA; break;
3030 /* Set up the changed option bits, but don't change anything yet. */
3032 newoptions = (options | set) & (~unset);
3034 /* If the options ended with ')' this is not the start of a nested
3035 group with option changes, so the options change at this level. Compile
3036 code to change the ims options if this setting actually changes any of
3037 them. We also pass the new setting back so that it can be put at the
3038 start of any following branches, and when this group ends (if we are in
3039 a group), a resetting item can be compiled.
3041 Note that if this item is right at the start of the pattern, the
3042 options will have been abstracted and made global, so there will be no
3043 change to compile. */
3047 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3050 *code++ = newoptions & PCRE_IMS;
3053 /* Change options at this level, and pass them back for use
3054 in subsequent branches. Reset the greedy defaults and the case
3055 value for firstbyte and reqbyte. */
3057 *optionsptr = options = newoptions;
3058 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3059 greedy_non_default = greedy_default ^ 1;
3060 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3062 previous = NULL; /* This item can't be repeated */
3063 continue; /* It is complete */
3066 /* If the options ended with ':' we are heading into a nested group
3067 with possible change of options. Such groups are non-capturing and are
3068 not assertions of any kind. All we need to do is skip over the ':';
3069 the newoptions value is handled below. */
3076 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3077 non-capturing and behave like (?:...) brackets */
3079 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3084 /* Else we have a referencing group; adjust the opcode. If the bracket
3085 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3086 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3091 if (++(*brackets) > EXTRACT_BASIC_MAX)
3093 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3094 code[1+LINK_SIZE] = OP_BRANUMBER;
3095 PUT2(code, 2+LINK_SIZE, *brackets);
3098 else bravalue = OP_BRA + *brackets;
3101 /* Process nested bracketed re. Assertions may not be repeated, but other
3102 kinds can be. We copy code into a non-register variable in order to be able
3103 to pass its address because some compilers complain otherwise. Pass in a
3104 new setting for the ims options if they have changed. */
3106 previous = (bravalue >= OP_ONCE)? code : NULL;
3109 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3112 newoptions, /* The complete new option state */
3113 options & PCRE_IMS, /* The previous ims option state */
3114 brackets, /* Extracting bracket count */
3115 &tempcode, /* Where to put code (updated) */
3116 &ptr, /* Input pointer (updated) */
3117 errorcodeptr, /* Where to put an error message */
3118 (bravalue == OP_ASSERTBACK ||
3119 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3120 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3121 &subfirstbyte, /* For possible first char */
3122 &subreqbyte, /* For possible last char */
3123 bcptr, /* Current branch chain */
3124 cd)) /* Tables block */
3127 /* At the end of compiling, code is still pointing to the start of the
3128 group, while tempcode has been updated to point past the end of the group
3129 and any option resetting that may follow it. The pattern pointer (ptr)
3130 is on the bracket. */
3132 /* If this is a conditional bracket, check that there are no more than
3133 two branches in the group. */
3135 else if (bravalue == OP_COND)
3144 while (*tc != OP_KET);
3148 *errorcodeptr = ERR27;
3152 /* If there is just one branch, we must not make use of its firstbyte or
3153 reqbyte, because this is equivalent to an empty second branch. */
3155 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3158 /* Handle updating of the required and first characters. Update for normal
3159 brackets of all kinds, and conditions with two branches (see code above).
3160 If the bracket is followed by a quantifier with zero repeat, we have to
3161 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3162 main loop so that they can be accessed for the back off. */
3164 zeroreqbyte = reqbyte;
3165 zerofirstbyte = firstbyte;
3166 groupsetfirstbyte = FALSE;
3168 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3170 /* If we have not yet set a firstbyte in this branch, take it from the
3171 subpattern, remembering that it was set here so that a repeat of more
3172 than one can replicate it as reqbyte if necessary. If the subpattern has
3173 no firstbyte, set "none" for the whole branch. In both cases, a zero
3174 repeat forces firstbyte to "none". */
3176 if (firstbyte == REQ_UNSET)
3178 if (subfirstbyte >= 0)
3180 firstbyte = subfirstbyte;
3181 groupsetfirstbyte = TRUE;
3183 else firstbyte = REQ_NONE;
3184 zerofirstbyte = REQ_NONE;
3187 /* If firstbyte was previously set, convert the subpattern's firstbyte
3188 into reqbyte if there wasn't one, using the vary flag that was in
3189 existence beforehand. */
3191 else if (subfirstbyte >= 0 && subreqbyte < 0)
3192 subreqbyte = subfirstbyte | tempreqvary;
3194 /* If the subpattern set a required byte (or set a first byte that isn't
3195 really the first byte - see above), set it. */
3197 if (subreqbyte >= 0) reqbyte = subreqbyte;
3200 /* For a forward assertion, we take the reqbyte, if set. This can be
3201 helpful if the pattern that follows the assertion doesn't set a different
3202 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3203 for an assertion, however, because it leads to incorrect effects for patterns
3204 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3205 of a firstbyte. This is overcome by a scan at the end if there's no
3206 firstbyte, looking for an asserted first char. */
3208 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3210 /* Now update the main code pointer to the end of the group. */
3214 /* Error if hit end of pattern */
3218 *errorcodeptr = ERR14;
3223 /* Check \ for being a real metacharacter; if not, fall through and handle
3224 it as a data character at the start of a string. Escape items are checked
3225 for validity in the pre-compiling pass. */
3229 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3231 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3232 are arranged to be the negation of the corresponding OP_values. For the
3233 back references, the values are ESC_REF plus the reference number. Only
3234 back references and those types that consume a character may be repeated.
3235 We can test for values between ESC_b and ESC_Z for the latter; this may
3236 have to change if any new ones are ever created. */
3240 if (-c == ESC_Q) /* Handle start of quoted string */
3242 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3247 /* For metasequences that actually match a character, we disable the
3248 setting of a first character if it hasn't already been set. */
3250 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3251 firstbyte = REQ_NONE;
3253 /* Set values to reset to if this is followed by a zero repeat. */
3255 zerofirstbyte = firstbyte;
3256 zeroreqbyte = reqbyte;
3258 /* Back references are handled specially */
3262 int number = -c - ESC_REF;
3265 PUT2INC(code, 0, number);
3268 /* So are Unicode property matches, if supported. We know that get_ucp
3269 won't fail because it was tested in the pre-pass. */
3272 else if (-c == ESC_P || -c == ESC_p)
3275 int value = get_ucp(&ptr, &negated, errorcodeptr);
3277 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3282 /* For the rest, we can obtain the OP value by negating the escape value. */
3287 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3293 /* We have a data character whose value is in c. In UTF-8 mode it may have
3294 a value > 127. We set its representation in the length/buffer, and then
3295 handle it as a data character. */
3298 if (utf8 && c > 127)
3299 mclength = _pcre_ord2utf8(c, mcbuffer);
3310 /* Handle a literal character. It is guaranteed not to be whitespace or #
3311 when the extended flag is set. If we are in UTF-8 mode, it may be a
3312 multi-byte literal character. */
3320 if (utf8 && (c & 0xc0) == 0xc0)
3322 while ((ptr[1] & 0xc0) == 0x80)
3323 mcbuffer[mclength++] = *(++ptr);
3327 /* At this point we have the character's bytes in mcbuffer, and the length
3328 in mclength. When not in UTF-8 mode, the length is always 1. */
3332 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3333 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3335 /* Set the first and required bytes appropriately. If no previous first
3336 byte, set it from this character, but revert to none on a zero repeat.
3337 Otherwise, leave the firstbyte value alone, and don't change it on a zero repeat. */
3340 if (firstbyte == REQ_UNSET)
3342 zerofirstbyte = REQ_NONE;
3343 zeroreqbyte = reqbyte;
3345 /* If the character is more than one byte long, we can set firstbyte
3346 only if it is not to be matched caselessly. */
3348 if (mclength == 1 || req_caseopt == 0)
3350 firstbyte = mcbuffer[0] | req_caseopt;
3351 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3353 else firstbyte = reqbyte = REQ_NONE;
3356 /* firstbyte was previously set; we can set reqbyte only if the length is
3357 1 or the matching is caseful. */
3361 zerofirstbyte = firstbyte;
3362 zeroreqbyte = reqbyte;
3363 if (mclength == 1 || req_caseopt == 0)
3364 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3367 break; /* End of literal character handling */
3369 } /* end of big loop */
3371 /* Control never reaches here by falling through, only by a goto for all the
3372 error states. Pass back the position in the pattern so that it can be displayed
3373 to the user for diagnosing the error. */
3383 /*************************************************
3384 * Compile sequence of alternatives *
3385 *************************************************/
3387 /* On entry, ptr is pointing past the bracket character, but on return
3388 it points to the closing bracket, or vertical bar, or end of string.
3389 The code variable is pointing at the byte into which the BRA operator has been
3390 stored. If the ims options are changed at the start (for a (?ims: group) or
3391 during any branch, we need to insert an OP_OPT item at the start of every
3392 following branch to ensure they get set correctly at run time, and also pass
3393 the new options into every subsequent branch compile.
Each branch is compiled by compile_branch(). The firstbyte and reqbyte values
of the branches are merged as described in the comments below, and the merged
values are handed back through firstbyteptr and reqbyteptr.
Arguments:
3396 options option bits, including any changes for this subpattern
3397 oldims previous settings of ims option bits
3398 brackets -> int containing the number of extracting brackets used
3399 codeptr -> the address of the current code pointer
3400 ptrptr -> the address of the current pattern pointer
3401 errorcodeptr -> pointer to error code variable
3402 lookbehind TRUE if this is a lookbehind assertion
3403 skipbytes skip this many bytes after the opening opcode and its link field (for OP_COND, OP_BRANUMBER)
3404 firstbyteptr place to put the first required character, or a negative number when none is known
3405 reqbyteptr place to put the last required character, or a negative number when none is known
3406 bcptr pointer to the chain of currently open branches
3407 cd points to the data block with tables pointers etc.
3409 Returns: TRUE on success (when a problem is found, an error number is stored in *errorcodeptr)
3413 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3414 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3415 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3417 const uschar *ptr = *ptrptr;
3418 uschar *code = *codeptr;
3419 uschar *last_branch = code;
3420 uschar *start_bracket = code;
3421 uschar *reverse_count = NULL;
3422 int firstbyte, reqbyte;
3423 int branchfirstbyte, branchreqbyte;
3429 firstbyte = reqbyte = REQ_UNSET;
3431 /* Offset is set zero to mark that this bracket is still open */
3434 code += 1 + LINK_SIZE + skipbytes;
3436 /* Loop for each alternative branch */
3440 /* Handle a change of ims options at the start of the branch */
3442 if ((options & PCRE_IMS) != oldims)
3445 *code++ = options & PCRE_IMS;
3448 /* Set up dummy OP_REVERSE if lookbehind assertion */
3452 *code++ = OP_REVERSE;
3453 reverse_count = code;
3457 /* Now compile the branch */
3459 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3460 &branchfirstbyte, &branchreqbyte, &bc, cd))
3466 /* If this is the first branch, the firstbyte and reqbyte values for the
3467 branch become the values for the regex. The first branch is recognized by
last_branch not yet addressing an OP_ALT item. */
3469 if (*last_branch != OP_ALT)
3471 firstbyte = branchfirstbyte;
3472 reqbyte = branchreqbyte;
3475 /* If this is not the first branch, the first char and reqbyte have to
3476 match the values from all the previous branches, except that if the previous
3477 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3478 REQ_VARY for the regex. */
3482 /* If we previously had a firstbyte, but it doesn't match the new branch,
3483 we have to abandon the firstbyte for the regex, but if there was previously
3484 no reqbyte, it takes on the value of the old firstbyte. */
3486 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3488 if (reqbyte < 0) reqbyte = firstbyte;
3489 firstbyte = REQ_NONE;
3492 /* If we (now or from before) have no firstbyte, a firstbyte from the
3493 branch becomes a reqbyte if there isn't a branch reqbyte. */
3495 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3496 branchreqbyte = branchfirstbyte;
3498 /* Now ensure that the reqbytes match, ignoring the REQ_VARY flag */
3500 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3502 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3505 /* If lookbehind, check that this branch matches a fixed-length string,
3506 and put the length into the OP_REVERSE item. Temporarily mark the end of
3507 the branch with OP_END. When the length is rejected, a value of -2 is
reported as ERR36 and any other value as ERR25. */
3513 length = find_fixedlength(last_branch, options);
3514 DPRINTF(("fixed length = %d\n", length));
3517 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3521 PUT(reverse_count, 0, length);
3524 /* Reached end of expression, either ')' or end of pattern. Go back through
3525 the alternative branches and reverse the chain of offsets, with the field in
3526 the BRA item now becoming an offset to the first alternative. If there are
3527 no alternatives, it points to the end of the group. The length in the
3528 terminating ket is always the length of the whole bracketed item. If any of
3529 the ims options were changed inside the group, compile a resetting op-code
3530 following, except at the very end of the pattern. Return leaving the pointer
3531 at the terminating char. While the group is open, each link field holds the
distance back to the previous branch; the code below reads that value, stores
the forward length in its place, and steps back by the value it read. */
3535 int length = code - last_branch;
3538 int prev_length = GET(last_branch, 1);
3539 PUT(last_branch, 1, length);
3540 length = prev_length;
3541 last_branch -= length;
3545 /* Fill in the ket; its link field is the distance back to start_bracket */
3548 PUT(code, 1, code - start_bracket);
3549 code += 1 + LINK_SIZE;
3551 /* Resetting option if needed: the ims bits differ from oldims and the group was closed by ')' */
3553 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3559 /* Set values to pass back */
3563 *firstbyteptr = firstbyte;
3564 *reqbyteptr = reqbyte;
3568 /* Another branch follows; insert an "or" node. Its length field points back
3569 to the previous branch while the bracket remains open. At the end the chain
3570 is reversed. It's done like this so that the start of the bracket has a
3571 zero offset until it is closed, making it possible to detect recursion. */
3574 PUT(code, 1, code - last_branch);
3575 bc.current = last_branch = code;
3576 code += 1 + LINK_SIZE;
3579 /* Control never reaches here */
3585 /*************************************************
3586 * Check for anchored expression *
3587 *************************************************/
3589 /* Try to find out if this is an anchored regular expression. Consider each
3590 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3591 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3592 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3593 counts, since OP_CIRC can match in the middle.
3595 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3596 This is the code for \G, which means "match at start of match position, taking
3597 into account the match offset".
3599 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3600 because that will try the rest of the pattern at all possible matching points,
3601 so there is no point trying again.... er ....
3603 .... except when the .* appears inside capturing parentheses, and there is a
3604 subsequent back reference to those parentheses. We haven't enough information
3605 to catch that case precisely.
3607 At first, the best we could do was to detect when .* was in capturing brackets
3608 and the highest back reference was greater than or equal to that level.
3609 However, by keeping a bitmap of the first 31 back references, we can catch some
3610 of the more common cases more precisely.
Arguments:
3613 code points to start of expression (the bracket)
3614 options points to the options setting (tested for PCRE_DOTALL and PCRE_MULTILINE)
3615 bracket_map a bitmap of which brackets we are inside while testing; this
3616 handles up to substring 31; after that we just have to take
3617 the less precise approach (all higher-numbered groups share bit 0)
3618 backref_map the back reference bitmap
3620 Returns: TRUE if the expression is judged to be anchored, FALSE otherwise
3624 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3625 unsigned int backref_map)
3628 const uschar *scode =
3629 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3630 register int op = *scode;
3632 /* Capturing brackets. When op exceeds EXTRACT_BASIC_MAX the group number is
read from the 2-byte field at offset 2+LINK_SIZE. Group n (n < 32) is recorded
as bit n of the map; higher-numbered groups all share bit 0.
NOTE(review): "1 << op" is an int shift, so op == 31 reaches the sign bit where
int is 32 bits wide; "1u << op" would avoid that - confirm before changing. */
3638 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3639 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3640 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3643 /* Other brackets: recurse with the bracket map unchanged */
3645 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3647 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3650 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3651 are or may be referenced. The repeated type must be OP_ANY, and no bit may
be set in both bracket_map and backref_map. */
3653 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3654 (*options & PCRE_DOTALL) != 0)
3656 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3659 /* Check for explicit anchoring: OP_SOD and OP_SOM always qualify, OP_CIRC only when PCRE_MULTILINE is not set */
3661 else if (op != OP_SOD && op != OP_SOM &&
3662 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3664 code += GET(code, 1);
3666 while (*code == OP_ALT); /* Loop for each alternative */
3672 /*************************************************
3673 * Check for starting with ^ or .* *
3674 *************************************************/
3676 /* This is called to find out if every branch starts with ^ or .* so that
3677 "first char" processing can be done to speed things up in multiline
3678 matching and for non-DOTALL patterns that start with .* (which must start at
3679 the beginning or after \n). As in the case of is_anchored() (see above), we
3680 have to take account of back references to capturing brackets that contain .*
3681 because in that case we can't make the assumption.
Arguments:
3684 code points to start of expression (the bracket)
3685 bracket_map a bitmap of which brackets we are inside while testing; this
3686 handles up to substring 31; after that we just have to take
3687 the less precise approach (all higher-numbered groups share bit 0)
3688 backref_map the back reference bitmap
3690 Returns: TRUE if every alternative is judged to start with ^ or .*, FALSE otherwise
3694 is_startline(const uschar *code, unsigned int bracket_map,
3695 unsigned int backref_map)
3698 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3700 register int op = *scode;
3702 /* Capturing brackets. When op exceeds EXTRACT_BASIC_MAX the group number is
read from the 2-byte field at offset 2+LINK_SIZE. Group n (n < 32) is recorded
as bit n of the map; higher-numbered groups all share bit 0.
NOTE(review): "1 << op" is an int shift, so op == 31 reaches the sign bit where
int is 32 bits wide; "1u << op" would avoid that - confirm before changing. */
3708 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3709 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3710 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3713 /* Other brackets: recurse with the bracket map unchanged */
3715 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3716 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3718 /* .* means "start at start or after \n" if it isn't in brackets that
3719 may be referenced. The repeated type must be OP_ANY, and no bit may be set
in both bracket_map and backref_map. */
3721 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3723 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3726 /* Check for explicit circumflex; anything else means the answer is FALSE */
3728 else if (op != OP_CIRC) return FALSE;
3730 /* Move on to the next alternative */
3732 code += GET(code, 1);
3734 while (*code == OP_ALT); /* Loop for each alternative */
3740 /*************************************************
3741 * Check for asserted fixed first char *
3742 *************************************************/
3744 /* During compilation, the "first char" settings from forward assertions are
3745 discarded, because they can cause conflicts with actual literals that follow.
3746 However, if we end up without a first char setting for an unanchored pattern,
3747 it is worth scanning the regex to see if there is an initial asserted first
3748 char. If all branches start with the same asserted char, or with a bracket all
3749 of whose alternatives start with the same asserted char (recurse ad lib), then
3750 we return that char, otherwise -1.
3753 code points to start of expression (the bracket)
3754 options pointer to the options (used to check casing changes)
3755 inassert TRUE if in an assertion
3757 Returns: -1 or the fixed first char
/* find_firstassertedchar: scan every alternative of the bracket at `code`
looking for a single character that all alternatives agree on, and return -1
as soon as they disagree or an alternative offers no candidate.
  code      start of the bracket being scanned
  options   pointer to the option bits; also passed to
            first_significant_code() together with PCRE_CASELESS
  inassert  TRUE when the bracket being scanned is an assertion
Several lines of the switch that drives this function are missing from this
listing, so only the visible statements are described below. */
3761 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
/* c holds the character agreed so far; -1 means none has been seen yet. */
3763 register int c = -1;
3766 const uschar *scode =
3767 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3768 register int op = *scode;
/* Fold every opcode at or above OP_BRA into OP_BRA (presumably so that the
numbered capturing brackets share one case - TODO confirm against the switch). */
3770 if (op >= OP_BRA) op = OP_BRA;
/* Recurse into a nested group; the third argument is TRUE only when the
nested group is an OP_ASSERT. */
3781 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* The first result is adopted; any later result must match it exactly. */
3783 if (c < 0) c = d; else if (c != d) return -1;
3786 case OP_EXACT: /* Fall through */
/* A character item is accepted only while inside an assertion. */
3793 if (!inassert) return -1;
/* Tag the candidate as caseless when PCRE_CASELESS is currently set. */
3797 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* Otherwise the candidate must equal the byte that follows the opcode. */
3799 else if (c != scode[1]) return -1;
/* Advance by the link value to the next alternative, if there is one. */
3803 code += GET(code, 1);
3805 while (*code == OP_ALT);
3811 /*************************************************
3812 * Compile a Regular Expression *
3813 *************************************************/
3815 /* This function takes a string and returns a pointer to a block of store
3816 holding a compiled version of the expression. The original API for this
3817 function had no error code return variable; it is retained for backwards
3818 compatibility. The new function is given a new name.
3821 pattern the regular expression
3822 options various option bits
3823 errorcodeptr pointer to error code variable (pcre_compile2() only)
3824 can be NULL if you don't want a code value
3825 errorptr pointer to pointer to error text
3826 erroroffset ptr offset in pattern where error was detected
3827 tables pointer to character tables or NULL
3829 Returns: pointer to compiled data block, or NULL on error,
3830 with errorptr and erroroffset set
/* pcre_compile: original entry point, kept for backwards compatibility. It
forwards all of its arguments to pcre_compile2() and passes NULL for
errorcodeptr, so callers of this function receive no numeric error code; the
error text and offset are still reported through errorptr and erroroffset. */
3834 pcre_compile(const char *pattern, int options, const char **errorptr,
3835 int *erroroffset, const unsigned char *tables)
3837 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
/* pcre_compile2: compile a pattern into a real_pcre block.
Visible phases, in order:
  1. Argument checks: a NULL errorptr or erroroffset is rejected, the pattern
     is validated as UTF-8 when PCRE_UTF8 is set without PCRE_NO_UTF8_CHECK,
     and option bits outside PUBLIC_OPTIONS are rejected.
  2. Character table pointers are set up in compile_block (the default
     tables are used when `tables` is NULL).
  3. A first pass over the pattern accumulates `length`, an over-estimate of
     the compiled size, and records back references, bracket nesting and the
     longest subpattern name.
  4. The block is obtained through pcre_malloc and its header is filled in.
  5. compile_regex() generates the code; the result is then checked for
     excess brackets, overflow of the estimated length (ERR23) and back
     references to non-existent brackets (ERR15).
  6. Post-processing sets PCRE_ANCHORED, PCRE_FIRSTSET, PCRE_STARTLINE and
     PCRE_REQCHSET where they can be deduced.
  7. The remaining lines print the compiled data for debugging.
On failure *errorptr receives the message for the error code, *erroroffset
the offset in the pattern and, if errorcodeptr is not NULL, *errorcodeptr the
code. The return statements are not part of this listing.
NOTE(review): the back reference bitmap is built with (1 << refnum) for
refnum < 32; for refnum == 31 that shifts into the sign bit of an int. */
3842 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3843 const char **errorptr, int *erroroffset, const unsigned char *tables)
3846 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3847 int c, firstbyte, reqbyte;
3849 int branch_extra = 0;
3850 int branch_newextra;
3851 int item_count = -1;
3853 int max_name_size = 0;
3854 int lastitemlength = 0;
3860 BOOL inescq = FALSE;
3861 unsigned int brastackptr = 0;
3864 const uschar *codestart;
3866 compile_data compile_block;
3867 int brastack[BRASTACK_SIZE];
3868 uschar bralenstack[BRASTACK_SIZE];
3870 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3871 can do is just return NULL, but we can set a code value if there is a code
3874 if (errorptr == NULL)
3876 if (errorcodeptr != NULL) *errorcodeptr = 99;
3881 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3883 /* However, we can give a message for this error */
3885 if (erroroffset == NULL)
3888 goto PCRE_EARLY_ERROR_RETURN;
3893 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3896 utf8 = (options & PCRE_UTF8) != 0;
3897 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3898 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3901 goto PCRE_EARLY_ERROR_RETURN;
3904 if ((options & PCRE_UTF8) != 0)
3907 goto PCRE_EARLY_ERROR_RETURN;
3911 if ((options & ~PUBLIC_OPTIONS) != 0)
3914 goto PCRE_EARLY_ERROR_RETURN;
3917 /* Set up pointers to the individual character tables */
3919 if (tables == NULL) tables = _pcre_default_tables;
3920 compile_block.lcc = tables + lcc_offset;
3921 compile_block.fcc = tables + fcc_offset;
3922 compile_block.cbits = tables + cbits_offset;
3923 compile_block.ctypes = tables + ctypes_offset;
3925 /* Maximum back reference and backref bitmap. This is updated for numeric
3926 references during the first pass, but for named references during the actual
3927 compile pass. The bitmap records up to 31 back references to help in deciding
3928 whether (.*) can be treated as anchored or not. */
3930 compile_block.top_backref = 0;
3931 compile_block.backref_map = 0;
3933 /* Reflect pattern for debugging output */
3935 DPRINTF(("------------------------------------------------------------------\n"));
3936 DPRINTF(("%s\n", pattern));
3938 /* The first thing to do is to make a pass over the pattern to compute the
3939 amount of store required to hold the compiled code. This does not have to be
3940 perfect as long as errors are overestimates. At the same time we can detect any
3941 flag settings right at the start, and extract them. Make an attempt to correct
3942 for any counted white space if an "extended" flag setting appears late in the
3943 pattern. We can't be so clever for #-comments. */
/* ptr starts one byte before the pattern because the loop below increments
it before each read; the loop ends at the terminating zero byte. */
3945 ptr = (const uschar *)(pattern - 1);
3946 while ((c = *(++ptr)) != 0)
3953 /* If we are inside a \Q...\E sequence, all chars are literal */
3957 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3961 /* Otherwise, first check for ignored whitespace and comments */
3963 if ((options & PCRE_EXTENDED) != 0)
3965 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3968 /* The space before the ; is to avoid a warning on a silly compiler
3969 on the Macintosh. */
3970 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3976 item_count++; /* Is zero for the first non-comment item */
3978 /* Allow space for auto callout before every item except quantifiers. */
3980 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3981 c != '*' && c != '+' && c != '?' &&
3982 (c != '{' || !is_counted_repeat(ptr + 1)))
3983 length += 2 + 2*LINK_SIZE;
3987 /* A backslashed item may be an escaped data character or it may be a
3991 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
3992 if (errorcode != 0) goto PCRE_ERROR_RETURN;
3994 lastitemlength = 1; /* Default length of last item for repeats */
3996 if (c >= 0) /* Data character */
3998 length += 2; /* For a one-byte character */
4001 if (utf8 && c > 127)
4004 for (i = 0; i < _pcre_utf8_table1_size; i++)
4005 if (c <= _pcre_utf8_table1[i]) break;
4007 lastitemlength += i;
4014 /* If \Q, enter "literal" mode */
4022 /* \X is supported only if Unicode property support is compiled */
4028 goto PCRE_ERROR_RETURN;
4032 /* \P and \p are for Unicode properties, but only when the support has
4033 been compiled. Each item needs 2 bytes. */
4035 else if (-c == ESC_P || -c == ESC_p)
4041 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4045 goto PCRE_ERROR_RETURN;
4049 /* Other escapes need one byte */
4053 /* A back reference needs an additional 2 bytes, plus either one or 5
4054 bytes for a repeat. We also need to keep the value of the highest
4059 int refnum = -c - ESC_REF;
4060 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4061 if (refnum > compile_block.top_backref)
4062 compile_block.top_backref = refnum;
4063 length += 2; /* For single back reference */
4064 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4066 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4067 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4068 if ((min == 0 && (max == 1 || max == -1)) ||
4069 (min == 1 && max == -1))
4072 if (ptr[1] == '?') ptr++;
4077 case '^': /* Single-byte metacharacters */
4084 case '*': /* These repeats won't be after brackets; */
4085 case '+': /* those are handled separately */
4088 goto POSESSIVE; /* A few lines below */
4090 /* This covers the cases of braced repeats after a single char, metachar,
4091 class, or back reference. */
4094 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4095 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4096 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4098 /* These special cases just insert one extra opcode */
4100 if ((min == 0 && (max == 1 || max == -1)) ||
4101 (min == 1 && max == -1))
4104 /* These cases might insert additional copies of a preceding character. */
4110 length -= lastitemlength; /* Uncount the original char or metachar */
4111 if (min > 0) length += 3 + lastitemlength;
4113 length += lastitemlength + ((max > 0)? 3 : 1);
4116 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4118 POSESSIVE: /* Test for possessive quantifier */
4122 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4126 /* An alternation contains an offset to the next branch or ket. If any ims
4127 options changed in the previous branch(es), and/or if we are in a
4128 lookbehind assertion, extra space will be needed at the start of the
4129 branch. This is handled by branch_extra. */
4132 length += 1 + LINK_SIZE + branch_extra;
4135 /* A character class uses 33 characters provided that all the character
4136 values are less than 256. Otherwise, it uses a bit map for low valued
4137 characters, and individual items for others. Don't worry about character
4138 types that aren't allowed in classes - they'll get picked up during the
4139 compile. A character class that contains only one single-byte character
4140 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4141 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4144 if (*(++ptr) == '^')
4146 class_optcount = 10; /* Greater than one */
4149 else class_optcount = 0;
4155 /* Written as a "do" so that an initial ']' is taken as data */
4159 /* Inside \Q...\E everything is literal except \E */
4163 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4169 /* Outside \Q...\E, check for escapes */
4173 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4174 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4176 /* \b is backspace inside a class; \X is literal */
4178 if (-c == ESC_b) c = '\b';
4179 else if (-c == ESC_X) c = 'X';
4181 /* \Q enters quoting mode */
4183 else if (-c == ESC_Q)
4189 /* Handle escapes that turn into characters */
4191 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4193 /* Escapes that are meta-things. The normal ones just affect the
4194 bit map, but Unicode properties require an XCLASS extended item. */
4198 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4200 if (-c == ESC_p || -c == ESC_P)
4205 length += LINK_SIZE + 2;
4213 /* Check the syntax for POSIX stuff. The bits we actually handle are
4214 checked during the real compile phase. */
4216 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4219 class_optcount = 10; /* Make sure > 1 */
4222 /* Anything else increments the possible optimization count. We have to
4223 detect ranges here so that we can compute the number of extra ranges for
4224 caseless wide characters when UCP support is available. If there are wide
4225 characters, we are going to have to use an XCLASS, even for single
4238 GETCHARLEN(c, ptr, extra);
4246 /* Come here from handling \ above when it escapes to a char value */
4248 NON_SPECIAL_CHARACTER:
4254 uschar const *hyptr = ptr++;
4258 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4259 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4260 if (-d == ESC_b) d = '\b'; /* backspace */
4261 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4263 else if (ptr[1] != 0 && ptr[1] != ']')
4270 GETCHARLEN(d, ptr, extra);
4277 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4280 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4281 127 for caseless matching, we will need to use an XCLASS. */
4285 class_optcount = 10; /* Ensure > 1 */
4289 goto PCRE_ERROR_RETURN;
4293 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4296 if (!class_utf8) /* Allow for XCLASS overhead */
4299 length += LINK_SIZE + 2;
4303 /* If we have UCP support, find out how many extra ranges are
4304 needed to map the other case of characters within this range. We
4305 have to mimic the range optimization here, because extending the
4306 range upwards might push d over a boundary that makes it use
4307 another byte in the UTF-8 representation. */
4309 if ((options & PCRE_CASELESS) != 0)
4314 while (get_othercase_range(&cc, origd, &occ, &ocd))
4316 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4318 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4319 { /* if there is overlap, */
4320 c = occ; /* noting that if occ < c */
4321 continue; /* we can't have ocd > d */
4322 } /* because a subrange is */
4323 if (ocd > d && occ <= d + 1) /* always shorter than */
4324 { /* the basic range. */
4329 /* An extra item is needed */
4331 length += 1 + _pcre_ord2utf8(occ, buffer) +
4332 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4335 #endif /* SUPPORT_UCP */
4337 /* The length of the (possibly extended) range */
4339 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4341 #endif /* SUPPORT_UTF8 */
4345 /* We have a single character. There is nothing to be done unless we
4346 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4347 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4353 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4356 class_optcount = 10; /* Ensure > 1 */
4357 if (!class_utf8) /* Allow for XCLASS overhead */
4360 length += LINK_SIZE + 2;
4363 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4364 (1 + _pcre_ord2utf8(c, buffer));
4365 #else /* SUPPORT_UCP */
4366 length += 1 + _pcre_ord2utf8(c, buffer);
4367 #endif /* SUPPORT_UCP */
4369 #endif /* SUPPORT_UTF8 */
4373 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4375 if (*ptr == 0) /* Missing terminating ']' */
4378 goto PCRE_ERROR_RETURN;
4381 /* We can optimize when there was only one optimizable character. Repeats
4382 for positive and negated single one-byte chars are handled by the general
4383 code. Here, we handle repeats for the class opcodes. */
4385 if (class_optcount == 1) length += 3; else
4389 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4390 we also need extra for wrapping the whole thing in a sub-pattern. */
4392 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4394 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4395 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4396 if ((min == 0 && (max == 1 || max == -1)) ||
4397 (min == 1 && max == -1))
4403 length += 2 + 2*LINK_SIZE;
4405 else if (ptr[1] == '?') ptr++;
4410 /* Brackets may be genuine groups or special things */
4413 branch_newextra = 0;
4414 bracket_length = 1 + LINK_SIZE;
4416 /* Handle special forms of bracket, which all start (? */
4425 /* Skip over comments entirely */
4428 while (*ptr != 0 && *ptr != ')') ptr++;
4432 goto PCRE_ERROR_RETURN;
4436 /* Non-referencing groups and lookaheads just move the pointer on, and
4437 then behave like a non-special bracket, except that they don't increment
4438 the count of extracting brackets. Ditto for the "once only" bracket,
4439 which is in Perl from version 5.005. */
4448 /* (?R) specifies a recursive call to the regex, which is an extension
4449 to provide the facility which can be obtained by (?p{perl-code}) in
4450 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4452 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4453 the appropriate numbered brackets. This includes both recursive and
4454 non-recursive calls. (?R) is now synonymous with (?0). */
4459 case '0': case '1': case '2': case '3': case '4':
4460 case '5': case '6': case '7': case '8': case '9':
4463 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4467 goto PCRE_ERROR_RETURN;
4469 length += 1 + LINK_SIZE;
4471 /* If this item is quantified, it will get wrapped inside brackets so
4472 as to use the code for quantified brackets. We jump down and use the
4473 code that handles this for real brackets. */
4475 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4477 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4478 duplength = 5 + 3 * LINK_SIZE;
4479 goto HANDLE_QUANTIFIED_BRACKETS;
4483 /* (?C) is an extension which provides "callout" - to provide a bit of
4484 the functionality of the Perl (?{...}) feature. An optional number may
4485 follow (default is zero). */
4489 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4493 goto PCRE_ERROR_RETURN;
4495 length += 2 + 2*LINK_SIZE;
4498 /* Named subpatterns are an extension copied from Python */
4504 const uschar *p; /* Don't amalgamate; some compilers */
4505 p = ++ptr; /* grumble at autoincrement in declaration */
4506 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4510 goto PCRE_ERROR_RETURN;
4513 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4517 if (*ptr == '=' || *ptr == '>')
4519 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4523 goto PCRE_ERROR_RETURN;
4528 /* Unknown character after (?P */
4531 goto PCRE_ERROR_RETURN;
4533 /* Lookbehinds are in Perl from version 5.005 */
4537 if (*ptr == '=' || *ptr == '!')
4539 branch_newextra = 1 + LINK_SIZE;
4540 length += 1 + LINK_SIZE; /* For the first branch */
4544 goto PCRE_ERROR_RETURN;
4546 /* Conditionals are in Perl from version 5.005. The bracket must either
4547 be followed by a number (for bracket reference) or by an assertion
4548 group, or (a PCRE extension) by 'R' for a recursion test. */
4551 if (ptr[3] == 'R' && ptr[4] == ')')
4556 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4560 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4564 goto PCRE_ERROR_RETURN;
4567 else /* An assertion must follow */
4569 ptr++; /* Can treat like ':' as far as spacing is concerned */
4570 if (ptr[2] != '?' ||
4571 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4573 ptr += 2; /* To get right offset in message */
4575 goto PCRE_ERROR_RETURN;
4580 /* Else loop checking valid options until ) is met. Anything else is an
4581 error. If we are without any brackets, i.e. at top level, the settings
4582 act as if specified in the options, so massage the options immediately.
4583 This is for backward compatibility with Perl 5.004. */
4596 *optset |= PCRE_CASELESS;
4600 *optset |= PCRE_MULTILINE;
4604 *optset |= PCRE_DOTALL;
4608 *optset |= PCRE_EXTENDED;
4612 *optset |= PCRE_EXTRA;
4616 *optset |= PCRE_UNGREEDY;
4623 /* A termination by ')' indicates an options-setting-only item; if
4624 this is at the very start of the pattern (indicated by item_count
4625 being zero), we use it to set the global options. This is helpful
4626 when analyzing the pattern for first characters, etc. Otherwise
4627 nothing is done here and it is handled during the compiling
4630 [Historical note: Up to Perl 5.8, options settings at top level
4631 were always global settings, wherever they appeared in the pattern.
4632 That is, they were equivalent to an external setting. From 5.8
4633 onwards, they apply only to what follows (which is what you might
4637 if (item_count == 0)
4639 options = (options | set) & (~unset);
4640 set = unset = 0; /* To save length */
4641 item_count--; /* To allow for several */
4646 /* A termination by ':' indicates the start of a nested group with
4647 the given options set. This is again handled at compile time, but
4648 we must allow for compiled space if any of the ims options are
4649 set. We also have to allow for resetting space at the end of
4650 the group, which is why 4 is added to the length and not just 2.
4651 If there are several changes of options within the same group, this
4652 will lead to an over-estimate on the length, but this shouldn't
4653 matter very much. We also have to allow for resetting options at
4654 the start of any alternations, which we do by setting
4655 branch_newextra to 2. Finally, we record whether the case-dependent
4656 flag ever changes within the regex. This is used by the "required
4660 if (((set|unset) & PCRE_IMS) != 0)
4663 branch_newextra = 2;
4664 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4668 /* Unrecognized option character */
4672 goto PCRE_ERROR_RETURN;
4676 /* If we hit a closing bracket, that's it - this is a freestanding
4677 option-setting. We need to ensure that branch_extra is updated if
4678 necessary. The only values branch_newextra can have here are 0 or 2.
4679 If the value is 2, then branch_extra must either be 2 or 5, depending
4680 on whether this is a lookbehind group or not. */
4685 if (branch_newextra == 2 &&
4686 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4687 branch_extra += branch_newextra;
4691 /* If options were terminated by ':' control comes here. Fall through
4692 to handle the group below. */
4696 /* Extracting brackets must be counted so we can process escapes in a
4697 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4698 need an additional 3 bytes of store per extracting bracket. However, if
4699 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4700 must leave the count alone (it will always be zero). */
4702 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4705 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4708 /* Save length for computing whole length at end if there's a repeat that
4709 requires duplication of the group. Also save the current value of
4710 branch_extra, and start the new group with the new value. If non-zero, this
4711 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
/* Refuse to nest deeper than the fixed-size brastack array allows. */
4713 if (brastackptr >= sizeof(brastack)/sizeof(int))
4716 goto PCRE_ERROR_RETURN;
/* NOTE(review): bralenstack is an array of uschar, so branch_extra is
narrowed to one byte when saved here. The values described above are small;
confirm this still holds for every supported LINK_SIZE. */
4719 bralenstack[brastackptr] = branch_extra;
4720 branch_extra = branch_newextra;
4722 brastack[brastackptr++] = length;
4723 length += bracket_length;
4726 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4727 have to replicate this bracket up to that many times. If brastackptr is
4728 0 this is an unmatched bracket which will generate an error, but take care
4729 not to try to access brastack[-1] when computing the length and restoring
4730 the branch_extra value. */
4733 length += 1 + LINK_SIZE;
4734 if (brastackptr > 0)
4736 duplength = length - brastack[--brastackptr];
4737 branch_extra = bralenstack[brastackptr];
4741 /* The following code is also used when a recursion such as (?3) is
4742 followed by a quantifier, because in that case, it has to be wrapped inside
4743 brackets so that the quantifier works. The value of duplength must be
4744 set before arrival. */
4746 HANDLE_QUANTIFIED_BRACKETS:
4748 /* Leave ptr at the final char; for read_repeat_counts this happens
4749 automatically; for the others we need an increment. */
4751 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4753 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4754 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4756 else if (c == '*') { min = 0; max = -1; ptr++; }
4757 else if (c == '+') { min = 1; max = -1; ptr++; }
4758 else if (c == '?') { min = 0; max = 1; ptr++; }
4759 else { min = 1; max = 1; }
4761 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4762 group, and if the maximum is greater than zero, we have to replicate
4763 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4769 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4772 /* When the minimum is greater than zero, we have to replicate up to
4773 minval-1 times, with no additions required in the copies. Then, if there
4774 is a limited maximum we have to replicate up to maxval-1 times allowing
4775 for a BRAZERO item before each optional copy and nesting brackets for all
4776 but one of the optional copies. */
/* NOTE(review): the products of the repeat counts and duplength in this part
of the function are added to the int `length` with no overflow test visible
in this listing; nested quantified groups multiply duplength repeatedly.
Confirm that the repeat counts are bounded tightly enough elsewhere. */
4780 length += (min - 1) * duplength;
4781 if (max > min) /* Need this test as max=-1 means no limit */
4782 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4783 - (2 + 2*LINK_SIZE);
4786 /* Allow space for once brackets for "possessive quantifier" */
4791 length += 2 + 2*LINK_SIZE;
4795 /* Non-special character. It won't be space or # in extended mode, so it is
4796 always a genuine character. If we are in a \Q...\E sequence, check for the
4797 end; if not, we have a literal. */
4802 if (inescq && c == '\\' && ptr[1] == 'E')
4809 length += 2; /* For a one-byte character */
4810 lastitemlength = 1; /* Default length of last item for repeats */
4812 /* In UTF-8 mode, check for additional bytes. */
4815 if (utf8 && (c & 0xc0) == 0xc0)
4817 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4818 { /* because the end is marked */
4819 lastitemlength++; /* by a zero byte. */
4830 length += 2 + LINK_SIZE; /* For final KET and END */
4832 if ((options & PCRE_AUTO_CALLOUT) != 0)
4833 length += 2 + 2*LINK_SIZE; /* For final callout */
/* This is the only size test visible in the first pass; it runs once, after
the whole pattern has been measured.
NOTE(review): a length that had wrapped to a negative value would not be
caught by this comparison. */
4835 if (length > MAX_PATTERN_SIZE)
4838 goto PCRE_EARLY_ERROR_RETURN;
4841 /* Compute the size of data block needed and get it, either from malloc or
4842 externally provided function. */
/* The block holds the real_pcre header, then a name table of name_count
entries of (max_name_size + 3) bytes each, then the compiled code. */
4844 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4845 re = (real_pcre *)(pcre_malloc)(size);
4850 goto PCRE_EARLY_ERROR_RETURN;
4853 /* Put in the magic number, and save the sizes, options, and character table
4854 pointer. NULL is used for the default character tables. The nullpad field is at
4855 the end; it's there to help in the case when a regex compiled on a system with
4856 4-byte pointers is run on another with 8-byte pointers. */
4858 re->magic_number = MAGIC_NUMBER;
4860 re->options = options;
4862 re->name_table_offset = sizeof(real_pcre);
4863 re->name_entry_size = max_name_size + 3;
4864 re->name_count = name_count;
4866 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4869 /* The starting points of the name/number translation table and of the code are
4870 passed around in the compile data block. */
4872 compile_block.names_found = 0;
4873 compile_block.name_entry_size = max_name_size + 3;
4874 compile_block.name_table = (uschar *)re + re->name_table_offset;
4875 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4876 compile_block.start_code = codestart;
4877 compile_block.start_pattern = (const uschar *)pattern;
4878 compile_block.req_varyopt = 0;
4879 compile_block.nopartial = FALSE;
4881 /* Set up a starting, non-extracting bracket, then compile the expression. On
4882 error, errorcode will be set non-zero, so we don't need to look at the result
4883 of the function here. */
4885 ptr = (const uschar *)pattern;
4886 code = (uschar *)codestart;
4889 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4890 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4891 re->top_bracket = bracount;
4892 re->top_backref = compile_block.top_backref;
4894 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4896 /* If not reached end of pattern on success, there's an excess bracket. */
4898 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4900 /* Fill in the terminating state and check for disastrous overflow, but
4901 if debugging, leave the test till after things are printed out. */
4906 if (code - codestart > length) errorcode = ERR23;
4909 /* Give an error if there's back reference to a non-existent capturing
4912 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4914 /* Failed to compile, or error while post-processing */
4920 *erroroffset = ptr - (const uschar *)pattern;
4921 PCRE_EARLY_ERROR_RETURN:
4922 *errorptr = error_texts[errorcode];
4923 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4927 /* If the anchored option was not passed, set the flag if we can determine that
4928 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4929 as starting with .* when DOTALL is set).
4931 Otherwise, if we know what the first character has to be, save it, because that
4932 speeds up unanchored matches no end. If not, see if we can set the
4933 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4934 start with ^, and also when all branches start with .* for non-DOTALL matches.
4937 if ((options & PCRE_ANCHORED) == 0)
4939 int temp_options = options;
4940 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4941 re->options |= PCRE_ANCHORED;
4945 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4946 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4948 int ch = firstbyte & 255;
4949 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4950 compile_block.fcc[ch] == ch)? ch : firstbyte;
4951 re->options |= PCRE_FIRSTSET;
4953 else if (is_startline(codestart, 0, compile_block.backref_map))
4954 re->options |= PCRE_STARTLINE;
4958 /* For an anchored pattern, we use the "required byte" only if it follows a
4959 variable length item in the regex. Remove the caseless flag for non-caseable
4963 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4965 int ch = reqbyte & 255;
4966 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4967 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4968 re->options |= PCRE_REQCHSET;
4971 /* Print out the compiled data for debugging */
4975 printf("Length = %d top_bracket = %d top_backref = %d\n",
4976 length, re->top_bracket, re->top_backref);
4978 if (re->options != 0)
4980 printf("%s%s%s%s%s%s%s%s%s%s\n",
4981 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
4982 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4983 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4984 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4985 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4986 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4987 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4988 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4989 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4990 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4993 if ((re->options & PCRE_FIRSTSET) != 0)
4995 int ch = re->first_byte & 255;
4996 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4997 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4998 else printf("First char = \\x%02x%s\n", ch, caseless);
5001 if ((re->options & PCRE_REQCHSET) != 0)
5003 int ch = re->req_byte & 255;
5004 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5005 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5006 else printf("Req char = \\x%02x%s\n", ch, caseless);
5009 _pcre_printint(re, stdout);
5011 /* This check is done here in the debugging case so that the code that
5012 was compiled can be seen. */
5014 if (code - codestart > length)
5017 *errorptr = error_texts[ERR23];
5018 *erroroffset = ptr - (uschar *)pattern;
5019 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5027 /* End of pcre_compile.c */