1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.5 2007/06/26 11:16:54 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #define NLBLOCK cd /* Block containing newline information */
48 #define PSSTART start_pattern /* Field containing processed string start */
49 #define PSEND end_pattern /* Field containing processed string end */
52 #include "pcre_internal.h"
55 /* When DEBUG is defined, we need the pcre_printint() function, which is also
56 used by pcretest. DEBUG is not defined when building a production library. */
59 #include "pcre_printint.src"
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a, which holds 8 bits per element. Both arguments and the whole
expansion are parenthesized so that an expression argument such as
SETBIT(map, c + 1) is divided and reduced modulo 8 as a whole, and so that the
macro behaves as a single expression wherever it is used. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68 /*************************************************
69 * Code parameters and static tables *
70 *************************************************/
72 /* This value specifies the size of stack workspace that is used during the
73 first pre-compile phase that determines how much memory is required. The regex
74 is partly compiled into this space, but the compiled parts are discarded as
75 soon as they can be, so that hopefully there will never be an overrun. The code
76 does, however, check for an overrun. The largest amount I've seen used is 218,
77 so this number is very generous.
79 The same workspace is used during the second, actual compile phase for
80 remembering forward references to groups so that they can be filled in at the
81 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
82 is 4 there is plenty of room. */
84 #define COMPILE_WORK_SIZE (4096)
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid. */
92 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
93 static const short int escapes[] = {
94 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
95 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
96 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
97 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
98 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
99 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
100 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
101 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
102 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
103 0, 0, -ESC_z /* x - z */
106 #else /* This is the "abnormal" table for EBCDIC systems */
107 static const short int escapes[] = {
108 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
109 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
110 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
111 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
112 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
113 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
114 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
115 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
116 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
117 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
118 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
119 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
120 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
121 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
122 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
123 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
124 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
125 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
126 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
127 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
128 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
129 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
/* Names of the POSIX character classes. The length of each name is held, in
the same order, in the companion table posix_name_lengths, and that list is
terminated by a zero length entry. The first three names must be alpha, lower,
upper, as this is assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha",     /*  0 */
  "lower",     /*  1 */
  "upper",     /*  2 */
  "alnum",     /*  3 */
  "ascii",     /*  4 */
  "blank",     /*  5 */
  "cntrl",     /*  6 */
  "digit",     /*  7 */
  "graph",     /*  8 */
  "print",     /*  9 */
  "punct",     /* 10 */
  "space",     /* 11 */
  "word",      /* 12 */
  "xdigit"     /* 13 */
};
144 static const uschar posix_name_lengths[] = {
145 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
147 /* Table of class bit maps for each POSIX class. Each class is formed from a
148 base map, with an optional addition or removal of another map. Then, for some
149 classes, there is some additional tweaking: for [:blank:] the vertical space
150 characters are removed, and for [:alpha:] and [:alnum:] the underscore
151 character is removed. The triples in the table consist of the base map offset,
152 second map offset or -1 if no second map, and a non-negative value for map
153 addition or a negative value for map subtraction (if there are two maps). The
154 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
155 remove vertical space characters, 2 => remove underscore. */
157 static const int posix_class_maps[] = {
158 cbit_word, cbit_digit, -2, /* alpha */
159 cbit_lower, -1, 0, /* lower */
160 cbit_upper, -1, 0, /* upper */
161 cbit_word, -1, 2, /* alnum - word without underscore */
162 cbit_print, cbit_cntrl, 0, /* ascii */
163 cbit_space, -1, 1, /* blank - a GNU extension */
164 cbit_cntrl, -1, 0, /* cntrl */
165 cbit_digit, -1, 0, /* digit */
166 cbit_graph, -1, 0, /* graph */
167 cbit_print, -1, 0, /* print */
168 cbit_punct, -1, 0, /* punct */
169 cbit_space, -1, 0, /* space */
170 cbit_word, -1, 0, /* word - a Perl extension */
171 cbit_xdigit,-1, 0 /* xdigit */
/* Two-level stringification. STRING() turns its argument into a string literal
exactly as written. XSTRING() lets the preprocessor macro-expand its argument
first and then stringifies the result; the error message table uses it to embed
the values of MAX_NAME_SIZE and MAX_NAME_COUNT in message texts. */

#define STRING(text)    #text
#define XSTRING(value)  STRING(value)
178 /* The texts of compile-time error messages. These are "char *" because they
179 are passed to the outside world. Do not ever re-use any error number, because
180 they are documented. Always add a new error instead. Messages marked DEAD below
181 are no longer used. */
183 static const char *error_texts[] = {
185 "\\ at end of pattern",
186 "\\c at end of pattern",
187 "unrecognized character follows \\",
188 "numbers out of order in {} quantifier",
190 "number too big in {} quantifier",
191 "missing terminating ] for character class",
192 "invalid escape sequence in character class",
193 "range out of order in character class",
196 "operand of unlimited repeat could match the empty string", /** DEAD **/
197 "internal error: unexpected repeat",
198 "unrecognized character after (?",
199 "POSIX named classes are supported only within a class",
202 "reference to non-existent subpattern",
203 "erroffset passed as NULL",
204 "unknown option bit(s) set",
205 "missing ) after comment",
206 "parentheses nested too deeply", /** DEAD **/
208 "regular expression too large",
209 "failed to get memory",
210 "unmatched parentheses",
211 "internal error: code overflow",
212 "unrecognized character after (?<",
214 "lookbehind assertion is not fixed length",
215 "malformed number or name after (?(",
216 "conditional group contains more than two branches",
217 "assertion expected after (?(",
218 "(?R or (?[+-]digits must be followed by )",
220 "unknown POSIX class name",
221 "POSIX collating elements are not supported",
222 "this version of PCRE is not compiled with PCRE_UTF8 support",
223 "spare error", /** DEAD **/
224 "character value in \\x{...} sequence is too large",
226 "invalid condition (?(0)",
227 "\\C not allowed in lookbehind assertion",
228 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
229 "number after (?C is > 255",
230 "closing ) for (?C expected",
232 "recursive call could loop indefinitely",
233 "unrecognized character after (?P",
234 "syntax error in subpattern name (missing terminator)",
235 "two named subpatterns have the same name",
236 "invalid UTF-8 string",
238 "support for \\P, \\p, and \\X has not been compiled",
239 "malformed \\P or \\p sequence",
240 "unknown property name after \\P or \\p",
241 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
242 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
244 "repeated subpattern is too long",
245 "octal value is greater than \\377 (not in UTF-8 mode)",
246 "internal error: overran compiling workspace",
247 "internal error: previously-checked referenced subpattern not found",
248 "DEFINE group contains more than one branch",
250 "repeating a DEFINE group is not allowed",
251 "inconsistent NEWLINE options",
252 "\\g is not followed by a braced name or an optionally braced non-zero number",
253 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */
273 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
274 static const unsigned char digitab[] =
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
282 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
283 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
284 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
288 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
309 #else /* This is the "abnormal" case, for EBCDIC systems */
310 static const unsigned char digitab[] =
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
328 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
342 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
343 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
/* EBCDIC character-type table: one flag byte for each of the 256 character
codes, a partial duplicate of the chartable entries. check_escape() masks an
entry with 0x0E to decide whether the character after a backslash is
alphameric. NOTE(review): the individual flag bits are presumably the ctype_xxx
values from pcre_internal.h - confirm there. Each row covers eight codes; the
leading comment is the code of the row's first entry in hexadecimal, and the
trailing comment, where present, names characters at the ends of the row. */

static const unsigned char ebcdic_chartab[] = {
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* space - 71 */
  /* 48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,  /*  72 - |   */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /*  &  - 87  */
  /* 58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /*  -  - 103 */
  /* 68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,  /* 104 - ?   */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* 120 - "   */
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  /* 128 - g   */
  /* 88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  h  - 143 */
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  /* 144 - p   */
  /* 98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  q  - 159 */
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  /* 160 - x   */
  /* A8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  y  - 175 */
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /*  ^  - 183 */
  /* B8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  /*  {  - G   */
  /* C8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  H  - 207 */
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  /*  }  - P   */
  /* D8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  Q  - 223 */
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  /*  \  - X   */
  /* E8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /*  Y  - 239 */
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,  /*  0  - 7   */
  /* F8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00   /*  8  - 255 */
};
381 /* Definition to allow mutual recursion */
384 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
385 int *, int *, branch_chain *, compile_data *, int *);
389 /*************************************************
391 *************************************************/
/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. A backreference to group
n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
ptr is pointing at the \. On exit, it is on the final character of the escape
sequence.

Arguments:
  ptrptr         points to the pattern position pointer
  errorcodeptr   points to the errorcode variable
  bracount       number of previous extracting brackets
  options        the options bits
  isclass        TRUE if inside a character class

Returns:         zero or positive => a data character
                 negative => a special escape sequence
                 on error, errorcodeptr is set
*/
414 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
415 int options, BOOL isclass)
417 BOOL utf8 = (options & PCRE_UTF8) != 0;
418 const uschar *ptr = *ptrptr + 1;
421 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
422 ptr--; /* Set pointer back to the last byte */
424 /* If backslash is at the end of the pattern, it's an error. */
426 if (c == 0) *errorcodeptr = ERR1;
428 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
429 a table. A non-zero result is something that can be returned immediately.
430 Otherwise further processing may be required. */
432 #ifndef EBCDIC /* ASCII coding */
433 else if (c < '0' || c > 'z') {} /* Not alphameric */
434 else if ((i = escapes[c - '0']) != 0) c = i;
436 #else /* EBCDIC coding */
437 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
438 else if ((i = escapes[c - 0x48]) != 0) c = i;
441 /* Escapes that need further processing, or are illegal. */
445 const uschar *oldptr;
446 BOOL braced, negated;
450 /* A number of Perl escapes are not handled by PCRE. We give an explicit
458 *errorcodeptr = ERR37;
461 /* \g must be followed by a number, either plain or braced. If positive, it
462 is an absolute backreference. If negative, it is a relative backreference.
463 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
464 reference to a named group. This is part of Perl's movement towards a
465 unified syntax for back references. As this is synonymous with \k{name}, we
466 fudge it up by pretending it really was \k. */
472 for (p = ptr+2; *p != 0 && *p != '}'; p++)
473 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
474 if (*p != 0 && *p != '}')
489 else negated = FALSE;
492 while ((digitab[ptr[1]] & ctype_digit) != 0)
493 c = c * 10 + *(++ptr) - '0';
495 if (c == 0 || (braced && *(++ptr) != '}'))
497 *errorcodeptr = ERR57;
505 *errorcodeptr = ERR15;
508 c = bracount - (c - 1);
514 /* The handling of escape sequences consisting of a string of digits
515 starting with one that is not zero is not straightforward. By experiment,
516 the way Perl works seems to be as follows:
518 Outside a character class, the digits are read as a decimal number. If the
519 number is less than 10, or if there are that many previous extracting
520 left brackets, then it is a back reference. Otherwise, up to three octal
521 digits are read to form an escaped byte. Thus \123 is likely to be octal
522 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
523 value is greater than 377, the least significant 8 bits are taken. Inside a
524 character class, \ followed by a digit is always an octal number. */
526 case '1': case '2': case '3': case '4': case '5':
527 case '6': case '7': case '8': case '9':
533 while ((digitab[ptr[1]] & ctype_digit) != 0)
534 c = c * 10 + *(++ptr) - '0';
535 if (c < 10 || c <= bracount)
540 ptr = oldptr; /* Put the pointer back and fall through */
543 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
544 generates a binary zero byte and treats the digit as a following literal.
545 Thus we have to pull back the pointer by one. */
547 if ((c = *ptr) >= '8')
554 /* \0 always starts an octal number, but we may drop through to here with a
555 larger first octal digit. The original code used just to take the least
556 significant 8 bits of octal numbers (I think this is what early Perls used
557 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
558 than 3 octal digits. */
562 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
563 c = c * 8 + *(++ptr) - '0';
564 if (!utf8 && c > 255) *errorcodeptr = ERR51;
567 /* \x is complicated. \x{ddd} is a character number which can be greater
568 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
569 treated as a data character. */
574 const uschar *pt = ptr + 2;
578 while ((digitab[*pt] & ctype_xdigit) != 0)
580 register int cc = *pt++;
581 if (c == 0 && cc == '0') continue; /* Leading zeroes */
584 #ifndef EBCDIC /* ASCII coding */
585 if (cc >= 'a') cc -= 32; /* Convert to upper case */
586 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
587 #else /* EBCDIC coding */
588 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
589 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
595 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
600 /* If the sequence of hex digits does not end with '}', then we don't
601 recognize this construct; fall through to the normal \x handling. */
604 /* Read just a single-byte hex-defined char */
607 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
609 int cc; /* Some compilers don't like ++ */
610 cc = *(++ptr); /* in initializers */
611 #ifndef EBCDIC /* ASCII coding */
612 if (cc >= 'a') cc -= 32; /* Convert to upper case */
613 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
614 #else /* EBCDIC coding */
615 if (cc <= 'z') cc += 64; /* Convert to upper case */
616 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
621 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
622 This coding is ASCII-specific, but then the whole concept of \cx is
623 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
629 *errorcodeptr = ERR2;
633 #ifndef EBCDIC /* ASCII coding */
634 if (c >= 'a' && c <= 'z') c -= 32;
636 #else /* EBCDIC coding */
637 if (c >= 'a' && c <= 'z') c += 64;
642 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
643 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
644 for Perl compatibility, it is a literal. This code looks a bit odd, but
645 there used to be some cases other than the default, and there may be again
646 in future, so I haven't "optimized" it. */
649 if ((options & PCRE_EXTRA) != 0) switch(c)
652 *errorcodeptr = ERR3;
666 /*************************************************
668 *************************************************/
/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/
685 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
688 const uschar *ptr = *ptrptr;
692 if (c == 0) goto ERROR_RETURN;
696 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
706 for (i = 0; i < sizeof(name) - 1; i++)
709 if (c == 0) goto ERROR_RETURN;
713 if (c !='}') goto ERROR_RETURN;
717 /* Otherwise there is just one following character */
727 /* Search for a recognized property name using binary chop */
730 top = _pcre_utt_size;
734 i = (bot + top) >> 1;
735 c = strcmp(name, _pcre_utt[i].name);
738 *dptr = _pcre_utt[i].value;
739 return _pcre_utt[i].type;
741 if (c > 0) bot = i + 1; else top = i;
744 *errorcodeptr = ERR47;
749 *errorcodeptr = ERR46;
758 /*************************************************
759 * Check for counted repeat *
760 *************************************************/
/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead to see if it really is a quantifier or not.
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
where the ddds are digits.

Argument:
  p         pointer to the first char after '{'

Returns:    TRUE or FALSE
*/
774 is_counted_repeat(const uschar *p)
776 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
777 while ((digitab[*p] & ctype_digit) != 0) p++;
778 if (*p == '}') return TRUE;
780 if (*p++ != ',') return FALSE;
781 if (*p == '}') return TRUE;
783 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
784 while ((digitab[*p] & ctype_digit) != 0) p++;
791 /*************************************************
792 * Read repeat counts *
793 *************************************************/
/* Read an item of the form {n,m} and return the values. This is called only
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
so the syntax is guaranteed to be correct, but we need to check the values.

Arguments:
  p              pointer to first char after '{'
  minp           pointer to int for min
  maxp           pointer to int for max
                 returned as -1 if no max
  errorcodeptr   points to error code variable

Returns:         pointer to '}' on success;
                 current ptr on error, with errorcodeptr set non-zero
*/
810 static const uschar *
811 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
816 /* Read the minimum value and do a paranoid check: a negative value indicates
817 an integer overflow. */
819 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
820 if (min < 0 || min > 65535)
822 *errorcodeptr = ERR5;
826 /* Read the maximum value if there is one, and again do a paranoid on its size.
827 Also, max must not be less than min. */
829 if (*p == '}') max = min; else
834 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
835 if (max < 0 || max > 65535)
837 *errorcodeptr = ERR5;
842 *errorcodeptr = ERR4;
848 /* Fill in the required variables, and pass back the pointer to the terminating
858 /*************************************************
859 * Find forward referenced subpattern *
860 *************************************************/
/* This function scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
returns when it reaches a given numbered subpattern. This is used for forward
references to subpatterns. We know that if (?P< is encountered, the name will
be terminated by '>' because that is checked in the first pass.

Arguments:
  ptr          current position in the pattern
  count        current count of capturing parens so far encountered
  name         name to seek, or NULL if seeking a numbered subpattern
  lorn         name length, or subpattern number if name is NULL
  xmode        TRUE if we are in /x mode

Returns:       the number of the named subpattern, or -1 if not found
*/
880 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
883 const uschar *thisname;
885 for (; *ptr != 0; ptr++)
889 /* Skip over backslashed characters and also entire \Q...\E */
893 if (*(++ptr) == 0) return -1;
894 if (*ptr == 'Q') for (;;)
896 while (*(++ptr) != 0 && *ptr != '\\');
897 if (*ptr == 0) return -1;
898 if (*(++ptr) == 'E') break;
903 /* Skip over character classes */
907 while (*(++ptr) != ']')
911 if (*(++ptr) == 0) return -1;
912 if (*ptr == 'Q') for (;;)
914 while (*(++ptr) != 0 && *ptr != '\\');
915 if (*ptr == 0) return -1;
916 if (*(++ptr) == 'E') break;
924 /* Skip comments in /x mode */
926 if (xmode && *ptr == '#')
928 while (*(++ptr) != 0 && *ptr != '\n');
929 if (*ptr == 0) return -1;
933 /* An opening parens must now be a real metacharacter */
935 if (*ptr != '(') continue;
939 if (name == NULL && count == lorn) return count;
944 if (*ptr == 'P') ptr++; /* Allow optional P */
946 /* We have to disambiguate (?<! and (?<= from (?<name> */
948 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
954 if (name == NULL && count == lorn) return count;
956 if (term == '<') term = '>';
958 while (*ptr != term) ptr++;
959 if (name != NULL && lorn == ptr - thisname &&
960 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969 /*************************************************
970 * Find first significant op code *
971 *************************************************/
/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring op code etc. It skips over things
that do not influence this. For some calls, a change of option is important.
For some calls, it makes sense to skip negative forward and all backward
assertions, and also the \b assertion; for others it does not.

Arguments:
  code         pointer to the start of the group
  options      pointer to external options
  optbit       the option bit whose changing is significant, or
                 zero if none are
  skipassert   TRUE if certain assertions are to be skipped

Returns:       pointer to the first significant opcode
*/
990 first_significant_code(const uschar *code, int *options, int optbit,
998 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
999 *options = (int)code[1];
1005 case OP_ASSERTBACK_NOT:
1006 if (!skipassert) return code;
1007 do code += GET(code, 1); while (*code == OP_ALT);
1008 code += _pcre_OP_lengths[*code];
1011 case OP_WORD_BOUNDARY:
1012 case OP_NOT_WORD_BOUNDARY:
1013 if (!skipassert) return code;
1020 code += _pcre_OP_lengths[*code];
1027 /* Control never reaches here */
1033 /*************************************************
1034 * Find the fixed length of a pattern *
1035 *************************************************/
/* Scan a pattern and compute the fixed length of subject that will match it,
if the length is fixed. This is needed for dealing with backward assertions.
In UTF8 mode, the result is in characters rather than bytes.

Arguments:
  code     points to the start of the pattern (the bracket)
  options  the compiling options

Returns:   the fixed length, or -1 if there is no fixed length,
             or -2 if \C was encountered
*/
1050 find_fixedlength(uschar *code, int options)
1054 register int branchlength = 0;
1055 register uschar *cc = code + 1 + LINK_SIZE;
1057 /* Scan along the opcodes for this branch. If we get to the end of the
1058 branch, check the length against that of the other branches. */
1063 register int op = *cc;
1071 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1072 if (d < 0) return d;
1074 do cc += GET(cc, 1); while (*cc == OP_ALT);
1075 cc += 1 + LINK_SIZE;
1078 /* Reached end of a branch; if it's a ket it is the end of a nested
1079 call. If it's ALT it is an alternation in a nested call. If it is
1080 END it's the end of the outer call. All can be handled by the same code. */
1087 if (length < 0) length = branchlength;
1088 else if (length != branchlength) return -1;
1089 if (*cc != OP_ALT) return length;
1090 cc += 1 + LINK_SIZE;
1094 /* Skip over assertive subpatterns */
1099 case OP_ASSERTBACK_NOT:
1100 do cc += GET(cc, 1); while (*cc == OP_ALT);
1103 /* Skip over things that don't match chars */
1117 case OP_NOT_WORD_BOUNDARY:
1118 case OP_WORD_BOUNDARY:
1119 cc += _pcre_OP_lengths[*cc];
1122 /* Handle literal characters */
1130 if ((options & PCRE_UTF8) != 0)
1132 while ((*cc & 0xc0) == 0x80) cc++;
1137 /* Handle exact repetitions. The count is already in characters, but we
1138 need to skip over a multibyte character in UTF8 mode. */
1141 branchlength += GET2(cc,1);
1144 if ((options & PCRE_UTF8) != 0)
1146 while((*cc & 0x80) == 0x80) cc++;
1152 branchlength += GET2(cc,1);
1156 /* Handle single-char matchers */
1165 case OP_NOT_WHITESPACE:
1167 case OP_NOT_WORDCHAR:
1174 /* The single-byte matcher isn't allowed */
1179 /* Check a class for variable quantification */
1183 cc += GET(cc, 1) - 33;
1201 if (GET2(cc,1) != GET2(cc,3)) return -1;
1202 branchlength += GET2(cc,1);
1211 /* Anything else is variable length */
1217 /* Control never gets here */
1223 /*************************************************
1224 * Scan compiled regex for numbered bracket *
1225 *************************************************/
1227 /* This little function scans through a compiled pattern until it finds a
1228 capturing bracket with the given number.
1231 code points to start of expression
1232 utf8 TRUE in UTF-8 mode
1233 number the required bracket number
1235 Returns: pointer to the opcode for the bracket, or NULL if not found
1238 static const uschar *
1239 find_bracket(const uschar *code, BOOL utf8, int number)
1243 register int c = *code;
1244 if (c == OP_END) return NULL;
1246 /* XCLASS is used for classes that cannot be represented just by a bit
1247 map. This includes negated single high-valued characters. The length in
1248 the table is zero; the actual length is stored in the compiled code. */
1250 if (c == OP_XCLASS) code += GET(code, 1);
1252 /* Handle capturing bracket */
1254 else if (c == OP_CBRA)
1256 int n = GET2(code, 1+LINK_SIZE);
1257 if (n == number) return (uschar *)code;
1258 code += _pcre_OP_lengths[c];
1261 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1262 a multi-byte character. The length in the table is a minimum, so we have to
1263 arrange to skip the extra bytes. */
1267 code += _pcre_OP_lengths[c];
1286 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1296 /*************************************************
1297 * Scan compiled regex for recursion reference *
1298 *************************************************/
1300 /* This little function scans through a compiled pattern until it finds an
1301 instance of OP_RECURSE.
1304 code points to start of expression
1305 utf8 TRUE in UTF-8 mode
1307 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1310 static const uschar *
1311 find_recurse(const uschar *code, BOOL utf8)
1315 register int c = *code;
1316 if (c == OP_END) return NULL;
1317 if (c == OP_RECURSE) return code;
1319 /* XCLASS is used for classes that cannot be represented just by a bit
1320 map. This includes negated single high-valued characters. The length in
1321 the table is zero; the actual length is stored in the compiled code. */
1323 if (c == OP_XCLASS) code += GET(code, 1);
1325 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1326 that are followed by a character may be followed by a multi-byte character.
1327 The length in the table is a minimum, so we have to arrange to skip the extra
1332 code += _pcre_OP_lengths[c];
1351 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1361 /*************************************************
1362 * Scan compiled branch for non-emptiness *
1363 *************************************************/
1365 /* This function scans through a branch of a compiled pattern to see whether it
1366 can match the empty string or not. It is called from could_be_empty()
1367 below and from compile_branch() when checking for an unlimited repeat of a
1368 group that can match nothing. Note that first_significant_code() skips over
1369 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1370 struck an inner bracket whose current branch will already have been scanned.
1373 code points to start of search
1374 endcode points to where to stop
1375 utf8 TRUE if in UTF8 mode
1377 Returns: TRUE if what is matched could be empty
1381 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1384 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1386 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1388 const uschar *ccode;
1392 /* Groups with zero repeats can of course be empty; skip them. */
1394 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1396 code += _pcre_OP_lengths[c];
1397 do code += GET(code, 1); while (*code == OP_ALT);
1402 /* For other groups, scan the branches. */
1404 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1407 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1409 /* Scan a closed bracket */
1411 empty_branch = FALSE;
1414 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1415 empty_branch = TRUE;
1416 code += GET(code, 1);
1418 while (*code == OP_ALT);
1419 if (!empty_branch) return FALSE; /* All branches are non-empty */
1424 /* Handle the other opcodes */
1428 /* Check for quantifiers after a class */
1432 ccode = code + GET(code, 1);
1433 goto CHECK_CLASS_REPEAT;
1446 case OP_CRSTAR: /* These could be empty; continue */
1452 default: /* Non-repeat => class must match */
1453 case OP_CRPLUS: /* These repeats aren't empty */
1459 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1464 /* Opcodes that must match a character */
1471 case OP_NOT_WHITESPACE:
1473 case OP_NOT_WORDCHAR:
1489 case OP_TYPEMINPLUS:
1490 case OP_TYPEPOSPLUS:
1502 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1503 MINUPTO, and POSUPTO may be followed by a multibyte character */
1515 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1526 /*************************************************
1527 * Scan compiled regex for non-emptiness *
1528 *************************************************/
1530 /* This function is called to check for left recursive calls. We want to check
1531 the current branch of the current pattern to see if it could match the empty
1532 string. If it could, we must look outwards for branches at other levels,
1533 stopping when we pass beyond the bracket which is the subject of the recursion.
1536 code points to start of the recursion
1537 endcode points to where to stop (current RECURSE item)
1538 bcptr points to the chain of current (unclosed) branch starts
1539 utf8 TRUE if in UTF-8 mode
1541 Returns: TRUE if what is matched could be empty
1545 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1548 while (bcptr != NULL && bcptr->current >= code)
1550 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1551 bcptr = bcptr->outer;
1558 /*************************************************
1559 * Check for POSIX class syntax *
1560 *************************************************/
1562 /* This function is called when the sequence "[:" or "[." or "[=" is
1563 encountered in a character class. It checks whether this is followed by an
1564 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1568 ptr pointer to the initial [
1569 endptr where to return the end pointer
1570 cd pointer to compile data
1572 Returns: TRUE or FALSE
1576 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1578 int terminator; /* Don't combine these lines; the Solaris cc */
1579 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1580 if (*(++ptr) == '^') ptr++;
1581 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1582 if (*ptr == terminator && ptr[1] == ']')
1593 /*************************************************
1594 * Check POSIX class name *
1595 *************************************************/
1597 /* This function is called to check the name given in a POSIX-style class entry
1601 ptr points to the first letter
1602 len the length of the name
1604 Returns: a value representing the name, or -1 if unknown
1608 check_posix_name(const uschar *ptr, int len)
1610 register int yield = 0;
1611 while (posix_name_lengths[yield] != 0)
1613 if (len == posix_name_lengths[yield] &&
1614 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1621 /*************************************************
1622 * Adjust OP_RECURSE items in repeated group *
1623 *************************************************/
1625 /* OP_RECURSE items contain an offset from the start of the regex to the group
1626 that is referenced. This means that groups can be replicated for fixed
1627 repetition simply by copying (because the recursion is allowed to refer to
1628 earlier groups that are outside the current group). However, when a group is
1629 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1630 it, after it has been compiled. This means that any OP_RECURSE items within it
1631 that refer to the group itself or any contained groups have to have their
1632 offsets adjusted. That is one of the jobs of this function. Before it is called,
1633 the partially compiled regex must be temporarily terminated with OP_END.
1635 This function has been extended with the possibility of forward references for
1636 recursions and subroutine calls. It must also check the list of such references
1637 for the group we are dealing with. If it finds that one of the recursions in
1638 the current group is on this list, it adjusts the offset in the list, not the
1639 value in the reference (which is a group number).
1642 group points to the start of the group
1643 adjust the amount by which the group is to be moved
1644 utf8 TRUE in UTF-8 mode
1645 cd contains pointers to tables etc.
1646 save_hwm the hwm forward reference pointer at the start of the group
1652 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1655 uschar *ptr = group;
1656 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1661 /* See if this recursion is on the forward reference list. If so, adjust the
1664 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1666 offset = GET(hc, 0);
1667 if (cd->start_code + offset == ptr + 1)
1669 PUT(hc, 0, offset + adjust);
1674 /* Otherwise, adjust the recursion offset if it's after the start of this
1679 offset = GET(ptr, 1);
1680 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1683 ptr += 1 + LINK_SIZE;
1689 /*************************************************
1690 * Insert an automatic callout point *
1691 *************************************************/
1693 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1694 callout points before each pattern item.
1697 code current code pointer
1698 ptr current pattern pointer
1699 cd pointers to tables etc
1701 Returns: new code pointer
1705 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1707 *code++ = OP_CALLOUT;
1709 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1710 PUT(code, LINK_SIZE, 0); /* Default length */
1711 return code + 2*LINK_SIZE;
1716 /*************************************************
1717 * Complete a callout item *
1718 *************************************************/
1720 /* A callout item contains the length of the next item in the pattern, which
1721 we can't fill in till after we have reached the relevant point. This is used
1722 for both automatic and manual callouts.
1725 previous_callout points to previous callout item
1726 ptr current pattern pointer
1727 cd pointers to tables etc
1733 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1735 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1736 PUT(previous_callout, 2 + LINK_SIZE, length);
1742 /*************************************************
1743 * Get othercase range *
1744 *************************************************/
1746 /* This function is passed the start and end of a class range, in UTF-8 mode
1747 with UCP support. It searches up the characters, looking for internal ranges of
1748 characters in the "other" case. Each call returns the next one, updating the
1752 cptr points to starting character value; updated
1754 ocptr where to put start of othercase range
1755 odptr where to put end of othercase range
1757 Yield: TRUE when range returned; FALSE when no more
1761 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1762 unsigned int *odptr)
1764 unsigned int c, othercase, next;
1766 for (c = *cptr; c <= d; c++)
1767 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1769 if (c > d) return FALSE;
1772 next = othercase + 1;
1774 for (++c; c <= d; c++)
1776 if (_pcre_ucp_othercase(c) != next) break;
1785 #endif /* SUPPORT_UCP */
1789 /*************************************************
1790 * Check if auto-possessifying is possible *
1791 *************************************************/
1793 /* This function is called for unlimited repeats of certain items, to see
1794 whether the next thing could possibly match the repeated item. If not, it makes
1795 sense to automatically possessify the repeated item.
1798 op_code the repeated op code
1799 this data for this item, depends on the opcode
1800 utf8 TRUE in UTF-8 mode
1801 utf8_char used for utf8 character bytes, NULL if not relevant
1802 ptr next character in pattern
1803 options options bits
1804 cd contains pointers to tables etc.
1806 Returns: TRUE if possessifying is wanted
1810 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1811 const uschar *ptr, int options, compile_data *cd)
1815 /* Skip whitespace and comments in extended mode */
1817 if ((options & PCRE_EXTENDED) != 0)
1821 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1824 while (*(++ptr) != 0)
1825 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1831 /* If the next item is one that we can handle, get its value. A non-negative
1832 value is a character, a negative value is an escape value. */
1836 int temperrorcode = 0;
1837 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1838 if (temperrorcode != 0) return FALSE;
1839 ptr++; /* Point after the escape sequence */
1842 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1845 if (utf8) { GETCHARINC(next, ptr); } else
1852 /* Skip whitespace and comments in extended mode */
1854 if ((options & PCRE_EXTENDED) != 0)
1858 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1861 while (*(++ptr) != 0)
1862 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1868 /* If the next thing is itself optional, we have to give up. */
1870 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1873 /* Now compare the next item with the previous opcode. If the previous is a
1874 positive single character match, "item" either contains the character or, if
1875 "item" is greater than 127 in utf8 mode, the character's bytes are in
1879 /* Handle cases when the next item is a character. */
1881 if (next >= 0) switch(op_code)
1885 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1887 return item != next;
1889 /* For CHARNC (caseless character) we must check the other case. If we have
1890 Unicode property support, we can use it to test the other case of
1891 high-valued characters. */
1895 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1897 if (item == next) return FALSE;
1901 unsigned int othercase;
1902 if (next < 128) othercase = cd->fcc[next]; else
1904 othercase = _pcre_ucp_othercase((unsigned int)next);
1906 othercase = NOTACHAR;
1908 return (unsigned int)item != othercase;
1911 #endif /* SUPPORT_UTF8 */
1912 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1914 /* For OP_NOT, "item" must be a single-byte character. */
1917 if (next < 0) return FALSE; /* Not a character */
1918 if (item == next) return TRUE;
1919 if ((options & PCRE_CASELESS) == 0) return FALSE;
1923 unsigned int othercase;
1924 if (next < 128) othercase = cd->fcc[next]; else
1926 othercase = _pcre_ucp_othercase(next);
1928 othercase = NOTACHAR;
1930 return (unsigned int)item == othercase;
1933 #endif /* SUPPORT_UTF8 */
1934 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1937 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1940 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1943 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1945 case OP_NOT_WHITESPACE:
1946 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1949 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1951 case OP_NOT_WORDCHAR:
1952 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1977 return op_code != OP_HSPACE;
1979 return op_code == OP_HSPACE;
1993 return op_code != OP_VSPACE;
1995 return op_code == OP_VSPACE;
2003 /* Handle the case when the next item is \d, \s, etc. */
2010 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2015 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2018 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2021 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2024 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2027 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2030 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2055 return -next != ESC_h;
2057 return -next == ESC_h;
2071 return -next != ESC_v;
2073 return -next == ESC_v;
2081 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2082 next == -ESC_h || next == -ESC_v;
2085 return next == -ESC_d;
2088 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2090 case OP_NOT_WHITESPACE:
2091 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2094 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2097 return next == -ESC_h;
2099 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2101 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2104 return next == -ESC_v;
2107 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2109 case OP_NOT_WORDCHAR:
2110 return next == -ESC_w || next == -ESC_d;
2116 /* Control does not reach here */
2121 /*************************************************
2122 * Compile one branch *
2123 *************************************************/
2125 /* Scan the pattern, compiling it into a vector. If the options are
2126 changed during the branch, the pointer is used to change the external options
2127 bits. This function is used during the pre-compile phase when we are trying
2128 to find out the amount of memory needed, as well as during the real compile
2129 phase. The value of lengthptr distinguishes the two phases.
2132 optionsptr pointer to the option bits
2133 codeptr points to the pointer to the current code point
2134 ptrptr points to the current pattern pointer
2135 errorcodeptr points to error code variable
2136 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2137 reqbyteptr set to the last literal character required, else < 0
2138 bcptr points to current branch chain
2139 cd contains pointers to tables etc.
2140 lengthptr NULL during the real compile phase
2141 points to length accumulator during pre-compile phase
2143 Returns: TRUE on success
2144 FALSE, with *errorcodeptr set non-zero on error
2148 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2149 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2150 compile_data *cd, int *lengthptr)
2152 int repeat_type, op_type;
2153 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2155 int greedy_default, greedy_non_default;
2156 int firstbyte, reqbyte;
2157 int zeroreqbyte, zerofirstbyte;
2158 int req_caseopt, reqvary, tempreqvary;
2159 int options = *optionsptr;
2160 int after_manual_callout = 0;
2161 int length_prevgroup = 0;
2163 register uschar *code = *codeptr;
2164 uschar *last_code = code;
2165 uschar *orig_code = code;
2167 BOOL inescq = FALSE;
2168 BOOL groupsetfirstbyte = FALSE;
2169 const uschar *ptr = *ptrptr;
2170 const uschar *tempptr;
2171 uschar *previous = NULL;
2172 uschar *previous_callout = NULL;
2173 uschar *save_hwm = NULL;
2174 uschar classbits[32];
2178 BOOL utf8 = (options & PCRE_UTF8) != 0;
2179 uschar *class_utf8data;
2180 uschar utf8_char[6];
2183 uschar *utf8_char = NULL;
2187 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2190 /* Set up the default and non-default settings for greediness */
2192 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2193 greedy_non_default = greedy_default ^ 1;
2195 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2196 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2197 matches a non-fixed char first char; reqbyte just remains unset if we never
2200 When we hit a repeat whose minimum is zero, we may have to adjust these values
2201 to take the zero repeat into account. This is implemented by setting them to
2202 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2203 item types that can be repeated set these backoff variables appropriately. */
2205 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2207 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2208 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2209 value > 255. It is added into the firstbyte or reqbyte variables to record the
2210 case status of the value. This is used only for ASCII characters. */
2212 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2214 /* Switch on next character until the end of the branch */
2219 BOOL possessive_quantifier;
2222 BOOL reset_bracount;
2223 int class_charcount;
2235 /* Get next byte in the pattern */
2239 /* If we are in the pre-compile phase, accumulate the length used for the
2240 previous cycle of this loop. */
2242 if (lengthptr != NULL)
2245 if (code > cd->hwm) cd->hwm = code; /* High water info */
2247 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2249 *errorcodeptr = ERR52;
2253 /* There is at least one situation where code goes backwards: this is the
2254 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2255 the class is simply eliminated. However, it is created first, so we have to
2256 allow memory for it. Therefore, don't ever reduce the length at this point.
2259 if (code < last_code) code = last_code;
2260 *lengthptr += code - last_code;
2261 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2263 /* If "previous" is set and it is not at the start of the work space, move
2264 it back to there, in order to avoid filling up the work space. Otherwise,
2265 if "previous" is NULL, reset the current code pointer to the start. */
2267 if (previous != NULL)
2269 if (previous > orig_code)
2271 memmove(orig_code, previous, code - previous);
2272 code -= previous - orig_code;
2273 previous = orig_code;
2276 else code = orig_code;
2278 /* Remember where this code item starts so we can pick up the length
2284 /* In the real compile phase, just check the workspace used by the forward
2287 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2289 *errorcodeptr = ERR52;
2293 /* If in \Q...\E, check for the end; if not, we have a literal */
2295 if (inescq && c != 0)
2297 if (c == '\\' && ptr[1] == 'E')
2305 if (previous_callout != NULL)
2307 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2308 complete_callout(previous_callout, ptr, cd);
2309 previous_callout = NULL;
2311 if ((options & PCRE_AUTO_CALLOUT) != 0)
2313 previous_callout = code;
2314 code = auto_callout(code, ptr, cd);
2320 /* Fill in length of a previous callout, except when the next thing is
2323 is_quantifier = c == '*' || c == '+' || c == '?' ||
2324 (c == '{' && is_counted_repeat(ptr+1));
2326 if (!is_quantifier && previous_callout != NULL &&
2327 after_manual_callout-- <= 0)
2329 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2330 complete_callout(previous_callout, ptr, cd);
2331 previous_callout = NULL;
2334 /* In extended mode, skip white space and comments */
2336 if ((options & PCRE_EXTENDED) != 0)
2338 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2341 while (*(++ptr) != 0)
2343 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2345 if (*ptr != 0) continue;
2347 /* Else fall through to handle end of string */
2352 /* No auto callout for quantifiers. */
2354 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2356 previous_callout = code;
2357 code = auto_callout(code, ptr, cd);
2362 /* ===================================================================*/
2363 case 0: /* The branch terminates at string end */
2364 case '|': /* or | or ) */
2366 *firstbyteptr = firstbyte;
2367 *reqbyteptr = reqbyte;
2370 if (lengthptr != NULL)
2372 *lengthptr += code - last_code; /* To include callout length */
2373 DPRINTF((">> end branch\n"));
2378 /* ===================================================================*/
2379 /* Handle single-character metacharacters. In multiline mode, ^ disables
2380 the setting of any following char as a first character. */
2383 if ((options & PCRE_MULTILINE) != 0)
2385 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2396 /* There can never be a first char if '.' is first, whatever happens about
2397 repeats. The value of reqbyte doesn't change either. */
2400 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2401 zerofirstbyte = firstbyte;
2402 zeroreqbyte = reqbyte;
2408 /* ===================================================================*/
2409 /* Character classes. If the included characters are all < 256, we build a
2410 32-byte bitmap of the permitted characters, except in the special case
2411 where there is only one such character. For negated classes, we build the
2412 map as usual, then invert it at the end. However, we use a different opcode
2413 so that data characters > 255 can be handled correctly.
2415 If the class contains characters outside the 0-255 range, a different
2416 opcode is compiled. It may optionally have a bit map for characters < 256,
2417 but those above are explicitly listed afterwards. A flag byte tells
2418 whether the bitmap is present, and whether this is a negated class or not.
2424 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2425 they are encountered at the top level, so we'll do that too. */
2427 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2428 check_posix_syntax(ptr, &tempptr, cd))
2430 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2434 /* If the first character is '^', set the negation flag and skip it. */
2436 if ((c = *(++ptr)) == '^')
2438 negate_class = TRUE;
2443 negate_class = FALSE;
2446 /* Keep a count of chars with values < 256 so that we can optimize the case
2447 of just a single character (as long as it's < 256). However, for higher
2448 valued UTF-8 characters, we don't yet do any optimization. */
2450 class_charcount = 0;
2451 class_lastchar = -1;
2453 /* Initialize the 32-char bit map to all zeros. We build the map in a
2454 temporary bit of memory, in case the class contains only 1 character (less
2455 than 256), because in that case the compiled code doesn't use the bit map.
2458 memset(classbits, 0, 32 * sizeof(uschar));
2461 class_utf8 = FALSE; /* No chars >= 256 */
2462 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2465 /* Process characters until ] is reached. By writing this as a "do" it
2466 means that an initial ] is taken as a data character. At the start of the
2467 loop, c contains the first byte of the character. */
2471 const uschar *oldptr;
2474 if (utf8 && c > 127)
2475 { /* Braces are required because the */
2476 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2480 /* Inside \Q...\E everything is literal except \E */
2484 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2486 inescq = FALSE; /* Reset literal state */
2487 ptr++; /* Skip the 'E' */
2488 continue; /* Carry on with next */
2490 goto CHECK_RANGE; /* Could be range if \E follows */
2493 /* Handle POSIX class names. Perl allows a negation extension of the
2494 form [:^name:]. A square bracket that doesn't match the syntax is
2495 treated as a literal. We also recognize the POSIX constructions
2496 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2500 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2501 check_posix_syntax(ptr, &tempptr, cd))
2503 BOOL local_negate = FALSE;
2504 int posix_class, taboffset, tabopt;
2505 register const uschar *cbits = cd->cbits;
2510 *errorcodeptr = ERR31;
2517 local_negate = TRUE;
2521 posix_class = check_posix_name(ptr, tempptr - ptr);
2522 if (posix_class < 0)
2524 *errorcodeptr = ERR30;
2528 /* If matching is caseless, upper and lower are converted to
2529 alpha. This relies on the fact that the class table starts with
2530 alpha, lower, upper as the first 3 entries. */
2532 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2535 /* We build the bit map for the POSIX class in a chunk of local store
2536 because we may be adding and subtracting from it, and we don't want to
2537 subtract bits that may be in the main map already. At the end we or the
2538 result into the bit map that is being built. */
2542 /* Copy in the first table (always present) */
2544 memcpy(pbits, cbits + posix_class_maps[posix_class],
2545 32 * sizeof(uschar));
2547 /* If there is a second table, add or remove it as required. */
2549 taboffset = posix_class_maps[posix_class + 1];
2550 tabopt = posix_class_maps[posix_class + 2];
2555 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2557 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2560 /* Now see if we need to remove any special characters. An option
2561 value of 1 removes vertical space and 2 removes underscore. */
2563 if (tabopt < 0) tabopt = -tabopt;
2564 if (tabopt == 1) pbits[1] &= ~0x3c;
2565 else if (tabopt == 2) pbits[11] &= 0x7f;
2567 /* Add the POSIX table or its complement into the main table that is
2568 being built and we are done. */
2571 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2573 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2576 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2577 continue; /* End of POSIX syntax handling */
2580 /* Backslash may introduce a single character, or it may introduce one
2581 of the specials, which just set a flag. The sequence \b is a special
2582 case. Inside a class (and only there) it is treated as backspace.
2583 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2584 to or into the one we are building. We assume they have more than one
2585 character in them, so set class_charcount bigger than one. */
2589 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2590 if (*errorcodeptr != 0) goto FAILED;
2592 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2593 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2594 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2595 else if (-c == ESC_Q) /* Handle start of quoted string */
2597 if (ptr[1] == '\\' && ptr[2] == 'E')
2599 ptr += 2; /* avoid empty string */
2607 register const uschar *cbits = cd->cbits;
2608 class_charcount += 2; /* Greater than 1 is what matters */
2610 /* Save time by not doing this in the pre-compile phase. */
2612 if (lengthptr == NULL) switch (-c)
2615 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2619 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2623 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2627 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2631 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2632 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2636 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2637 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2640 case ESC_E: /* Perl ignores an orphan \E */
2643 default: /* Not recognized; fall through */
2644 break; /* Need "default" setting to stop compiler warning. */
2647 /* In the pre-compile phase, just do the recognition. */
2649 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2650 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2652 /* We need to deal with \H, \h, \V, and \v in both phases because
2653 they use extra memory. */
2657 SETBIT(classbits, 0x09); /* HT */
2658 SETBIT(classbits, 0x20); /* SPACE */
2659 SETBIT(classbits, 0xa0); /* NSBP */
2664 *class_utf8data++ = XCL_SINGLE;
2665 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2666 *class_utf8data++ = XCL_SINGLE;
2667 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2668 *class_utf8data++ = XCL_RANGE;
2669 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2670 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2671 *class_utf8data++ = XCL_SINGLE;
2672 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2673 *class_utf8data++ = XCL_SINGLE;
2674 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2675 *class_utf8data++ = XCL_SINGLE;
2676 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2684 for (c = 0; c < 32; c++)
2689 case 0x09/8: x ^= 1 << (0x09%8); break;
2690 case 0x20/8: x ^= 1 << (0x20%8); break;
2691 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2701 *class_utf8data++ = XCL_RANGE;
2702 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2703 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2704 *class_utf8data++ = XCL_RANGE;
2705 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2706 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2707 *class_utf8data++ = XCL_RANGE;
2708 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2709 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2710 *class_utf8data++ = XCL_RANGE;
2711 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2712 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2713 *class_utf8data++ = XCL_RANGE;
2714 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2715 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2716 *class_utf8data++ = XCL_RANGE;
2717 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2718 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2719 *class_utf8data++ = XCL_RANGE;
2720 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2721 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2729 SETBIT(classbits, 0x0a); /* LF */
2730 SETBIT(classbits, 0x0b); /* VT */
2731 SETBIT(classbits, 0x0c); /* FF */
2732 SETBIT(classbits, 0x0d); /* CR */
2733 SETBIT(classbits, 0x85); /* NEL */
2738 *class_utf8data++ = XCL_RANGE;
2739 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2740 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2748 for (c = 0; c < 32; c++)
2753 case 0x0a/8: x ^= 1 << (0x0a%8);
2758 case 0x85/8: x ^= 1 << (0x85%8); break;
2768 *class_utf8data++ = XCL_RANGE;
2769 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2770 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2771 *class_utf8data++ = XCL_RANGE;
2772 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2773 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2779 /* We need to deal with \P and \p in both phases. */
2782 if (-c == ESC_p || -c == ESC_P)
2786 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2787 if (ptype < 0) goto FAILED;
2789 *class_utf8data++ = ((-c == ESC_p) != negated)?
2790 XCL_PROP : XCL_NOTPROP;
2791 *class_utf8data++ = ptype;
2792 *class_utf8data++ = pdata;
2793 class_charcount -= 2; /* Not a < 256 character */
2797 /* Unrecognized escapes are faulted if PCRE is running in its
2798 strict mode. By default, for compatibility with Perl, they are
2799 treated as literals. */
2801 if ((options & PCRE_EXTRA) != 0)
2803 *errorcodeptr = ERR7;
2807 class_charcount -= 2; /* Undo the default count from above */
2808 c = *ptr; /* Get the final character and fall through */
2811 /* Fall through if we have a single character (c >= 0). This may be
2812 greater than 256 in UTF-8 mode. */
2814 } /* End of backslash handling */
2816 /* A single character may be followed by '-' to form a range. However,
2817 Perl does not permit ']' to be the end of the range. A '-' character
2818 at the end is treated as a literal. Perl ignores orphaned \E sequences
2819 entirely. The code for handling \Q and \E is messy. */
2822 while (ptr[1] == '\\' && ptr[2] == 'E')
2830 if (!inescq && ptr[1] == '-')
2834 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2836 /* If we hit \Q (not followed by \E) at this point, go into escaped
2839 while (*ptr == '\\' && ptr[1] == 'Q')
2842 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2847 if (*ptr == 0 || (!inescq && *ptr == ']'))
2850 goto LONE_SINGLE_CHARACTER;
2855 { /* Braces are required because the */
2856 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2860 d = *ptr; /* Not UTF-8 mode */
2862 /* The second part of a range can be a single-character escape, but
2863 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2864 in such circumstances. */
2866 if (!inescq && d == '\\')
2868 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2869 if (*errorcodeptr != 0) goto FAILED;
2871 /* \b is backspace; \X is literal X; \R is literal R; any other
2872 special means the '-' was literal */
2876 if (d == -ESC_b) d = '\b';
2877 else if (d == -ESC_X) d = 'X';
2878 else if (d == -ESC_R) d = 'R'; else
2881 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2886 /* Check that the two values are in the correct order. Optimize
2887 one-character ranges */
2891 *errorcodeptr = ERR8;
2895 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2897 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2898 matching, we have to use an XCLASS with extra data items. Caseless
2899 matching for characters > 127 is available only if UCP support is
2903 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2907 /* With UCP support, we can find the other case equivalents of
2908 the relevant characters. There may be several ranges. Optimize how
2909 they fit with the basic range. */
2912 if ((options & PCRE_CASELESS) != 0)
2914 unsigned int occ, ocd;
2915 unsigned int cc = c;
2916 unsigned int origd = d;
2917 while (get_othercase_range(&cc, origd, &occ, &ocd))
2919 if (occ >= (unsigned int)c &&
2920 ocd <= (unsigned int)d)
2921 continue; /* Skip embedded ranges */
2923 if (occ < (unsigned int)c &&
2924 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2925 { /* if there is overlap, */
2926 c = occ; /* noting that if occ < c */
2927 continue; /* we can't have ocd > d */
2928 } /* because a subrange is */
2929 if (ocd > (unsigned int)d &&
2930 occ <= (unsigned int)d + 1) /* always shorter than */
2931 { /* the basic range. */
2938 *class_utf8data++ = XCL_SINGLE;
2942 *class_utf8data++ = XCL_RANGE;
2943 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2945 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2948 #endif /* SUPPORT_UCP */
2950 /* Now record the original range, possibly modified for UCP caseless
2951 overlapping ranges. */
2953 *class_utf8data++ = XCL_RANGE;
2954 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2955 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2957 /* With UCP support, we are done. Without UCP support, there is no
2958 caseless matching for UTF-8 characters > 127; we can use the bit map
2959 for the smaller ones. */
2962 continue; /* With next character in the class */
2964 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2966 /* Adjust upper limit and fall through to set up the map */
2970 #endif /* SUPPORT_UCP */
2972 #endif /* SUPPORT_UTF8 */
2974 /* We use the bit map for all cases when not in UTF-8 mode; else
2975 ranges that lie entirely within 0-127 when there is UCP support; else
2976 for partial ranges without UCP support. */
2978 class_charcount += d - c + 1;
2981 /* We can save a bit of time by skipping this in the pre-compile. */
2983 if (lengthptr == NULL) for (; c <= d; c++)
2985 classbits[c/8] |= (1 << (c&7));
2986 if ((options & PCRE_CASELESS) != 0)
2988 int uc = cd->fcc[c]; /* flip case */
2989 classbits[uc/8] |= (1 << (uc&7));
2993 continue; /* Go get the next char in the class */
2996 /* Handle a lone single character - we can get here for a normal
2997 non-escape char, or after \ that introduces a single character or for an
2998 apparent range that isn't. */
3000 LONE_SINGLE_CHARACTER:
3002 /* Handle a character that cannot go in the bit map */
3005 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3008 *class_utf8data++ = XCL_SINGLE;
3009 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3012 if ((options & PCRE_CASELESS) != 0)
3014 unsigned int othercase;
3015 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3017 *class_utf8data++ = XCL_SINGLE;
3018 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3021 #endif /* SUPPORT_UCP */
3025 #endif /* SUPPORT_UTF8 */
3027 /* Handle a single-byte character */
3029 classbits[c/8] |= (1 << (c&7));
3030 if ((options & PCRE_CASELESS) != 0)
3032 c = cd->fcc[c]; /* flip case */
3033 classbits[c/8] |= (1 << (c&7));
3040 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3042 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3044 if (c == 0) /* Missing terminating ']' */
3046 *errorcodeptr = ERR6;
3050 /* If class_charcount is 1, we saw precisely one character whose value is
3051 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3052 can optimize the negative case only if there were no characters >= 128
3053 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3054 single-bytes only. This is an historical hangover. Maybe one day we can
3055 tidy these opcodes to handle multi-byte characters.
3057 The optimization throws away the bit map. We turn the item into a
3058 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3059 that OP_NOT does not support multibyte characters. In the positive case, it
3060 can cause firstbyte to be set. Otherwise, there can be no first char if
3061 this item is first, whatever repeat count may follow. In the case of
3062 reqbyte, save the previous value for reinstating. */
3065 if (class_charcount == 1 &&
3067 (!class_utf8 && (!negate_class || class_lastchar < 128))))
3070 if (class_charcount == 1)
3073 zeroreqbyte = reqbyte;
3075 /* The OP_NOT opcode works on one-byte characters only. */
3079 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3080 zerofirstbyte = firstbyte;
3082 *code++ = class_lastchar;
3086 /* For a single, positive character, get the value into mcbuffer, and
3087 then we can handle this with the normal one-character code. */
3090 if (utf8 && class_lastchar > 127)
3091 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3095 mcbuffer[0] = class_lastchar;
3099 } /* End of 1-char optimization */
3101 /* The general case - not the one-char optimization. If this is the first
3102 thing in the branch, there can be no first char setting, whatever the
3103 repeat count. Any reqbyte setting must remain unchanged after any kind of
3106 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3107 zerofirstbyte = firstbyte;
3108 zeroreqbyte = reqbyte;
3110 /* If there are characters with values > 255, we have to compile an
3111 extended class, with its own opcode. If there are no characters < 256,
3112 we can omit the bitmap in the actual compiled code. */
3117 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3118 *code++ = OP_XCLASS;
3120 *code = negate_class? XCL_NOT : 0;
3122 /* If the map is required, move up the extra data to make room for it;
3123 otherwise just move the code pointer to the end of the extra data. */
3125 if (class_charcount > 0)
3128 memmove(code + 32, code, class_utf8data - code);
3129 memcpy(code, classbits, 32);
3130 code = class_utf8data + 32;
3132 else code = class_utf8data;
3134 /* Now fill in the complete length of the item */
3136 PUT(previous, 1, code - previous);
3137 break; /* End of class handling */
3141 /* If there are no characters > 255, negate the 32-byte map if necessary,
3142 and copy it into the code vector. If this is the first thing in the branch,
3143 there can be no first char setting, whatever the repeat count. Any reqbyte
3144 setting must remain unchanged after any kind of repeat. */
3148 *code++ = OP_NCLASS;
3149 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3150 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3155 memcpy(code, classbits, 32);
3161 /* ===================================================================*/
3162 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3163 has been tested above. */
3166 if (!is_quantifier) goto NORMAL_CHAR;
3167 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3168 if (*errorcodeptr != 0) goto FAILED;
3186 if (previous == NULL)
3188 *errorcodeptr = ERR9;
3192 if (repeat_min == 0)
3194 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3195 reqbyte = zeroreqbyte; /* Ditto */
3198 /* Remember whether this is a variable length repeat */
3200 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3202 op_type = 0; /* Default single-char op codes */
3203 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3205 /* Save start of previous item, in case we have to move it up to make space
3206 for an inserted OP_ONCE for the additional '+' extension. */
3208 tempcode = previous;
3210 /* If the next character is '+', we have a possessive quantifier. This
3211 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3212 If the next character is '?' this is a minimizing repeat, by default,
3213 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3214 repeat type to the non-default. */
3218 repeat_type = 0; /* Force greedy */
3219 possessive_quantifier = TRUE;
3222 else if (ptr[1] == '?')
3224 repeat_type = greedy_non_default;
3227 else repeat_type = greedy_default;
3229 /* If previous was a character match, abolish the item and generate a
3230 repeat item instead. If a char item has a minimum of more than one, ensure
3231 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3232 the first thing in a branch because the x will have gone into firstbyte
3235 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3237 /* Deal with UTF-8 characters that take up more than one byte. It's
3238 easier to write this out separately than try to macrify it. Use c to
3239 hold the length of the character in bytes, plus 0x80 to flag that it's a
3240 length rather than a small character. */
3243 if (utf8 && (code[-1] & 0x80) != 0)
3245 uschar *lastchar = code - 1;
3246 while((*lastchar & 0xc0) == 0x80) lastchar--;
3247 c = code - lastchar; /* Length of UTF-8 character */
3248 memcpy(utf8_char, lastchar, c); /* Save the char */
3249 c |= 0x80; /* Flag c as a length */
3254 /* Handle the case of a single byte - either with no UTF8 support, or
3255 with UTF-8 disabled, or for a UTF-8 character < 128. */
3259 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3262 /* If the repetition is unlimited, it pays to see if the next thing on
3263 the line is something that cannot possibly match this character. If so,
3264 automatically possessifying this item gains some performance in the case
3265 where the match fails. */
3267 if (!possessive_quantifier &&
3269 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3272 repeat_type = 0; /* Force greedy */
3273 possessive_quantifier = TRUE;
3276 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3279 /* If previous was a single negated character ([^a] or similar), we use
3280 one of the special opcodes, replacing it. The code is shared with single-
3281 character repeats by setting op_type to add a suitable offset into
3282 repeat_type. We can also test for auto-possessification. OP_NOT is
3283 currently used only for single-byte chars. */
3285 else if (*previous == OP_NOT)
3287 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3289 if (!possessive_quantifier &&
3291 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3293 repeat_type = 0; /* Force greedy */
3294 possessive_quantifier = TRUE;
3296 goto OUTPUT_SINGLE_REPEAT;
3299 /* If previous was a character type match (\d or similar), abolish it and
3300 create a suitable repeat item. The code is shared with single-character
3301 repeats by setting op_type to add a suitable offset into repeat_type. Note
3302 that the Unicode property types will be present only when SUPPORT_UCP is
3303 defined, but we don't wrap the little bits of code here because it just
3304 makes it horribly messy. */
3306 else if (*previous < OP_EODN)
3309 int prop_type, prop_value;
3310 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3313 if (!possessive_quantifier &&
3315 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3317 repeat_type = 0; /* Force greedy */
3318 possessive_quantifier = TRUE;
3321 OUTPUT_SINGLE_REPEAT:
3322 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3324 prop_type = previous[1];
3325 prop_value = previous[2];
3327 else prop_type = prop_value = -1;
3330 code = previous; /* Usually overwrite previous item */
3332 /* If the maximum is zero then the minimum must also be zero; Perl allows
3333 this case, so we do too - by simply omitting the item altogether. */
3335 if (repeat_max == 0) goto END_REPEAT;
3337 /* All real repeats make it impossible to handle partial matching (maybe
3338 one day we will be able to remove this restriction). */
3340 if (repeat_max != 1) cd->nopartial = TRUE;
3342 /* Combine the op_type with the repeat_type */
3344 repeat_type += op_type;
3346 /* A minimum of zero is handled either as the special case * or ?, or as
3347 an UPTO, with the maximum given. */
3349 if (repeat_min == 0)
3351 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3352 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3355 *code++ = OP_UPTO + repeat_type;
3356 PUT2INC(code, 0, repeat_max);
3360 /* A repeat minimum of 1 is optimized into some special cases. If the
3361 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3362 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3363 one less than the maximum. */
3365 else if (repeat_min == 1)
3367 if (repeat_max == -1)
3368 *code++ = OP_PLUS + repeat_type;
3371 code = oldcode; /* leave previous item in place */
3372 if (repeat_max == 1) goto END_REPEAT;
3373 *code++ = OP_UPTO + repeat_type;
3374 PUT2INC(code, 0, repeat_max - 1);
3378 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3379 handled as an EXACT followed by an UPTO. */
3383 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3384 PUT2INC(code, 0, repeat_min);
3386 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3387 we have to insert the character for the previous code. For a repeated
3388 Unicode property match, there are two extra bytes that define the
3389 required property. In UTF-8 mode, long characters have their length in
3390 c, with the 0x80 bit as a flag. */
3395 if (utf8 && c >= 128)
3397 memcpy(code, utf8_char, c & 7);
3406 *code++ = prop_type;
3407 *code++ = prop_value;
3410 *code++ = OP_STAR + repeat_type;
3413 /* Else insert an UPTO if the max is greater than the min, again
3414 preceded by the character, for the previously inserted code. If the
3415 UPTO is just for 1 instance, we can use QUERY instead. */
3417 else if (repeat_max != repeat_min)
3420 if (utf8 && c >= 128)
3422 memcpy(code, utf8_char, c & 7);
3430 *code++ = prop_type;
3431 *code++ = prop_value;
3433 repeat_max -= repeat_min;
3435 if (repeat_max == 1)
3437 *code++ = OP_QUERY + repeat_type;
3441 *code++ = OP_UPTO + repeat_type;
3442 PUT2INC(code, 0, repeat_max);
3447 /* The character or character type itself comes last in all cases. */
3450 if (utf8 && c >= 128)
3452 memcpy(code, utf8_char, c & 7);
3459 /* For a repeated Unicode property match, there are two extra bytes that
3460 define the required property. */
3465 *code++ = prop_type;
3466 *code++ = prop_value;
3471 /* If previous was a character class or a back reference, we put the repeat
3472 stuff after it, but just skip the item if the repeat was {0,0}. */
3474 else if (*previous == OP_CLASS ||
3475 *previous == OP_NCLASS ||
3477 *previous == OP_XCLASS ||
3479 *previous == OP_REF)
3481 if (repeat_max == 0)
3487 /* All real repeats make it impossible to handle partial matching (maybe
3488 one day we will be able to remove this restriction). */
3490 if (repeat_max != 1) cd->nopartial = TRUE;
3492 if (repeat_min == 0 && repeat_max == -1)
3493 *code++ = OP_CRSTAR + repeat_type;
3494 else if (repeat_min == 1 && repeat_max == -1)
3495 *code++ = OP_CRPLUS + repeat_type;
3496 else if (repeat_min == 0 && repeat_max == 1)
3497 *code++ = OP_CRQUERY + repeat_type;
3500 *code++ = OP_CRRANGE + repeat_type;
3501 PUT2INC(code, 0, repeat_min);
3502 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3503 PUT2INC(code, 0, repeat_max);
3507 /* If previous was a bracket group, we may have to replicate it in certain
3510 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3511 *previous == OP_ONCE || *previous == OP_COND)
3515 int len = code - previous;
3516 uschar *bralink = NULL;
3518 /* Repeating a DEFINE group is pointless */
3520 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3522 *errorcodeptr = ERR55;
3526 /* This is a paranoid check to stop integer overflow later on */
3528 if (len > MAX_DUPLENGTH)
3530 *errorcodeptr = ERR50;
3534 /* If the maximum repeat count is unlimited, find the end of the bracket
3535 by scanning through from the start, and compute the offset back to it
3536 from the current code pointer. There may be an OP_OPT setting following
3537 the final KET, so we can't find the end just by going back from the code
3540 if (repeat_max == -1)
3542 register uschar *ket = previous;
3543 do ket += GET(ket, 1); while (*ket != OP_KET);
3544 ketoffset = code - ket;
3547 /* The case of a zero minimum is special because of the need to stick
3548 OP_BRAZERO in front of it, and because the group appears once in the
3549 data, whereas in other cases it appears the minimum number of times. For
3550 this reason, it is simplest to treat this case separately, as otherwise
3551 the code gets far too messy. There are several special subcases when the
3554 if (repeat_min == 0)
3556 /* If the maximum is also zero, we just omit the group from the output
3559 if (repeat_max == 0)
3565 /* If the maximum is 1 or unlimited, we just have to stick in the
3566 BRAZERO and do no more at this point. However, we do need to adjust
3567 any OP_RECURSE calls inside the group that refer to the group itself or
3568 any internal or forward referenced group, because the offset is from
3569 the start of the whole regex. Temporarily terminate the pattern while
3572 if (repeat_max <= 1)
3575 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3576 memmove(previous+1, previous, len);
3578 *previous++ = OP_BRAZERO + repeat_type;
3581 /* If the maximum is greater than 1 and limited, we have to replicate
3582 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3583 The first one has to be handled carefully because it's the original
3584 copy, which has to be moved up. The remainder can be handled by code
3585 that is common with the non-zero minimum case below. We have to
3586 adjust the value of repeat_max, since one less copy is required. Once
3587 again, we may have to adjust any OP_RECURSE calls inside the group. */
3593 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3594 memmove(previous + 2 + LINK_SIZE, previous, len);
3595 code += 2 + LINK_SIZE;
3596 *previous++ = OP_BRAZERO + repeat_type;
3597 *previous++ = OP_BRA;
3599 /* We chain together the bracket offset fields that have to be
3600 filled in later when the ends of the brackets are reached. */
3602 offset = (bralink == NULL)? 0 : previous - bralink;
3604 PUTINC(previous, 0, offset);
3610 /* If the minimum is greater than zero, replicate the group as many
3611 times as necessary, and adjust the maximum to the number of subsequent
3612 copies that we need. If we set a first char from the group, and didn't
3613 set a required char, copy the latter from the former. If there are any
3614 forward reference subroutine calls in the group, there will be entries on
3615 the workspace list; replicate these with an appropriate increment. */
3621 /* In the pre-compile phase, we don't actually do the replication. We
3622 just adjust the length as if we had. */
3624 if (lengthptr != NULL)
3625 *lengthptr += (repeat_min - 1)*length_prevgroup;
3627 /* This is compiling for real */
3631 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3632 for (i = 1; i < repeat_min; i++)
3635 uschar *this_hwm = cd->hwm;
3636 memcpy(code, previous, len);
3637 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3639 PUT(cd->hwm, 0, GET(hc, 0) + len);
3640 cd->hwm += LINK_SIZE;
3642 save_hwm = this_hwm;
3648 if (repeat_max > 0) repeat_max -= repeat_min;
3651 /* This code is common to both the zero and non-zero minimum cases. If
3652 the maximum is limited, it replicates the group in a nested fashion,
3653 remembering the bracket starts on a stack. In the case of a zero minimum,
3654 the first one was set up above. In all cases the repeat_max now specifies
3655 the number of additional copies needed. Again, we must remember to
3656 replicate entries on the forward reference list. */
3658 if (repeat_max >= 0)
3660 /* In the pre-compile phase, we don't actually do the replication. We
3661 just adjust the length as if we had. For each repetition we must add 1
3662 to the length for BRAZERO and for all but the last repetition we must
3663 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3665 if (lengthptr != NULL && repeat_max > 0)
3666 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3667 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3669 /* This is compiling for real */
3671 else for (i = repeat_max - 1; i >= 0; i--)
3674 uschar *this_hwm = cd->hwm;
3676 *code++ = OP_BRAZERO + repeat_type;
3678 /* All but the final copy start a new nesting, maintaining the
3679 chain of brackets outstanding. */
3685 offset = (bralink == NULL)? 0 : code - bralink;
3687 PUTINC(code, 0, offset);
3690 memcpy(code, previous, len);
3691 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3693 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3694 cd->hwm += LINK_SIZE;
3696 save_hwm = this_hwm;
3700 /* Now chain through the pending brackets, and fill in their length
3701 fields (which are holding the chain links pro tem). */
3703 while (bralink != NULL)
3706 int offset = code - bralink + 1;
3707 uschar *bra = code - offset;
3708 oldlinkoffset = GET(bra, 1);
3709 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3711 PUTINC(code, 0, offset);
3712 PUT(bra, 1, offset);
3716 /* If the maximum is unlimited, set a repeater in the final copy. We
3717 can't just offset backwards from the current code point, because we
3718 don't know if there's been an options resetting after the ket. The
3719 correct offset was computed above.
3721 Then, when we are doing the actual compile phase, check to see whether
3722 this group is a non-atomic one that could match an empty string. If so,
3723 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3724 that runtime checking can be done. [This check is also applied to
3725 atomic groups at runtime, but in a different way.] */
3729 uschar *ketcode = code - ketoffset;
3730 uschar *bracode = ketcode - GET(ketcode, 1);
3731 *ketcode = OP_KETRMAX + repeat_type;
3732 if (lengthptr == NULL && *bracode != OP_ONCE)
3734 uschar *scode = bracode;
3737 if (could_be_empty_branch(scode, ketcode, utf8))
3739 *bracode += OP_SBRA - OP_BRA;
3742 scode += GET(scode, 1);
3744 while (*scode == OP_ALT);
3749 /* Else there's some kind of shambles */
3753 *errorcodeptr = ERR11;
3757 /* If the character following a repeat is '+', or if certain optimization
3758 tests above succeeded, possessive_quantifier is TRUE. For some of the
3759 simpler opcodes, there is a special alternative opcode for this. For
3760 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3761 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3762 but the special opcodes can optimize it a bit. The repeated item starts at
3763 tempcode, not at previous, which might be the first part of a string whose
3764 (former) last char we repeated.
3766 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3767 an 'upto' may follow. We skip over an 'exact' item, and then test the
3768 length of what remains before proceeding. */
3770 if (possessive_quantifier)
3773 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3774 *tempcode == OP_NOTEXACT)
3775 tempcode += _pcre_OP_lengths[*tempcode];
3776 len = code - tempcode;
3777 if (len > 0) switch (*tempcode)
3779 case OP_STAR: *tempcode = OP_POSSTAR; break;
3780 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3781 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3782 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3784 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3785 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3786 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3787 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3789 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3790 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3791 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3792 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3795 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3796 code += 1 + LINK_SIZE;
3797 len += 1 + LINK_SIZE;
3798 tempcode[0] = OP_ONCE;
3800 PUTINC(code, 0, len);
3801 PUT(tempcode, 1, len);
3806 /* In all cases we no longer have a previous item. We also set the
3807 "follows varying string" flag for subsequently encountered reqbytes if
3808 it isn't already set and we have just passed a varying length item. */
3812 cd->req_varyopt |= reqvary;
3816 /* ===================================================================*/
3817 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3818 lookbehind or option setting or condition or all the other extended
3819 parenthesis forms. First deal with the specials; all are introduced by ?,
3820 and the appearance of any of them means that this is not a capturing
3824 newoptions = options;
3828 reset_bracount = FALSE;
3830 if (*(++ptr) == '?')
3832 int i, set, unset, namelen;
3839 case '#': /* Comment; skip to ket */
3841 while (*ptr != 0 && *ptr != ')') ptr++;
3844 *errorcodeptr = ERR18;
3850 /* ------------------------------------------------------------ */
3851 case '|': /* Reset capture count for each branch */
3852 reset_bracount = TRUE;
3855 /* ------------------------------------------------------------ */
3856 case ':': /* Non-capturing bracket */
3862 /* ------------------------------------------------------------ */
3864 bravalue = OP_COND; /* Conditional group */
3866 /* A condition can be an assertion, a number (referring to a numbered
3867 group), a name (referring to a named group), or 'R', referring to
3868 recursion. R<digits> and R&name are also permitted for recursion tests.
3870 There are several syntaxes for testing a named group: (?(name)) is used
3871 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3873 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3874 be the recursive thing or the name 'R' (and similarly for 'R' followed
3875 by digits), and (b) a number could be a name that consists of digits.
3876 In both cases, we look for a name first; if not found, we try the other
3879 /* For conditions that are assertions, check the syntax, and then exit
3880 the switch. This will take control down to where bracketed groups,
3881 including assertions, are processed. */
3883 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3886 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3887 below), and all need to skip 3 bytes at the start of the group. */
3889 code[1+LINK_SIZE] = OP_CREF;
3893 /* Check for a test for recursion in a named group. */
3895 if (ptr[1] == 'R' && ptr[2] == '&')
3899 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3902 /* Check for a test for a named group's having been set, using the Perl
3903 syntax (?(<name>) or (?('name') */
3905 else if (ptr[1] == '<')
3910 else if (ptr[1] == '\'')
3918 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3921 /* We now expect to read a name; anything else is an error */
3923 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3925 ptr += 1; /* To get the right offset */
3926 *errorcodeptr = ERR28;
3930 /* Read the name, but also get it as a number if it's all digits */
3934 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3937 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3938 recno * 10 + *ptr - '0' : -1;
3941 namelen = ptr - name;
3943 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3945 ptr--; /* Error offset */
3946 *errorcodeptr = ERR26;
3950 /* Do no further checking in the pre-compile phase. */
3952 if (lengthptr != NULL) break;
3954 /* In the real compile we do the work of looking for the actual
3955 reference. If the string started with "+" or "-" we require the rest to
3956 be digits, in which case recno will be set. */
3962 *errorcodeptr = ERR58;
3967 recno = cd->bracount - recno + 1;
3970 *errorcodeptr = ERR15;
3974 else recno += cd->bracount;
3975 PUT2(code, 2+LINK_SIZE, recno);
3979 /* Otherwise (did not start with "+" or "-"), start by looking for the
3982 slot = cd->name_table;
3983 for (i = 0; i < cd->names_found; i++)
3985 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3986 slot += cd->name_entry_size;
3989 /* Found a previous named subpattern */
3991 if (i < cd->names_found)
3993 recno = GET2(slot, 0);
3994 PUT2(code, 2+LINK_SIZE, recno);
3997 /* Search the pattern for a forward reference */
3999 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4000 (options & PCRE_EXTENDED) != 0)) > 0)
4002 PUT2(code, 2+LINK_SIZE, i);
4005 /* If terminator == 0 it means that the name followed directly after
4006 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4007 some further alternatives to try. For the cases where terminator != 0
4008 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4009 now checked all the possibilities, so give an error. */
4011 else if (terminator != 0)
4013 *errorcodeptr = ERR15;
4017 /* Check for (?(R) for recursion. Allow digits after R to specify a
4018 specific group number. */
4020 else if (*name == 'R')
4023 for (i = 1; i < namelen; i++)
4025 if ((digitab[name[i]] & ctype_digit) == 0)
4027 *errorcodeptr = ERR15;
4030 recno = recno * 10 + name[i] - '0';
4032 if (recno == 0) recno = RREF_ANY;
4033 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4034 PUT2(code, 2+LINK_SIZE, recno);
4037 /* Similarly, check for the (?(DEFINE) "condition", which is always
4040 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4042 code[1+LINK_SIZE] = OP_DEF;
4046 /* Check for the "name" actually being a subpattern number. */
4050 PUT2(code, 2+LINK_SIZE, recno);
4053 /* Either an unidentified subpattern, or a reference to (?(0) */
4057 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4063 /* ------------------------------------------------------------ */
4064 case '=': /* Positive lookahead */
4065 bravalue = OP_ASSERT;
4070 /* ------------------------------------------------------------ */
4071 case '!': /* Negative lookahead */
4072 bravalue = OP_ASSERT_NOT;
4077 /* ------------------------------------------------------------ */
4078 case '<': /* Lookbehind or named define */
4081 case '=': /* Positive lookbehind */
4082 bravalue = OP_ASSERTBACK;
4086 case '!': /* Negative lookbehind */
4087 bravalue = OP_ASSERTBACK_NOT;
4091 default: /* Could be name define, else bad */
4092 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4093 ptr++; /* Correct offset for error */
4094 *errorcodeptr = ERR24;
4100 /* ------------------------------------------------------------ */
4101 case '>': /* One-time brackets */
4107 /* ------------------------------------------------------------ */
4108 case 'C': /* Callout - may be followed by digits; */
4109 previous_callout = code; /* Save for later completion */
4110 after_manual_callout = 1; /* Skip one item before completing */
4111 *code++ = OP_CALLOUT;
4114 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4115 n = n * 10 + *ptr - '0';
4118 *errorcodeptr = ERR39;
4123 *errorcodeptr = ERR38;
4127 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4128 PUT(code, LINK_SIZE, 0); /* Default length */
4129 code += 2 * LINK_SIZE;
4135 /* ------------------------------------------------------------ */
4136 case 'P': /* Python-style named subpattern handling */
4137 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4139 is_recurse = *ptr == '>';
4141 goto NAMED_REF_OR_RECURSE;
4143 else if (*ptr != '<') /* Test for Python-style definition */
4145 *errorcodeptr = ERR41;
4148 /* Fall through to handle (?P< as (?< is handled */
4151 /* ------------------------------------------------------------ */
4152 DEFINE_NAME: /* Come here from (?< handling */
4155 terminator = (*ptr == '<')? '>' : '\'';
4158 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4159 namelen = ptr - name;
4161 /* In the pre-compile phase, just do a syntax check. */
4163 if (lengthptr != NULL)
4165 if (*ptr != terminator)
4167 *errorcodeptr = ERR42;
4170 if (cd->names_found >= MAX_NAME_COUNT)
4172 *errorcodeptr = ERR49;
4175 if (namelen + 3 > cd->name_entry_size)
4177 cd->name_entry_size = namelen + 3;
4178 if (namelen > MAX_NAME_SIZE)
4180 *errorcodeptr = ERR48;
4186 /* In the real compile, create the entry in the table */
4190 slot = cd->name_table;
4191 for (i = 0; i < cd->names_found; i++)
4193 int crc = memcmp(name, slot+2, namelen);
4196 if (slot[2+namelen] == 0)
4198 if ((options & PCRE_DUPNAMES) == 0)
4200 *errorcodeptr = ERR43;
4204 else crc = -1; /* Current name is substring */
4208 memmove(slot + cd->name_entry_size, slot,
4209 (cd->names_found - i) * cd->name_entry_size);
4212 slot += cd->name_entry_size;
4215 PUT2(slot, 0, cd->bracount + 1);
4216 memcpy(slot + 2, name, namelen);
4217 slot[2+namelen] = 0;
4221 /* In both cases, count the number of names we've encountered. */
4223 ptr++; /* Move past > or ' */
4225 goto NUMBERED_GROUP;
4228 /* ------------------------------------------------------------ */
4229 case '&': /* Perl recursion/subroutine syntax */
4234 /* We come here from the Python syntax above that handles both
4235 references (?P=name) and recursion (?P>name), as well as falling
4236 through from the Perl recursion syntax (?&name). */
4238 NAMED_REF_OR_RECURSE:
4240 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4241 namelen = ptr - name;
4243 /* In the pre-compile phase, do a syntax check and set a dummy
4244 reference number. */
4246 if (lengthptr != NULL)
4248 if (*ptr != terminator)
4250 *errorcodeptr = ERR42;
4253 if (namelen > MAX_NAME_SIZE)
4255 *errorcodeptr = ERR48;
4261 /* In the real compile, seek the name in the table */
4265 slot = cd->name_table;
4266 for (i = 0; i < cd->names_found; i++)
4268 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4269 slot += cd->name_entry_size;
4272 if (i < cd->names_found) /* Back reference */
4274 recno = GET2(slot, 0);
4276 else if ((recno = /* Forward back reference */
4277 find_parens(ptr, cd->bracount, name, namelen,
4278 (options & PCRE_EXTENDED) != 0)) <= 0)
4280 *errorcodeptr = ERR15;
4285 /* In both phases, we can now go to the code that handles numerical
4286 recursion or backreferences. */
4288 if (is_recurse) goto HANDLE_RECURSION;
4289 else goto HANDLE_REFERENCE;
4292 /* ------------------------------------------------------------ */
4293 case 'R': /* Recursion */
4294 ptr++; /* Same as (?0) */
4298 /* ------------------------------------------------------------ */
4300 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4301 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4303 const uschar *called;
4305 if ((refsign = *ptr) == '+') ptr++;
4306 else if (refsign == '-')
4308 if ((digitab[ptr[1]] & ctype_digit) == 0)
4309 goto OTHER_CHAR_AFTER_QUERY;
4314 while((digitab[*ptr] & ctype_digit) != 0)
4315 recno = recno * 10 + *ptr++ - '0';
4319 *errorcodeptr = ERR29;
4327 *errorcodeptr = ERR58;
4330 recno = cd->bracount - recno + 1;
4333 *errorcodeptr = ERR15;
4337 else if (refsign == '+')
4341 *errorcodeptr = ERR58;
4344 recno += cd->bracount;
4347 /* Come here from code above that handles a named recursion */
4352 called = cd->start_code;
4354 /* When we are actually compiling, find the bracket that is being
4355 referenced. Temporarily end the regex in case it doesn't exist before
4356 this point. If we end up with a forward reference, first check that
4357 the bracket does occur later so we can give the error (and position)
4358 now. Then remember this forward reference in the workspace so it can
4359 be filled in at the end. */
4361 if (lengthptr == NULL)
4364 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4366 /* Forward reference */
4370 if (find_parens(ptr, cd->bracount, NULL, recno,
4371 (options & PCRE_EXTENDED) != 0) < 0)
4373 *errorcodeptr = ERR15;
4376 called = cd->start_code + recno;
4377 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4380 /* If not a forward reference, and the subpattern is still open,
4381 this is a recursive call. We check to see if this is a left
4382 recursion that could loop for ever, and diagnose that case. */
4384 else if (GET(called, 1) == 0 &&
4385 could_be_empty(called, code, bcptr, utf8))
4387 *errorcodeptr = ERR40;
4392 /* Insert the recursion/subroutine item, automatically wrapped inside
4393 "once" brackets. Set up a "previous group" length so that a
4394 subsequent quantifier will work. */
4397 PUT(code, 1, 2 + 2*LINK_SIZE);
4398 code += 1 + LINK_SIZE;
4401 PUT(code, 1, called - cd->start_code);
4402 code += 1 + LINK_SIZE;
4405 PUT(code, 1, 2 + 2*LINK_SIZE);
4406 code += 1 + LINK_SIZE;
4408 length_prevgroup = 3 + 3*LINK_SIZE;
4411 /* Can't determine a first byte now */
4413 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4417 /* ------------------------------------------------------------ */
4418 default: /* Other characters: check option setting */
4419 OTHER_CHAR_AFTER_QUERY:
4423 while (*ptr != ')' && *ptr != ':')
4427 case '-': optset = &unset; break;
4429 case 'J': /* Record that it changed in the external options */
4430 *optset |= PCRE_DUPNAMES;
4431 cd->external_options |= PCRE_JCHANGED;
4434 case 'i': *optset |= PCRE_CASELESS; break;
4435 case 'm': *optset |= PCRE_MULTILINE; break;
4436 case 's': *optset |= PCRE_DOTALL; break;
4437 case 'x': *optset |= PCRE_EXTENDED; break;
4438 case 'U': *optset |= PCRE_UNGREEDY; break;
4439 case 'X': *optset |= PCRE_EXTRA; break;
4441 default: *errorcodeptr = ERR12;
4442 ptr--; /* Correct the offset */
4447 /* Set up the changed option bits, but don't change anything yet. */
4449 newoptions = (options | set) & (~unset);
4451 /* If the options ended with ')' this is not the start of a nested
4452 group with option changes, so the options change at this level. If this
4453 item is right at the start of the pattern, the options can be
4454 abstracted and made external in the pre-compile phase, and ignored in
4455 the compile phase. This can be helpful when matching -- for instance in
4456 caseless checking of required bytes.
4458 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4459 definitely *not* at the start of the pattern because something has been
4460 compiled. In the pre-compile phase, however, the code pointer can have
4461 that value after the start, because it gets reset as code is discarded
4462 during the pre-compile. However, this can happen only at top level - if
4463 we are within parentheses, the starting BRA will still be present. At
4464 any parenthesis level, the length value can be used to test if anything
4465 has been compiled at that level. Thus, a test for both these conditions
4466 is necessary to ensure we correctly detect the start of the pattern in
4469 If we are not at the pattern start, compile code to change the ims
4470 options if this setting actually changes any of them. We also pass the
4471 new setting back so that it can be put at the start of any following
4472 branches, and when this group ends (if we are in a group), a resetting
4473 item can be compiled. */
4477 if (code == cd->start_code + 1 + LINK_SIZE &&
4478 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4480 cd->external_options = newoptions;
4481 options = newoptions;
4485 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4488 *code++ = newoptions & PCRE_IMS;
4491 /* Change options at this level, and pass them back for use
4492 in subsequent branches. Reset the greedy defaults and the case
4493 value for firstbyte and reqbyte. */
4495 *optionsptr = options = newoptions;
4496 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4497 greedy_non_default = greedy_default ^ 1;
4498 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4501 previous = NULL; /* This item can't be repeated */
4502 continue; /* It is complete */
4505 /* If the options ended with ':' we are heading into a nested group
4506 with possible change of options. Such groups are non-capturing and are
4507 not assertions of any kind. All we need to do is skip over the ':';
4508 the newoptions value is handled below. */
4512 } /* End of switch for character following (? */
4513 } /* End of (? handling */
4515 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4516 all unadorned brackets become non-capturing and behave like (?:...)
4519 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4524 /* Else we have a capturing group. */
4530 PUT2(code, 1+LINK_SIZE, cd->bracount);
4534 /* Process nested bracketed regex. Assertions may not be repeated, but
4535 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4536 non-register variable in order to be able to pass its address because some
4537 compilers complain otherwise. Pass in a new setting for the ims options if
4538 they have changed. */
4540 previous = (bravalue >= OP_ONCE)? code : NULL;
4543 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4544 length_prevgroup = 0; /* Initialize for pre-compile phase */
4547 newoptions, /* The complete new option state */
4548 options & PCRE_IMS, /* The previous ims option state */
4549 &tempcode, /* Where to put code (updated) */
4550 &ptr, /* Input pointer (updated) */
4551 errorcodeptr, /* Where to put an error message */
4552 (bravalue == OP_ASSERTBACK ||
4553 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4554 reset_bracount, /* True if (?| group */
4555 skipbytes, /* Skip over bracket number */
4556 &subfirstbyte, /* For possible first char */
4557 &subreqbyte, /* For possible last char */
4558 bcptr, /* Current branch chain */
4559 cd, /* Tables block */
4560 (lengthptr == NULL)? NULL : /* Actual compile phase */
4561 &length_prevgroup /* Pre-compile phase */
4565 /* At the end of compiling, code is still pointing to the start of the
4566 group, while tempcode has been updated to point past the end of the group
4567 and any option resetting that may follow it. The pattern pointer (ptr)
4568 is on the bracket. */
4570 /* If this is a conditional bracket, check that there are no more than
4571 two branches in the group, or just one if it's a DEFINE group. We do this
4572 in the real compile phase, not in the pre-pass, where the whole group may
4573 not be available. */
4575 if (bravalue == OP_COND && lengthptr == NULL)
4584 while (*tc != OP_KET);
4586 /* A DEFINE group is never obeyed inline (the "condition" is always
4587 false). It must have only one branch. */
4589 if (code[LINK_SIZE+1] == OP_DEF)
4593 *errorcodeptr = ERR54;
4596 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4599 /* A "normal" conditional group. If there is just one branch, we must not
4600 make use of its firstbyte or reqbyte, because this is equivalent to an
4601 empty second branch. */
4607 *errorcodeptr = ERR27;
4610 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4614 /* Error if hit end of pattern */
4618 *errorcodeptr = ERR14;
4622 /* In the pre-compile phase, update the length by the length of the nested
4623 group, less the brackets at either end. Then reduce the compiled code to
4624 just the brackets so that it doesn't use much memory if it is duplicated by
4627 if (lengthptr != NULL)
4629 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4631 PUTINC(code, 0, 1 + LINK_SIZE);
4633 PUTINC(code, 0, 1 + LINK_SIZE);
4636 /* Otherwise update the main code pointer to the end of the group. */
4638 else code = tempcode;
4640 /* For a DEFINE group, required and first character settings are not
4643 if (bravalue == OP_DEF) break;
4645 /* Handle updating of the required and first characters for other types of
4646 group. Update for normal brackets of all kinds, and conditions with two
4647 branches (see code above). If the bracket is followed by a quantifier with
4648 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4649 zerofirstbyte outside the main loop so that they can be accessed for the
4652 zeroreqbyte = reqbyte;
4653 zerofirstbyte = firstbyte;
4654 groupsetfirstbyte = FALSE;
4656 if (bravalue >= OP_ONCE)
4658 /* If we have not yet set a firstbyte in this branch, take it from the
4659 subpattern, remembering that it was set here so that a repeat of more
4660 than one can replicate it as reqbyte if necessary. If the subpattern has
4661 no firstbyte, set "none" for the whole branch. In both cases, a zero
4662 repeat forces firstbyte to "none". */
4664 if (firstbyte == REQ_UNSET)
4666 if (subfirstbyte >= 0)
4668 firstbyte = subfirstbyte;
4669 groupsetfirstbyte = TRUE;
4671 else firstbyte = REQ_NONE;
4672 zerofirstbyte = REQ_NONE;
4675 /* If firstbyte was previously set, convert the subpattern's firstbyte
4676 into reqbyte if there wasn't one, using the vary flag that was in
4677 existence beforehand. */
4679 else if (subfirstbyte >= 0 && subreqbyte < 0)
4680 subreqbyte = subfirstbyte | tempreqvary;
4682 /* If the subpattern set a required byte (or set a first byte that isn't
4683 really the first byte - see above), set it. */
4685 if (subreqbyte >= 0) reqbyte = subreqbyte;
4688 /* For a forward assertion, we take the reqbyte, if set. This can be
4689 helpful if the pattern that follows the assertion doesn't set a different
4690 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4691 for an assertion, however because it leads to incorrect effect for patterns
4692 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4693 of a firstbyte. This is overcome by a scan at the end if there's no
4694 firstbyte, looking for an asserted first char. */
4696 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4697 break; /* End of processing '(' */
4700 /* ===================================================================*/
4701 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4702 are arranged to be the negation of the corresponding OP_values. For the
4703 back references, the values are ESC_REF plus the reference number. Only
4704 back references and those types that consume a character may be repeated.
4705 We can test for values between ESC_b and ESC_Z for the latter; this may
4706 have to change if any new ones are ever created. */
4710 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4711 if (*errorcodeptr != 0) goto FAILED;
4715 if (-c == ESC_Q) /* Handle start of quoted string */
4717 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4722 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4724 /* For metasequences that actually match a character, we disable the
4725 setting of a first character if it hasn't already been set. */
4727 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4728 firstbyte = REQ_NONE;
4730 /* Set values to reset to if this is followed by a zero repeat. */
4732 zerofirstbyte = firstbyte;
4733 zeroreqbyte = reqbyte;
4735 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
4736 We also support \k{name} (.NET syntax) */
4738 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
4741 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
4742 goto NAMED_REF_OR_RECURSE;
4745 /* Back references are handled specially; must disable firstbyte if
4746 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4751 recno = -c - ESC_REF;
4753 HANDLE_REFERENCE: /* Come here from named backref handling */
4754 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4757 PUT2INC(code, 0, recno);
4758 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4759 if (recno > cd->top_backref) cd->top_backref = recno;
4762 /* So are Unicode property matches, if supported. */
4765 else if (-c == ESC_P || -c == ESC_p)
4769 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4770 if (ptype < 0) goto FAILED;
4772 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4778 /* If Unicode properties are not supported, \X, \P, and \p are not
4781 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4783 *errorcodeptr = ERR45;
4788 /* For the rest (including \X when Unicode properties are supported), we
4789 can obtain the OP value by negating the escape value. */
4793 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4799 /* We have a data character whose value is in c. In UTF-8 mode it may have
4800 a value > 127. We set its representation in the length/buffer, and then
4801 handle it as a data character. */
4804 if (utf8 && c > 127)
4805 mclength = _pcre_ord2utf8(c, mcbuffer);
4816 /* ===================================================================*/
4817 /* Handle a literal character. It is guaranteed not to be whitespace or #
4818 when the extended flag is set. If we are in UTF-8 mode, it may be a
4819 multi-byte literal character. */
4827 if (utf8 && c >= 0xc0)
4829 while ((ptr[1] & 0xc0) == 0x80)
4830 mcbuffer[mclength++] = *(++ptr);
4834 /* At this point we have the character's bytes in mcbuffer, and the length
4835 in mclength. When not in UTF-8 mode, the length is always 1. */
4839 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4840 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4842 /* Set the first and required bytes appropriately. If no previous first
4843 byte, set it from this character, but revert to none on a zero repeat.
4844 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4847 if (firstbyte == REQ_UNSET)
4849 zerofirstbyte = REQ_NONE;
4850 zeroreqbyte = reqbyte;
4852 /* If the character is more than one byte long, we can set firstbyte
4853 only if it is not to be matched caselessly. */
4855 if (mclength == 1 || req_caseopt == 0)
4857 firstbyte = mcbuffer[0] | req_caseopt;
4858 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4860 else firstbyte = reqbyte = REQ_NONE;
4863 /* firstbyte was previously set; we can set reqbyte only if the length is
4864 1 or the matching is caseful. */
4868 zerofirstbyte = firstbyte;
4869 zeroreqbyte = reqbyte;
4870 if (mclength == 1 || req_caseopt == 0)
4871 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4874 break; /* End of literal character handling */
4876 } /* end of big loop */
4879 /* Control never reaches here by falling through, only by a goto for all the
4880 error states. Pass back the position in the pattern so that it can be displayed
4881 to the user for diagnosing the error. */
4891 /*************************************************
4892 * Compile sequence of alternatives *
4893 *************************************************/
4895 /* On entry, ptr is pointing past the bracket character, but on return it
4896 points to the closing bracket, or vertical bar, or end of string. The code
4897 variable is pointing at the byte into which the BRA operator has been stored.
4898 If the ims options are changed at the start (for a (?ims: group) or during any
4899 branch, we need to insert an OP_OPT item at the start of every following branch
4900 to ensure they get set correctly at run time, and also pass the new options
4901 into every subsequent branch compile.
4903 This function is used during the pre-compile phase when we are trying to find
4904 out the amount of memory needed, as well as during the real compile phase. The
4905 value of lengthptr distinguishes the two phases.
4908 options option bits, including any changes for this subpattern
4909 oldims previous settings of ims option bits
4910 codeptr -> the address of the current code pointer
4911 ptrptr -> the address of the current pattern pointer
4912 errorcodeptr -> pointer to error code variable
4913 lookbehind TRUE if this is a lookbehind assertion
4914 reset_bracount TRUE to reset the count for each branch
4915 skipbytes skip this many bytes at start (for brackets and OP_COND)
4916 firstbyteptr place to put the first required character, or a negative number
4917 reqbyteptr place to put the last required character, or a negative number
4918 bcptr pointer to the chain of currently open branches
4919 cd points to the data block with tables pointers etc.
4920 lengthptr NULL during the real compile phase
4921 points to length accumulator during pre-compile phase
4923 Returns: TRUE on success
4927 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4928 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
4929 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
4932 const uschar *ptr = *ptrptr;
4933 uschar *code = *codeptr;
4934 uschar *last_branch = code;
4935 uschar *start_bracket = code;
4936 uschar *reverse_count = NULL;
4937 int firstbyte, reqbyte;
4938 int branchfirstbyte, branchreqbyte;
4947 firstbyte = reqbyte = REQ_UNSET;
4949 /* Accumulate the length for use in the pre-compile phase. Start with the
4950 length of the BRA and KET and any extra bytes that are required at the
4951 beginning. We accumulate in a local variable to save frequent testing of
4952 lengthptr for NULL. We cannot do this by looking at the value of code at the
4953 start and end of each alternative, because compiled items are discarded during
4954 the pre-compile phase so that the work space is not exceeded. */
4956 length = 2 + 2*LINK_SIZE + skipbytes;
4958 /* WARNING: If the above line is changed for any reason, you must also change
4959 the code that abstracts option settings at the start of the pattern and makes
4960 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4961 pre-compile phase to find out whether anything has yet been compiled or not. */
4963 /* Offset is set zero to mark that this bracket is still open */
4966 code += 1 + LINK_SIZE + skipbytes;
4968 /* Loop for each alternative branch */
/* Remember the bracket count on entry so that a (?| group can restore it for
each branch, and start tracking the highest count reached by any branch. */
4970 orig_bracount = max_bracount = cd->bracount;
4973 /* For a (?| group, reset the capturing bracket count so that each branch
4974 uses the same numbers. */
4976 if (reset_bracount) cd->bracount = orig_bracount;
4978 /* Handle a change of ims options at the start of the branch */
4980 if ((options & PCRE_IMS) != oldims)
4983 *code++ = options & PCRE_IMS;
4987 /* Set up dummy OP_REVERSE if lookbehind assertion */
4991 *code++ = OP_REVERSE;
4992 reverse_count = code;
/* Count the OP_REVERSE opcode byte plus one LINK_SIZE field in the pre-compile
length; presumably that field is the one filled in later through
reverse_count - confirm against the PUT macro. */
4994 length += 1 + LINK_SIZE;
4997 /* Now compile the branch; in the pre-compile phase its length gets added
5000 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5001 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5007 /* Keep the highest bracket count in case (?| was used and some branch
5008 has fewer than the rest. */
5010 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5012 /* In the real compile phase, there is some post-processing to be done. */
5014 if (lengthptr == NULL)
5016 /* If this is the first branch, the firstbyte and reqbyte values for the
5017 branch become the values for the regex. */
5019 if (*last_branch != OP_ALT)
5021 firstbyte = branchfirstbyte;
5022 reqbyte = branchreqbyte;
5025 /* If this is not the first branch, the first char and reqbyte have to
5026 match the values from all the previous branches, except that if the
5027 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5028 and we set REQ_VARY for the regex. */
5032 /* If we previously had a firstbyte, but it doesn't match the new branch,
5033 we have to abandon the firstbyte for the regex, but if there was
5034 previously no reqbyte, it takes on the value of the old firstbyte. */
5036 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5038 if (reqbyte < 0) reqbyte = firstbyte;
5039 firstbyte = REQ_NONE;
5042 /* If we (now or from before) have no firstbyte, a firstbyte from the
5043 branch becomes a reqbyte if there isn't a branch reqbyte. */
5045 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5046 branchreqbyte = branchfirstbyte;
5048 /* Now ensure that the reqbytes match */
5050 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5052 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5055 /* If lookbehind, check that this branch matches a fixed-length string, and
5056 put the length into the OP_REVERSE item. Temporarily mark the end of the
5057 branch with OP_END. */
5063 fixed_length = find_fixedlength(last_branch, options);
5064 DPRINTF(("fixed length = %d\n", fixed_length));
5065 if (fixed_length < 0)
5067 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5071 PUT(reverse_count, 0, fixed_length);
5075 /* Reached end of expression, either ')' or end of pattern. In the real
5076 compile phase, go back through the alternative branches and reverse the chain
5077 of offsets, with the field in the BRA item now becoming an offset to the
5078 first alternative. If there are no alternatives, it points to the end of the
5079 group. The length in the terminating ket is always the length of the whole
5080 bracketed item. If any of the ims options were changed inside the group,
5081 compile a resetting op-code following, except at the very end of the pattern.
5082 Return leaving the pointer at the terminating char. */
5086 if (lengthptr == NULL)
5088 int branch_length = code - last_branch;
/* Each branch's link field currently holds the distance back to the previous
branch. Replace it with the forward length and step back one branch at a time;
the walk stops once the value read from the link field is zero. */
5091 int prev_length = GET(last_branch, 1);
5092 PUT(last_branch, 1, branch_length);
5093 branch_length = prev_length;
5094 last_branch -= branch_length;
5096 while (branch_length > 0);
5099 /* Fill in the ket */
5102 PUT(code, 1, code - start_bracket);
5103 code += 1 + LINK_SIZE;
5105 /* Resetting option if needed */
5107 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5114 /* Retain the highest bracket number, in case resetting was used. */
5116 cd->bracount = max_bracount;
5118 /* Set values to pass back */
5122 *firstbyteptr = firstbyte;
5123 *reqbyteptr = reqbyte;
/* Pre-compile phase only: add this group's accumulated length to the caller's
running total. */
5124 if (lengthptr != NULL) *lengthptr += length;
5128 /* Another branch follows. In the pre-compile phase, we can move the code
5129 pointer back to where it was for the start of the first branch. (That is,
5130 pretend that each branch is the only one.)
5132 In the real compile phase, insert an ALT node. Its length field points back
5133 to the previous branch while the bracket remains open. At the end the chain
5134 is reversed. It's done like this so that the start of the bracket has a
5135 zero offset until it is closed, making it possible to detect recursion. */
5137 if (lengthptr != NULL)
5139 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5140 length += 1 + LINK_SIZE;
5145 PUT(code, 1, code - last_branch);
/* The new ALT becomes the most recent branch; it is also recorded in
bc.current. */
5146 bc.current = last_branch = code;
5147 code += 1 + LINK_SIZE;
5152 /* Control never reaches here */
5158 /*************************************************
5159 * Check for anchored expression *
5160 *************************************************/
5162 /* Try to find out if this is an anchored regular expression. Consider each
5163 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5164 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5165 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5166 counts, since OP_CIRC can match in the middle.
5168 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5169 This is the code for \G, which means "match at start of match position, taking
5170 into account the match offset".
5172 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5173 because that will try the rest of the pattern at all possible matching points,
5174 so there is no point trying again.... er ....
5176 .... except when the .* appears inside capturing parentheses, and there is a
5177 subsequent back reference to those parentheses. We haven't enough information
5178 to catch that case precisely.
5180 At first, the best we could do was to detect when .* was in capturing brackets
5181 and the highest back reference was greater than or equal to that level.
5182 However, by keeping a bitmap of the first 31 back references, we can catch some
5183 of the more common cases more precisely.
5186 code points to start of expression (the bracket)
5187 options points to the options setting
5188 bracket_map a bitmap of which brackets we are inside while testing; this
5189 handles up to substring 31; after that we just have to take
5190 the less precise approach
5191 backref_map the back reference bitmap
5193 Returns: TRUE or FALSE
5197 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5198 unsigned int backref_map)
5201 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5202 options, PCRE_MULTILINE, FALSE);
5203 register int op = *scode;
5205 /* Non-capturing brackets */
5209 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5212 /* Capturing brackets */
5214 else if (op == OP_CBRA)
5216 int n = GET2(scode, 1+LINK_SIZE);
5217 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5218 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5221 /* Other brackets */
5223 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5225 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5228 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5229 are or may be referenced. */
5231 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5232 op == OP_TYPEPOSSTAR) &&
5233 (*options & PCRE_DOTALL) != 0)
5235 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5238 /* Check for explicit anchoring */
5240 else if (op != OP_SOD && op != OP_SOM &&
5241 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5243 code += GET(code, 1);
5245 while (*code == OP_ALT); /* Loop for each alternative */
5251 /*************************************************
5252 * Check for starting with ^ or .* *
5253 *************************************************/
5255 /* This is called to find out if every branch starts with ^ or .* so that
5256 "first char" processing can be done to speed things up in multiline
5257 matching and for non-DOTALL patterns that start with .* (which must start at
5258 the beginning or after \n). As in the case of is_anchored() (see above), we
5259 have to take account of back references to capturing brackets that contain .*
5260 because in that case we can't make the assumption.
5263 code points to start of expression (the bracket)
5264 bracket_map a bitmap of which brackets we are inside while testing; this
5265 handles up to substring 31; after that we just have to take
5266 the less precise approach
5267 backref_map the back reference bitmap
5269 Returns: TRUE or FALSE
5273 is_startline(const uschar *code, unsigned int bracket_map,
5274 unsigned int backref_map)
5277 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5279 register int op = *scode;
5281 /* Non-capturing brackets */
5285 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5288 /* Capturing brackets */
5290 else if (op == OP_CBRA)
5292 int n = GET2(scode, 1+LINK_SIZE);
5293 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5294 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5297 /* Other brackets */
5299 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5300 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5302 /* .* means "start at start or after \n" if it isn't in brackets that
5303 may be referenced. */
5305 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5307 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5310 /* Check for explicit circumflex */
5312 else if (op != OP_CIRC) return FALSE;
5314 /* Move on to the next alternative */
5316 code += GET(code, 1);
5318 while (*code == OP_ALT); /* Loop for each alternative */
5324 /*************************************************
5325 * Check for asserted fixed first char *
5326 *************************************************/
/* During compilation, the "first char" settings from forward assertions are
discarded, because they can cause conflicts with actual literals that follow.
However, if we end up without a first char setting for an unanchored pattern,
it is worth scanning the regex to see if there is an initial asserted first
char. If all branches start with the same asserted char, or with a bracket all
of whose alternatives start with the same asserted char (recurse ad lib), then
we return that char, otherwise -1.

Arguments:
  code       points to start of expression (the bracket)
  options    pointer to the options (used to check casing changes)
  inassert   TRUE if in an assertion

Returns:     -1 or the fixed first char
*/
/* Walk every alternative of the group at code, looking for one first
character on which they all agree. c holds the agreed value so far and is
negative while nothing has been recorded.
NOTE(review): the switch statement and most of the case labels that choose
between the branches below are not visible in the reviewed text, so these
comments describe only the statements that are. */
5345 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5347 register int c = -1;
/* Start looking just past this alternative's opcode and link field. */
5350 const uschar *scode =
5351 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5352 register int op = *scode;
/* Recurse into the nested group. Its inassert argument is TRUE only when that
group is OP_ASSERT. d receives the nested result. */
5364 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* Adopt the first result seen. A later result that differs means there is no
common first character. */
5366 if (c < 0) c = d; else if (c != d) return -1;
5369 case OP_EXACT: /* Fall through */
/* A character that is not inside an assertion does not count. */
5377 if (!inassert) return -1;
/* Flag the recorded character as caseless when that option is in force. */
5381 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* Something was already recorded, so this character has to equal it. */
5383 else if (c != scode[1]) return -1;
/* Step to the next alternative using the value stored at offset 1. */
5387 code += GET(code, 1);
5389 while (*code == OP_ALT);
5395 /*************************************************
5396 * Compile a Regular Expression *
5397 *************************************************/
5399 /* This function takes a string and returns a pointer to a block of store
5400 holding a compiled version of the expression. The original API for this
5401 function had no error code return variable; it is retained for backwards
5402 compatibility. The new function is given a new name.
5405 pattern the regular expression
5406 options various option bits
5407 errorcodeptr pointer to error code variable (pcre_compile2() only)
5408 can be NULL if you don't want a code value
5409 errorptr pointer to pointer to error text
5410 erroroffset ptr offset in pattern where error was detected
5411 tables pointer to character tables or NULL
5413 Returns: pointer to compiled data block, or NULL on error,
5414 with errorptr and erroroffset set
5417 PCRE_EXP_DEFN pcre *
5418 pcre_compile(const char *pattern, int options, const char **errorptr,
5419 int *erroroffset, const unsigned char *tables)
5421 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
/* Compile a regular expression, optionally reporting a numeric error code as
well as the error text and offset.

compile_regex() is run twice. The first run works in the on-stack workspace
and only measures how much store is needed. The block is then obtained with
pcre_malloc and the second run writes the real code after the name table.
Forward references are patched afterwards, and the anchoring, first byte and
required byte hints are stored in the block's options and fields.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorcodeptr  pointer to error code variable; can be NULL
  errorptr      pointer to pointer to error text; if this is NULL the only
                  thing reported is the value 99 through errorcodeptr
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables; NULL selects the default tables

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/
5425 PCRE_EXP_DEFN pcre *
5426 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5427 const char **errorptr, int *erroroffset, const unsigned char *tables)
5430 int length = 1; /* For final END opcode */
5431 int firstbyte, reqbyte, newline;
5438 const uschar *codestart;
5440 compile_data compile_block;
5441 compile_data *cd = &compile_block;
5443 /* This space is used for "compiling" into during the first phase, when we are
5444 computing the amount of memory that is needed. Compiled items are thrown away
5445 as soon as possible, so that a fairly large buffer should be sufficient for
5446 this purpose. The same space is used in the second phase for remembering where
5447 to fill in forward references to subpatterns. */
5449 uschar cworkspace[COMPILE_WORK_SIZE];
5452 /* Set this early so that early errors get offset 0. */
5454 ptr = (const uschar *)pattern;
5456 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5457 can do is just return NULL, but we can set a code value if there is a code
5460 if (errorptr == NULL)
5462 if (errorcodeptr != NULL) *errorcodeptr = 99;
5467 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5469 /* However, we can give a message for this error */
5471 if (erroroffset == NULL)
5474 goto PCRE_EARLY_ERROR_RETURN2;
5479 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5482 utf8 = (options & PCRE_UTF8) != 0;
5483 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5484 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5487 goto PCRE_EARLY_ERROR_RETURN2;
/* NOTE(review): this second PCRE_UTF8 test presumably belongs to a build
without UTF-8 support. The preprocessor lines that would separate it from the
test above are not visible in the reviewed text - confirm. */
5490 if ((options & PCRE_UTF8) != 0)
5493 goto PCRE_EARLY_ERROR_RETURN;
/* Reject any option bit that is outside the public set. */
5497 if ((options & ~PUBLIC_OPTIONS) != 0)
5500 goto PCRE_EARLY_ERROR_RETURN;
5503 /* Set up pointers to the individual character tables */
5505 if (tables == NULL) tables = _pcre_default_tables;
5506 cd->lcc = tables + lcc_offset;
5507 cd->fcc = tables + fcc_offset;
5508 cd->cbits = tables + cbits_offset;
5509 cd->ctypes = tables + ctypes_offset;
5511 /* Handle different types of newline. The three bits give seven cases. The
5512 current code allows for fixed one- or two-byte sequences, plus "any" and
5515 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5517 case 0: newline = NEWLINE; break; /* Compile-time default */
5518 case PCRE_NEWLINE_CR: newline = '\r'; break;
5519 case PCRE_NEWLINE_LF: newline = '\n'; break;
5520 case PCRE_NEWLINE_CR+
5521 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5522 case PCRE_NEWLINE_ANY: newline = -1; break;
5523 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5524 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5529 cd->nltype = NLTYPE_ANYCRLF;
5531 else if (newline < 0)
5533 cd->nltype = NLTYPE_ANY;
5537 cd->nltype = NLTYPE_FIXED;
/* A two-byte newline is held with its first byte in the high eight bits, so
split it into nl[0] and nl[1] in that order. */
5541 cd->nl[0] = (newline >> 8) & 255;
5542 cd->nl[1] = newline & 255;
5547 cd->nl[0] = newline;
5551 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5552 references to help in deciding whether (.*) can be treated as anchored or not.
5555 cd->top_backref = 0;
5556 cd->backref_map = 0;
5558 /* Reflect pattern for debugging output */
5560 DPRINTF(("------------------------------------------------------------------\n"));
5561 DPRINTF(("%s\n", pattern));
5563 /* Pretend to compile the pattern while actually just accumulating the length
5564 of memory required. This behaviour is triggered by passing a non-NULL final
5565 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5566 to compile parts of the pattern into; the compiled code is discarded when it is
5567 no longer needed, so hopefully this workspace will never overflow, though there
5568 is a test for its doing so. */
5571 cd->names_found = 0;
5572 cd->name_entry_size = 0;
5573 cd->name_table = NULL;
5574 cd->start_workspace = cworkspace;
5575 cd->start_code = cworkspace;
5576 cd->hwm = cworkspace;
5577 cd->start_pattern = (const uschar *)pattern;
/* NOTE(review): pattern is handed to strlen() with no NULL check visible in
this function - confirm that callers never pass a NULL pattern. */
5578 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5579 cd->req_varyopt = 0;
5580 cd->nopartial = FALSE;
5581 cd->external_options = options;
5583 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5584 don't need to look at the result of the function here. The initial options have
5585 been put into the cd block so that they can be changed if an option setting is
5586 found within the regex right at the beginning. Bringing initial option settings
5587 outside can help speed up starting point checks. */
5591 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5592 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5594 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5596 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5597 cd->hwm - cworkspace));
/* Refuse a pattern whose measured length is more than MAX_PATTERN_SIZE. */
5599 if (length > MAX_PATTERN_SIZE)
5602 goto PCRE_EARLY_ERROR_RETURN;
5605 /* Compute the size of data block needed and get it, either from malloc or
5606 externally provided function. Integer overflow should no longer be possible
5607 because nowadays we limit the maximum value of cd->names_found and
5608 cd->name_entry_size. */
5610 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5611 re = (real_pcre *)(pcre_malloc)(size);
5616 goto PCRE_EARLY_ERROR_RETURN;
5619 /* Put in the magic number, and save the sizes, initial options, and character
5620 table pointer. NULL is used for the default character tables. The nullpad field
5621 is at the end; it's there to help in the case when a regex compiled on a system
5622 with 4-byte pointers is run on another with 8-byte pointers. */
5624 re->magic_number = MAGIC_NUMBER;
5626 re->options = cd->external_options;
5630 re->name_table_offset = sizeof(real_pcre);
5631 re->name_entry_size = cd->name_entry_size;
5632 re->name_count = cd->names_found;
5634 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5637 /* The starting points of the name/number translation table and of the code are
5638 passed around in the compile data block. The start/end pattern and initial
5639 options are already set from the pre-compile phase, as is the name_entry_size
5640 field. Reset the bracket count and the names_found field. Also reset the hwm
5641 field; this time it's used for remembering forward references to subpatterns.
5645 cd->names_found = 0;
5646 cd->name_table = (uschar *)re + re->name_table_offset;
5647 codestart = cd->name_table + re->name_entry_size * re->name_count;
5648 cd->start_code = codestart;
5649 cd->hwm = cworkspace;
5650 cd->req_varyopt = 0;
5651 cd->nopartial = FALSE;
5653 /* Set up a starting, non-extracting bracket, then compile the expression. On
5654 error, errorcode will be set non-zero, so we don't need to look at the result
5655 of the function here. */
5657 ptr = (const uschar *)pattern;
5658 code = (uschar *)codestart;
5660 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5661 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5662 re->top_bracket = cd->bracount;
5663 re->top_backref = cd->top_backref;
5665 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5667 /* If not reached end of pattern on success, there's an excess bracket. */
5669 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5671 /* Fill in the terminating state and check for disastrous overflow, but
5672 if debugging, leave the test till after things are printed out. */
5677 if (code - codestart > length) errorcode = ERR23;
5680 /* Fill in any forward references that are required. */
5682 while (errorcode == 0 && cd->hwm > cworkspace)
5685 const uschar *groupptr;
/* Take the most recently saved offset off the workspace. The code at that
offset currently holds a group number. Look the group up and overwrite the
number with the group's distance from codestart, or fail with ERR53 when no
such group is found. */
5686 cd->hwm -= LINK_SIZE;
5687 offset = GET(cd->hwm, 0);
5688 recno = GET(codestart, offset);
5689 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5690 if (groupptr == NULL) errorcode = ERR53;
5691 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5694 /* Give an error if there's back reference to a non-existent capturing
5697 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5699 /* Failed to compile, or error while post-processing */
/* The first label records how far ptr has moved through the pattern. The
second label is the entry for the early checks above that jump here with
*erroroffset already written, or with no erroroffset to write to. */
5704 PCRE_EARLY_ERROR_RETURN:
5705 *erroroffset = ptr - (const uschar *)pattern;
5706 PCRE_EARLY_ERROR_RETURN2:
5707 *errorptr = error_texts[errorcode];
5708 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5712 /* If the anchored option was not passed, set the flag if we can determine that
5713 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5714 as starting with .* when DOTALL is set).
5716 Otherwise, if we know what the first byte has to be, save it, because that
5717 speeds up unanchored matches no end. If not, see if we can set the
5718 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5719 start with ^. and also when all branches start with .* for non-DOTALL matches.
5722 if ((re->options & PCRE_ANCHORED) == 0)
5724 int temp_options = re->options; /* May get changed during these scans */
5725 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5726 re->options |= PCRE_ANCHORED;
5730 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5731 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5733 int ch = firstbyte & 255;
5734 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5735 cd->fcc[ch] == ch)? ch : firstbyte;
5736 re->options |= PCRE_FIRSTSET;
5738 else if (is_startline(codestart, 0, cd->backref_map))
5739 re->options |= PCRE_STARTLINE;
5743 /* For an anchored pattern, we use the "required byte" only if it follows a
5744 variable length item in the regex. Remove the caseless flag for non-caseable
5748 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5750 int ch = reqbyte & 255;
5751 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5752 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5753 re->options |= PCRE_REQCHSET;
5756 /* Print out the compiled data if debugging is enabled. This is never the
5757 case when building a production library. */
5761 printf("Length = %d top_bracket = %d top_backref = %d\n",
5762 length, re->top_bracket, re->top_backref);
5764 if (re->options != 0)
5766 printf("%s%s%s%s%s%s%s%s%s\n",
5767 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5768 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5769 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5770 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5771 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5772 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5773 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5774 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5775 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5778 if ((re->options & PCRE_FIRSTSET) != 0)
5780 int ch = re->first_byte & 255;
5781 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5783 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5784 else printf("First char = \\x%02x%s\n", ch, caseless);
5787 if ((re->options & PCRE_REQCHSET) != 0)
5789 int ch = re->req_byte & 255;
5790 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5792 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5793 else printf("Req char = \\x%02x%s\n", ch, caseless);
5796 pcre_printint(re, stdout, TRUE);
5798 /* This check is done here in the debugging case so that the code that
5799 was compiled can be seen. */
5801 if (code - codestart > length)
5804 *errorptr = error_texts[ERR23];
5805 *erroroffset = ptr - (uschar *)pattern;
5806 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5814 /* End of pcre_compile.c */