1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
47 #define NLBLOCK md /* The block containing newline information */
48 #include "pcre_internal.h"
51 /* Structure for building a chain of data that actually lives on the
52 stack, for holding the values of the subject pointer at the start of each
53 subpattern, so as to detect when an empty string has been matched by a
54 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
55 are on the heap, not on the stack. */
57 typedef struct eptrblock {
58 struct eptrblock *epb_prev;
62 /* Flag bits for the match() function */
64 #define match_condassert 0x01 /* Called to check a condition assertion */
65 #define match_isgroup 0x02 /* Set if start of bracketed group */
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
71 #define MATCH_NOMATCH 0
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
77 #define REC_STACK_SAVE_MAX 30
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
129 Returns: TRUE if matched
133 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 unsigned long int ims)
136 USPTR p = md->start_subject + md->offset_vector[offset];
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
151 /* Always fail if not enough characters left */
153 if (length > md->end_subject - eptr) return FALSE;
155 /* Separate the caseless case for speed */
157 if ((ims & PCRE_CASELESS) != 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
174 The match() function is highly recursive, though not every recursive call
175 increases the recursive depth. Nevertheless, some regular expressions can cause
176 it to recurse to a great depth. I was writing for Unix, so I just let it call
177 itself recursively. This uses the stack for saving everything that has to be
178 saved for a recursive call. On Unix, the stack can be large, and this works
181 It turns out that on some non-Unix-like systems there are problems with
182 programs that use a lot of stack. (This despite the fact that every last chip
183 has oodles of memory these days, and techniques for extending the stack have
184 been known for decades.) So....
186 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187 calls by keeping local variables that need to be preserved in blocks of memory
188 obtained from malloc() instead of on the stack. Macros are used to
189 achieve this so that the actual code doesn't look very different to what it
191 ****************************************************************************
192 ***************************************************************************/
195 /* These versions of the macros use the stack, as normal. There are debugging
196 versions and production versions. */
199 #define REGISTER register
201 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
203 printf("match() called in line %d\n", __LINE__); \
204 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
205 printf("to line %d\n", __LINE__); \
207 #define RRETURN(ra) \
209 printf("match() returned %d from line %d ", ra, __LINE__); \
213 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
214 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
215 #define RRETURN(ra) return ra
221 /* These versions of the macros manage a private stack on the heap. Note
222 that the rd argument of RMATCH isn't actually used. It's the md argument of
223 match(), which never changes. */
227 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
229 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
230 if (setjmp(frame->Xwhere) == 0)\
232 newframe->Xeptr = ra;\
233 newframe->Xecode = rb;\
234 newframe->Xoffset_top = rc;\
235 newframe->Xims = re;\
236 newframe->Xeptrb = rf;\
237 newframe->Xflags = rg;\
238 newframe->Xrdepth = frame->Xrdepth + 1;\
239 newframe->Xprevframe = frame;\
241 DPRINTF(("restarting from line %d\n", __LINE__));\
246 DPRINTF(("longjumped back to line %d\n", __LINE__));\
247 frame = md->thisframe;\
248 rx = frame->Xresult;\
254 heapframe *newframe = frame;\
255 frame = newframe->Xprevframe;\
256 (pcre_stack_free)(newframe);\
259 frame->Xresult = ra;\
260 md->thisframe = frame;\
261 longjmp(frame->Xwhere, 1);\
267 /* Structure for remembering the local variables in a private frame */
269 typedef struct heapframe {
270 struct heapframe *Xprevframe;
272 /* Function arguments that may change */
275 const uschar *Xecode;
280 unsigned int Xrdepth;
282 /* Function local variables */
284 const uschar *Xcallpat;
285 const uschar *Xcharptr;
290 const uschar *Xsaved_eptr;
292 recursion_info Xnew_recursive;
299 unsigned long int Xoriginal_ims;
304 int Xprop_fail_result;
308 int *Xprop_test_variable;
320 int Xsave_capture_last;
321 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
322 int Xstacksave[REC_STACK_SAVE_MAX];
326 /* Place to pass back result, and where to jump back to */
336 /***************************************************************************
337 ***************************************************************************/
341 /*************************************************
342 * Match from current position *
343 *************************************************/
345 /* On entry ecode points to the first opcode, and eptr to the first character
346 in the subject string, while eptrb holds the value of eptr at the start of the
347 last bracketed group - used for breaking infinite loops matching zero-length
348 strings. This function is called recursively in many circumstances. Whenever it
349 returns a negative (error) response, the outer incarnation must also return the
352 Performance note: It might be tempting to extract commonly used fields from the
353 md structure (e.g. utf8, end_subject) into individual variables to improve
354 performance. Tests using gcc on a SPARC disproved this; in the first case, it
355 made performance worse.
358 eptr pointer in subject
359 ecode position in code
360 offset_top current top pointer
361 md pointer to "static" info for the match
362 ims current /i, /m, and /s options
363 eptrb pointer to chain of blocks containing eptr at start of
364 brackets - for testing for empty matches
366 match_condassert - this is an assertion condition
367 match_isgroup - this is the start of a bracketed group
368 rdepth the recursion depth
370 Returns: MATCH_MATCH if matched ) these values are >= 0
371 MATCH_NOMATCH if failed to match )
372 a negative PCRE_ERROR_xxx value if aborted by an error condition
373 (e.g. stopped by repeated call or recursion limit)
377 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
378 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
379 int flags, unsigned int rdepth)
381 /* These variables do not need to be preserved over recursion in this function,
382 so they can be ordinary variables in all cases. Mark them with "register"
383 because they are used a lot in loops. */
385 register int rrc; /* Returns from recursive calls */
386 register int i; /* Used for loops not involving calls to RMATCH() */
387 register unsigned int c; /* Character values not kept over RMATCH() calls */
388 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
390 /* When recursion is not being used, all "local" variables that have to be
391 preserved over calls to RMATCH() are part of a "frame" which is obtained from
392 heap storage. Set up the top-level frame here; others are obtained from the
393 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
396 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
397 frame->Xprevframe = NULL; /* Marks the top level */
399 /* Copy in the original argument variables */
402 frame->Xecode = ecode;
403 frame->Xoffset_top = offset_top;
405 frame->Xeptrb = eptrb;
406 frame->Xflags = flags;
407 frame->Xrdepth = rdepth;
409 /* This is where control jumps back to in order to effect "recursion" */
413 /* Macros make the argument variables come from the current frame */
415 #define eptr frame->Xeptr
416 #define ecode frame->Xecode
417 #define offset_top frame->Xoffset_top
418 #define ims frame->Xims
419 #define eptrb frame->Xeptrb
420 #define flags frame->Xflags
421 #define rdepth frame->Xrdepth
423 /* Ditto for the local variables */
426 #define charptr frame->Xcharptr
428 #define callpat frame->Xcallpat
429 #define data frame->Xdata
430 #define next frame->Xnext
431 #define pp frame->Xpp
432 #define prev frame->Xprev
433 #define saved_eptr frame->Xsaved_eptr
435 #define new_recursive frame->Xnew_recursive
437 #define cur_is_word frame->Xcur_is_word
438 #define condition frame->Xcondition
439 #define minimize frame->Xminimize
440 #define prev_is_word frame->Xprev_is_word
442 #define original_ims frame->Xoriginal_ims
445 #define prop_type frame->Xprop_type
446 #define prop_value frame->Xprop_value
447 #define prop_fail_result frame->Xprop_fail_result
448 #define prop_category frame->Xprop_category
449 #define prop_chartype frame->Xprop_chartype
450 #define prop_script frame->Xprop_script
451 #define prop_test_variable frame->Xprop_test_variable
454 #define ctype frame->Xctype
455 #define fc frame->Xfc
456 #define fi frame->Xfi
457 #define length frame->Xlength
458 #define max frame->Xmax
459 #define min frame->Xmin
460 #define number frame->Xnumber
461 #define offset frame->Xoffset
462 #define op frame->Xop
463 #define save_capture_last frame->Xsave_capture_last
464 #define save_offset1 frame->Xsave_offset1
465 #define save_offset2 frame->Xsave_offset2
466 #define save_offset3 frame->Xsave_offset3
467 #define stacksave frame->Xstacksave
469 #define newptrb frame->Xnewptrb
471 /* When recursion is being used, local variables are allocated on the stack and
472 get preserved during recursion in the normal way. In this environment, fi and
473 i, and fc and c, can be the same variables. */
480 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
481 const uschar *charptr; /* in small blocks of the code. My normal */
482 #endif /* style of coding would have declared */
483 const uschar *callpat; /* them within each of those blocks. */
484 const uschar *data; /* However, in order to accommodate the */
485 const uschar *next; /* version of this code that uses an */
486 USPTR pp; /* external "stack" implemented on the */
487 const uschar *prev; /* heap, it is easier to declare them all */
488 USPTR saved_eptr; /* here, so the declarations can be cut */
489 /* out in a block. The only declarations */
490 recursion_info new_recursive; /* within blocks below are for variables */
491 /* that do not have to be preserved over */
492 BOOL cur_is_word; /* a recursive call to RMATCH(). */
497 unsigned long int original_ims;
502 int prop_fail_result;
506 int *prop_test_variable;
516 int save_capture_last;
517 int save_offset1, save_offset2, save_offset3;
518 int stacksave[REC_STACK_SAVE_MAX];
523 /* These statements are here to stop the compiler complaining about uninitialized
528 prop_fail_result = 0;
529 prop_test_variable = NULL;
532 /* This label is used for tail recursion, which is used in a few cases even
533 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
534 used. Thanks to Ian Taylor for noticing this possibility and sending the
539 /* OK, now we can get on with the real code of the function. Recursive calls
540 are specified by the macro RMATCH and RRETURN is used to return. When
541 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
542 and a "return", respectively (possibly with some debugging if DEBUG is
543 defined). However, RMATCH isn't like a function call because it's quite a
544 complicated macro. It has to be used in one particular way. This shouldn't,
545 however, impact performance when true recursion is being used. */
547 /* First check that we haven't called match() too many times, or that we
548 haven't exceeded the recursive call limit. */
550 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
551 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
553 original_ims = ims; /* Save for resetting on ')' */
556 utf8 = md->utf8; /* Local copy of the flag */
561 /* At the start of a bracketed group, add the current subject pointer to the
562 stack of such pointers, to be re-instated at the end of the group when we hit
563 the closing ket. When match() is called in other circumstances, we don't add to
566 if ((flags & match_isgroup) != 0)
568 newptrb.epb_prev = eptrb;
569 newptrb.epb_saved_eptr = eptr;
573 /* Now start processing the operations. */
580 /* For partial matching, remember if we ever hit the end of the subject after
581 matching at least one subject character. */
584 eptr >= md->end_subject &&
585 eptr > md->start_match)
588 /* Opening capturing bracket. If there is space in the offset vector, save
589 the current subject position in the working slot at the top of the vector. We
590 mustn't change the current values of the data slot, because they may be set
591 from a previous iteration of this group, and be referred to by a reference
594 If the bracket fails to match, we need to restore this value and also the
595 values of the final offsets, in case they were set by a previous iteration of
598 If there isn't enough space in the offset vector, treat this as if it were a
599 non-capturing bracket. Don't worry about setting the flag for the error case
600 here; that is handled in the code for KET. */
604 number = op - OP_BRA;
606 /* For extended extraction brackets (large number), we have to fish out the
607 number from a dummy opcode at the start. */
609 if (number > EXTRACT_BASIC_MAX)
610 number = GET2(ecode, 2+LINK_SIZE);
611 offset = number << 1;
614 printf("start bracket %d subject=", number);
615 pchars(eptr, 16, TRUE, md);
619 if (offset < md->offset_max)
621 save_offset1 = md->offset_vector[offset];
622 save_offset2 = md->offset_vector[offset+1];
623 save_offset3 = md->offset_vector[md->offset_end - number];
624 save_capture_last = md->capture_last;
626 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
627 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
631 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
634 md->capture_last = save_capture_last;
635 ecode += GET(ecode, 1);
637 while (*ecode == OP_ALT);
639 DPRINTF(("bracket %d failed\n", number));
641 md->offset_vector[offset] = save_offset1;
642 md->offset_vector[offset+1] = save_offset2;
643 md->offset_vector[md->offset_end - number] = save_offset3;
645 RRETURN(MATCH_NOMATCH);
648 /* Insufficient room for saving captured contents */
653 /* Other types of node can be handled by a switch */
657 case OP_BRA: /* Non-capturing bracket: optimized */
658 DPRINTF(("start bracket 0\n"));
660 /* Loop for all the alternatives */
664 /* When we get to the final alternative within the brackets, we would
665 return the result of a recursive call to match() whatever happened. We
666 can reduce stack usage by turning this into a tail recursion. */
668 if (ecode[GET(ecode, 1)] != OP_ALT)
670 ecode += 1 + LINK_SIZE;
671 flags = match_isgroup;
672 DPRINTF(("bracket 0 tail recursion\n"));
676 /* For non-final alternatives, continue the loop for a NOMATCH result;
679 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ecode += GET(ecode, 1);
684 /* Control never reaches here. */
686 /* Conditional group: compilation checked that there are no more than
687 two branches. If the condition is false, skipping the first branch takes us
688 past the end if there is only one branch, but that's OK because that is
689 exactly what going to the ket would do. As there is only one branch to be
690 obeyed, we can use tail recursion to avoid using another stack frame. */
693 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
695 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
696 condition = (offset == CREF_RECURSE * 2)?
697 (md->recursive != NULL) :
698 (offset < offset_top && md->offset_vector[offset] >= 0);
699 ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
700 flags = match_isgroup;
704 /* The condition is an assertion. Call match() to evaluate it - setting
705 the match_condassert flag causes it to stop at the end of an assertion. */
709 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
710 match_condassert | match_isgroup);
711 if (rrc == MATCH_MATCH)
713 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
714 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
716 else if (rrc != MATCH_NOMATCH)
718 RRETURN(rrc); /* Need braces because of following else */
720 else ecode += GET(ecode, 1);
722 /* We are now at the branch that is to be obeyed. As there is only one,
723 we can use tail recursion to avoid using another stack frame. */
725 ecode += 1 + LINK_SIZE;
726 flags = match_isgroup;
729 /* Control never reaches here */
731 /* Skip over conditional reference or large extraction number data if
739 /* End of the pattern. If we are in a recursion, we should restore the
740 offsets appropriately and continue from after the call. */
743 if (md->recursive != NULL && md->recursive->group_num == 0)
745 recursion_info *rec = md->recursive;
746 DPRINTF(("End of pattern in a (?0) recursion\n"));
747 md->recursive = rec->prevrec;
748 memmove(md->offset_vector, rec->offset_save,
749 rec->saved_max * sizeof(int));
750 md->start_match = rec->save_start;
752 ecode = rec->after_call;
756 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
757 string - backtracking will then try other alternatives, if any. */
759 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
760 md->end_match_ptr = eptr; /* Record where we ended */
761 md->end_offset_top = offset_top; /* and how many extracts were taken */
762 RRETURN(MATCH_MATCH);
764 /* Change option settings */
769 DPRINTF(("ims set to %02lx\n", ims));
772 /* Assertion brackets. Check the alternative branches in turn - the
773 matching won't pass the KET for an assertion. If any one branch matches,
774 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
775 start of each branch to move the current point backwards, so the code at
776 this level is identical to the lookahead case. */
782 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
784 if (rrc == MATCH_MATCH) break;
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 ecode += GET(ecode, 1);
788 while (*ecode == OP_ALT);
789 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
791 /* If checking an assertion for a condition, return MATCH_MATCH. */
793 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
795 /* Continue from after the assertion, updating the offsets high water
796 mark, since extracts may have been taken during the assertion. */
798 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
799 ecode += 1 + LINK_SIZE;
800 offset_top = md->end_offset_top;
803 /* Negative assertion: all branches must fail to match */
806 case OP_ASSERTBACK_NOT:
809 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
811 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
813 ecode += GET(ecode,1);
815 while (*ecode == OP_ALT);
817 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
819 ecode += 1 + LINK_SIZE;
822 /* Move the subject pointer back. This occurs only at the start of
823 each branch of a lookbehind assertion. If we are too close to the start to
824 move back, this match function fails. When working with UTF-8 we move
825 back a number of characters, not bytes. */
832 for (i = 0; i < c; i++)
835 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
842 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
845 eptr -= GET(ecode,1);
846 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
849 /* Skip to next op code */
851 ecode += 1 + LINK_SIZE;
854 /* The callout item calls an external function, if one is provided, passing
855 details of the match so far. This is mainly for debugging, though the
856 function is able to force a failure. */
859 if (pcre_callout != NULL)
861 pcre_callout_block cb;
862 cb.version = 1; /* Version 1 of the callout block */
863 cb.callout_number = ecode[1];
864 cb.offset_vector = md->offset_vector;
865 cb.subject = (PCRE_SPTR)md->start_subject;
866 cb.subject_length = md->end_subject - md->start_subject;
867 cb.start_match = md->start_match - md->start_subject;
868 cb.current_position = eptr - md->start_subject;
869 cb.pattern_position = GET(ecode, 2);
870 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
871 cb.capture_top = offset_top/2;
872 cb.capture_last = md->capture_last;
873 cb.callout_data = md->callout_data;
874 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
875 if (rrc < 0) RRETURN(rrc);
877 ecode += 2 + 2*LINK_SIZE;
880 /* Recursion either matches the current regex, or some subexpression. The
881 offset data is the offset to the starting bracket from the start of the
882 whole pattern. (This is so that it works from duplicated subpatterns.)
884 If there are any capturing brackets started but not finished, we have to
885 save their starting points and reinstate them after the recursion. However,
886 we don't know how many such there are (offset_top records the completed
887 total) so we just have to save all the potential data. There may be up to
888 65535 such values, which is too large to put on the stack, but using malloc
889 for small numbers seems expensive. As a compromise, the stack is used when
890 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
891 is used. If the malloc fails, the saved offsets cannot be trusted, so
892 match() gives up and returns PCRE_ERROR_NOMEMORY instead of carrying on
893 with an incomplete copy of the offset values.
895 There are also other values that have to be saved. We use a chained
896 sequence of blocks that actually live on the stack. Thanks to Robin Houston
897 for the original version of this logic. */
901 callpat = md->start_code + GET(ecode, 1);
902 new_recursive.group_num = *callpat - OP_BRA;
904 /* For extended extraction brackets (large number), we have to fish out
905 the number from a dummy opcode at the start. */
907 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
908 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
910 /* Add to "recursing stack" */
912 new_recursive.prevrec = md->recursive;
913 md->recursive = &new_recursive;
915 /* Find where to continue from afterwards */
917 ecode += 1 + LINK_SIZE;
918 new_recursive.after_call = ecode;
920 /* Now save the offset data. */
922 new_recursive.saved_max = md->offset_end;
923 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
924 new_recursive.offset_save = stacksave;
927 new_recursive.offset_save =
928 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
929 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
932 memcpy(new_recursive.offset_save, md->offset_vector,
933 new_recursive.saved_max * sizeof(int));
934 new_recursive.save_start = md->start_match;
935 md->start_match = eptr;
937 /* OK, now we can do the recursion. For each top-level alternative we
938 restore the offset and recursion data. */
940 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
943 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
944 eptrb, match_isgroup);
945 if (rrc == MATCH_MATCH)
947 DPRINTF(("Recursion matched\n"));
948 md->recursive = new_recursive.prevrec;
949 if (new_recursive.offset_save != stacksave)
950 (pcre_free)(new_recursive.offset_save);
951 RRETURN(MATCH_MATCH);
953 else if (rrc != MATCH_NOMATCH)
955 DPRINTF(("Recursion gave error %d\n", rrc));
959 md->recursive = &new_recursive;
960 memcpy(md->offset_vector, new_recursive.offset_save,
961 new_recursive.saved_max * sizeof(int));
962 callpat += GET(callpat, 1);
964 while (*callpat == OP_ALT);
966 DPRINTF(("Recursion didn't match\n"));
967 md->recursive = new_recursive.prevrec;
968 if (new_recursive.offset_save != stacksave)
969 (pcre_free)(new_recursive.offset_save);
970 RRETURN(MATCH_NOMATCH);
972 /* Control never reaches here */
974 /* "Once" brackets are like assertion brackets except that after a match,
975 the point in the subject string is not moved back. Thus there can never be
976 a move back into the brackets. Friedl calls these "atomic" subpatterns.
977 Check the alternative branches in turn - the matching won't pass the KET
978 for this kind of subpattern. If any one branch matches, we carry on as at
979 the end of a normal bracket, leaving the subject pointer. */
987 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
988 eptrb, match_isgroup);
989 if (rrc == MATCH_MATCH) break;
990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
991 ecode += GET(ecode,1);
993 while (*ecode == OP_ALT);
995 /* If hit the end of the group (which could be repeated), fail */
997 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
999 /* Continue as from after the assertion, updating the offsets high water
1000 mark, since extracts may have been taken. */
1002 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1004 offset_top = md->end_offset_top;
1005 eptr = md->end_match_ptr;
1007 /* For a non-repeating ket, just continue at this level. This also
1008 happens for a repeating ket if no characters were matched in the group.
1009 This is the forcible breaking of infinite loops as implemented in Perl
1010 5.005. If there is an options reset, it will get obeyed in the normal
1011 course of events. */
1013 if (*ecode == OP_KET || eptr == saved_eptr)
1015 ecode += 1+LINK_SIZE;
1019 /* The repeating kets try the rest of the pattern or restart from the
1020 preceding bracket, in the appropriate order. The second "call" of match()
1021 uses tail recursion, to avoid using another stack frame. We need to reset
1022 any options that changed within the bracket before re-running it, so
1023 check the next opcode. */
1025 if (ecode[1+LINK_SIZE] == OP_OPT)
1027 ims = (ims & ~PCRE_IMS) | ecode[4];
1028 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1031 if (*ecode == OP_KETRMIN)
1033 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1036 flags = match_isgroup;
1039 else /* OP_KETRMAX */
1041 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1043 ecode += 1 + LINK_SIZE;
1047 /* Control never gets here */
1049 /* An alternation is the end of a branch; scan along to find the end of the
1050 bracketed group and go to there. */
1053 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1056 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1057 that it may occur zero times. It may repeat infinitely, or not at all -
1058 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1059 repeat limits are compiled as a number of copies, with the optional ones
1060 preceded by BRAZERO or BRAMINZERO. */
1065 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1067 do next += GET(next,1); while (*next == OP_ALT);
1068 ecode = next + 1+LINK_SIZE;
1075 do next += GET(next,1); while (*next == OP_ALT);
1076 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1083 /* End of a group, repeated or non-repeating. If we are at the end of
1084 an assertion "group", stop matching and return MATCH_MATCH, but record the
1085 current high water mark for use by positive assertions. Do this also
1086 for the "once" (not backing up) groups. */
1091 prev = ecode - GET(ecode, 1);
1092 saved_eptr = eptrb->epb_saved_eptr;
1094 /* Back up the stack of bracket start pointers. */
1096 eptrb = eptrb->epb_prev;
1098 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1099 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1102 md->end_match_ptr = eptr; /* For ONCE */
1103 md->end_offset_top = offset_top;
1104 RRETURN(MATCH_MATCH);
1107 /* In all other cases except a conditional group we have to check the
1108 group number back at the start and if necessary complete handling an
1109 extraction by setting the offsets and bumping the high water mark. */
1111 if (*prev != OP_COND)
1113 number = *prev - OP_BRA;
1115 /* For extended extraction brackets (large number), we have to fish out
1116 the number from a dummy opcode at the start. */
1118 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1119 offset = number << 1;
1122 printf("end bracket %d", number);
1126 /* Test for a numbered group. This includes groups called as a result
1127 of recursion. Note that whole-pattern recursion is coded as a recurse
1128 into group 0, so it won't be picked up here. Instead, we catch it when
1129 the OP_END is reached. */
1133 md->capture_last = number;
1134 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1136 md->offset_vector[offset] =
1137 md->offset_vector[md->offset_end - number];
1138 md->offset_vector[offset+1] = eptr - md->start_subject;
1139 if (offset_top <= offset) offset_top = offset + 2;
1142 /* Handle a recursively called group. Restore the offsets
1143 appropriately and continue from after the call. */
1145 if (md->recursive != NULL && md->recursive->group_num == number)
1147 recursion_info *rec = md->recursive;
1148 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1149 md->recursive = rec->prevrec;
1150 md->start_match = rec->save_start;
1151 memcpy(md->offset_vector, rec->offset_save,
1152 rec->saved_max * sizeof(int));
1153 ecode = rec->after_call;
1160 /* Reset the value of the ims flags, in case they got changed during
1164 DPRINTF(("ims reset to %02lx\n", ims));
1166 /* For a non-repeating ket, just continue at this level. This also
1167 happens for a repeating ket if no characters were matched in the group.
1168 This is the forcible breaking of infinite loops as implemented in Perl
1169 5.005. If there is an options reset, it will get obeyed in the normal
1170 course of events. */
1172 if (*ecode == OP_KET || eptr == saved_eptr)
1174 ecode += 1 + LINK_SIZE;
1178 /* The repeating kets try the rest of the pattern or restart from the
1179 preceding bracket, in the appropriate order. In the second case, we can use
1180 tail recursion to avoid using another stack frame. */
1182 if (*ecode == OP_KETRMIN)
1184 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1187 flags = match_isgroup;
1190 else /* OP_KETRMAX */
1192 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194 ecode += 1 + LINK_SIZE;
1198 /* Control never gets here */
1200 /* Start of subject unless notbol, or after internal newline if multiline */
1203 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1204 if ((ims & PCRE_MULTILINE) != 0)
1206 if (eptr != md->start_subject &&
1207 (eptr == md->end_subject ||
1208 eptr < md->start_subject + md->nllen ||
1209 !IS_NEWLINE(eptr - md->nllen)))
1210 RRETURN(MATCH_NOMATCH);
1214 /* ... else fall through */
1216 /* Start of subject assertion */
1219 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1223 /* Start of match assertion */
1226 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1230 /* Assert before internal newline if multiline, or before a terminating
1231 newline unless endonly is set, else end of subject unless noteol is set. */
1234 if ((ims & PCRE_MULTILINE) != 0)
1236 if (eptr < md->end_subject)
1237 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1239 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1245 if (md->noteol) RRETURN(MATCH_NOMATCH);
1248 if (eptr != md->end_subject &&
1249 (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1250 RRETURN(MATCH_NOMATCH);
1255 /* ... else fall through for endonly */
1257 /* End of subject assertion (\z) */
1260 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1264 /* End of subject or ending \n assertion (\Z) */
1267 if (eptr != md->end_subject &&
1268 (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1269 RRETURN(MATCH_NOMATCH);
1273 /* Word boundary assertions */
1275 case OP_NOT_WORD_BOUNDARY:
1276 case OP_WORD_BOUNDARY:
1279 /* Find out if the previous and current characters are "word" characters.
1280 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1281 be "non-word" characters. */
1286 if (eptr == md->start_subject) prev_is_word = FALSE; else
1288 const uschar *lastptr = eptr - 1;
1289 while((*lastptr & 0xc0) == 0x80) lastptr--;
1290 GETCHAR(c, lastptr);
1291 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1293 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1296 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1302 /* More streamlined when not in UTF-8 mode */
1305 prev_is_word = (eptr != md->start_subject) &&
1306 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1307 cur_is_word = (eptr < md->end_subject) &&
1308 ((md->ctypes[*eptr] & ctype_word) != 0);
1311 /* Now see if the situation is what we want */
1313 if ((*ecode++ == OP_WORD_BOUNDARY)?
1314 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1315 RRETURN(MATCH_NOMATCH);
1319 /* Match a single character type; inline for speed */
1322 if ((ims & PCRE_DOTALL) == 0)
1324 if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
1325 RRETURN(MATCH_NOMATCH);
1327 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1329 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1333 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1334 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1337 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1342 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1343 GETCHARINCTEST(c, eptr);
1348 (md->ctypes[c] & ctype_digit) != 0
1350 RRETURN(MATCH_NOMATCH);
1355 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1356 GETCHARINCTEST(c, eptr);
1361 (md->ctypes[c] & ctype_digit) == 0
1363 RRETURN(MATCH_NOMATCH);
1367 case OP_NOT_WHITESPACE:
1368 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1369 GETCHARINCTEST(c, eptr);
1374 (md->ctypes[c] & ctype_space) != 0
1376 RRETURN(MATCH_NOMATCH);
1381 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382 GETCHARINCTEST(c, eptr);
1387 (md->ctypes[c] & ctype_space) == 0
1389 RRETURN(MATCH_NOMATCH);
1393 case OP_NOT_WORDCHAR:
1394 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1395 GETCHARINCTEST(c, eptr);
1400 (md->ctypes[c] & ctype_word) != 0
1402 RRETURN(MATCH_NOMATCH);
1407 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1408 GETCHARINCTEST(c, eptr);
1413 (md->ctypes[c] & ctype_word) == 0
1415 RRETURN(MATCH_NOMATCH);
1420 /* Check the next character by Unicode property. We will get here only
1421 if the support is in the binary; otherwise a compile-time error occurs. */
1425 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1426 GETCHARINCTEST(c, eptr);
1428 int chartype, script;
1429 int category = _pcre_ucp_findprop(c, &chartype, &script);
1434 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1438 if ((chartype == ucp_Lu ||
1439 chartype == ucp_Ll ||
1440 chartype == ucp_Lt) == (op == OP_NOTPROP))
1441 RRETURN(MATCH_NOMATCH);
1445 if ((ecode[2] != category) == (op == OP_PROP))
1446 RRETURN(MATCH_NOMATCH);
1450 if ((ecode[2] != chartype) == (op == OP_PROP))
1451 RRETURN(MATCH_NOMATCH);
1455 if ((ecode[2] != script) == (op == OP_PROP))
1456 RRETURN(MATCH_NOMATCH);
1460 RRETURN(PCRE_ERROR_INTERNAL);
1468 /* Match an extended Unicode sequence. We will get here only if the support
1469 is in the binary; otherwise a compile-time error occurs. */
1472 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1473 GETCHARINCTEST(c, eptr);
1475 int chartype, script;
1476 int category = _pcre_ucp_findprop(c, &chartype, &script);
1477 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1478 while (eptr < md->end_subject)
1481 if (!utf8) c = *eptr; else
1483 GETCHARLEN(c, eptr, len);
1485 category = _pcre_ucp_findprop(c, &chartype, &script);
1486 if (category != ucp_M) break;
1495 /* Match a back reference, possibly repeatedly. Look past the end of the
1496 item to see if there is repeat information following. The code is similar
1497 to that for character classes, but repeated for efficiency. Then obey
1498 similar code to character type repeats - written out again for speed.
1499 However, if the referenced string is the empty string, always treat
1500 it as matched, any number of times (otherwise there could be infinite
1505 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1506 ecode += 3; /* Advance past item */
1508 /* If the reference is unset, set the length to be longer than the amount
1509 of subject left; this ensures that every attempt at a match fails. We
1510 can't just fail here, because of the possibility of quantifiers with zero
1513 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1514 md->end_subject - eptr + 1 :
1515 md->offset_vector[offset+1] - md->offset_vector[offset];
1517 /* Set up for repetition, or handle the non-repeated case */
1527 c = *ecode++ - OP_CRSTAR;
1528 minimize = (c & 1) != 0;
1529 min = rep_min[c]; /* Pick up values from tables; */
1530 max = rep_max[c]; /* zero for max => infinity */
1531 if (max == 0) max = INT_MAX;
1536 minimize = (*ecode == OP_CRMINRANGE);
1537 min = GET2(ecode, 1);
1538 max = GET2(ecode, 3);
1539 if (max == 0) max = INT_MAX;
1543 default: /* No repeat follows */
1544 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1546 continue; /* With the main loop */
1549 /* If the length of the reference is zero, just continue with the
1552 if (length == 0) continue;
1554 /* First, ensure the minimum number of matches are present. We get back
1555 the length of the reference string explicitly rather than passing the
1556 address of eptr, so that eptr can be a register variable. */
1558 for (i = 1; i <= min; i++)
1560 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1564 /* If min = max, continue at the same level without recursion.
1565 They are not both allowed to be zero. */
1567 if (min == max) continue;
1569 /* If minimizing, keep trying and advancing the pointer */
1573 for (fi = min;; fi++)
1575 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1577 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1578 RRETURN(MATCH_NOMATCH);
1581 /* Control never gets here */
1584 /* If maximizing, find the longest string and work backwards */
1589 for (i = min; i < max; i++)
1591 if (!match_ref(offset, eptr, length, md, ims)) break;
1596 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1600 RRETURN(MATCH_NOMATCH);
1603 /* Control never gets here */
1607 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1608 used when all the characters in the class have values in the range 0-255,
1609 and either the matching is caseful, or the characters are in the range
1610 0-127 when UTF-8 processing is enabled. The only difference between
1611 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1614 First, look past the end of the item to see if there is repeat information
1615 following. Then obey similar code to character type repeats - written out
1621 data = ecode + 1; /* Save for matching */
1622 ecode += 33; /* Advance past the item */
1632 c = *ecode++ - OP_CRSTAR;
1633 minimize = (c & 1) != 0;
1634 min = rep_min[c]; /* Pick up values from tables; */
1635 max = rep_max[c]; /* zero for max => infinity */
1636 if (max == 0) max = INT_MAX;
1641 minimize = (*ecode == OP_CRMINRANGE);
1642 min = GET2(ecode, 1);
1643 max = GET2(ecode, 3);
1644 if (max == 0) max = INT_MAX;
1648 default: /* No repeat follows */
1653 /* First, ensure the minimum number of matches are present. */
1659 for (i = 1; i <= min; i++)
1661 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1662 GETCHARINC(c, eptr);
1665 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1669 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1675 /* Not UTF-8 mode */
1677 for (i = 1; i <= min; i++)
1679 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1681 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1685 /* If max == min we can continue with the main loop without the
1688 if (min == max) continue;
1690 /* If minimizing, keep testing the rest of the expression and advancing
1691 the pointer while it matches the class. */
1699 for (fi = min;; fi++)
1701 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1702 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1703 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1704 GETCHARINC(c, eptr);
1707 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1711 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1717 /* Not UTF-8 mode */
1719 for (fi = min;; fi++)
1721 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1723 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1728 /* Control never gets here */
1731 /* If maximizing, find the longest possible run, then work backwards. */
1741 for (i = min; i < max; i++)
1744 if (eptr >= md->end_subject) break;
1745 GETCHARLEN(c, eptr, len);
1748 if (op == OP_CLASS) break;
1752 if ((data[c/8] & (1 << (c&7))) == 0) break;
1758 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1760 if (eptr-- == pp) break; /* Stop if tried at original pos */
1766 /* Not UTF-8 mode */
1768 for (i = min; i < max; i++)
1770 if (eptr >= md->end_subject) break;
1772 if ((data[c/8] & (1 << (c&7))) == 0) break;
1777 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 RRETURN(MATCH_NOMATCH);
1786 /* Control never gets here */
1789 /* Match an extended character class. This opcode is encountered only
1790 in UTF-8 mode, because that's the only time it is compiled. */
1795 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1796 ecode += GET(ecode, 1); /* Advance past the item */
1806 c = *ecode++ - OP_CRSTAR;
1807 minimize = (c & 1) != 0;
1808 min = rep_min[c]; /* Pick up values from tables; */
1809 max = rep_max[c]; /* zero for max => infinity */
1810 if (max == 0) max = INT_MAX;
1815 minimize = (*ecode == OP_CRMINRANGE);
1816 min = GET2(ecode, 1);
1817 max = GET2(ecode, 3);
1818 if (max == 0) max = INT_MAX;
1822 default: /* No repeat follows */
1827 /* First, ensure the minimum number of matches are present. */
1829 for (i = 1; i <= min; i++)
1831 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1832 GETCHARINC(c, eptr);
1833 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1836 /* If max == min we can continue with the main loop without the
1839 if (min == max) continue;
1841 /* If minimizing, keep testing the rest of the expression and advancing
1842 the pointer while it matches the class. */
1846 for (fi = min;; fi++)
1848 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1850 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1851 GETCHARINC(c, eptr);
1852 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1854 /* Control never gets here */
1857 /* If maximizing, find the longest possible run, then work backwards. */
1862 for (i = min; i < max; i++)
1865 if (eptr >= md->end_subject) break;
1866 GETCHARLEN(c, eptr, len);
1867 if (!_pcre_xclass(c, data)) break;
1872 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1874 if (eptr-- == pp) break; /* Stop if tried at original pos */
1877 RRETURN(MATCH_NOMATCH);
1880 /* Control never gets here */
1882 #endif /* End of XCLASS */
1884 /* Match a single character, casefully */
1892 GETCHARLEN(fc, ecode, length);
1893 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1894 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1899 /* Non-UTF-8 mode */
1901 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1902 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1907 /* Match a single character, caselessly */
1915 GETCHARLEN(fc, ecode, length);
1917 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1919 /* If the pattern character's value is < 128, we have only one byte, and
1920 can use the fast lookup table. */
1924 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1927 /* Otherwise we must pick up the subject character */
1932 GETCHARINC(dc, eptr);
1935 /* If we have Unicode property support, we can use it to test the other
1936 case of the character, if there is one. */
1941 if (dc != _pcre_ucp_othercase(fc))
1943 RRETURN(MATCH_NOMATCH);
1948 #endif /* SUPPORT_UTF8 */
1950 /* Non-UTF-8 mode */
1952 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1953 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1958 /* Match a single character repeatedly; different opcodes share code. */
1961 min = max = GET2(ecode, 1);
1968 max = GET2(ecode, 1);
1969 minimize = *ecode == OP_MINUPTO;
1979 c = *ecode++ - OP_STAR;
1980 minimize = (c & 1) != 0;
1981 min = rep_min[c]; /* Pick up values from tables; */
1982 max = rep_max[c]; /* zero for max => infinity */
1983 if (max == 0) max = INT_MAX;
1985 /* Common code for all repeated single-character matches. We can give
1986 up quickly if there are fewer than the minimum number of characters left in
1995 GETCHARLEN(fc, ecode, length);
1996 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1999 /* Handle multibyte character matching specially here. There is
2000 support for caseless matching if UCP support is present. */
2009 if ((ims & PCRE_CASELESS) != 0 &&
2010 (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
2012 oclength = _pcre_ord2utf8(othercase, occhars);
2013 #endif /* SUPPORT_UCP */
2015 for (i = 1; i <= min; i++)
2017 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2018 /* Need braces because of following else */
2019 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2022 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2027 if (min == max) continue;
2031 for (fi = min;; fi++)
2033 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2035 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2036 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2037 /* Need braces because of following else */
2038 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2041 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2045 /* Control never gets here */
2050 for (i = min; i < max; i++)
2052 if (eptr > md->end_subject - length) break;
2053 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2054 else if (oclength == 0) break;
2057 if (memcmp(eptr, occhars, oclength) != 0) break;
2063 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 RRETURN(MATCH_NOMATCH);
2069 /* Control never gets here */
2072 /* If the length of a UTF-8 character is 1, we fall through here, and
2073 obey the code as for non-UTF-8 characters below, though in this case the
2074 value of fc will always be < 128. */
2077 #endif /* SUPPORT_UTF8 */
2079 /* When not in UTF-8 mode, load a single-byte character. */
2081 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2085 /* The value of fc at this point is always less than 256, though we may or
2086 may not be in UTF-8 mode. The code is duplicated for the caseless and
2087 caseful cases, for speed, since matching characters is likely to be quite
2088 common. First, ensure the minimum number of matches are present. If min =
2089 max, continue at the same level without recursing. Otherwise, if
2090 minimizing, keep trying the rest of the expression and advancing one
2091 matching character if failing, up to the maximum. Alternatively, if
2092 maximizing, find the maximum number of characters and work backwards. */
2094 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2097 if ((ims & PCRE_CASELESS) != 0)
2100 for (i = 1; i <= min; i++)
2101 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2102 if (min == max) continue;
2105 for (fi = min;; fi++)
2107 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2109 if (fi >= max || eptr >= md->end_subject ||
2110 fc != md->lcc[*eptr++])
2111 RRETURN(MATCH_NOMATCH);
2113 /* Control never gets here */
2118 for (i = min; i < max; i++)
2120 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2125 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2127 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2129 RRETURN(MATCH_NOMATCH);
2131 /* Control never gets here */
2134 /* Caseful comparisons (includes all multi-byte characters) */
2138 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2139 if (min == max) continue;
2142 for (fi = min;; fi++)
2144 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2145 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2146 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2147 RRETURN(MATCH_NOMATCH);
2149 /* Control never gets here */
2154 for (i = min; i < max; i++)
2156 if (eptr >= md->end_subject || fc != *eptr) break;
2161 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2165 RRETURN(MATCH_NOMATCH);
2168 /* Control never gets here */
2170 /* Match a negated single one-byte character. The character we are
2171 checking can be multibyte. */
2174 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2176 GETCHARINCTEST(c, eptr);
2177 if ((ims & PCRE_CASELESS) != 0)
2183 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2187 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2191 /* Match a negated single one-byte character repeatedly. This is almost a
2192 repeat of the code for a repeated single character, but I haven't found a
2193 nice way of commoning these up that doesn't require a test of the
2194 positive/negative option for each character match. Maybe that wouldn't add
2195 very much to the time taken, but character matching *is* what this is all
2199 min = max = GET2(ecode, 1);
2206 max = GET2(ecode, 1);
2207 minimize = *ecode == OP_NOTMINUPTO;
2216 case OP_NOTMINQUERY:
2217 c = *ecode++ - OP_NOTSTAR;
2218 minimize = (c & 1) != 0;
2219 min = rep_min[c]; /* Pick up values from tables; */
2220 max = rep_max[c]; /* zero for max => infinity */
2221 if (max == 0) max = INT_MAX;
2223 /* Common code for all repeated single-byte matches. We can give up quickly
2224 if there are fewer than the minimum number of bytes left in the
2228 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2231 /* The code is duplicated for the caseless and caseful cases, for speed,
2232 since matching characters is likely to be quite common. First, ensure the
2233 minimum number of matches are present. If min = max, continue at the same
2234 level without recursing. Otherwise, if minimizing, keep trying the rest of
2235 the expression and advancing one matching character if failing, up to the
2236 maximum. Alternatively, if maximizing, find the maximum number of
2237 characters and work backwards. */
2239 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2242 if ((ims & PCRE_CASELESS) != 0)
2251 for (i = 1; i <= min; i++)
2253 GETCHARINC(d, eptr);
2254 if (d < 256) d = md->lcc[d];
2255 if (fc == d) RRETURN(MATCH_NOMATCH);
2261 /* Not UTF-8 mode */
2263 for (i = 1; i <= min; i++)
2264 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2267 if (min == max) continue;
2276 for (fi = min;; fi++)
2278 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2280 GETCHARINC(d, eptr);
2281 if (d < 256) d = md->lcc[d];
2282 if (fi >= max || eptr >= md->end_subject || fc == d)
2283 RRETURN(MATCH_NOMATCH);
2288 /* Not UTF-8 mode */
2290 for (fi = min;; fi++)
2292 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2293 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2294 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2295 RRETURN(MATCH_NOMATCH);
2298 /* Control never gets here */
2312 for (i = min; i < max; i++)
2315 if (eptr >= md->end_subject) break;
2316 GETCHARLEN(d, eptr, len);
2317 if (d < 256) d = md->lcc[d];
2323 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2324 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2325 if (eptr-- == pp) break; /* Stop if tried at original pos */
2331 /* Not UTF-8 mode */
2333 for (i = min; i < max; i++)
2335 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2340 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2346 RRETURN(MATCH_NOMATCH);
2348 /* Control never gets here */
2351 /* Caseful comparisons */
2360 for (i = 1; i <= min; i++)
2362 GETCHARINC(d, eptr);
2363 if (fc == d) RRETURN(MATCH_NOMATCH);
2368 /* Not UTF-8 mode */
2370 for (i = 1; i <= min; i++)
2371 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2374 if (min == max) continue;
2383 for (fi = min;; fi++)
2385 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387 GETCHARINC(d, eptr);
2388 if (fi >= max || eptr >= md->end_subject || fc == d)
2389 RRETURN(MATCH_NOMATCH);
2394 /* Not UTF-8 mode */
2396 for (fi = min;; fi++)
2398 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2400 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2401 RRETURN(MATCH_NOMATCH);
2404 /* Control never gets here */
2418 for (i = min; i < max; i++)
2421 if (eptr >= md->end_subject) break;
2422 GETCHARLEN(d, eptr, len);
2428 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2430 if (eptr-- == pp) break; /* Stop if tried at original pos */
2436 /* Not UTF-8 mode */
2438 for (i = min; i < max; i++)
2440 if (eptr >= md->end_subject || fc == *eptr) break;
2445 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2451 RRETURN(MATCH_NOMATCH);
2454 /* Control never gets here */
2456 /* Match a single character type repeatedly; several different opcodes
2457 share code. This is very similar to the code for single characters, but we
2458 repeat it in the interests of efficiency. */
2461 min = max = GET2(ecode, 1);
2467 case OP_TYPEMINUPTO:
2469 max = GET2(ecode, 1);
2470 minimize = *ecode == OP_TYPEMINUPTO;
2475 case OP_TYPEMINSTAR:
2477 case OP_TYPEMINPLUS:
2479 case OP_TYPEMINQUERY:
2480 c = *ecode++ - OP_TYPESTAR;
2481 minimize = (c & 1) != 0;
2482 min = rep_min[c]; /* Pick up values from tables; */
2483 max = rep_max[c]; /* zero for max => infinity */
2484 if (max == 0) max = INT_MAX;
2486 /* Common code for all repeated single character type matches. Note that
2487 in UTF-8 mode, '.' matches a character of any length, but for the other
2488 character types, the valid characters are all one-byte long. */
2491 ctype = *ecode++; /* Code for the character type */
2494 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2496 prop_fail_result = ctype == OP_NOTPROP;
2497 prop_type = *ecode++;
2498 prop_value = *ecode++;
2500 else prop_type = -1;
2503 /* First, ensure the minimum number of matches are present. Use inline
2504 code for maximizing the speed, and do the type test once at the start
2505 (i.e. keep it out of the loop). Also we can test that there are at least
2506 the minimum number of bytes before we start. This isn't as effective in
2507 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2508 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2509 and single-bytes. */
2511 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2520 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2521 for (i = 1; i <= min; i++)
2523 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2524 GETCHARINC(c, eptr);
2529 for (i = 1; i <= min; i++)
2531 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2532 GETCHARINC(c, eptr);
2533 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2534 if ((prop_chartype == ucp_Lu ||
2535 prop_chartype == ucp_Ll ||
2536 prop_chartype == ucp_Lt) == prop_fail_result)
2537 RRETURN(MATCH_NOMATCH);
2542 for (i = 1; i <= min; i++)
2544 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2545 GETCHARINC(c, eptr);
2546 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2547 if ((prop_category == prop_value) == prop_fail_result)
2548 RRETURN(MATCH_NOMATCH);
2553 for (i = 1; i <= min; i++)
2555 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2556 GETCHARINC(c, eptr);
2557 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2558 if ((prop_chartype == prop_value) == prop_fail_result)
2559 RRETURN(MATCH_NOMATCH);
2564 for (i = 1; i <= min; i++)
2566 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2567 GETCHARINC(c, eptr);
2568 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2569 if ((prop_script == prop_value) == prop_fail_result)
2570 RRETURN(MATCH_NOMATCH);
2575 RRETURN(PCRE_ERROR_INTERNAL);
2580 /* Match extended Unicode sequences. We will get here only if the
2581 support is in the binary; otherwise a compile-time error occurs. */
2583 else if (ctype == OP_EXTUNI)
2585 for (i = 1; i <= min; i++)
2587 GETCHARINCTEST(c, eptr);
2588 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2589 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2590 while (eptr < md->end_subject)
2593 if (!utf8) c = *eptr; else
2595 GETCHARLEN(c, eptr, len);
2597 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2598 if (prop_category != ucp_M) break;
2605 #endif /* SUPPORT_UCP */
2607 /* Handle all other cases when the coding is UTF-8 */
2610 if (utf8) switch(ctype)
2613 for (i = 1; i <= min; i++)
2615 if (eptr >= md->end_subject ||
2616 ((ims & PCRE_DOTALL) == 0 &&
2617 eptr <= md->end_subject - md->nllen &&
2619 RRETURN(MATCH_NOMATCH);
2621 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2630 for (i = 1; i <= min; i++)
2632 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2633 GETCHARINC(c, eptr);
2634 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2635 RRETURN(MATCH_NOMATCH);
2640 for (i = 1; i <= min; i++)
2642 if (eptr >= md->end_subject ||
2643 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2644 RRETURN(MATCH_NOMATCH);
2645 /* No need to skip more bytes - we know it's a 1-byte character */
2649 case OP_NOT_WHITESPACE:
2650 for (i = 1; i <= min; i++)
2652 if (eptr >= md->end_subject ||
2653 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2654 RRETURN(MATCH_NOMATCH);
2655 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2660 for (i = 1; i <= min; i++)
2662 if (eptr >= md->end_subject ||
2663 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2664 RRETURN(MATCH_NOMATCH);
2665 /* No need to skip more bytes - we know it's a 1-byte character */
2669 case OP_NOT_WORDCHAR:
2670 for (i = 1; i <= min; i++)
2672 if (eptr >= md->end_subject ||
2673 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2674 RRETURN(MATCH_NOMATCH);
2675 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2680 for (i = 1; i <= min; i++)
2682 if (eptr >= md->end_subject ||
2683 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2684 RRETURN(MATCH_NOMATCH);
2685 /* No need to skip more bytes - we know it's a 1-byte character */
2690 RRETURN(PCRE_ERROR_INTERNAL);
2691 } /* End switch(ctype) */
2694 #endif /* SUPPORT_UTF8 */
2696 /* Code for the non-UTF-8 case for minimum matching of operators other
2697 than OP_PROP and OP_NOTPROP. */
2702 if ((ims & PCRE_DOTALL) == 0)
2704 for (i = 1; i <= min; i++)
2706 if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
2707 RRETURN(MATCH_NOMATCH);
2719 for (i = 1; i <= min; i++)
2720 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2724 for (i = 1; i <= min; i++)
2725 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2728 case OP_NOT_WHITESPACE:
2729 for (i = 1; i <= min; i++)
2730 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2734 for (i = 1; i <= min; i++)
2735 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2738 case OP_NOT_WORDCHAR:
2739 for (i = 1; i <= min; i++)
2740 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2741 RRETURN(MATCH_NOMATCH);
2745 for (i = 1; i <= min; i++)
2746 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2747 RRETURN(MATCH_NOMATCH);
2751 RRETURN(PCRE_ERROR_INTERNAL);
2755 /* If min = max, continue at the same level without recursing */
2757 if (min == max) continue;
2759 /* If minimizing, we have to test the rest of the pattern before each
2760 subsequent match. Again, separate the UTF-8 case for speed, and also
2761 separate the UCP cases. */
2771 for (fi = min;; fi++)
2773 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2775 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2776 GETCHARINC(c, eptr);
2777 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2782 for (fi = min;; fi++)
2784 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2786 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2787 GETCHARINC(c, eptr);
2788 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2789 if ((prop_chartype == ucp_Lu ||
2790 prop_chartype == ucp_Ll ||
2791 prop_chartype == ucp_Lt) == prop_fail_result)
2792 RRETURN(MATCH_NOMATCH);
2797 for (fi = min;; fi++)
2799 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2802 GETCHARINC(c, eptr);
2803 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2804 if ((prop_category == prop_value) == prop_fail_result)
2805 RRETURN(MATCH_NOMATCH);
2810 for (fi = min;; fi++)
2812 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2814 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2815 GETCHARINC(c, eptr);
2816 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2817 if ((prop_chartype == prop_value) == prop_fail_result)
2818 RRETURN(MATCH_NOMATCH);
2823 for (fi = min;; fi++)
2825 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2827 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2828 GETCHARINC(c, eptr);
2829 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2830 if ((prop_script == prop_value) == prop_fail_result)
2831 RRETURN(MATCH_NOMATCH);
2836 RRETURN(PCRE_ERROR_INTERNAL);
2841 /* Match extended Unicode sequences. We will get here only if the
2842 support is in the binary; otherwise a compile-time error occurs. */
2844 else if (ctype == OP_EXTUNI)
2846 for (fi = min;; fi++)
2848 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2850 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2851 GETCHARINCTEST(c, eptr);
2852 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2853 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2854 while (eptr < md->end_subject)
2857 if (!utf8) c = *eptr; else
2859 GETCHARLEN(c, eptr, len);
2861 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2862 if (prop_category != ucp_M) break;
2869 #endif /* SUPPORT_UCP */
2875 for (fi = min;; fi++)
2877 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2879 if (fi >= max || eptr >= md->end_subject ||
2880 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
2881 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2882 RRETURN(MATCH_NOMATCH);
2884 GETCHARINC(c, eptr);
2887 case OP_ANY: /* This is the DOTALL case */
2894 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2895 RRETURN(MATCH_NOMATCH);
2899 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2900 RRETURN(MATCH_NOMATCH);
2903 case OP_NOT_WHITESPACE:
2904 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2905 RRETURN(MATCH_NOMATCH);
2909 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2910 RRETURN(MATCH_NOMATCH);
2913 case OP_NOT_WORDCHAR:
2914 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2915 RRETURN(MATCH_NOMATCH);
2919 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2920 RRETURN(MATCH_NOMATCH);
2924 RRETURN(PCRE_ERROR_INTERNAL);
2930 /* Not UTF-8 mode */
2932 for (fi = min;; fi++)
2934 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2936 if (fi >= max || eptr >= md->end_subject ||
2937 ((ims & PCRE_DOTALL) == 0 &&
2938 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2939 RRETURN(MATCH_NOMATCH);
2944 case OP_ANY: /* This is the DOTALL case */
2951 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2955 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2958 case OP_NOT_WHITESPACE:
2959 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2963 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2966 case OP_NOT_WORDCHAR:
2967 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2971 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2975 RRETURN(PCRE_ERROR_INTERNAL);
2979 /* Control never gets here */
2982 /* If maximizing it is worth using inline code for speed, doing the type
2983 test once at the start (i.e. keep it out of the loop). Again, keep the
2984 UTF-8 and UCP stuff separate. */
2988 pp = eptr; /* Remember where we started */
2996 for (i = min; i < max; i++)
2999 if (eptr >= md->end_subject) break;
3000 GETCHARLEN(c, eptr, len);
3001 if (prop_fail_result) break;
3007 for (i = min; i < max; i++)
3010 if (eptr >= md->end_subject) break;
3011 GETCHARLEN(c, eptr, len);
3012 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3013 if ((prop_chartype == ucp_Lu ||
3014 prop_chartype == ucp_Ll ||
3015 prop_chartype == ucp_Lt) == prop_fail_result)
3022 for (i = min; i < max; i++)
3025 if (eptr >= md->end_subject) break;
3026 GETCHARLEN(c, eptr, len);
3027 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3028 if ((prop_category == prop_value) == prop_fail_result)
3035 for (i = min; i < max; i++)
3038 if (eptr >= md->end_subject) break;
3039 GETCHARLEN(c, eptr, len);
3040 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3041 if ((prop_chartype == prop_value) == prop_fail_result)
3048 for (i = min; i < max; i++)
3051 if (eptr >= md->end_subject) break;
3052 GETCHARLEN(c, eptr, len);
3053 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3054 if ((prop_script == prop_value) == prop_fail_result)
3061 /* eptr is now past the end of the maximum run */
3065 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3067 if (eptr-- == pp) break; /* Stop if tried at original pos */
3072 /* Match extended Unicode sequences. We will get here only if the
3073 support is in the binary; otherwise a compile-time error occurs. */
3075 else if (ctype == OP_EXTUNI)
3077 for (i = min; i < max; i++)
3079 if (eptr >= md->end_subject) break;
3080 GETCHARINCTEST(c, eptr);
3081 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3082 if (prop_category == ucp_M) break;
3083 while (eptr < md->end_subject)
3086 if (!utf8) c = *eptr; else
3088 GETCHARLEN(c, eptr, len);
3090 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3091 if (prop_category != ucp_M) break;
3096 /* eptr is now past the end of the maximum run */
3100 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3101 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3102 if (eptr-- == pp) break; /* Stop if tried at original pos */
3103 for (;;) /* Move back over one extended */
3107 if (!utf8) c = *eptr; else
3109 GETCHARLEN(c, eptr, len);
3111 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3112 if (prop_category != ucp_M) break;
3119 #endif /* SUPPORT_UCP */
3130 /* Special code is required for UTF8, but when the maximum is
3131 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3132 probably worth it, because .* is quite a common idiom. */
3136 if ((ims & PCRE_DOTALL) == 0)
3138 for (i = min; i < max; i++)
3140 if (eptr >= md->end_subject ||
3141 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3144 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3149 for (i = min; i < max; i++)
3151 if (eptr >= md->end_subject) break;
3153 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3158 /* Handle unlimited UTF-8 repeat */
3162 if ((ims & PCRE_DOTALL) == 0)
3164 for (i = min; i < max; i++)
3166 if (eptr >= md->end_subject ||
3167 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3176 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3182 /* The byte case is the same as non-UTF8 */
3186 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3191 for (i = min; i < max; i++)
3194 if (eptr >= md->end_subject) break;
3195 GETCHARLEN(c, eptr, len);
3196 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3202 for (i = min; i < max; i++)
3205 if (eptr >= md->end_subject) break;
3206 GETCHARLEN(c, eptr, len);
3207 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3212 case OP_NOT_WHITESPACE:
3213 for (i = min; i < max; i++)
3216 if (eptr >= md->end_subject) break;
3217 GETCHARLEN(c, eptr, len);
3218 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3224 for (i = min; i < max; i++)
3227 if (eptr >= md->end_subject) break;
3228 GETCHARLEN(c, eptr, len);
3229 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3234 case OP_NOT_WORDCHAR:
3235 for (i = min; i < max; i++)
3238 if (eptr >= md->end_subject) break;
3239 GETCHARLEN(c, eptr, len);
3240 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3246 for (i = min; i < max; i++)
3249 if (eptr >= md->end_subject) break;
3250 GETCHARLEN(c, eptr, len);
3251 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3257 RRETURN(PCRE_ERROR_INTERNAL);
3260 /* eptr is now past the end of the maximum run */
3264 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3265 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3266 if (eptr-- == pp) break; /* Stop if tried at original pos */
3273 /* Not UTF-8 mode */
3278 if ((ims & PCRE_DOTALL) == 0)
3280 for (i = min; i < max; i++)
3282 if (eptr >= md->end_subject ||
3283 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3289 /* For DOTALL case, fall through and treat as \C */
3293 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3298 for (i = min; i < max; i++)
3300 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3307 for (i = min; i < max; i++)
3309 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3315 case OP_NOT_WHITESPACE:
3316 for (i = min; i < max; i++)
3318 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3325 for (i = min; i < max; i++)
3327 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3333 case OP_NOT_WORDCHAR:
3334 for (i = min; i < max; i++)
3336 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3343 for (i = min; i < max; i++)
3345 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3352 RRETURN(PCRE_ERROR_INTERNAL);
3355 /* eptr is now past the end of the maximum run */
3359 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3365 /* Get here if we can't make it match with any permitted repetitions */
3367 RRETURN(MATCH_NOMATCH);
3369 /* Control never gets here */
3371 /* There's been some horrible disaster. Since all codes > OP_BRA are
3372 for capturing brackets, and there shouldn't be any gaps between 0 and
3373 OP_BRA, arrival here can only mean there is something seriously wrong
3374 in the code above or the OP_xxx definitions. */
3377 DPRINTF(("Unknown opcode %d\n", *ecode));
3378 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3381 /* Do not stick any code in here without much thought; it is assumed
3382 that "continue" in the code above comes out to here to repeat the main loop. */
3385 } /* End of main loop */
3386 /* Control never reaches here */
3390 /***************************************************************************
3391 ****************************************************************************
3392 RECURSION IN THE match() FUNCTION
3394 Undefine all the macros that were defined above to handle this. */
/* new_recursive is one of the macro names covered by the preceding note about
undefining the macros used for recursion handling in match(). */
3412 #undef new_recursive
/* save_capture_last is undefined in the same way. */
3428 #undef save_capture_last
3438 /* These two are defined as macros in both cases */
3443 /***************************************************************************
3444 ***************************************************************************/
3448 /*************************************************
3449 * Execute a Regular Expression *
3450 *************************************************/
/* Outline of the body that follows: the arguments are sanity-checked, a
match_data block (md) is populated from the compiled pattern, the call options
and any extra_data, and match() is then invoked from start_match; the enclosing
loop is closed by "while (!anchored && start_match <= end_subject)". Error codes
returned directly from this function in the lines shown are
PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_BADMAGIC,
PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8, PCRE_ERROR_BADUTF8_OFFSET,
PCRE_ERROR_NOMEMORY, PCRE_ERROR_PARTIAL and PCRE_ERROR_NOMATCH. */
3452 /* This function applies a compiled re to a subject string and picks out
3453 portions of the string if it matches. Two elements in the vector are set for
3454 each substring: the offsets to the start and end of the substring.
3457 argument_re points to the compiled expression
3458 extra_data points to extra data or is NULL
3459 subject points to the subject string
3460 length length of subject string (may contain binary zeros)
3461 start_offset where to start in the subject string
3463 offsets points to a vector of ints to be filled in with offsets
3464 offsetcount the number of elements in the vector
3466 Returns: > 0 => success; value is the number of elements filled in
3467 = 0 => success, but offsets is not big enough
3468 -1 => failed to match
3469 < -1 => some kind of unexpected problem
3473 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3474 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3477 int rc, resetcount, ocount;
3478 int first_byte = -1;
3482 unsigned long int ims;
3483 BOOL using_temporary_offsets = FALSE;
3487 BOOL first_byte_caseless = FALSE;
3488 BOOL req_byte_caseless = FALSE;
3489 match_data match_block;
3490 match_data *md = &match_block;
3491 const uschar *tables;
3492 const uschar *start_bits = NULL;
3493 USPTR start_match = (USPTR)subject + start_offset;
3495 USPTR req_byte_ptr = start_match - 1;
3497 pcre_study_data internal_study;
3498 const pcre_study_data *study;
3500 real_pcre internal_re;
3501 const real_pcre *external_re = (const real_pcre *)argument_re;
3502 const real_pcre *re = external_re;
3504 /* Plausibility checks */
/* Order of the checks: unknown option bits first, then NULL pointers (a NULL
offsets vector is accepted only when offsetcount is not positive), then a
negative offsetcount. */
3506 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3507 if (re == NULL || subject == NULL ||
3508 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3509 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3511 /* Fish out the optional data from the extra_data structure, first setting
3512 the default values. */
3515 md->match_limit = MATCH_LIMIT;
3516 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3517 md->callout_data = NULL;
3519 /* The table pointer is always in native byte order. */
3521 tables = external_re->tables;
/* Each optional field is taken from extra_data only when the corresponding
bit is set in extra_data->flags. For the two limits, the callout data and the
tables pointer, the value assigned above is kept otherwise. */
3523 if (extra_data != NULL)
3525 register unsigned int flags = extra_data->flags;
3526 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3527 study = (const pcre_study_data *)extra_data->study_data;
3528 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3529 md->match_limit = extra_data->match_limit;
3530 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3531 md->match_limit_recursion = extra_data->match_limit_recursion;
3532 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3533 md->callout_data = extra_data->callout_data;
3534 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3537 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3538 is a feature that makes it possible to save compiled regex and re-use them
3539 in other programs later. */
3541 if (tables == NULL) tables = _pcre_default_tables;
3543 /* Check that the first field in the block is the magic number. If it is not,
3544 test for a regex that was compiled on a host of opposite endianness. If this is
3545 the case, flipped values are put in internal_re and internal_study if there was
3548 if (re->magic_number != MAGIC_NUMBER)
3550 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3551 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3552 if (study != NULL) study = &internal_study;
3555 /* Set up other data */
/* Anchoring can come from either the compiled pattern or this call's options;
startline and firstline are read from the compiled pattern only. */
3557 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3558 startline = (re->options & PCRE_STARTLINE) != 0;
3559 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3561 /* The code starts after the real_pcre block and the capture name table. */
3563 md->start_code = (const uschar *)external_re + re->name_table_offset +
3564 re->name_count * re->name_entry_size;
3566 md->start_subject = (USPTR)subject;
3567 md->start_offset = start_offset;
3568 md->end_subject = md->start_subject + length;
3569 end_subject = md->end_subject;
/* endonly and utf8 are fixed by the compiled pattern; notbol, noteol, notempty
and partial are taken from the options passed to this call. */
3571 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3572 md->utf8 = (re->options & PCRE_UTF8) != 0;
3574 md->notbol = (options & PCRE_NOTBOL) != 0;
3575 md->noteol = (options & PCRE_NOTEOL) != 0;
3576 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3577 md->partial = (options & PCRE_PARTIAL) != 0;
3580 md->recursive = NULL; /* No recursion at top level */
3582 md->lcc = tables + lcc_offset;
3583 md->ctypes = tables + ctypes_offset;
3585 /* Handle different types of newline. The two bits give four cases. If nothing
3586 is set at run time, whatever was used at compile time applies. */
3588 switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
3591 default: newline = NEWLINE; break; /* Compile-time default */
3592 case PCRE_NEWLINE_CR: newline = '\r'; break;
3593 case PCRE_NEWLINE_LF: newline = '\n'; break;
3594 case PCRE_NEWLINE_CR+
3595 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
/* nl[0] receives bits 8-15 of newline and nl[1] bits 0-7. */
3601 md->nl[0] = (newline >> 8) & 255;
3602 md->nl[1] = newline & 255;
3607 md->nl[0] = newline;
3610 /* Partial matching is supported only for a restricted set of regexes at the
3613 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3614 return PCRE_ERROR_BADPARTIAL;
3616 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3617 back the character offset. */
3620 if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
/* A non-negative result from _pcre_valid_utf8() is treated as a failure. */
3622 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3623 return PCRE_ERROR_BADUTF8;
3624 if (start_offset > 0 && start_offset < length)
/* tb is the byte found at start_offset; the test on it decides whether
PCRE_ERROR_BADUTF8_OFFSET is returned. */
3626 int tb = ((uschar *)subject)[start_offset];
3630 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3636 /* The ims options can vary during the matching as a result of the presence
3637 of (?ims) items in the pattern. They are kept in a local variable so that
3638 restoring at the exit of a group is easy. */
3640 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3642 /* If the expression has got more back references than the offsets supplied can
3643 hold, we get a temporary chunk of working store to use during the matching.
3644 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3647 ocount = offsetcount - (offsetcount % 3);
3649 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3651 ocount = re->top_backref * 3 + 3;
3652 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3653 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3654 using_temporary_offsets = TRUE;
3655 DPRINTF(("Got memory to hold back references\n"));
3657 else md->offset_vector = offsets;
3659 md->offset_end = ocount;
3660 md->offset_max = (2*ocount)/3;
3661 md->offset_overflow = FALSE;
3662 md->capture_last = -1;
3664 /* Compute the minimum number of offsets that we need to reset each time. Doing
3665 this makes a huge difference to execution time when there aren't many brackets
3668 resetcount = 2 + re->top_bracket * 2;
3669 if (resetcount > offsetcount) resetcount = ocount;
3671 /* Reset the working variable associated with each extraction. These should
3672 never be used unless previously set, but they get saved and restored, and so we
3673 initialize them to avoid reading uninitialized locations. */
3675 if (md->offset_vector != NULL)
/* Working downwards, -1 is stored into elements ocount - resetcount/2 + 1
through ocount - 1 of the vector. */
3677 register int *iptr = md->offset_vector + ocount;
3678 register int *iend = iptr - resetcount/2 + 1;
3679 while (--iptr >= iend) *iptr = -1;
3682 /* Set up the first character to match, if available. The first_byte value is
3683 never set for an anchored regular expression, but the anchoring may be forced
3684 at run time, so we have to test for anchoring. The first char may be unset for
3685 an unanchored pattern, of course. If there's no first char and the pattern was
3686 studied, there may be a bitmap of possible first characters. */
3690 if ((re->options & PCRE_FIRSTSET) != 0)
/* The low 8 bits of re->first_byte hold the byte value. If REQ_CASELESS is
set in it, first_byte is replaced by its md->lcc mapping. */
3692 first_byte = re->first_byte & 255;
3693 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3694 first_byte = md->lcc[first_byte];
/* start_bits is picked up only when startline is false and study data is
present with PCRE_STUDY_MAPPED set. */
3697 if (!startline && study != NULL &&
3698 (study->options & PCRE_STUDY_MAPPED) != 0)
3699 start_bits = study->start_bits;
3702 /* For anchored or unanchored matches, there may be a "last known required
3705 if ((re->options & PCRE_REQCHSET) != 0)
3707 req_byte = re->req_byte & 255;
3708 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3709 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3712 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3713 the loop runs just once. */
3717 USPTR save_end_subject = end_subject;
3719 /* Reset the maximum number of extractions we might see. */
3721 if (md->offset_vector != NULL)
3723 register int *iptr = md->offset_vector;
3724 register int *iend = iptr + resetcount;
3725 while (iptr < iend) *iptr++ = -1;
3728 /* Advance to a unique first char if possible. If firstline is TRUE, the
3729 start of the match is constrained to the first line of a multiline string.
3730 Implement this by temporarily adjusting end_subject so that we stop scanning
3731 at a newline. If the match fails at the newline, later code breaks this loop.
3736 USPTR t = start_match;
3737 while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
3741 /* Now test for a unique first byte */
3743 if (first_byte >= 0)
/* In the caseless case each subject byte is mapped through md->lcc before the
comparison; otherwise the raw byte is compared. */
3745 if (first_byte_caseless)
3746 while (start_match < end_subject &&
3747 md->lcc[*start_match] != first_byte)
3750 while (start_match < end_subject && *start_match != first_byte)
3754 /* Or to just after a linebreak for a multiline match if possible */
3758 if (start_match >= md->start_subject + md->nllen +
3761 while (start_match <= end_subject &&
3762 !IS_NEWLINE(start_match - md->nllen))
3767 /* Or to a non-unique first char after study */
3769 else if (start_bits != NULL)
3771 while (start_match < end_subject)
3773 register unsigned int c = *start_match;
/* Byte value c is looked up as bit (c & 7) of start_bits[c/8]; a clear bit
means start_match advances. */
3774 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3778 /* Restore fudged end_subject */
3780 end_subject = save_end_subject;
3782 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3783 printf(">>>> Match against: ");
3784 pchars(start_match, end_subject - start_match, TRUE, md);
3788 /* If req_byte is set, we know that that character must appear in the subject
3789 for the match to succeed. If the first character is set, req_byte must be
3790 later in the subject; otherwise the test starts at the match point. This
3791 optimization can save a huge amount of backtracking in patterns with nested
3792 unlimited repeats that aren't going to match. Writing separate code for
3793 cased/caseless versions makes it go faster, as does using an autoincrement
3794 and backing off on a match.
3796 HOWEVER: when the subject string is very, very long, searching to its end can
3797 take a long time, and give bad performance on quite ordinary patterns. This
3798 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3799 don't do this when the string is sufficiently long.
3801 ALSO: this processing is disabled when partial matching is requested.
3804 if (req_byte >= 0 &&
3805 end_subject - start_match < REQ_BYTE_MAX &&
3808 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
3810 /* We don't need to repeat the search if we haven't yet reached the
3811 place we found it at last time. */
3813 if (p > req_byte_ptr)
3815 if (req_byte_caseless)
3817 while (p < end_subject)
/* p is post-incremented while reading; on a hit it is stepped back so that it
points at the byte that was found. */
3819 register int pp = *p++;
3820 if (pp == req_byte || pp == req_byte2) { p--; break; }
3825 while (p < end_subject)
3827 if (*p++ == req_byte) { p--; break; }
3831 /* If we can't find the required character, break the matching loop */
3833 if (p >= end_subject) break;
3835 /* If we have found the required character, save the point where we
3836 found it, so that we don't search again next time round the loop if
3837 the start hasn't passed this character yet. */
3843 /* When a match occurs, substrings will be set for all internal extractions;
3844 we just need to set up the whole thing as substring 0 before returning. If
3845 there were too many extractions, set the return code to zero. In the case
3846 where we had to get some local store to hold offsets for backreferences, copy
3847 those back references that we can. In this case there need not be overflow
3848 if certain parts of the pattern were not used. */
/* The current start position is recorded and the call counter zeroed
immediately before match() is called. */
3850 md->start_match = start_match;
3851 md->match_call_count = 0;
3853 rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
3855 /* When the result is no match, if the subject's first character was a
3856 newline and the PCRE_FIRSTLINE option is set, break (which will return
3857 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3858 newline in the subject. Otherwise, advance the pointer to the next character
3859 and continue - but the continuation will actually happen only when the
3860 pattern is not anchored. */
3862 if (rc == MATCH_NOMATCH)
3865 start_match <= md->end_subject - md->nllen &&
3866 IS_NEWLINE(start_match))
/* (byte & 0xc0) == 0x80 selects bytes whose top two bits are 10. */
3871 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3877 if (rc != MATCH_MATCH)
3879 DPRINTF((">>>> error: returning %d\n", rc));
3883 /* We have a match! Copy the offset information from temporary store if
3886 if (using_temporary_offsets)
3888 if (offsetcount >= 4)
3890 memcpy(offsets + 2, md->offset_vector + 2,
3891 (offsetcount - 2) * sizeof(int));
3892 DPRINTF(("Copied offsets from temporary memory\n"));
3894 if (md->end_offset_top > offsetcount)
3895 md->offset_overflow = TRUE;
3897 DPRINTF(("Freeing temporary memory\n"));
3898 (pcre_free)(md->offset_vector);
3901 rc = md->offset_overflow? 0 : md->end_offset_top/2;
3903 if (offsetcount < 2) rc = 0; else
3905 offsets[0] = start_match - md->start_subject;
3906 offsets[1] = md->end_match_ptr - md->start_subject;
3909 DPRINTF((">>>> returning %d\n", rc));
3913 /* This "while" is the end of the "do" above */
3915 while (!anchored && start_match <= end_subject);
/* The temporary offset vector, if one was obtained, is released here. */
3917 if (using_temporary_offsets)
3919 DPRINTF(("Freeing temporary memory\n"));
3920 (pcre_free)(md->offset_vector);
/* PCRE_ERROR_PARTIAL is returned only when partial matching was requested and
md->hitend is set; otherwise PCRE_ERROR_NOMATCH is returned. */
3923 if (md->partial && md->hitend)
3925 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3926 return PCRE_ERROR_PARTIAL;
3930 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3931 return PCRE_ERROR_NOMATCH;
3935 /* End of pcre_exec.c */