1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.5 2007/06/26 11:16:54 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
51 #include "pcre_internal.h"
53 /* Undefine some potentially clashing cpp symbols */
58 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
59 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
61 #define EPTR_WORK_SIZE (1000)
63 /* Flag bits for the match() function */
65 #define match_condassert 0x01 /* Called to check a condition assertion */
66 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
67 #define match_tail_recursed 0x04 /* Tail recursive call */
69 /* Non-error returns from the match() function. Error returns are externally
70 defined PCRE_ERROR_xxx codes, which are all negative. */
73 #define MATCH_NOMATCH 0
75 /* Maximum number of ints of offset to save on the stack for recursive calls.
76 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
77 because the offset vector is always a multiple of 3 long. */
79 #define REC_STACK_SAVE_MAX 30
81 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
83 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
84 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89 /*************************************************
90 * Debugging function to print chars *
91 *************************************************/
93 /* Print a sequence of chars in printable format, stopping at the end of the
94 subject if requested.
97 p points to characters
98 length number to print
99 is_subject TRUE if printing from within md->start_subject
100 md pointer to matching data block, if is_subject is TRUE
106 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
109 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
111 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
117 /*************************************************
118 * Match a back-reference *
119 *************************************************/
121 /* If a back reference hasn't been set, the length that is passed is greater
122 than the number of characters left in the string, so the match fails.
125 offset index into the offset vector
126 eptr points into the subject
127 length length to be matched
128 md points to match data block
131 Returns: TRUE if matched
135 match_ref(int offset, register USPTR eptr, int length, match_data *md,
136 unsigned long int ims)
138 USPTR p = md->start_subject + md->offset_vector[offset];
141 if (eptr >= md->end_subject)
142 printf("matching subject <null>");
145 printf("matching subject ");
146 pchars(eptr, length, TRUE, md);
148 printf(" against backref ");
149 pchars(p, length, FALSE, md);
153 /* Always fail if not enough characters left */
155 if (length > md->end_subject - eptr) return FALSE;
157 /* Separate the caseless case for speed */
159 if ((ims & PCRE_CASELESS) != 0)
162 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
165 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
172 /***************************************************************************
173 ****************************************************************************
174 RECURSION IN THE match() FUNCTION
176 The match() function is highly recursive, though not every recursive call
177 increases the recursive depth. Nevertheless, some regular expressions can cause
178 it to recurse to a great depth. I was writing for Unix, so I just let it call
179 itself recursively. This uses the stack for saving everything that has to be
180 saved for a recursive call. On Unix, the stack can be large, and this works
183 It turns out that on some non-Unix-like systems there are problems with
184 programs that use a lot of stack. (This despite the fact that every last chip
185 has oodles of memory these days, and techniques for extending the stack have
186 been known for decades.) So....
188 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
189 calls by keeping local variables that need to be preserved in blocks of memory
190 obtained from malloc() instead of on the stack. Macros are used to
191 achieve this so that the actual code doesn't look very different to what it
194 The original heap-recursive code used longjmp(). However, it seems that this
195 can be very slow on some operating systems. Following a suggestion from Stan
196 Switzer, the use of longjmp() has been abolished, at the cost of having to
197 provide a unique number for each call to RMATCH. There is no way of generating
198 a sequence of numbers at compile time in C. I have given them names, to make
199 them stand out more clearly.
201 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
202 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
203 tests. Furthermore, not using longjmp() means that local dynamic variables
204 don't have indeterminate values; this has meant that the frame size can be
205 reduced because the result can be "passed back" by straight setting of the
206 variable instead of being passed in the frame.
207 ****************************************************************************
208 ***************************************************************************/
211 /* Numbers for RMATCH calls */
213 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
214 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
215 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
216 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
217 RM41, RM42, RM43, RM44, RM45, RM46, RM47 };
220 /* These versions of the macros use the stack, as normal. There are debugging
221 versions and production versions. Note that the "rw" argument of RMATCH isn't
222 actually used in this definition. */
225 #define REGISTER register
228 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
230 printf("match() called in line %d\n", __LINE__); \
231 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
232 printf("to line %d\n", __LINE__); \
234 #define RRETURN(ra) \
236 printf("match() returned %d from line %d ", ra, __LINE__); \
240 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
241 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
242 #define RRETURN(ra) return ra
248 /* These versions of the macros manage a private stack on the heap. Note that
249 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
250 argument of match(), which never changes. */
254 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
256 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
257 frame->Xwhere = rw; \
258 newframe->Xeptr = ra;\
259 newframe->Xecode = rb;\
260 newframe->Xmstart = mstart;\
261 newframe->Xoffset_top = rc;\
262 newframe->Xims = re;\
263 newframe->Xeptrb = rf;\
264 newframe->Xflags = rg;\
265 newframe->Xrdepth = frame->Xrdepth + 1;\
266 newframe->Xprevframe = frame;\
268 DPRINTF(("restarting from line %d\n", __LINE__));\
271 DPRINTF(("jumped back to line %d\n", __LINE__));\
276 heapframe *newframe = frame;\
277 frame = newframe->Xprevframe;\
278 (pcre_stack_free)(newframe);\
288 /* Structure for remembering the local variables in a private frame */
290 typedef struct heapframe {
291 struct heapframe *Xprevframe;
293 /* Function arguments that may change */
296 const uschar *Xecode;
297 const uschar *Xmstart;
302 unsigned int Xrdepth;
304 /* Function local variables */
306 const uschar *Xcallpat;
307 const uschar *Xcharptr;
312 const uschar *Xsaved_eptr;
314 recursion_info Xnew_recursive;
320 unsigned long int Xoriginal_ims;
325 int Xprop_fail_result;
342 int Xsave_capture_last;
343 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
344 int Xstacksave[REC_STACK_SAVE_MAX];
348 /* Where to jump back to */
357 /***************************************************************************
358 ***************************************************************************/
362 /*************************************************
363 * Match from current position *
364 *************************************************/
366 /* This function is called recursively in many circumstances. Whenever it
367 returns a negative (error) response, the outer incarnation must also return the
370 Performance note: It might be tempting to extract commonly used fields from the
371 md structure (e.g. utf8, end_subject) into individual variables to improve
372 performance. Tests using gcc on a SPARC disproved this; in the first case, it
373 made performance worse.
376 eptr pointer to current character in subject
377 ecode pointer to current position in compiled code
378 mstart pointer to the current match start position (can be modified
380 offset_top current top pointer
381 md pointer to "static" info for the match
382 ims current /i, /m, and /s options
383 eptrb pointer to chain of blocks containing eptr at start of
384 brackets - for testing for empty matches
386 match_condassert - this is an assertion condition
387 match_cbegroup - this is the start of an unlimited repeat
388 group that can match an empty string
389 match_tail_recursed - this is a tail_recursed group
390 rdepth the recursion depth
392 Returns: MATCH_MATCH if matched ) these values are >= 0
393 MATCH_NOMATCH if failed to match )
394 a negative PCRE_ERROR_xxx value if aborted by an error condition
395 (e.g. stopped by repeated call or recursion limit)
399 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
400 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
401 int flags, unsigned int rdepth)
403 /* These variables do not need to be preserved over recursion in this function,
404 so they can be ordinary variables in all cases. Mark some of them with
405 "register" because they are used a lot in loops. */
407 register int rrc; /* Returns from recursive calls */
408 register int i; /* Used for loops not involving calls to RMATCH() */
409 register unsigned int c; /* Character values not kept over RMATCH() calls */
410 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
412 BOOL minimize, possessive; /* Quantifier options */
414 /* When recursion is not being used, all "local" variables that have to be
415 preserved over calls to RMATCH() are part of a "frame" which is obtained from
416 heap storage. Set up the top-level frame here; others are obtained from the
417 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
420 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
421 frame->Xprevframe = NULL; /* Marks the top level */
423 /* Copy in the original argument variables */
426 frame->Xecode = ecode;
427 frame->Xmstart = mstart;
428 frame->Xoffset_top = offset_top;
430 frame->Xeptrb = eptrb;
431 frame->Xflags = flags;
432 frame->Xrdepth = rdepth;
434 /* This is where control jumps back to in order to effect "recursion" */
438 /* Macros make the argument variables come from the current frame */
440 #define eptr frame->Xeptr
441 #define ecode frame->Xecode
442 #define mstart frame->Xmstart
443 #define offset_top frame->Xoffset_top
444 #define ims frame->Xims
445 #define eptrb frame->Xeptrb
446 #define flags frame->Xflags
447 #define rdepth frame->Xrdepth
449 /* Ditto for the local variables */
452 #define charptr frame->Xcharptr
454 #define callpat frame->Xcallpat
455 #define data frame->Xdata
456 #define next frame->Xnext
457 #define pp frame->Xpp
458 #define prev frame->Xprev
459 #define saved_eptr frame->Xsaved_eptr
461 #define new_recursive frame->Xnew_recursive
463 #define cur_is_word frame->Xcur_is_word
464 #define condition frame->Xcondition
465 #define prev_is_word frame->Xprev_is_word
467 #define original_ims frame->Xoriginal_ims
470 #define prop_type frame->Xprop_type
471 #define prop_value frame->Xprop_value
472 #define prop_fail_result frame->Xprop_fail_result
473 #define prop_category frame->Xprop_category
474 #define prop_chartype frame->Xprop_chartype
475 #define prop_script frame->Xprop_script
476 #define oclength frame->Xoclength
477 #define occhars frame->Xocchars
480 #define ctype frame->Xctype
481 #define fc frame->Xfc
482 #define fi frame->Xfi
483 #define length frame->Xlength
484 #define max frame->Xmax
485 #define min frame->Xmin
486 #define number frame->Xnumber
487 #define offset frame->Xoffset
488 #define op frame->Xop
489 #define save_capture_last frame->Xsave_capture_last
490 #define save_offset1 frame->Xsave_offset1
491 #define save_offset2 frame->Xsave_offset2
492 #define save_offset3 frame->Xsave_offset3
493 #define stacksave frame->Xstacksave
495 #define newptrb frame->Xnewptrb
497 /* When recursion is being used, local variables are allocated on the stack and
498 get preserved during recursion in the normal way. In this environment, fi and
499 i, and fc and c, can be the same variables. */
501 #else /* NO_RECURSE not defined */
506 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
507 const uschar *charptr; /* in small blocks of the code. My normal */
508 #endif /* style of coding would have declared */
509 const uschar *callpat; /* them within each of those blocks. */
510 const uschar *data; /* However, in order to accommodate the */
511 const uschar *next; /* version of this code that uses an */
512 USPTR pp; /* external "stack" implemented on the */
513 const uschar *prev; /* heap, it is easier to declare them all */
514 USPTR saved_eptr; /* here, so the declarations can be cut */
515 /* out in a block. The only declarations */
516 recursion_info new_recursive; /* within blocks below are for variables */
517 /* that do not have to be preserved over */
518 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 unsigned long int original_ims;
527 int prop_fail_result;
542 int save_capture_last;
543 int save_offset1, save_offset2, save_offset3;
544 int stacksave[REC_STACK_SAVE_MAX];
547 #endif /* NO_RECURSE */
549 /* These statements are here to stop the compiler complaining about uninitialized
554 prop_fail_result = 0;
558 /* This label is used for tail recursion, which is used in a few cases even
559 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
560 used. Thanks to Ian Taylor for noticing this possibility and sending the
565 /* OK, now we can get on with the real code of the function. Recursive calls
566 are specified by the macro RMATCH and RRETURN is used to return. When
567 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
568 and a "return", respectively (possibly with some debugging if DEBUG is
569 defined). However, RMATCH isn't like a function call because it's quite a
570 complicated macro. It has to be used in one particular way. This shouldn't,
571 however, impact performance when true recursion is being used. */
574 utf8 = md->utf8; /* Local copy of the flag */
579 /* First check that we haven't called match() too many times, or that we
580 haven't exceeded the recursive call limit. */
582 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
583 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
585 original_ims = ims; /* Save for resetting on ')' */
587 /* At the start of a group with an unlimited repeat that may match an empty
588 string, the match_cbegroup flag is set. When this is the case, add the current
589 subject pointer to the chain of such remembered pointers, to be checked when we
590 hit the closing ket, in order to break infinite loops that match no characters.
591 When match() is called in other circumstances, don't add to the chain. If this
592 is a tail recursion, use a block from the workspace, as the one on the stack is
595 if ((flags & match_cbegroup) != 0)
598 if ((flags & match_tail_recursed) != 0)
600 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
601 p = md->eptrchain + md->eptrn++;
604 p->epb_saved_eptr = eptr;
609 /* Now start processing the opcodes. */
613 minimize = possessive = FALSE;
616 /* For partial matching, remember if we ever hit the end of the subject after
617 matching at least one subject character. */
620 eptr >= md->end_subject &&
626 /* Handle a capturing bracket. If there is space in the offset vector, save
627 the current subject position in the working slot at the top of the vector.
628 We mustn't change the current values of the data slot, because they may be
629 set from a previous iteration of this group, and be referred to by a
630 reference inside the group.
632 If the bracket fails to match, we need to restore this value and also the
633 values of the final offsets, in case they were set by a previous iteration
636 If there isn't enough space in the offset vector, treat this as if it were
637 a non-capturing bracket. Don't worry about setting the flag for the error
638 case here; that is handled in the code for KET. */
642 number = GET2(ecode, 1+LINK_SIZE);
643 offset = number << 1;
646 printf("start bracket %d\n", number);
648 pchars(eptr, 16, TRUE, md);
652 if (offset < md->offset_max)
654 save_offset1 = md->offset_vector[offset];
655 save_offset2 = md->offset_vector[offset+1];
656 save_offset3 = md->offset_vector[md->offset_end - number];
657 save_capture_last = md->capture_last;
659 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
660 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
662 flags = (op == OP_SCBRA)? match_cbegroup : 0;
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666 ims, eptrb, flags, RM1);
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 md->capture_last = save_capture_last;
669 ecode += GET(ecode, 1);
671 while (*ecode == OP_ALT);
673 DPRINTF(("bracket %d failed\n", number));
675 md->offset_vector[offset] = save_offset1;
676 md->offset_vector[offset+1] = save_offset2;
677 md->offset_vector[md->offset_end - number] = save_offset3;
679 RRETURN(MATCH_NOMATCH);
682 /* Insufficient room for saving captured contents. Treat as a non-capturing
685 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
687 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
688 final alternative within the brackets, we would return the result of a
689 recursive call to match() whatever happened. We can reduce stack usage by
690 turning this into a tail recursion. */
694 DPRINTF(("start non-capturing bracket\n"));
695 flags = (op >= OP_SBRA)? match_cbegroup : 0;
698 if (ecode[GET(ecode, 1)] != OP_ALT)
700 ecode += _pcre_OP_lengths[*ecode];
701 flags |= match_tail_recursed;
702 DPRINTF(("bracket 0 tail recursion\n"));
706 /* For non-final alternatives, continue the loop for a NOMATCH result;
709 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
711 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
712 ecode += GET(ecode, 1);
714 /* Control never reaches here. */
716 /* Conditional group: compilation checked that there are no more than
717 two branches. If the condition is false, skipping the first branch takes us
718 past the end if there is only one branch, but that's OK because that is
719 exactly what going to the ket would do. As there is only one branch to be
720 obeyed, we can use tail recursion to avoid using another stack frame. */
724 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
726 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
727 condition = md->recursive != NULL &&
728 (offset == RREF_ANY || offset == md->recursive->group_num);
729 ecode += condition? 3 : GET(ecode, 1);
732 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
734 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
735 condition = offset < offset_top && md->offset_vector[offset] >= 0;
736 ecode += condition? 3 : GET(ecode, 1);
739 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
742 ecode += GET(ecode, 1);
745 /* The condition is an assertion. Call match() to evaluate it - setting
746 the final argument match_condassert causes it to stop at the end of an
751 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
752 match_condassert, RM3);
753 if (rrc == MATCH_MATCH)
756 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
757 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
759 else if (rrc != MATCH_NOMATCH)
761 RRETURN(rrc); /* Need braces because of following else */
766 ecode += GET(ecode, 1);
770 /* We are now at the branch that is to be obeyed. As there is only one,
771 we can use tail recursion to avoid using another stack frame. If the second
772 alternative doesn't exist, we can just plough on. */
774 if (condition || *ecode == OP_ALT)
776 ecode += 1 + LINK_SIZE;
777 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
782 ecode += 1 + LINK_SIZE;
787 /* End of the pattern. If we are in a top-level recursion, we should
788 restore the offsets appropriately and continue from after the call. */
791 if (md->recursive != NULL && md->recursive->group_num == 0)
793 recursion_info *rec = md->recursive;
794 DPRINTF(("End of pattern in a (?0) recursion\n"));
795 md->recursive = rec->prevrec;
796 memmove(md->offset_vector, rec->offset_save,
797 rec->saved_max * sizeof(int));
798 mstart = rec->save_start;
800 ecode = rec->after_call;
804 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
805 string - backtracking will then try other alternatives, if any. */
807 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
808 md->end_match_ptr = eptr; /* Record where we ended */
809 md->end_offset_top = offset_top; /* and how many extracts were taken */
810 md->start_match_ptr = mstart; /* and the start (\K can modify) */
811 RRETURN(MATCH_MATCH);
813 /* Change option settings */
818 DPRINTF(("ims set to %02lx\n", ims));
821 /* Assertion brackets. Check the alternative branches in turn - the
822 matching won't pass the KET for an assertion. If any one branch matches,
823 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
824 start of each branch to move the current point backwards, so the code at
825 this level is identical to the lookahead case. */
831 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
833 if (rrc == MATCH_MATCH) break;
834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
835 ecode += GET(ecode, 1);
837 while (*ecode == OP_ALT);
838 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
840 /* If checking an assertion for a condition, return MATCH_MATCH. */
842 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
844 /* Continue from after the assertion, updating the offsets high water
845 mark, since extracts may have been taken during the assertion. */
847 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
848 ecode += 1 + LINK_SIZE;
849 offset_top = md->end_offset_top;
852 /* Negative assertion: all branches must fail to match */
855 case OP_ASSERTBACK_NOT:
858 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
860 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
862 ecode += GET(ecode,1);
864 while (*ecode == OP_ALT);
866 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
868 ecode += 1 + LINK_SIZE;
871 /* Move the subject pointer back. This occurs only at the start of
872 each branch of a lookbehind assertion. If we are too close to the start to
873 move back, this match function fails. When working with UTF-8 we move
874 back a number of characters, not bytes. */
884 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
891 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
894 eptr -= GET(ecode, 1);
895 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
898 /* Skip to next op code */
900 ecode += 1 + LINK_SIZE;
903 /* The callout item calls an external function, if one is provided, passing
904 details of the match so far. This is mainly for debugging, though the
905 function is able to force a failure. */
908 if (pcre_callout != NULL)
910 pcre_callout_block cb;
911 cb.version = 1; /* Version 1 of the callout block */
912 cb.callout_number = ecode[1];
913 cb.offset_vector = md->offset_vector;
914 cb.subject = (PCRE_SPTR)md->start_subject;
915 cb.subject_length = md->end_subject - md->start_subject;
916 cb.start_match = mstart - md->start_subject;
917 cb.current_position = eptr - md->start_subject;
918 cb.pattern_position = GET(ecode, 2);
919 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
920 cb.capture_top = offset_top/2;
921 cb.capture_last = md->capture_last;
922 cb.callout_data = md->callout_data;
923 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
924 if (rrc < 0) RRETURN(rrc);
926 ecode += 2 + 2*LINK_SIZE;
929 /* Recursion either matches the current regex, or some subexpression. The
930 offset data is the offset to the starting bracket from the start of the
931 whole pattern. (This is so that it works from duplicated subpatterns.)
933 If there are any capturing brackets started but not finished, we have to
934 save their starting points and reinstate them after the recursion. However,
935 we don't know how many such there are (offset_top records the completed
936 total) so we just have to save all the potential data. There may be up to
937 65535 such values, which is too large to put on the stack, but using malloc
938 for small numbers seems expensive. As a compromise, the stack is used when
939 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
940 is used. A problem is what to do if the malloc fails ... there is no way of
941 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
942 values on the stack, and accept that the rest may be wrong.
944 There are also other values that have to be saved. We use a chained
945 sequence of blocks that actually live on the stack. Thanks to Robin Houston
946 for the original version of this logic. */
950 callpat = md->start_code + GET(ecode, 1);
951 new_recursive.group_num = (callpat == md->start_code)? 0 :
952 GET2(callpat, 1 + LINK_SIZE);
954 /* Add to "recursing stack" */
956 new_recursive.prevrec = md->recursive;
957 md->recursive = &new_recursive;
959 /* Find where to continue from afterwards */
961 ecode += 1 + LINK_SIZE;
962 new_recursive.after_call = ecode;
964 /* Now save the offset data. */
966 new_recursive.saved_max = md->offset_end;
967 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
968 new_recursive.offset_save = stacksave;
971 new_recursive.offset_save =
972 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
973 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
976 memcpy(new_recursive.offset_save, md->offset_vector,
977 new_recursive.saved_max * sizeof(int));
978 new_recursive.save_start = mstart;
981 /* OK, now we can do the recursion. For each top-level alternative we
982 restore the offset and recursion data. */
984 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
985 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
988 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
989 md, ims, eptrb, flags, RM6);
990 if (rrc == MATCH_MATCH)
992 DPRINTF(("Recursion matched\n"));
993 md->recursive = new_recursive.prevrec;
994 if (new_recursive.offset_save != stacksave)
995 (pcre_free)(new_recursive.offset_save);
996 RRETURN(MATCH_MATCH);
998 else if (rrc != MATCH_NOMATCH)
1000 DPRINTF(("Recursion gave error %d\n", rrc));
1004 md->recursive = &new_recursive;
1005 memcpy(md->offset_vector, new_recursive.offset_save,
1006 new_recursive.saved_max * sizeof(int));
1007 callpat += GET(callpat, 1);
1009 while (*callpat == OP_ALT);
1011 DPRINTF(("Recursion didn't match\n"));
1012 md->recursive = new_recursive.prevrec;
1013 if (new_recursive.offset_save != stacksave)
1014 (pcre_free)(new_recursive.offset_save);
1015 RRETURN(MATCH_NOMATCH);
1017 /* Control never reaches here */
1019 /* "Once" brackets are like assertion brackets except that after a match,
1020 the point in the subject string is not moved back. Thus there can never be
1021 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1022 Check the alternative branches in turn - the matching won't pass the KET
1023 for this kind of subpattern. If any one branch matches, we carry on as at
1024 the end of a normal bracket, leaving the subject pointer. */
1032 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1034 if (rrc == MATCH_MATCH) break;
1035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1036 ecode += GET(ecode,1);
1038 while (*ecode == OP_ALT);
1040 /* If hit the end of the group (which could be repeated), fail */
1042 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1044 /* Continue as from after the assertion, updating the offsets high water
1045 mark, since extracts may have been taken. */
1047 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1049 offset_top = md->end_offset_top;
1050 eptr = md->end_match_ptr;
1052 /* For a non-repeating ket, just continue at this level. This also
1053 happens for a repeating ket if no characters were matched in the group.
1054 This is the forcible breaking of infinite loops as implemented in Perl
1055 5.005. If there is an options reset, it will get obeyed in the normal
1056 course of events. */
1058 if (*ecode == OP_KET || eptr == saved_eptr)
1060 ecode += 1+LINK_SIZE;
1064 /* The repeating kets try the rest of the pattern or restart from the
1065 preceding bracket, in the appropriate order. The second "call" of match()
1066 uses tail recursion, to avoid using another stack frame. We need to reset
1067 any options that changed within the bracket before re-running it, so
1068 check the next opcode. */
1070 if (ecode[1+LINK_SIZE] == OP_OPT)
1072 ims = (ims & ~PCRE_IMS) | ecode[4];
1073 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1076 if (*ecode == OP_KETRMIN)
1078 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1082 flags = match_tail_recursed;
1085 else /* OP_KETRMAX */
1087 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1088 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1089 ecode += 1 + LINK_SIZE;
1090 flags = match_tail_recursed;
1093 /* Control never gets here */
1095 /* An alternation is the end of a branch; scan along to find the end of the
1096 bracketed group and go to there. */
1099 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1102 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1103 that it may occur zero times. It may repeat infinitely, or not at all -
1104 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1105 repeat limits are compiled as a number of copies, with the optional ones
1106 preceded by BRAZERO or BRAMINZERO. */
1111 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1113 do next += GET(next,1); while (*next == OP_ALT);
1114 ecode = next + 1 + LINK_SIZE;
1121 do next += GET(next, 1); while (*next == OP_ALT);
1122 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1128 /* End of a group, repeated or non-repeating. */
1133 prev = ecode - GET(ecode, 1);
1135 /* If this was a group that remembered the subject start, in order to break
1136 infinite repeats of empty string matches, retrieve the subject start from
1137 the chain. Otherwise, set it NULL. */
1139 if (*prev >= OP_SBRA)
1141 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1142 eptrb = eptrb->epb_prev; /* Backup to previous group */
1144 else saved_eptr = NULL;
1146 /* If we are at the end of an assertion group, stop matching and return
1147 MATCH_MATCH, but record the current high water mark for use by positive
1148 assertions. Do this also for the "once" (atomic) groups. */
1150 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1151 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1154 md->end_match_ptr = eptr; /* For ONCE */
1155 md->end_offset_top = offset_top;
1156 RRETURN(MATCH_MATCH);
1159 /* For capturing groups we have to check the group number back at the start
1160 and if necessary complete handling an extraction by setting the offsets and
1161 bumping the high water mark. Note that whole-pattern recursion is coded as
1162 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1163 when the OP_END is reached. Other recursion is handled here. */
1165 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1167 number = GET2(prev, 1+LINK_SIZE);
1168 offset = number << 1;
1171 printf("end bracket %d", number);
1175 md->capture_last = number;
1176 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1178 md->offset_vector[offset] =
1179 md->offset_vector[md->offset_end - number];
1180 md->offset_vector[offset+1] = eptr - md->start_subject;
1181 if (offset_top <= offset) offset_top = offset + 2;
1184 /* Handle a recursively called group. Restore the offsets
1185 appropriately and continue from after the call. */
1187 if (md->recursive != NULL && md->recursive->group_num == number)
1189 recursion_info *rec = md->recursive;
1190 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1191 md->recursive = rec->prevrec;
1192 mstart = rec->save_start;
1193 memcpy(md->offset_vector, rec->offset_save,
1194 rec->saved_max * sizeof(int));
1195 ecode = rec->after_call;
1201 /* For both capturing and non-capturing groups, reset the value of the ims
1202 flags, in case they got changed during the group. */
1205 DPRINTF(("ims reset to %02lx\n", ims));
1207 /* For a non-repeating ket, just continue at this level. This also
1208 happens for a repeating ket if no characters were matched in the group.
1209 This is the forcible breaking of infinite loops as implemented in Perl
1210 5.005. If there is an options reset, it will get obeyed in the normal
1211 course of events. */
1213 if (*ecode == OP_KET || eptr == saved_eptr)
1215 ecode += 1 + LINK_SIZE;
1219 /* The repeating kets try the rest of the pattern or restart from the
1220 preceding bracket, in the appropriate order. In the second case, we can use
1221 tail recursion to avoid using another stack frame. */
1223 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1225 if (*ecode == OP_KETRMIN)
1227 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1229 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1231 flags |= match_tail_recursed;
1234 else /* OP_KETRMAX */
1236 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1238 ecode += 1 + LINK_SIZE;
1239 flags = match_tail_recursed;
1242 /* Control never gets here */
1244 /* Start of subject unless notbol, or after internal newline if multiline */
1247 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1248 if ((ims & PCRE_MULTILINE) != 0)
1250 if (eptr != md->start_subject &&
1251 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1252 RRETURN(MATCH_NOMATCH);
1256 /* ... else fall through */
1258 /* Start of subject assertion */
1261 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1265 /* Start of match assertion */
1268 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1272 /* Reset the start of match point */
1279 /* Assert before internal newline if multiline, or before a terminating
1280 newline unless endonly is set, else end of subject unless noteol is set. */
1283 if ((ims & PCRE_MULTILINE) != 0)
1285 if (eptr < md->end_subject)
1286 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1288 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1294 if (md->noteol) RRETURN(MATCH_NOMATCH);
1297 if (eptr != md->end_subject &&
1298 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1299 RRETURN(MATCH_NOMATCH);
1304 /* ... else fall through for endonly */
1306 /* End of subject assertion (\z) */
1309 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1313 /* End of subject or ending \n assertion (\Z) */
1316 if (eptr != md->end_subject &&
1317 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1318 RRETURN(MATCH_NOMATCH);
1322 /* Word boundary assertions */
1324 case OP_NOT_WORD_BOUNDARY:
1325 case OP_WORD_BOUNDARY:
1328 /* Find out if the previous and current characters are "word" characters.
1329 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1330 be "non-word" characters. */
1335 if (eptr == md->start_subject) prev_is_word = FALSE; else
1337 const uschar *lastptr = eptr - 1;
1338 while((*lastptr & 0xc0) == 0x80) lastptr--;
1339 GETCHAR(c, lastptr);
1340 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1342 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1345 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1351 /* More streamlined when not in UTF-8 mode */
1354 prev_is_word = (eptr != md->start_subject) &&
1355 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1356 cur_is_word = (eptr < md->end_subject) &&
1357 ((md->ctypes[*eptr] & ctype_word) != 0);
1360 /* Now see if the situation is what we want */
1362 if ((*ecode++ == OP_WORD_BOUNDARY)?
1363 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1364 RRETURN(MATCH_NOMATCH);
1368 /* Match a single character type; inline for speed */
1371 if ((ims & PCRE_DOTALL) == 0)
1373 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1375 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1377 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1381 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1382 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1385 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1390 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1391 GETCHARINCTEST(c, eptr);
1396 (md->ctypes[c] & ctype_digit) != 0
1398 RRETURN(MATCH_NOMATCH);
1403 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1404 GETCHARINCTEST(c, eptr);
1409 (md->ctypes[c] & ctype_digit) == 0
1411 RRETURN(MATCH_NOMATCH);
1415 case OP_NOT_WHITESPACE:
1416 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1417 GETCHARINCTEST(c, eptr);
1422 (md->ctypes[c] & ctype_space) != 0
1424 RRETURN(MATCH_NOMATCH);
1429 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1430 GETCHARINCTEST(c, eptr);
1435 (md->ctypes[c] & ctype_space) == 0
1437 RRETURN(MATCH_NOMATCH);
1441 case OP_NOT_WORDCHAR:
1442 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1443 GETCHARINCTEST(c, eptr);
1448 (md->ctypes[c] & ctype_word) != 0
1450 RRETURN(MATCH_NOMATCH);
1455 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1456 GETCHARINCTEST(c, eptr);
1461 (md->ctypes[c] & ctype_word) == 0
1463 RRETURN(MATCH_NOMATCH);
1468 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1469 GETCHARINCTEST(c, eptr);
1472 default: RRETURN(MATCH_NOMATCH);
1474 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1488 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1489 GETCHARINCTEST(c, eptr);
1494 case 0x20: /* SPACE */
1495 case 0xa0: /* NBSP */
1496 case 0x1680: /* OGHAM SPACE MARK */
1497 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1498 case 0x2000: /* EN QUAD */
1499 case 0x2001: /* EM QUAD */
1500 case 0x2002: /* EN SPACE */
1501 case 0x2003: /* EM SPACE */
1502 case 0x2004: /* THREE-PER-EM SPACE */
1503 case 0x2005: /* FOUR-PER-EM SPACE */
1504 case 0x2006: /* SIX-PER-EM SPACE */
1505 case 0x2007: /* FIGURE SPACE */
1506 case 0x2008: /* PUNCTUATION SPACE */
1507 case 0x2009: /* THIN SPACE */
1508 case 0x200A: /* HAIR SPACE */
1509 case 0x202f: /* NARROW NO-BREAK SPACE */
1510 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1511 case 0x3000: /* IDEOGRAPHIC SPACE */
1512 RRETURN(MATCH_NOMATCH);
1518 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1519 GETCHARINCTEST(c, eptr);
1522 default: RRETURN(MATCH_NOMATCH);
1524 case 0x20: /* SPACE */
1525 case 0xa0: /* NBSP */
1526 case 0x1680: /* OGHAM SPACE MARK */
1527 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1528 case 0x2000: /* EN QUAD */
1529 case 0x2001: /* EM QUAD */
1530 case 0x2002: /* EN SPACE */
1531 case 0x2003: /* EM SPACE */
1532 case 0x2004: /* THREE-PER-EM SPACE */
1533 case 0x2005: /* FOUR-PER-EM SPACE */
1534 case 0x2006: /* SIX-PER-EM SPACE */
1535 case 0x2007: /* FIGURE SPACE */
1536 case 0x2008: /* PUNCTUATION SPACE */
1537 case 0x2009: /* THIN SPACE */
1538 case 0x200A: /* HAIR SPACE */
1539 case 0x202f: /* NARROW NO-BREAK SPACE */
1540 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1541 case 0x3000: /* IDEOGRAPHIC SPACE */
1548 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1549 GETCHARINCTEST(c, eptr);
1557 case 0x85: /* NEL */
1558 case 0x2028: /* LINE SEPARATOR */
1559 case 0x2029: /* PARAGRAPH SEPARATOR */
1560 RRETURN(MATCH_NOMATCH);
1566 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1567 GETCHARINCTEST(c, eptr);
1570 default: RRETURN(MATCH_NOMATCH);
1575 case 0x85: /* NEL */
1576 case 0x2028: /* LINE SEPARATOR */
1577 case 0x2029: /* PARAGRAPH SEPARATOR */
1584 /* Check the next character by Unicode property. We will get here only
1585 if the support is in the binary; otherwise a compile-time error occurs. */
1589 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1590 GETCHARINCTEST(c, eptr);
1592 int chartype, script;
1593 int category = _pcre_ucp_findprop(c, &chartype, &script);
1598 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1602 if ((chartype == ucp_Lu ||
1603 chartype == ucp_Ll ||
1604 chartype == ucp_Lt) == (op == OP_NOTPROP))
1605 RRETURN(MATCH_NOMATCH);
1609 if ((ecode[2] != category) == (op == OP_PROP))
1610 RRETURN(MATCH_NOMATCH);
1614 if ((ecode[2] != chartype) == (op == OP_PROP))
1615 RRETURN(MATCH_NOMATCH);
1619 if ((ecode[2] != script) == (op == OP_PROP))
1620 RRETURN(MATCH_NOMATCH);
1624 RRETURN(PCRE_ERROR_INTERNAL);
1631 /* Match an extended Unicode sequence. We will get here only if the support
1632 is in the binary; otherwise a compile-time error occurs. */
1635 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1636 GETCHARINCTEST(c, eptr);
1638 int chartype, script;
1639 int category = _pcre_ucp_findprop(c, &chartype, &script);
1640 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1641 while (eptr < md->end_subject)
1644 if (!utf8) c = *eptr; else
1646 GETCHARLEN(c, eptr, len);
1648 category = _pcre_ucp_findprop(c, &chartype, &script);
1649 if (category != ucp_M) break;
1658 /* Match a back reference, possibly repeatedly. Look past the end of the
1659 item to see if there is repeat information following. The code is similar
1660 to that for character classes, but repeated for efficiency. Then obey
1661 similar code to character type repeats - written out again for speed.
1662 However, if the referenced string is the empty string, always treat
1663 it as matched, any number of times (otherwise there could be infinite
1668 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1669 ecode += 3; /* Advance past item */
1671 /* If the reference is unset, set the length to be longer than the amount
1672 of subject left; this ensures that every attempt at a match fails. We
1673 can't just fail here, because of the possibility of quantifiers with zero
1676 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1677 md->end_subject - eptr + 1 :
1678 md->offset_vector[offset+1] - md->offset_vector[offset];
1680 /* Set up for repetition, or handle the non-repeated case */
1690 c = *ecode++ - OP_CRSTAR;
1691 minimize = (c & 1) != 0;
1692 min = rep_min[c]; /* Pick up values from tables; */
1693 max = rep_max[c]; /* zero for max => infinity */
1694 if (max == 0) max = INT_MAX;
1699 minimize = (*ecode == OP_CRMINRANGE);
1700 min = GET2(ecode, 1);
1701 max = GET2(ecode, 3);
1702 if (max == 0) max = INT_MAX;
1706 default: /* No repeat follows */
1707 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1709 continue; /* With the main loop */
1712 /* If the length of the reference is zero, just continue with the
1715 if (length == 0) continue;
1717 /* First, ensure the minimum number of matches are present. We get back
1718 the length of the reference string explicitly rather than passing the
1719 address of eptr, so that eptr can be a register variable. */
1721 for (i = 1; i <= min; i++)
1723 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1727 /* If min = max, continue at the same level without recursion.
1728 They are not both allowed to be zero. */
1730 if (min == max) continue;
1732 /* If minimizing, keep trying and advancing the pointer */
1736 for (fi = min;; fi++)
1738 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1740 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1741 RRETURN(MATCH_NOMATCH);
1744 /* Control never gets here */
1747 /* If maximizing, find the longest string and work backwards */
1752 for (i = min; i < max; i++)
1754 if (!match_ref(offset, eptr, length, md, ims)) break;
1759 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1763 RRETURN(MATCH_NOMATCH);
1766 /* Control never gets here */
1770 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1771 used when all the characters in the class have values in the range 0-255,
1772 and either the matching is caseful, or the characters are in the range
1773 0-127 when UTF-8 processing is enabled. The only difference between
1774 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1777 First, look past the end of the item to see if there is repeat information
1778 following. Then obey similar code to character type repeats - written out
1784 data = ecode + 1; /* Save for matching */
1785 ecode += 33; /* Advance past the item */
1795 c = *ecode++ - OP_CRSTAR;
1796 minimize = (c & 1) != 0;
1797 min = rep_min[c]; /* Pick up values from tables; */
1798 max = rep_max[c]; /* zero for max => infinity */
1799 if (max == 0) max = INT_MAX;
1804 minimize = (*ecode == OP_CRMINRANGE);
1805 min = GET2(ecode, 1);
1806 max = GET2(ecode, 3);
1807 if (max == 0) max = INT_MAX;
1811 default: /* No repeat follows */
1816 /* First, ensure the minimum number of matches are present. */
1822 for (i = 1; i <= min; i++)
1824 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1825 GETCHARINC(c, eptr);
1828 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1832 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1838 /* Not UTF-8 mode */
1840 for (i = 1; i <= min; i++)
1842 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1844 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1848 /* If max == min we can continue with the main loop without the
1851 if (min == max) continue;
1853 /* If minimizing, keep testing the rest of the expression and advancing
1854 the pointer while it matches the class. */
1862 for (fi = min;; fi++)
1864 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1866 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1867 GETCHARINC(c, eptr);
1870 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1874 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1880 /* Not UTF-8 mode */
1882 for (fi = min;; fi++)
1884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1888 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1891 /* Control never gets here */
1894 /* If maximizing, find the longest possible run, then work backwards. */
1904 for (i = min; i < max; i++)
1907 if (eptr >= md->end_subject) break;
1908 GETCHARLEN(c, eptr, len);
1911 if (op == OP_CLASS) break;
1915 if ((data[c/8] & (1 << (c&7))) == 0) break;
1921 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1922 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1923 if (eptr-- == pp) break; /* Stop if tried at original pos */
1929 /* Not UTF-8 mode */
1931 for (i = min; i < max; i++)
1933 if (eptr >= md->end_subject) break;
1935 if ((data[c/8] & (1 << (c&7))) == 0) break;
1940 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1946 RRETURN(MATCH_NOMATCH);
1949 /* Control never gets here */
1952 /* Match an extended character class. This opcode is encountered only
1953 in UTF-8 mode, because that's the only time it is compiled. */
1958 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1959 ecode += GET(ecode, 1); /* Advance past the item */
1969 c = *ecode++ - OP_CRSTAR;
1970 minimize = (c & 1) != 0;
1971 min = rep_min[c]; /* Pick up values from tables; */
1972 max = rep_max[c]; /* zero for max => infinity */
1973 if (max == 0) max = INT_MAX;
1978 minimize = (*ecode == OP_CRMINRANGE);
1979 min = GET2(ecode, 1);
1980 max = GET2(ecode, 3);
1981 if (max == 0) max = INT_MAX;
1985 default: /* No repeat follows */
1990 /* First, ensure the minimum number of matches are present. */
1992 for (i = 1; i <= min; i++)
1994 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1995 GETCHARINC(c, eptr);
1996 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1999 /* If max == min we can continue with the main loop without the
2002 if (min == max) continue;
2004 /* If minimizing, keep testing the rest of the expression and advancing
2005 the pointer while it matches the class. */
2009 for (fi = min;; fi++)
2011 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2013 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2014 GETCHARINC(c, eptr);
2015 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2017 /* Control never gets here */
2020 /* If maximizing, find the longest possible run, then work backwards. */
2025 for (i = min; i < max; i++)
2028 if (eptr >= md->end_subject) break;
2029 GETCHARLEN(c, eptr, len);
2030 if (!_pcre_xclass(c, data)) break;
2035 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2037 if (eptr-- == pp) break; /* Stop if tried at original pos */
2040 RRETURN(MATCH_NOMATCH);
2043 /* Control never gets here */
2045 #endif /* End of XCLASS */
2047 /* Match a single character, casefully */
2055 GETCHARLEN(fc, ecode, length);
2056 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2057 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2062 /* Non-UTF-8 mode */
2064 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2065 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2070 /* Match a single character, caselessly */
2078 GETCHARLEN(fc, ecode, length);
2080 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2082 /* If the pattern character's value is < 128, we have only one byte, and
2083 can use the fast lookup table. */
2087 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2090 /* Otherwise we must pick up the subject character */
2095 GETCHARINC(dc, eptr);
2098 /* If we have Unicode property support, we can use it to test the other
2099 case of the character, if there is one. */
2104 if (dc != _pcre_ucp_othercase(fc))
2106 RRETURN(MATCH_NOMATCH);
2111 #endif /* SUPPORT_UTF8 */
2113 /* Non-UTF-8 mode */
2115 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2116 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2121 /* Match a single character repeatedly. */
2124 min = max = GET2(ecode, 1);
2135 max = GET2(ecode, 1);
2136 minimize = *ecode == OP_MINUPTO;
2167 c = *ecode++ - OP_STAR;
2168 minimize = (c & 1) != 0;
2169 min = rep_min[c]; /* Pick up values from tables; */
2170 max = rep_max[c]; /* zero for max => infinity */
2171 if (max == 0) max = INT_MAX;
2173 /* Common code for all repeated single-character matches. We can give
2174 up quickly if there are fewer than the minimum number of characters left in
2183 GETCHARLEN(fc, ecode, length);
2184 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2187 /* Handle multibyte character matching specially here. There is
2188 support for caseless matching if UCP support is present. */
2193 unsigned int othercase;
2194 if ((ims & PCRE_CASELESS) != 0 &&
2195 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2196 oclength = _pcre_ord2utf8(othercase, occhars);
2198 #endif /* SUPPORT_UCP */
2200 for (i = 1; i <= min; i++)
2202 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2204 /* Need braces because of following else */
2205 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2208 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2211 #else /* without SUPPORT_UCP */
2212 else { RRETURN(MATCH_NOMATCH); }
2213 #endif /* SUPPORT_UCP */
2216 if (min == max) continue;
2220 for (fi = min;; fi++)
2222 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2224 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2225 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2227 /* Need braces because of following else */
2228 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2231 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2234 #else /* without SUPPORT_UCP */
2235 else { RRETURN (MATCH_NOMATCH); }
2236 #endif /* SUPPORT_UCP */
2238 /* Control never gets here */
2244 for (i = min; i < max; i++)
2246 if (eptr > md->end_subject - length) break;
2247 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2249 else if (oclength == 0) break;
2252 if (memcmp(eptr, occhars, oclength) != 0) break;
2255 #else /* without SUPPORT_UCP */
2257 #endif /* SUPPORT_UCP */
2260 if (possessive) continue;
2263 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2265 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2269 #else /* without SUPPORT_UCP */
2271 #endif /* SUPPORT_UCP */
2274 /* Control never gets here */
2277 /* If the length of a UTF-8 character is 1, we fall through here, and
2278 obey the code as for non-UTF-8 characters below, though in this case the
2279 value of fc will always be < 128. */
2282 #endif /* SUPPORT_UTF8 */
2284 /* When not in UTF-8 mode, load a single-byte character. */
2286 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2290 /* The value of fc at this point is always less than 256, though we may or
2291 may not be in UTF-8 mode. The code is duplicated for the caseless and
2292 caseful cases, for speed, since matching characters is likely to be quite
2293 common. First, ensure the minimum number of matches are present. If min =
2294 max, continue at the same level without recursing. Otherwise, if
2295 minimizing, keep trying the rest of the expression and advancing one
2296 matching character if failing, up to the maximum. Alternatively, if
2297 maximizing, find the maximum number of characters and work backwards. */
2299 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2302 if ((ims & PCRE_CASELESS) != 0)
2305 for (i = 1; i <= min; i++)
2306 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2307 if (min == max) continue;
2310 for (fi = min;; fi++)
2312 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2313 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2314 if (fi >= max || eptr >= md->end_subject ||
2315 fc != md->lcc[*eptr++])
2316 RRETURN(MATCH_NOMATCH);
2318 /* Control never gets here */
2323 for (i = min; i < max; i++)
2325 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2328 if (possessive) continue;
2331 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2333 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2335 RRETURN(MATCH_NOMATCH);
2337 /* Control never gets here */
2340 /* Caseful comparisons (includes all multi-byte characters) */
2344 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2345 if (min == max) continue;
2348 for (fi = min;; fi++)
2350 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2352 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2353 RRETURN(MATCH_NOMATCH);
2355 /* Control never gets here */
2360 for (i = min; i < max; i++)
2362 if (eptr >= md->end_subject || fc != *eptr) break;
2365 if (possessive) continue;
2368 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372 RRETURN(MATCH_NOMATCH);
2375 /* Control never gets here */
2377 /* Match a negated single one-byte character. The character we are
2378 checking can be multibyte. */
2381 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2383 GETCHARINCTEST(c, eptr);
2384 if ((ims & PCRE_CASELESS) != 0)
2390 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2394 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2398 /* Match a negated single one-byte character repeatedly. This is almost a
2399 repeat of the code for a repeated single character, but I haven't found a
2400 nice way of commoning these up that doesn't require a test of the
2401 positive/negative option for each character match. Maybe that wouldn't add
2402 very much to the time taken, but character matching *is* what this is all
2406 min = max = GET2(ecode, 1);
2413 max = GET2(ecode, 1);
2414 minimize = *ecode == OP_NOTMINUPTO;
2432 case OP_NOTPOSQUERY:
2442 max = GET2(ecode, 1);
2451 case OP_NOTMINQUERY:
2452 c = *ecode++ - OP_NOTSTAR;
2453 minimize = (c & 1) != 0;
2454 min = rep_min[c]; /* Pick up values from tables; */
2455 max = rep_max[c]; /* zero for max => infinity */
2456 if (max == 0) max = INT_MAX;
2458 /* Common code for all repeated single-byte matches. We can give up quickly
2459 if there are fewer than the minimum number of bytes left in the
2463 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2466 /* The code is duplicated for the caseless and caseful cases, for speed,
2467 since matching characters is likely to be quite common. First, ensure the
2468 minimum number of matches are present. If min = max, continue at the same
2469 level without recursing. Otherwise, if minimizing, keep trying the rest of
2470 the expression and advancing one matching character if failing, up to the
2471 maximum. Alternatively, if maximizing, find the maximum number of
2472 characters and work backwards. */
2474 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2477 if ((ims & PCRE_CASELESS) != 0)
2485 register unsigned int d;
2486 for (i = 1; i <= min; i++)
2488 GETCHARINC(d, eptr);
2489 if (d < 256) d = md->lcc[d];
2490 if (fc == d) RRETURN(MATCH_NOMATCH);
2496 /* Not UTF-8 mode */
2498 for (i = 1; i <= min; i++)
2499 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2502 if (min == max) continue;
2510 register unsigned int d;
2511 for (fi = min;; fi++)
2513 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2514 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2515 GETCHARINC(d, eptr);
2516 if (d < 256) d = md->lcc[d];
2517 if (fi >= max || eptr >= md->end_subject || fc == d)
2518 RRETURN(MATCH_NOMATCH);
2523 /* Not UTF-8 mode */
2525 for (fi = min;; fi++)
2527 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2529 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2530 RRETURN(MATCH_NOMATCH);
2533 /* Control never gets here */
2546 register unsigned int d;
2547 for (i = min; i < max; i++)
2550 if (eptr >= md->end_subject) break;
2551 GETCHARLEN(d, eptr, len);
2552 if (d < 256) d = md->lcc[d];
2556 if (possessive) continue;
2559 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2561 if (eptr-- == pp) break; /* Stop if tried at original pos */
2567 /* Not UTF-8 mode */
2569 for (i = min; i < max; i++)
2571 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2574 if (possessive) continue;
2577 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2583 RRETURN(MATCH_NOMATCH);
2585 /* Control never gets here */
2588 /* Caseful comparisons */
2596 register unsigned int d;
2597 for (i = 1; i <= min; i++)
2599 GETCHARINC(d, eptr);
2600 if (fc == d) RRETURN(MATCH_NOMATCH);
2605 /* Not UTF-8 mode */
2607 for (i = 1; i <= min; i++)
2608 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2611 if (min == max) continue;
2619 register unsigned int d;
2620 for (fi = min;; fi++)
2622 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2624 GETCHARINC(d, eptr);
2625 if (fi >= max || eptr >= md->end_subject || fc == d)
2626 RRETURN(MATCH_NOMATCH);
2631 /* Not UTF-8 mode */
2633 for (fi = min;; fi++)
2635 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2637 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2638 RRETURN(MATCH_NOMATCH);
2641 /* Control never gets here */
2654 register unsigned int d;
2655 for (i = min; i < max; i++)
2658 if (eptr >= md->end_subject) break;
2659 GETCHARLEN(d, eptr, len);
2663 if (possessive) continue;
2666 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668 if (eptr-- == pp) break; /* Stop if tried at original pos */
2674 /* Not UTF-8 mode */
2676 for (i = min; i < max; i++)
2678 if (eptr >= md->end_subject || fc == *eptr) break;
2681 if (possessive) continue;
2684 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2690 RRETURN(MATCH_NOMATCH);
2693 /* Control never gets here */
2695 /* Match a single character type repeatedly; several different opcodes
2696 share code. This is very similar to the code for single characters, but we
2697 repeat it in the interests of efficiency. */
2700 min = max = GET2(ecode, 1);
2706 case OP_TYPEMINUPTO:
2708 max = GET2(ecode, 1);
2709 minimize = *ecode == OP_TYPEMINUPTO;
2713 case OP_TYPEPOSSTAR:
2720 case OP_TYPEPOSPLUS:
2727 case OP_TYPEPOSQUERY:
2734 case OP_TYPEPOSUPTO:
2737 max = GET2(ecode, 1);
2742 case OP_TYPEMINSTAR:
2744 case OP_TYPEMINPLUS:
2746 case OP_TYPEMINQUERY:
2747 c = *ecode++ - OP_TYPESTAR;
2748 minimize = (c & 1) != 0;
2749 min = rep_min[c]; /* Pick up values from tables; */
2750 max = rep_max[c]; /* zero for max => infinity */
2751 if (max == 0) max = INT_MAX;
2753 /* Common code for all repeated single character type matches. Note that
2754 in UTF-8 mode, '.' matches a character of any length, but for the other
2755 character types, the valid characters are all one-byte long. */
2758 ctype = *ecode++; /* Code for the character type */
2761 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2763 prop_fail_result = ctype == OP_NOTPROP;
2764 prop_type = *ecode++;
2765 prop_value = *ecode++;
2767 else prop_type = -1;
2770 /* First, ensure the minimum number of matches are present. Use inline
2771 code for maximizing the speed, and do the type test once at the start
2772 (i.e. keep it out of the loop). Also we can test that there are at least
2773 the minimum number of bytes before we start. This isn't as effective in
2774 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2775 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2776 and single-bytes. */
2778 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2787 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2788 for (i = 1; i <= min; i++)
2790 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2791 GETCHARINCTEST(c, eptr);
2796 for (i = 1; i <= min; i++)
2798 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2799 GETCHARINCTEST(c, eptr);
2800 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2801 if ((prop_chartype == ucp_Lu ||
2802 prop_chartype == ucp_Ll ||
2803 prop_chartype == ucp_Lt) == prop_fail_result)
2804 RRETURN(MATCH_NOMATCH);
2809 for (i = 1; i <= min; i++)
2811 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2812 GETCHARINCTEST(c, eptr);
2813 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2814 if ((prop_category == prop_value) == prop_fail_result)
2815 RRETURN(MATCH_NOMATCH);
2820 for (i = 1; i <= min; i++)
2822 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2823 GETCHARINCTEST(c, eptr);
2824 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2825 if ((prop_chartype == prop_value) == prop_fail_result)
2826 RRETURN(MATCH_NOMATCH);
2831 for (i = 1; i <= min; i++)
2833 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2834 GETCHARINCTEST(c, eptr);
2835 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2836 if ((prop_script == prop_value) == prop_fail_result)
2837 RRETURN(MATCH_NOMATCH);
2842 RRETURN(PCRE_ERROR_INTERNAL);
2846 /* Match extended Unicode sequences. We will get here only if the
2847 support is in the binary; otherwise a compile-time error occurs. */
2849 else if (ctype == OP_EXTUNI)
2851 for (i = 1; i <= min; i++)
2853 GETCHARINCTEST(c, eptr);
2854 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2855 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2856 while (eptr < md->end_subject)
2859 if (!utf8) c = *eptr; else
2861 GETCHARLEN(c, eptr, len);
2863 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2864 if (prop_category != ucp_M) break;
2871 #endif /* SUPPORT_UCP */
2873 /* Handle all other cases when the coding is UTF-8 */
2876 if (utf8) switch(ctype)
2879 for (i = 1; i <= min; i++)
2881 if (eptr >= md->end_subject ||
2882 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2883 RRETURN(MATCH_NOMATCH);
2885 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2894 for (i = 1; i <= min; i++)
2896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2897 GETCHARINC(c, eptr);
2900 default: RRETURN(MATCH_NOMATCH);
2902 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2916 for (i = 1; i <= min; i++)
2918 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2919 GETCHARINC(c, eptr);
2924 case 0x20: /* SPACE */
2925 case 0xa0: /* NBSP */
2926 case 0x1680: /* OGHAM SPACE MARK */
2927 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2928 case 0x2000: /* EN QUAD */
2929 case 0x2001: /* EM QUAD */
2930 case 0x2002: /* EN SPACE */
2931 case 0x2003: /* EM SPACE */
2932 case 0x2004: /* THREE-PER-EM SPACE */
2933 case 0x2005: /* FOUR-PER-EM SPACE */
2934 case 0x2006: /* SIX-PER-EM SPACE */
2935 case 0x2007: /* FIGURE SPACE */
2936 case 0x2008: /* PUNCTUATION SPACE */
2937 case 0x2009: /* THIN SPACE */
2938 case 0x200A: /* HAIR SPACE */
2939 case 0x202f: /* NARROW NO-BREAK SPACE */
2940 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2941 case 0x3000: /* IDEOGRAPHIC SPACE */
2942 RRETURN(MATCH_NOMATCH);
2948 for (i = 1; i <= min; i++)
2950 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 GETCHARINC(c, eptr);
2954 default: RRETURN(MATCH_NOMATCH);
2956 case 0x20: /* SPACE */
2957 case 0xa0: /* NBSP */
2958 case 0x1680: /* OGHAM SPACE MARK */
2959 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2960 case 0x2000: /* EN QUAD */
2961 case 0x2001: /* EM QUAD */
2962 case 0x2002: /* EN SPACE */
2963 case 0x2003: /* EM SPACE */
2964 case 0x2004: /* THREE-PER-EM SPACE */
2965 case 0x2005: /* FOUR-PER-EM SPACE */
2966 case 0x2006: /* SIX-PER-EM SPACE */
2967 case 0x2007: /* FIGURE SPACE */
2968 case 0x2008: /* PUNCTUATION SPACE */
2969 case 0x2009: /* THIN SPACE */
2970 case 0x200A: /* HAIR SPACE */
2971 case 0x202f: /* NARROW NO-BREAK SPACE */
2972 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2973 case 0x3000: /* IDEOGRAPHIC SPACE */
2980 for (i = 1; i <= min; i++)
2982 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2983 GETCHARINC(c, eptr);
2991 case 0x85: /* NEL */
2992 case 0x2028: /* LINE SEPARATOR */
2993 case 0x2029: /* PARAGRAPH SEPARATOR */
2994 RRETURN(MATCH_NOMATCH);
3000 for (i = 1; i <= min; i++)
3002 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003 GETCHARINC(c, eptr);
3006 default: RRETURN(MATCH_NOMATCH);
3011 case 0x85: /* NEL */
3012 case 0x2028: /* LINE SEPARATOR */
3013 case 0x2029: /* PARAGRAPH SEPARATOR */
3020 for (i = 1; i <= min; i++)
3022 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3023 GETCHARINC(c, eptr);
3024 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3025 RRETURN(MATCH_NOMATCH);
3030 for (i = 1; i <= min; i++)
3032 if (eptr >= md->end_subject ||
3033 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3034 RRETURN(MATCH_NOMATCH);
3035 /* No need to skip more bytes - we know it's a 1-byte character */
3039 case OP_NOT_WHITESPACE:
3040 for (i = 1; i <= min; i++)
3042 if (eptr >= md->end_subject ||
3043 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3044 RRETURN(MATCH_NOMATCH);
3045 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3050 for (i = 1; i <= min; i++)
3052 if (eptr >= md->end_subject ||
3053 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3054 RRETURN(MATCH_NOMATCH);
3055 /* No need to skip more bytes - we know it's a 1-byte character */
3059 case OP_NOT_WORDCHAR:
3060 for (i = 1; i <= min; i++)
3062 if (eptr >= md->end_subject ||
3063 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3064 RRETURN(MATCH_NOMATCH);
3065 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3070 for (i = 1; i <= min; i++)
3072 if (eptr >= md->end_subject ||
3073 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3074 RRETURN(MATCH_NOMATCH);
3075 /* No need to skip more bytes - we know it's a 1-byte character */
3080 RRETURN(PCRE_ERROR_INTERNAL);
3081 } /* End switch(ctype) */
3084 #endif /* SUPPORT_UTF8 */
3086 /* Code for the non-UTF-8 case for minimum matching of operators other
3087 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3088 number of bytes present, as this was tested above. */
3093 if ((ims & PCRE_DOTALL) == 0)
3095 for (i = 1; i <= min; i++)
3097 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3108 /* Because of the CRLF case, we can't assume the minimum number of
3109 bytes are present in this case. */
3112 for (i = 1; i <= min; i++)
3114 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3117 default: RRETURN(MATCH_NOMATCH);
3119 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3131 for (i = 1; i <= min; i++)
3133 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3138 case 0x20: /* SPACE */
3139 case 0xa0: /* NBSP */
3140 RRETURN(MATCH_NOMATCH);
3146 for (i = 1; i <= min; i++)
3148 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3151 default: RRETURN(MATCH_NOMATCH);
3153 case 0x20: /* SPACE */
3154 case 0xa0: /* NBSP */
3161 for (i = 1; i <= min; i++)
3163 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3171 case 0x85: /* NEL */
3172 RRETURN(MATCH_NOMATCH);
3178 for (i = 1; i <= min; i++)
3180 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3183 default: RRETURN(MATCH_NOMATCH);
3188 case 0x85: /* NEL */
3195 for (i = 1; i <= min; i++)
3196 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3200 for (i = 1; i <= min; i++)
3201 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3204 case OP_NOT_WHITESPACE:
3205 for (i = 1; i <= min; i++)
3206 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3210 for (i = 1; i <= min; i++)
3211 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3214 case OP_NOT_WORDCHAR:
3215 for (i = 1; i <= min; i++)
3216 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3217 RRETURN(MATCH_NOMATCH);
3221 for (i = 1; i <= min; i++)
3222 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3223 RRETURN(MATCH_NOMATCH);
3227 RRETURN(PCRE_ERROR_INTERNAL);
3231 /* If min = max, continue at the same level without recursing */
3233 if (min == max) continue;
3235 /* If minimizing, we have to test the rest of the pattern before each
3236 subsequent match. Again, separate the UTF-8 case for speed, and also
3237 separate the UCP cases. */
3247 for (fi = min;; fi++)
3249 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3250 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3251 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3252 GETCHARINC(c, eptr);
3253 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3255 /* Control never gets here */
3258 for (fi = min;; fi++)
3260 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3261 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3262 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3263 GETCHARINC(c, eptr);
3264 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3265 if ((prop_chartype == ucp_Lu ||
3266 prop_chartype == ucp_Ll ||
3267 prop_chartype == ucp_Lt) == prop_fail_result)
3268 RRETURN(MATCH_NOMATCH);
3270 /* Control never gets here */
3273 for (fi = min;; fi++)
3275 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3277 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3278 GETCHARINC(c, eptr);
3279 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3280 if ((prop_category == prop_value) == prop_fail_result)
3281 RRETURN(MATCH_NOMATCH);
3283 /* Control never gets here */
3286 for (fi = min;; fi++)
3288 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3290 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3291 GETCHARINC(c, eptr);
3292 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3293 if ((prop_chartype == prop_value) == prop_fail_result)
3294 RRETURN(MATCH_NOMATCH);
3296 /* Control never gets here */
3299 for (fi = min;; fi++)
3301 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3303 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3304 GETCHARINC(c, eptr);
3305 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3306 if ((prop_script == prop_value) == prop_fail_result)
3307 RRETURN(MATCH_NOMATCH);
3309 /* Control never gets here */
3312 RRETURN(PCRE_ERROR_INTERNAL);
3316 /* Match extended Unicode sequences. We will get here only if the
3317 support is in the binary; otherwise a compile-time error occurs. */
3319 else if (ctype == OP_EXTUNI)
3321 for (fi = min;; fi++)
3323 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3324 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3325 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3326 GETCHARINCTEST(c, eptr);
3327 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3328 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3329 while (eptr < md->end_subject)
3332 if (!utf8) c = *eptr; else
3334 GETCHARLEN(c, eptr, len);
3336 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3337 if (prop_category != ucp_M) break;
3344 #endif /* SUPPORT_UCP */
3350 for (fi = min;; fi++)
3352 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3353 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3354 if (fi >= max || eptr >= md->end_subject ||
3355 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3357 RRETURN(MATCH_NOMATCH);
3359 GETCHARINC(c, eptr);
3362 case OP_ANY: /* This is the DOTALL case */
3371 default: RRETURN(MATCH_NOMATCH);
3373 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3390 case 0x20: /* SPACE */
3391 case 0xa0: /* NBSP */
3392 case 0x1680: /* OGHAM SPACE MARK */
3393 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3394 case 0x2000: /* EN QUAD */
3395 case 0x2001: /* EM QUAD */
3396 case 0x2002: /* EN SPACE */
3397 case 0x2003: /* EM SPACE */
3398 case 0x2004: /* THREE-PER-EM SPACE */
3399 case 0x2005: /* FOUR-PER-EM SPACE */
3400 case 0x2006: /* SIX-PER-EM SPACE */
3401 case 0x2007: /* FIGURE SPACE */
3402 case 0x2008: /* PUNCTUATION SPACE */
3403 case 0x2009: /* THIN SPACE */
3404 case 0x200A: /* HAIR SPACE */
3405 case 0x202f: /* NARROW NO-BREAK SPACE */
3406 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3407 case 0x3000: /* IDEOGRAPHIC SPACE */
3408 RRETURN(MATCH_NOMATCH);
3415 default: RRETURN(MATCH_NOMATCH);
3417 case 0x20: /* SPACE */
3418 case 0xa0: /* NBSP */
3419 case 0x1680: /* OGHAM SPACE MARK */
3420 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3421 case 0x2000: /* EN QUAD */
3422 case 0x2001: /* EM QUAD */
3423 case 0x2002: /* EN SPACE */
3424 case 0x2003: /* EM SPACE */
3425 case 0x2004: /* THREE-PER-EM SPACE */
3426 case 0x2005: /* FOUR-PER-EM SPACE */
3427 case 0x2006: /* SIX-PER-EM SPACE */
3428 case 0x2007: /* FIGURE SPACE */
3429 case 0x2008: /* PUNCTUATION SPACE */
3430 case 0x2009: /* THIN SPACE */
3431 case 0x200A: /* HAIR SPACE */
3432 case 0x202f: /* NARROW NO-BREAK SPACE */
3433 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3434 case 0x3000: /* IDEOGRAPHIC SPACE */
3447 case 0x85: /* NEL */
3448 case 0x2028: /* LINE SEPARATOR */
3449 case 0x2029: /* PARAGRAPH SEPARATOR */
3450 RRETURN(MATCH_NOMATCH);
3457 default: RRETURN(MATCH_NOMATCH);
3462 case 0x85: /* NEL */
3463 case 0x2028: /* LINE SEPARATOR */
3464 case 0x2029: /* PARAGRAPH SEPARATOR */
3470 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3471 RRETURN(MATCH_NOMATCH);
3475 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3476 RRETURN(MATCH_NOMATCH);
3479 case OP_NOT_WHITESPACE:
3480 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3481 RRETURN(MATCH_NOMATCH);
3485 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3486 RRETURN(MATCH_NOMATCH);
3489 case OP_NOT_WORDCHAR:
3490 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3491 RRETURN(MATCH_NOMATCH);
3495 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3496 RRETURN(MATCH_NOMATCH);
3500 RRETURN(PCRE_ERROR_INTERNAL);
3506 /* Not UTF-8 mode */
3508 for (fi = min;; fi++)
3510 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3511 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3512 if (fi >= max || eptr >= md->end_subject ||
3513 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3514 RRETURN(MATCH_NOMATCH);
3519 case OP_ANY: /* This is the DOTALL case */
3528 default: RRETURN(MATCH_NOMATCH);
3530 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3545 case 0x20: /* SPACE */
3546 case 0xa0: /* NBSP */
3547 RRETURN(MATCH_NOMATCH);
3554 default: RRETURN(MATCH_NOMATCH);
3556 case 0x20: /* SPACE */
3557 case 0xa0: /* NBSP */
3570 case 0x85: /* NEL */
3571 RRETURN(MATCH_NOMATCH);
3578 default: RRETURN(MATCH_NOMATCH);
3583 case 0x85: /* NEL */
3589 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3593 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3596 case OP_NOT_WHITESPACE:
3597 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3601 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3604 case OP_NOT_WORDCHAR:
3605 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3609 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3613 RRETURN(PCRE_ERROR_INTERNAL);
3617 /* Control never gets here */
3620 /* If maximizing, it is worth using inline code for speed, doing the type
3621 test once at the start (i.e. keep it out of the loop). Again, keep the
3622 UTF-8 and UCP stuff separate. */
3626 pp = eptr; /* Remember where we started */
3634 for (i = min; i < max; i++)
3637 if (eptr >= md->end_subject) break;
3638 GETCHARLEN(c, eptr, len);
3639 if (prop_fail_result) break;
3645 for (i = min; i < max; i++)
3648 if (eptr >= md->end_subject) break;
3649 GETCHARLEN(c, eptr, len);
3650 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3651 if ((prop_chartype == ucp_Lu ||
3652 prop_chartype == ucp_Ll ||
3653 prop_chartype == ucp_Lt) == prop_fail_result)
3660 for (i = min; i < max; i++)
3663 if (eptr >= md->end_subject) break;
3664 GETCHARLEN(c, eptr, len);
3665 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3666 if ((prop_category == prop_value) == prop_fail_result)
3673 for (i = min; i < max; i++)
3676 if (eptr >= md->end_subject) break;
3677 GETCHARLEN(c, eptr, len);
3678 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3679 if ((prop_chartype == prop_value) == prop_fail_result)
3686 for (i = min; i < max; i++)
3689 if (eptr >= md->end_subject) break;
3690 GETCHARLEN(c, eptr, len);
3691 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3692 if ((prop_script == prop_value) == prop_fail_result)
3699 /* eptr is now past the end of the maximum run */
3701 if (possessive) continue;
3704 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3706 if (eptr-- == pp) break; /* Stop if tried at original pos */
3711 /* Match extended Unicode sequences. We will get here only if the
3712 support is in the binary; otherwise a compile-time error occurs. */
3714 else if (ctype == OP_EXTUNI)
3716 for (i = min; i < max; i++)
3718 if (eptr >= md->end_subject) break;
3719 GETCHARINCTEST(c, eptr);
3720 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3721 if (prop_category == ucp_M) break;
3722 while (eptr < md->end_subject)
3725 if (!utf8) c = *eptr; else
3727 GETCHARLEN(c, eptr, len);
3729 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3730 if (prop_category != ucp_M) break;
3735 /* eptr is now past the end of the maximum run */
3737 if (possessive) continue;
3740 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3742 if (eptr-- == pp) break; /* Stop if tried at original pos */
3743 for (;;) /* Move back over one extended */
3747 if (!utf8) c = *eptr; else
3749 GETCHARLEN(c, eptr, len);
3751 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3752 if (prop_category != ucp_M) break;
3759 #endif /* SUPPORT_UCP */
3770 /* Special code is required for UTF8, but when the maximum is
3771 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3772 probably worth it, because .* is quite a common idiom. */
3776 if ((ims & PCRE_DOTALL) == 0)
3778 for (i = min; i < max; i++)
3780 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3782 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3787 for (i = min; i < max; i++)
3789 if (eptr >= md->end_subject) break;
3791 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3796 /* Handle unlimited UTF-8 repeat */
3800 if ((ims & PCRE_DOTALL) == 0)
3802 for (i = min; i < max; i++)
3804 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3812 if (c > (unsigned int)(md->end_subject - eptr))
3813 c = md->end_subject - eptr;
3819 /* The byte case is the same as non-UTF8 */
3823 if (c > (unsigned int)(md->end_subject - eptr))
3824 c = md->end_subject - eptr;
3829 for (i = min; i < max; i++)
3832 if (eptr >= md->end_subject) break;
3833 GETCHARLEN(c, eptr, len);
3836 if (++eptr >= md->end_subject) break;
3837 if (*eptr == 0x000a) eptr++;
3841 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3842 c != 0x0085 && c != 0x2028 && c != 0x2029)
3851 for (i = min; i < max; i++)
3855 if (eptr >= md->end_subject) break;
3856 GETCHARLEN(c, eptr, len);
3859 default: gotspace = FALSE; break;
3861 case 0x20: /* SPACE */
3862 case 0xa0: /* NBSP */
3863 case 0x1680: /* OGHAM SPACE MARK */
3864 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3865 case 0x2000: /* EN QUAD */
3866 case 0x2001: /* EM QUAD */
3867 case 0x2002: /* EN SPACE */
3868 case 0x2003: /* EM SPACE */
3869 case 0x2004: /* THREE-PER-EM SPACE */
3870 case 0x2005: /* FOUR-PER-EM SPACE */
3871 case 0x2006: /* SIX-PER-EM SPACE */
3872 case 0x2007: /* FIGURE SPACE */
3873 case 0x2008: /* PUNCTUATION SPACE */
3874 case 0x2009: /* THIN SPACE */
3875 case 0x200A: /* HAIR SPACE */
3876 case 0x202f: /* NARROW NO-BREAK SPACE */
3877 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3878 case 0x3000: /* IDEOGRAPHIC SPACE */
3882 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3889 for (i = min; i < max; i++)
3893 if (eptr >= md->end_subject) break;
3894 GETCHARLEN(c, eptr, len);
3897 default: gotspace = FALSE; break;
3902 case 0x85: /* NEL */
3903 case 0x2028: /* LINE SEPARATOR */
3904 case 0x2029: /* PARAGRAPH SEPARATOR */
3908 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3914 for (i = min; i < max; i++)
3917 if (eptr >= md->end_subject) break;
3918 GETCHARLEN(c, eptr, len);
3919 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3925 for (i = min; i < max; i++)
3928 if (eptr >= md->end_subject) break;
3929 GETCHARLEN(c, eptr, len);
3930 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3935 case OP_NOT_WHITESPACE:
3936 for (i = min; i < max; i++)
3939 if (eptr >= md->end_subject) break;
3940 GETCHARLEN(c, eptr, len);
3941 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3947 for (i = min; i < max; i++)
3950 if (eptr >= md->end_subject) break;
3951 GETCHARLEN(c, eptr, len);
3952 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3957 case OP_NOT_WORDCHAR:
3958 for (i = min; i < max; i++)
3961 if (eptr >= md->end_subject) break;
3962 GETCHARLEN(c, eptr, len);
3963 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3969 for (i = min; i < max; i++)
3972 if (eptr >= md->end_subject) break;
3973 GETCHARLEN(c, eptr, len);
3974 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3980 RRETURN(PCRE_ERROR_INTERNAL);
3983 /* eptr is now past the end of the maximum run */
3985 if (possessive) continue;
3988 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3990 if (eptr-- == pp) break; /* Stop if tried at original pos */
3997 /* Not UTF-8 mode */
4002 if ((ims & PCRE_DOTALL) == 0)
4004 for (i = min; i < max; i++)
4006 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4011 /* For DOTALL case, fall through and treat as \C */
4015 if (c > (unsigned int)(md->end_subject - eptr))
4016 c = md->end_subject - eptr;
4021 for (i = min; i < max; i++)
4023 if (eptr >= md->end_subject) break;
4027 if (++eptr >= md->end_subject) break;
4028 if (*eptr == 0x000a) eptr++;
4032 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4040 for (i = min; i < max; i++)
4042 if (eptr >= md->end_subject) break;
4044 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4050 for (i = min; i < max; i++)
4052 if (eptr >= md->end_subject) break;
4054 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4060 for (i = min; i < max; i++)
4062 if (eptr >= md->end_subject) break;
4064 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4071 for (i = min; i < max; i++)
4073 if (eptr >= md->end_subject) break;
4075 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4082 for (i = min; i < max; i++)
4084 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4091 for (i = min; i < max; i++)
4093 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4099 case OP_NOT_WHITESPACE:
4100 for (i = min; i < max; i++)
4102 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4109 for (i = min; i < max; i++)
4111 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4117 case OP_NOT_WORDCHAR:
4118 for (i = min; i < max; i++)
4120 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4127 for (i = min; i < max; i++)
4129 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4136 RRETURN(PCRE_ERROR_INTERNAL);
4139 /* eptr is now past the end of the maximum run */
4141 if (possessive) continue;
4144 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4150 /* Get here if we can't make it match with any permitted repetitions */
4152 RRETURN(MATCH_NOMATCH);
4154 /* Control never gets here */
4156 /* There's been some horrible disaster. Arrival here can only mean there is
4157 something seriously wrong in the code above or the OP_xxx definitions. */
4160 DPRINTF(("Unknown opcode %d\n", *ecode));
4161 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4164 /* Do not stick any code in here without much thought; it is assumed
4165 that "continue" in the code above comes out to here to repeat the main
4168 } /* End of main loop */
4169 /* Control never reaches here */
4172 /* When compiling to use the heap rather than the stack for recursive calls to
4173 match(), the RRETURN() macro jumps here. The number that is saved in
4174 frame->Xwhere indicates which label we actually want to return to. */
4177 #define LBL(val) case val: goto L_RM##val;
4179 switch (frame->Xwhere)
4181 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4182 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
4183 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
4184 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
4185 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
4186 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
4188 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4189 return PCRE_ERROR_INTERNAL;
4192 #endif /* NO_RECURSE */
4196 /***************************************************************************
4197 ****************************************************************************
4198 RECURSION IN THE match() FUNCTION
4200 Undefine all the macros that were defined above to handle this. */
4219 #undef new_recursive
4234 #undef save_capture_last
4244 /* These two are defined as macros in both cases */
4249 /***************************************************************************
4250 ***************************************************************************/
4254 /*************************************************
4255 * Execute a Regular Expression *
4256 *************************************************/
4258 /* This function applies a compiled re to a subject string and picks out
4259 portions of the string if it matches. Two elements in the vector are set for
4260 each substring: the offsets to the start and end of the substring.
4263 argument_re points to the compiled expression
4264 extra_data points to extra data or is NULL
4265 subject points to the subject string
4266 length length of subject string (may contain binary zeros)
4267 start_offset where to start in the subject string
4268 options option bits
4269 offsets points to a vector of ints to be filled in with offsets
4270 offsetcount the number of elements in the vector
4272 Returns: > 0 => success; value is the number of elements filled in
4273 = 0 => success, but offsets is not big enough
4274 -1 => failed to match
4275 < -1 => some kind of unexpected problem
4279 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4280 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4283 int rc, resetcount, ocount;
4284 int first_byte = -1;
4288 unsigned long int ims;
4289 BOOL using_temporary_offsets = FALSE;
4293 BOOL first_byte_caseless = FALSE;
4294 BOOL req_byte_caseless = FALSE;
4296 match_data match_block;
4297 match_data *md = &match_block;
4298 const uschar *tables;
4299 const uschar *start_bits = NULL;
4300 USPTR start_match = (USPTR)subject + start_offset;
4302 USPTR req_byte_ptr = start_match - 1;
4303 eptrblock eptrchain[EPTR_WORK_SIZE];
4305 pcre_study_data internal_study;
4306 const pcre_study_data *study;
4308 real_pcre internal_re;
4309 const real_pcre *external_re = (const real_pcre *)argument_re;
4310 const real_pcre *re = external_re;
4312 /* Plausibility checks */
4314 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4315 if (re == NULL || subject == NULL ||
4316 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4317 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4319 /* Fish out the optional data from the extra_data structure, first setting
4320 the default values. */
4323 md->match_limit = MATCH_LIMIT;
4324 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4325 md->callout_data = NULL;
4327 /* The table pointer is always in native byte order. */
4329 tables = external_re->tables;
4331 if (extra_data != NULL)
4333 register unsigned int flags = extra_data->flags;
4334 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4335 study = (const pcre_study_data *)extra_data->study_data;
4336 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4337 md->match_limit = extra_data->match_limit;
4338 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4339 md->match_limit_recursion = extra_data->match_limit_recursion;
4340 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4341 md->callout_data = extra_data->callout_data;
4342 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4345 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4346 is a feature that makes it possible to save compiled regex and re-use them
4347 in other programs later. */
4349 if (tables == NULL) tables = _pcre_default_tables;
4351 /* Check that the first field in the block is the magic number. If it is not,
4352 test for a regex that was compiled on a host of opposite endianness. If this is
4353 the case, flipped values are put in internal_re and internal_study if there was
4356 if (re->magic_number != MAGIC_NUMBER)
4358 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4359 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4360 if (study != NULL) study = &internal_study;
4363 /* Set up other data */
4365 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4366 startline = (re->options & PCRE_STARTLINE) != 0;
4367 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4369 /* The code starts after the real_pcre block and the capture name table. */
4371 md->start_code = (const uschar *)external_re + re->name_table_offset +
4372 re->name_count * re->name_entry_size;
4374 md->start_subject = (USPTR)subject;
4375 md->start_offset = start_offset;
4376 md->end_subject = md->start_subject + length;
4377 end_subject = md->end_subject;
4379 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4380 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4382 md->notbol = (options & PCRE_NOTBOL) != 0;
4383 md->noteol = (options & PCRE_NOTEOL) != 0;
4384 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4385 md->partial = (options & PCRE_PARTIAL) != 0;
4388 md->recursive = NULL; /* No recursion at top level */
4389 md->eptrchain = eptrchain; /* Make workspace generally available */
4391 md->lcc = tables + lcc_offset;
4392 md->ctypes = tables + ctypes_offset;
4394 /* Handle different types of newline. The three bits give eight cases. If
4395 nothing is set at run time, whatever was used at compile time applies. */
4397 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
4400 case 0: newline = NEWLINE; break; /* Compile-time default */
4401 case PCRE_NEWLINE_CR: newline = '\r'; break;
4402 case PCRE_NEWLINE_LF: newline = '\n'; break;
4403 case PCRE_NEWLINE_CR+
4404 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4405 case PCRE_NEWLINE_ANY: newline = -1; break;
4406 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4407 default: return PCRE_ERROR_BADNEWLINE;
4412 md->nltype = NLTYPE_ANYCRLF;
4414 else if (newline < 0)
4416 md->nltype = NLTYPE_ANY;
4420 md->nltype = NLTYPE_FIXED;
4424 md->nl[0] = (newline >> 8) & 255;
4425 md->nl[1] = newline & 255;
4430 md->nl[0] = newline;
4434 /* Partial matching is supported only for a restricted set of regexes at the
4437 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
4438 return PCRE_ERROR_BADPARTIAL;
4440 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4441 back the character offset. */
4444 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4446 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4447 return PCRE_ERROR_BADUTF8;
4448 if (start_offset > 0 && start_offset < length)
4450 int tb = ((uschar *)subject)[start_offset];
4454 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4460 /* The ims options can vary during the matching as a result of the presence
4461 of (?ims) items in the pattern. They are kept in a local variable so that
4462 restoring at the exit of a group is easy. */
4464 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4466 /* If the expression has got more back references than the offsets supplied can
4467 hold, we get a temporary chunk of working store to use during the matching.
4468 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4471 ocount = offsetcount - (offsetcount % 3);
4473 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4475 ocount = re->top_backref * 3 + 3;
4476 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4477 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4478 using_temporary_offsets = TRUE;
4479 DPRINTF(("Got memory to hold back references\n"));
4481 else md->offset_vector = offsets;
4483 md->offset_end = ocount;
4484 md->offset_max = (2*ocount)/3;
4485 md->offset_overflow = FALSE;
4486 md->capture_last = -1;
4488 /* Compute the minimum number of offsets that we need to reset each time. Doing
4489 this makes a huge difference to execution time when there aren't many brackets
4492 resetcount = 2 + re->top_bracket * 2;
4493 if (resetcount > offsetcount) resetcount = ocount;
4495 /* Reset the working variable associated with each extraction. These should
4496 never be used unless previously set, but they get saved and restored, and so we
4497 initialize them to avoid reading uninitialized locations. */
4499 if (md->offset_vector != NULL)
4501 register int *iptr = md->offset_vector + ocount;
4502 register int *iend = iptr - resetcount/2 + 1;
4503 while (--iptr >= iend) *iptr = -1;
4506 /* Set up the first character to match, if available. The first_byte value is
4507 never set for an anchored regular expression, but the anchoring may be forced
4508 at run time, so we have to test for anchoring. The first char may be unset for
4509 an unanchored pattern, of course. If there's no first char and the pattern was
4510 studied, there may be a bitmap of possible first characters. */
4514 if ((re->options & PCRE_FIRSTSET) != 0)
4516 first_byte = re->first_byte & 255;
4517 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4518 first_byte = md->lcc[first_byte];
4521 if (!startline && study != NULL &&
4522 (study->options & PCRE_STUDY_MAPPED) != 0)
4523 start_bits = study->start_bits;
4526 /* For anchored or unanchored matches, there may be a "last known required
4529 if ((re->options & PCRE_REQCHSET) != 0)
4531 req_byte = re->req_byte & 255;
4532 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4533 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4537 /* ==========================================================================*/
4539 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4540 the loop runs just once. */
4544 USPTR save_end_subject = end_subject;
4546 /* Reset the maximum number of extractions we might see. */
4548 if (md->offset_vector != NULL)
4550 register int *iptr = md->offset_vector;
4551 register int *iend = iptr + resetcount;
4552 while (iptr < iend) *iptr++ = -1;
4555 /* Advance to a unique first char if possible. If firstline is TRUE, the
4556 start of the match is constrained to the first line of a multiline string.
4557 That is, the match must be before or at the first newline. Implement this by
4558 temporarily adjusting end_subject so that we stop scanning at a newline. If
4559 the match fails at the newline, later code breaks this loop. */
4563 USPTR t = start_match;
4564 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4568 /* Now test for a unique first byte */
4570 if (first_byte >= 0)
4572 if (first_byte_caseless)
4573 while (start_match < end_subject &&
4574 md->lcc[*start_match] != first_byte)
4577 while (start_match < end_subject && *start_match != first_byte)
4581 /* Or to just after a linebreak for a multiline match if possible */
4585 if (start_match > md->start_subject + start_offset)
4587 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4590 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4591 and we are now at a LF, advance the match position by one more character.
4594 if (start_match[-1] == '\r' &&
4595 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4596 start_match < end_subject &&
4597 *start_match == '\n')
4602 /* Or to a non-unique first char after study */
4604 else if (start_bits != NULL)
4606 while (start_match < end_subject)
4608 register unsigned int c = *start_match;
4609 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4613 /* Restore fudged end_subject */
4615 end_subject = save_end_subject;
4617 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4618 printf(">>>> Match against: ");
4619 pchars(start_match, end_subject - start_match, TRUE, md);
4623 /* If req_byte is set, we know that that character must appear in the subject
4624 for the match to succeed. If the first character is set, req_byte must be
4625 later in the subject; otherwise the test starts at the match point. This
4626 optimization can save a huge amount of backtracking in patterns with nested
4627 unlimited repeats that aren't going to match. Writing separate code for
4628 cased/caseless versions makes it go faster, as does using an autoincrement
4629 and backing off on a match.
4631 HOWEVER: when the subject string is very, very long, searching to its end can
4632 take a long time, and give bad performance on quite ordinary patterns. This
4633 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4634 string... so we don't do this when the string is sufficiently long.
4636 ALSO: this processing is disabled when partial matching is requested.
4639 if (req_byte >= 0 &&
4640 end_subject - start_match < REQ_BYTE_MAX &&
4643 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4645 /* We don't need to repeat the search if we haven't yet reached the
4646 place we found it at last time. */
4648 if (p > req_byte_ptr)
4650 if (req_byte_caseless)
4652 while (p < end_subject)
4654 register int pp = *p++;
4655 if (pp == req_byte || pp == req_byte2) { p--; break; }
4660 while (p < end_subject)
4662 if (*p++ == req_byte) { p--; break; }
4666 /* If we can't find the required character, break the matching loop,
4667 forcing a match failure. */
4669 if (p >= end_subject)
4675 /* If we have found the required character, save the point where we
4676 found it, so that we don't search again next time round the loop if
4677 the start hasn't passed this character yet. */
4683 /* OK, we can now run the match. */
4685 md->start_match_ptr = start_match; /* Insurance */
4686 md->match_call_count = 0;
4687 md->eptrn = 0; /* Next free eptrchain slot */
4688 rc = match(start_match, md->start_code, start_match, 2, md,
4691 /* Any return other than MATCH_NOMATCH breaks the loop. */
4693 if (rc != MATCH_NOMATCH) break;
4695 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4696 newline in the subject (though it may continue over the newline). Therefore,
4697 if we have just failed to match, starting at a newline, do not continue. */
4699 if (firstline && IS_NEWLINE(start_match)) break;
4701 /* Advance the match position by one character. */
4706 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4710 /* Break the loop if the pattern is anchored or if we have passed the end of
4713 if (anchored || start_match > end_subject) break;
4715 /* If we have just passed a CR and the newline option is CRLF or ANY or
4716 ANYCRLF, and we are now at a LF, advance the match position by one more
4719 if (start_match[-1] == '\r' &&
4720 (md->nltype == NLTYPE_ANY ||
4721 md->nltype == NLTYPE_ANYCRLF ||
4723 start_match < end_subject &&
4724 *start_match == '\n')
4727 } /* End of for(;;) "bumpalong" loop */
4729 /* ==========================================================================*/
4731 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4734 (1) The pattern is anchored;
4736 (2) We are past the end of the subject;
4738 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4739 this option requests that a match occur at or before the first newline in
4742 When we have a match and the offset vector is big enough to deal with any
4743 backreferences, captured substring offsets will already be set up. In the case
4744 where we had to get some local store to hold offsets for backreference
4745 processing, copy those that we can. In this case there need not be overflow if
4746 certain parts of the pattern were not used, even though there are more
4747 capturing parentheses than vector slots. */
4749 if (rc == MATCH_MATCH)
4751 if (using_temporary_offsets)
4753 if (offsetcount >= 4)
4755 memcpy(offsets + 2, md->offset_vector + 2,
4756 (offsetcount - 2) * sizeof(int));
4757 DPRINTF(("Copied offsets from temporary memory\n"));
4759 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4760 DPRINTF(("Freeing temporary memory\n"));
4761 (pcre_free)(md->offset_vector);
4764 /* Set the return code to the number of captured strings, or 0 if there are
4765 too many to fit into the vector. */
4767 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4769 /* If there is space, set up the whole thing as substring 0. The value of
4770 md->start_match_ptr might be modified if \K was encountered on the success
4773 if (offsetcount < 2) rc = 0; else
4775 offsets[0] = md->start_match_ptr - md->start_subject;
4776 offsets[1] = md->end_match_ptr - md->start_subject;
4779 DPRINTF((">>>> returning %d\n", rc));
4783 /* Control gets here if there has been an error, or if the overall match
4784 attempt has failed at all permitted starting positions. */
4786 if (using_temporary_offsets)
4788 DPRINTF(("Freeing temporary memory\n"));
4789 (pcre_free)(md->offset_vector);
4792 if (rc != MATCH_NOMATCH)
4794 DPRINTF((">>>> error: returning %d\n", rc));
4797 else if (md->partial && md->hitend)
4799 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4800 return PCRE_ERROR_PARTIAL;
4804 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4805 return PCRE_ERROR_NOMATCH;
4809 /* End of pcre_exec.c */