1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.6 2007/11/12 13:02:19 nm4 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
55 #include "pcre_internal.h"
57 /* Undefine some potentially clashing cpp symbols */
62 /* Flag bits for the match() function */
64 #define match_condassert 0x01 /* Called to check a condition assertion */
65 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
71 #define MATCH_NOMATCH 0
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
76 #define MATCH_COMMIT (-999)
77 #define MATCH_PRUNE (-998)
78 #define MATCH_SKIP (-997)
79 #define MATCH_THEN (-996)
81 /* Maximum number of ints of offset to save on the stack for recursive calls.
82 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
83 because the offset vector is always a multiple of 3 long. */
85 #define REC_STACK_SAVE_MAX 30
87 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
Entry i of rep_min pairs with entry i of rep_max, so the six (min, max) pairs
are (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1).
NOTE(review): presumably these are the star, plus and query quantifiers, each
in greedy and minimizing form, in opcode order; the code that indexes the
tables is not visible here -- confirm. */
89 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
90 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95 /*************************************************
96 * Debugging function to print chars *
97 *************************************************/
99 /* Print a sequence of chars in printable format, stopping at the end of the
100 subject if requested.
103 p points to characters
104 length number to print
105 is_subject TRUE if printing from within md->start_subject
106 md pointer to matching data block, if is_subject is TRUE
/* pchars: debugging helper that writes bytes starting at p to stdout with
printf().
NOTE(review): the return type, the braces, the declaration of c and the loop
header are not visible in this copy of the file -- confirm against upstream. */
112 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
/* When is_subject is TRUE, clamp length so that output stops at
md->end_subject. */
115 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
/* Bytes for which isprint() is true are written as themselves; any other byte
is written as a backslash, 'x' and two hex digits. */
117 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
123 /*************************************************
124 * Match a back-reference *
125 *************************************************/
127 /* If a back reference hasn't been set, the length that is passed is greater
128 than the number of characters left in the string, so the match fails.
131 offset index into the offset vector
132 eptr points into the subject
133 length length to be matched
134 md points to match data block
137 Returns: TRUE if matched
/* match_ref: compare the text at eptr with the text of an earlier capture.
Returns FALSE as soon as a mismatch is found, or immediately when fewer than
`length` characters remain in the subject.
NOTE(review): the return type, the braces, any #ifdef guards and the final
return are not visible in this copy of the file -- confirm against upstream. */
141 match_ref(int offset, register USPTR eptr, int length, match_data *md,
142 unsigned long int ims)
/* p is the start of the referenced text: the subject start plus the value
stored in the offset vector at index `offset`. */
144 USPTR p = md->start_subject + md->offset_vector[offset];
/* Diagnostic output showing the subject text and the back-referenced text
that are about to be compared. */
147 if (eptr >= md->end_subject)
148 printf("matching subject <null>");
151 printf("matching subject ");
152 pchars(eptr, length, TRUE, md);
154 printf(" against backref ");
155 pchars(p, length, FALSE, md);
159 /* Always fail if not enough characters left */
161 if (length > md->end_subject - eptr) return FALSE;
163 /* Separate the caseless case for speed */
165 if ((ims & PCRE_CASELESS) != 0)
/* Caseless comparison: both bytes are mapped through the md->lcc table before
being compared. */
168 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
/* Caseful comparison: plain byte-by-byte equality. */
171 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
178 /***************************************************************************
179 ****************************************************************************
180 RECURSION IN THE match() FUNCTION
182 The match() function is highly recursive, though not every recursive call
183 increases the recursive depth. Nevertheless, some regular expressions can cause
184 it to recurse to a great depth. I was writing for Unix, so I just let it call
185 itself recursively. This uses the stack for saving everything that has to be
186 saved for a recursive call. On Unix, the stack can be large, and this works
189 It turns out that on some non-Unix-like systems there are problems with
190 programs that use a lot of stack. (This despite the fact that every last chip
191 has oodles of memory these days, and techniques for extending the stack have
192 been known for decades.) So....
194 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
195 calls by keeping local variables that need to be preserved in blocks of memory
196 obtained from malloc() instead of on the stack. Macros are used to
197 achieve this so that the actual code doesn't look very different to what it
200 The original heap-recursive code used longjmp(). However, it seems that this
201 can be very slow on some operating systems. Following a suggestion from Stan
202 Switzer, the use of longjmp() has been abolished, at the cost of having to
203 provide a unique number for each call to RMATCH. There is no way of generating
204 a sequence of numbers at compile time in C. I have given them names, to make
205 them stand out more clearly.
207 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
208 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
209 tests. Furthermore, not using longjmp() means that local dynamic variables
210 don't have indeterminate values; this has meant that the frame size can be
211 reduced because the result can be "passed back" by straight setting of the
212 variable instead of being passed in the frame.
213 ****************************************************************************
214 ***************************************************************************/
216 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
217 below must be updated in sync. */
/* Identifiers for the individual RMATCH() call sites, numbered from 1. Each
RMATCH() invocation passes one of these as its last ("rw") argument; the
heap-based version of RMATCH stores that value in frame->Xwhere. */
219 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
220 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
221 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
222 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
223 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
224 RM51, RM52, RM53, RM54 };
226 /* These versions of the macros use the stack, as normal. There are debugging
227 versions and production versions. Note that the "rw" argument of RMATCH isn't
228 actually used in this definition. */
/* Stack-based (true recursion) forms of the helper macros. Two definitions of
RMATCH and RRETURN are visible: one that surrounds the recursive call with
printf() tracing, and a plain one. Both forms of RMATCH call match() with the
caller's mstart and with rdepth+1, and leave the result in rrc; neither uses
the rw argument. The plain RRETURN is just a return statement.
NOTE(review): the preprocessor conditionals that select between the two sets,
and some continuation lines, are not visible in this copy of the file --
confirm against upstream. */
231 #define REGISTER register
234 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
236 printf("match() called in line %d\n", __LINE__); \
237 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
238 printf("to line %d\n", __LINE__); \
240 #define RRETURN(ra) \
242 printf("match() returned %d from line %d ", ra, __LINE__); \
246 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
247 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
248 #define RRETURN(ra) return ra
254 /* These versions of the macros manage a private stack on the heap. Note that
255 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
256 argument of match(), which never changes. */
/* Heap-based forms of the helper macros. RMATCH obtains a new heapframe from
pcre_stack_malloc, records the return point (rw) in the current frame's
Xwhere, copies the "call" arguments (ra, rb, mstart, rc, re, rf, rg) into the
new frame, sets its Xrdepth to one more than the current frame's, and links it
back to the current frame through Xprevframe. The last three lines belong to
the return path: they take the current frame, make its Xprevframe the current
frame again, and release the old one with pcre_stack_free.
NOTE(review): in the visible lines the result of pcre_stack_malloc is written
through without a NULL check. Several lines of these macros (braces, the jumps
and the second #define) are not visible in this copy of the file -- confirm
against upstream. */
260 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
262 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
263 frame->Xwhere = rw; \
264 newframe->Xeptr = ra;\
265 newframe->Xecode = rb;\
266 newframe->Xmstart = mstart;\
267 newframe->Xoffset_top = rc;\
268 newframe->Xims = re;\
269 newframe->Xeptrb = rf;\
270 newframe->Xflags = rg;\
271 newframe->Xrdepth = frame->Xrdepth + 1;\
272 newframe->Xprevframe = frame;\
274 DPRINTF(("restarting from line %d\n", __LINE__));\
277 DPRINTF(("jumped back to line %d\n", __LINE__));\
282 heapframe *newframe = frame;\
283 frame = newframe->Xprevframe;\
284 (pcre_stack_free)(newframe);\
294 /* Structure for remembering the local variables in a private frame. One
such frame is allocated per simulated call of match() by the heap-based RMATCH
macro; inside match() the members are reached through same-named macros
without the leading X (for example ecode is frame->Xecode).
NOTE(review): many members and the closing of the typedef are not visible in
this copy of the file -- confirm against upstream. */
296 typedef struct heapframe {
/* The frame that created this one; set to NULL in the top-level frame. */
297 struct heapframe *Xprevframe;
299 /* Function arguments that may change */
302 const uschar *Xecode;
303 const uschar *Xmstart;
308 unsigned int Xrdepth;
310 /* Function local variables */
312 const uschar *Xcallpat;
313 const uschar *Xcharptr;
318 const uschar *Xsaved_eptr;
320 recursion_info Xnew_recursive;
326 unsigned long int Xoriginal_ims;
331 int Xprop_fail_result;
348 int Xsave_capture_last;
349 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Fixed-size save area for the offset vector, used during pattern recursion
when no more than REC_STACK_SAVE_MAX ints have to be saved. */
350 int Xstacksave[REC_STACK_SAVE_MAX];
354 /* Where to jump back to */
363 /***************************************************************************
364 ***************************************************************************/
368 /*************************************************
369 * Match from current position *
370 *************************************************/
372 /* This function is called recursively in many circumstances. Whenever it
373 returns a negative (error) response, the outer incarnation must also return the
376 Performance note: It might be tempting to extract commonly used fields from the
377 md structure (e.g. utf8, end_subject) into individual variables to improve
378 performance. Tests using gcc on a SPARC disproved this; in the first case, it
379 made performance worse.
382 eptr pointer to current character in subject
383 ecode pointer to current position in compiled code
384 mstart pointer to the current match start position (can be modified
386 offset_top current top pointer
387 md pointer to "static" info for the match
388 ims current /i, /m, and /s options
389 eptrb pointer to chain of blocks containing eptr at start of
390 brackets - for testing for empty matches
392 match_condassert - this is an assertion condition
393 match_cbegroup - this is the start of an unlimited repeat
394 group that can match an empty string
395 rdepth the recursion depth
397 Returns: MATCH_MATCH if matched ) these values are >= 0
398 MATCH_NOMATCH if failed to match )
399 a negative PCRE_ERROR_xxx value if aborted by an error condition
400 (e.g. stopped by repeated call or recursion limit)
404 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
405 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
406 int flags, unsigned int rdepth)
408 /* These variables do not need to be preserved over recursion in this function,
409 so they can be ordinary variables in all cases. Mark some of them with
410 "register" because they are used a lot in loops. */
412 register int rrc; /* Returns from recursive calls */
413 register int i; /* Used for loops not involving calls to RMATCH() */
414 register unsigned int c; /* Character values not kept over RMATCH() calls */
415 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
417 BOOL minimize, possessive; /* Quantifier options */
419 /* When recursion is not being used, all "local" variables that have to be
420 preserved over calls to RMATCH() are part of a "frame" which is obtained from
421 heap storage. Set up the top-level frame here; others are obtained from the
422 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
425 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
426 frame->Xprevframe = NULL; /* Marks the top level */
428 /* Copy in the original argument variables */
431 frame->Xecode = ecode;
432 frame->Xmstart = mstart;
433 frame->Xoffset_top = offset_top;
435 frame->Xeptrb = eptrb;
436 frame->Xflags = flags;
437 frame->Xrdepth = rdepth;
439 /* This is where control jumps back to to effect "recursion" */
443 /* Macros make the argument variables come from the current frame */
445 #define eptr frame->Xeptr
446 #define ecode frame->Xecode
447 #define mstart frame->Xmstart
448 #define offset_top frame->Xoffset_top
449 #define ims frame->Xims
450 #define eptrb frame->Xeptrb
451 #define flags frame->Xflags
452 #define rdepth frame->Xrdepth
454 /* Ditto for the local variables */
457 #define charptr frame->Xcharptr
459 #define callpat frame->Xcallpat
460 #define data frame->Xdata
461 #define next frame->Xnext
462 #define pp frame->Xpp
463 #define prev frame->Xprev
464 #define saved_eptr frame->Xsaved_eptr
466 #define new_recursive frame->Xnew_recursive
468 #define cur_is_word frame->Xcur_is_word
469 #define condition frame->Xcondition
470 #define prev_is_word frame->Xprev_is_word
472 #define original_ims frame->Xoriginal_ims
475 #define prop_type frame->Xprop_type
476 #define prop_value frame->Xprop_value
477 #define prop_fail_result frame->Xprop_fail_result
478 #define prop_category frame->Xprop_category
479 #define prop_chartype frame->Xprop_chartype
480 #define prop_script frame->Xprop_script
481 #define oclength frame->Xoclength
482 #define occhars frame->Xocchars
485 #define ctype frame->Xctype
486 #define fc frame->Xfc
487 #define fi frame->Xfi
488 #define length frame->Xlength
489 #define max frame->Xmax
490 #define min frame->Xmin
491 #define number frame->Xnumber
492 #define offset frame->Xoffset
493 #define op frame->Xop
494 #define save_capture_last frame->Xsave_capture_last
495 #define save_offset1 frame->Xsave_offset1
496 #define save_offset2 frame->Xsave_offset2
497 #define save_offset3 frame->Xsave_offset3
498 #define stacksave frame->Xstacksave
500 #define newptrb frame->Xnewptrb
502 /* When recursion is being used, local variables are allocated on the stack and
503 get preserved during recursion in the normal way. In this environment, fi and
504 i, and fc and c, can be the same variables. */
506 #else /* NO_RECURSE not defined */
511 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
512 const uschar *charptr; /* in small blocks of the code. My normal */
513 #endif /* style of coding would have declared */
514 const uschar *callpat; /* them within each of those blocks. */
515 const uschar *data; /* However, in order to accommodate the */
516 const uschar *next; /* version of this code that uses an */
517 USPTR pp; /* external "stack" implemented on the */
518 const uschar *prev; /* heap, it is easier to declare them all */
519 USPTR saved_eptr; /* here, so the declarations can be cut */
520 /* out in a block. The only declarations */
521 recursion_info new_recursive; /* within blocks below are for variables */
522 /* that do not have to be preserved over */
523 BOOL cur_is_word; /* a recursive call to RMATCH(). */
527 unsigned long int original_ims;
532 int prop_fail_result;
547 int save_capture_last;
548 int save_offset1, save_offset2, save_offset3;
549 int stacksave[REC_STACK_SAVE_MAX];
552 #endif /* NO_RECURSE */
554 /* These statements are here to stop the compiler complaining about uninitialized
559 prop_fail_result = 0;
563 /* This label is used for tail recursion, which is used in a few cases even
564 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
565 used. Thanks to Ian Taylor for noticing this possibility and sending the
570 /* OK, now we can get on with the real code of the function. Recursive calls
571 are specified by the macro RMATCH and RRETURN is used to return. When
572 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
573 and a "return", respectively (possibly with some debugging if DEBUG is
574 defined). However, RMATCH isn't like a function call because it's quite a
575 complicated macro. It has to be used in one particular way. This shouldn't,
576 however, impact performance when true recursion is being used. */
579 utf8 = md->utf8; /* Local copy of the flag */
584 /* First check that we haven't called match() too many times, or that we
585 haven't exceeded the recursive call limit. */
587 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
588 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
590 original_ims = ims; /* Save for resetting on ')' */
592 /* At the start of a group with an unlimited repeat that may match an empty
593 string, the match_cbegroup flag is set. When this is the case, add the current
594 subject pointer to the chain of such remembered pointers, to be checked when we
595 hit the closing ket, in order to break infinite loops that match no characters.
596 When match() is called in other circumstances, don't add to the chain. The
597 match_cbegroup flag must NOT be used with tail recursion, because the memory
598 block that is used is on the stack, so a new one may be required for each
601 if ((flags & match_cbegroup) != 0)
603 newptrb.epb_saved_eptr = eptr;
604 newptrb.epb_prev = eptrb;
608 /* Now start processing the opcodes. */
612 minimize = possessive = FALSE;
615 /* For partial matching, remember if we ever hit the end of the subject after
616 matching at least one subject character. */
619 eptr >= md->end_subject &&
626 RRETURN(MATCH_NOMATCH);
629 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
630 ims, eptrb, flags, RM51);
631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632 RRETURN(MATCH_PRUNE);
635 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
636 ims, eptrb, flags, RM52);
637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
638 RRETURN(MATCH_COMMIT);
641 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
642 ims, eptrb, flags, RM53);
643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
644 md->start_match_ptr = eptr; /* Pass back current position */
648 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
649 ims, eptrb, flags, RM54);
650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
653 /* Handle a capturing bracket. If there is space in the offset vector, save
654 the current subject position in the working slot at the top of the vector.
655 We mustn't change the current values of the data slot, because they may be
656 set from a previous iteration of this group, and be referred to by a
657 reference inside the group.
659 If the bracket fails to match, we need to restore this value and also the
660 values of the final offsets, in case they were set by a previous iteration
663 If there isn't enough space in the offset vector, treat this as if it were
664 a non-capturing bracket. Don't worry about setting the flag for the error
665 case here; that is handled in the code for KET. */
669 number = GET2(ecode, 1+LINK_SIZE);
670 offset = number << 1;
673 printf("start bracket %d\n", number);
675 pchars(eptr, 16, TRUE, md);
679 if (offset < md->offset_max)
681 save_offset1 = md->offset_vector[offset];
682 save_offset2 = md->offset_vector[offset+1];
683 save_offset3 = md->offset_vector[md->offset_end - number];
684 save_capture_last = md->capture_last;
686 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
687 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
689 flags = (op == OP_SCBRA)? match_cbegroup : 0;
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM1);
694 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
695 md->capture_last = save_capture_last;
696 ecode += GET(ecode, 1);
698 while (*ecode == OP_ALT);
700 DPRINTF(("bracket %d failed\n", number));
702 md->offset_vector[offset] = save_offset1;
703 md->offset_vector[offset+1] = save_offset2;
704 md->offset_vector[md->offset_end - number] = save_offset3;
706 RRETURN(MATCH_NOMATCH);
709 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
710 as a non-capturing bracket. */
712 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
713 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
715 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
717 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
718 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
720 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
721 final alternative within the brackets, we would return the result of a
722 recursive call to match() whatever happened. We can reduce stack usage by
723 turning this into a tail recursion, except in the case when match_cbegroup
728 DPRINTF(("start non-capturing bracket\n"));
729 flags = (op >= OP_SBRA)? match_cbegroup : 0;
732 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
734 if (flags == 0) /* Not a possibly empty group */
736 ecode += _pcre_OP_lengths[*ecode];
737 DPRINTF(("bracket 0 tail recursion\n"));
741 /* Possibly empty group; can't use tail recursion. */
743 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
748 /* For non-final alternatives, continue the loop for a NOMATCH result;
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
753 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
754 ecode += GET(ecode, 1);
756 /* Control never reaches here. */
758 /* Conditional group: compilation checked that there are no more than
759 two branches. If the condition is false, skipping the first branch takes us
760 past the end if there is only one branch, but that's OK because that is
761 exactly what going to the ket would do. As there is only one branch to be
762 obeyed, we can use tail recursion to avoid using another stack frame. */
766 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
768 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
769 condition = md->recursive != NULL &&
770 (offset == RREF_ANY || offset == md->recursive->group_num);
771 ecode += condition? 3 : GET(ecode, 1);
774 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
776 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
777 condition = offset < offset_top && md->offset_vector[offset] >= 0;
778 ecode += condition? 3 : GET(ecode, 1);
781 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
784 ecode += GET(ecode, 1);
787 /* The condition is an assertion. Call match() to evaluate it - setting
788 the final argument match_condassert causes it to stop at the end of an
793 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
794 match_condassert, RM3);
795 if (rrc == MATCH_MATCH)
798 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
799 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
801 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
803 RRETURN(rrc); /* Need braces because of following else */
808 ecode += GET(ecode, 1);
812 /* We are now at the branch that is to be obeyed. As there is only one,
813 we can use tail recursion to avoid using another stack frame, except when
814 match_cbegroup is required for an unlimited repeat of a possibly empty
815 group. If the second alternative doesn't exist, we can just plough on. */
817 if (condition || *ecode == OP_ALT)
819 ecode += 1 + LINK_SIZE;
820 if (op == OP_SCOND) /* Possibly empty group */
822 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
825 else /* Group must match something */
831 else /* Condition false & no 2nd alternative */
833 ecode += 1 + LINK_SIZE;
838 /* End of the pattern, either real or forced. If we are in a top-level
839 recursion, we should restore the offsets appropriately and continue from
844 if (md->recursive != NULL && md->recursive->group_num == 0)
846 recursion_info *rec = md->recursive;
847 DPRINTF(("End of pattern in a (?0) recursion\n"));
848 md->recursive = rec->prevrec;
849 memmove(md->offset_vector, rec->offset_save,
850 rec->saved_max * sizeof(int));
851 mstart = rec->save_start;
853 ecode = rec->after_call;
857 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
858 string - backtracking will then try other alternatives, if any. */
860 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
861 md->end_match_ptr = eptr; /* Record where we ended */
862 md->end_offset_top = offset_top; /* and how many extracts were taken */
863 md->start_match_ptr = mstart; /* and the start (\K can modify) */
864 RRETURN(MATCH_MATCH);
866 /* Change option settings */
871 DPRINTF(("ims set to %02lx\n", ims));
874 /* Assertion brackets. Check the alternative branches in turn - the
875 matching won't pass the KET for an assertion. If any one branch matches,
876 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
877 start of each branch to move the current point backwards, so the code at
878 this level is identical to the lookahead case. */
884 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
886 if (rrc == MATCH_MATCH) break;
887 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
888 ecode += GET(ecode, 1);
890 while (*ecode == OP_ALT);
891 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
893 /* If checking an assertion for a condition, return MATCH_MATCH. */
895 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
897 /* Continue from after the assertion, updating the offsets high water
898 mark, since extracts may have been taken during the assertion. */
900 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
901 ecode += 1 + LINK_SIZE;
902 offset_top = md->end_offset_top;
905 /* Negative assertion: all branches must fail to match */
908 case OP_ASSERTBACK_NOT:
911 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
913 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
914 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
915 ecode += GET(ecode,1);
917 while (*ecode == OP_ALT);
919 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
921 ecode += 1 + LINK_SIZE;
924 /* Move the subject pointer back. This occurs only at the start of
925 each branch of a lookbehind assertion. If we are too close to the start to
926 move back, this match function fails. When working with UTF-8 we move
927 back a number of characters, not bytes. */
937 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
944 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
947 eptr -= GET(ecode, 1);
948 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
951 /* Skip to next op code */
953 ecode += 1 + LINK_SIZE;
956 /* The callout item calls an external function, if one is provided, passing
957 details of the match so far. This is mainly for debugging, though the
958 function is able to force a failure. */
961 if (pcre_callout != NULL)
963 pcre_callout_block cb;
964 cb.version = 1; /* Version 1 of the callout block */
965 cb.callout_number = ecode[1];
966 cb.offset_vector = md->offset_vector;
967 cb.subject = (PCRE_SPTR)md->start_subject;
968 cb.subject_length = md->end_subject - md->start_subject;
969 cb.start_match = mstart - md->start_subject;
970 cb.current_position = eptr - md->start_subject;
971 cb.pattern_position = GET(ecode, 2);
972 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
973 cb.capture_top = offset_top/2;
974 cb.capture_last = md->capture_last;
975 cb.callout_data = md->callout_data;
976 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
977 if (rrc < 0) RRETURN(rrc);
979 ecode += 2 + 2*LINK_SIZE;
982 /* Recursion either matches the current regex, or some subexpression. The
983 offset data is the offset to the starting bracket from the start of the
984 whole pattern. (This is so that it works from duplicated subpatterns.)
986 If there are any capturing brackets started but not finished, we have to
987 save their starting points and reinstate them after the recursion. However,
988 we don't know how many such there are (offset_top records the completed
989 total) so we just have to save all the potential data. There may be up to
990 65535 such values, which is too large to put on the stack, but using malloc
991 for small numbers seems expensive. As a compromise, the stack is used when
992 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
993 is used. A problem is what to do if the malloc fails ... there is no way of
994 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
995 values on the stack, and accept that the rest may be wrong. (Note: the code
below does not do this; when the malloc fails it returns PCRE_ERROR_NOMEMORY.)
997 There are also other values that have to be saved. We use a chained
998 sequence of blocks that actually live on the stack. Thanks to Robin Houston
999 for the original version of this logic. */
1003 callpat = md->start_code + GET(ecode, 1);
1004 new_recursive.group_num = (callpat == md->start_code)? 0 :
1005 GET2(callpat, 1 + LINK_SIZE);
1007 /* Add to "recursing stack" */
1009 new_recursive.prevrec = md->recursive;
1010 md->recursive = &new_recursive;
1012 /* Find where to continue from afterwards */
1014 ecode += 1 + LINK_SIZE;
1015 new_recursive.after_call = ecode;
1017 /* Now save the offset data. */
1019 new_recursive.saved_max = md->offset_end;
1020 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1021 new_recursive.offset_save = stacksave;
1024 new_recursive.offset_save =
1025 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1026 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1029 memcpy(new_recursive.offset_save, md->offset_vector,
1030 new_recursive.saved_max * sizeof(int));
1031 new_recursive.save_start = mstart;
1034 /* OK, now we can do the recursion. For each top-level alternative we
1035 restore the offset and recursion data. */
1037 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1038 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1041 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1042 md, ims, eptrb, flags, RM6);
1043 if (rrc == MATCH_MATCH)
1045 DPRINTF(("Recursion matched\n"));
1046 md->recursive = new_recursive.prevrec;
1047 if (new_recursive.offset_save != stacksave)
1048 (pcre_free)(new_recursive.offset_save);
1049 RRETURN(MATCH_MATCH);
1051 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1053 DPRINTF(("Recursion gave error %d\n", rrc));
1057 md->recursive = &new_recursive;
1058 memcpy(md->offset_vector, new_recursive.offset_save,
1059 new_recursive.saved_max * sizeof(int));
1060 callpat += GET(callpat, 1);
1062 while (*callpat == OP_ALT);
1064 DPRINTF(("Recursion didn't match\n"));
1065 md->recursive = new_recursive.prevrec;
1066 if (new_recursive.offset_save != stacksave)
1067 (pcre_free)(new_recursive.offset_save);
1068 RRETURN(MATCH_NOMATCH);
1070 /* Control never reaches here */
1072 /* "Once" brackets are like assertion brackets except that after a match,
1073 the point in the subject string is not moved back. Thus there can never be
1074 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1075 Check the alternative branches in turn - the matching won't pass the KET
1076 for this kind of subpattern. If any one branch matches, we carry on as at
1077 the end of a normal bracket, leaving the subject pointer. */
1085 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1086 if (rrc == MATCH_MATCH) break;
1087 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1088 ecode += GET(ecode,1);
1090 while (*ecode == OP_ALT);
1092 /* If hit the end of the group (which could be repeated), fail */
1094 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1096 /* Continue as from after the assertion, updating the offsets high water
1097 mark, since extracts may have been taken. */
1099 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1104 /* For a non-repeating ket, just continue at this level. This also
1105 happens for a repeating ket if no characters were matched in the group.
1106 This is the forcible breaking of infinite loops as implemented in Perl
1107 5.005. If there is an options reset, it will get obeyed in the normal
1108 course of events. */
1110 if (*ecode == OP_KET || eptr == saved_eptr)
1112 ecode += 1+LINK_SIZE;
1116 /* The repeating kets try the rest of the pattern or restart from the
1117 preceding bracket, in the appropriate order. The second "call" of match()
1118 uses tail recursion, to avoid using another stack frame. We need to reset
1119 any options that changed within the bracket before re-running it, so
1120 check the next opcode. */
1122 if (ecode[1+LINK_SIZE] == OP_OPT)
1124 ims = (ims & ~PCRE_IMS) | ecode[4];
1125 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1128 if (*ecode == OP_KETRMIN)
1130 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1136 else /* OP_KETRMAX */
1138 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1139 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1140 ecode += 1 + LINK_SIZE;
1144 /* Control never gets here */
1146 /* An alternation is the end of a branch; scan along to find the end of the
1147 bracketed group and go to there. */
1150 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1153 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1154 that it may occur zero times. It may repeat infinitely, or not at all -
1155 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1156 repeat limits are compiled as a number of copies, with the optional ones
1157 preceded by BRAZERO or BRAMINZERO. */
1162 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164 do next += GET(next,1); while (*next == OP_ALT);
1165 ecode = next + 1 + LINK_SIZE;
1172 do next += GET(next, 1); while (*next == OP_ALT);
1173 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1179 /* End of a group, repeated or non-repeating. */
1184 prev = ecode - GET(ecode, 1);
1186 /* If this was a group that remembered the subject start, in order to break
1187 infinite repeats of empty string matches, retrieve the subject start from
1188 the chain. Otherwise, set it NULL. */
1190 if (*prev >= OP_SBRA)
1192 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1193 eptrb = eptrb->epb_prev; /* Backup to previous group */
1195 else saved_eptr = NULL;
1197 /* If we are at the end of an assertion group, stop matching and return
1198 MATCH_MATCH, but record the current high water mark for use by positive
1199 assertions. Do this also for the "once" (atomic) groups. */
1201 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1202 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1205 md->end_match_ptr = eptr; /* For ONCE */
1206 md->end_offset_top = offset_top;
1207 RRETURN(MATCH_MATCH);
1210 /* For capturing groups we have to check the group number back at the start
1211 and if necessary complete handling an extraction by setting the offsets and
1212 bumping the high water mark. Note that whole-pattern recursion is coded as
1213 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1214 when the OP_END is reached. Other recursion is handled here. */
1216 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1218 number = GET2(prev, 1+LINK_SIZE);
1219 offset = number << 1;
1222 printf("end bracket %d", number);
1226 md->capture_last = number;
1227 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1229 md->offset_vector[offset] =
1230 md->offset_vector[md->offset_end - number];
1231 md->offset_vector[offset+1] = eptr - md->start_subject;
1232 if (offset_top <= offset) offset_top = offset + 2;
1235 /* Handle a recursively called group. Restore the offsets
1236 appropriately and continue from after the call. */
1238 if (md->recursive != NULL && md->recursive->group_num == number)
1240 recursion_info *rec = md->recursive;
1241 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1242 md->recursive = rec->prevrec;
1243 mstart = rec->save_start;
1244 memcpy(md->offset_vector, rec->offset_save,
1245 rec->saved_max * sizeof(int));
1246 ecode = rec->after_call;
1252 /* For both capturing and non-capturing groups, reset the value of the ims
1253 flags, in case they got changed during the group. */
1256 DPRINTF(("ims reset to %02lx\n", ims));
1258 /* For a non-repeating ket, just continue at this level. This also
1259 happens for a repeating ket if no characters were matched in the group.
1260 This is the forcible breaking of infinite loops as implemented in Perl
1261 5.005. If there is an options reset, it will get obeyed in the normal
1262 course of events. */
1264 if (*ecode == OP_KET || eptr == saved_eptr)
1266 ecode += 1 + LINK_SIZE;
1270 /* The repeating kets try the rest of the pattern or restart from the
1271 preceding bracket, in the appropriate order. In the second case, we can use
1272 tail recursion to avoid using another stack frame, unless we have an
1273 unlimited repeat of a group that can match an empty string. */
1275 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1277 if (*ecode == OP_KETRMIN)
1279 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1281 if (flags != 0) /* Could match an empty string */
1283 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1289 else /* OP_KETRMAX */
1291 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1292 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1293 ecode += 1 + LINK_SIZE;
1297 /* Control never gets here */
1299 /* Start of subject unless notbol, or after internal newline if multiline */
1302 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1303 if ((ims & PCRE_MULTILINE) != 0)
1305 if (eptr != md->start_subject &&
1306 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1307 RRETURN(MATCH_NOMATCH);
1311 /* ... else fall through */
1313 /* Start of subject assertion */
1316 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1320 /* Start of match assertion */
1323 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1327 /* Reset the start of match point */
1334 /* Assert before internal newline if multiline, or before a terminating
1335 newline unless endonly is set, else end of subject unless noteol is set. */
1338 if ((ims & PCRE_MULTILINE) != 0)
1340 if (eptr < md->end_subject)
1341 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1343 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1349 if (md->noteol) RRETURN(MATCH_NOMATCH);
1352 if (eptr != md->end_subject &&
1353 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1354 RRETURN(MATCH_NOMATCH);
1359 /* ... else fall through for endonly */
1361 /* End of subject assertion (\z) */
1364 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1368 /* End of subject or ending \n assertion (\Z) */
1371 if (eptr != md->end_subject &&
1372 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1373 RRETURN(MATCH_NOMATCH);
1377 /* Word boundary assertions */
1379 case OP_NOT_WORD_BOUNDARY:
1380 case OP_WORD_BOUNDARY:
1383 /* Find out if the previous and current characters are "word" characters.
1384 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1385 be "non-word" characters. */
1390 if (eptr == md->start_subject) prev_is_word = FALSE; else
1392 const uschar *lastptr = eptr - 1;
1393 while((*lastptr & 0xc0) == 0x80) lastptr--;
1394 GETCHAR(c, lastptr);
1395 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1397 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1400 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1406 /* More streamlined when not in UTF-8 mode */
1409 prev_is_word = (eptr != md->start_subject) &&
1410 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1411 cur_is_word = (eptr < md->end_subject) &&
1412 ((md->ctypes[*eptr] & ctype_word) != 0);
1415 /* Now see if the situation is what we want */
1417 if ((*ecode++ == OP_WORD_BOUNDARY)?
1418 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1419 RRETURN(MATCH_NOMATCH);
1423 /* Match a single character type; inline for speed */
1426 if ((ims & PCRE_DOTALL) == 0)
1428 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1430 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1432 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1436 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1437 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1440 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1445 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446 GETCHARINCTEST(c, eptr);
1451 (md->ctypes[c] & ctype_digit) != 0
1453 RRETURN(MATCH_NOMATCH);
1458 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1459 GETCHARINCTEST(c, eptr);
1464 (md->ctypes[c] & ctype_digit) == 0
1466 RRETURN(MATCH_NOMATCH);
1470 case OP_NOT_WHITESPACE:
1471 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472 GETCHARINCTEST(c, eptr);
1477 (md->ctypes[c] & ctype_space) != 0
1479 RRETURN(MATCH_NOMATCH);
1484 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1485 GETCHARINCTEST(c, eptr);
1490 (md->ctypes[c] & ctype_space) == 0
1492 RRETURN(MATCH_NOMATCH);
1496 case OP_NOT_WORDCHAR:
1497 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1498 GETCHARINCTEST(c, eptr);
1503 (md->ctypes[c] & ctype_word) != 0
1505 RRETURN(MATCH_NOMATCH);
1510 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1511 GETCHARINCTEST(c, eptr);
1516 (md->ctypes[c] & ctype_word) == 0
1518 RRETURN(MATCH_NOMATCH);
1523 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524 GETCHARINCTEST(c, eptr);
1527 default: RRETURN(MATCH_NOMATCH);
1529 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1540 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1547 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1548 GETCHARINCTEST(c, eptr);
1553 case 0x20: /* SPACE */
1554 case 0xa0: /* NBSP */
1555 case 0x1680: /* OGHAM SPACE MARK */
1556 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1557 case 0x2000: /* EN QUAD */
1558 case 0x2001: /* EM QUAD */
1559 case 0x2002: /* EN SPACE */
1560 case 0x2003: /* EM SPACE */
1561 case 0x2004: /* THREE-PER-EM SPACE */
1562 case 0x2005: /* FOUR-PER-EM SPACE */
1563 case 0x2006: /* SIX-PER-EM SPACE */
1564 case 0x2007: /* FIGURE SPACE */
1565 case 0x2008: /* PUNCTUATION SPACE */
1566 case 0x2009: /* THIN SPACE */
1567 case 0x200A: /* HAIR SPACE */
1568 case 0x202f: /* NARROW NO-BREAK SPACE */
1569 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1570 case 0x3000: /* IDEOGRAPHIC SPACE */
1571 RRETURN(MATCH_NOMATCH);
1577 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1578 GETCHARINCTEST(c, eptr);
1581 default: RRETURN(MATCH_NOMATCH);
1583 case 0x20: /* SPACE */
1584 case 0xa0: /* NBSP */
1585 case 0x1680: /* OGHAM SPACE MARK */
1586 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1587 case 0x2000: /* EN QUAD */
1588 case 0x2001: /* EM QUAD */
1589 case 0x2002: /* EN SPACE */
1590 case 0x2003: /* EM SPACE */
1591 case 0x2004: /* THREE-PER-EM SPACE */
1592 case 0x2005: /* FOUR-PER-EM SPACE */
1593 case 0x2006: /* SIX-PER-EM SPACE */
1594 case 0x2007: /* FIGURE SPACE */
1595 case 0x2008: /* PUNCTUATION SPACE */
1596 case 0x2009: /* THIN SPACE */
1597 case 0x200A: /* HAIR SPACE */
1598 case 0x202f: /* NARROW NO-BREAK SPACE */
1599 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1600 case 0x3000: /* IDEOGRAPHIC SPACE */
1607 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1608 GETCHARINCTEST(c, eptr);
1616 case 0x85: /* NEL */
1617 case 0x2028: /* LINE SEPARATOR */
1618 case 0x2029: /* PARAGRAPH SEPARATOR */
1619 RRETURN(MATCH_NOMATCH);
1625 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1626 GETCHARINCTEST(c, eptr);
1629 default: RRETURN(MATCH_NOMATCH);
1634 case 0x85: /* NEL */
1635 case 0x2028: /* LINE SEPARATOR */
1636 case 0x2029: /* PARAGRAPH SEPARATOR */
1643 /* Check the next character by Unicode property. We will get here only
1644 if the support is in the binary; otherwise a compile-time error occurs. */
1648 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649 GETCHARINCTEST(c, eptr);
1651 int chartype, script;
1652 int category = _pcre_ucp_findprop(c, &chartype, &script);
1657 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1661 if ((chartype == ucp_Lu ||
1662 chartype == ucp_Ll ||
1663 chartype == ucp_Lt) == (op == OP_NOTPROP))
1664 RRETURN(MATCH_NOMATCH);
1668 if ((ecode[2] != category) == (op == OP_PROP))
1669 RRETURN(MATCH_NOMATCH);
1673 if ((ecode[2] != chartype) == (op == OP_PROP))
1674 RRETURN(MATCH_NOMATCH);
1678 if ((ecode[2] != script) == (op == OP_PROP))
1679 RRETURN(MATCH_NOMATCH);
1683 RRETURN(PCRE_ERROR_INTERNAL);
1690 /* Match an extended Unicode sequence. We will get here only if the support
1691 is in the binary; otherwise a compile-time error occurs. */
1694 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1695 GETCHARINCTEST(c, eptr);
1697 int chartype, script;
1698 int category = _pcre_ucp_findprop(c, &chartype, &script);
1699 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1700 while (eptr < md->end_subject)
1703 if (!utf8) c = *eptr; else
1705 GETCHARLEN(c, eptr, len);
1707 category = _pcre_ucp_findprop(c, &chartype, &script);
1708 if (category != ucp_M) break;
1717 /* Match a back reference, possibly repeatedly. Look past the end of the
1718 item to see if there is repeat information following. The code is similar
1719 to that for character classes, but repeated for efficiency. Then obey
1720 similar code to character type repeats - written out again for speed.
1721 However, if the referenced string is the empty string, always treat
1722 it as matched, any number of times (otherwise there could be infinite
1727 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1728 ecode += 3; /* Advance past item */
1730 /* If the reference is unset, set the length to be longer than the amount
1731 of subject left; this ensures that every attempt at a match fails. We
1732 can't just fail here, because of the possibility of quantifiers with zero
1735 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1736 md->end_subject - eptr + 1 :
1737 md->offset_vector[offset+1] - md->offset_vector[offset];
1739 /* Set up for repetition, or handle the non-repeated case */
1749 c = *ecode++ - OP_CRSTAR;
1750 minimize = (c & 1) != 0;
1751 min = rep_min[c]; /* Pick up values from tables; */
1752 max = rep_max[c]; /* zero for max => infinity */
1753 if (max == 0) max = INT_MAX;
1758 minimize = (*ecode == OP_CRMINRANGE);
1759 min = GET2(ecode, 1);
1760 max = GET2(ecode, 3);
1761 if (max == 0) max = INT_MAX;
1765 default: /* No repeat follows */
1766 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1768 continue; /* With the main loop */
1771 /* If the length of the reference is zero, just continue with the
1774 if (length == 0) continue;
1776 /* First, ensure the minimum number of matches are present. We get back
1777 the length of the reference string explicitly rather than passing the
1778 address of eptr, so that eptr can be a register variable. */
1780 for (i = 1; i <= min; i++)
1782 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1786 /* If min = max, continue at the same level without recursion.
1787 They are not both allowed to be zero. */
1789 if (min == max) continue;
1791 /* If minimizing, keep trying and advancing the pointer */
1795 for (fi = min;; fi++)
1797 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1799 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1800 RRETURN(MATCH_NOMATCH);
1803 /* Control never gets here */
1806 /* If maximizing, find the longest string and work backwards */
1811 for (i = min; i < max; i++)
1813 if (!match_ref(offset, eptr, length, md, ims)) break;
1818 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1822 RRETURN(MATCH_NOMATCH);
1825 /* Control never gets here */
1829 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1830 used when all the characters in the class have values in the range 0-255,
1831 and either the matching is caseful, or the characters are in the range
1832 0-127 when UTF-8 processing is enabled. The only difference between
1833 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1836 First, look past the end of the item to see if there is repeat information
1837 following. Then obey similar code to character type repeats - written out
1843 data = ecode + 1; /* Save for matching */
1844 ecode += 33; /* Advance past the item */
1854 c = *ecode++ - OP_CRSTAR;
1855 minimize = (c & 1) != 0;
1856 min = rep_min[c]; /* Pick up values from tables; */
1857 max = rep_max[c]; /* zero for max => infinity */
1858 if (max == 0) max = INT_MAX;
1863 minimize = (*ecode == OP_CRMINRANGE);
1864 min = GET2(ecode, 1);
1865 max = GET2(ecode, 3);
1866 if (max == 0) max = INT_MAX;
1870 default: /* No repeat follows */
1875 /* First, ensure the minimum number of matches are present. */
1881 for (i = 1; i <= min; i++)
1883 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1884 GETCHARINC(c, eptr);
1887 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1891 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1897 /* Not UTF-8 mode */
1899 for (i = 1; i <= min; i++)
1901 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1903 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1907 /* If max == min we can continue with the main loop without the
1910 if (min == max) continue;
1912 /* If minimizing, keep testing the rest of the expression and advancing
1913 the pointer while it matches the class. */
1921 for (fi = min;; fi++)
1923 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1925 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1926 GETCHARINC(c, eptr);
1929 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1933 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1939 /* Not UTF-8 mode */
1941 for (fi = min;; fi++)
1943 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1947 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1950 /* Control never gets here */
1953 /* If maximizing, find the longest possible run, then work backwards. */
1963 for (i = min; i < max; i++)
1966 if (eptr >= md->end_subject) break;
1967 GETCHARLEN(c, eptr, len);
1970 if (op == OP_CLASS) break;
1974 if ((data[c/8] & (1 << (c&7))) == 0) break;
1980 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1982 if (eptr-- == pp) break; /* Stop if tried at original pos */
1988 /* Not UTF-8 mode */
1990 for (i = min; i < max; i++)
1992 if (eptr >= md->end_subject) break;
1994 if ((data[c/8] & (1 << (c&7))) == 0) break;
1999 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2005 RRETURN(MATCH_NOMATCH);
2008 /* Control never gets here */
2011 /* Match an extended character class. This opcode is encountered only
2012 in UTF-8 mode, because that's the only time it is compiled. */
2017 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2018 ecode += GET(ecode, 1); /* Advance past the item */
2028 c = *ecode++ - OP_CRSTAR;
2029 minimize = (c & 1) != 0;
2030 min = rep_min[c]; /* Pick up values from tables; */
2031 max = rep_max[c]; /* zero for max => infinity */
2032 if (max == 0) max = INT_MAX;
2037 minimize = (*ecode == OP_CRMINRANGE);
2038 min = GET2(ecode, 1);
2039 max = GET2(ecode, 3);
2040 if (max == 0) max = INT_MAX;
2044 default: /* No repeat follows */
2049 /* First, ensure the minimum number of matches are present. */
2051 for (i = 1; i <= min; i++)
2053 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2054 GETCHARINC(c, eptr);
2055 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2058 /* If max == min we can continue with the main loop without the
2061 if (min == max) continue;
2063 /* If minimizing, keep testing the rest of the expression and advancing
2064 the pointer while it matches the class. */
2068 for (fi = min;; fi++)
2070 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2072 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2073 GETCHARINC(c, eptr);
2074 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2076 /* Control never gets here */
2079 /* If maximizing, find the longest possible run, then work backwards. */
2084 for (i = min; i < max; i++)
2087 if (eptr >= md->end_subject) break;
2088 GETCHARLEN(c, eptr, len);
2089 if (!_pcre_xclass(c, data)) break;
2094 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2096 if (eptr-- == pp) break; /* Stop if tried at original pos */
2097 if (utf8) BACKCHAR(eptr);
2099 RRETURN(MATCH_NOMATCH);
2102 /* Control never gets here */
2104 #endif /* End of XCLASS */
2106 /* Match a single character, casefully */
2114 GETCHARLEN(fc, ecode, length);
2115 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2116 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2121 /* Non-UTF-8 mode */
2123 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2124 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2129 /* Match a single character, caselessly */
2137 GETCHARLEN(fc, ecode, length);
2139 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2141 /* If the pattern character's value is < 128, we have only one byte, and
2142 can use the fast lookup table. */
2146 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2149 /* Otherwise we must pick up the subject character */
2154 GETCHARINC(dc, eptr);
2157 /* If we have Unicode property support, we can use it to test the other
2158 case of the character, if there is one. */
2163 if (dc != _pcre_ucp_othercase(fc))
2165 RRETURN(MATCH_NOMATCH);
2170 #endif /* SUPPORT_UTF8 */
2172 /* Non-UTF-8 mode */
2174 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2175 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2180 /* Match a single character repeatedly. */
2183 min = max = GET2(ecode, 1);
2194 max = GET2(ecode, 1);
2195 minimize = *ecode == OP_MINUPTO;
2226 c = *ecode++ - OP_STAR;
2227 minimize = (c & 1) != 0;
2228 min = rep_min[c]; /* Pick up values from tables; */
2229 max = rep_max[c]; /* zero for max => infinity */
2230 if (max == 0) max = INT_MAX;
2232 /* Common code for all repeated single-character matches. We can give
2233 up quickly if there are fewer than the minimum number of characters left in
2242 GETCHARLEN(fc, ecode, length);
2243 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2246 /* Handle multibyte character matching specially here. There is
2247 support for caseless matching if UCP support is present. */
2252 unsigned int othercase;
2253 if ((ims & PCRE_CASELESS) != 0 &&
2254 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2255 oclength = _pcre_ord2utf8(othercase, occhars);
2257 #endif /* SUPPORT_UCP */
2259 for (i = 1; i <= min; i++)
2261 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2263 /* Need braces because of following else */
2264 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2267 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2270 #else /* without SUPPORT_UCP */
2271 else { RRETURN(MATCH_NOMATCH); }
2272 #endif /* SUPPORT_UCP */
2275 if (min == max) continue;
2279 for (fi = min;; fi++)
2281 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2283 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2284 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2286 /* Need braces because of following else */
2287 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2290 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2293 #else /* without SUPPORT_UCP */
2294 else { RRETURN (MATCH_NOMATCH); }
2295 #endif /* SUPPORT_UCP */
2297 /* Control never gets here */
2303 for (i = min; i < max; i++)
2305 if (eptr > md->end_subject - length) break;
2306 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2308 else if (oclength == 0) break;
2311 if (memcmp(eptr, occhars, oclength) != 0) break;
2314 #else /* without SUPPORT_UCP */
2316 #endif /* SUPPORT_UCP */
2319 if (possessive) continue;
2322 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2323 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2324 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2328 #else /* without SUPPORT_UCP */
2330 #endif /* SUPPORT_UCP */
2333 /* Control never gets here */
2336 /* If the length of a UTF-8 character is 1, we fall through here, and
2337 obey the code as for non-UTF-8 characters below, though in this case the
2338 value of fc will always be < 128. */
2341 #endif /* SUPPORT_UTF8 */
2343 /* When not in UTF-8 mode, load a single-byte character. */
2345 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2349 /* The value of fc at this point is always less than 256, though we may or
2350 may not be in UTF-8 mode. The code is duplicated for the caseless and
2351 caseful cases, for speed, since matching characters is likely to be quite
2352 common. First, ensure the minimum number of matches are present. If min =
2353 max, continue at the same level without recursing. Otherwise, if
2354 minimizing, keep trying the rest of the expression and advancing one
2355 matching character if failing, up to the maximum. Alternatively, if
2356 maximizing, find the maximum number of characters and work backwards. */
2358 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2361 if ((ims & PCRE_CASELESS) != 0)
2364 for (i = 1; i <= min; i++)
2365 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2366 if (min == max) continue;
2369 for (fi = min;; fi++)
2371 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2372 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2373 if (fi >= max || eptr >= md->end_subject ||
2374 fc != md->lcc[*eptr++])
2375 RRETURN(MATCH_NOMATCH);
2377 /* Control never gets here */
2382 for (i = min; i < max; i++)
2384 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2387 if (possessive) continue;
2390 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2394 RRETURN(MATCH_NOMATCH);
2396 /* Control never gets here */
2399 /* Caseful comparisons (includes all multi-byte characters) */
2403 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2404 if (min == max) continue;
2407 for (fi = min;; fi++)
2409 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2412 RRETURN(MATCH_NOMATCH);
2414 /* Control never gets here */
2419 for (i = min; i < max; i++)
2421 if (eptr >= md->end_subject || fc != *eptr) break;
2424 if (possessive) continue;
2427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431 RRETURN(MATCH_NOMATCH);
2434 /* Control never gets here */
2436 /* Match a negated single one-byte character. The character we are
2437 checking can be multibyte. */
2440 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2442 GETCHARINCTEST(c, eptr);
2443 if ((ims & PCRE_CASELESS) != 0)
2449 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2453 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2457 /* Match a negated single one-byte character repeatedly. This is almost a
2458 repeat of the code for a repeated single character, but I haven't found a
2459 nice way of commoning these up that doesn't require a test of the
2460 positive/negative option for each character match. Maybe that wouldn't add
2461 very much to the time taken, but character matching *is* what this is all
2465 min = max = GET2(ecode, 1);
2472 max = GET2(ecode, 1);
2473 minimize = *ecode == OP_NOTMINUPTO;
2491 case OP_NOTPOSQUERY:
2501 max = GET2(ecode, 1);
2510 case OP_NOTMINQUERY:
2511 c = *ecode++ - OP_NOTSTAR;
2512 minimize = (c & 1) != 0;
2513 min = rep_min[c]; /* Pick up values from tables; */
2514 max = rep_max[c]; /* zero for max => infinity */
2515 if (max == 0) max = INT_MAX;
2517 /* Common code for all repeated single-byte matches. We can give up quickly
2518 if there are fewer than the minimum number of bytes left in the
2522 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2525 /* The code is duplicated for the caseless and caseful cases, for speed,
2526 since matching characters is likely to be quite common. First, ensure the
2527 minimum number of matches are present. If min = max, continue at the same
2528 level without recursing. Otherwise, if minimizing, keep trying the rest of
2529 the expression and advancing one matching character if failing, up to the
2530 maximum. Alternatively, if maximizing, find the maximum number of
2531 characters and work backwards. */
2533 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2536 if ((ims & PCRE_CASELESS) != 0)
2544 register unsigned int d;
2545 for (i = 1; i <= min; i++)
2547 GETCHARINC(d, eptr);
2548 if (d < 256) d = md->lcc[d];
2549 if (fc == d) RRETURN(MATCH_NOMATCH);
2555 /* Not UTF-8 mode */
2557 for (i = 1; i <= min; i++)
2558 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2561 if (min == max) continue;
2569 register unsigned int d;
2570 for (fi = min;; fi++)
2572 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2574 GETCHARINC(d, eptr);
2575 if (d < 256) d = md->lcc[d];
2576 if (fi >= max || eptr >= md->end_subject || fc == d)
2577 RRETURN(MATCH_NOMATCH);
2582 /* Not UTF-8 mode */
2584 for (fi = min;; fi++)
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2589 RRETURN(MATCH_NOMATCH);
2592 /* Control never gets here */
2605 register unsigned int d;
2606 for (i = min; i < max; i++)
2609 if (eptr >= md->end_subject) break;
2610 GETCHARLEN(d, eptr, len);
2611 if (d < 256) d = md->lcc[d];
2615 if (possessive) continue;
2618 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2620 if (eptr-- == pp) break; /* Stop if tried at original pos */
2626 /* Not UTF-8 mode */
2628 for (i = min; i < max; i++)
2630 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2633 if (possessive) continue;
2636 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2642 RRETURN(MATCH_NOMATCH);
2644 /* Control never gets here */
2647 /* Caseful comparisons */
2655 register unsigned int d;
2656 for (i = 1; i <= min; i++)
2658 GETCHARINC(d, eptr);
2659 if (fc == d) RRETURN(MATCH_NOMATCH);
2664 /* Not UTF-8 mode */
2666 for (i = 1; i <= min; i++)
2667 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2670 if (min == max) continue;
2678 register unsigned int d;
2679 for (fi = min;; fi++)
2681 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2683 GETCHARINC(d, eptr);
2684 if (fi >= max || eptr >= md->end_subject || fc == d)
2685 RRETURN(MATCH_NOMATCH);
2690 /* Not UTF-8 mode */
2692 for (fi = min;; fi++)
2694 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2696 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2697 RRETURN(MATCH_NOMATCH);
2700 /* Control never gets here */
2713 register unsigned int d;
2714 for (i = min; i < max; i++)
2717 if (eptr >= md->end_subject) break;
2718 GETCHARLEN(d, eptr, len);
2722 if (possessive) continue;
2725 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2727 if (eptr-- == pp) break; /* Stop if tried at original pos */
2733 /* Not UTF-8 mode */
2735 for (i = min; i < max; i++)
2737 if (eptr >= md->end_subject || fc == *eptr) break;
2740 if (possessive) continue;
2743 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2749 RRETURN(MATCH_NOMATCH);
2752 /* Control never gets here */
2754 /* Match a single character type repeatedly; several different opcodes
2755 share code. This is very similar to the code for single characters, but we
2756 repeat it in the interests of efficiency. */
2759 min = max = GET2(ecode, 1);
2765 case OP_TYPEMINUPTO:
2767 max = GET2(ecode, 1);
2768 minimize = *ecode == OP_TYPEMINUPTO;
2772 case OP_TYPEPOSSTAR:
2779 case OP_TYPEPOSPLUS:
2786 case OP_TYPEPOSQUERY:
2793 case OP_TYPEPOSUPTO:
2796 max = GET2(ecode, 1);
2801 case OP_TYPEMINSTAR:
2803 case OP_TYPEMINPLUS:
2805 case OP_TYPEMINQUERY:
2806 c = *ecode++ - OP_TYPESTAR;
2807 minimize = (c & 1) != 0;
2808 min = rep_min[c]; /* Pick up values from tables; */
2809 max = rep_max[c]; /* zero for max => infinity */
2810 if (max == 0) max = INT_MAX;
2812 /* Common code for all repeated single character type matches. Note that
2813 in UTF-8 mode, '.' matches a character of any length, but for the other
2814 character types, the valid characters are all one-byte long. */
2817 ctype = *ecode++; /* Code for the character type */
2820 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2822 prop_fail_result = ctype == OP_NOTPROP;
2823 prop_type = *ecode++;
2824 prop_value = *ecode++;
2826 else prop_type = -1;
2829 /* First, ensure the minimum number of matches are present. Use inline
2830 code for maximizing the speed, and do the type test once at the start
2831 (i.e. keep it out of the loop). Also we can test that there are at least
2832 the minimum number of bytes before we start. This isn't as effective in
2833 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2834 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2835 and single-bytes. */
2837 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2846 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2847 for (i = 1; i <= min; i++)
2849 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2850 GETCHARINCTEST(c, eptr);
2855 for (i = 1; i <= min; i++)
2857 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2858 GETCHARINCTEST(c, eptr);
2859 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2860 if ((prop_chartype == ucp_Lu ||
2861 prop_chartype == ucp_Ll ||
2862 prop_chartype == ucp_Lt) == prop_fail_result)
2863 RRETURN(MATCH_NOMATCH);
2868 for (i = 1; i <= min; i++)
2870 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2871 GETCHARINCTEST(c, eptr);
2872 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2873 if ((prop_category == prop_value) == prop_fail_result)
2874 RRETURN(MATCH_NOMATCH);
2879 for (i = 1; i <= min; i++)
2881 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2882 GETCHARINCTEST(c, eptr);
2883 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2884 if ((prop_chartype == prop_value) == prop_fail_result)
2885 RRETURN(MATCH_NOMATCH);
2890 for (i = 1; i <= min; i++)
2892 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2893 GETCHARINCTEST(c, eptr);
2894 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2895 if ((prop_script == prop_value) == prop_fail_result)
2896 RRETURN(MATCH_NOMATCH);
2901 RRETURN(PCRE_ERROR_INTERNAL);
2905 /* Match extended Unicode sequences. We will get here only if the
2906 support is in the binary; otherwise a compile-time error occurs. */
2908 else if (ctype == OP_EXTUNI)
2910 for (i = 1; i <= min; i++)
2912 GETCHARINCTEST(c, eptr);
2913 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2914 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2915 while (eptr < md->end_subject)
2918 if (!utf8) c = *eptr; else
2920 GETCHARLEN(c, eptr, len);
2922 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2923 if (prop_category != ucp_M) break;
2930 #endif /* SUPPORT_UCP */
2932 /* Handle all other cases when the coding is UTF-8 */
2935 if (utf8) switch(ctype)
2938 for (i = 1; i <= min; i++)
2940 if (eptr >= md->end_subject ||
2941 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2942 RRETURN(MATCH_NOMATCH);
2944 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2953 for (i = 1; i <= min; i++)
2955 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2956 GETCHARINC(c, eptr);
2959 default: RRETURN(MATCH_NOMATCH);
2961 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2972 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2979 for (i = 1; i <= min; i++)
2981 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2982 GETCHARINC(c, eptr);
2987 case 0x20: /* SPACE */
2988 case 0xa0: /* NBSP */
2989 case 0x1680: /* OGHAM SPACE MARK */
2990 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2991 case 0x2000: /* EN QUAD */
2992 case 0x2001: /* EM QUAD */
2993 case 0x2002: /* EN SPACE */
2994 case 0x2003: /* EM SPACE */
2995 case 0x2004: /* THREE-PER-EM SPACE */
2996 case 0x2005: /* FOUR-PER-EM SPACE */
2997 case 0x2006: /* SIX-PER-EM SPACE */
2998 case 0x2007: /* FIGURE SPACE */
2999 case 0x2008: /* PUNCTUATION SPACE */
3000 case 0x2009: /* THIN SPACE */
3001 case 0x200A: /* HAIR SPACE */
3002 case 0x202f: /* NARROW NO-BREAK SPACE */
3003 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3004 case 0x3000: /* IDEOGRAPHIC SPACE */
3005 RRETURN(MATCH_NOMATCH);
3011 for (i = 1; i <= min; i++)
3013 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3014 GETCHARINC(c, eptr);
3017 default: RRETURN(MATCH_NOMATCH);
3019 case 0x20: /* SPACE */
3020 case 0xa0: /* NBSP */
3021 case 0x1680: /* OGHAM SPACE MARK */
3022 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3023 case 0x2000: /* EN QUAD */
3024 case 0x2001: /* EM QUAD */
3025 case 0x2002: /* EN SPACE */
3026 case 0x2003: /* EM SPACE */
3027 case 0x2004: /* THREE-PER-EM SPACE */
3028 case 0x2005: /* FOUR-PER-EM SPACE */
3029 case 0x2006: /* SIX-PER-EM SPACE */
3030 case 0x2007: /* FIGURE SPACE */
3031 case 0x2008: /* PUNCTUATION SPACE */
3032 case 0x2009: /* THIN SPACE */
3033 case 0x200A: /* HAIR SPACE */
3034 case 0x202f: /* NARROW NO-BREAK SPACE */
3035 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3036 case 0x3000: /* IDEOGRAPHIC SPACE */
3043 for (i = 1; i <= min; i++)
3045 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3046 GETCHARINC(c, eptr);
3054 case 0x85: /* NEL */
3055 case 0x2028: /* LINE SEPARATOR */
3056 case 0x2029: /* PARAGRAPH SEPARATOR */
3057 RRETURN(MATCH_NOMATCH);
3063 for (i = 1; i <= min; i++)
3065 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3066 GETCHARINC(c, eptr);
3069 default: RRETURN(MATCH_NOMATCH);
3074 case 0x85: /* NEL */
3075 case 0x2028: /* LINE SEPARATOR */
3076 case 0x2029: /* PARAGRAPH SEPARATOR */
3083 for (i = 1; i <= min; i++)
3085 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3086 GETCHARINC(c, eptr);
3087 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3088 RRETURN(MATCH_NOMATCH);
3093 for (i = 1; i <= min; i++)
3095 if (eptr >= md->end_subject ||
3096 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3097 RRETURN(MATCH_NOMATCH);
3098 /* No need to skip more bytes - we know it's a 1-byte character */
3102 case OP_NOT_WHITESPACE:
3103 for (i = 1; i <= min; i++)
3105 if (eptr >= md->end_subject ||
3106 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3107 RRETURN(MATCH_NOMATCH);
3108 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3113 for (i = 1; i <= min; i++)
3115 if (eptr >= md->end_subject ||
3116 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3117 RRETURN(MATCH_NOMATCH);
3118 /* No need to skip more bytes - we know it's a 1-byte character */
3122 case OP_NOT_WORDCHAR:
3123 for (i = 1; i <= min; i++)
3125 if (eptr >= md->end_subject ||
3126 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3127 RRETURN(MATCH_NOMATCH);
3128 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3133 for (i = 1; i <= min; i++)
3135 if (eptr >= md->end_subject ||
3136 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3137 RRETURN(MATCH_NOMATCH);
3138 /* No need to skip more bytes - we know it's a 1-byte character */
3143 RRETURN(PCRE_ERROR_INTERNAL);
3144 } /* End switch(ctype) */
3147 #endif /* SUPPORT_UTF8 */
3149 /* Code for the non-UTF-8 case for minimum matching of operators other
3150 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3151 number of bytes present, as this was tested above. */
3156 if ((ims & PCRE_DOTALL) == 0)
3158 for (i = 1; i <= min; i++)
3160 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3171 /* Because of the CRLF case, we can't assume the minimum number of
3172 bytes are present in this case. */
3175 for (i = 1; i <= min; i++)
3177 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3180 default: RRETURN(MATCH_NOMATCH);
3182 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3190 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3197 for (i = 1; i <= min; i++)
3199 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3204 case 0x20: /* SPACE */
3205 case 0xa0: /* NBSP */
3206 RRETURN(MATCH_NOMATCH);
3212 for (i = 1; i <= min; i++)
3214 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3217 default: RRETURN(MATCH_NOMATCH);
3219 case 0x20: /* SPACE */
3220 case 0xa0: /* NBSP */
3227 for (i = 1; i <= min; i++)
3229 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3237 case 0x85: /* NEL */
3238 RRETURN(MATCH_NOMATCH);
3244 for (i = 1; i <= min; i++)
3246 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3249 default: RRETURN(MATCH_NOMATCH);
3254 case 0x85: /* NEL */
3261 for (i = 1; i <= min; i++)
3262 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3266 for (i = 1; i <= min; i++)
3267 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3270 case OP_NOT_WHITESPACE:
3271 for (i = 1; i <= min; i++)
3272 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3276 for (i = 1; i <= min; i++)
3277 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3280 case OP_NOT_WORDCHAR:
3281 for (i = 1; i <= min; i++)
3282 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3283 RRETURN(MATCH_NOMATCH);
3287 for (i = 1; i <= min; i++)
3288 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3289 RRETURN(MATCH_NOMATCH);
3293 RRETURN(PCRE_ERROR_INTERNAL);
3297 /* If min = max, continue at the same level without recursing */
3299 if (min == max) continue;
3301 /* If minimizing, we have to test the rest of the pattern before each
3302 subsequent match. Again, separate the UTF-8 case for speed, and also
3303 separate the UCP cases. */
3313 for (fi = min;; fi++)
3315 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3317 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318 GETCHARINC(c, eptr);
3319 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3321 /* Control never gets here */
3324 for (fi = min;; fi++)
3326 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3327 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3328 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3329 GETCHARINC(c, eptr);
3330 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3331 if ((prop_chartype == ucp_Lu ||
3332 prop_chartype == ucp_Ll ||
3333 prop_chartype == ucp_Lt) == prop_fail_result)
3334 RRETURN(MATCH_NOMATCH);
3336 /* Control never gets here */
3339 for (fi = min;; fi++)
3341 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3343 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3344 GETCHARINC(c, eptr);
3345 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3346 if ((prop_category == prop_value) == prop_fail_result)
3347 RRETURN(MATCH_NOMATCH);
3349 /* Control never gets here */
3352 for (fi = min;; fi++)
3354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3357 GETCHARINC(c, eptr);
3358 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3359 if ((prop_chartype == prop_value) == prop_fail_result)
3360 RRETURN(MATCH_NOMATCH);
3362 /* Control never gets here */
3365 for (fi = min;; fi++)
3367 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3370 GETCHARINC(c, eptr);
3371 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3372 if ((prop_script == prop_value) == prop_fail_result)
3373 RRETURN(MATCH_NOMATCH);
3375 /* Control never gets here */
3378 RRETURN(PCRE_ERROR_INTERNAL);
3382 /* Match extended Unicode sequences. We will get here only if the
3383 support is in the binary; otherwise a compile-time error occurs. */
3385 else if (ctype == OP_EXTUNI)
3387 for (fi = min;; fi++)
3389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3392 GETCHARINCTEST(c, eptr);
3393 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3394 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3395 while (eptr < md->end_subject)
3398 if (!utf8) c = *eptr; else
3400 GETCHARLEN(c, eptr, len);
3402 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3403 if (prop_category != ucp_M) break;
3410 #endif /* SUPPORT_UCP */
3416 for (fi = min;; fi++)
3418 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3419 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3420 if (fi >= max || eptr >= md->end_subject ||
3421 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3423 RRETURN(MATCH_NOMATCH);
3425 GETCHARINC(c, eptr);
3428 case OP_ANY: /* This is the DOTALL case */
3437 default: RRETURN(MATCH_NOMATCH);
3439 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3449 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3459 case 0x20: /* SPACE */
3460 case 0xa0: /* NBSP */
3461 case 0x1680: /* OGHAM SPACE MARK */
3462 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3463 case 0x2000: /* EN QUAD */
3464 case 0x2001: /* EM QUAD */
3465 case 0x2002: /* EN SPACE */
3466 case 0x2003: /* EM SPACE */
3467 case 0x2004: /* THREE-PER-EM SPACE */
3468 case 0x2005: /* FOUR-PER-EM SPACE */
3469 case 0x2006: /* SIX-PER-EM SPACE */
3470 case 0x2007: /* FIGURE SPACE */
3471 case 0x2008: /* PUNCTUATION SPACE */
3472 case 0x2009: /* THIN SPACE */
3473 case 0x200A: /* HAIR SPACE */
3474 case 0x202f: /* NARROW NO-BREAK SPACE */
3475 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3476 case 0x3000: /* IDEOGRAPHIC SPACE */
3477 RRETURN(MATCH_NOMATCH);
3484 default: RRETURN(MATCH_NOMATCH);
3486 case 0x20: /* SPACE */
3487 case 0xa0: /* NBSP */
3488 case 0x1680: /* OGHAM SPACE MARK */
3489 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3490 case 0x2000: /* EN QUAD */
3491 case 0x2001: /* EM QUAD */
3492 case 0x2002: /* EN SPACE */
3493 case 0x2003: /* EM SPACE */
3494 case 0x2004: /* THREE-PER-EM SPACE */
3495 case 0x2005: /* FOUR-PER-EM SPACE */
3496 case 0x2006: /* SIX-PER-EM SPACE */
3497 case 0x2007: /* FIGURE SPACE */
3498 case 0x2008: /* PUNCTUATION SPACE */
3499 case 0x2009: /* THIN SPACE */
3500 case 0x200A: /* HAIR SPACE */
3501 case 0x202f: /* NARROW NO-BREAK SPACE */
3502 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3503 case 0x3000: /* IDEOGRAPHIC SPACE */
3516 case 0x85: /* NEL */
3517 case 0x2028: /* LINE SEPARATOR */
3518 case 0x2029: /* PARAGRAPH SEPARATOR */
3519 RRETURN(MATCH_NOMATCH);
3526 default: RRETURN(MATCH_NOMATCH);
3531 case 0x85: /* NEL */
3532 case 0x2028: /* LINE SEPARATOR */
3533 case 0x2029: /* PARAGRAPH SEPARATOR */
3539 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3540 RRETURN(MATCH_NOMATCH);
3544 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3545 RRETURN(MATCH_NOMATCH);
3548 case OP_NOT_WHITESPACE:
3549 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3550 RRETURN(MATCH_NOMATCH);
3554 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3555 RRETURN(MATCH_NOMATCH);
3558 case OP_NOT_WORDCHAR:
3559 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3560 RRETURN(MATCH_NOMATCH);
3564 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3565 RRETURN(MATCH_NOMATCH);
3569 RRETURN(PCRE_ERROR_INTERNAL);
3575 /* Not UTF-8 mode */
3577 for (fi = min;; fi++)
3579 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3580 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3581 if (fi >= max || eptr >= md->end_subject ||
3582 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3583 RRETURN(MATCH_NOMATCH);
3588 case OP_ANY: /* This is the DOTALL case */
3597 default: RRETURN(MATCH_NOMATCH);
3599 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3608 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3618 case 0x20: /* SPACE */
3619 case 0xa0: /* NBSP */
3620 RRETURN(MATCH_NOMATCH);
3627 default: RRETURN(MATCH_NOMATCH);
3629 case 0x20: /* SPACE */
3630 case 0xa0: /* NBSP */
3643 case 0x85: /* NEL */
3644 RRETURN(MATCH_NOMATCH);
3651 default: RRETURN(MATCH_NOMATCH);
3656 case 0x85: /* NEL */
3662 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3666 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3669 case OP_NOT_WHITESPACE:
3670 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3674 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3677 case OP_NOT_WORDCHAR:
3678 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3682 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3686 RRETURN(PCRE_ERROR_INTERNAL);
3690 /* Control never gets here */
3693 /* If maximizing, it is worth using inline code for speed, doing the type
3694 test once at the start (i.e. keep it out of the loop). Again, keep the
3695 UTF-8 and UCP stuff separate. */
3699 pp = eptr; /* Remember where we started */
3707 for (i = min; i < max; i++)
3710 if (eptr >= md->end_subject) break;
3711 GETCHARLEN(c, eptr, len);
3712 if (prop_fail_result) break;
3718 for (i = min; i < max; i++)
3721 if (eptr >= md->end_subject) break;
3722 GETCHARLEN(c, eptr, len);
3723 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3724 if ((prop_chartype == ucp_Lu ||
3725 prop_chartype == ucp_Ll ||
3726 prop_chartype == ucp_Lt) == prop_fail_result)
3733 for (i = min; i < max; i++)
3736 if (eptr >= md->end_subject) break;
3737 GETCHARLEN(c, eptr, len);
3738 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3739 if ((prop_category == prop_value) == prop_fail_result)
3746 for (i = min; i < max; i++)
3749 if (eptr >= md->end_subject) break;
3750 GETCHARLEN(c, eptr, len);
3751 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3752 if ((prop_chartype == prop_value) == prop_fail_result)
3759 for (i = min; i < max; i++)
3762 if (eptr >= md->end_subject) break;
3763 GETCHARLEN(c, eptr, len);
3764 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3765 if ((prop_script == prop_value) == prop_fail_result)
3772 /* eptr is now past the end of the maximum run */
3774 if (possessive) continue;
3777 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3779 if (eptr-- == pp) break; /* Stop if tried at original pos */
3780 if (utf8) BACKCHAR(eptr);
3784 /* Match extended Unicode sequences. We will get here only if the
3785 support is in the binary; otherwise a compile-time error occurs. */
3787 else if (ctype == OP_EXTUNI)
3789 for (i = min; i < max; i++)
3791 if (eptr >= md->end_subject) break;
3792 GETCHARINCTEST(c, eptr);
3793 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3794 if (prop_category == ucp_M) break;
3795 while (eptr < md->end_subject)
3798 if (!utf8) c = *eptr; else
3800 GETCHARLEN(c, eptr, len);
3802 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3803 if (prop_category != ucp_M) break;
3808 /* eptr is now past the end of the maximum run */
3810 if (possessive) continue;
3813 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3815 if (eptr-- == pp) break; /* Stop if tried at original pos */
3816 for (;;) /* Move back over one extended */
3819 if (!utf8) c = *eptr; else
3822 GETCHARLEN(c, eptr, len);
3824 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3825 if (prop_category != ucp_M) break;
3832 #endif /* SUPPORT_UCP */
3844 if ((ims & PCRE_DOTALL) == 0)
3846 for (i = min; i < max; i++)
3848 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3850 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3855 for (i = min; i < max; i++)
3857 if (eptr >= md->end_subject) break;
3859 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3864 /* Handle unlimited UTF-8 repeat */
3868 if ((ims & PCRE_DOTALL) == 0)
3870 for (i = min; i < max; i++)
3872 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3874 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3879 eptr = md->end_subject;
3884 /* The byte case is the same as non-UTF8 */
3888 if (c > (unsigned int)(md->end_subject - eptr))
3889 c = md->end_subject - eptr;
3894 for (i = min; i < max; i++)
3897 if (eptr >= md->end_subject) break;
3898 GETCHARLEN(c, eptr, len);
3901 if (++eptr >= md->end_subject) break;
3902 if (*eptr == 0x000a) eptr++;
3908 (c != 0x000b && c != 0x000c &&
3909 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3918 for (i = min; i < max; i++)
3922 if (eptr >= md->end_subject) break;
3923 GETCHARLEN(c, eptr, len);
3926 default: gotspace = FALSE; break;
3928 case 0x20: /* SPACE */
3929 case 0xa0: /* NBSP */
3930 case 0x1680: /* OGHAM SPACE MARK */
3931 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3932 case 0x2000: /* EN QUAD */
3933 case 0x2001: /* EM QUAD */
3934 case 0x2002: /* EN SPACE */
3935 case 0x2003: /* EM SPACE */
3936 case 0x2004: /* THREE-PER-EM SPACE */
3937 case 0x2005: /* FOUR-PER-EM SPACE */
3938 case 0x2006: /* SIX-PER-EM SPACE */
3939 case 0x2007: /* FIGURE SPACE */
3940 case 0x2008: /* PUNCTUATION SPACE */
3941 case 0x2009: /* THIN SPACE */
3942 case 0x200A: /* HAIR SPACE */
3943 case 0x202f: /* NARROW NO-BREAK SPACE */
3944 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3945 case 0x3000: /* IDEOGRAPHIC SPACE */
3949 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3956 for (i = min; i < max; i++)
3960 if (eptr >= md->end_subject) break;
3961 GETCHARLEN(c, eptr, len);
3964 default: gotspace = FALSE; break;
3969 case 0x85: /* NEL */
3970 case 0x2028: /* LINE SEPARATOR */
3971 case 0x2029: /* PARAGRAPH SEPARATOR */
3975 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3981 for (i = min; i < max; i++)
3984 if (eptr >= md->end_subject) break;
3985 GETCHARLEN(c, eptr, len);
3986 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3992 for (i = min; i < max; i++)
3995 if (eptr >= md->end_subject) break;
3996 GETCHARLEN(c, eptr, len);
3997 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4002 case OP_NOT_WHITESPACE:
4003 for (i = min; i < max; i++)
4006 if (eptr >= md->end_subject) break;
4007 GETCHARLEN(c, eptr, len);
4008 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4014 for (i = min; i < max; i++)
4017 if (eptr >= md->end_subject) break;
4018 GETCHARLEN(c, eptr, len);
4019 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4024 case OP_NOT_WORDCHAR:
4025 for (i = min; i < max; i++)
4028 if (eptr >= md->end_subject) break;
4029 GETCHARLEN(c, eptr, len);
4030 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4036 for (i = min; i < max; i++)
4039 if (eptr >= md->end_subject) break;
4040 GETCHARLEN(c, eptr, len);
4041 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4047 RRETURN(PCRE_ERROR_INTERNAL);
4050 /* eptr is now past the end of the maximum run */
4052 if (possessive) continue;
4055 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4057 if (eptr-- == pp) break; /* Stop if tried at original pos */
4062 #endif /* SUPPORT_UTF8 */
4064 /* Not UTF-8 mode */
4069 if ((ims & PCRE_DOTALL) == 0)
4071 for (i = min; i < max; i++)
4073 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4078 /* For DOTALL case, fall through and treat as \C */
4082 if (c > (unsigned int)(md->end_subject - eptr))
4083 c = md->end_subject - eptr;
4088 for (i = min; i < max; i++)
4090 if (eptr >= md->end_subject) break;
4094 if (++eptr >= md->end_subject) break;
4095 if (*eptr == 0x000a) eptr++;
4101 (c != 0x000b && c != 0x000c && c != 0x0085)))
4109 for (i = min; i < max; i++)
4111 if (eptr >= md->end_subject) break;
4113 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4119 for (i = min; i < max; i++)
4121 if (eptr >= md->end_subject) break;
4123 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4129 for (i = min; i < max; i++)
4131 if (eptr >= md->end_subject) break;
4133 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4140 for (i = min; i < max; i++)
4142 if (eptr >= md->end_subject) break;
4144 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4151 for (i = min; i < max; i++)
4153 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4160 for (i = min; i < max; i++)
4162 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4168 case OP_NOT_WHITESPACE:
4169 for (i = min; i < max; i++)
4171 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4178 for (i = min; i < max; i++)
4180 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4186 case OP_NOT_WORDCHAR:
4187 for (i = min; i < max; i++)
4189 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4196 for (i = min; i < max; i++)
4198 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4205 RRETURN(PCRE_ERROR_INTERNAL);
4208 /* eptr is now past the end of the maximum run */
4210 if (possessive) continue;
4213 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4219 /* Get here if we can't make it match with any permitted repetitions */
4221 RRETURN(MATCH_NOMATCH);
4223 /* Control never gets here */
4225 /* There's been some horrible disaster. Arrival here can only mean there is
4226 something seriously wrong in the code above or the OP_xxx definitions. */
4229 DPRINTF(("Unknown opcode %d\n", *ecode));
4230 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4233 /* Do not stick any code in here without much thought; it is assumed
4234 that "continue" in the code above comes out to here to repeat the main
4237 } /* End of main loop */
4238 /* Control never reaches here */
4241 /* When compiling to use the heap rather than the stack for recursive calls to
4242 match(), the RRETURN() macro jumps here. The number that is saved in
4243 frame->Xwhere indicates which label we actually want to return to. */
4246 #define LBL(val) case val: goto L_RM##val;
4248 switch (frame->Xwhere)
4250 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4251 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4252 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4253 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4256 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4257 LBL(32) LBL(34) LBL(42) LBL(46)
4259 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4260 #endif /* SUPPORT_UCP */
4261 #endif /* SUPPORT_UTF8 */
4263 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4264 return PCRE_ERROR_INTERNAL;
4267 #endif /* NO_RECURSE */
4271 /***************************************************************************
4272 ****************************************************************************
4273 RECURSION IN THE match() FUNCTION
4275 Undefine all the macros that were defined above to handle this. */
4294 #undef new_recursive
4309 #undef save_capture_last
4319 /* These two are defined as macros in both cases */
4324 /***************************************************************************
4325 ***************************************************************************/
4329 /*************************************************
4330 * Execute a Regular Expression *
4331 *************************************************/
4333 /* This function applies a compiled re to a subject string and picks out
4334 portions of the string if it matches. Two elements in the vector are set for
4335 each substring: the offsets to the start and end of the substring.
4338 argument_re points to the compiled expression
4339 extra_data points to extra data or is NULL
4340 subject points to the subject string
4341 length length of subject string (may contain binary zeros)
4342 start_offset where to start in the subject string
4343 options option bits; any bit outside PUBLIC_EXEC_OPTIONS gives PCRE_ERROR_BADOPTION
4344 offsets points to a vector of ints to be filled in with offsets
4345 offsetcount the number of elements in the vector
4347 Returns: > 0 => success; value is the number of elements filled in
4348 = 0 => success, but offsets is not big enough
4349 -1 => failed to match
4350 < -1 => some kind of unexpected problem
/* pcre_exec: apply a compiled pattern to a subject string. As far as the code
visible here shows, it validates the arguments, picks up optional limits,
callout data and tables from extra_data, fills in a match_data block, and then
calls match() at successive start positions until there is a match, an error,
or a stopping condition (anchored pattern, end of subject, firstline).
NOTE(review): this listing lacks some lines (braces, a few declarations and
case labels), so the exact control structure should be confirmed against the
complete file. */
4354 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4355 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4358 int rc, resetcount, ocount;
4359 int first_byte = -1;
4363 unsigned long int ims;
4364 BOOL using_temporary_offsets = FALSE;
4368 BOOL first_byte_caseless = FALSE;
4369 BOOL req_byte_caseless = FALSE;
4371 match_data match_block;
4372 match_data *md = &match_block;
4373 const uschar *tables;
4374 const uschar *start_bits = NULL;
4375 USPTR start_match = (USPTR)subject + start_offset;
/* req_byte_ptr starts one byte before start_match so that the first
p > req_byte_ptr test in the bumpalong loop succeeds. */
4377 USPTR req_byte_ptr = start_match - 1;
4379 pcre_study_data internal_study;
4380 const pcre_study_data *study;
4382 real_pcre internal_re;
4383 const real_pcre *external_re = (const real_pcre *)argument_re;
4384 const real_pcre *re = external_re;
4386 /* Plausibility checks: unknown option bits, a NULL pattern or subject, a
NULL offsets vector combined with a positive count, and a negative count are
each rejected with their own error code before any other work is done. */
4388 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4389 if (re == NULL || subject == NULL ||
4390 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4391 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4393 /* Fish out the optional data from the extra_data structure, first setting
4394 the default values. */
4397 md->match_limit = MATCH_LIMIT;
4398 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4399 md->callout_data = NULL;
4401 /* The table pointer is always in native byte order. */
4403 tables = external_re->tables;
4405 if (extra_data != NULL)
4407 register unsigned int flags = extra_data->flags;
4408 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4409 study = (const pcre_study_data *)extra_data->study_data;
4410 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4411 md->match_limit = extra_data->match_limit;
4412 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4413 md->match_limit_recursion = extra_data->match_limit_recursion;
4414 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4415 md->callout_data = extra_data->callout_data;
4416 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4419 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4420 is a feature that makes it possible to save compiled regex and re-use them
4421 in other programs later. */
4423 if (tables == NULL) tables = _pcre_default_tables;
4425 /* Check that the first field in the block is the magic number. If it is not,
4426 test for a regex that was compiled on a host of opposite endianness. If this is
4427 the case, flipped values are put in internal_re and internal_study if there was
4430 if (re->magic_number != MAGIC_NUMBER)
4432 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4433 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4434 if (study != NULL) study = &internal_study;
4437 /* Set up other data */
/* Anchoring can come from either the compiled options or the run-time options;
startline and firstline are taken from the compiled pattern only. */
4439 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4440 startline = (re->flags & PCRE_STARTLINE) != 0;
4441 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4443 /* The code starts after the real_pcre block and the capture name table. */
4445 md->start_code = (const uschar *)external_re + re->name_table_offset +
4446 re->name_count * re->name_entry_size;
4448 md->start_subject = (USPTR)subject;
4449 md->start_offset = start_offset;
4450 md->end_subject = md->start_subject + length;
4451 end_subject = md->end_subject;
4453 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4454 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
/* These four flags are taken from the run-time options only. */
4456 md->notbol = (options & PCRE_NOTBOL) != 0;
4457 md->noteol = (options & PCRE_NOTEOL) != 0;
4458 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4459 md->partial = (options & PCRE_PARTIAL) != 0;
4462 md->recursive = NULL; /* No recursion at top level */
4464 md->lcc = tables + lcc_offset;
4465 md->ctypes = tables + ctypes_offset;
4467 /* Handle different \R options. A single run-time bit selects the setting
directly; the default branch of the switch returns PCRE_ERROR_BADNEWLINE. */
4469 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4472 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4473 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4476 md->bsr_anycrlf = TRUE;
4478 md->bsr_anycrlf = FALSE;
4482 case PCRE_BSR_ANYCRLF:
4483 md->bsr_anycrlf = TRUE;
4486 case PCRE_BSR_UNICODE:
4487 md->bsr_anycrlf = FALSE;
4490 default: return PCRE_ERROR_BADNEWLINE;
4493 /* Handle different types of newline. The three bits give eight cases. If
4494 nothing is set at run time, whatever was used at compile time applies. */
4496 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4497 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4499 case 0: newline = NEWLINE; break; /* Compile-time default */
4500 case PCRE_NEWLINE_CR: newline = '\r'; break;
4501 case PCRE_NEWLINE_LF: newline = '\n'; break;
4502 case PCRE_NEWLINE_CR+
4503 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4504 case PCRE_NEWLINE_ANY: newline = -1; break;
4505 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4506 default: return PCRE_ERROR_BADNEWLINE;
/* Negative newline values are the ANY and ANYCRLF markers set above. Any other
value is a fixed newline whose one or two bytes are stored in md->nl. */
4511 md->nltype = NLTYPE_ANYCRLF;
4513 else if (newline < 0)
4515 md->nltype = NLTYPE_ANY;
4519 md->nltype = NLTYPE_FIXED;
4523 md->nl[0] = (newline >> 8) & 255;
4524 md->nl[1] = newline & 255;
4529 md->nl[0] = newline;
4533 /* Partial matching is supported only for a restricted set of regexes at the
4536 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4537 return PCRE_ERROR_BADPARTIAL;
4539 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4540 back the character offset. */
4543 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4545 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4546 return PCRE_ERROR_BADUTF8;
4547 if (start_offset > 0 && start_offset < length)
4549 int tb = ((uschar *)subject)[start_offset];
4553 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4559 /* The ims options can vary during the matching as a result of the presence
4560 of (?ims) items in the pattern. They are kept in a local variable so that
4561 restoring at the exit of a group is easy. */
4563 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4565 /* If the expression has got more back references than the offsets supplied can
4566 hold, we get a temporary chunk of working store to use during the matching.
4567 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4570 ocount = offsetcount - (offsetcount % 3);
4572 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4574 ocount = re->top_backref * 3 + 3;
4575 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4576 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4577 using_temporary_offsets = TRUE;
4578 DPRINTF(("Got memory to hold back references\n"));
4580 else md->offset_vector = offsets;
4582 md->offset_end = ocount;
4583 md->offset_max = (2*ocount)/3;
4584 md->offset_overflow = FALSE;
4585 md->capture_last = -1;
4587 /* Compute the minimum number of offsets that we need to reset each time. Doing
4588 this makes a huge difference to execution time when there aren't many brackets
4591 resetcount = 2 + re->top_bracket * 2;
4592 if (resetcount > offsetcount) resetcount = ocount;
4594 /* Reset the working variable associated with each extraction. These should
4595 never be used unless previously set, but they get saved and restored, and so we
4596 initialize them to avoid reading uninitialized locations. */
/* Working downwards from the end of the vector, set the last resetcount/2 - 1
entries to -1. */
4598 if (md->offset_vector != NULL)
4600 register int *iptr = md->offset_vector + ocount;
4601 register int *iend = iptr - resetcount/2 + 1;
4602 while (--iptr >= iend) *iptr = -1;
4605 /* Set up the first character to match, if available. The first_byte value is
4606 never set for an anchored regular expression, but the anchoring may be forced
4607 at run time, so we have to test for anchoring. The first char may be unset for
4608 an unanchored pattern, of course. If there's no first char and the pattern was
4609 studied, there may be a bitmap of possible first characters. */
4613 if ((re->flags & PCRE_FIRSTSET) != 0)
4615 first_byte = re->first_byte & 255;
/* For a caseless first byte, keep its md->lcc mapping; the scan in the
bumpalong loop compares mapped subject bytes against it. */
4616 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4617 first_byte = md->lcc[first_byte];
4620 if (!startline && study != NULL &&
4621 (study->options & PCRE_STUDY_MAPPED) != 0)
4622 start_bits = study->start_bits;
4625 /* For anchored or unanchored matches, there may be a "last known required
4628 if ((re->flags & PCRE_REQCHSET) != 0)
4630 req_byte = re->req_byte & 255;
4631 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4632 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4636 /* ==========================================================================*/
4638 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4639 the loop runs just once. */
4643 USPTR save_end_subject = end_subject;
4644 USPTR new_start_match;
4646 /* Reset the maximum number of extractions we might see. */
4648 if (md->offset_vector != NULL)
4650 register int *iptr = md->offset_vector;
4651 register int *iend = iptr + resetcount;
4652 while (iptr < iend) *iptr++ = -1;
4655 /* Advance to a unique first char if possible. If firstline is TRUE, the
4656 start of the match is constrained to the first line of a multiline string.
4657 That is, the match must be before or at the first newline. Implement this by
4658 temporarily adjusting end_subject so that we stop scanning at a newline. If
4659 the match fails at the newline, later code breaks this loop. */
4663 USPTR t = start_match;
4664 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4668 /* Now test for a unique first byte */
4670 if (first_byte >= 0)
4672 if (first_byte_caseless)
4673 while (start_match < end_subject &&
4674 md->lcc[*start_match] != first_byte)
4677 while (start_match < end_subject && *start_match != first_byte)
4681 /* Or to just after a linebreak for a multiline match if possible */
4685 if (start_match > md->start_subject + start_offset)
4687 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4690 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4691 and we are now at a LF, advance the match position by one more character.
4694 if (start_match[-1] == '\r' &&
4695 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4696 start_match < end_subject &&
4697 *start_match == '\n')
4702 /* Or to a non-unique first char after study */
4704 else if (start_bits != NULL)
4706 while (start_match < end_subject)
4708 register unsigned int c = *start_match;
4709 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4713 /* Restore fudged end_subject */
4715 end_subject = save_end_subject;
4717 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4718 printf(">>>> Match against: ");
4719 pchars(start_match, end_subject - start_match, TRUE, md);
4723 /* If req_byte is set, we know that that character must appear in the subject
4724 for the match to succeed. If the first character is set, req_byte must be
4725 later in the subject; otherwise the test starts at the match point. This
4726 optimization can save a huge amount of backtracking in patterns with nested
4727 unlimited repeats that aren't going to match. Writing separate code for
4728 cased/caseless versions makes it go faster, as does using an autoincrement
4729 and backing off on a match.
4731 HOWEVER: when the subject string is very, very long, searching to its end can
4732 take a long time, and give bad performance on quite ordinary patterns. This
4733 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4734 string... so we don't do this when the string is sufficiently long.
4736 ALSO: this processing is disabled when partial matching is requested.
4739 if (req_byte >= 0 &&
4740 end_subject - start_match < REQ_BYTE_MAX &&
4743 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4745 /* We don't need to repeat the search if we haven't yet reached the
4746 place we found it at last time. */
4748 if (p > req_byte_ptr)
4750 if (req_byte_caseless)
4752 while (p < end_subject)
4754 register int pp = *p++;
4755 if (pp == req_byte || pp == req_byte2) { p--; break; }
4760 while (p < end_subject)
4762 if (*p++ == req_byte) { p--; break; }
4766 /* If we can't find the required character, break the matching loop,
4767 forcing a match failure. */
4769 if (p >= end_subject)
4775 /* If we have found the required character, save the point where we
4776 found it, so that we don't search again next time round the loop if
4777 the start hasn't passed this character yet. */
4783 /* OK, we can now run the match. */
4785 md->start_match_ptr = start_match;
4786 md->match_call_count = 0;
4787 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4791 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4792 exactly like PRUNE. */
4797 new_start_match = start_match + 1;
4800 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4805 /* SKIP passes back the next starting point explicitly. */
4808 new_start_match = md->start_match_ptr;
4811 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4817 /* Any other return is some kind of error. */
4823 /* Control reaches here for the various types of "no match at this point"
4824 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4828 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4829 newline in the subject (though it may continue over the newline). Therefore,
4830 if we have just failed to match, starting at a newline, do not continue. */
4832 if (firstline && IS_NEWLINE(start_match)) break;
4834 /* Advance to new matching position */
4836 start_match = new_start_match;
4838 /* Break the loop if the pattern is anchored or if we have passed the end of
4841 if (anchored || start_match > end_subject) break;
4843 /* If we have just passed a CR and we are now at a LF, and the pattern does
4844 not contain any explicit matches for \r or \n, and the newline option is CRLF
4845 or ANY or ANYCRLF, advance the match position by one more character. */
4847 if (start_match[-1] == '\r' &&
4848 start_match < end_subject &&
4849 *start_match == '\n' &&
4850 (re->flags & PCRE_HASCRORLF) == 0 &&
4851 (md->nltype == NLTYPE_ANY ||
4852 md->nltype == NLTYPE_ANYCRLF ||
4856 } /* End of for(;;) "bumpalong" loop */
4858 /* ==========================================================================*/
4860 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4863 (1) The pattern is anchored or the match was failed by (*COMMIT);
4865 (2) We are past the end of the subject;
4867 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4868 this option requests that a match occur at or before the first newline in
4871 When we have a match and the offset vector is big enough to deal with any
4872 backreferences, captured substring offsets will already be set up. In the case
4873 where we had to get some local store to hold offsets for backreference
4874 processing, copy those that we can. In this case there need not be overflow if
4875 certain parts of the pattern were not used, even though there are more
4876 capturing parentheses than vector slots. */
4880 if (rc == MATCH_MATCH)
4882 if (using_temporary_offsets)
/* Slots 0 and 1 are set separately further down, so copying starts at slot 2. */
4884 if (offsetcount >= 4)
4886 memcpy(offsets + 2, md->offset_vector + 2,
4887 (offsetcount - 2) * sizeof(int));
4888 DPRINTF(("Copied offsets from temporary memory\n"));
/* md->end_offset_top exceeds the size of the supplied vector, so flag
overflow. */
4890 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4891 DPRINTF(("Freeing temporary memory\n"));
4892 (pcre_free)(md->offset_vector);
4895 /* Set the return code to the number of captured strings, or 0 if there are
4896 too many to fit into the vector. */
4898 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4900 /* If there is space, set up the whole thing as substring 0. The value of
4901 md->start_match_ptr might be modified if \K was encountered on the success
4904 if (offsetcount < 2) rc = 0; else
4906 offsets[0] = md->start_match_ptr - md->start_subject;
4907 offsets[1] = md->end_match_ptr - md->start_subject;
4910 DPRINTF((">>>> returning %d\n", rc));
4914 /* Control gets here if there has been an error, or if the overall match
4915 attempt has failed at all permitted starting positions. */
/* The temporary vector, if one was allocated, is released on this path too. */
4917 if (using_temporary_offsets)
4919 DPRINTF(("Freeing temporary memory\n"));
4920 (pcre_free)(md->offset_vector);
/* Anything other than MATCH_NOMATCH is treated as an error here. */
4923 if (rc != MATCH_NOMATCH)
4925 DPRINTF((">>>> error: returning %d\n", rc));
/* No match, but partial matching was requested and md->hitend is set. */
4928 else if (md->partial && md->hitend)
4930 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4931 return PCRE_ERROR_PARTIAL;
4935 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4936 return PCRE_ERROR_NOMATCH;
4940 /* End of pcre_exec.c */