summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorPhilip Hazel <ph10@hermes.cam.ac.uk>2004-10-07 13:04:13 +0000
committerPhilip Hazel <ph10@hermes.cam.ac.uk>2004-10-07 13:04:13 +0000
commitc86f62585c172e92377d4a5d58b754ed72094cc3 (patch)
tree043f9e6b99aaf76786d2aec996530a349b48e66d /src
parentabe6b90f9d4cb5296d74f73169d2a622fb9932e6 (diff)
Start
Diffstat (limited to 'src')
-rw-r--r--src/src/pcre/ChangeLog1611
-rw-r--r--src/src/pcre/LICENCE45
-rw-r--r--src/src/pcre/Makefile51
-rw-r--r--src/src/pcre/README20
-rw-r--r--src/src/pcre/config.h39
-rw-r--r--src/src/pcre/dftables.c173
-rw-r--r--src/src/pcre/get.c357
-rw-r--r--src/src/pcre/internal.h752
-rw-r--r--src/src/pcre/maketables.c146
-rw-r--r--src/src/pcre/pcre.c9194
-rw-r--r--src/src/pcre/pcre.h239
-rw-r--r--src/src/pcre/pcretest.c1789
-rw-r--r--src/src/pcre/printint.c465
-rw-r--r--src/src/pcre/study.c484
14 files changed, 15365 insertions, 0 deletions
diff --git a/src/src/pcre/ChangeLog b/src/src/pcre/ChangeLog
new file mode 100644
index 000000000..59aa955bd
--- /dev/null
+++ b/src/src/pcre/ChangeLog
@@ -0,0 +1,1611 @@
+ChangeLog for PCRE
+------------------
+
+Version 5.0 13-Sep-04
+---------------------
+
+ 1. Internal change: literal characters are no longer packed up into items
+ containing multiple characters in a single byte-string. Each character
+ is now matched using a separate opcode. However, there may be more than one
+ byte in the character in UTF-8 mode.
+
+ 2. The pcre_callout_block structure has two new fields: pattern_position and
+ next_item_length. These contain the offset in the pattern to the next match
+ item, and its length, respectively.
+
+ 3. The PCRE_AUTO_CALLOUT option for pcre_compile() requests the automatic
+ insertion of callouts before each pattern item. Added the /C option to
+ pcretest to make use of this.
+
+ 4. On the advice of a Windows user, the lines
+
+ #if defined(_WIN32) || defined(WIN32)
+ _setmode( _fileno( stdout ), 0x8000 );
+ #endif /* defined(_WIN32) || defined(WIN32) */
+
+ have been added to the source of pcretest. This apparently does useful
+ magic in relation to line terminators.
+
+ 5. Changed "r" and "w" in the calls to fopen() in pcretest to "rb" and "wb"
+ for the benefit of those environments where the "b" makes a difference.
+
+ 6. The icc compiler has the same options as gcc, but "configure" doesn't seem
+ to know about it. I have put a hack into configure.in that adds in code
+ to set GCC=yes if CC=icc. This seems to end up at a point in the
+ generated configure script that is early enough to affect the setting of
+ compiler options, which is what is needed, but I have no means of testing
+ whether it really works. (The user who reported this had patched the
+ generated configure script, which of course I cannot do.)
+
+ LATER: After change 22 below (new libtool files), the configure script
+ seems to know about icc (and also ecc). Therefore, I have commented out
+ this hack in configure.in.
+
+ 7. Added support for pkg-config (2 patches were sent in).
+
+ 8. Negated POSIX character classes that used a combination of internal tables
+ were completely broken. These were [[:^alpha:]], [[:^alnum:]], and
+ [[:^ascii]]. Typically, they would match almost any characters. The other
+ POSIX classes were not broken in this way.
+
+ 9. Matching the pattern "\b.*?" against "ab cd", starting at offset 1, failed
+ to find the match, as PCRE was deluded into thinking that the match had to
+ start at the start point or following a newline. The same bug applied to
+ patterns with negative forward assertions or any backward assertions
+ preceding ".*" at the start, unless the pattern required a fixed first
+ character. This was a failing pattern: "(?!.bcd).*". The bug is now fixed.
+
+10. In UTF-8 mode, when moving forwards in the subject after a failed match
+ starting at the last subject character, bytes beyond the end of the subject
+ string were read.
+
+11. Renamed the variable "class" as "classbits" to make life easier for C++
+ users. (Previously there was a macro definition, but it apparently wasn't
+ enough.)
+
+12. Added the new field "tables" to the extra data so that tables can be passed
+ in at exec time, or the internal tables can be re-selected. This allows
+ a compiled regex to be saved and re-used at a later time by a different
+ program that might have everything at different addresses.
+
+13. Modified the pcre-config script so that, when run on Solaris, it shows a
+ -R library as well as a -L library.
+
+14. The debugging options of pcretest (-d on the command line or D on a
+ pattern) showed incorrect output for anything following an extended class
+ that contained multibyte characters and which was followed by a quantifier.
+
+15. Added optional support for general category Unicode character properties
+ via the \p, \P, and \X escapes. Unicode property support implies UTF-8
+ support. It adds about 90K to the size of the library. The meanings of the
+ inbuilt class escapes such as \d and \s have NOT been changed.
+
+16. Updated pcredemo.c to include calls to free() to release the memory for the
+ compiled pattern.
+
+17. The generated file chartables.c was being created in the source directory
+ instead of in the building directory. This caused the build to fail if the
+ source directory was different from the building directory, and was
+ read-only.
+
+18. Added some sample Win commands from Mark Tetrode into the NON-UNIX-USE
+ file. No doubt somebody will tell me if they don't make sense... Also added
+ Dan Mooney's comments about building on OpenVMS.
+
+19. Added support for partial matching via the PCRE_PARTIAL option for
+ pcre_exec() and the \P data escape in pcretest.
+
+20. Extended pcretest with 3 new pattern features:
+
+ (i) A pattern option of the form ">rest-of-line" causes pcretest to
+ write the compiled pattern to the file whose name is "rest-of-line".
+ This is a straight binary dump of the data, with the saved pointer to
+ the character tables forced to be NULL. The study data, if any, is
+ written too. After writing, pcretest reads a new pattern.
+
+ (ii) If, instead of a pattern, "<rest-of-line" is given, pcretest reads a
+ compiled pattern from the given file. There must not be any
+ occurrences of "<" in the file name (pretty unlikely); if there are,
+ pcretest will instead treat the initial "<" as a pattern delimiter.
+ After reading in the pattern, pcretest goes on to read data lines as
+ usual.
+
+ (iii) The F pattern option causes pcretest to flip the bytes in the 32-bit
+ and 16-bit fields in a compiled pattern, to simulate a pattern that
+ was compiled on a host of opposite endianness.
+
+21. The pcre-exec() function can now cope with patterns that were compiled on
+ hosts of opposite endianness, with this restriction:
+
+ As for any compiled expression that is saved and used later, the tables
+ pointer field cannot be preserved; the extra_data field in the arguments
+ to pcre_exec() should be used to pass in a tables address if a value
+ other than the default internal tables were used at compile time.
+
+22. Calling pcre_exec() with a negative value of the "ovecsize" parameter is
+ now diagnosed as an error. Previously, most of the time, a negative number
+ would have been treated as zero, but if in addition "ovector" was passed as
+ NULL, a crash could occur.
+
+23. Updated the files ltmain.sh, config.sub, config.guess, and aclocal.m4 with
+ new versions from the libtool 1.5 distribution (the last one is a copy of
+ a file called libtool.m4). This seems to have fixed the need to patch
+ "configure" to support Darwin 1.3 (which I used to do). However, I still
+ had to patch ltmain.sh to ensure that ${SED} is set (it isn't on my
+ workstation).
+
+24. Changed the PCRE licence to be the more standard "BSD" licence.
+
+
+Version 4.5 01-Dec-03
+---------------------
+
+ 1. There has been some re-arrangement of the code for the match() function so
+ that it can be compiled in a version that does not call itself recursively.
+ Instead, it keeps those local variables that need separate instances for
+ each "recursion" in a frame on the heap, and gets/frees frames whenever it
+ needs to "recurse". Keeping track of where control must go is done by means
+ of setjmp/longjmp. The whole thing is implemented by a set of macros that
+ hide most of the details from the main code, and operates only if
+ NO_RECURSE is defined while compiling pcre.c. If PCRE is built using the
+ "configure" mechanism, "--disable-stack-for-recursion" turns on this way of
+ operating.
+
+ To make it easier for callers to provide specially tailored get/free
+ functions for this usage, two new functions, pcre_stack_malloc, and
+ pcre_stack_free, are used. They are always called in strict stacking order,
+ and the size of block requested is always the same.
+
+ The PCRE_CONFIG_STACKRECURSE info parameter can be used to find out whether
+ PCRE has been compiled to use the stack or the heap for recursion. The
+ -C option of pcretest uses this to show which version is compiled.
+
+ A new data escape \S, is added to pcretest; it causes the amounts of store
+ obtained and freed by both kinds of malloc/free at match time to be added
+ to the output.
+
+ 2. Changed the locale test to use "fr_FR" instead of "fr" because that's
+ what's available on my current Linux desktop machine.
+
+ 3. When matching a UTF-8 string, the test for a valid string at the start has
+ been extended. If start_offset is not zero, PCRE now checks that it points
+ to a byte that is the start of a UTF-8 character. If not, it returns
+ PCRE_ERROR_BADUTF8_OFFSET (-11). Note: the whole string is still checked;
+ this is necessary because there may be backward assertions in the pattern.
+ When matching the same subject several times, it may save resources to use
+ PCRE_NO_UTF8_CHECK on all but the first call if the string is long.
+
+ 4. The code for checking the validity of UTF-8 strings has been tightened so
+ that it rejects (a) strings containing 0xfe or 0xff bytes and (b) strings
+ containing "overlong sequences".
+
+ 5. Fixed a bug (appearing twice) that I could not find any way of exploiting!
+ I had written "if ((digitab[*p++] && chtab_digit) == 0)" where the "&&"
+ should have been "&", but it just so happened that all the cases this let
+ through by mistake were picked up later in the function.
+
+ 6. I had used a variable called "isblank" - this is a C99 function, causing
+ some compilers to warn. To avoid this, I renamed it (as "blankclass").
+
+ 7. Cosmetic: (a) only output another newline at the end of pcretest if it is
+ prompting; (b) run "./pcretest /dev/null" at the start of the test script
+ so the version is shown; (c) stop "make test" echoing "./RunTest".
+
+ 8. Added patches from David Burgess to enable PCRE to run on EBCDIC systems.
+
+ 9. The prototype for memmove() for systems that don't have it was using
+ size_t, but the inclusion of the header that defines size_t was later. I've
+ moved the #includes for the C headers earlier to avoid this.
+
+10. Added some adjustments to the code to make it easier to compiler on certain
+ special systems:
+
+ (a) Some "const" qualifiers were missing.
+ (b) Added the macro EXPORT before all exported functions; by default this
+ is defined to be empty.
+ (c) Changed the dftables auxiliary program (that builds chartables.c) so
+ that it reads its output file name as an argument instead of writing
+ to the standard output and assuming this can be redirected.
+
+11. In UTF-8 mode, if a recursive reference (e.g. (?1)) followed a character
+ class containing characters with values greater than 255, PCRE compilation
+ went into a loop.
+
+12. A recursive reference to a subpattern that was within another subpattern
+ that had a minimum quantifier of zero caused PCRE to crash. For example,
+ (x(y(?2))z)? provoked this bug with a subject that got as far as the
+ recursion. If the recursively-called subpattern itself had a zero repeat,
+ that was OK.
+
+13. In pcretest, the buffer for reading a data line was set at 30K, but the
+ buffer into which it was copied (for escape processing) was still set at
+ 1024, so long lines caused crashes.
+
+14. A pattern such as /[ab]{1,3}+/ failed to compile, giving the error
+ "internal error: code overflow...". This applied to any character class
+ that was followed by a possessive quantifier.
+
+15. Modified the Makefile to add libpcre.la as a prerequisite for
+ libpcreposix.la because I was told this is needed for a parallel build to
+ work.
+
+16. If a pattern that contained .* following optional items at the start was
+ studied, the wrong optimizing data was generated, leading to matching
+ errors. For example, studying /[ab]*.*c/ concluded, erroneously, that any
+ matching string must start with a or b or c. The correct conclusion for
+ this pattern is that a match can start with any character.
+
+
+Version 4.4 13-Aug-03
+---------------------
+
+ 1. In UTF-8 mode, a character class containing characters with values between
+ 127 and 255 was not handled correctly if the compiled pattern was studied.
+ In fixing this, I have also improved the studying algorithm for such
+ classes (slightly).
+
+ 2. Three internal functions had redundant arguments passed to them. Removal
+ might give a very teeny performance improvement.
+
+ 3. Documentation bug: the value of the capture_top field in a callout is *one
+ more than* the number of the hightest numbered captured substring.
+
+ 4. The Makefile linked pcretest and pcregrep with -lpcre, which could result
+ in incorrectly linking with a previously installed version. They now link
+ explicitly with libpcre.la.
+
+ 5. configure.in no longer needs to recognize Cygwin specially.
+
+ 6. A problem in pcre.in for Windows platforms is fixed.
+
+ 7. If a pattern was successfully studied, and the -d (or /D) flag was given to
+ pcretest, it used to include the size of the study block as part of its
+ output. Unfortunately, the structure contains a field that has a different
+ size on different hardware architectures. This meant that the tests that
+ showed this size failed. As the block is currently always of a fixed size,
+ this information isn't actually particularly useful in pcretest output, so
+ I have just removed it.
+
+ 8. Three pre-processor statements accidentally did not start in column 1.
+ Sadly, there are *still* compilers around that complain, even though
+ standard C has not required this for well over a decade. Sigh.
+
+ 9. In pcretest, the code for checking callouts passed small integers in the
+ callout_data field, which is a void * field. However, some picky compilers
+ complained about the casts involved for this on 64-bit systems. Now
+ pcretest passes the address of the small integer instead, which should get
+ rid of the warnings.
+
+10. By default, when in UTF-8 mode, PCRE now checks for valid UTF-8 strings at
+ both compile and run time, and gives an error if an invalid UTF-8 sequence
+ is found. There is a option for disabling this check in cases where the
+ string is known to be correct and/or the maximum performance is wanted.
+
+11. In response to a bug report, I changed one line in Makefile.in from
+
+ -Wl,--out-implib,.libs/lib@WIN_PREFIX@pcreposix.dll.a \
+ to
+ -Wl,--out-implib,.libs/@WIN_PREFIX@libpcreposix.dll.a \
+
+ to look similar to other lines, but I have no way of telling whether this
+ is the right thing to do, as I do not use Windows. No doubt I'll get told
+ if it's wrong...
+
+
+Version 4.3 21-May-03
+---------------------
+
+1. Two instances of @WIN_PREFIX@ omitted from the Windows targets in the
+ Makefile.
+
+2. Some refactoring to improve the quality of the code:
+
+ (i) The utf8_table... variables are now declared "const".
+
+ (ii) The code for \cx, which used the "case flipping" table to upper case
+ lower case letters, now just substracts 32. This is ASCII-specific,
+ but the whole concept of \cx is ASCII-specific, so it seems
+ reasonable.
+
+ (iii) PCRE was using its character types table to recognize decimal and
+ hexadecimal digits in the pattern. This is silly, because it handles
+ only 0-9, a-f, and A-F, but the character types table is locale-
+ specific, which means strange things might happen. A private
+ table is now used for this - though it costs 256 bytes, a table is
+ much faster than multiple explicit tests. Of course, the standard
+ character types table is still used for matching digits in subject
+ strings against \d.
+
+ (iv) Strictly, the identifier ESC_t is reserved by POSIX (all identifiers
+ ending in _t are). So I've renamed it as ESC_tee.
+
+3. The first argument for regexec() in the POSIX wrapper should have been
+ defined as "const".
+
+4. Changed pcretest to use malloc() for its buffers so that they can be
+ Electric Fenced for debugging.
+
+5. There were several places in the code where, in UTF-8 mode, PCRE would try
+ to read one or more bytes before the start of the subject string. Often this
+ had no effect on PCRE's behaviour, but in some circumstances it could
+ provoke a segmentation fault.
+
+6. A lookbehind at the start of a pattern in UTF-8 mode could also cause PCRE
+ to try to read one or more bytes before the start of the subject string.
+
+7. A lookbehind in a pattern matched in non-UTF-8 mode on a PCRE compiled with
+ UTF-8 support could misbehave in various ways if the subject string
+ contained bytes with the 0x80 bit set and the 0x40 bit unset in a lookbehind
+ area. (PCRE was not checking for the UTF-8 mode flag, and trying to move
+ back over UTF-8 characters.)
+
+
+Version 4.2 14-Apr-03
+---------------------
+
+1. Typo "#if SUPPORT_UTF8" instead of "#ifdef SUPPORT_UTF8" fixed.
+
+2. Changes to the building process, supplied by Ronald Landheer-Cieslak
+ [ON_WINDOWS]: new variable, "#" on non-Windows platforms
+ [NOT_ON_WINDOWS]: new variable, "#" on Windows platforms
+ [WIN_PREFIX]: new variable, "cyg" for Cygwin
+ * Makefile.in: use autoconf substitution for OBJEXT, EXEEXT, BUILD_OBJEXT
+ and BUILD_EXEEXT
+ Note: automatic setting of the BUILD variables is not yet working
+ set CPPFLAGS and BUILD_CPPFLAGS (but don't use yet) - should be used at
+ compile-time but not at link-time
+ [LINK]: use for linking executables only
+ make different versions for Windows and non-Windows
+ [LINKLIB]: new variable, copy of UNIX-style LINK, used for linking
+ libraries
+ [LINK_FOR_BUILD]: new variable
+ [OBJEXT]: use throughout
+ [EXEEXT]: use throughout
+ <winshared>: new target
+ <wininstall>: new target
+ <dftables.o>: use native compiler
+ <dftables>: use native linker
+ <install>: handle Windows platform correctly
+ <clean>: ditto
+ <check>: ditto
+ copy DLL to top builddir before testing
+
+ As part of these changes, -no-undefined was removed again. This was reported
+ to give trouble on HP-UX 11.0, so getting rid of it seems like a good idea
+ in any case.
+
+3. Some tidies to get rid of compiler warnings:
+
+ . In the match_data structure, match_limit was an unsigned long int, whereas
+ match_call_count was an int. I've made them both unsigned long ints.
+
+ . In pcretest the fact that a const uschar * doesn't automatically cast to
+ a void * provoked a warning.
+
+ . Turning on some more compiler warnings threw up some "shadow" variables
+ and a few more missing casts.
+
+4. If PCRE was complied with UTF-8 support, but called without the PCRE_UTF8
+ option, a class that contained a single character with a value between 128
+ and 255 (e.g. /[\xFF]/) caused PCRE to crash.
+
+5. If PCRE was compiled with UTF-8 support, but called without the PCRE_UTF8
+ option, a class that contained several characters, but with at least one
+ whose value was between 128 and 255 caused PCRE to crash.
+
+
+Version 4.1 12-Mar-03
+---------------------
+
+1. Compiling with gcc -pedantic found a couple of places where casts were
+needed, and a string in dftables.c that was longer than standard compilers are
+required to support.
+
+2. Compiling with Sun's compiler found a few more places where the code could
+be tidied up in order to avoid warnings.
+
+3. The variables for cross-compiling were called HOST_CC and HOST_CFLAGS; the
+first of these names is deprecated in the latest Autoconf in favour of the name
+CC_FOR_BUILD, because "host" is typically used to mean the system on which the
+compiled code will be run. I can't find a reference for HOST_CFLAGS, but by
+analogy I have changed it to CFLAGS_FOR_BUILD.
+
+4. Added -no-undefined to the linking command in the Makefile, because this is
+apparently helpful for Windows. To make it work, also added "-L. -lpcre" to the
+linking step for the pcreposix library.
+
+5. PCRE was failing to diagnose the case of two named groups with the same
+name.
+
+6. A problem with one of PCRE's optimizations was discovered. PCRE remembers a
+literal character that is needed in the subject for a match, and scans along to
+ensure that it is present before embarking on the full matching process. This
+saves time in cases of nested unlimited repeats that are never going to match.
+Problem: the scan can take a lot of time if the subject is very long (e.g.
+megabytes), thus penalizing straightforward matches. It is now done only if the
+amount of subject to be scanned is less than 1000 bytes.
+
+7. A lesser problem with the same optimization is that it was recording the
+first character of an anchored pattern as "needed", thus provoking a search
+right along the subject, even when the first match of the pattern was going to
+fail. The "needed" character is now not set for anchored patterns, unless it
+follows something in the pattern that is of non-fixed length. Thus, it still
+fulfils its original purpose of finding quick non-matches in cases of nested
+unlimited repeats, but isn't used for simple anchored patterns such as /^abc/.
+
+
+Version 4.0 17-Feb-03
+---------------------
+
+1. If a comment in an extended regex that started immediately after a meta-item
+extended to the end of string, PCRE compiled incorrect data. This could lead to
+all kinds of weird effects. Example: /#/ was bad; /()#/ was bad; /a#/ was not.
+
+2. Moved to autoconf 2.53 and libtool 1.4.2.
+
+3. Perl 5.8 no longer needs "use utf8" for doing UTF-8 things. Consequently,
+the special perltest8 script is no longer needed - all the tests can be run
+from a single perltest script.
+
+4. From 5.004, Perl has not included the VT character (0x0b) in the set defined
+by \s. It has now been removed in PCRE. This means it isn't recognized as
+whitespace in /x regexes too, which is the same as Perl. Note that the POSIX
+class [:space:] *does* include VT, thereby creating a mess.
+
+5. Added the class [:blank:] (a GNU extension from Perl 5.8) to match only
+space and tab.
+
+6. Perl 5.005 was a long time ago. It's time to amalgamate the tests that use
+its new features into the main test script, reducing the number of scripts.
+
+7. Perl 5.8 has changed the meaning of patterns like /a(?i)b/. Earlier versions
+were backward compatible, and made the (?i) apply to the whole pattern, as if
+/i were given. Now it behaves more logically, and applies the option setting
+only to what follows. PCRE has been changed to follow suit. However, if it
+finds options settings right at the start of the pattern, it extracts them into
+the global options, as before. Thus, they show up in the info data.
+
+8. Added support for the \Q...\E escape sequence. Characters in between are
+treated as literals. This is slightly different from Perl in that $ and @ are
+also handled as literals inside the quotes. In Perl, they will cause variable
+interpolation. Note the following examples:
+
+ Pattern PCRE matches Perl matches
+
+ \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
+ \Qabc\$xyz\E abc\$xyz abc\$xyz
+ \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
+
+For compatibility with Perl, \Q...\E sequences are recognized inside character
+classes as well as outside them.
+
+9. Re-organized 3 code statements in pcretest to avoid "overflow in
+floating-point constant arithmetic" warnings from a Microsoft compiler. Added a
+(size_t) cast to one statement in pcretest and one in pcreposix to avoid
+signed/unsigned warnings.
+
+10. SunOS4 doesn't have strtoul(). This was used only for unpicking the -o
+option for pcretest, so I've replaced it by a simple function that does just
+that job.
+
+11. pcregrep was ending with code 0 instead of 2 for the commands "pcregrep" or
+"pcregrep -".
+
+12. Added "possessive quantifiers" ?+, *+, ++, and {,}+ which come from Sun's
+Java package. This provides some syntactic sugar for simple cases of what my
+documentation calls "once-only subpatterns". A pattern such as x*+ is the same
+as (?>x*). In other words, if what is inside (?>...) is just a single repeated
+item, you can use this simplified notation. Note that only makes sense with
+greedy quantifiers. Consequently, the use of the possessive quantifier forces
+greediness, whatever the setting of the PCRE_UNGREEDY option.
+
+13. A change of greediness default within a pattern was not taking effect at
+the current level for patterns like /(b+(?U)a+)/. It did apply to parenthesized
+subpatterns that followed. Patterns like /b+(?U)a+/ worked because the option
+was abstracted outside.
+
+14. PCRE now supports the \G assertion. It is true when the current matching
+position is at the start point of the match. This differs from \A when the
+starting offset is non-zero. Used with the /g option of pcretest (or similar
+code), it works in the same way as it does for Perl's /g option. If all
+alternatives of a regex begin with \G, the expression is anchored to the start
+match position, and the "anchored" flag is set in the compiled expression.
+
+15. Some bugs concerning the handling of certain option changes within patterns
+have been fixed. These applied to options other than (?ims). For example,
+"a(?x: b c )d" did not match "XabcdY" but did match "Xa b c dY". It should have
+been the other way round. Some of this was related to change 7 above.
+
+16. PCRE now gives errors for /[.x.]/ and /[=x=]/ as unsupported POSIX
+features, as Perl does. Previously, PCRE gave the warnings only for /[[.x.]]/
+and /[[=x=]]/. PCRE now also gives an error for /[:name:]/ because it supports
+POSIX classes only within a class (e.g. /[[:alpha:]]/).
+
+17. Added support for Perl's \C escape. This matches one byte, even in UTF8
+mode. Unlike ".", it always matches newline, whatever the setting of
+PCRE_DOTALL. However, PCRE does not permit \C to appear in lookbehind
+assertions. Perl allows it, but it doesn't (in general) work because it can't
+calculate the length of the lookbehind. At least, that's the case for Perl
+5.8.0 - I've been told they are going to document that it doesn't work in
+future.
+
+18. Added an error diagnosis for escapes that PCRE does not support: these are
+\L, \l, \N, \P, \p, \U, \u, and \X.
+
+19. Although correctly diagnosing a missing ']' in a character class, PCRE was
+reading past the end of the pattern in cases such as /[abcd/.
+
+20. PCRE was getting more memory than necessary for patterns with classes that
+contained both POSIX named classes and other characters, e.g. /[[:space:]abc/.
+
+21. Added some code, conditional on #ifdef VPCOMPAT, to make life easier for
+compiling PCRE for use with Virtual Pascal.
+
+22. Small fix to the Makefile to make it work properly if the build is done
+outside the source tree.
+
+23. Added a new extension: a condition to go with recursion. If a conditional
+subpattern starts with (?(R) the "true" branch is used if recursion has
+happened, whereas the "false" branch is used only at the top level.
+
+24. When there was a very long string of literal characters (over 255 bytes
+without UTF support, over 250 bytes with UTF support), the computation of how
+much memory was required could be incorrect, leading to segfaults or other
+strange effects.
+
+25. PCRE was incorrectly assuming anchoring (either to start of subject or to
+start of line for a non-DOTALL pattern) when a pattern started with (.*) and
+there was a subsequent back reference to those brackets. This meant that, for
+example, /(.*)\d+\1/ failed to match "abc123bc". Unfortunately, it isn't
+possible to check for precisely this case. All we can do is abandon the
+optimization if .* occurs inside capturing brackets when there are any back
+references whatsoever. (See below for a better fix that came later.)
+
+26. The handling of the optimization for finding the first character of a
+non-anchored pattern, and for finding a character that is required later in the
+match were failing in some cases. This didn't break the matching; it just
+failed to optimize when it could. The way this is done has been re-implemented.
+
+27. Fixed typo in error message for invalid (?R item (it said "(?p").
+
+28. Added a new feature that provides some of the functionality that Perl
+provides with (?{...}). The facility is termed a "callout". The way it is done
+in PCRE is for the caller to provide an optional function, by setting
+pcre_callout to its entry point. Like pcre_malloc and pcre_free, this is a
+global variable. By default it is unset, which disables all calling out. To get
+the function called, the regex must include (?C) at appropriate points. This
+is, in fact, equivalent to (?C0), and any number <= 255 may be given with (?C).
+This provides a means of identifying different callout points. When PCRE
+reaches such a point in the regex, if pcre_callout has been set, the external
+function is called. It is provided with data in a structure called
+pcre_callout_block, which is defined in pcre.h. If the function returns 0,
+matching continues; if it returns a non-zero value, the match at the current
+point fails. However, backtracking will occur if possible. [This was changed
+later and other features added - see item 49 below.]
+
+29. pcretest is upgraded to test the callout functionality. It provides a
+callout function that displays information. By default, it shows the start of
+the match and the current position in the text. There are some new data escapes
+to vary what happens:
+
+ \C+ in addition, show current contents of captured substrings
+ \C- do not supply a callout function
+ \C!n return 1 when callout number n is reached
+ \C!n!m return 1 when callout number n is reached for the mth time
+
+30. If pcregrep was called with the -l option and just a single file name, it
+output "<stdin>" if a match was found, instead of the file name.
+
+31. Improve the efficiency of the POSIX API to PCRE. If the number of capturing
+slots is less than POSIX_MALLOC_THRESHOLD, use a block on the stack to pass to
+pcre_exec(). This saves a malloc/free per call. The default value of
+POSIX_MALLOC_THRESHOLD is 10; it can be changed by --with-posix-malloc-threshold
+when configuring.
+
+32. The default maximum size of a compiled pattern is 64K. There have been a
+few cases of people hitting this limit. The code now uses macros to handle the
+storing of links as offsets within the compiled pattern. It defaults to 2-byte
+links, but this can be changed to 3 or 4 bytes by --with-link-size when
+configuring. Tests 2 and 5 work only with 2-byte links because they output
+debugging information about compiled patterns.
+
+33. Internal code re-arrangements:
+
+(a) Moved the debugging function for printing out a compiled regex into
+ its own source file (printint.c) and used #include to pull it into
+ pcretest.c and, when DEBUG is defined, into pcre.c, instead of having two
+ separate copies.
+
+(b) Defined the list of op-code names for debugging as a macro in
+ internal.h so that it is next to the definition of the opcodes.
+
+(c) Defined a table of op-code lengths for simpler skipping along compiled
+ code. This is again a macro in internal.h so that it is next to the
+ definition of the opcodes.
+
+34. Added support for recursive calls to individual subpatterns, along the
+lines of Robin Houston's patch (but implemented somewhat differently).
+
+35. Further mods to the Makefile to help Win32. Also, added code to pcregrep to
+allow it to read and process whole directories in Win32. This code was
+contributed by Lionel Fourquaux; it has not been tested by me.
+
+36. Added support for named subpatterns. The Python syntax (?P<name>...) is
+used to name a group. Names consist of alphanumerics and underscores, and must
+be unique. Back references use the syntax (?P=name) and recursive calls use
+(?P>name) which is a PCRE extension to the Python extension. Groups still have
+numbers. The function pcre_fullinfo() can be used after compilation to extract
+a name/number map. There are three relevant calls:
+
+ PCRE_INFO_NAMEENTRYSIZE yields the size of each entry in the map
+ PCRE_INFO_NAMECOUNT yields the number of entries
+ PCRE_INFO_NAMETABLE yields a pointer to the map.
+
+The map is a vector of fixed-size entries. The size of each entry depends on
+the length of the longest name used. The first two bytes of each entry are the
+group number, most significant byte first. There follows the corresponding
+name, zero terminated. The names are in alphabetical order.
+
+37. Make the maximum literal string in the compiled code 250 for the non-UTF-8
+case instead of 255. Making it the same both with and without UTF-8 support
+means that the same test output works with both.
+
+38. There was a case of malloc(0) in the POSIX testing code in pcretest. Avoid
+calling malloc() with a zero argument.
+
+39. Change 25 above had to resort to a heavy-handed test for the .* anchoring
+optimization. I've improved things by keeping a bitmap of backreferences with
+numbers 1-31 so that if .* occurs inside capturing brackets that are not in
+fact referenced, the optimization can be applied. It is unlikely that a
+relevant occurrence of .* (i.e. one which might indicate anchoring or forcing
+the match to follow \n) will appear inside brackets with a number greater than
+31, but if it does, any back reference > 31 suppresses the optimization.
+
+40. Added a new compile-time option PCRE_NO_AUTO_CAPTURE. This has the effect
+of disabling numbered capturing parentheses. Any opening parenthesis that is
+not followed by ? behaves as if it were followed by ?: but named parentheses
+can still be used for capturing (and they will acquire numbers in the usual
+way).
+
+41. Redesigned the return codes from the match() function into yes/no/error so
+that errors can be passed back from deep inside the nested calls. A malloc
+failure while inside a recursive subpattern call now causes the
+PCRE_ERROR_NOMEMORY return instead of quietly going wrong.
+
+42. It is now possible to set a limit on the number of times the match()
+function is called in a call to pcre_exec(). This facility makes it possible to
+limit the amount of recursion and backtracking, though not in a directly
+obvious way, because the match() function is used in a number of different
+circumstances. The count starts from zero for each position in the subject
+string (for non-anchored patterns). The default limit is, for compatibility, a
+large number, namely 10 000 000. You can change this in two ways:
+
+(a) When configuring PCRE before making, you can use --with-match-limit=n
+ to set a default value for the compiled library.
+
+(b) For each call to pcre_exec(), you can pass a pcre_extra block in which
+ a different value is set. See 45 below.
+
+If the limit is exceeded, pcre_exec() returns PCRE_ERROR_MATCHLIMIT.
+
+43. Added a new function pcre_config(int, void *) to enable run-time extraction
+of things that can be changed at compile time. The first argument specifies
+what is wanted and the second points to where the information is to be placed.
+The current list of available information is:
+
+ PCRE_CONFIG_UTF8
+
+The output is an integer that is set to one if UTF-8 support is available;
+otherwise it is set to zero.
+
+ PCRE_CONFIG_NEWLINE
+
+The output is an integer that it set to the value of the code that is used for
+newline. It is either LF (10) or CR (13).
+
+ PCRE_CONFIG_LINK_SIZE
+
+The output is an integer that contains the number of bytes used for internal
+linkage in compiled expressions. The value is 2, 3, or 4. See item 32 above.
+
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
+
+The output is an integer that contains the threshold above which the POSIX
+interface uses malloc() for output vectors. See item 31 above.
+
+ PCRE_CONFIG_MATCH_LIMIT
+
+The output is an unsigned integer that contains the default limit of the number
+of match() calls in a pcre_exec() execution. See 42 above.
+
+44. pcretest has been upgraded by the addition of the -C option. This causes it
+to extract all the available output from the new pcre_config() function, and to
+output it. The program then exits immediately.
+
+45. A need has arisen to pass over additional data with calls to pcre_exec() in
+order to support additional features. One way would have been to define
+pcre_exec2() (for example) with extra arguments, but this would not have been
+extensible, and would also have required all calls to the original function to
+be mapped to the new one. Instead, I have chosen to extend the mechanism that
+is used for passing in "extra" data from pcre_study().
+
+The pcre_extra structure is now exposed and defined in pcre.h. It currently
+contains the following fields:
+
+ flags a bitmap indicating which of the following fields are set
+ study_data opaque data from pcre_study()
+ match_limit a way of specifying a limit on match() calls for a specific
+ call to pcre_exec()
+ callout_data data for callouts (see 49 below)
+
+The flag bits are also defined in pcre.h, and are
+
+ PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_MATCH_LIMIT
+ PCRE_EXTRA_CALLOUT_DATA
+
+The pcre_study() function now returns one of these new pcre_extra blocks, with
+the actual study data pointed to by the study_data field, and the
+PCRE_EXTRA_STUDY_DATA flag set. This can be passed directly to pcre_exec() as
+before. That is, this change is entirely upwards-compatible and requires no
+change to existing code.
+
+If you want to pass in additional data to pcre_exec(), you can either place it
+in a pcre_extra block provided by pcre_study(), or create your own pcre_extra
+block.
+
+46. pcretest has been extended to test the PCRE_EXTRA_MATCH_LIMIT feature. If a
+data string contains the escape sequence \M, pcretest calls pcre_exec() several
+times with different match limits, until it finds the minimum value needed for
+pcre_exec() to complete. The value is then output. This can be instructive; for
+most simple matches the number is quite small, but for pathological cases it
+gets very large very quickly.
+
+47. There's a new option for pcre_fullinfo() called PCRE_INFO_STUDYSIZE. It
+returns the size of the data block pointed to by the study_data field in a
+pcre_extra block, that is, the value that was passed as the argument to
+pcre_malloc() when PCRE was getting memory in which to place the information
+created by pcre_study(). The fourth argument should point to a size_t variable.
+pcretest has been extended so that this information is shown after a successful
+pcre_study() call when information about the compiled regex is being displayed.
+
+48. Cosmetic change to Makefile: there's no need to have / after $(DESTDIR)
+because what follows is always an absolute path. (Later: it turns out that this
+is more than cosmetic for MinGW, because it doesn't like empty path
+components.)
+
+49. Some changes have been made to the callout feature (see 28 above):
+
+(i) A callout function now has three choices for what it returns:
+
+ 0 => success, carry on matching
+ > 0 => failure at this point, but backtrack if possible
+ < 0 => serious error, return this value from pcre_exec()
+
+ Negative values should normally be chosen from the set of PCRE_ERROR_xxx
+ values. In particular, returning PCRE_ERROR_NOMATCH forces a standard
+ "match failed" error. The error number PCRE_ERROR_CALLOUT is reserved for
+ use by callout functions. It will never be used by PCRE itself.
+
+(ii) The pcre_extra structure (see 45 above) has a void * field called
+ callout_data, with corresponding flag bit PCRE_EXTRA_CALLOUT_DATA. The
+ pcre_callout_block structure has a field of the same name. The contents of
+ the field passed in the pcre_extra structure are passed to the callout
+ function in the corresponding field in the callout block. This makes it
+ easier to use the same callout-containing regex from multiple threads. For
+ testing, the pcretest program has a new data escape
+
+ \C*n pass the number n (may be negative) as callout_data
+
+ If the callout function in pcretest receives a non-zero value as
+ callout_data, it returns that value.
+
+50. Makefile wasn't handling CFLAGS properly when compiling dftables. Also,
+there were some redundant $(CFLAGS) in commands that are now specified as
+$(LINK), which already includes $(CFLAGS).
+
+51. Extensions to UTF-8 support are listed below. These all apply when (a) PCRE
+has been compiled with UTF-8 support *and* pcre_compile() has been compiled
+with the PCRE_UTF8 flag. Patterns that are compiled without that flag assume
+one-byte characters throughout. Note that case-insensitive matching applies
+only to characters whose values are less than 256. PCRE doesn't support the
+notion of cases for higher-valued characters.
+
+(i) A character class whose characters are all within 0-255 is handled as
+ a bit map, and the map is inverted for negative classes. Previously, a
+ character > 255 always failed to match such a class; however it should
+ match if the class was a negative one (e.g. [^ab]). This has been fixed.
+
+(ii) A negated character class with a single character < 255 is coded as
+ "not this character" (OP_NOT). This wasn't working properly when the test
+ character was multibyte, either singly or repeated.
+
+(iii) Repeats of multibyte characters are now handled correctly in UTF-8
+ mode, for example: \x{100}{2,3}.
+
+(iv) The character escapes \b, \B, \d, \D, \s, \S, \w, and \W (either
+ singly or repeated) now correctly test multibyte characters. However,
+ PCRE doesn't recognize any characters with values greater than 255 as
+ digits, spaces, or word characters. Such characters always match \D, \S,
+ and \W, and never match \d, \s, or \w.
+
+(v) Classes may now contain characters and character ranges with values
+ greater than 255. For example: [ab\x{100}-\x{400}].
+
+(vi) pcregrep now has a --utf-8 option (synonym -u) which makes it call
+ PCRE in UTF-8 mode.
+
+52. The info request value PCRE_INFO_FIRSTCHAR has been renamed
+PCRE_INFO_FIRSTBYTE because it is a byte value. However, the old name is
+retained for backwards compatibility. (Note that LASTLITERAL is also a byte
+value.)
+
+53. The single man page has become too large. I have therefore split it up into
+a number of separate man pages. These also give rise to individual HTML pages;
+these are now put in a separate directory, and there is an index.html page that
+lists them all. Some hyperlinking between the pages has been installed.
+
+54. Added convenience functions for handling named capturing parentheses.
+
+55. Unknown escapes inside character classes (e.g. [\M]) and escapes that
+aren't interpreted therein (e.g. [\C]) are literals in Perl. This is now also
+true in PCRE, except when the PCRE_EXTENDED option is set, in which case they
+are faulted.
+
+56. Introduced HOST_CC and HOST_CFLAGS which can be set in the environment when
+calling configure. These values are used when compiling the dftables.c program
+which is run to generate the source of the default character tables. They
+default to the values of CC and CFLAGS. If you are cross-compiling PCRE,
+you will need to set these values.
+
+57. Updated the building process for Windows DLL, as provided by Fred Cox.
+
+
+Version 3.9 02-Jan-02
+---------------------
+
+1. A bit of extraneous text had somehow crept into the pcregrep documentation.
+
+2. If --disable-static was given, the building process failed when trying to
+build pcretest and pcregrep. (For some reason it was using libtool to compile
+them, which is not right, as they aren't part of the library.)
+
+
+Version 3.8 18-Dec-01
+---------------------
+
+1. The experimental UTF-8 code was completely screwed up. It was packing the
+bytes in the wrong order. How dumb can you get?
+
+
+Version 3.7 29-Oct-01
+---------------------
+
+1. In updating pcretest to check change 1 of version 3.6, I screwed up.
+This caused pcretest, when used on the test data, to segfault. Unfortunately,
+this didn't happen under Solaris 8, where I normally test things.
+
+2. The Makefile had to be changed to make it work on BSD systems, where 'make'
+doesn't seem to recognize that ./xxx and xxx are the same file. (This entry
+isn't in ChangeLog distributed with 3.7 because I forgot when I hastily made
+this fix an hour or so after the initial 3.7 release.)
+
+
+Version 3.6 23-Oct-01
+---------------------
+
+1. Crashed with /(sens|respons)e and \1ibility/ and "sense and sensibility" if
+offsets passed as NULL with zero offset count.
+
+2. The config.guess and config.sub files had not been updated when I moved to
+the latest autoconf.
+
+
+Version 3.5 15-Aug-01
+---------------------
+
+1. Added some missing #if !defined NOPOSIX conditionals in pcretest.c that
+had been forgotten.
+
+2. By using declared but undefined structures, we can avoid using "void"
+definitions in pcre.h while keeping the internal definitions of the structures
+private.
+
+3. The distribution is now built using autoconf 2.50 and libtool 1.4. From a
+user point of view, this means that both static and shared libraries are built
+by default, but this can be individually controlled. More of the work of
+handling this static/shared cases is now inside libtool instead of PCRE's make
+file.
+
+4. The pcretest utility is now installed along with pcregrep because it is
+useful for users (to test regexs) and by doing this, it automatically gets
+relinked by libtool. The documentation has been turned into a man page, so
+there are now .1, .txt, and .html versions in /doc.
+
+5. Upgrades to pcregrep:
+ (i) Added long-form option names like gnu grep.
+ (ii) Added --help to list all options with an explanatory phrase.
+ (iii) Added -r, --recursive to recurse into sub-directories.
+ (iv) Added -f, --file to read patterns from a file.
+
+6. pcre_exec() was referring to its "code" argument before testing that
+argument for NULL (and giving an error if it was NULL).
+
+7. Upgraded Makefile.in to allow for compiling in a different directory from
+the source directory.
+
+8. Tiny buglet in pcretest: when pcre_fullinfo() was called to retrieve the
+options bits, the pointer it was passed was to an int instead of to an unsigned
+long int. This mattered only on 64-bit systems.
+
+9. Fixed typo (3.4/1) in pcre.h again. Sigh. I had changed pcre.h (which is
+generated) instead of pcre.in, which it its source. Also made the same change
+in several of the .c files.
+
+10. A new release of gcc defines printf() as a macro, which broke pcretest
+because it had an ifdef in the middle of a string argument for printf(). Fixed
+by using separate calls to printf().
+
+11. Added --enable-newline-is-cr and --enable-newline-is-lf to the configure
+script, to force use of CR or LF instead of \n in the source. On non-Unix
+systems, the value can be set in config.h.
+
+12. The limit of 200 on non-capturing parentheses is a _nesting_ limit, not an
+absolute limit. Changed the text of the error message to make this clear, and
+likewise updated the man page.
+
+13. The limit of 99 on the number of capturing subpatterns has been removed.
+The new limit is 65535, which I hope will not be a "real" limit.
+
+
+Version 3.4 22-Aug-00
+---------------------
+
+1. Fixed typo in pcre.h: unsigned const char * changed to const unsigned char *.
+
+2. Diagnose condition (?(0) as an error instead of crashing on matching.
+
+
+Version 3.3 01-Aug-00
+---------------------
+
+1. If an octal character was given, but the value was greater than \377, it
+was not getting masked to the least significant bits, as documented. This could
+lead to crashes in some systems.
+
+2. Perl 5.6 (if not earlier versions) accepts classes like [a-\d] and treats
+the hyphen as a literal. PCRE used to give an error; it now behaves like Perl.
+
+3. Added the functions pcre_free_substring() and pcre_free_substring_list().
+These just pass their arguments on to (pcre_free)(), but they are provided
+because some uses of PCRE bind it to non-C systems that can call its functions,
+but cannot call free() or pcre_free() directly.
+
+4. Add "make test" as a synonym for "make check". Corrected some comments in
+the Makefile.
+
+5. Add $(DESTDIR)/ in front of all the paths in the "install" target in the
+Makefile.
+
+6. Changed the name of pgrep to pcregrep, because Solaris has introduced a
+command called pgrep for grepping around the active processes.
+
+7. Added the beginnings of support for UTF-8 character strings.
+
+8. Arranged for the Makefile to pass over the settings of CC, CFLAGS, and
+RANLIB to ./ltconfig so that they are used by libtool. I think these are all
+the relevant ones. (AR is not passed because ./ltconfig does its own figuring
+out for the ar command.)
+
+
+Version 3.2 12-May-00
+---------------------
+
+This is purely a bug fixing release.
+
+1. If the pattern /((Z)+|A)*/ was matched agained ZABCDEFG it matched Z instead
+of ZA. This was just one example of several cases that could provoke this bug,
+which was introduced by change 9 of version 2.00. The code for breaking
+infinite loops after an iteration that matches an empty string was't working
+correctly.
+
+2. The pcretest program was not imitating Perl correctly for the pattern /a*/g
+when matched against abbab (for example). After matching an empty string, it
+wasn't forcing anchoring when setting PCRE_NOTEMPTY for the next attempt; this
+caused it to match further down the string than it should.
+
+3. The code contained an inclusion of sys/types.h. It isn't clear why this
+was there because it doesn't seem to be needed, and it causes trouble on some
+systems, as it is not a Standard C header. It has been removed.
+
+4. Made 4 silly changes to the source to avoid stupid compiler warnings that
+were reported on the Macintosh. The changes were from
+
+ while ((c = *(++ptr)) != 0 && c != '\n');
+to
+ while ((c = *(++ptr)) != 0 && c != '\n') ;
+
+Totally extraordinary, but if that's what it takes...
+
+5. PCRE is being used in one environment where neither memmove() nor bcopy() is
+available. Added HAVE_BCOPY and an autoconf test for it; if neither
+HAVE_MEMMOVE nor HAVE_BCOPY is set, use a built-in emulation function which
+assumes the way PCRE uses memmove() (always moving upwards).
+
+6. PCRE is being used in one environment where strchr() is not available. There
+was only one use in pcre.c, and writing it out to avoid strchr() probably gives
+faster code anyway.
+
+
+Version 3.1 09-Feb-00
+---------------------
+
+The only change in this release is the fixing of some bugs in Makefile.in for
+the "install" target:
+
+(1) It was failing to install pcreposix.h.
+
+(2) It was overwriting the pcre.3 man page with the pcreposix.3 man page.
+
+
+Version 3.0 01-Feb-00
+---------------------
+
+1. Add support for the /+ modifier to perltest (to output $` like it does in
+pcretest).
+
+2. Add support for the /g modifier to perltest.
+
+3. Fix pcretest so that it behaves even more like Perl for /g when the pattern
+matches null strings.
+
+4. Fix perltest so that it doesn't do unwanted things when fed an empty
+pattern. Perl treats empty patterns specially - it reuses the most recent
+pattern, which is not what we want. Replace // by /(?#)/ in order to avoid this
+effect.
+
+5. The POSIX interface was broken in that it was just handing over the POSIX
+captured string vector to pcre_exec(), but (since release 2.00) PCRE has
+required a bigger vector, with some working space on the end. This means that
+the POSIX wrapper now has to get and free some memory, and copy the results.
+
+6. Added some simple autoconf support, placing the test data and the
+documentation in separate directories, re-organizing some of the
+information files, and making it build pcre-config (a GNU standard). Also added
+libtool support for building PCRE as a shared library, which is now the
+default.
+
+7. Got rid of the leading zero in the definition of PCRE_MINOR because 08 and
+09 are not valid octal constants. Single digits will be used for minor values
+less than 10.
+
+8. Defined REG_EXTENDED and REG_NOSUB as zero in the POSIX header, so that
+existing programs that set these in the POSIX interface can use PCRE without
+modification.
+
+9. Added a new function, pcre_fullinfo() with an extensible interface. It can
+return all that pcre_info() returns, plus additional data. The pcre_info()
+function is retained for compatibility, but is considered to be obsolete.
+
+10. Added experimental recursion feature (?R) to handle one common case that
+Perl 5.6 will be able to do with (?p{...}).
+
+11. Added support for POSIX character classes like [:alpha:], which Perl is
+adopting.
+
+
+Version 2.08 31-Aug-99
+----------------------
+
+1. When startoffset was not zero and the pattern began with ".*", PCRE was not
+trying to match at the startoffset position, but instead was moving forward to
+the next newline as if a previous match had failed.
+
+2. pcretest was not making use of PCRE_NOTEMPTY when repeating for /g and /G,
+and could get into a loop if a null string was matched other than at the start
+of the subject.
+
+3. Added definitions of PCRE_MAJOR and PCRE_MINOR to pcre.h so the version can
+be distinguished at compile time, and for completeness also added PCRE_DATE.
+
+5. Added Paul Sokolovsky's minor changes to make it easy to compile a Win32 DLL
+in GnuWin32 environments.
+
+
+Version 2.07 29-Jul-99
+----------------------
+
+1. The documentation is now supplied in plain text form and HTML as well as in
+the form of man page sources.
+
+2. C++ compilers don't like assigning (void *) values to other pointer types.
+In particular this affects malloc(). Although there is no problem in Standard
+C, I've put in casts to keep C++ compilers happy.
+
+3. Typo on pcretest.c; a cast of (unsigned char *) in the POSIX regexec() call
+should be (const char *).
+
+4. If NOPOSIX is defined, pcretest.c compiles without POSIX support. This may
+be useful for non-Unix systems who don't want to bother with the POSIX stuff.
+However, I haven't made this a standard facility. The documentation doesn't
+mention it, and the Makefile doesn't support it.
+
+5. The Makefile now contains an "install" target, with editable destinations at
+the top of the file. The pcretest program is not installed.
+
+6. pgrep -V now gives the PCRE version number and date.
+
+7. Fixed bug: a zero repetition after a literal string (e.g. /abcde{0}/) was
+causing the entire string to be ignored, instead of just the last character.
+
+8. If a pattern like /"([^\\"]+|\\.)*"/ is applied in the normal way to a
+non-matching string, it can take a very, very long time, even for strings of
+quite modest length, because of the nested recursion. PCRE now does better in
+some of these cases. It does this by remembering the last required literal
+character in the pattern, and pre-searching the subject to ensure it is present
+before running the real match. In other words, it applies a heuristic to detect
+some types of certain failure quickly, and in the above example, if presented
+with a string that has no trailing " it gives "no match" very quickly.
+
+9. A new runtime option PCRE_NOTEMPTY causes null string matches to be ignored;
+other alternatives are tried instead.
+
+
+Version 2.06 09-Jun-99
+----------------------
+
+1. Change pcretest's output for amount of store used to show just the code
+space, because the remainder (the data block) varies in size between 32-bit and
+64-bit systems.
+
+2. Added an extra argument to pcre_exec() to supply an offset in the subject to
+start matching at. This allows lookbehinds to work when searching for multiple
+occurrences in a string.
+
+3. Added additional options to pcretest for testing multiple occurrences:
+
+ /+ outputs the rest of the string that follows a match
+ /g loops for multiple occurrences, using the new startoffset argument
+ /G loops for multiple occurrences by passing an incremented pointer
+
+4. PCRE wasn't doing the "first character" optimization for patterns starting
+with \b or \B, though it was doing it for other lookbehind assertions. That is,
+it wasn't noticing that a match for a pattern such as /\bxyz/ has to start with
+the letter 'x'. On long subject strings, this gives a significant speed-up.
+
+
+Version 2.05 21-Apr-99
+----------------------
+
+1. Changed the type of magic_number from int to long int so that it works
+properly on 16-bit systems.
+
+2. Fixed a bug which caused patterns starting with .* not to work correctly
+when the subject string contained newline characters. PCRE was assuming
+anchoring for such patterns in all cases, which is not correct because .* will
+not pass a newline unless PCRE_DOTALL is set. It now assumes anchoring only if
+DOTALL is set at top level; otherwise it knows that patterns starting with .*
+must be retried after every newline in the subject.
+
+
+Version 2.04 18-Feb-99
+----------------------
+
+1. For parenthesized subpatterns with repeats whose minimum was zero, the
+computation of the store needed to hold the pattern was incorrect (too large).
+If such patterns were nested a few deep, this could multiply and become a real
+problem.
+
+2. Added /M option to pcretest to show the memory requirement of a specific
+pattern. Made -m a synonym of -s (which does this globally) for compatibility.
+
+3. Subpatterns of the form (regex){n,m} (i.e. limited maximum) were being
+compiled in such a way that the backtracking after subsequent failure was
+pessimal. Something like (a){0,3} was compiled as (a)?(a)?(a)? instead of
+((a)((a)(a)?)?)? with disastrous performance if the maximum was of any size.
+
+
+Version 2.03 02-Feb-99
+----------------------
+
+1. Fixed typo and small mistake in man page.
+
+2. Added 4th condition (GPL supersedes if conflict) and created separate
+LICENCE file containing the conditions.
+
+3. Updated pcretest so that patterns such as /abc\/def/ work like they do in
+Perl, that is the internal \ allows the delimiter to be included in the
+pattern. Locked out the use of \ as a delimiter. If \ immediately follows
+the final delimiter, add \ to the end of the pattern (to test the error).
+
+4. Added the convenience functions for extracting substrings after a successful
+match. Updated pcretest to make it able to test these functions.
+
+
+Version 2.02 14-Jan-99
+----------------------
+
+1. Initialized the working variables associated with each extraction so that
+their saving and restoring doesn't refer to uninitialized store.
+
+2. Put dummy code into study.c in order to trick the optimizer of the IBM C
+compiler for OS/2 into generating correct code. Apparently IBM isn't going to
+fix the problem.
+
+3. Pcretest: the timing code wasn't using LOOPREPEAT for timing execution
+calls, and wasn't printing the correct value for compiling calls. Increased the
+default value of LOOPREPEAT, and the number of significant figures in the
+times.
+
+4. Changed "/bin/rm" in the Makefile to "-rm" so it works on Windows NT.
+
+5. Renamed "deftables" as "dftables" to get it down to 8 characters, to avoid
+a building problem on Windows NT with a FAT file system.
+
+
+Version 2.01 21-Oct-98
+----------------------
+
+1. Changed the API for pcre_compile() to allow for the provision of a pointer
+to character tables built by pcre_maketables() in the current locale. If NULL
+is passed, the default tables are used.
+
+
+Version 2.00 24-Sep-98
+----------------------
+
+1. Since the (>?) facility is in Perl 5.005, don't require PCRE_EXTRA to enable
+it any more.
+
+2. Allow quantification of (?>) groups, and make it work correctly.
+
+3. The first character computation wasn't working for (?>) groups.
+
+4. Correct the implementation of \Z (it is permitted to match on the \n at the
+end of the subject) and add 5.005's \z, which really does match only at the
+very end of the subject.
+
+5. Remove the \X "cut" facility; Perl doesn't have it, and (?> is neater.
+
+6. Remove the ability to specify CASELESS, MULTILINE, DOTALL, and
+DOLLAR_END_ONLY at runtime, to make it possible to implement the Perl 5.005
+localized options. All options to pcre_study() were also removed.
+
+7. Add other new features from 5.005:
+
+ $(?<= positive lookbehind
+ $(?<! negative lookbehind
+ (?imsx-imsx) added the unsetting capability
+ such a setting is global if at outer level; local otherwise
+ (?imsx-imsx:) non-capturing groups with option setting
+ (?(cond)re|re) conditional pattern matching
+
+ A backreference to itself in a repeated group matches the previous
+ captured string.
+
+8. General tidying up of studying (both automatic and via "study")
+consequential on the addition of new assertions.
+
+9. As in 5.005, unlimited repeated groups that could match an empty substring
+are no longer faulted at compile time. Instead, the loop is forcibly broken at
+runtime if any iteration does actually match an empty substring.
+
+10. Include the RunTest script in the distribution.
+
+11. Added tests from the Perl 5.005_02 distribution. This showed up a few
+discrepancies, some of which were old and were also with respect to 5.004. They
+have now been fixed.
+
+
+Version 1.09 28-Apr-98
+----------------------
+
+1. A negated single character class followed by a quantifier with a minimum
+value of one (e.g. [^x]{1,6} ) was not compiled correctly. This could lead to
+program crashes, or just wrong answers. This did not apply to negated classes
+containing more than one character, or to minima other than one.
+
+
+Version 1.08 27-Mar-98
+----------------------
+
+1. Add PCRE_UNGREEDY to invert the greediness of quantifiers.
+
+2. Add (?U) and (?X) to set PCRE_UNGREEDY and PCRE_EXTRA respectively. The
+latter must appear before anything that relies on it in the pattern.
+
+
+Version 1.07 16-Feb-98
+----------------------
+
+1. A pattern such as /((a)*)*/ was not being diagnosed as in error (unlimited
+repeat of a potentially empty string).
+
+
+Version 1.06 23-Jan-98
+----------------------
+
+1. Added Markus Oberhumer's little patches for C++.
+
+2. Literal strings longer than 255 characters were broken.
+
+
+Version 1.05 23-Dec-97
+----------------------
+
+1. Negated character classes containing more than one character were failing if
+PCRE_CASELESS was set at run time.
+
+
+Version 1.04 19-Dec-97
+----------------------
+
+1. Corrected the man page, where some "const" qualifiers had been omitted.
+
+2. Made debugging output print "{0,xxx}" instead of just "{,xxx}" to agree with
+input syntax.
+
+3. Fixed memory leak which occurred when a regex with back references was
+matched with an offsets vector that wasn't big enough. The temporary memory
+that is used in this case wasn't being freed if the match failed.
+
+4. Tidied pcretest to ensure it frees memory that it gets.
+
+5. Temporary memory was being obtained in the case where the passed offsets
+vector was exactly big enough.
+
+6. Corrected definition of offsetof() from change 5 below.
+
+7. I had screwed up change 6 below and broken the rules for the use of
+setjmp(). Now fixed.
+
+
+Version 1.03 18-Dec-97
+----------------------
+
+1. A erroneous regex with a missing opening parenthesis was correctly
+diagnosed, but PCRE attempted to access brastack[-1], which could cause crashes
+on some systems.
+
+2. Replaced offsetof(real_pcre, code) by offsetof(real_pcre, code[0]) because
+it was reported that one broken compiler failed on the former because "code" is
+also an independent variable.
+
+3. The erroneous regex a[]b caused an array overrun reference.
+
+4. A regex ending with a one-character negative class (e.g. /[^k]$/) did not
+fail on data ending with that character. (It was going on too far, and checking
+the next character, typically a binary zero.) This was specific to the
+optimized code for single-character negative classes.
+
+5. Added a contributed patch from the TIN world which does the following:
+
+ + Add an undef for memmove, in case the the system defines a macro for it.
+
+ + Add a definition of offsetof(), in case there isn't one. (I don't know
+ the reason behind this - offsetof() is part of the ANSI standard - but
+ it does no harm).
+
+ + Reduce the ifdef's in pcre.c using macro DPRINTF, thereby eliminating
+ most of the places where whitespace preceded '#'. I have given up and
+ allowed the remaining 2 cases to be at the margin.
+
+ + Rename some variables in pcre to eliminate shadowing. This seems very
+ pedantic, but does no harm, of course.
+
+6. Moved the call to setjmp() into its own function, to get rid of warnings
+from gcc -Wall, and avoided calling it at all unless PCRE_EXTRA is used.
+
+7. Constructs such as \d{8,} were compiling into the equivalent of
+\d{8}\d{0,65527} instead of \d{8}\d* which didn't make much difference to the
+outcome, but in this particular case used more store than had been allocated,
+which caused the bug to be discovered because it threw up an internal error.
+
+8. The debugging code in both pcre and pcretest for outputting the compiled
+form of a regex was going wrong in the case of back references followed by
+curly-bracketed repeats.
+
+
+Version 1.02 12-Dec-97
+----------------------
+
+1. Typos in pcre.3 and comments in the source fixed.
+
+2. Applied a contributed patch to get rid of places where it used to remove
+'const' from variables, and fixed some signed/unsigned and uninitialized
+variable warnings.
+
+3. Added the "runtest" target to Makefile.
+
+4. Set default compiler flag to -O2 rather than just -O.
+
+
+Version 1.01 19-Nov-97
+----------------------
+
+1. PCRE was failing to diagnose unlimited repeat of empty string for patterns
+like /([ab]*)*/, that is, for classes with more than one character in them.
+
+2. Likewise, it wasn't diagnosing patterns with "once-only" subpatterns, such
+as /((?>a*))*/ (a PCRE_EXTRA facility).
+
+
+Version 1.00 18-Nov-97
+----------------------
+
+1. Added compile-time macros to support systems such as SunOS4 which don't have
+memmove() or strerror() but have other things that can be used instead.
+
+2. Arranged that "make clean" removes the executables.
+
+
+Version 0.99 27-Oct-97
+----------------------
+
+1. Fixed bug in code for optimizing classes with only one character. It was
+initializing a 32-byte map regardless, which could cause it to run off the end
+of the memory it had got.
+
+2. Added, conditional on PCRE_EXTRA, the proposed (?>REGEX) construction.
+
+
+Version 0.98 22-Oct-97
+----------------------
+
+1. Fixed bug in code for handling temporary memory usage when there are more
+back references than supplied space in the ovector. This could cause segfaults.
+
+
+Version 0.97 21-Oct-97
+----------------------
+
+1. Added the \X "cut" facility, conditional on PCRE_EXTRA.
+
+2. Optimized negated single characters not to use a bit map.
+
+3. Brought error texts together as macro definitions; clarified some of them;
+fixed one that was wrong - it said "range out of order" when it meant "invalid
+escape sequence".
+
+4. Changed some char * arguments to const char *.
+
+5. Added PCRE_NOTBOL and PCRE_NOTEOL (from POSIX).
+
+6. Added the POSIX-style API wrapper in pcreposix.a and testing facilities in
+pcretest.
+
+
+Version 0.96 16-Oct-97
+----------------------
+
+1. Added a simple "pgrep" utility to the distribution.
+
+2. Fixed an incompatibility with Perl: "{" is now treated as a normal character
+unless it appears in one of the precise forms "{ddd}", "{ddd,}", or "{ddd,ddd}"
+where "ddd" means "one or more decimal digits".
+
+3. Fixed serious bug. If a pattern had a back reference, but the call to
+pcre_exec() didn't supply a large enough ovector to record the related
+identifying subpattern, the match always failed. PCRE now remembers the number
+of the largest back reference, and gets some temporary memory in which to save
+the offsets during matching if necessary, in order to ensure that
+backreferences always work.
+
+4. Increased the compatibility with Perl in a number of ways:
+
+ (a) . no longer matches \n by default; an option PCRE_DOTALL is provided
+ to request this handling. The option can be set at compile or exec time.
+
+ (b) $ matches before a terminating newline by default; an option
+ PCRE_DOLLAR_ENDONLY is provided to override this (but not in multiline
+ mode). The option can be set at compile or exec time.
+
+ (c) The handling of \ followed by a digit other than 0 is now supposed to be
+ the same as Perl's. If the decimal number it represents is less than 10
+ or there aren't that many previous left capturing parentheses, an octal
+ escape is read. Inside a character class, it's always an octal escape,
+ even if it is a single digit.
+
+ (d) An escaped but undefined alphabetic character is taken as a literal,
+ unless PCRE_EXTRA is set. Currently this just reserves the remaining
+ escapes.
+
+ (e) {0} is now permitted. (The previous item is removed from the compiled
+ pattern).
+
+5. Changed all the names of code files so that the basic parts are no longer
+than 10 characters, and abolished the teeny "globals.c" file.
+
+6. Changed the handling of character classes; they are now done with a 32-byte
+bit map always.
+
+7. Added the -d and /D options to pcretest to make it possible to look at the
+internals of compilation without having to recompile pcre.
+
+
+Version 0.95 23-Sep-97
+----------------------
+
+1. Fixed bug in pre-pass concerning escaped "normal" characters such as \x5c or
+\x20 at the start of a run of normal characters. These were being treated as
+real characters, instead of the source characters being re-checked.
+
+
+Version 0.94 18-Sep-97
+----------------------
+
+1. The functions are now thread-safe, with the caveat that the global variables
+containing pointers to malloc() and free() or alternative functions are the
+same for all threads.
+
+2. Get pcre_study() to generate a bitmap of initial characters for non-
+anchored patterns when this is possible, and use it if passed to pcre_exec().
+
+
+Version 0.93 15-Sep-97
+----------------------
+
+1. /(b)|(:+)/ was computing an incorrect first character.
+
+2. Add pcre_study() to the API and the passing of pcre_extra to pcre_exec(),
+but not actually doing anything yet.
+
+3. Treat "-" characters in classes that cannot be part of ranges as literals,
+as Perl does (e.g. [-az] or [az-]).
+
+4. Set the anchored flag if a branch starts with .* or .*? because that tests
+all possible positions.
+
+5. Split up into different modules to avoid including unneeded functions in a
+compiled binary. However, compile and exec are still in one module. The "study"
+function is split off.
+
+6. The character tables are now in a separate module whose source is generated
+by an auxiliary program - but can then be edited by hand if required. There are
+now no calls to isalnum(), isspace(), isdigit(), isxdigit(), tolower() or
+toupper() in the code.
+
+7. Turn the malloc/free funtions variables into pcre_malloc and pcre_free and
+make them global. Abolish the function for setting them, as the caller can now
+set them directly.
+
+
+Version 0.92 11-Sep-97
+----------------------
+
+1. A repeat with a fixed maximum and a minimum of 1 for an ordinary character
+(e.g. /a{1,3}/) was broken (I mis-optimized it).
+
+2. Caseless matching was not working in character classes if the characters in
+the pattern were in upper case.
+
+3. Make ranges like [W-c] work in the same way as Perl for caseless matching.
+
+4. Make PCRE_ANCHORED public and accept as a compile option.
+
+5. Add an options word to pcre_exec() and accept PCRE_ANCHORED and
+PCRE_CASELESS at run time. Add escapes \A and \I to pcretest to cause it to
+pass them.
+
+6. Give an error if bad option bits passed at compile or run time.
+
+7. Add PCRE_MULTILINE at compile and exec time, and (?m) as well. Add \M to
+pcretest to cause it to pass that flag.
+
+8. Add pcre_info(), to get the number of identifying subpatterns, the stored
+options, and the first character, if set.
+
+9. Recognize C+ or C{n,m} where n >= 1 as providing a fixed starting character.
+
+
+Version 0.91 10-Sep-97
+----------------------
+
+1. PCRE was failing to diagnose unlimited repeats of subpatterns that could
+match the empty string as in /(a*)*/. It was looping and ultimately crashing.
+
+2. PCRE was looping on encountering an indefinitely repeated back reference to
+a subpattern that had matched an empty string, e.g. /(a|)\1*/. It now does what
+Perl does - treats the match as successful.
+
+****
diff --git a/src/src/pcre/LICENCE b/src/src/pcre/LICENCE
new file mode 100644
index 000000000..1573583b6
--- /dev/null
+++ b/src/src/pcre/LICENCE
@@ -0,0 +1,45 @@
+PCRE LICENCE
+------------
+
+PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+Release 5 of PCRE is distributed under the terms of the "BSD" licence, as
+specified below. The documentation for PCRE, supplied in the "doc"
+directory, is distributed under the same terms as the software itself.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+University of Cambridge Computing Service,
+Cambridge, England. Phone: +44 1223 334714.
+
+Copyright (c) 1997-2004 University of Cambridge
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+End
diff --git a/src/src/pcre/Makefile b/src/src/pcre/Makefile
new file mode 100644
index 000000000..c6562de3c
--- /dev/null
+++ b/src/src/pcre/Makefile
@@ -0,0 +1,51 @@
+# $Cambridge: exim/src/src/pcre/Makefile,v 1.1 2004/10/07 13:04:13 ph10 Exp $
+
+# Makefile for PCRE (Perl-Compatible Regular Expression) library for use by
+# Exim. This is a tailored Makefile, not the normal one that comes with the
+# PCRE distribution.
+
+# These variables are in practice overridden from the Exim Makefile.
+
+AR = ar cq
+CC = gcc -O2 -Wall
+CFLAGS =
+RANLIB = @true
+
+##############################################################################
+
+OBJ = maketables.o get.o pcre.o study.o
+
+all: libpcre.a ../pcretest
+
+../pcretest: libpcre.a pcretest.o
+ $(CC) $(CFLAGS) -o ../pcretest pcretest.o libpcre.a
+
+libpcre.a: $(OBJ)
+ -rm -f libpcre.a
+ $(AR) libpcre.a $(OBJ)
+ $(RANLIB) libpcre.a
+
+pcre.o: chartables.c pcre.c config.h pcre.h internal.h Makefile
+ $(CC) -c $(CFLAGS) pcre.c
+
+maketables.o: maketables.c config.h pcre.h internal.h Makefile
+ $(CC) -c $(CFLAGS) maketables.c
+
+get.o: get.c pcre.h config.h internal.h Makefile
+ $(CC) -c $(CFLAGS) get.c
+
+study.o: study.c pcre.h config.h internal.h Makefile
+ $(CC) -c $(CFLAGS) study.c
+
+pcretest.o: pcretest.c config.h pcre.h internal.h Makefile
+ $(CC) -c -DNOPOSIX $(CFLAGS) -I. pcretest.c
+
+# An auxiliary program makes the default character table source
+
+chartables.c: dftables
+ ./dftables chartables.c
+
+dftables: dftables.c maketables.c config.h pcre.h internal.h Makefile
+ $(CC) -o dftables $(CFLAGS) dftables.c
+
+# End
diff --git a/src/src/pcre/README b/src/src/pcre/README
new file mode 100644
index 000000000..07c2c5ad4
--- /dev/null
+++ b/src/src/pcre/README
@@ -0,0 +1,20 @@
+$Cambridge: exim/src/src/pcre/README,v 1.1 2004/10/07 13:04:13 ph10 Exp $
+
+PCRE for use in Exim
+--------------------
+
+This directory contains a subset of the files from the PCRE distribution,
+enough to supply regular expression support for Exim, plus the pcretest test
+program. Do not start from here if you want to install PCRE as a free-standing
+library for use by other programs. Get the full PCRE distribution, which can be
+obtained from
+
+ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-x.x.tar.gz
+
+where x.x is the version number. This contains support for a POSIX interface to
+PCRE, a "grep" program that uses PCRE, and the means to build PCRE as a shared
+library. It is configured by autoconf, and has "make install" support in the
+conventional way.
+
+Philip Hazel <ph10@cam.ac.uk>
+February 2000
diff --git a/src/src/pcre/config.h b/src/src/pcre/config.h
new file mode 100644
index 000000000..95be42c0e
--- /dev/null
+++ b/src/src/pcre/config.h
@@ -0,0 +1,39 @@
+/* $Cambridge: exim/src/src/pcre/config.h,v 1.1 2004/10/07 13:04:13 ph10 Exp $ */
+
+/*************************************************
+* config.h for PCRE for Exim *
+*************************************************/
+
+/* The PCRE sources include config.h, which for a free-standing PCRE build gets
+set up by autoconf. For the embedded version in Exim, this file, which is
+manually maintained, is used.
+
+The only configuration thing that matters for the PCRE library itself is
+whether the memmove() function exists or not. It should be present in all
+Standard C libraries, but is missing in SunOS4. PCRE expects autoconf to set
+HAVE_MEMMOVE to 1 in config.h when memmove() is present. If that is not set, it
+defines memmove() as a macro for bcopy().
+
+Exim works differently. It handles this case by defining memmove() as a macro
+in its os.h-SunOS4 file. We interface this to PCRE by including the os.h file
+here, and then defining HAVE_MEMOVE so that PCRE's code in internal.h leaves
+things alone. */
+
+#include "../os.h"
+#define HAVE_MEMMOVE 1
+
+/* We also set up directly a number of parameters that, in the freestanding
+PCRE, can be adjusted by "configure". */
+
+#define NEWLINE '\n'
+#define LINK_SIZE 2
+#define MATCH_LIMIT 10000000
+#define POSIX_MALLOC_THRESHOLD 10
+
+/* There is some stuff in the PCRE sources for compilation on non-Unix systems
+and non-ASCII systems. For Exim's purposes, just flatten it all. */
+
+#define EBCDIC 0
+#define EXPORT
+
+/* End */
diff --git a/src/src/pcre/dftables.c b/src/src/pcre/dftables.c
new file mode 100644
index 000000000..8458c60f1
--- /dev/null
+++ b/src/src/pcre/dftables.c
@@ -0,0 +1,173 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This is a support program to generate the file chartables.c, containing
+character tables of various kinds. They are built according to the default C
+locale and used as the default tables by PCRE. Now that pcre_maketables is
+a function visible to the outside world, we make use of its code from here in
+order to be consistent. */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "internal.h"
+
+#define DFTABLES /* maketables.c notices this */
+#include "maketables.c"
+
+
+int main(int argc, char **argv)
+{
+int i;
+FILE *f;
+const unsigned char *tables = pcre_maketables();
+
+if (argc != 2)
+ {
+ fprintf(stderr, "dftables: one filename argument is required\n");
+ return 1;
+ }
+
+f = fopen(argv[1], "w");
+if (f == NULL)
+ {
+ fprintf(stderr, "dftables: failed to open %s for writing\n", argv[1]);
+ return 1;
+ }
+
+/* There are two fprintf() calls here, because gcc in pedantic mode complains
+about the very long string otherwise. */
+
+fprintf(f,
+ "/*************************************************\n"
+ "* Perl-Compatible Regular Expressions *\n"
+ "*************************************************/\n\n"
+ "/* This file is automatically written by the dftables auxiliary \n"
+ "program. If you edit it by hand, you might like to edit the Makefile to \n"
+ "prevent its ever being regenerated.\n\n");
+fprintf(f,
+ "This file is #included in the compilation of pcre.c to build the default\n"
+ "character tables which are used when no tables are passed to the compile\n"
+ "function. */\n\n"
+ "static unsigned char pcre_default_tables[] = {\n\n"
+ "/* This table is a lower casing table. */\n\n");
+
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
+ fprintf(f, "%3d", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f, "/* This table is a case flipping table. */\n\n");
+
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0) fprintf(f, "\n ");
+ fprintf(f, "%3d", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f,
+ "/* This table contains bit maps for various character classes.\n"
+ "Each map is 32 bytes long and the bits run from the least\n"
+ "significant end of each byte. The classes that have their own\n"
+ "maps are: space, xdigit, digit, upper, lower, word, graph\n"
+ "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
+
+fprintf(f, " ");
+for (i = 0; i < cbit_length; i++)
+ {
+ if ((i & 7) == 0 && i != 0)
+ {
+ if ((i & 31) == 0) fprintf(f, "\n");
+ fprintf(f, "\n ");
+ }
+ fprintf(f, "0x%02x", *tables++);
+ if (i != cbit_length - 1) fprintf(f, ",");
+ }
+fprintf(f, ",\n\n");
+
+fprintf(f,
+ "/* This table identifies various classes of character by individual bits:\n"
+ " 0x%02x white space character\n"
+ " 0x%02x letter\n"
+ " 0x%02x decimal digit\n"
+ " 0x%02x hexadecimal digit\n"
+ " 0x%02x alphanumeric or '_'\n"
+ " 0x%02x regular expression metacharacter or binary zero\n*/\n\n",
+ ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word,
+ ctype_meta);
+
+fprintf(f, " ");
+for (i = 0; i < 256; i++)
+ {
+ if ((i & 7) == 0 && i != 0)
+ {
+ fprintf(f, " /* ");
+ if (isprint(i-8)) fprintf(f, " %c -", i-8);
+ else fprintf(f, "%3d-", i-8);
+ if (isprint(i-1)) fprintf(f, " %c ", i-1);
+ else fprintf(f, "%3d", i-1);
+ fprintf(f, " */\n ");
+ }
+ fprintf(f, "0x%02x", *tables++);
+ if (i != 255) fprintf(f, ",");
+ }
+
+fprintf(f, "};/* ");
+if (isprint(i-8)) fprintf(f, " %c -", i-8);
+ else fprintf(f, "%3d-", i-8);
+if (isprint(i-1)) fprintf(f, " %c ", i-1);
+ else fprintf(f, "%3d", i-1);
+fprintf(f, " */\n\n/* End of chartables.c */\n");
+
+fclose(f);
+return 0;
+}
+
+/* End of dftables.c */
diff --git a/src/src/pcre/get.c b/src/src/pcre/get.c
new file mode 100644
index 000000000..225843e2e
--- /dev/null
+++ b/src/src/pcre/get.c
@@ -0,0 +1,357 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2003 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This module contains some convenience functions for extracting substrings
+from the subject string after a regex match has succeeded. The original idea
+for these functions came from Scott Wimer. */
+
+
+/* Include the internals header, which itself includes Standard C headers plus
+the external pcre header. */
+
+#include "internal.h"
+
+
+/*************************************************
+* Find number for named string *
+*************************************************/
+
+/* This function is used by the two extraction functions below, as well
+as being generally available.
+
+Arguments:
+ code the compiled regex
+ stringname the name whose number is required
+
+Returns: the number of the named parentheses, or a negative number
+ (PCRE_ERROR_NOSUBSTRING) if not found
+*/
+
+int
+pcre_get_stringnumber(const pcre *code, const char *stringname)
+{
+int rc;
+int entrysize;
+int top, bot;
+uschar *nametable;
+
+if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
+ return rc;
+if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
+
+if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
+ return rc;
+if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
+ return rc;
+
+bot = 0;
+while (top > bot)
+ {
+ int mid = (top + bot) / 2;
+ uschar *entry = nametable + entrysize*mid;
+ int c = strcmp(stringname, (char *)(entry + 2));
+ if (c == 0) return (entry[0] << 8) + entry[1];
+ if (c > 0) bot = mid + 1; else top = mid;
+ }
+
+return PCRE_ERROR_NOSUBSTRING;
+}
+
+
+
+/*************************************************
+* Copy captured string to given buffer *
+*************************************************/
+
+/* This function copies a single captured substring into a given buffer.
+Note that we use memcpy() rather than strncpy() in case there are binary zeros
+in the string.
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringnumber the number of the required substring
+ buffer where to put the substring
+ size the size of the buffer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_copy_substring(const char *subject, int *ovector, int stringcount,
+ int stringnumber, char *buffer, int size)
+{
+int yield;
+if (stringnumber < 0 || stringnumber >= stringcount)
+ return PCRE_ERROR_NOSUBSTRING;
+stringnumber *= 2;
+yield = ovector[stringnumber+1] - ovector[stringnumber];
+if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
+memcpy(buffer, subject + ovector[stringnumber], yield);
+buffer[yield] = 0;
+return yield;
+}
+
+
+
+/*************************************************
+* Copy named captured string to given buffer *
+*************************************************/
+
+/* This function copies a single captured substring into a given buffer,
+identifying it by name.
+
+Arguments:
+ code the compiled regex
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringname the name of the required substring
+ buffer where to put the substring
+ size the size of the buffer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
+ int stringcount, const char *stringname, char *buffer, int size)
+{
+int n = pcre_get_stringnumber(code, stringname);
+if (n <= 0) return n;
+return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
+}
+
+
+
+/*************************************************
+* Copy all captured strings to new store *
+*************************************************/
+
+/* This function gets one chunk of store and builds a list of pointers and all
+of the captured substrings in it. A NULL pointer is put on the end of the list.
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ listptr set to point to the list of pointers
+
+Returns: if successful: 0
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
+*/
+
+int
+pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
+ const char ***listptr)
+{
+int i;
+int size = sizeof(char *);
+int double_count = stringcount * 2;
+char **stringlist;
+char *p;
+
+for (i = 0; i < double_count; i += 2)
+ size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
+
+stringlist = (char **)(pcre_malloc)(size);
+if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
+
+*listptr = (const char **)stringlist;
+p = (char *)(stringlist + stringcount + 1);
+
+for (i = 0; i < double_count; i += 2)
+ {
+ int len = ovector[i+1] - ovector[i];
+ memcpy(p, subject + ovector[i], len);
+ *stringlist++ = p;
+ p += len;
+ *p++ = 0;
+ }
+
+*stringlist = NULL;
+return 0;
+}
+
+
+
+/*************************************************
+* Free store obtained by get_substring_list *
+*************************************************/
+
+/* This function exists for the benefit of people calling PCRE from non-C
+programs that can call its functions, but not free() or (pcre_free)() directly.
+
+Argument: the result of a previous pcre_get_substring_list()
+Returns: nothing
+*/
+
+void
+pcre_free_substring_list(const char **pointer)
+{
+(pcre_free)((void *)pointer);
+}
+
+
+
+/*************************************************
+* Copy captured string to new store *
+*************************************************/
+
+/* This function copies a single captured substring into a piece of new
+store
+
+Arguments:
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringnumber the number of the required substring
+ stringptr where to put a pointer to the substring
+
+Returns: if successful:
+ the length of the string, not including the zero that
+ is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
+ PCRE_ERROR_NOSUBSTRING (-7) substring not present
+*/
+
+int
+pcre_get_substring(const char *subject, int *ovector, int stringcount,
+ int stringnumber, const char **stringptr)
+{
+int yield;
+char *substring;
+if (stringnumber < 0 || stringnumber >= stringcount)
+ return PCRE_ERROR_NOSUBSTRING;
+stringnumber *= 2;
+yield = ovector[stringnumber+1] - ovector[stringnumber];
+substring = (char *)(pcre_malloc)(yield + 1);
+if (substring == NULL) return PCRE_ERROR_NOMEMORY;
+memcpy(substring, subject + ovector[stringnumber], yield);
+substring[yield] = 0;
+*stringptr = substring;
+return yield;
+}
+
+
+
+/*************************************************
+* Copy named captured string to new store *
+*************************************************/
+
+/* This function copies a single captured substring, identified by name, into
+new store.
+
+Arguments:
+ code the compiled regex
+ subject the subject string that was matched
+ ovector pointer to the offsets table
+ stringcount the number of substrings that were captured
+ (i.e. the yield of the pcre_exec call, unless
+ that was zero, in which case it should be 1/3
+ of the offset table size)
+ stringname the name of the required substring
+ stringptr where to put the pointer
+
+Returns: if successful:
+ the length of the copied string, not including the zero
+ that is put on the end; can be zero
+ if not successful:
+ PCRE_ERROR_NOMEMORY (-6) couldn't get memory
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
+*/
+
+int
+pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
+ int stringcount, const char *stringname, const char **stringptr)
+{
+int n = pcre_get_stringnumber(code, stringname);
+if (n <= 0) return n;
+return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
+}
+
+
+
+
+/*************************************************
+* Free store obtained by get_substring *
+*************************************************/
+
+/* This function exists for the benefit of people calling PCRE from non-C
+programs that can call its functions, but not free() or (pcre_free)() directly.
+
+Argument: the result of a previous pcre_get_substring()
+Returns: nothing
+*/
+
+void
+pcre_free_substring(const char *pointer)
+{
+(pcre_free)((void *)pointer);
+}
+
+/* End of get.c */
diff --git a/src/src/pcre/internal.h b/src/src/pcre/internal.h
new file mode 100644
index 000000000..5d1433178
--- /dev/null
+++ b/src/src/pcre/internal.h
@@ -0,0 +1,752 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+
+/* This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file doc/Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This header contains definitions that are shared between the different
+modules, but which are not relevant to the outside. */
+
+/* Get the definitions provided by running "configure" */
+
+#include "config.h"
+
+/* Standard C headers plus the external interface definition. The only time
+setjmp and stdarg are used is when NO_RECURSE is set. */
+
+#include <ctype.h>
+#include <limits.h>
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef PCRE_SPY
+#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
+#endif
+
+/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
+cannot determine these outside the compilation (e.g. by running a program as
+part of "configure") because PCRE is often cross-compiled for use on other
+systems. Instead we make use of the maximum sizes that are available at
+preprocessor time in standard C environments. */
+
+#if USHRT_MAX == 65535
+ typedef unsigned short pcre_uint16;
+#elif UINT_MAX == 65535
+ typedef unsigned int pcre_uint16;
+#else
+ #error Cannot determine a type for 16-bit unsigned integers
+#endif
+
+#if UINT_MAX == 4294967295
+ typedef unsigned int pcre_uint32;
+#elif ULONG_MAX == 4294967295
+ typedef unsigned long int pcre_uint32;
+#else
+ #error Cannot determine a type for 32-bit unsigned integers
+#endif
+
+/* All character handling must be done as unsigned characters. Otherwise there
+are problems with top-bit-set characters and functions such as isspace().
+However, we leave the interface to the outside world as char *, because that
+should make things easier for callers. We define a short type for unsigned char
+to save lots of typing. I tried "uchar", but it causes problems on Digital
+Unix, where it is defined in sys/types, so use "uschar" instead. */
+
+typedef unsigned char uschar;
+
+/* Include the public PCRE header */
+
+#include "pcre.h"
+
+/* When compiling for use with the Virtual Pascal compiler, these functions
+need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
+option on the command line. */
+
+#ifdef VPCOMPAT
+#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
+#define memcpy(d,s,n) _memcpy(d,s,n)
+#define memmove(d,s,n) _memmove(d,s,n)
+#define memset(s,c,n) _memset(s,c,n)
+#else /* VPCOMPAT */
+
+/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
+define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
+is set. Otherwise, include an emulating function for those systems that have
+neither (there some non-Unix environments where this is the case). This assumes
+that all calls to memmove are moving strings upwards in store, which is the
+case in PCRE. */
+
+#if ! HAVE_MEMMOVE
+#undef memmove /* some systems may have a macro */
+#if HAVE_BCOPY
+#define memmove(a, b, c) bcopy(b, a, c)
+#else /* HAVE_BCOPY */
+void *
+pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
+{
+int i;
+dest += n;
+src += n;
+for (i = 0; i < n; ++i) *(--dest) = *(--src);
+}
+#define memmove(a, b, c) pcre_memmove(a, b, c)
+#endif /* not HAVE_BCOPY */
+#endif /* not HAVE_MEMMOVE */
+#endif /* not VPCOMPAT */
+
+
+/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
+in big-endian order) by default. These are used, for example, to link from the
+start of a subpattern to its alternatives and its end. The use of 2 bytes per
+offset limits the size of the compiled regex to around 64K, which is big enough
+for almost everybody. However, I received a request for an even bigger limit.
+For this reason, and also to make the code easier to maintain, the storing and
+loading of offsets from the byte string is now handled by the macros that are
+defined here.
+
+The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
+the config.h file, but can be overridden by using -D on the command line. This
+is automated on Unix systems via the "configure" command. */
+
+#if LINK_SIZE == 2
+
+#define PUT(a,n,d) \
+ (a[n] = (d) >> 8), \
+ (a[(n)+1] = (d) & 255)
+
+#define GET(a,n) \
+ (((a)[n] << 8) | (a)[(n)+1])
+
+#define MAX_PATTERN_SIZE (1 << 16)
+
+
+#elif LINK_SIZE == 3
+
+#define PUT(a,n,d) \
+ (a[n] = (d) >> 16), \
+ (a[(n)+1] = (d) >> 8), \
+ (a[(n)+2] = (d) & 255)
+
+#define GET(a,n) \
+ (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
+
+#define MAX_PATTERN_SIZE (1 << 24)
+
+
+#elif LINK_SIZE == 4
+
+#define PUT(a,n,d) \
+ (a[n] = (d) >> 24), \
+ (a[(n)+1] = (d) >> 16), \
+ (a[(n)+2] = (d) >> 8), \
+ (a[(n)+3] = (d) & 255)
+
+#define GET(a,n) \
+ (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
+
+#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
+
+
+#else
+#error LINK_SIZE must be either 2, 3, or 4
+#endif
+
+
+/* Convenience macro defined in terms of the others */
+
+#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
+
+
+/* PCRE uses some other 2-byte quantities that do not change when the size of
+offsets changes. There are used for repeat counts and for other things such as
+capturing parenthesis numbers in back references. */
+
+#define PUT2(a,n,d) \
+ a[n] = (d) >> 8; \
+ a[(n)+1] = (d) & 255
+
+#define GET2(a,n) \
+ (((a)[n] << 8) | (a)[(n)+1])
+
+#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2
+
+
+/* In case there is no definition of offsetof() provided - though any proper
+Standard C system should have one. */
+
+#ifndef offsetof
+#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
+#endif
+
+
+/* These are the public options that can change during matching. */
+
+#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
+
+/* Private options flags start at the most significant end of the four bytes,
+but skip the top bit so we can use ints for convenience without getting tangled
+with negative values. The public options defined in pcre.h start at the least
+significant end. Make sure they don't overlap, though now that we have expanded
+to four bytes, there is plenty of space. */
+
+#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
+#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
+#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
+#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
+#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */
+
+/* Options for the "extra" block produced by pcre_study(). */
+
+#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
+
+/* Masks for identifying the public options which are permitted at compile
+time, run time or study time, respectively. */
+
+#define PUBLIC_OPTIONS \
+ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
+ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
+ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)
+
+#define PUBLIC_EXEC_OPTIONS \
+ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
+ PCRE_PARTIAL)
+
+#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
+
+/* Magic number to provide a small check against being handed junk. */
+
+#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
+
+/* Negative values for the firstchar and reqchar variables */
+
+#define REQ_UNSET (-2)
+#define REQ_NONE (-1)
+
+/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
+variable-length repeat, or a anything other than literal characters. */
+
+#define REQ_CASELESS 0x0100 /* indicates caselessness */
+#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
+
+/* Miscellaneous definitions */
+
+typedef int BOOL;
+
+#define FALSE 0
+#define TRUE 1
+
+/* Escape items that are just an encoding of a particular data value. Note that
+ESC_n is defined as yet another macro, which is set in config.h to either \n
+(the default) or \r (which some people want). */
+
+#ifndef ESC_e
+#define ESC_e 27
+#endif
+
+#ifndef ESC_f
+#define ESC_f '\f'
+#endif
+
+#ifndef ESC_n
+#define ESC_n NEWLINE
+#endif
+
+#ifndef ESC_r
+#define ESC_r '\r'
+#endif
+
+/* We can't officially use ESC_t because it is a POSIX reserved identifier
+(presumably because of all the others like size_t). */
+
+#ifndef ESC_tee
+#define ESC_tee '\t'
+#endif
+
+/* These are escaped items that aren't just an encoding of a particular data
+value such as \n. They must have non-zero values, as check_escape() returns
+their negation. Also, they must appear in the same order as in the opcode
+definitions below, up to ESC_z. There's a dummy for OP_ANY because it
+corresponds to "." rather than an escape sequence. The final one must be
+ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
+tests in the code for an escape greater than ESC_b and less than ESC_Z to
+detect the types that may be repeated. These are the types that consume
+characters. If any new escapes are put in between that don't consume a
+character, that code will have to change. */
+
+enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
+ ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
+ ESC_Q, ESC_REF };
+
+/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
+contain UTF-8 characters with values greater than 255. */
+
+#define XCL_NOT 0x01 /* Flag: this is a negative class */
+#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
+
+#define XCL_END 0 /* Marks end of individual items */
+#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
+#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
+#define XCL_PROP 3 /* Unicode property (one property code) follows */
+#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
+
+
+/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
+that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
+OP_EOD must correspond in order to the list of escapes immediately above.
+Note that whenever this list is updated, the two macro definitions that follow
+must also be updated to match. */
+
+enum {
+ OP_END, /* 0 End of pattern */
+
+ /* Values corresponding to backslashed metacharacters */
+
+ OP_SOD, /* 1 Start of data: \A */
+ OP_SOM, /* 2 Start of match (subject + offset): \G */
+ OP_NOT_WORD_BOUNDARY, /* 3 \B */
+ OP_WORD_BOUNDARY, /* 4 \b */
+ OP_NOT_DIGIT, /* 5 \D */
+ OP_DIGIT, /* 6 \d */
+ OP_NOT_WHITESPACE, /* 7 \S */
+ OP_WHITESPACE, /* 8 \s */
+ OP_NOT_WORDCHAR, /* 9 \W */
+ OP_WORDCHAR, /* 10 \w */
+ OP_ANY, /* 11 Match any character */
+ OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
+ OP_NOTPROP, /* 13 \P (not Unicode property) */
+ OP_PROP, /* 14 \p (Unicode property) */
+ OP_EXTUNI, /* 15 \X (extended Unicode sequence */
+ OP_EODN, /* 16 End of data or \n at end of data: \Z. */
+ OP_EOD, /* 17 End of data: \z */
+
+ OP_OPT, /* 18 Set runtime options */
+ OP_CIRC, /* 19 Start of line - varies with multiline switch */
+ OP_DOLL, /* 20 End of line - varies with multiline switch */
+ OP_CHAR, /* 21 Match one character, casefully */
+ OP_CHARNC, /* 22 Match one character, caselessly */
+ OP_NOT, /* 23 Match anything but the following char */
+
+ OP_STAR, /* 24 The maximizing and minimizing versions of */
+ OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
+ OP_PLUS, /* 26 the minimizing one second. */
+ OP_MINPLUS, /* 27 This first set applies to single characters */
+ OP_QUERY, /* 28 */
+ OP_MINQUERY, /* 29 */
+ OP_UPTO, /* 30 From 0 to n matches */
+ OP_MINUPTO, /* 31 */
+ OP_EXACT, /* 32 Exactly n matches */
+
+ OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
+ OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
+ OP_NOTPLUS, /* 35 the minimizing one second. */
+ OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
+ OP_NOTQUERY, /* 37 */
+ OP_NOTMINQUERY, /* 38 */
+ OP_NOTUPTO, /* 39 From 0 to n matches */
+ OP_NOTMINUPTO, /* 40 */
+ OP_NOTEXACT, /* 41 Exactly n matches */
+
+ OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
+ OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
+ OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
+ OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
+ OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
+ OP_TYPEMINQUERY, /* 47 */
+ OP_TYPEUPTO, /* 48 From 0 to n matches */
+ OP_TYPEMINUPTO, /* 49 */
+ OP_TYPEEXACT, /* 50 Exactly n matches */
+
+ OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
+ OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
+ OP_CRPLUS, /* 53 the minimizing one second. These codes must */
+ OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
+ OP_CRQUERY, /* 55 These are for character classes and back refs */
+ OP_CRMINQUERY, /* 56 */
+ OP_CRRANGE, /* 57 These are different to the three sets above. */
+ OP_CRMINRANGE, /* 58 */
+
+ OP_CLASS, /* 59 Match a character class, chars < 256 only */
+ OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
+ class - the difference is relevant only when a UTF-8
+ character > 255 is encountered. */
+
+ OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
+ class. This does both positive and negative. */
+
+ OP_REF, /* 62 Match a back reference */
+ OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
+ OP_CALLOUT, /* 64 Call out to external function if provided */
+
+ OP_ALT, /* 65 Start of alternation */
+ OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
+ OP_KETRMAX, /* 67 These two must remain together and in this */
+ OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
+
+ /* The assertions must come before ONCE and COND */
+
+ OP_ASSERT, /* 69 Positive lookahead */
+ OP_ASSERT_NOT, /* 70 Negative lookahead */
+ OP_ASSERTBACK, /* 71 Positive lookbehind */
+ OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
+ OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
+
+ /* ONCE and COND must come after the assertions, with ONCE first, as there's
+ a test for >= ONCE for a subpattern that isn't an assertion. */
+
+ OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
+ OP_COND, /* 75 Conditional group */
+ OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
+
+ OP_BRAZERO, /* 77 These two must remain together and in this */
+ OP_BRAMINZERO, /* 78 order. */
+
+ OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
+ than can fit into an opcode. */
+
+ OP_BRA /* 80 This and greater values are used for brackets that
+ extract substrings up to EXTRACT_BASIC_MAX. After
+ that, use is made of OP_BRANUMBER. */
+};
+
+/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
+study.c that all opcodes are less than 128 in value. This makes handling UTF-8
+character sequences easier. */
+
+/* The highest extraction number before we have to start using additional
+bytes. (Originally PCRE didn't have support for extraction counts highter than
+this number.) The value is limited by the number of opcodes left after OP_BRA,
+i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
+opcodes. */
+
+#define EXTRACT_BASIC_MAX 100
+
+
+/* This macro defines textual names for all the opcodes. There are used only
+for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
+macro is referenced only in printint.c. */
+
+#define OP_NAME_LIST \
+ "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
+ "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
+ "notprop", "prop", "extuni", \
+ "\\Z", "\\z", \
+ "Opt", "^", "$", "char", "charnc", "not", \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
+ "*", "*?", "+", "+?", "?", "??", "{", "{", \
+ "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
+ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
+ "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
+ "Brazero", "Braminzero", "Branumber", "Bra"
+
+
+/* This macro defines the length of fixed length operations in the compiled
+regex. The lengths are used when searching for specific things, and also in the
+debugging printing of a compiled regex. We use a macro so that it can be
+incorporated both into pcre.c and pcretest.c without being publicly exposed.
+
+As things have been extended, some of these are no longer fixed lenths, but are
+minima instead. For example, the length of a single-character repeat may vary
+in UTF-8 mode. The code that uses this table must know about such things. */
+
+#define OP_LENGTHS \
+ 1, /* End */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
+ 1, 1, /* Any, Anybyte */ \
+ 2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \
+ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
+ 2, /* Char - the minimum length */ \
+ 2, /* Charnc - the minimum length */ \
+ 2, /* not */ \
+ /* Positive single-char repeats ** These are */ \
+ 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
+ 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
+ /* Negative single-char repeats - only for chars < 256 */ \
+ 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
+ 4, 4, 4, /* NOT upto, minupto, exact */ \
+ /* Positive type repeats */ \
+ 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
+ 4, 4, 4, /* Type upto, minupto, exact */ \
+ /* Character class & ref repeats */ \
+ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
+ 5, 5, /* CRRANGE, CRMINRANGE */ \
+ 33, /* CLASS */ \
+ 33, /* NCLASS */ \
+ 0, /* XCLASS - variable length */ \
+ 3, /* REF */ \
+ 1+LINK_SIZE, /* RECURSE */ \
+ 2+2*LINK_SIZE, /* CALLOUT */ \
+ 1+LINK_SIZE, /* Alt */ \
+ 1+LINK_SIZE, /* Ket */ \
+ 1+LINK_SIZE, /* KetRmax */ \
+ 1+LINK_SIZE, /* KetRmin */ \
+ 1+LINK_SIZE, /* Assert */ \
+ 1+LINK_SIZE, /* Assert not */ \
+ 1+LINK_SIZE, /* Assert behind */ \
+ 1+LINK_SIZE, /* Assert behind not */ \
+ 1+LINK_SIZE, /* Reverse */ \
+ 1+LINK_SIZE, /* Once */ \
+ 1+LINK_SIZE, /* COND */ \
+ 3, /* CREF */ \
+ 1, 1, /* BRAZERO, BRAMINZERO */ \
+ 3, /* BRANUMBER */ \
+ 1+LINK_SIZE /* BRA */ \
+
+
+/* A magic value for OP_CREF to indicate the "in recursion" condition. */
+
+#define CREF_RECURSE 0xffff
+
+/* The texts of compile-time error messages are defined as macros here so that
+they can be accessed by the POSIX wrapper and converted into error codes. Yes,
+I could have used error codes in the first place, but didn't feel like changing
+just to accommodate the POSIX wrapper. */
+
+#define ERR1 "\\ at end of pattern"
+#define ERR2 "\\c at end of pattern"
+#define ERR3 "unrecognized character follows \\"
+#define ERR4 "numbers out of order in {} quantifier"
+#define ERR5 "number too big in {} quantifier"
+#define ERR6 "missing terminating ] for character class"
+#define ERR7 "invalid escape sequence in character class"
+#define ERR8 "range out of order in character class"
+#define ERR9 "nothing to repeat"
+#define ERR10 "operand of unlimited repeat could match the empty string"
+#define ERR11 "internal error: unexpected repeat"
+#define ERR12 "unrecognized character after (?"
+#define ERR13 "POSIX named classes are supported only within a class"
+#define ERR14 "missing )"
+#define ERR15 "reference to non-existent subpattern"
+#define ERR16 "erroffset passed as NULL"
+#define ERR17 "unknown option bit(s) set"
+#define ERR18 "missing ) after comment"
+#define ERR19 "parentheses nested too deeply"
+#define ERR20 "regular expression too large"
+#define ERR21 "failed to get memory"
+#define ERR22 "unmatched parentheses"
+#define ERR23 "internal error: code overflow"
+#define ERR24 "unrecognized character after (?<"
+#define ERR25 "lookbehind assertion is not fixed length"
+#define ERR26 "malformed number after (?("
+#define ERR27 "conditional group contains more than two branches"
+#define ERR28 "assertion expected after (?("
+#define ERR29 "(?R or (?digits must be followed by )"
+#define ERR30 "unknown POSIX class name"
+#define ERR31 "POSIX collating elements are not supported"
+#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support"
+#define ERR33 "spare error"
+#define ERR34 "character value in \\x{...} sequence is too large"
+#define ERR35 "invalid condition (?(0)"
+#define ERR36 "\\C not allowed in lookbehind assertion"
+#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
+#define ERR38 "number after (?C is > 255"
+#define ERR39 "closing ) for (?C expected"
+#define ERR40 "recursive call could loop indefinitely"
+#define ERR41 "unrecognized character after (?P"
+#define ERR42 "syntax error after (?P"
+#define ERR43 "two named groups have the same name"
+#define ERR44 "invalid UTF-8 string"
+#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
+#define ERR46 "malformed \\P or \\p sequence"
+#define ERR47 "unknown property name after \\P or \\p"
+
+/* The real format of the start of the pcre block; the index of names and the
+code vector run on as long as necessary after the end. We store an explicit
+offset to the name table so that if a regex is compiled on one host, saved, and
+then run on another where the size of pointers is different, all might still
+be well. For the case of compiled-on-4 and run-on-8, we include an extra
+pointer that is always NULL. For future-proofing, we also include a few dummy
+fields - even though you can never get this planning right!
+
+NOTE NOTE NOTE:
+Because people can now save and re-use compiled patterns, any additions to this
+structure should be made at the end, and something earlier (e.g. a new
+flag in the options or one of the dummy fields) should indicate that the new
+fields are present. Currently PCRE always sets the dummy fields to zero.
+NOTE NOTE NOTE:
+*/
+
+typedef struct real_pcre {
+ pcre_uint32 magic_number;
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options;
+ pcre_uint32 dummy1; /* For future use, maybe */
+
+ pcre_uint16 top_bracket;
+ pcre_uint16 top_backref;
+ pcre_uint16 first_byte;
+ pcre_uint16 req_byte;
+ pcre_uint16 name_table_offset; /* Offset to name table that follows */
+ pcre_uint16 name_entry_size; /* Size of any name items */
+ pcre_uint16 name_count; /* Number of name items */
+ pcre_uint16 dummy2; /* For future use, maybe */
+
+ const unsigned char *tables; /* Pointer to tables or NULL for std */
+ const unsigned char *nullpad; /* NULL padding */
+} real_pcre;
+
+/* The format of the block used to store data from pcre_study(). The same
+remark (see NOTE above) about extending this structure applies. */
+
+typedef struct pcre_study_data {
+ pcre_uint32 size; /* Total that was malloced */
+ pcre_uint32 options;
+ uschar start_bits[32];
+} pcre_study_data;
+
+/* Structure for passing "static" information around between the functions
+doing the compiling, so that they are thread-safe. */
+
+typedef struct compile_data {
+ const uschar *lcc; /* Points to lower casing table */
+ const uschar *fcc; /* Points to case-flipping table */
+ const uschar *cbits; /* Points to character type table */
+ const uschar *ctypes; /* Points to table of type maps */
+ const uschar *start_code; /* The start of the compiled code */
+ const uschar *start_pattern; /* The start of the pattern */
+ uschar *name_table; /* The name/number table */
+ int names_found; /* Number of entries so far */
+ int name_entry_size; /* Size of each entry */
+ int top_backref; /* Maximum back reference */
+ unsigned int backref_map; /* Bitmap of low back refs */
+ int req_varyopt; /* "After variable item" flag for reqbyte */
+ BOOL nopartial; /* Set TRUE if partial won't work */
+} compile_data;
+
+/* Structure for maintaining a chain of pointers to the currently incomplete
+branches, for testing for left recursion. */
+
+typedef struct branch_chain {
+ struct branch_chain *outer;
+ uschar *current;
+} branch_chain;
+
+/* Structure for items in a linked list that represents an explicit recursive
+call within the pattern. */
+
+typedef struct recursion_info {
+ struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
+ int group_num; /* Number of group that was called */
+ const uschar *after_call; /* "Return value": points after the call in the expr */
+ const uschar *save_start; /* Old value of md->start_match */
+ int *offset_save; /* Pointer to start of saved offsets */
+ int saved_max; /* Number of saved offsets */
+} recursion_info;
+
+/* When compiling in a mode that doesn't use recursive calls to match(),
+a structure is used to remember local variables on the heap. It is defined in
+pcre.c, close to the match() function, so that it is easy to keep it in step
+with any changes of local variable. However, the pointer to the current frame
+must be saved in some "static" place over a longjmp(). We declare the
+structure here so that we can put a pointer in the match_data structure.
+NOTE: This isn't used for a "normal" compilation of pcre. */
+
+struct heapframe;
+
+/* Structure for passing "static" information around between the functions
+doing the matching, so that they are thread-safe. */
+
+typedef struct match_data {
+ unsigned long int match_call_count; /* As it says */
+ unsigned long int match_limit;/* As it says */
+ int *offset_vector; /* Offset vector */
+ int offset_end; /* One past the end */
+ int offset_max; /* The maximum usable for return data */
+ const uschar *lcc; /* Points to lower casing table */
+ const uschar *ctypes; /* Points to table of type maps */
+ BOOL offset_overflow; /* Set if too many extractions */
+ BOOL notbol; /* NOTBOL flag */
+ BOOL noteol; /* NOTEOL flag */
+ BOOL utf8; /* UTF8 flag */
+ BOOL endonly; /* Dollar not before final \n */
+ BOOL notempty; /* Empty string match not wanted */
+ BOOL partial; /* PARTIAL flag */
+ BOOL hitend; /* Hit the end of the subject at some point */
+ const uschar *start_code; /* For use when recursing */
+ const uschar *start_subject; /* Start of the subject string */
+ const uschar *end_subject; /* End of the subject string */
+ const uschar *start_match; /* Start of this match attempt */
+ const uschar *end_match_ptr; /* Subject position at end match */
+ int end_offset_top; /* Highwater mark at end of match */
+ int capture_last; /* Most recent capture number */
+ int start_offset; /* The start offset value */
+ recursion_info *recursive; /* Linked list of recursion data */
+ void *callout_data; /* To pass back to callouts */
+ struct heapframe *thisframe; /* Used only when compiling for no recursion */
+} match_data;
+
+/* Bit definitions for entries in the pcre_ctypes table. */
+
+#define ctype_space 0x01
+#define ctype_letter 0x02
+#define ctype_digit 0x04
+#define ctype_xdigit 0x08
+#define ctype_word 0x10 /* alphameric or '_' */
+#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
+
+/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
+of bits for a class map. Some classes are built by combining these tables. */
+
+#define cbit_space 0 /* [:space:] or \s */
+#define cbit_xdigit 32 /* [:xdigit:] */
+#define cbit_digit 64 /* [:digit:] or \d */
+#define cbit_upper 96 /* [:upper:] */
+#define cbit_lower 128 /* [:lower:] */
+#define cbit_word 160 /* [:word:] or \w */
+#define cbit_graph 192 /* [:graph:] */
+#define cbit_print 224 /* [:print:] */
+#define cbit_punct 256 /* [:punct:] */
+#define cbit_cntrl 288 /* [:cntrl:] */
+#define cbit_length 320 /* Length of the cbits table */
+
+/* Offsets of the various tables from the base tables pointer, and
+total length. */
+
+#define lcc_offset 0
+#define fcc_offset 256
+#define cbits_offset 512
+#define ctypes_offset (cbits_offset + cbit_length)
+#define tables_length (ctypes_offset + 256)
+
+/* End of internal.h */
diff --git a/src/src/pcre/maketables.c b/src/src/pcre/maketables.c
new file mode 100644
index 000000000..f1c7b9a5c
--- /dev/null
+++ b/src/src/pcre/maketables.c
@@ -0,0 +1,146 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2003 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file is compiled on its own as part of the PCRE library. However,
+it is also included in the compilation of dftables.c, in which case the macro
+DFTABLES is defined. */
+
+#ifndef DFTABLES
+#include "internal.h"
+#endif
+
+
+
+/*************************************************
+* Create PCRE character tables *
+*************************************************/
+
+/* This function builds a set of character tables for use by PCRE and returns
+a pointer to them. They are build using the ctype functions, and consequently
+their contents will depend upon the current locale setting. When compiled as
+part of the library, the store is obtained via pcre_malloc(), but when compiled
+inside dftables, use malloc().
+
+Arguments: none
+Returns: pointer to the contiguous block of data
+*/
+
+const unsigned char *
+pcre_maketables(void)
+{
+unsigned char *yield, *p;
+int i;
+
+#ifndef DFTABLES
+yield = (unsigned char*)(pcre_malloc)(tables_length);
+#else
+yield = (unsigned char*)malloc(tables_length);
+#endif
+
+if (yield == NULL) return NULL;
+p = yield;
+
+/* First comes the lower casing table */
+
+for (i = 0; i < 256; i++) *p++ = tolower(i);
+
+/* Next the case-flipping table */
+
+for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
+
+/* Then the character class tables. Don't try to be clever and save effort
+on exclusive ones - in some locales things may be different. Note that the
+table for "space" includes everything "isspace" gives, including VT in the
+default locale. This makes it work for the POSIX class [:space:]. */
+
+memset(p, 0, cbit_length);
+for (i = 0; i < 256; i++)
+ {
+ if (isdigit(i))
+ {
+ p[cbit_digit + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (isupper(i))
+ {
+ p[cbit_upper + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (islower(i))
+ {
+ p[cbit_lower + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (i == '_') p[cbit_word + i/8] |= 1 << (i&7);
+ if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7);
+ if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
+ if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7);
+ if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7);
+ if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7);
+ if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7);
+ }
+p += cbit_length;
+
+/* Finally, the character type table. In this, we exclude VT from the white
+space chars, because Perl doesn't recognize it as such for \s and for comments
+within regexes. */
+
+for (i = 0; i < 256; i++)
+ {
+ int x = 0;
+ if (i != 0x0b && isspace(i)) x += ctype_space;
+ if (isalpha(i)) x += ctype_letter;
+ if (isdigit(i)) x += ctype_digit;
+ if (isxdigit(i)) x += ctype_xdigit;
+ if (isalnum(i) || i == '_') x += ctype_word;
+
+ /* Note: strchr includes the terminating zero in the characters it considers.
+ In this instance, that is ok because we want binary zero to be flagged as a
+ meta-character, which in this sense is any character that terminates a run
+ of data characters. */
+
+ if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; *p++ = x; }
+
+return yield;
+}
+
+/* End of maketables.c */
diff --git a/src/src/pcre/pcre.c b/src/src/pcre/pcre.c
new file mode 100644
index 000000000..a411d00c3
--- /dev/null
+++ b/src/src/pcre/pcre.c
@@ -0,0 +1,9194 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* Define DEBUG to get debugging output on stdout. */
+/* #define DEBUG */
+
+/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
+inline, and there are *still* stupid compilers about that don't like indented
+pre-processor statements. I suppose it's only been 10 years... */
+
+#ifdef DEBUG
+#define DPRINTF(p) printf p
+#else
+#define DPRINTF(p) /*nothing*/
+#endif
+
+/* Include the internals header, which itself includes "config.h", the Standard
+C headers, and the external pcre header. */
+
+#include "internal.h"
+
+/* If Unicode Property support is wanted, include a private copy of the
+function that does it, and the table that translates names to numbers. */
+
+#ifdef SUPPORT_UCP
+#include "ucp.c"
+#include "ucptypetable.c"
+#endif
+
+/* Maximum number of items on the nested bracket stacks at compile time. This
+applies to the nesting of all kinds of parentheses. It does not limit
+un-nested, non-capturing parentheses. This number can be made bigger if
+necessary - it is used to dimension one int and one unsigned char vector at
+compile time. */
+
+#define BRASTACK_SIZE 200
+
+
+/* Maximum number of ints of offset to save on the stack for recursive calls.
+If the offset vector is bigger, malloc is used. This should be a multiple of 3,
+because the offset vector is always a multiple of 3 long. */
+
+#define REC_STACK_SAVE_MAX 30
+
+
+/* The maximum remaining length of subject we are prepared to search for a
+req_byte match. */
+
+#define REQ_BYTE_MAX 1000
+
+
+/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
+the definition is next to the definition of the opcodes in internal.h. */
+
+static const uschar OP_lengths[] = { OP_LENGTHS };
+
+/* Min and max values for the common repeats; for the maxima, 0 => infinity */
+
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+
+/* Table for handling escaped characters in the range '0'-'z'. Positive returns
+are simple data values; negative values are for special things like \d and so
+on. Zero means further processing is needed (for things like \x), or the escape
+is invalid. */
+
+#if !EBCDIC /* This is the "normal" table for ASCII systems */
+static const short int escapes[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
+ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
+ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
+-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
+-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
+ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
+ 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
+-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
+ 0, 0, -ESC_z /* x - z */
+};
+
+#else /* This is the "abnormal" table for EBCDIC systems */
+static const short int escapes[] = {
+/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
+/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
+/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
+/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
+/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
+/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
+/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
+/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
+/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
+/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
+/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
+/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
+/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
+/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
+/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
+/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
+/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
+/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
+/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
+/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
+};
+#endif
+
+
+/* Tables of names of POSIX character classes and their lengths. The list is
+terminated by a zero length entry. The first three must be alpha, upper, lower,
+as this is assumed for handling case independence. */
+
+static const char *const posix_names[] = {
+ "alpha", "lower", "upper",
+ "alnum", "ascii", "blank", "cntrl", "digit", "graph",
+ "print", "punct", "space", "word", "xdigit" };
+
+static const uschar posix_name_lengths[] = {
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
+
+/* Table of class bit maps for each POSIX class; up to three may be combined
+to form the class. The table for [:blank:] is dynamically modified to remove
+the vertical space characters. */
+
+static const int posix_class_maps[] = {
+ cbit_lower, cbit_upper, -1, /* alpha */
+ cbit_lower, -1, -1, /* lower */
+ cbit_upper, -1, -1, /* upper */
+ cbit_digit, cbit_lower, cbit_upper, /* alnum */
+ cbit_print, cbit_cntrl, -1, /* ascii */
+ cbit_space, -1, -1, /* blank - a GNU extension */
+ cbit_cntrl, -1, -1, /* cntrl */
+ cbit_digit, -1, -1, /* digit */
+ cbit_graph, -1, -1, /* graph */
+ cbit_print, -1, -1, /* print */
+ cbit_punct, -1, -1, /* punct */
+ cbit_space, -1, -1, /* space */
+ cbit_word, -1, -1, /* word - a Perl extension */
+ cbit_xdigit,-1, -1 /* xdigit */
+};
+
+/* Table to identify digits and hex digits. This is used when compiling
+patterns. Note that the tables in chartables are dependent on the locale, and
+may mark arbitrary characters as digits - but the PCRE compiling code expects
+to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
+a private table here. It costs 256 bytes, but it is a lot faster than doing
+character value tests (at least in some simple cases I timed), and in some
+applications one wants PCRE to compile efficiently as well as match
+efficiently.
+
+For convenience, we use the same bit definitions as in chartables:
+
+ 0x04 decimal digit
+ 0x08 hexadecimal digit
+
+Then we can use ctype_digit and ctype_xdigit in the code. */
+
+#if !EBCDIC /* This is the "normal" case, for ASCII systems */
+static const unsigned char digitab[] =
+ {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
+ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
+ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
+
+#else /* This is the "abnormal" case, for EBCDIC systems */
+static const unsigned char digitab[] =
+ {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
+ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
+ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
+
+static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
+ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
+ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
+ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
+ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
+ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
+ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
+ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
+ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
+ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
+ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
+ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
+ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
+ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
+ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
+ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
+ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
+#endif
+
+
+/* Definition to allow mutual recursion */
+
+static BOOL
+ compile_regex(int, int, int *, uschar **, const uschar **, const char **,
+ BOOL, int, int *, int *, branch_chain *, compile_data *);
+
+/* Structure for building a chain of data that actually lives on the
+stack, for holding the values of the subject pointer at the start of each
+subpattern, so as to detect when an empty string has been matched by a
+subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
+are on the heap, not on the stack. */
+
+typedef struct eptrblock {
+ struct eptrblock *epb_prev;
+ const uschar *epb_saved_eptr;
+} eptrblock;
+
+/* Flag bits for the match() function */
+
+#define match_condassert 0x01 /* Called to check a condition assertion */
+#define match_isgroup 0x02 /* Set if start of bracketed group */
+
+/* Non-error returns from the match() function. Error returns are externally
+defined PCRE_ERROR_xxx codes, which are all negative. */
+
+#define MATCH_MATCH 1
+#define MATCH_NOMATCH 0
+
+
+
+/*************************************************
+* Global variables *
+*************************************************/
+
+/* PCRE is thread-clean and doesn't use any global variables in the normal
+sense. However, it calls memory allocation and free functions via the four
+indirections below, and it can optionally do callouts. These values can be
+changed by the caller, but are shared between all threads. However, when
+compiling for Virtual Pascal, things are done differently (see pcre.in). */
+
+#ifndef VPCOMPAT
+#ifdef __cplusplus
+extern "C" void *(*pcre_malloc)(size_t) = malloc;
+extern "C" void (*pcre_free)(void *) = free;
+extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
+extern "C" void (*pcre_stack_free)(void *) = free;
+extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
+#else
+void *(*pcre_malloc)(size_t) = malloc;
+void (*pcre_free)(void *) = free;
+void *(*pcre_stack_malloc)(size_t) = malloc;
+void (*pcre_stack_free)(void *) = free;
+int (*pcre_callout)(pcre_callout_block *) = NULL;
+#endif
+#endif
+
+
+/*************************************************
+* Macros and tables for character handling *
+*************************************************/
+
+/* When UTF-8 encoding is being used, a character is no longer just a single
+byte. The macros for character handling generate simple sequences when used in
+byte-mode, and more complicated ones for UTF-8 characters. */
+
+#ifndef SUPPORT_UTF8
+#define GETCHAR(c, eptr) c = *eptr;
+#define GETCHARINC(c, eptr) c = *eptr++;
+#define GETCHARINCTEST(c, eptr) c = *eptr++;
+#define GETCHARLEN(c, eptr, len) c = *eptr;
+#define BACKCHAR(eptr)
+
+#else /* SUPPORT_UTF8 */
+
+/* Get the next UTF-8 character, not advancing the pointer. This is called when
+we know we are in UTF-8 mode. */
+
+#define GETCHAR(c, eptr) \
+ c = *eptr; \
+ if ((c & 0xc0) == 0xc0) \
+ { \
+ int gcii; \
+ int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; \
+ } \
+ }
+
+/* Get the next UTF-8 character, advancing the pointer. This is called when we
+know we are in UTF-8 mode. */
+
+#define GETCHARINC(c, eptr) \
+ c = *eptr++; \
+ if ((c & 0xc0) == 0xc0) \
+ { \
+ int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & utf8_table3[gcaa]) << gcss; \
+ while (gcaa-- > 0) \
+ { \
+ gcss -= 6; \
+ c |= (*eptr++ & 0x3f) << gcss; \
+ } \
+ }
+
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+
+#define GETCHARINCTEST(c, eptr) \
+ c = *eptr++; \
+ if (md->utf8 && (c & 0xc0) == 0xc0) \
+ { \
+ int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & utf8_table3[gcaa]) << gcss; \
+ while (gcaa-- > 0) \
+ { \
+ gcss -= 6; \
+ c |= (*eptr++ & 0x3f) << gcss; \
+ } \
+ }
+
+/* Get the next UTF-8 character, not advancing the pointer, incrementing length
+if there are extra bytes. This is called when we know we are in UTF-8 mode. */
+
+#define GETCHARLEN(c, eptr, len) \
+ c = *eptr; \
+ if ((c & 0xc0) == 0xc0) \
+ { \
+ int gcii; \
+ int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; \
+ } \
+ len += gcaa; \
+ }
+
+/* If the pointer is not at the start of a character, move it back until
+it is. Called only in UTF-8 mode. */
+
+#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
+
+#endif
+
+
+
+/*************************************************
+* Default character tables *
+*************************************************/
+
+/* A default set of character tables is included in the PCRE binary. Its source
+is built by the maketables auxiliary program, which uses the default C ctypes
+functions, and put in the file chartables.c. These tables are used by PCRE
+whenever the caller of pcre_compile() does not provide an alternate set of
+tables. */
+
+#include "chartables.c"
+
+
+
+#ifdef SUPPORT_UTF8
+/*************************************************
+* Tables for UTF-8 support *
+*************************************************/
+
+/* These are the breakpoints for different numbers of bytes in a UTF-8
+character. */
+
+static const int utf8_table1[] =
+ { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
+
+/* These are the indicator bits and the mask for the data bits to set in the
+first byte of a character, indexed by the number of additional bytes. */
+
+static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+
+/* Table of the number of extra characters, indexed by the first character
+masked with 0x3f. The highest number for a valid UTF-8 character is in fact
+0x3d. */
+
+static const uschar utf8_table4[] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 0 to 6 bytes.
+
+Arguments:
+ cvalue the character value
+ buffer pointer to buffer for result - at least 6 bytes long
+
+Returns: number of characters placed in the buffer
+*/
+
+static int
+ord2utf8(int cvalue, uschar *buffer)
+{
+register int i, j;
+for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+ if (cvalue <= utf8_table1[i]) break;
+buffer += i;
+for (j = i; j > 0; j--)
+ {
+ *buffer-- = 0x80 | (cvalue & 0x3f);
+ cvalue >>= 6;
+ }
+*buffer = utf8_table2[i] | cvalue;
+return i + 1;
+}
+#endif
+
+
+
+/*************************************************
+* Print compiled regex *
+*************************************************/
+
+/* The code for doing this is held in a separate file that is also included in
+pcretest.c. It defines a function called print_internals(). */
+
+#ifdef DEBUG
+#include "printint.c"
+#endif
+
+
+
+/*************************************************
+* Return version string *
+*************************************************/
+
+#define STRING(a) # a
+#define XSTRING(s) STRING(s)
+
+EXPORT const char *
+pcre_version(void)
+{
+return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
+}
+
+
+
+
+/*************************************************
+* Flip bytes in an integer *
+*************************************************/
+
+/* This function is called when the magic number in a regex doesn't match in
+order to flip its bytes to see if we are dealing with a pattern that was
+compiled on a host of different endianness. If so, this function is used to
+flip other byte values.
+
+Arguments:
+ value the number to flip
+ n the number of bytes to flip (assumed to be 2 or 4)
+
+Returns: the flipped value
+*/
+
+static long int
+byteflip(long int value, int n)
+{
+if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
+return ((value & 0x000000ff) << 24) |
+ ((value & 0x0000ff00) << 8) |
+ ((value & 0x00ff0000) >> 8) |
+ ((value & 0xff000000) >> 24);
+}
+
+
+
+/*************************************************
+* Test for a byte-flipped compiled regex *
+*************************************************/
+
+/* This function is called from pce_exec() and also from pcre_fullinfo(). Its
+job is to test whether the regex is byte-flipped - that is, it was compiled on
+a system of opposite endianness. The function is called only when the native
+MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
+relevant values into a different data block, and return it.
+
+Arguments:
+ re points to the regex
+ study points to study data, or NULL
+ internal_re points to a new regex block
+ internal_study points to a new study block
+
+Returns: the new block if is is indeed a byte-flipped regex
+ NULL if it is not
+*/
+
+static real_pcre *
+try_flipped(const real_pcre *re, real_pcre *internal_re,
+ const pcre_study_data *study, pcre_study_data *internal_study)
+{
+if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
+ return NULL;
+
+*internal_re = *re; /* To copy other fields */
+internal_re->size = byteflip(re->size, sizeof(re->size));
+internal_re->options = byteflip(re->options, sizeof(re->options));
+internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
+internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
+internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
+internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
+internal_re->name_table_offset = byteflip(re->name_table_offset,
+ sizeof(re->name_table_offset));
+internal_re->name_entry_size = byteflip(re->name_entry_size,
+ sizeof(re->name_entry_size));
+internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
+
+if (study != NULL)
+ {
+ *internal_study = *study; /* To copy other fields */
+ internal_study->size = byteflip(study->size, sizeof(study->size));
+ internal_study->options = byteflip(study->options, sizeof(study->options));
+ }
+
+return internal_re;
+}
+
+
+
+/*************************************************
+* (Obsolete) Return info about compiled pattern *
+*************************************************/
+
+/* This is the original "info" function. It picks potentially useful data out
+of the private structure, but its interface was too rigid. It remains for
+backwards compatibility. The public options are passed back in an int - though
+the re->options field has been expanded to a long int, all the public options
+at the low end of it, and so even on 16-bit systems this will still be OK.
+Therefore, I haven't changed the API for pcre_info().
+
+Arguments:
+ argument_re points to compiled code
+ optptr where to pass back the options
+ first_byte where to pass back the first character,
+ or -1 if multiline and all branches start ^,
+ or -2 otherwise
+
+Returns: number of capturing subpatterns
+ or negative values on error
+*/
+
+EXPORT int
+pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+{
+real_pcre internal_re;
+const real_pcre *re = (const real_pcre *)argument_re;
+if (re == NULL) return PCRE_ERROR_NULL;
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, NULL, NULL);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ }
+if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
+if (first_byte != NULL)
+ *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
+ ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
+return re->top_bracket;
+}
+
+
+
+/*************************************************
+* Return info about compiled pattern *
+*************************************************/
+
+/* This is a newer "info" function which has an extensible interface so
+that additional items can be added compatibly.
+
+Arguments:
+ argument_re points to compiled code
+ extra_data points extra data, or NULL
+ what what information is required
+ where where to put the information
+
+Returns: 0 if data returned, negative on error
+*/
+
+EXPORT int
+pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+ void *where)
+{
+real_pcre internal_re;
+pcre_study_data internal_study;
+const real_pcre *re = (const real_pcre *)argument_re;
+const pcre_study_data *study = NULL;
+
+if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
+
+if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
+ study = (const pcre_study_data *)extra_data->study_data;
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
+switch (what)
+ {
+ case PCRE_INFO_OPTIONS:
+ *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
+ break;
+
+ case PCRE_INFO_SIZE:
+ *((size_t *)where) = re->size;
+ break;
+
+ case PCRE_INFO_STUDYSIZE:
+ *((size_t *)where) = (study == NULL)? 0 : study->size;
+ break;
+
+ case PCRE_INFO_CAPTURECOUNT:
+ *((int *)where) = re->top_bracket;
+ break;
+
+ case PCRE_INFO_BACKREFMAX:
+ *((int *)where) = re->top_backref;
+ break;
+
+ case PCRE_INFO_FIRSTBYTE:
+ *((int *)where) =
+ ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
+ ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
+ break;
+
+ /* Make sure we pass back the pointer to the bit vector in the external
+ block, not the internal copy (with flipped integer fields). */
+
+ case PCRE_INFO_FIRSTTABLE:
+ *((const uschar **)where) =
+ (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
+ ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
+ break;
+
+ case PCRE_INFO_LASTLITERAL:
+ *((int *)where) =
+ ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
+ break;
+
+ case PCRE_INFO_NAMEENTRYSIZE:
+ *((int *)where) = re->name_entry_size;
+ break;
+
+ case PCRE_INFO_NAMECOUNT:
+ *((int *)where) = re->name_count;
+ break;
+
+ case PCRE_INFO_NAMETABLE:
+ *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
+ break;
+
+ case PCRE_INFO_DEFAULT_TABLES:
+ *((const uschar **)where) = (const uschar *)pcre_default_tables;
+ break;
+
+ default: return PCRE_ERROR_BADOPTION;
+ }
+
+return 0;
+}
+
+
+
+/*************************************************
+* Return info about what features are configured *
+*************************************************/
+
+/* This is function which has an extensible interface so that additional items
+can be added compatibly.
+
+Arguments:
+ what what information is required
+ where where to put the information
+
+Returns: 0 if data returned, negative on error
+*/
+
+EXPORT int
+pcre_config(int what, void *where)
+{
+switch (what)
+ {
+ case PCRE_CONFIG_UTF8:
+#ifdef SUPPORT_UTF8
+ *((int *)where) = 1;
+#else
+ *((int *)where) = 0;
+#endif
+ break;
+
+ case PCRE_CONFIG_UNICODE_PROPERTIES:
+#ifdef SUPPORT_UCP
+ *((int *)where) = 1;
+#else
+ *((int *)where) = 0;
+#endif
+ break;
+
+ case PCRE_CONFIG_NEWLINE:
+ *((int *)where) = NEWLINE;
+ break;
+
+ case PCRE_CONFIG_LINK_SIZE:
+ *((int *)where) = LINK_SIZE;
+ break;
+
+ case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
+ *((int *)where) = POSIX_MALLOC_THRESHOLD;
+ break;
+
+ case PCRE_CONFIG_MATCH_LIMIT:
+ *((unsigned int *)where) = MATCH_LIMIT;
+ break;
+
+ case PCRE_CONFIG_STACKRECURSE:
+#ifdef NO_RECURSE
+ *((int *)where) = 0;
+#else
+ *((int *)where) = 1;
+#endif
+ break;
+
+ default: return PCRE_ERROR_BADOPTION;
+ }
+
+return 0;
+}
+
+
+
+#ifdef DEBUG
+/*************************************************
+* Debugging function to print chars *
+*************************************************/
+
+/* Print a sequence of chars in printable format, stopping at the end of the
+subject if the requested.
+
+Arguments:
+ p points to characters
+ length number to print
+ is_subject TRUE if printing from within md->start_subject
+ md pointer to matching data block, if is_subject is TRUE
+
+Returns: nothing
+*/
+
+static void
+pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
+{
+int c;
+if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
+while (length-- > 0)
+ if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
+}
+#endif
+
+
+
+
+/*************************************************
+* Handle escapes *
+*************************************************/
+
+/* This function is called when a \ has been encountered. It either returns a
+positive value for a simple escape such as \n, or a negative value which
+encodes one of the more complicated things such as \d. When UTF-8 is enabled,
+a positive value greater than 255 may be returned. On entry, ptr is pointing at
+the \. On exit, it is on the final character of the escape sequence.
+
+Arguments:
+ ptrptr points to the pattern position pointer
+ errorptr points to the pointer to the error message
+ bracount number of previous extracting brackets
+ options the options bits
+ isclass TRUE if inside a character class
+
+Returns: zero or positive => a data character
+ negative => a special escape sequence
+ on error, errorptr is set
+*/
+
+static int
+check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
+ int options, BOOL isclass)
+{
+const uschar *ptr = *ptrptr;
+int c, i;
+
+/* If backslash is at the end of the pattern, it's an error. */
+
+c = *(++ptr);
+if (c == 0) *errorptr = ERR1;
+
+/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
+a table. A non-zero result is something that can be returned immediately.
+Otherwise further processing may be required. */
+
+#if !EBCDIC /* ASCII coding */
+else if (c < '0' || c > 'z') {} /* Not alphameric */
+else if ((i = escapes[c - '0']) != 0) c = i;
+
+#else /* EBCDIC coding */
+else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
+else if ((i = escapes[c - 0x48]) != 0) c = i;
+#endif
+
+/* Escapes that need further processing, or are illegal. */
+
+else
+ {
+ const uschar *oldptr;
+ switch (c)
+ {
+ /* A number of Perl escapes are not handled by PCRE. We give an explicit
+ error. */
+
+ case 'l':
+ case 'L':
+ case 'N':
+ case 'u':
+ case 'U':
+ *errorptr = ERR37;
+ break;
+
+ /* The handling of escape sequences consisting of a string of digits
+ starting with one that is not zero is not straightforward. By experiment,
+ the way Perl works seems to be as follows:
+
+ Outside a character class, the digits are read as a decimal number. If the
+ number is less than 10, or if there are that many previous extracting
+ left brackets, then it is a back reference. Otherwise, up to three octal
+ digits are read to form an escaped byte. Thus \123 is likely to be octal
+ 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
+ value is greater than 377, the least significant 8 bits are taken. Inside a
+ character class, \ followed by a digit is always an octal number. */
+
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+
+ if (!isclass)
+ {
+ oldptr = ptr;
+ c -= '0';
+ while ((digitab[ptr[1]] & ctype_digit) != 0)
+ c = c * 10 + *(++ptr) - '0';
+ if (c < 10 || c <= bracount)
+ {
+ c = -(ESC_REF + c);
+ break;
+ }
+ ptr = oldptr; /* Put the pointer back and fall through */
+ }
+
+ /* Handle an octal number following \. If the first digit is 8 or 9, Perl
+ generates a binary zero byte and treats the digit as a following literal.
+ Thus we have to pull back the pointer by one. */
+
+ if ((c = *ptr) >= '8')
+ {
+ ptr--;
+ c = 0;
+ break;
+ }
+
+ /* \0 always starts an octal number, but we may drop through to here with a
+ larger first octal digit. */
+
+ case '0':
+ c -= '0';
+ while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
+ c = c * 8 + *(++ptr) - '0';
+ c &= 255; /* Take least significant 8 bits */
+ break;
+
+ /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
+ which can be greater than 0xff, but only if the ddd are hex digits. */
+
+ case 'x':
+#ifdef SUPPORT_UTF8
+ if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
+ {
+ const uschar *pt = ptr + 2;
+ register int count = 0;
+ c = 0;
+ while ((digitab[*pt] & ctype_xdigit) != 0)
+ {
+ int cc = *pt++;
+ count++;
+#if !EBCDIC /* ASCII coding */
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
+ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
+#else /* EBCDIC coding */
+ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
+ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
+#endif
+ }
+ if (*pt == '}')
+ {
+ if (c < 0 || count > 8) *errorptr = ERR34;
+ ptr = pt;
+ break;
+ }
+ /* If the sequence of hex digits does not end with '}', then we don't
+ recognize this construct; fall through to the normal \x handling. */
+ }
+#endif
+
+ /* Read just a single hex char */
+
+ c = 0;
+ while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
+ {
+ int cc; /* Some compilers don't like ++ */
+ cc = *(++ptr); /* in initializers */
+#if !EBCDIC /* ASCII coding */
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
+ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
+#else /* EBCDIC coding */
+ if (cc <= 'z') cc += 64; /* Convert to upper case */
+ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
+#endif
+ }
+ break;
+
+ /* Other special escapes not starting with a digit are straightforward */
+
+ case 'c':
+ c = *(++ptr);
+ if (c == 0)
+ {
+ *errorptr = ERR2;
+ return 0;
+ }
+
+ /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
+ is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
+ (However, an EBCDIC equivalent has now been added.) */
+
+#if !EBCDIC /* ASCII coding */
+ if (c >= 'a' && c <= 'z') c -= 32;
+ c ^= 0x40;
+#else /* EBCDIC coding */
+ if (c >= 'a' && c <= 'z') c += 64;
+ c ^= 0xC0;
+#endif
+ break;
+
+ /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
+ other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
+ for Perl compatibility, it is a literal. This code looks a bit odd, but
+ there used to be some cases other than the default, and there may be again
+ in future, so I haven't "optimized" it. */
+
+ default:
+ if ((options & PCRE_EXTRA) != 0) switch(c)
+ {
+ default:
+ *errorptr = ERR3;
+ break;
+ }
+ break;
+ }
+ }
+
+*ptrptr = ptr;
+return c;
+}
+
+
+
+#ifdef SUPPORT_UCP
+/*************************************************
+* Handle \P and \p *
+*************************************************/
+
+/* This function is called after \P or \p has been encountered, provided that
+PCRE is compiled with support for Unicode properties. On entry, ptrptr is
+pointing at the P or p. On exit, it is pointing at the final character of the
+escape sequence.
+
+Argument:
+ ptrptr points to the pattern position pointer
+ negptr points to a boolean that is set TRUE for negation else FALSE
+ errorptr points to the pointer to the error message
+
+Returns: value from ucp_type_table, or -1 for an invalid type
+*/
+
+static int
+get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
+{
+int c, i, bot, top;
+const uschar *ptr = *ptrptr;
+char name[4];
+
+c = *(++ptr);
+if (c == 0) goto ERROR_RETURN;
+
+*negptr = FALSE;
+
+/* \P or \p can be followed by a one- or two-character name in {}, optionally
+preceded by ^ for negation. */
+
+if (c == '{')
+ {
+ if (ptr[1] == '^')
+ {
+ *negptr = TRUE;
+ ptr++;
+ }
+ for (i = 0; i <= 2; i++)
+ {
+ c = *(++ptr);
+ if (c == 0) goto ERROR_RETURN;
+ if (c == '}') break;
+ name[i] = c;
+ }
+ if (c !='}') /* Try to distinguish error cases */
+ {
+ while (*(++ptr) != 0 && *ptr != '}');
+ if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
+ }
+ name[i] = 0;
+ }
+
+/* Otherwise there is just one following character */
+
+else
+ {
+ name[0] = c;
+ name[1] = 0;
+ }
+
+*ptrptr = ptr;
+
+/* Search for a recognized property name using binary chop */
+
+bot = 0;
+top = sizeof(utt)/sizeof(ucp_type_table);
+
+while (bot < top)
+ {
+ i = (bot + top)/2;
+ c = strcmp(name, utt[i].name);
+ if (c == 0) return utt[i].value;
+ if (c > 0) bot = i + 1; else top = i;
+ }
+
+UNKNOWN_RETURN:
+*errorptr = ERR47;
+*ptrptr = ptr;
+return -1;
+
+ERROR_RETURN:
+*errorptr = ERR46;
+*ptrptr = ptr;
+return -1;
+}
+#endif
+
+
+
+
+/*************************************************
+* Check for counted repeat *
+*************************************************/
+
+/* This function is called when a '{' is encountered in a place where it might
+start a quantifier. It looks ahead to see if it really is a quantifier or not.
+It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
+where the ddds are digits.
+
+Arguments:
+ p pointer to the first char after '{'
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+is_counted_repeat(const uschar *p)
+{
+if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
+while ((digitab[*p] & ctype_digit) != 0) p++;
+if (*p == '}') return TRUE;
+
+if (*p++ != ',') return FALSE;
+if (*p == '}') return TRUE;
+
+if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
+while ((digitab[*p] & ctype_digit) != 0) p++;
+
+return (*p == '}');
+}
+
+
+
+/*************************************************
+* Read repeat counts *
+*************************************************/
+
+/* Read an item of the form {n,m} and return the values. This is called only
+after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
+so the syntax is guaranteed to be correct, but we need to check the values.
+
+Arguments:
+ p pointer to first char after '{'
+ minp pointer to int for min
+ maxp pointer to int for max
+ returned as -1 if no max
+ errorptr points to pointer to error message
+
+Returns: pointer to '}' on success;
+ current ptr on error, with errorptr set
+*/
+
+static const uschar *
+read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
+{
+int min = 0;
+int max = -1;
+
+while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
+
+if (*p == '}') max = min; else
+ {
+ if (*(++p) != '}')
+ {
+ max = 0;
+ while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
+ if (max < min)
+ {
+ *errorptr = ERR4;
+ return p;
+ }
+ }
+ }
+
+/* Do paranoid checks, then fill in the required variables, and pass back the
+pointer to the terminating '}'. */
+
+if (min > 65535 || max > 65535)
+ *errorptr = ERR5;
+else
+ {
+ *minp = min;
+ *maxp = max;
+ }
+return p;
+}
+
+
+
+/*************************************************
+* Find first significant op code *
+*************************************************/
+
+/* This is called by several functions that scan a compiled expression looking
+for a fixed first character, or an anchoring op code etc. It skips over things
+that do not influence this. For some calls, a change of option is important.
+For some calls, it makes sense to skip negative forward and all backward
+assertions, and also the \b assertion; for others it does not.
+
+Arguments:
+ code pointer to the start of the group
+ options pointer to external options
+ optbit the option bit whose changing is significant, or
+ zero if none are
+ skipassert TRUE if certain assertions are to be skipped
+
+Returns: pointer to the first significant opcode
+*/
+
+static const uschar*
+first_significant_code(const uschar *code, int *options, int optbit,
+ BOOL skipassert)
+{
+for (;;)
+ {
+ switch ((int)*code)
+ {
+ case OP_OPT:
+ if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
+ *options = (int)code[1];
+ code += 2;
+ break;
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ if (!skipassert) return code;
+ do code += GET(code, 1); while (*code == OP_ALT);
+ code += OP_lengths[*code];
+ break;
+
+ case OP_WORD_BOUNDARY:
+ case OP_NOT_WORD_BOUNDARY:
+ if (!skipassert) return code;
+ /* Fall through */
+
+ case OP_CALLOUT:
+ case OP_CREF:
+ case OP_BRANUMBER:
+ code += OP_lengths[*code];
+ break;
+
+ default:
+ return code;
+ }
+ }
+/* Control never reaches here */
+}
+
+
+
+
+/*************************************************
+* Find the fixed length of a pattern *
+*************************************************/
+
+/* Scan a pattern and compute the fixed length of subject that will match it,
+if the length is fixed. This is needed for dealing with backward assertions.
+In UTF8 mode, the result is in characters rather than bytes.
+
+Arguments:
+ code points to the start of the pattern (the bracket)
+ options the compiling options
+
+Returns: the fixed length, or -1 if there is no fixed length,
+ or -2 if \C was encountered
+*/
+
+static int
+find_fixedlength(uschar *code, int options)
+{
+int length = -1;
+
+register int branchlength = 0;
+register uschar *cc = code + 1 + LINK_SIZE;
+
+/* Scan along the opcodes for this branch. If we get to the end of the
+branch, check the length against that of the other branches. */
+
+for (;;)
+ {
+ int d;
+ register int op = *cc;
+ if (op >= OP_BRA) op = OP_BRA;
+
+ switch (op)
+ {
+ case OP_BRA:
+ case OP_ONCE:
+ case OP_COND:
+ d = find_fixedlength(cc, options);
+ if (d < 0) return d;
+ branchlength += d;
+ do cc += GET(cc, 1); while (*cc == OP_ALT);
+ cc += 1 + LINK_SIZE;
+ break;
+
+ /* Reached end of a branch; if it's a ket it is the end of a nested
+ call. If it's ALT it is an alternation in a nested call. If it is
+ END it's the end of the outer call. All can be handled by the same code. */
+
+ case OP_ALT:
+ case OP_KET:
+ case OP_KETRMAX:
+ case OP_KETRMIN:
+ case OP_END:
+ if (length < 0) length = branchlength;
+ else if (length != branchlength) return -1;
+ if (*cc != OP_ALT) return length;
+ cc += 1 + LINK_SIZE;
+ branchlength = 0;
+ break;
+
+ /* Skip over assertive subpatterns */
+
+ case OP_ASSERT:
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ do cc += GET(cc, 1); while (*cc == OP_ALT);
+ /* Fall through */
+
+ /* Skip over things that don't match chars */
+
+ case OP_REVERSE:
+ case OP_BRANUMBER:
+ case OP_CREF:
+ case OP_OPT:
+ case OP_CALLOUT:
+ case OP_SOD:
+ case OP_SOM:
+ case OP_EOD:
+ case OP_EODN:
+ case OP_CIRC:
+ case OP_DOLL:
+ case OP_NOT_WORD_BOUNDARY:
+ case OP_WORD_BOUNDARY:
+ cc += OP_lengths[*cc];
+ break;
+
+ /* Handle literal characters */
+
+ case OP_CHAR:
+ case OP_CHARNC:
+ branchlength++;
+ cc += 2;
+#ifdef SUPPORT_UTF8
+ if ((options & PCRE_UTF8) != 0)
+ {
+ while ((*cc & 0xc0) == 0x80) cc++;
+ }
+#endif
+ break;
+
+ /* Handle exact repetitions. The count is already in characters, but we
+ need to skip over a multibyte character in UTF8 mode. */
+
+ case OP_EXACT:
+ branchlength += GET2(cc,1);
+ cc += 4;
+#ifdef SUPPORT_UTF8
+ if ((options & PCRE_UTF8) != 0)
+ {
+ while((*cc & 0x80) == 0x80) cc++;
+ }
+#endif
+ break;
+
+ case OP_TYPEEXACT:
+ branchlength += GET2(cc,1);
+ cc += 4;
+ break;
+
+ /* Handle single-char matchers */
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ cc++;
+ /* Fall through */
+
+ case OP_NOT_DIGIT:
+ case OP_DIGIT:
+ case OP_NOT_WHITESPACE:
+ case OP_WHITESPACE:
+ case OP_NOT_WORDCHAR:
+ case OP_WORDCHAR:
+ case OP_ANY:
+ branchlength++;
+ cc++;
+ break;
+
+ /* The single-byte matcher isn't allowed */
+
+ case OP_ANYBYTE:
+ return -2;
+
+ /* Check a class for variable quantification */
+
+#ifdef SUPPORT_UTF8
+ case OP_XCLASS:
+ cc += GET(cc, 1) - 33;
+ /* Fall through */
+#endif
+
+ case OP_CLASS:
+ case OP_NCLASS:
+ cc += 33;
+
+ switch (*cc)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ return -1;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ if (GET2(cc,1) != GET2(cc,3)) return -1;
+ branchlength += GET2(cc,1);
+ cc += 5;
+ break;
+
+ default:
+ branchlength++;
+ }
+ break;
+
+ /* Anything else is variable length */
+
+ default:
+ return -1;
+ }
+ }
+/* Control never gets here */
+}
+
+
+
+
+/*************************************************
+* Scan compiled regex for numbered bracket *
+*************************************************/
+
+/* This little function scans through a compiled pattern until it finds a
+capturing bracket with the given number.
+
+Arguments:
+ code points to start of expression
+ utf8 TRUE in UTF-8 mode
+ number the required bracket number
+
+Returns: pointer to the opcode for the bracket, or NULL if not found
+*/
+
+static const uschar *
+find_bracket(const uschar *code, BOOL utf8, int number)
+{
+#ifndef SUPPORT_UTF8
+utf8 = utf8; /* Stop pedantic compilers complaining */
+#endif
+
+for (;;)
+ {
+ register int c = *code;
+ if (c == OP_END) return NULL;
+ else if (c > OP_BRA)
+ {
+ int n = c - OP_BRA;
+ if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
+ if (n == number) return (uschar *)code;
+ code += OP_lengths[OP_BRA];
+ }
+ else
+ {
+ code += OP_lengths[c];
+
+#ifdef SUPPORT_UTF8
+
+ /* In UTF-8 mode, opcodes that are followed by a character may be followed
+ by a multi-byte character. The length in the table is a minimum, so we have
+ to scan along to skip the extra bytes. All opcodes are less than 128, so we
+ can use relatively efficient code. */
+
+ if (utf8) switch(c)
+ {
+ case OP_CHAR:
+ case OP_CHARNC:
+ case OP_EXACT:
+ case OP_UPTO:
+ case OP_MINUPTO:
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ while ((*code & 0xc0) == 0x80) code++;
+ break;
+
+ /* XCLASS is used for classes that cannot be represented just by a bit
+ map. This includes negated single high-valued characters. The length in
+ the table is zero; the actual length is stored in the compiled code. */
+
+ case OP_XCLASS:
+ code += GET(code, 1) + 1;
+ break;
+ }
+#endif
+ }
+ }
+}
+
+
+
+/*************************************************
+* Scan compiled regex for recursion reference *
+*************************************************/
+
+/* This little function scans through a compiled pattern until it finds an
+instance of OP_RECURSE.
+
+Arguments:
+ code points to start of expression
+ utf8 TRUE in UTF-8 mode
+
+Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
+*/
+
+static const uschar *
+find_recurse(const uschar *code, BOOL utf8)
+{
+#ifndef SUPPORT_UTF8
+utf8 = utf8; /* Stop pedantic compilers complaining */
+#endif
+
+for (;;)
+ {
+ register int c = *code;
+ if (c == OP_END) return NULL;
+ else if (c == OP_RECURSE) return code;
+ else if (c > OP_BRA)
+ {
+ code += OP_lengths[OP_BRA];
+ }
+ else
+ {
+ code += OP_lengths[c];
+
+#ifdef SUPPORT_UTF8
+
+ /* In UTF-8 mode, opcodes that are followed by a character may be followed
+ by a multi-byte character. The length in the table is a minimum, so we have
+ to scan along to skip the extra bytes. All opcodes are less than 128, so we
+ can use relatively efficient code. */
+
+ if (utf8) switch(c)
+ {
+ case OP_CHAR:
+ case OP_CHARNC:
+ case OP_EXACT:
+ case OP_UPTO:
+ case OP_MINUPTO:
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ while ((*code & 0xc0) == 0x80) code++;
+ break;
+
+ /* XCLASS is used for classes that cannot be represented just by a bit
+ map. This includes negated single high-valued characters. The length in
+ the table is zero; the actual length is stored in the compiled code. */
+
+ case OP_XCLASS:
+ code += GET(code, 1) + 1;
+ break;
+ }
+#endif
+ }
+ }
+}
+
+
+
+/*************************************************
+* Scan compiled branch for non-emptiness *
+*************************************************/
+
+/* This function scans through a branch of a compiled pattern to see whether it
+can match the empty string or not. It is called only from could_be_empty()
+below. Note that first_significant_code() skips over assertions. If we hit an
+unclosed bracket, we return "empty" - this means we've struck an inner bracket
+whose current branch will already have been scanned.
+
+Arguments:
+ code points to start of search
+ endcode points to where to stop
+ utf8 TRUE if in UTF8 mode
+
+Returns: TRUE if what is matched could be empty
+*/
+
+static BOOL
+could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
+{
+register int c;
+for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
+ code < endcode;
+ code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
+ {
+ const uschar *ccode;
+
+ c = *code;
+
+ if (c >= OP_BRA)
+ {
+ BOOL empty_branch;
+ if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
+
+ /* Scan a closed bracket */
+
+ empty_branch = FALSE;
+ do
+ {
+ if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
+ empty_branch = TRUE;
+ code += GET(code, 1);
+ }
+ while (*code == OP_ALT);
+ if (!empty_branch) return FALSE; /* All branches are non-empty */
+ code += 1 + LINK_SIZE;
+ c = *code;
+ }
+
+ else switch (c)
+ {
+ /* Check for quantifiers after a class */
+
+#ifdef SUPPORT_UTF8
+ case OP_XCLASS:
+ ccode = code + GET(code, 1);
+ goto CHECK_CLASS_REPEAT;
+#endif
+
+ case OP_CLASS:
+ case OP_NCLASS:
+ ccode = code + 33;
+
+#ifdef SUPPORT_UTF8
+ CHECK_CLASS_REPEAT:
+#endif
+
+ switch (*ccode)
+ {
+ case OP_CRSTAR: /* These could be empty; continue */
+ case OP_CRMINSTAR:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ break;
+
+ default: /* Non-repeat => class must match */
+ case OP_CRPLUS: /* These repeats aren't empty */
+ case OP_CRMINPLUS:
+ return FALSE;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
+ break;
+ }
+ break;
+
+ /* Opcodes that must match a character */
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ case OP_EXTUNI:
+ case OP_NOT_DIGIT:
+ case OP_DIGIT:
+ case OP_NOT_WHITESPACE:
+ case OP_WHITESPACE:
+ case OP_NOT_WORDCHAR:
+ case OP_WORDCHAR:
+ case OP_ANY:
+ case OP_ANYBYTE:
+ case OP_CHAR:
+ case OP_CHARNC:
+ case OP_NOT:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_EXACT:
+ case OP_NOTPLUS:
+ case OP_NOTMINPLUS:
+ case OP_NOTEXACT:
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ case OP_TYPEEXACT:
+ return FALSE;
+
+ /* End of branch */
+
+ case OP_KET:
+ case OP_KETRMAX:
+ case OP_KETRMIN:
+ case OP_ALT:
+ return TRUE;
+
+ /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
+ followed by a multibyte character */
+
+#ifdef SUPPORT_UTF8
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ case OP_UPTO:
+ case OP_MINUPTO:
+ if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
+ break;
+#endif
+ }
+ }
+
+return TRUE;
+}
+
+
+
+/*************************************************
+* Scan compiled regex for non-emptiness *
+*************************************************/
+
+/* This function is called to check for left recursive calls. We want to check
+the current branch of the current pattern to see if it could match the empty
+string. If it could, we must look outwards for branches at other levels,
+stopping when we pass beyond the bracket which is the subject of the recursion.
+
+Arguments:
+ code points to start of the recursion
+ endcode points to where to stop (current RECURSE item)
+ bcptr points to the chain of current (unclosed) branch starts
+ utf8 TRUE if in UTF-8 mode
+
+Returns: TRUE if what is matched could be empty
+*/
+
+static BOOL
+could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
+ BOOL utf8)
+{
+while (bcptr != NULL && bcptr->current >= code)
+ {
+ if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
+ bcptr = bcptr->outer;
+ }
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for POSIX class syntax *
+*************************************************/
+
+/* This function is called when the sequence "[:" or "[." or "[=" is
+encountered in a character class. It checks whether this is followed by an
+optional ^ and then a sequence of letters, terminated by a matching ":]" or
+".]" or "=]".
+
+Argument:
+ ptr pointer to the initial [
+ endptr where to return the end pointer
+ cd pointer to compile data
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
+{
+int terminator; /* Don't combine these lines; the Solaris cc */
+terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
+if (*(++ptr) == '^') ptr++;
+while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
+if (*ptr == terminator && ptr[1] == ']')
+ {
+ *endptr = ptr;
+ return TRUE;
+ }
+return FALSE;
+}
+
+
+
+
+/*************************************************
+* Check POSIX class name *
+*************************************************/
+
+/* This function is called to check the name given in a POSIX-style class entry
+such as [:alnum:].
+
+Arguments:
+ ptr points to the first letter
+ len the length of the name
+
+Returns: a value representing the name, or -1 if unknown
+*/
+
+static int
+check_posix_name(const uschar *ptr, int len)
+{
+register int yield = 0;
+while (posix_name_lengths[yield] != 0)
+ {
+ if (len == posix_name_lengths[yield] &&
+ strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
+ yield++;
+ }
+return -1;
+}
+
+
+/*************************************************
+* Adjust OP_RECURSE items in repeated group *
+*************************************************/
+
+/* OP_RECURSE items contain an offset from the start of the regex to the group
+that is referenced. This means that groups can be replicated for fixed
+repetition simply by copying (because the recursion is allowed to refer to
+earlier groups that are outside the current group). However, when a group is
+optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
+it, after it has been compiled. This means that any OP_RECURSE items within it
+that refer to the group itself or any contained groups have to have their
+offsets adjusted. That is the job of this function. Before it is called, the
+partially compiled regex must be temporarily terminated with OP_END.
+
+Arguments:
+ group points to the start of the group
+ adjust the amount by which the group is to be moved
+ utf8 TRUE in UTF-8 mode
+ cd contains pointers to tables etc.
+
+Returns: nothing
+*/
+
+static void
+adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
+{
+uschar *ptr = group;
+while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
+ {
+ int offset = GET(ptr, 1);
+ if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
+ ptr += 1 + LINK_SIZE;
+ }
+}
+
+
+
+/*************************************************
+* Insert an automatic callout point *
+*************************************************/
+
+/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
+callout points before each pattern item.
+
+Arguments:
+ code current code pointer
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: new code pointer
+*/
+
+static uschar *
+auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
+{
+*code++ = OP_CALLOUT;
+*code++ = 255;
+PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
+PUT(code, LINK_SIZE, 0); /* Default length */
+return code + 2*LINK_SIZE;
+}
+
+
+
+/*************************************************
+* Complete a callout item *
+*************************************************/
+
+/* A callout item contains the length of the next item in the pattern, which
+we can't fill in till after we have reached the relevant point. This is used
+for both automatic and manual callouts.
+
+Arguments:
+ previous_callout points to previous callout item
+ ptr current pattern pointer
+ cd pointers to tables etc
+
+Returns: nothing
+*/
+
+static void
+complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
+{
+int length = ptr - cd->start_pattern - GET(previous_callout, 2);
+PUT(previous_callout, 2 + LINK_SIZE, length);
+}
+
+
+
+#ifdef SUPPORT_UCP
+/*************************************************
+* Get othercase range *
+*************************************************/
+
+/* This function is passed the start and end of a class range, in UTF-8 mode
+with UCP support. It searches up the characters, looking for internal ranges of
+characters in the "other" case. Each call returns the next one, updating the
+start address.
+
+Arguments:
+ cptr points to starting character value; updated
+ d end value
+ ocptr where to put start of othercase range
+ odptr where to put end of othercase range
+
+Yield: TRUE when range returned; FALSE when no more
+*/
+
+static BOOL
+get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
+{
+int c, chartype, othercase, next;
+
+for (c = *cptr; c <= d; c++)
+ {
+ if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
+ }
+
+if (c > d) return FALSE;
+
+*ocptr = othercase;
+next = othercase + 1;
+
+for (++c; c <= d; c++)
+ {
+ if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
+ break;
+ next++;
+ }
+
+*odptr = next - 1;
+*cptr = c;
+
+return TRUE;
+}
+#endif /* SUPPORT_UCP */
+
+
+/*************************************************
+* Compile one branch *
+*************************************************/
+
+/* Scan the pattern, compiling it into the code vector. If the options are
+changed during the branch, the pointer is used to change the external options
+bits.
+
+Arguments:
+ optionsptr pointer to the option bits
+ brackets points to number of extracting brackets used
+ codeptr points to the pointer to the current code point
+ ptrptr points to the current pattern pointer
+ errorptr points to pointer to error message
+ firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
+ reqbyteptr set to the last literal character required, else < 0
+ bcptr points to current branch chain
+ cd contains pointers to tables etc.
+
+Returns: TRUE on success
+ FALSE, with *errorptr set on error
+*/
+
+static BOOL
+compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
+ const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
+ int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
+{
+int repeat_type, op_type;
+int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
+int bravalue = 0;
+int greedy_default, greedy_non_default;
+int firstbyte, reqbyte;
+int zeroreqbyte, zerofirstbyte;
+int req_caseopt, reqvary, tempreqvary;
+int condcount = 0;
+int options = *optionsptr;
+int after_manual_callout = 0;
+register int c;
+register uschar *code = *codeptr;
+uschar *tempcode;
+BOOL inescq = FALSE;
+BOOL groupsetfirstbyte = FALSE;
+const uschar *ptr = *ptrptr;
+const uschar *tempptr;
+uschar *previous = NULL;
+uschar *previous_callout = NULL;
+uschar classbits[32];
+
+#ifdef SUPPORT_UTF8
+BOOL class_utf8;
+BOOL utf8 = (options & PCRE_UTF8) != 0;
+uschar *class_utf8data;
+uschar utf8_char[6];
+#else
+BOOL utf8 = FALSE;
+#endif
+
+/* Set up the default and non-default settings for greediness */
+
+greedy_default = ((options & PCRE_UNGREEDY) != 0);
+greedy_non_default = greedy_default ^ 1;
+
+/* Initialize no first byte, no required byte. REQ_UNSET means "no char
+matching encountered yet". It gets changed to REQ_NONE if we hit something that
+matches a non-fixed char first char; reqbyte just remains unset if we never
+find one.
+
+When we hit a repeat whose minimum is zero, we may have to adjust these values
+to take the zero repeat into account. This is implemented by setting them to
+zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
+item types that can be repeated set these backoff variables appropriately. */
+
+firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
+
+/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
+according to the current setting of the caseless flag. REQ_CASELESS is a bit
+value > 255. It is added into the firstbyte or reqbyte variables to record the
+case status of the value. This is used only for ASCII characters. */
+
+req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
+
+/* Switch on next character until the end of the branch */
+
+for (;; ptr++)
+ {
+ BOOL negate_class;
+ BOOL possessive_quantifier;
+ BOOL is_quantifier;
+ int class_charcount;
+ int class_lastchar;
+ int newoptions;
+ int recno;
+ int skipbytes;
+ int subreqbyte;
+ int subfirstbyte;
+ int mclength;
+ uschar mcbuffer[8];
+
+ /* Next byte in the pattern */
+
+ c = *ptr;
+
+ /* If in \Q...\E, check for the end; if not, we have a literal */
+
+ if (inescq && c != 0)
+ {
+ if (c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
+ }
+ else
+ {
+ if (previous_callout != NULL)
+ {
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+ if ((options & PCRE_AUTO_CALLOUT) != 0)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+ goto NORMAL_CHAR;
+ }
+ }
+
+ /* Fill in length of a previous callout, except when the next thing is
+ a quantifier. */
+
+ is_quantifier = c == '*' || c == '+' || c == '?' ||
+ (c == '{' && is_counted_repeat(ptr+1));
+
+ if (!is_quantifier && previous_callout != NULL &&
+ after_manual_callout-- <= 0)
+ {
+ complete_callout(previous_callout, ptr, cd);
+ previous_callout = NULL;
+ }
+
+ /* In extended mode, skip white space and comments */
+
+ if ((options & PCRE_EXTENDED) != 0)
+ {
+ if ((cd->ctypes[c] & ctype_space) != 0) continue;
+ if (c == '#')
+ {
+ /* The space before the ; is to avoid a warning on a silly compiler
+ on the Macintosh. */
+ while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
+ if (c != 0) continue; /* Else fall through to handle end of string */
+ }
+ }
+
+ /* No auto callout for quantifiers. */
+
+ if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
+ {
+ previous_callout = code;
+ code = auto_callout(code, ptr, cd);
+ }
+
+ switch(c)
+ {
+ /* The branch terminates at end of string, |, or ). */
+
+ case 0:
+ case '|':
+ case ')':
+ *firstbyteptr = firstbyte;
+ *reqbyteptr = reqbyte;
+ *codeptr = code;
+ *ptrptr = ptr;
+ return TRUE;
+
+ /* Handle single-character metacharacters. In multiline mode, ^ disables
+ the setting of any following char as a first character. */
+
+ case '^':
+ if ((options & PCRE_MULTILINE) != 0)
+ {
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ }
+ previous = NULL;
+ *code++ = OP_CIRC;
+ break;
+
+ case '$':
+ previous = NULL;
+ *code++ = OP_DOLL;
+ break;
+
+ /* There can never be a first char if '.' is first, whatever happens about
+ repeats. The value of reqbyte doesn't change either. */
+
+ case '.':
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+ previous = code;
+ *code++ = OP_ANY;
+ break;
+
+ /* Character classes. If the included characters are all < 255 in value, we
+ build a 32-byte bitmap of the permitted characters, except in the special
+ case where there is only one such character. For negated classes, we build
+ the map as usual, then invert it at the end. However, we use a different
+ opcode so that data characters > 255 can be handled correctly.
+
+ If the class contains characters outside the 0-255 range, a different
+ opcode is compiled. It may optionally have a bit map for characters < 256,
+ but those above are are explicitly listed afterwards. A flag byte tells
+ whether the bitmap is present, and whether this is a negated class or not.
+ */
+
+ case '[':
+ previous = code;
+
+ /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
+ they are encountered at the top level, so we'll do that too. */
+
+ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+ check_posix_syntax(ptr, &tempptr, cd))
+ {
+ *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
+ goto FAILED;
+ }
+
+ /* If the first character is '^', set the negation flag and skip it. */
+
+ if ((c = *(++ptr)) == '^')
+ {
+ negate_class = TRUE;
+ c = *(++ptr);
+ }
+ else
+ {
+ negate_class = FALSE;
+ }
+
+ /* Keep a count of chars with values < 256 so that we can optimize the case
+ of just a single character (as long as it's < 256). For higher valued UTF-8
+ characters, we don't yet do any optimization. */
+
+ class_charcount = 0;
+ class_lastchar = -1;
+
+#ifdef SUPPORT_UTF8
+ class_utf8 = FALSE; /* No chars >= 256 */
+ class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
+#endif
+
+ /* Initialize the 32-char bit map to all zeros. We have to build the
+ map in a temporary bit of store, in case the class contains only 1
+ character (< 256), because in that case the compiled code doesn't use the
+ bit map. */
+
+ memset(classbits, 0, 32 * sizeof(uschar));
+
+ /* Process characters until ] is reached. By writing this as a "do" it
+ means that an initial ] is taken as a data character. The first pass
+ through the regex checked the overall syntax, so we don't need to be very
+ strict here. At the start of the loop, c contains the first byte of the
+ character. */
+
+ do
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
+ { /* Braces are required because the */
+ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
+ }
+#endif
+
+ /* Inside \Q...\E everything is literal except \E */
+
+ if (inescq)
+ {
+ if (c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
+ }
+ else goto LONE_SINGLE_CHARACTER;
+ }
+
+ /* Handle POSIX class names. Perl allows a negation extension of the
+ form [:^name:]. A square bracket that doesn't match the syntax is
+ treated as a literal. We also recognize the POSIX constructions
+ [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
+ 5.6 and 5.8 do. */
+
+ if (c == '[' &&
+ (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+ check_posix_syntax(ptr, &tempptr, cd))
+ {
+ BOOL local_negate = FALSE;
+ int posix_class, i;
+ register const uschar *cbits = cd->cbits;
+
+ if (ptr[1] != ':')
+ {
+ *errorptr = ERR31;
+ goto FAILED;
+ }
+
+ ptr += 2;
+ if (*ptr == '^')
+ {
+ local_negate = TRUE;
+ ptr++;
+ }
+
+ posix_class = check_posix_name(ptr, tempptr - ptr);
+ if (posix_class < 0)
+ {
+ *errorptr = ERR30;
+ goto FAILED;
+ }
+
+ /* If matching is caseless, upper and lower are converted to
+ alpha. This relies on the fact that the class table starts with
+ alpha, lower, upper as the first 3 entries. */
+
+ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
+ posix_class = 0;
+
+ /* Or into the map we are building up to 3 of the static class
+ tables, or their negations. The [:blank:] class sets up the same
+ chars as the [:space:] class (all white space). We remove the vertical
+ white space chars afterwards. */
+
+ posix_class *= 3;
+ for (i = 0; i < 3; i++)
+ {
+ BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
+ int taboffset = posix_class_maps[posix_class + i];
+ if (taboffset < 0) break;
+ if (local_negate)
+ {
+ if (i == 0)
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
+ else
+ for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
+ if (blankclass) classbits[1] |= 0x3c;
+ }
+ else
+ {
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
+ if (blankclass) classbits[1] &= ~0x3c;
+ }
+ }
+
+ ptr = tempptr + 1;
+ class_charcount = 10; /* Set > 1; assumes more than 1 per class */
+ continue; /* End of POSIX syntax handling */
+ }
+
+ /* Backslash may introduce a single character, or it may introduce one
+ of the specials, which just set a flag. Escaped items are checked for
+ validity in the pre-compiling pass. The sequence \b is a special case.
+ Inside a class (and only there) it is treated as backspace. Elsewhere
+ it marks a word boundary. Other escapes have preset maps ready to
+ or into the one we are building. We assume they have more than one
+ character in them, so set class_charcount bigger than one. */
+
+ if (c == '\\')
+ {
+ c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
+
+ if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
+ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
+ else if (-c == ESC_Q) /* Handle start of quoted string */
+ {
+ if (ptr[1] == '\\' && ptr[2] == 'E')
+ {
+ ptr += 2; /* avoid empty string */
+ }
+ else inescq = TRUE;
+ continue;
+ }
+
+ if (c < 0)
+ {
+ register const uschar *cbits = cd->cbits;
+ class_charcount += 2; /* Greater than 1 is what matters */
+ switch (-c)
+ {
+ case ESC_d:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
+ continue;
+
+ case ESC_D:
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
+ continue;
+
+ case ESC_w:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
+ continue;
+
+ case ESC_W:
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
+ continue;
+
+ case ESC_s:
+ for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
+ classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ continue;
+
+ case ESC_S:
+ for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
+ classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
+ continue;
+
+#ifdef SUPPORT_UCP
+ case ESC_p:
+ case ESC_P:
+ {
+ BOOL negated;
+ int property = get_ucp(&ptr, &negated, errorptr);
+ if (property < 0) goto FAILED;
+ class_utf8 = TRUE;
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
+ XCL_PROP : XCL_NOTPROP;
+ *class_utf8data++ = property;
+ class_charcount -= 2; /* Not a < 256 character */
+ }
+ continue;
+#endif
+
+ /* Unrecognized escapes are faulted if PCRE is running in its
+ strict mode. By default, for compatibility with Perl, they are
+ treated as literals. */
+
+ default:
+ if ((options & PCRE_EXTRA) != 0)
+ {
+ *errorptr = ERR7;
+ goto FAILED;
+ }
+ c = *ptr; /* The final character */
+ class_charcount -= 2; /* Undo the default count from above */
+ }
+ }
+
+ /* Fall through if we have a single character (c >= 0). This may be
+ > 256 in UTF-8 mode. */
+
+ } /* End of backslash handling */
+
+ /* A single character may be followed by '-' to form a range. However,
+ Perl does not permit ']' to be the end of the range. A '-' character
+ here is treated as a literal. */
+
+ if (ptr[1] == '-' && ptr[2] != ']')
+ {
+ int d;
+ ptr += 2;
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ { /* Braces are required because the */
+ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
+ }
+ else
+#endif
+ d = *ptr; /* Not UTF-8 mode */
+
+ /* The second part of a range can be a single-character escape, but
+ not any of the other escapes. Perl 5.6 treats a hyphen as a literal
+ in such circumstances. */
+
+ if (d == '\\')
+ {
+ const uschar *oldptr = ptr;
+ d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
+
+ /* \b is backslash; \X is literal X; any other special means the '-'
+ was literal */
+
+ if (d < 0)
+ {
+ if (d == -ESC_b) d = '\b';
+ else if (d == -ESC_X) d = 'X'; else
+ {
+ ptr = oldptr - 2;
+ goto LONE_SINGLE_CHARACTER; /* A few lines below */
+ }
+ }
+ }
+
+ /* The check that the two values are in the correct order happens in
+ the pre-pass. Optimize one-character ranges */
+
+ if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
+
+ /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
+ matching, we have to use an XCLASS with extra data items. Caseless
+ matching for characters > 127 is available only if UCP support is
+ available. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+ {
+ class_utf8 = TRUE;
+
+ /* With UCP support, we can find the other case equivalents of
+ the relevant characters. There may be several ranges. Optimize how
+ they fit with the basic range. */
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int occ, ocd;
+ int cc = c;
+ int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
+
+ if (occ < c && ocd >= c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > d && occ <= d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ if (occ == ocd)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ }
+ else
+ {
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += ord2utf8(occ, class_utf8data);
+ }
+ class_utf8data += ord2utf8(ocd, class_utf8data);
+ }
+ }
+#endif /* SUPPORT_UCP */
+
+ /* Now record the original range, possibly modified for UCP caseless
+ overlapping ranges. */
+
+ *class_utf8data++ = XCL_RANGE;
+ class_utf8data += ord2utf8(c, class_utf8data);
+ class_utf8data += ord2utf8(d, class_utf8data);
+
+ /* With UCP support, we are done. Without UCP support, there is no
+ caseless matching for UTF-8 characters > 127; we can use the bit map
+ for the smaller ones. */
+
+#ifdef SUPPORT_UCP
+ continue; /* With next character in the class */
+#else
+ if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
+
+ /* Adjust upper limit and fall through to set up the map */
+
+ d = 127;
+
+#endif /* SUPPORT_UCP */
+ }
+#endif /* SUPPORT_UTF8 */
+
+ /* We use the bit map for all cases when not in UTF-8 mode; else
+ ranges that lie entirely within 0-127 when there is UCP support; else
+ for partial ranges without UCP support. */
+
+ for (; c <= d; c++)
+ {
+ classbits[c/8] |= (1 << (c&7));
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int uc = cd->fcc[c]; /* flip case */
+ classbits[uc/8] |= (1 << (uc&7));
+ }
+ class_charcount++; /* in case a one-char range */
+ class_lastchar = c;
+ }
+
+ continue; /* Go get the next char in the class */
+ }
+
+ /* Handle a lone single character - we can get here for a normal
+ non-escape char, or after \ that introduces a single character or for an
+ apparent range that isn't. */
+
+ LONE_SINGLE_CHARACTER:
+
+ /* Handle a character that cannot go in the bit map */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+ {
+ class_utf8 = TRUE;
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += ord2utf8(c, class_utf8data);
+
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int chartype;
+ int othercase;
+ if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
+ {
+ *class_utf8data++ = XCL_SINGLE;
+ class_utf8data += ord2utf8(othercase, class_utf8data);
+ }
+ }
+#endif /* SUPPORT_UCP */
+
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Handle a single-byte character */
+ {
+ classbits[c/8] |= (1 << (c&7));
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ c = cd->fcc[c]; /* flip case */
+ classbits[c/8] |= (1 << (c&7));
+ }
+ class_charcount++;
+ class_lastchar = c;
+ }
+ }
+
+ /* Loop until ']' reached; the check for end of string happens inside the
+ loop. This "while" is the end of the "do" above. */
+
+ while ((c = *(++ptr)) != ']' || inescq);
+
+ /* If class_charcount is 1, we saw precisely one character whose value is
+ less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
+ can optimize the negative case only if there were no characters >= 128
+ because OP_NOT and the related opcodes like OP_NOTSTAR operate on
+ single-bytes only. This is an historical hangover. Maybe one day we can
+ tidy these opcodes to handle multi-byte characters.
+
+ The optimization throws away the bit map. We turn the item into a
+ 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
+ that OP_NOT does not support multibyte characters. In the positive case, it
+ can cause firstbyte to be set. Otherwise, there can be no first char if
+ this item is first, whatever repeat count may follow. In the case of
+ reqbyte, save the previous value for reinstating. */
+
+#ifdef SUPPORT_UTF8
+ if (class_charcount == 1 &&
+ (!utf8 ||
+ (!class_utf8 && (!negate_class || class_lastchar < 128))))
+
+#else
+ if (class_charcount == 1)
+#endif
+ {
+ zeroreqbyte = reqbyte;
+
+ /* The OP_NOT opcode works on one-byte characters only. */
+
+ if (negate_class)
+ {
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ *code++ = OP_NOT;
+ *code++ = class_lastchar;
+ break;
+ }
+
+ /* For a single, positive character, get the value into mcbuffer, and
+ then we can handle this with the normal one-character code. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && class_lastchar > 127)
+ mclength = ord2utf8(class_lastchar, mcbuffer);
+ else
+#endif
+ {
+ mcbuffer[0] = class_lastchar;
+ mclength = 1;
+ }
+ goto ONE_CHAR;
+ } /* End of 1-char optimization */
+
+ /* The general case - not the one-char optimization. If this is the first
+ thing in the branch, there can be no first char setting, whatever the
+ repeat count. Any reqbyte setting must remain unchanged after any kind of
+ repeat. */
+
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+
+ /* If there are characters with values > 255, we have to compile an
+ extended class, with its own opcode. If there are no characters < 256,
+ we can omit the bitmap. */
+
+#ifdef SUPPORT_UTF8
+ if (class_utf8)
+ {
+ *class_utf8data++ = XCL_END; /* Marks the end of extra data */
+ *code++ = OP_XCLASS;
+ code += LINK_SIZE;
+ *code = negate_class? XCL_NOT : 0;
+
+ /* If the map is required, install it, and move on to the end of
+ the extra data */
+
+ if (class_charcount > 0)
+ {
+ *code++ |= XCL_MAP;
+ memcpy(code, classbits, 32);
+ code = class_utf8data;
+ }
+
+ /* If the map is not required, slide down the extra data. */
+
+ else
+ {
+ int len = class_utf8data - (code + 33);
+ memmove(code + 1, code + 33, len);
+ code += len + 1;
+ }
+
+ /* Now fill in the complete length of the item */
+
+ PUT(previous, 1, code - previous);
+ break; /* End of class handling */
+ }
+#endif
+
+ /* If there are no characters > 255, negate the 32-byte map if necessary,
+ and copy it into the code vector. If this is the first thing in the branch,
+ there can be no first char setting, whatever the repeat count. Any reqbyte
+ setting must remain unchanged after any kind of repeat. */
+
+ if (negate_class)
+ {
+ *code++ = OP_NCLASS;
+ for (c = 0; c < 32; c++) code[c] = ~classbits[c];
+ }
+ else
+ {
+ *code++ = OP_CLASS;
+ memcpy(code, classbits, 32);
+ }
+ code += 32;
+ break;
+
+ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
+ has been tested above. */
+
+ case '{':
+ if (!is_quantifier) goto NORMAL_CHAR;
+ ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
+ if (*errorptr != NULL) goto FAILED;
+ goto REPEAT;
+
+ case '*':
+ repeat_min = 0;
+ repeat_max = -1;
+ goto REPEAT;
+
+ case '+':
+ repeat_min = 1;
+ repeat_max = -1;
+ goto REPEAT;
+
+ case '?':
+ repeat_min = 0;
+ repeat_max = 1;
+
+ REPEAT:
+ if (previous == NULL)
+ {
+ *errorptr = ERR9;
+ goto FAILED;
+ }
+
+ if (repeat_min == 0)
+ {
+ firstbyte = zerofirstbyte; /* Adjust for zero repeat */
+ reqbyte = zeroreqbyte; /* Ditto */
+ }
+
+ /* Remember whether this is a variable length repeat */
+
+ reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
+
+ op_type = 0; /* Default single-char op codes */
+ possessive_quantifier = FALSE; /* Default not possessive quantifier */
+
+ /* Save start of previous item, in case we have to move it up to make space
+ for an inserted OP_ONCE for the additional '+' extension. */
+
+ tempcode = previous;
+
+ /* If the next character is '+', we have a possessive quantifier. This
+ implies greediness, whatever the setting of the PCRE_UNGREEDY option.
+ If the next character is '?' this is a minimizing repeat, by default,
+ but if PCRE_UNGREEDY is set, it works the other way round. We change the
+ repeat type to the non-default. */
+
+ if (ptr[1] == '+')
+ {
+ repeat_type = 0; /* Force greedy */
+ possessive_quantifier = TRUE;
+ ptr++;
+ }
+ else if (ptr[1] == '?')
+ {
+ repeat_type = greedy_non_default;
+ ptr++;
+ }
+ else repeat_type = greedy_default;
+
+ /* If previous was a recursion, we need to wrap it inside brackets so that
+ it can be replicated if necessary. */
+
+ if (*previous == OP_RECURSE)
+ {
+ memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
+ code += 1 + LINK_SIZE;
+ *previous = OP_BRA;
+ PUT(previous, 1, code - previous);
+ *code = OP_KET;
+ PUT(code, 1, code - previous);
+ code += 1 + LINK_SIZE;
+ }
+
+ /* If previous was a character match, abolish the item and generate a
+ repeat item instead. If a char item has a minumum of more than one, ensure
+ that it is set in reqbyte - it might not be if a sequence such as x{3} is
+ the first thing in a branch because the x will have gone into firstbyte
+ instead. */
+
+ if (*previous == OP_CHAR || *previous == OP_CHARNC)
+ {
+ /* Deal with UTF-8 characters that take up more than one byte. It's
+ easier to write this out separately than try to macrify it. Use c to
+ hold the length of the character in bytes, plus 0x80 to flag that it's a
+ length rather than a small character. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (code[-1] & 0x80) != 0)
+ {
+ uschar *lastchar = code - 1;
+ while((*lastchar & 0xc0) == 0x80) lastchar--;
+ c = code - lastchar; /* Length of UTF-8 character */
+ memcpy(utf8_char, lastchar, c); /* Save the char */
+ c |= 0x80; /* Flag c as a length */
+ }
+ else
+#endif
+
+ /* Handle the case of a single byte - either with no UTF8 support, or
+ with UTF-8 disabled, or for a UTF-8 character < 128. */
+
+ {
+ c = code[-1];
+ if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
+ }
+
+ goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
+ }
+
+ /* If previous was a single negated character ([^a] or similar), we use
+ one of the special opcodes, replacing it. The code is shared with single-
+ character repeats by setting opt_type to add a suitable offset into
+ repeat_type. OP_NOT is currently used only for single-byte chars. */
+
+ else if (*previous == OP_NOT)
+ {
+ op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
+ c = previous[1];
+ goto OUTPUT_SINGLE_REPEAT;
+ }
+
+ /* If previous was a character type match (\d or similar), abolish it and
+ create a suitable repeat item. The code is shared with single-character
+ repeats by setting op_type to add a suitable offset into repeat_type. Note
+ the the Unicode property types will be present only when SUPPORT_UCP is
+ defined, but we don't wrap the little bits of code here because it just
+ makes it horribly messy. */
+
+ else if (*previous < OP_EODN)
+ {
+ uschar *oldcode;
+ int prop_type;
+ op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
+ c = *previous;
+
+ OUTPUT_SINGLE_REPEAT:
+ prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
+ previous[1] : -1;
+
+ oldcode = code;
+ code = previous; /* Usually overwrite previous item */
+
+ /* If the maximum is zero then the minimum must also be zero; Perl allows
+ this case, so we do too - by simply omitting the item altogether. */
+
+ if (repeat_max == 0) goto END_REPEAT;
+
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->nopartial = TRUE;
+
+ /* Combine the op_type with the repeat_type */
+
+ repeat_type += op_type;
+
+ /* A minimum of zero is handled either as the special case * or ?, or as
+ an UPTO, with the maximum given. */
+
+ if (repeat_min == 0)
+ {
+ if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
+ else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
+ else
+ {
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+
+ /* A repeat minimum of 1 is optimized into some special cases. If the
+ maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
+ left in place and, if the maximum is greater than 1, we use OP_UPTO with
+ one less than the maximum. */
+
+ else if (repeat_min == 1)
+ {
+ if (repeat_max == -1)
+ *code++ = OP_PLUS + repeat_type;
+ else
+ {
+ code = oldcode; /* leave previous item in place */
+ if (repeat_max == 1) goto END_REPEAT;
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max - 1);
+ }
+ }
+
+ /* The case {n,n} is just an EXACT, while the general case {n,m} is
+ handled as an EXACT followed by an UPTO. */
+
+ else
+ {
+ *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
+ PUT2INC(code, 0, repeat_min);
+
+ /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
+ we have to insert the character for the previous code. For a repeated
+ Unicode property match, there is an extra byte that defines the
+ required property. In UTF-8 mode, long characters have their length in
+ c, with the 0x80 bit as a flag. */
+
+ if (repeat_max < 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ {
+ *code++ = c;
+ if (prop_type >= 0) *code++ = prop_type;
+ }
+ *code++ = OP_STAR + repeat_type;
+ }
+
+ /* Else insert an UPTO if the max is greater than the min, again
+ preceded by the character, for the previously inserted code. */
+
+ else if (repeat_max != repeat_min)
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ *code++ = c;
+ if (prop_type >= 0) *code++ = prop_type;
+ repeat_max -= repeat_min;
+ *code++ = OP_UPTO + repeat_type;
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+
+ /* The character or character type itself comes last in all cases. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c >= 128)
+ {
+ memcpy(code, utf8_char, c & 7);
+ code += c & 7;
+ }
+ else
+#endif
+ *code++ = c;
+
+ /* For a repeated Unicode property match, there is an extra byte that
+ defines the required property. */
+
+#ifdef SUPPORT_UCP
+ if (prop_type >= 0) *code++ = prop_type;
+#endif
+ }
+
+ /* If previous was a character class or a back reference, we put the repeat
+ stuff after it, but just skip the item if the repeat was {0,0}. */
+
+ else if (*previous == OP_CLASS ||
+ *previous == OP_NCLASS ||
+#ifdef SUPPORT_UTF8
+ *previous == OP_XCLASS ||
+#endif
+ *previous == OP_REF)
+ {
+ if (repeat_max == 0)
+ {
+ code = previous;
+ goto END_REPEAT;
+ }
+
+ /* All real repeats make it impossible to handle partial matching (maybe
+ one day we will be able to remove this restriction). */
+
+ if (repeat_max != 1) cd->nopartial = TRUE;
+
+ if (repeat_min == 0 && repeat_max == -1)
+ *code++ = OP_CRSTAR + repeat_type;
+ else if (repeat_min == 1 && repeat_max == -1)
+ *code++ = OP_CRPLUS + repeat_type;
+ else if (repeat_min == 0 && repeat_max == 1)
+ *code++ = OP_CRQUERY + repeat_type;
+ else
+ {
+ *code++ = OP_CRRANGE + repeat_type;
+ PUT2INC(code, 0, repeat_min);
+ if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
+ PUT2INC(code, 0, repeat_max);
+ }
+ }
+
+ /* If previous was a bracket group, we may have to replicate it in certain
+ cases. */
+
+ else if (*previous >= OP_BRA || *previous == OP_ONCE ||
+ *previous == OP_COND)
+ {
+ register int i;
+ int ketoffset = 0;
+ int len = code - previous;
+ uschar *bralink = NULL;
+
+ /* If the maximum repeat count is unlimited, find the end of the bracket
+ by scanning through from the start, and compute the offset back to it
+ from the current code pointer. There may be an OP_OPT setting following
+ the final KET, so we can't find the end just by going back from the code
+ pointer. */
+
+ if (repeat_max == -1)
+ {
+ register uschar *ket = previous;
+ do ket += GET(ket, 1); while (*ket != OP_KET);
+ ketoffset = code - ket;
+ }
+
+ /* The case of a zero minimum is special because of the need to stick
+ OP_BRAZERO in front of it, and because the group appears once in the
+ data, whereas in other cases it appears the minimum number of times. For
+ this reason, it is simplest to treat this case separately, as otherwise
+ the code gets far too messy. There are several special subcases when the
+ minimum is zero. */
+
+ if (repeat_min == 0)
+ {
+ /* If the maximum is also zero, we just omit the group from the output
+ altogether. */
+
+ if (repeat_max == 0)
+ {
+ code = previous;
+ goto END_REPEAT;
+ }
+
+ /* If the maximum is 1 or unlimited, we just have to stick in the
+ BRAZERO and do no more at this point. However, we do need to adjust
+ any OP_RECURSE calls inside the group that refer to the group itself or
+ any internal group, because the offset is from the start of the whole
+ regex. Temporarily terminate the pattern while doing this. */
+
+ if (repeat_max <= 1)
+ {
+ *code = OP_END;
+ adjust_recurse(previous, 1, utf8, cd);
+ memmove(previous+1, previous, len);
+ code++;
+ *previous++ = OP_BRAZERO + repeat_type;
+ }
+
+ /* If the maximum is greater than 1 and limited, we have to replicate
+ in a nested fashion, sticking OP_BRAZERO before each set of brackets.
+ The first one has to be handled carefully because it's the original
+ copy, which has to be moved up. The remainder can be handled by code
+ that is common with the non-zero minimum case below. We have to
+ adjust the value or repeat_max, since one less copy is required. Once
+ again, we may have to adjust any OP_RECURSE calls inside the group. */
+
+ else
+ {
+ int offset;
+ *code = OP_END;
+ adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
+ memmove(previous + 2 + LINK_SIZE, previous, len);
+ code += 2 + LINK_SIZE;
+ *previous++ = OP_BRAZERO + repeat_type;
+ *previous++ = OP_BRA;
+
+ /* We chain together the bracket offset fields that have to be
+ filled in later when the ends of the brackets are reached. */
+
+ offset = (bralink == NULL)? 0 : previous - bralink;
+ bralink = previous;
+ PUTINC(previous, 0, offset);
+ }
+
+ repeat_max--;
+ }
+
+ /* If the minimum is greater than zero, replicate the group as many
+ times as necessary, and adjust the maximum to the number of subsequent
+ copies that we need. If we set a first char from the group, and didn't
+ set a required char, copy the latter from the former. */
+
+ else
+ {
+ if (repeat_min > 1)
+ {
+ if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
+ for (i = 1; i < repeat_min; i++)
+ {
+ memcpy(code, previous, len);
+ code += len;
+ }
+ }
+ if (repeat_max > 0) repeat_max -= repeat_min;
+ }
+
+ /* This code is common to both the zero and non-zero minimum cases. If
+ the maximum is limited, it replicates the group in a nested fashion,
+ remembering the bracket starts on a stack. In the case of a zero minimum,
+ the first one was set up above. In all cases the repeat_max now specifies
+ the number of additional copies needed. */
+
+ if (repeat_max >= 0)
+ {
+ for (i = repeat_max - 1; i >= 0; i--)
+ {
+ *code++ = OP_BRAZERO + repeat_type;
+
+ /* All but the final copy start a new nesting, maintaining the
+ chain of brackets outstanding. */
+
+ if (i != 0)
+ {
+ int offset;
+ *code++ = OP_BRA;
+ offset = (bralink == NULL)? 0 : code - bralink;
+ bralink = code;
+ PUTINC(code, 0, offset);
+ }
+
+ memcpy(code, previous, len);
+ code += len;
+ }
+
+ /* Now chain through the pending brackets, and fill in their length
+ fields (which are holding the chain links pro tem). */
+
+ while (bralink != NULL)
+ {
+ int oldlinkoffset;
+ int offset = code - bralink + 1;
+ uschar *bra = code - offset;
+ oldlinkoffset = GET(bra, 1);
+ bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
+ *code++ = OP_KET;
+ PUTINC(code, 0, offset);
+ PUT(bra, 1, offset);
+ }
+ }
+
+ /* If the maximum is unlimited, set a repeater in the final copy. We
+ can't just offset backwards from the current code point, because we
+ don't know if there's been an options resetting after the ket. The
+ correct offset was computed above. */
+
+ else code[-ketoffset] = OP_KETRMAX + repeat_type;
+ }
+
+ /* Else there's some kind of shambles */
+
+ else
+ {
+ *errorptr = ERR11;
+ goto FAILED;
+ }
+
+ /* If the character following a repeat is '+', we wrap the entire repeated
+ item inside OP_ONCE brackets. This is just syntactic sugar, taken from
+ Sun's Java package. The repeated item starts at tempcode, not at previous,
+ which might be the first part of a string whose (former) last char we
+ repeated. However, we don't support '+' after a greediness '?'. */
+
+ if (possessive_quantifier)
+ {
+ int len = code - tempcode;
+ memmove(tempcode + 1+LINK_SIZE, tempcode, len);
+ code += 1 + LINK_SIZE;
+ len += 1 + LINK_SIZE;
+ tempcode[0] = OP_ONCE;
+ *code++ = OP_KET;
+ PUTINC(code, 0, len);
+ PUT(tempcode, 1, len);
+ }
+
+ /* In all case we no longer have a previous item. We also set the
+ "follows varying string" flag for subsequently encountered reqbytes if
+ it isn't already set and we have just passed a varying length item. */
+
+ END_REPEAT:
+ previous = NULL;
+ cd->req_varyopt |= reqvary;
+ break;
+
+
+ /* Start of nested bracket sub-expression, or comment or lookahead or
+ lookbehind or option setting or condition. First deal with special things
+ that can come after a bracket; all are introduced by ?, and the appearance
+ of any of them means that this is not a referencing group. They were
+ checked for validity in the first pass over the string, so we don't have to
+ check for syntax errors here. */
+
+ case '(':
+ newoptions = options;
+ skipbytes = 0;
+
+ if (*(++ptr) == '?')
+ {
+ int set, unset;
+ int *optset;
+
+ switch (*(++ptr))
+ {
+ case '#': /* Comment; skip to ket */
+ ptr++;
+ while (*ptr != ')') ptr++;
+ continue;
+
+ case ':': /* Non-extracting bracket */
+ bravalue = OP_BRA;
+ ptr++;
+ break;
+
+ case '(':
+ bravalue = OP_COND; /* Conditional group */
+
+ /* Condition to test for recursion */
+
+ if (ptr[1] == 'R')
+ {
+ code[1+LINK_SIZE] = OP_CREF;
+ PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
+ skipbytes = 3;
+ ptr += 3;
+ }
+
+ /* Condition to test for a numbered subpattern match. We know that
+ if a digit follows ( then there will just be digits until ) because
+ the syntax was checked in the first pass. */
+
+ else if ((digitab[ptr[1]] && ctype_digit) != 0)
+ {
+ int condref; /* Don't amalgamate; some compilers */
+ condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
+ while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
+ if (condref == 0)
+ {
+ *errorptr = ERR35;
+ goto FAILED;
+ }
+ ptr++;
+ code[1+LINK_SIZE] = OP_CREF;
+ PUT2(code, 2+LINK_SIZE, condref);
+ skipbytes = 3;
+ }
+ /* For conditions that are assertions, we just fall through, having
+ set bravalue above. */
+ break;
+
+ case '=': /* Positive lookahead */
+ bravalue = OP_ASSERT;
+ ptr++;
+ break;
+
+ case '!': /* Negative lookahead */
+ bravalue = OP_ASSERT_NOT;
+ ptr++;
+ break;
+
+ case '<': /* Lookbehinds */
+ switch (*(++ptr))
+ {
+ case '=': /* Positive lookbehind */
+ bravalue = OP_ASSERTBACK;
+ ptr++;
+ break;
+
+ case '!': /* Negative lookbehind */
+ bravalue = OP_ASSERTBACK_NOT;
+ ptr++;
+ break;
+ }
+ break;
+
+ case '>': /* One-time brackets */
+ bravalue = OP_ONCE;
+ ptr++;
+ break;
+
+ case 'C': /* Callout - may be followed by digits; */
+ previous_callout = code; /* Save for later completion */
+ after_manual_callout = 1; /* Skip one item before completing */
+ *code++ = OP_CALLOUT; /* Already checked that the terminating */
+ { /* closing parenthesis is present. */
+ int n = 0;
+ while ((digitab[*(++ptr)] & ctype_digit) != 0)
+ n = n * 10 + *ptr - '0';
+ if (n > 255)
+ {
+ *errorptr = ERR38;
+ goto FAILED;
+ }
+ *code++ = n;
+ PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
+ PUT(code, LINK_SIZE, 0); /* Default length */
+ code += 2 * LINK_SIZE;
+ }
+ previous = NULL;
+ continue;
+
+ case 'P': /* Named subpattern handling */
+ if (*(++ptr) == '<') /* Definition */
+ {
+ int i, namelen;
+ uschar *slot = cd->name_table;
+ const uschar *name; /* Don't amalgamate; some compilers */
+ name = ++ptr; /* grumble at autoincrement in declaration */
+
+ while (*ptr++ != '>');
+ namelen = ptr - name - 1;
+
+ for (i = 0; i < cd->names_found; i++)
+ {
+ int crc = memcmp(name, slot+2, namelen);
+ if (crc == 0)
+ {
+ if (slot[2+namelen] == 0)
+ {
+ *errorptr = ERR43;
+ goto FAILED;
+ }
+ crc = -1; /* Current name is substring */
+ }
+ if (crc < 0)
+ {
+ memmove(slot + cd->name_entry_size, slot,
+ (cd->names_found - i) * cd->name_entry_size);
+ break;
+ }
+ slot += cd->name_entry_size;
+ }
+
+ PUT2(slot, 0, *brackets + 1);
+ memcpy(slot + 2, name, namelen);
+ slot[2+namelen] = 0;
+ cd->names_found++;
+ goto NUMBERED_GROUP;
+ }
+
+ if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
+ {
+ int i, namelen;
+ int type = *ptr++;
+ const uschar *name = ptr;
+ uschar *slot = cd->name_table;
+
+ while (*ptr != ')') ptr++;
+ namelen = ptr - name;
+
+ for (i = 0; i < cd->names_found; i++)
+ {
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
+ slot += cd->name_entry_size;
+ }
+ if (i >= cd->names_found)
+ {
+ *errorptr = ERR15;
+ goto FAILED;
+ }
+
+ recno = GET2(slot, 0);
+
+ if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
+
+ /* Back reference */
+
+ previous = code;
+ *code++ = OP_REF;
+ PUT2INC(code, 0, recno);
+ cd->backref_map |= (recno < 32)? (1 << recno) : 1;
+ if (recno > cd->top_backref) cd->top_backref = recno;
+ continue;
+ }
+
+ /* Should never happen */
+ break;
+
+ case 'R': /* Pattern recursion */
+ ptr++; /* Same as (?0) */
+ /* Fall through */
+
+ /* Recursion or "subroutine" call */
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ {
+ const uschar *called;
+ recno = 0;
+ while((digitab[*ptr] & ctype_digit) != 0)
+ recno = recno * 10 + *ptr++ - '0';
+
+ /* Come here from code above that handles a named recursion */
+
+ HANDLE_RECURSION:
+
+ previous = code;
+
+ /* Find the bracket that is being referenced. Temporarily end the
+ regex in case it doesn't exist. */
+
+ *code = OP_END;
+ called = (recno == 0)?
+ cd->start_code : find_bracket(cd->start_code, utf8, recno);
+
+ if (called == NULL)
+ {
+ *errorptr = ERR15;
+ goto FAILED;
+ }
+
+ /* If the subpattern is still open, this is a recursive call. We
+ check to see if this is a left recursion that could loop for ever,
+ and diagnose that case. */
+
+ if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
+ {
+ *errorptr = ERR40;
+ goto FAILED;
+ }
+
+ /* Insert the recursion/subroutine item */
+
+ *code = OP_RECURSE;
+ PUT(code, 1, called - cd->start_code);
+ code += 1 + LINK_SIZE;
+ }
+ continue;
+
+ /* Character after (? not specially recognized */
+
+ default: /* Option setting */
+ set = unset = 0;
+ optset = &set;
+
+ while (*ptr != ')' && *ptr != ':')
+ {
+ switch (*ptr++)
+ {
+ case '-': optset = &unset; break;
+
+ case 'i': *optset |= PCRE_CASELESS; break;
+ case 'm': *optset |= PCRE_MULTILINE; break;
+ case 's': *optset |= PCRE_DOTALL; break;
+ case 'x': *optset |= PCRE_EXTENDED; break;
+ case 'U': *optset |= PCRE_UNGREEDY; break;
+ case 'X': *optset |= PCRE_EXTRA; break;
+ }
+ }
+
+ /* Set up the changed option bits, but don't change anything yet. */
+
+ newoptions = (options | set) & (~unset);
+
+ /* If the options ended with ')' this is not the start of a nested
+ group with option changes, so the options change at this level. Compile
+ code to change the ims options if this setting actually changes any of
+ them. We also pass the new setting back so that it can be put at the
+ start of any following branches, and when this group ends (if we are in
+ a group), a resetting item can be compiled.
+
+ Note that if this item is right at the start of the pattern, the
+ options will have been abstracted and made global, so there will be no
+ change to compile. */
+
+ if (*ptr == ')')
+ {
+ if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
+ {
+ *code++ = OP_OPT;
+ *code++ = newoptions & PCRE_IMS;
+ }
+
+ /* Change options at this level, and pass them back for use
+ in subsequent branches. Reset the greedy defaults and the case
+ value for firstbyte and reqbyte. */
+
+ *optionsptr = options = newoptions;
+ greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
+ greedy_non_default = greedy_default ^ 1;
+ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
+
+ previous = NULL; /* This item can't be repeated */
+ continue; /* It is complete */
+ }
+
+ /* If the options ended with ':' we are heading into a nested group
+ with possible change of options. Such groups are non-capturing and are
+ not assertions of any kind. All we need to do is skip over the ':';
+ the newoptions value is handled below. */
+
+ bravalue = OP_BRA;
+ ptr++;
+ }
+ }
+
+ /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
+ non-capturing and behave like (?:...) brackets */
+
+ else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
+ {
+ bravalue = OP_BRA;
+ }
+
+ /* Else we have a referencing group; adjust the opcode. If the bracket
+ number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
+ arrange for the true number to follow later, in an OP_BRANUMBER item. */
+
+ else
+ {
+ NUMBERED_GROUP:
+ if (++(*brackets) > EXTRACT_BASIC_MAX)
+ {
+ bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
+ code[1+LINK_SIZE] = OP_BRANUMBER;
+ PUT2(code, 2+LINK_SIZE, *brackets);
+ skipbytes = 3;
+ }
+ else bravalue = OP_BRA + *brackets;
+ }
+
+ /* Process nested bracketed re. Assertions may not be repeated, but other
+ kinds can be. We copy code into a non-register variable in order to be able
+ to pass its address because some compilers complain otherwise. Pass in a
+ new setting for the ims options if they have changed. */
+
+ previous = (bravalue >= OP_ONCE)? code : NULL;
+ *code = bravalue;
+ tempcode = code;
+ tempreqvary = cd->req_varyopt; /* Save value before bracket */
+
+ if (!compile_regex(
+ newoptions, /* The complete new option state */
+ options & PCRE_IMS, /* The previous ims option state */
+ brackets, /* Extracting bracket count */
+ &tempcode, /* Where to put code (updated) */
+ &ptr, /* Input pointer (updated) */
+ errorptr, /* Where to put an error message */
+ (bravalue == OP_ASSERTBACK ||
+ bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
+ skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
+ &subfirstbyte, /* For possible first char */
+ &subreqbyte, /* For possible last char */
+ bcptr, /* Current branch chain */
+ cd)) /* Tables block */
+ goto FAILED;
+
+ /* At the end of compiling, code is still pointing to the start of the
+ group, while tempcode has been updated to point past the end of the group
+ and any option resetting that may follow it. The pattern pointer (ptr)
+ is on the bracket. */
+
+ /* If this is a conditional bracket, check that there are no more than
+ two branches in the group. */
+
+ else if (bravalue == OP_COND)
+ {
+ uschar *tc = code;
+ condcount = 0;
+
+ do {
+ condcount++;
+ tc += GET(tc,1);
+ }
+ while (*tc != OP_KET);
+
+ if (condcount > 2)
+ {
+ *errorptr = ERR27;
+ goto FAILED;
+ }
+
+ /* If there is just one branch, we must not make use of its firstbyte or
+ reqbyte, because this is equivalent to an empty second branch. */
+
+ if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
+ }
+
+ /* Handle updating of the required and first characters. Update for normal
+ brackets of all kinds, and conditions with two branches (see code above).
+ If the bracket is followed by a quantifier with zero repeat, we have to
+ back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
+ main loop so that they can be accessed for the back off. */
+
+ zeroreqbyte = reqbyte;
+ zerofirstbyte = firstbyte;
+ groupsetfirstbyte = FALSE;
+
+ if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
+ {
+ /* If we have not yet set a firstbyte in this branch, take it from the
+ subpattern, remembering that it was set here so that a repeat of more
+ than one can replicate it as reqbyte if necessary. If the subpattern has
+ no firstbyte, set "none" for the whole branch. In both cases, a zero
+ repeat forces firstbyte to "none". */
+
+ if (firstbyte == REQ_UNSET)
+ {
+ if (subfirstbyte >= 0)
+ {
+ firstbyte = subfirstbyte;
+ groupsetfirstbyte = TRUE;
+ }
+ else firstbyte = REQ_NONE;
+ zerofirstbyte = REQ_NONE;
+ }
+
+ /* If firstbyte was previously set, convert the subpattern's firstbyte
+ into reqbyte if there wasn't one, using the vary flag that was in
+ existence beforehand. */
+
+ else if (subfirstbyte >= 0 && subreqbyte < 0)
+ subreqbyte = subfirstbyte | tempreqvary;
+
+ /* If the subpattern set a required byte (or set a first byte that isn't
+ really the first byte - see above), set it. */
+
+ if (subreqbyte >= 0) reqbyte = subreqbyte;
+ }
+
+ /* For a forward assertion, we take the reqbyte, if set. This can be
+ helpful if the pattern that follows the assertion doesn't set a different
+ char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
+ for an assertion, however because it leads to incorrect effect for patterns
+ such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
+ of a firstbyte. This is overcome by a scan at the end if there's no
+ firstbyte, looking for an asserted first char. */
+
+ else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
+
+ /* Now update the main code pointer to the end of the group. */
+
+ code = tempcode;
+
+ /* Error if hit end of pattern */
+
+ if (*ptr != ')')
+ {
+ *errorptr = ERR14;
+ goto FAILED;
+ }
+ break;
+
+ /* Check \ for being a real metacharacter; if not, fall through and handle
+ it as a data character at the start of a string. Escape items are checked
+ for validity in the pre-compiling pass. */
+
+ case '\\':
+ tempptr = ptr;
+ c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
+
+ /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
+ are arranged to be the negation of the corresponding OP_values. For the
+ back references, the values are ESC_REF plus the reference number. Only
+ back references and those types that consume a character may be repeated.
+ We can test for values between ESC_b and ESC_Z for the latter; this may
+ have to change if any new ones are ever created. */
+
+ if (c < 0)
+ {
+ if (-c == ESC_Q) /* Handle start of quoted string */
+ {
+ if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
+ else inescq = TRUE;
+ continue;
+ }
+
+ /* For metasequences that actually match a character, we disable the
+ setting of a first character if it hasn't already been set. */
+
+ if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
+ firstbyte = REQ_NONE;
+
+ /* Set values to reset to if this is followed by a zero repeat. */
+
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+
+ /* Back references are handled specially */
+
+ if (-c >= ESC_REF)
+ {
+ int number = -c - ESC_REF;
+ previous = code;
+ *code++ = OP_REF;
+ PUT2INC(code, 0, number);
+ }
+
+ /* So are Unicode property matches, if supported. We know that get_ucp
+ won't fail because it was tested in the pre-pass. */
+
+#ifdef SUPPORT_UCP
+ else if (-c == ESC_P || -c == ESC_p)
+ {
+ BOOL negated;
+ int value = get_ucp(&ptr, &negated, errorptr);
+ previous = code;
+ *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
+ *code++ = value;
+ }
+#endif
+
+ /* For the rest, we can obtain the OP value by negating the escape
+ value */
+
+ else
+ {
+ previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
+ *code++ = -c;
+ }
+ continue;
+ }
+
+ /* We have a data character whose value is in c. In UTF-8 mode it may have
+ a value > 127. We set its representation in the length/buffer, and then
+ handle it as a data character. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
+ mclength = ord2utf8(c, mcbuffer);
+ else
+#endif
+
+ {
+ mcbuffer[0] = c;
+ mclength = 1;
+ }
+
+ goto ONE_CHAR;
+
+ /* Handle a literal character. It is guaranteed not to be whitespace or #
+ when the extended flag is set. If we are in UTF-8 mode, it may be a
+ multi-byte literal character. */
+
+ default:
+ NORMAL_CHAR:
+ mclength = 1;
+ mcbuffer[0] = c;
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c & 0xc0) == 0xc0)
+ {
+ while ((ptr[1] & 0xc0) == 0x80)
+ mcbuffer[mclength++] = *(++ptr);
+ }
+#endif
+
+ /* At this point we have the character's bytes in mcbuffer, and the length
+ in mclength. When not in UTF-8 mode, the length is always 1. */
+
+ ONE_CHAR:
+ previous = code;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
+ for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
+
+ /* Set the first and required bytes appropriately. If no previous first
+ byte, set it from this character, but revert to none on a zero repeat.
+ Otherwise, leave the firstbyte value alone, and don't change it on a zero
+ repeat. */
+
+ if (firstbyte == REQ_UNSET)
+ {
+ zerofirstbyte = REQ_NONE;
+ zeroreqbyte = reqbyte;
+
+ /* If the character is more than one byte long, we can set firstbyte
+ only if it is not to be matched caselessly. */
+
+ if (mclength == 1 || req_caseopt == 0)
+ {
+ firstbyte = mcbuffer[0] | req_caseopt;
+ if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
+ }
+ else firstbyte = reqbyte = REQ_NONE;
+ }
+
+ /* firstbyte was previously set; we can set reqbyte only the length is
+ 1 or the matching is caseful. */
+
+ else
+ {
+ zerofirstbyte = firstbyte;
+ zeroreqbyte = reqbyte;
+ if (mclength == 1 || req_caseopt == 0)
+ reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
+ }
+
+ break; /* End of literal character handling */
+ }
+ } /* end of big loop */
+
+/* Control never reaches here by falling through, only by a goto for all the
+error states. Pass back the position in the pattern so that it can be displayed
+to the user for diagnosing the error. */
+
+FAILED:
+*ptrptr = ptr;
+return FALSE;
+}
+
+
+
+
+/*************************************************
+* Compile sequence of alternatives *
+*************************************************/
+
+/* On entry, ptr is pointing past the bracket character, but on return
+it points to the closing bracket, or vertical bar, or end of string.
+The code variable is pointing at the byte into which the BRA operator has been
+stored. If the ims options are changed at the start (for a (?ims: group) or
+during any branch, we need to insert an OP_OPT item at the start of every
+following branch to ensure they get set correctly at run time, and also pass
+the new options into every subsequent branch compile.
+
+Argument:
+ options option bits, including any changes for this subpattern
+ oldims previous settings of ims option bits
+ brackets -> int containing the number of extracting brackets used
+ codeptr -> the address of the current code pointer
+ ptrptr -> the address of the current pattern pointer
+ errorptr -> pointer to error message
+ lookbehind TRUE if this is a lookbehind assertion
+ skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
+ firstbyteptr place to put the first required character, or a negative number
+ reqbyteptr place to put the last required character, or a negative number
+ bcptr pointer to the chain of currently open branches
+ cd points to the data block with tables pointers etc.
+
+Returns: TRUE on success
+*/
+
+static BOOL
+compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
+ const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
+ int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
+{
+const uschar *ptr = *ptrptr;
+uschar *code = *codeptr;
+uschar *last_branch = code;
+uschar *start_bracket = code;
+uschar *reverse_count = NULL;
+int firstbyte, reqbyte;
+int branchfirstbyte, branchreqbyte;
+branch_chain bc;
+
+bc.outer = bcptr;
+bc.current = code;
+
+firstbyte = reqbyte = REQ_UNSET;
+
+/* Offset is set zero to mark that this bracket is still open */
+
+PUT(code, 1, 0);
+code += 1 + LINK_SIZE + skipbytes;
+
+/* Loop for each alternative branch */
+
+for (;;)
+ {
+ /* Handle a change of ims options at the start of the branch */
+
+ if ((options & PCRE_IMS) != oldims)
+ {
+ *code++ = OP_OPT;
+ *code++ = options & PCRE_IMS;
+ }
+
+ /* Set up dummy OP_REVERSE if lookbehind assertion */
+
+ if (lookbehind)
+ {
+ *code++ = OP_REVERSE;
+ reverse_count = code;
+ PUTINC(code, 0, 0);
+ }
+
+ /* Now compile the branch */
+
+ if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
+ &branchfirstbyte, &branchreqbyte, &bc, cd))
+ {
+ *ptrptr = ptr;
+ return FALSE;
+ }
+
+ /* If this is the first branch, the firstbyte and reqbyte values for the
+ branch become the values for the regex. */
+
+ if (*last_branch != OP_ALT)
+ {
+ firstbyte = branchfirstbyte;
+ reqbyte = branchreqbyte;
+ }
+
+ /* If this is not the first branch, the first char and reqbyte have to
+ match the values from all the previous branches, except that if the previous
+ value for reqbyte didn't have REQ_VARY set, it can still match, and we set
+ REQ_VARY for the regex. */
+
+ else
+ {
+ /* If we previously had a firstbyte, but it doesn't match the new branch,
+ we have to abandon the firstbyte for the regex, but if there was previously
+ no reqbyte, it takes on the value of the old firstbyte. */
+
+ if (firstbyte >= 0 && firstbyte != branchfirstbyte)
+ {
+ if (reqbyte < 0) reqbyte = firstbyte;
+ firstbyte = REQ_NONE;
+ }
+
+ /* If we (now or from before) have no firstbyte, a firstbyte from the
+ branch becomes a reqbyte if there isn't a branch reqbyte. */
+
+ if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
+ branchreqbyte = branchfirstbyte;
+
+ /* Now ensure that the reqbytes match */
+
+ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
+ reqbyte = REQ_NONE;
+ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
+ }
+
+ /* If lookbehind, check that this branch matches a fixed-length string,
+ and put the length into the OP_REVERSE item. Temporarily mark the end of
+ the branch with OP_END. */
+
+ if (lookbehind)
+ {
+ int length;
+ *code = OP_END;
+ length = find_fixedlength(last_branch, options);
+ DPRINTF(("fixed length = %d\n", length));
+ if (length < 0)
+ {
+ *errorptr = (length == -2)? ERR36 : ERR25;
+ *ptrptr = ptr;
+ return FALSE;
+ }
+ PUT(reverse_count, 0, length);
+ }
+
+ /* Reached end of expression, either ')' or end of pattern. Go back through
+ the alternative branches and reverse the chain of offsets, with the field in
+ the BRA item now becoming an offset to the first alternative. If there are
+ no alternatives, it points to the end of the group. The length in the
+ terminating ket is always the length of the whole bracketed item. If any of
+ the ims options were changed inside the group, compile a resetting op-code
+ following, except at the very end of the pattern. Return leaving the pointer
+ at the terminating char. */
+
+ if (*ptr != '|')
+ {
+ int length = code - last_branch;
+ do
+ {
+ int prev_length = GET(last_branch, 1);
+ PUT(last_branch, 1, length);
+ length = prev_length;
+ last_branch -= length;
+ }
+ while (length > 0);
+
+ /* Fill in the ket */
+
+ *code = OP_KET;
+ PUT(code, 1, code - start_bracket);
+ code += 1 + LINK_SIZE;
+
+ /* Resetting option if needed */
+
+ if ((options & PCRE_IMS) != oldims && *ptr == ')')
+ {
+ *code++ = OP_OPT;
+ *code++ = oldims;
+ }
+
+ /* Set values to pass back */
+
+ *codeptr = code;
+ *ptrptr = ptr;
+ *firstbyteptr = firstbyte;
+ *reqbyteptr = reqbyte;
+ return TRUE;
+ }
+
+ /* Another branch follows; insert an "or" node. Its length field points back
+ to the previous branch while the bracket remains open. At the end the chain
+ is reversed. It's done like this so that the start of the bracket has a
+ zero offset until it is closed, making it possible to detect recursion. */
+
+ *code = OP_ALT;
+ PUT(code, 1, code - last_branch);
+ bc.current = last_branch = code;
+ code += 1 + LINK_SIZE;
+ ptr++;
+ }
+/* Control never reaches here */
+}
+
+
+
+
+/*************************************************
+* Check for anchored expression *
+*************************************************/
+
+/* Try to find out if this is an anchored regular expression. Consider each
+alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
+all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
+it's anchored. However, if this is a multiline pattern, then only OP_SOD
+counts, since OP_CIRC can match in the middle.
+
+We can also consider a regex to be anchored if OP_SOM starts all its branches.
+This is the code for \G, which means "match at start of match position, taking
+into account the match offset".
+
+A branch is also implicitly anchored if it starts with .* and DOTALL is set,
+because that will try the rest of the pattern at all possible matching points,
+so there is no point trying again.... er ....
+
+.... except when the .* appears inside capturing parentheses, and there is a
+subsequent back reference to those parentheses. We haven't enough information
+to catch that case precisely.
+
+At first, the best we could do was to detect when .* was in capturing brackets
+and the highest back reference was greater than or equal to that level.
+However, by keeping a bitmap of the first 31 back references, we can catch some
+of the more common cases more precisely.
+
+Arguments:
+ code points to start of expression (the bracket)
+ options points to the options setting
+ bracket_map a bitmap of which brackets we are inside while testing; this
+ handles up to substring 31; after that we just have to take
+ the less precise approach
+ backref_map the back reference bitmap
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
+ unsigned int backref_map)
+{
+do {
+ const uschar *scode =
+ first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
+ register int op = *scode;
+
+ /* Capturing brackets */
+
+ if (op > OP_BRA)
+ {
+ int new_map;
+ op -= OP_BRA;
+ if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
+ new_map = bracket_map | ((op < 32)? (1 << op) : 1);
+ if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
+ }
+
+ /* Other brackets */
+
+ else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
+ {
+ if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
+ }
+
+ /* .* is not anchored unless DOTALL is set and it isn't in brackets that
+ are or may be referenced. */
+
+ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
+ (*options & PCRE_DOTALL) != 0)
+ {
+ if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+ }
+
+ /* Check for explicit anchoring */
+
+ else if (op != OP_SOD && op != OP_SOM &&
+ ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
+ return FALSE;
+ code += GET(code, 1);
+ }
+while (*code == OP_ALT); /* Loop for each alternative */
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for starting with ^ or .* *
+*************************************************/
+
+/* This is called to find out if every branch starts with ^ or .* so that
+"first char" processing can be done to speed things up in multiline
+matching and for non-DOTALL patterns that start with .* (which must start at
+the beginning or after \n). As in the case of is_anchored() (see above), we
+have to take account of back references to capturing brackets that contain .*
+because in that case we can't make the assumption.
+
+Arguments:
+ code points to start of expression (the bracket)
+ bracket_map a bitmap of which brackets we are inside while testing; this
+ handles up to substring 31; after that we just have to take
+ the less precise approach
+ backref_map the back reference bitmap
+
+Returns: TRUE or FALSE
+*/
+
+static BOOL
+is_startline(const uschar *code, unsigned int bracket_map,
+ unsigned int backref_map)
+{
+do {
+ const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
+ FALSE);
+ register int op = *scode;
+
+ /* Capturing brackets */
+
+ if (op > OP_BRA)
+ {
+ int new_map;
+ op -= OP_BRA;
+ if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
+ new_map = bracket_map | ((op < 32)? (1 << op) : 1);
+ if (!is_startline(scode, new_map, backref_map)) return FALSE;
+ }
+
+ /* Other brackets */
+
+ else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
+ { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
+
+ /* .* means "start at start or after \n" if it isn't in brackets that
+ may be referenced. */
+
+ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
+ {
+ if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+ }
+
+ /* Check for explicit circumflex */
+
+ else if (op != OP_CIRC) return FALSE;
+
+ /* Move on to the next alternative */
+
+ code += GET(code, 1);
+ }
+while (*code == OP_ALT); /* Loop for each alternative */
+return TRUE;
+}
+
+
+
+/*************************************************
+* Check for asserted fixed first char *
+*************************************************/
+
+/* During compilation, the "first char" settings from forward assertions are
+discarded, because they can cause conflicts with actual literals that follow.
+However, if we end up without a first char setting for an unanchored pattern,
+it is worth scanning the regex to see if there is an initial asserted first
+char. If all branches start with the same asserted char, or with a bracket all
+of whose alternatives start with the same asserted char (recurse ad lib), then
+we return that char, otherwise -1.
+
+Arguments:
+ code points to start of expression (the bracket)
+ options pointer to the options (used to check casing changes)
+ inassert TRUE if in an assertion
+
+Returns: -1 or the fixed first char
+*/
+
+static int
+find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
+{
+register int c = -1;
+do {
+ int d;
+ const uschar *scode =
+ first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
+ register int op = *scode;
+
+ if (op >= OP_BRA) op = OP_BRA;
+
+ switch(op)
+ {
+ default:
+ return -1;
+
+ case OP_BRA:
+ case OP_ASSERT:
+ case OP_ONCE:
+ case OP_COND:
+ if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
+ return -1;
+ if (c < 0) c = d; else if (c != d) return -1;
+ break;
+
+ case OP_EXACT: /* Fall through */
+ scode += 2;
+
+ case OP_CHAR:
+ case OP_CHARNC:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ if (!inassert) return -1;
+ if (c < 0)
+ {
+ c = scode[1];
+ if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
+ }
+ else if (c != scode[1]) return -1;
+ break;
+ }
+
+ code += GET(code, 1);
+ }
+while (*code == OP_ALT);
+return c;
+}
+
+
+
+
+#ifdef SUPPORT_UTF8
+/*************************************************
+* Validate a UTF-8 string *
+*************************************************/
+
+/* This function is called (optionally) at the start of compile or match, to
+validate that a supposed UTF-8 string is actually valid. The early check means
+that subsequent code can assume it is dealing with a valid string. The check
+can be turned off for maximum performance, but then consequences of supplying
+an invalid string are then undefined.
+
+Arguments:
+ string points to the string
+ length length of string, or -1 if the string is zero-terminated
+
+Returns: < 0 if the string is a valid UTF-8 string
+ >= 0 otherwise; the value is the offset of the bad byte
+*/
+
+static int
+valid_utf8(const uschar *string, int length)
+{
+register const uschar *p;
+
+if (length < 0)
+ {
+ for (p = string; *p != 0; p++);
+ length = p - string;
+ }
+
+for (p = string; length-- > 0; p++)
+ {
+ register int ab;
+ register int c = *p;
+ if (c < 128) continue;
+ if ((c & 0xc0) != 0xc0) return p - string;
+ ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
+ if (length < ab) return p - string;
+ length -= ab;
+
+ /* Check top bits in the second byte */
+ if ((*(++p) & 0xc0) != 0x80) return p - string;
+
+ /* Check for overlong sequences for each different length */
+ switch (ab)
+ {
+ /* Check for xx00 000x */
+ case 1:
+ if ((c & 0x3e) == 0) return p - string;
+ continue; /* We know there aren't any more bytes to check */
+
+ /* Check for 1110 0000, xx0x xxxx */
+ case 2:
+ if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
+ break;
+
+ /* Check for 1111 0000, xx00 xxxx */
+ case 3:
+ if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
+ break;
+
+ /* Check for 1111 1000, xx00 0xxx */
+ case 4:
+ if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
+ break;
+
+ /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
+ case 5:
+ if (c == 0xfe || c == 0xff ||
+ (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
+ break;
+ }
+
+ /* Check for valid bytes after the 2nd, if any; all must start 10 */
+ while (--ab > 0)
+ {
+ if ((*(++p) & 0xc0) != 0x80) return p - string;
+ }
+ }
+
+return -1;
+}
+#endif
+
+
+
+/*************************************************
+* Compile a Regular Expression *
+*************************************************/
+
+/* This function takes a string and returns a pointer to a block of store
+holding a compiled version of the expression.
+
+Arguments:
+ pattern the regular expression
+ options various option bits
+ errorptr pointer to pointer to error text
+ erroroffset ptr offset in pattern where error was detected
+ tables pointer to character tables or NULL
+
+Returns: pointer to compiled data block, or NULL on error,
+ with errorptr and erroroffset set
+*/
+
+EXPORT pcre *
+pcre_compile(const char *pattern, int options, const char **errorptr,
+ int *erroroffset, const unsigned char *tables)
+{
+real_pcre *re;
+int length = 1 + LINK_SIZE; /* For initial BRA plus length */
+int c, firstbyte, reqbyte;
+int bracount = 0;
+int branch_extra = 0;
+int branch_newextra;
+int item_count = -1;
+int name_count = 0;
+int max_name_size = 0;
+int lastitemlength = 0;
+#ifdef SUPPORT_UTF8
+BOOL utf8;
+BOOL class_utf8;
+#endif
+BOOL inescq = FALSE;
+unsigned int brastackptr = 0;
+size_t size;
+uschar *code;
+const uschar *codestart;
+const uschar *ptr;
+compile_data compile_block;
+int brastack[BRASTACK_SIZE];
+uschar bralenstack[BRASTACK_SIZE];
+
+/* We can't pass back an error message if errorptr is NULL; I guess the best we
+can do is just return NULL. */
+
+if (errorptr == NULL) return NULL;
+*errorptr = NULL;
+
+/* However, we can give a message for this error */
+
+if (erroroffset == NULL)
+ {
+ *errorptr = ERR16;
+ return NULL;
+ }
+*erroroffset = 0;
+
+/* Can't support UTF8 unless PCRE has been compiled to include the code. */
+
+#ifdef SUPPORT_UTF8
+utf8 = (options & PCRE_UTF8) != 0;
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+ (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
+ {
+ *errorptr = ERR44;
+ return NULL;
+ }
+#else
+if ((options & PCRE_UTF8) != 0)
+ {
+ *errorptr = ERR32;
+ return NULL;
+ }
+#endif
+
+if ((options & ~PUBLIC_OPTIONS) != 0)
+ {
+ *errorptr = ERR17;
+ return NULL;
+ }
+
+/* Set up pointers to the individual character tables */
+
+if (tables == NULL) tables = pcre_default_tables;
+compile_block.lcc = tables + lcc_offset;
+compile_block.fcc = tables + fcc_offset;
+compile_block.cbits = tables + cbits_offset;
+compile_block.ctypes = tables + ctypes_offset;
+
+/* Maximum back reference and backref bitmap. This is updated for numeric
+references during the first pass, but for named references during the actual
+compile pass. The bitmap records up to 31 back references to help in deciding
+whether (.*) can be treated as anchored or not. */
+
+compile_block.top_backref = 0;
+compile_block.backref_map = 0;
+
+/* Reflect pattern for debugging output */
+
+DPRINTF(("------------------------------------------------------------------\n"));
+DPRINTF(("%s\n", pattern));
+
+/* The first thing to do is to make a pass over the pattern to compute the
+amount of store required to hold the compiled code. This does not have to be
+perfect as long as errors are overestimates. At the same time we can detect any
+flag settings right at the start, and extract them. Make an attempt to correct
+for any counted white space if an "extended" flag setting appears late in the
+pattern. We can't be so clever for #-comments. */
+
+ptr = (const uschar *)(pattern - 1);
+while ((c = *(++ptr)) != 0)
+ {
+ int min, max;
+ int class_optcount;
+ int bracket_length;
+ int duplength;
+
+ /* If we are inside a \Q...\E sequence, all chars are literal */
+
+ if (inescq)
+ {
+ if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
+ goto NORMAL_CHAR;
+ }
+
+ /* Otherwise, first check for ignored whitespace and comments */
+
+ if ((options & PCRE_EXTENDED) != 0)
+ {
+ if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
+ if (c == '#')
+ {
+ /* The space before the ; is to avoid a warning on a silly compiler
+ on the Macintosh. */
+ while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
+ if (c == 0) break;
+ continue;
+ }
+ }
+
+ item_count++; /* Is zero for the first non-comment item */
+
+ /* Allow space for auto callout before every item except quantifiers. */
+
+ if ((options & PCRE_AUTO_CALLOUT) != 0 &&
+ c != '*' && c != '+' && c != '?' &&
+ (c != '{' || !is_counted_repeat(ptr + 1)))
+ length += 2 + 2*LINK_SIZE;
+
+ switch(c)
+ {
+ /* A backslashed item may be an escaped data character or it may be a
+ character type. */
+
+ case '\\':
+ c = check_escape(&ptr, errorptr, bracount, options, FALSE);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+
+ lastitemlength = 1; /* Default length of last item for repeats */
+
+ if (c >= 0) /* Data character */
+ {
+ length += 2; /* For a one-byte character */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && c > 127)
+ {
+ int i;
+ for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+ if (c <= utf8_table1[i]) break;
+ length += i;
+ lastitemlength += i;
+ }
+#endif
+
+ continue;
+ }
+
+ /* If \Q, enter "literal" mode */
+
+ if (-c == ESC_Q)
+ {
+ inescq = TRUE;
+ continue;
+ }
+
+ /* \X is supported only if Unicode property support is compiled */
+
+#ifndef SUPPORT_UCP
+ if (-c == ESC_X)
+ {
+ *errorptr = ERR45;
+ goto PCRE_ERROR_RETURN;
+ }
+#endif
+
+ /* \P and \p are for Unicode properties, but only when the support has
+ been compiled. Each item needs 2 bytes. */
+
+ else if (-c == ESC_P || -c == ESC_p)
+ {
+#ifdef SUPPORT_UCP
+ BOOL negated;
+ length += 2;
+ lastitemlength = 2;
+ if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
+ continue;
+#else
+ *errorptr = ERR45;
+ goto PCRE_ERROR_RETURN;
+#endif
+ }
+
+ /* Other escapes need one byte */
+
+ length++;
+
+ /* A back reference needs an additional 2 bytes, plus either one or 5
+ bytes for a repeat. We also need to keep the value of the highest
+ back reference. */
+
+ if (c <= -ESC_REF)
+ {
+ int refnum = -c - ESC_REF;
+ compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
+ if (refnum > compile_block.top_backref)
+ compile_block.top_backref = refnum;
+ length += 2; /* For single back reference */
+ if (ptr[1] == '{' && is_counted_repeat(ptr+2))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+ else length += 5;
+ if (ptr[1] == '?') ptr++;
+ }
+ }
+ continue;
+
+ case '^': /* Single-byte metacharacters */
+ case '.':
+ case '$':
+ length++;
+ lastitemlength = 1;
+ continue;
+
+ case '*': /* These repeats won't be after brackets; */
+ case '+': /* those are handled separately */
+ case '?':
+ length++;
+ goto POSESSIVE; /* A few lines below */
+
+ /* This covers the cases of braced repeats after a single char, metachar,
+ class, or back reference. */
+
+ case '{':
+ if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
+ ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+
+ /* These special cases just insert one extra opcode */
+
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+
+ /* These cases might insert additional copies of a preceding character. */
+
+ else
+ {
+ if (min != 1)
+ {
+ length -= lastitemlength; /* Uncount the original char or metachar */
+ if (min > 0) length += 3 + lastitemlength;
+ }
+ length += lastitemlength + ((max > 0)? 3 : 1);
+ }
+
+ if (ptr[1] == '?') ptr++; /* Needs no extra length */
+
+ POSESSIVE: /* Test for possessive quantifier */
+ if (ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
+ }
+ continue;
+
+ /* An alternation contains an offset to the next branch or ket. If any ims
+ options changed in the previous branch(es), and/or if we are in a
+ lookbehind assertion, extra space will be needed at the start of the
+ branch. This is handled by branch_extra. */
+
+ case '|':
+ length += 1 + LINK_SIZE + branch_extra;
+ continue;
+
+ /* A character class uses 33 characters provided that all the character
+ values are less than 256. Otherwise, it uses a bit map for low valued
+ characters, and individual items for others. Don't worry about character
+ types that aren't allowed in classes - they'll get picked up during the
+ compile. A character class that contains only one single-byte character
+ uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
+ where we can. (In UTF-8 mode we can do this only for chars < 128.) */
+
+ case '[':
+ if (*(++ptr) == '^')
+ {
+ class_optcount = 10; /* Greater than one */
+ ptr++;
+ }
+ else class_optcount = 0;
+
+#ifdef SUPPORT_UTF8
+ class_utf8 = FALSE;
+#endif
+
+ /* Written as a "do" so that an initial ']' is taken as data */
+
+ if (*ptr != 0) do
+ {
+ /* Inside \Q...\E everything is literal except \E */
+
+ if (inescq)
+ {
+ if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
+ inescq = FALSE;
+ ptr += 1;
+ continue;
+ }
+
+ /* Outside \Q...\E, check for escapes */
+
+ if (*ptr == '\\')
+ {
+ c = check_escape(&ptr, errorptr, bracount, options, TRUE);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+
+ /* \b is backspace inside a class; \X is literal */
+
+ if (-c == ESC_b) c = '\b';
+ else if (-c == ESC_X) c = 'X';
+
+ /* \Q enters quoting mode */
+
+ else if (-c == ESC_Q)
+ {
+ inescq = TRUE;
+ continue;
+ }
+
+ /* Handle escapes that turn into characters */
+
+ if (c >= 0) goto NON_SPECIAL_CHARACTER;
+
+ /* Escapes that are meta-things. The normal ones just affect the
+ bit map, but Unicode properties require an XCLASS extended item. */
+
+ else
+ {
+ class_optcount = 10; /* \d, \s etc; make sure > 1 */
+#ifdef SUPPORT_UTF8
+ if (-c == ESC_p || -c == ESC_P)
+ {
+ if (!class_utf8)
+ {
+ class_utf8 = TRUE;
+ length += LINK_SIZE + 2;
+ }
+ length += 2;
+ }
+#endif
+ }
+ }
+
+ /* Check the syntax for POSIX stuff. The bits we actually handle are
+ checked during the real compile phase. */
+
+ else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
+ {
+ ptr++;
+ class_optcount = 10; /* Make sure > 1 */
+ }
+
+ /* Anything else increments the possible optimization count. We have to
+ detect ranges here so that we can compute the number of extra ranges for
+ caseless wide characters when UCP support is available. If there are wide
+ characters, we are going to have to use an XCLASS, even for single
+ characters. */
+
+ else
+ {
+ int d;
+
+ GET_ONE_CHARACTER:
+
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ int extra = 0;
+ GETCHARLEN(c, ptr, extra);
+ ptr += extra;
+ }
+ else c = *ptr;
+#else
+ c = *ptr;
+#endif
+
+ /* Come here from handling \ above when it escapes to a char value */
+
+ NON_SPECIAL_CHARACTER:
+ class_optcount++;
+
+ d = -1;
+ if (ptr[1] == '-')
+ {
+ uschar const *hyptr = ptr++;
+ if (ptr[1] == '\\')
+ {
+ ptr++;
+ d = check_escape(&ptr, errorptr, bracount, options, TRUE);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+ if (-d == ESC_b) d = '\b'; /* backspace */
+ else if (-d == ESC_X) d = 'X'; /* literal X in a class */
+ }
+ else if (ptr[1] != 0 && ptr[1] != ']')
+ {
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ int extra = 0;
+ GETCHARLEN(d, ptr, extra);
+ ptr += extra;
+ }
+ else
+#endif
+ d = *ptr;
+ }
+ if (d < 0) ptr = hyptr; /* go back to hyphen as data */
+ }
+
+ /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
+ 127 for caseless matching, we will need to use an XCLASS. */
+
+ if (d >= 0)
+ {
+ class_optcount = 10; /* Ensure > 1 */
+ if (d < c)
+ {
+ *errorptr = ERR8;
+ goto PCRE_ERROR_RETURN;
+ }
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+ {
+ uschar buffer[6];
+ if (!class_utf8) /* Allow for XCLASS overhead */
+ {
+ class_utf8 = TRUE;
+ length += LINK_SIZE + 2;
+ }
+
+#ifdef SUPPORT_UCP
+ /* If we have UCP support, find out how many extra ranges are
+ needed to map the other case of characters within this range. We
+ have to mimic the range optimization here, because extending the
+ range upwards might push d over a boundary that makes is use
+ another byte in the UTF-8 representation. */
+
+ if ((options & PCRE_CASELESS) != 0)
+ {
+ int occ, ocd;
+ int cc = c;
+ int origd = d;
+ while (get_othercase_range(&cc, origd, &occ, &ocd))
+ {
+ if (occ >= c && ocd <= d) continue; /* Skip embedded */
+
+ if (occ < c && ocd >= c - 1) /* Extend the basic range */
+ { /* if there is overlap, */
+ c = occ; /* noting that if occ < c */
+ continue; /* we can't have ocd > d */
+ } /* because a subrange is */
+ if (ocd > d && occ <= d + 1) /* always shorter than */
+ { /* the basic range. */
+ d = ocd;
+ continue;
+ }
+
+ /* An extra item is needed */
+
+ length += 1 + ord2utf8(occ, buffer) +
+ ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
+ }
+ }
+#endif /* SUPPORT_UCP */
+
+ /* The length of the (possibly extended) range */
+
+ length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
+ }
+#endif /* SUPPORT_UTF8 */
+
+ }
+
+ /* We have a single character. There is nothing to be done unless we
+ are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
+ allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
+ support. */
+
+ else
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+ {
+ uschar buffer[6];
+ class_optcount = 10; /* Ensure > 1 */
+ if (!class_utf8) /* Allow for XCLASS overhead */
+ {
+ class_utf8 = TRUE;
+ length += LINK_SIZE + 2;
+ }
+#ifdef SUPPORT_UCP
+ length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
+ (1 + ord2utf8(c, buffer));
+#else /* SUPPORT_UCP */
+ length += 1 + ord2utf8(c, buffer);
+#endif /* SUPPORT_UCP */
+ }
+#endif /* SUPPORT_UTF8 */
+ }
+ }
+ }
+ while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
+
+ if (*ptr == 0) /* Missing terminating ']' */
+ {
+ *errorptr = ERR6;
+ goto PCRE_ERROR_RETURN;
+ }
+
+ /* We can optimize when there was only one optimizable character. Repeats
+ for positive and negated single one-byte chars are handled by the general
+ code. Here, we handle repeats for the class opcodes. */
+
+ if (class_optcount == 1) length += 3; else
+ {
+ length += 33;
+
+ /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
+ we also need extra for wrapping the whole thing in a sub-pattern. */
+
+ if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+ if ((min == 0 && (max == 1 || max == -1)) ||
+ (min == 1 && max == -1))
+ length++;
+ else length += 5;
+ if (ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE;
+ }
+ else if (ptr[1] == '?') ptr++;
+ }
+ }
+ continue;
+
+ /* Brackets may be genuine groups or special things */
+
+ case '(':
+ branch_newextra = 0;
+ bracket_length = 1 + LINK_SIZE;
+
+ /* Handle special forms of bracket, which all start (? */
+
+ if (ptr[1] == '?')
+ {
+ int set, unset;
+ int *optset;
+
+ switch (c = ptr[2])
+ {
+ /* Skip over comments entirely */
+ case '#':
+ ptr += 3;
+ while (*ptr != 0 && *ptr != ')') ptr++;
+ if (*ptr == 0)
+ {
+ *errorptr = ERR18;
+ goto PCRE_ERROR_RETURN;
+ }
+ continue;
+
+ /* Non-referencing groups and lookaheads just move the pointer on, and
+ then behave like a non-special bracket, except that they don't increment
+ the count of extracting brackets. Ditto for the "once only" bracket,
+ which is in Perl from version 5.005. */
+
+ case ':':
+ case '=':
+ case '!':
+ case '>':
+ ptr += 2;
+ break;
+
+ /* (?R) specifies a recursive call to the regex, which is an extension
+ to provide the facility which can be obtained by (?p{perl-code}) in
+ Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
+
+ From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
+ the appropriate numbered brackets. This includes both recursive and
+ non-recursive calls. (?R) is now synonymous with (?0). */
+
+ case 'R':
+ ptr++;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ ptr += 2;
+ if (c != 'R')
+ while ((digitab[*(++ptr)] & ctype_digit) != 0);
+ if (*ptr != ')')
+ {
+ *errorptr = ERR29;
+ goto PCRE_ERROR_RETURN;
+ }
+ length += 1 + LINK_SIZE;
+
+ /* If this item is quantified, it will get wrapped inside brackets so
+ as to use the code for quantified brackets. We jump down and use the
+ code that handles this for real brackets. */
+
+ if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
+ {
+ length += 2 + 2 * LINK_SIZE; /* to make bracketed */
+ duplength = 5 + 3 * LINK_SIZE;
+ goto HANDLE_QUANTIFIED_BRACKETS;
+ }
+ continue;
+
+ /* (?C) is an extension which provides "callout" - to provide a bit of
+ the functionality of the Perl (?{...}) feature. An optional number may
+ follow (default is zero). */
+
+ case 'C':
+ ptr += 2;
+ while ((digitab[*(++ptr)] & ctype_digit) != 0);
+ if (*ptr != ')')
+ {
+ *errorptr = ERR39;
+ goto PCRE_ERROR_RETURN;
+ }
+ length += 2 + 2*LINK_SIZE;
+ continue;
+
+ /* Named subpatterns are an extension copied from Python */
+
+ case 'P':
+ ptr += 3;
+ if (*ptr == '<')
+ {
+ const uschar *p; /* Don't amalgamate; some compilers */
+ p = ++ptr; /* grumble at autoincrement in declaration */
+ while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
+ if (*ptr != '>')
+ {
+ *errorptr = ERR42;
+ goto PCRE_ERROR_RETURN;
+ }
+ name_count++;
+ if (ptr - p > max_name_size) max_name_size = (ptr - p);
+ break;
+ }
+
+ if (*ptr == '=' || *ptr == '>')
+ {
+ while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
+ if (*ptr != ')')
+ {
+ *errorptr = ERR42;
+ goto PCRE_ERROR_RETURN;
+ }
+ break;
+ }
+
+ /* Unknown character after (?P */
+
+ *errorptr = ERR41;
+ goto PCRE_ERROR_RETURN;
+
+ /* Lookbehinds are in Perl from version 5.005 */
+
+ case '<':
+ ptr += 3;
+ if (*ptr == '=' || *ptr == '!')
+ {
+ branch_newextra = 1 + LINK_SIZE;
+ length += 1 + LINK_SIZE; /* For the first branch */
+ break;
+ }
+ *errorptr = ERR24;
+ goto PCRE_ERROR_RETURN;
+
+ /* Conditionals are in Perl from version 5.005. The bracket must either
+ be followed by a number (for bracket reference) or by an assertion
+ group, or (a PCRE extension) by 'R' for a recursion test. */
+
+ case '(':
+ if (ptr[3] == 'R' && ptr[4] == ')')
+ {
+ ptr += 4;
+ length += 3;
+ }
+ else if ((digitab[ptr[3]] & ctype_digit) != 0)
+ {
+ ptr += 4;
+ length += 3;
+ while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
+ if (*ptr != ')')
+ {
+ *errorptr = ERR26;
+ goto PCRE_ERROR_RETURN;
+ }
+ }
+ else /* An assertion must follow */
+ {
+ ptr++; /* Can treat like ':' as far as spacing is concerned */
+ if (ptr[2] != '?' ||
+ (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
+ {
+ ptr += 2; /* To get right offset in message */
+ *errorptr = ERR28;
+ goto PCRE_ERROR_RETURN;
+ }
+ }
+ break;
+
+ /* Else loop checking valid options until ) is met. Anything else is an
+ error. If we are without any brackets, i.e. at top level, the settings
+ act as if specified in the options, so massage the options immediately.
+ This is for backward compatibility with Perl 5.004. */
+
+ default:
+ set = unset = 0;
+ optset = &set;
+ ptr += 2;
+
+ for (;; ptr++)
+ {
+ c = *ptr;
+ switch (c)
+ {
+ case 'i':
+ *optset |= PCRE_CASELESS;
+ continue;
+
+ case 'm':
+ *optset |= PCRE_MULTILINE;
+ continue;
+
+ case 's':
+ *optset |= PCRE_DOTALL;
+ continue;
+
+ case 'x':
+ *optset |= PCRE_EXTENDED;
+ continue;
+
+ case 'X':
+ *optset |= PCRE_EXTRA;
+ continue;
+
+ case 'U':
+ *optset |= PCRE_UNGREEDY;
+ continue;
+
+ case '-':
+ optset = &unset;
+ continue;
+
+ /* A termination by ')' indicates an options-setting-only item; if
+ this is at the very start of the pattern (indicated by item_count
+ being zero), we use it to set the global options. This is helpful
+ when analyzing the pattern for first characters, etc. Otherwise
+ nothing is done here and it is handled during the compiling
+ process.
+
+ [Historical note: Up to Perl 5.8, options settings at top level
+ were always global settings, wherever they appeared in the pattern.
+ That is, they were equivalent to an external setting. From 5.8
+ onwards, they apply only to what follows (which is what you might
+ expect).] */
+
+ case ')':
+ if (item_count == 0)
+ {
+ options = (options | set) & (~unset);
+ set = unset = 0; /* To save length */
+ item_count--; /* To allow for several */
+ }
+
+ /* Fall through */
+
+ /* A termination by ':' indicates the start of a nested group with
+ the given options set. This is again handled at compile time, but
+ we must allow for compiled space if any of the ims options are
+ set. We also have to allow for resetting space at the end of
+ the group, which is why 4 is added to the length and not just 2.
+ If there are several changes of options within the same group, this
+ will lead to an over-estimate on the length, but this shouldn't
+ matter very much. We also have to allow for resetting options at
+ the start of any alternations, which we do by setting
+ branch_newextra to 2. Finally, we record whether the case-dependent
+ flag ever changes within the regex. This is used by the "required
+ character" code. */
+
+ case ':':
+ if (((set|unset) & PCRE_IMS) != 0)
+ {
+ length += 4;
+ branch_newextra = 2;
+ if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
+ }
+ goto END_OPTIONS;
+
+ /* Unrecognized option character */
+
+ default:
+ *errorptr = ERR12;
+ goto PCRE_ERROR_RETURN;
+ }
+ }
+
+ /* If we hit a closing bracket, that's it - this is a freestanding
+ option-setting. We need to ensure that branch_extra is updated if
+ necessary. The only values branch_newextra can have here are 0 or 2.
+ If the value is 2, then branch_extra must either be 2 or 5, depending
+ on whether this is a lookbehind group or not. */
+
+ END_OPTIONS:
+ if (c == ')')
+ {
+ if (branch_newextra == 2 &&
+ (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
+ branch_extra += branch_newextra;
+ continue;
+ }
+
+ /* If options were terminated by ':' control comes here. Fall through
+ to handle the group below. */
+ }
+ }
+
+ /* Extracting brackets must be counted so we can process escapes in a
+ Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
+ need an additional 3 bytes of store per extracting bracket. However, if
+ PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we
+ must leave the count alone (it will aways be zero). */
+
+ else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
+ {
+ bracount++;
+ if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
+ }
+
+ /* Save length for computing whole length at end if there's a repeat that
+ requires duplication of the group. Also save the current value of
+ branch_extra, and start the new group with the new value. If non-zero, this
+ will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
+
+ if (brastackptr >= sizeof(brastack)/sizeof(int))
+ {
+ *errorptr = ERR19;
+ goto PCRE_ERROR_RETURN;
+ }
+
+ bralenstack[brastackptr] = branch_extra;
+ branch_extra = branch_newextra;
+
+ brastack[brastackptr++] = length;
+ length += bracket_length;
+ continue;
+
+ /* Handle ket. Look for subsequent max/min; for certain sets of values we
+ have to replicate this bracket up to that many times. If brastackptr is
+ 0 this is an unmatched bracket which will generate an error, but take care
+ not to try to access brastack[-1] when computing the length and restoring
+ the branch_extra value. */
+
+ case ')':
+ length += 1 + LINK_SIZE;
+ if (brastackptr > 0)
+ {
+ duplength = length - brastack[--brastackptr];
+ branch_extra = bralenstack[brastackptr];
+ }
+ else duplength = 0;
+
+ /* The following code is also used when a recursion such as (?3) is
+ followed by a quantifier, because in that case, it has to be wrapped inside
+ brackets so that the quantifier works. The value of duplength must be
+ set before arrival. */
+
+ HANDLE_QUANTIFIED_BRACKETS:
+
+ /* Leave ptr at the final char; for read_repeat_counts this happens
+ automatically; for the others we need an increment. */
+
+ if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
+ {
+ ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
+ if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
+ }
+ else if (c == '*') { min = 0; max = -1; ptr++; }
+ else if (c == '+') { min = 1; max = -1; ptr++; }
+ else if (c == '?') { min = 0; max = 1; ptr++; }
+ else { min = 1; max = 1; }
+
+ /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
+ group, and if the maximum is greater than zero, we have to replicate
+ maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
+ bracket set. */
+
+ if (min == 0)
+ {
+ length++;
+ if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
+ }
+
+ /* When the minimum is greater than zero, we have to replicate up to
+ minval-1 times, with no additions required in the copies. Then, if there
+ is a limited maximum we have to replicate up to maxval-1 times allowing
+ for a BRAZERO item before each optional copy and nesting brackets for all
+ but one of the optional copies. */
+
+ else
+ {
+ length += (min - 1) * duplength;
+ if (max > min) /* Need this test as max=-1 means no limit */
+ length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
+ - (2 + 2*LINK_SIZE);
+ }
+
+ /* Allow space for once brackets for "possessive quantifier" */
+
+ if (ptr[1] == '+')
+ {
+ ptr++;
+ length += 2 + 2*LINK_SIZE;
+ }
+ continue;
+
+ /* Non-special character. It won't be space or # in extended mode, so it is
+ always a genuine character. If we are in a \Q...\E sequence, check for the
+ end; if not, we have a literal. */
+
+ default:
+ NORMAL_CHAR:
+
+ if (inescq && c == '\\' && ptr[1] == 'E')
+ {
+ inescq = FALSE;
+ ptr++;
+ continue;
+ }
+
+ length += 2; /* For a one-byte character */
+ lastitemlength = 1; /* Default length of last item for repeats */
+
+ /* In UTF-8 mode, check for additional bytes. */
+
+#ifdef SUPPORT_UTF8
+ if (utf8 && (c & 0xc0) == 0xc0)
+ {
+ while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
+ { /* because the end is marked */
+ lastitemlength++; /* by a zero byte. */
+ length++;
+ ptr++;
+ }
+ }
+#endif
+
+ continue;
+ }
+ }
+
+length += 2 + LINK_SIZE; /* For final KET and END */
+
+if ((options & PCRE_AUTO_CALLOUT) != 0)
+ length += 2 + 2*LINK_SIZE; /* For final callout */
+
+if (length > MAX_PATTERN_SIZE)
+ {
+ *errorptr = ERR20;
+ return NULL;
+ }
+
+/* Compute the size of data block needed and get it, either from malloc or
+externally provided function. */
+
+size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
+re = (real_pcre *)(pcre_malloc)(size);
+
+if (re == NULL)
+ {
+ *errorptr = ERR21;
+ return NULL;
+ }
+
+/* Put in the magic number, and save the sizes, options, and character table
+pointer. NULL is used for the default character tables. The nullpad field is at
+the end; it's there to help in the case when a regex compiled on a system with
+4-byte pointers is run on another with 8-byte pointers. */
+
+re->magic_number = MAGIC_NUMBER;
+re->size = size;
+re->options = options;
+re->dummy1 = re->dummy2 = 0;
+re->name_table_offset = sizeof(real_pcre);
+re->name_entry_size = max_name_size + 3;
+re->name_count = name_count;
+re->tables = (tables == pcre_default_tables)? NULL : tables;
+re->nullpad = NULL;
+
+/* The starting points of the name/number translation table and of the code are
+passed around in the compile data block. */
+
+compile_block.names_found = 0;
+compile_block.name_entry_size = max_name_size + 3;
+compile_block.name_table = (uschar *)re + re->name_table_offset;
+codestart = compile_block.name_table + re->name_entry_size * re->name_count;
+compile_block.start_code = codestart;
+compile_block.start_pattern = (const uschar *)pattern;
+compile_block.req_varyopt = 0;
+compile_block.nopartial = FALSE;
+
+/* Set up a starting, non-extracting bracket, then compile the expression. On
+error, *errorptr will be set non-NULL, so we don't need to look at the result
+of the function here. */
+
+ptr = (const uschar *)pattern;
+code = (uschar *)codestart;
+*code = OP_BRA;
+bracount = 0;
+(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
+ errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
+re->top_bracket = bracount;
+re->top_backref = compile_block.top_backref;
+
+if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
+
+/* If not reached end of pattern on success, there's an excess bracket. */
+
+if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
+
+/* Fill in the terminating state and check for disastrous overflow, but
+if debugging, leave the test till after things are printed out. */
+
+*code++ = OP_END;
+
+#ifndef DEBUG
+if (code - codestart > length) *errorptr = ERR23;
+#endif
+
+/* Give an error if there's back reference to a non-existent capturing
+subpattern. */
+
+if (re->top_backref > re->top_bracket) *errorptr = ERR15;
+
+/* Failed to compile, or error while post-processing */
+
+if (*errorptr != NULL)
+ {
+ (pcre_free)(re);
+ PCRE_ERROR_RETURN:
+ *erroroffset = ptr - (const uschar *)pattern;
+ return NULL;
+ }
+
+/* If the anchored option was not passed, set the flag if we can determine that
+the pattern is anchored by virtue of ^ characters or \A or anything else (such
+as starting with .* when DOTALL is set).
+
+Otherwise, if we know what the first character has to be, save it, because that
+speeds up unanchored matches no end. If not, see if we can set the
+PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
+start with ^. and also when all branches start with .* for non-DOTALL matches.
+*/
+
+if ((options & PCRE_ANCHORED) == 0)
+ {
+ int temp_options = options;
+ if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
+ re->options |= PCRE_ANCHORED;
+ else
+ {
+ if (firstbyte < 0)
+ firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
+ if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
+ {
+ int ch = firstbyte & 255;
+ re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
+ compile_block.fcc[ch] == ch)? ch : firstbyte;
+ re->options |= PCRE_FIRSTSET;
+ }
+ else if (is_startline(codestart, 0, compile_block.backref_map))
+ re->options |= PCRE_STARTLINE;
+ }
+ }
+
+/* For an anchored pattern, we use the "required byte" only if it follows a
+variable length item in the regex. Remove the caseless flag for non-caseable
+bytes. */
+
+if (reqbyte >= 0 &&
+ ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
+ {
+ int ch = reqbyte & 255;
+ re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
+ compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
+ re->options |= PCRE_REQCHSET;
+ }
+
+/* Print out the compiled data for debugging */
+
+#ifdef DEBUG
+
+printf("Length = %d top_bracket = %d top_backref = %d\n",
+ length, re->top_bracket, re->top_backref);
+
+if (re->options != 0)
+ {
+ printf("%s%s%s%s%s%s%s%s%s%s\n",
+ ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
+ ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
+ ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
+ ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
+ ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
+ ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
+ ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
+ ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
+ ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
+ ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
+ }
+
+if ((re->options & PCRE_FIRSTSET) != 0)
+ {
+ int ch = re->first_byte & 255;
+ const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
+ if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
+ else printf("First char = \\x%02x%s\n", ch, caseless);
+ }
+
+if ((re->options & PCRE_REQCHSET) != 0)
+ {
+ int ch = re->req_byte & 255;
+ const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
+ if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
+ else printf("Req char = \\x%02x%s\n", ch, caseless);
+ }
+
+print_internals(re, stdout);
+
+/* This check is done here in the debugging case so that the code that
+was compiled can be seen. */
+
+if (code - codestart > length)
+ {
+ *errorptr = ERR23;
+ (pcre_free)(re);
+ *erroroffset = ptr - (uschar *)pattern;
+ return NULL;
+ }
+#endif
+
+return (pcre *)re;
+}
+
+
+
+/*************************************************
+* Match a back-reference *
+*************************************************/
+
+/* If a back reference hasn't been set, the length that is passed is greater
+than the number of characters left in the string, so the match fails.
+
+Arguments:
+ offset index into the offset vector
+ eptr points into the subject
+ length length to be matched
+ md points to match data block
+ ims the ims flags
+
+Returns: TRUE if matched
+*/
+
+static BOOL
+match_ref(int offset, register const uschar *eptr, int length, match_data *md,
+ unsigned long int ims)
+{
+const uschar *p = md->start_subject + md->offset_vector[offset];
+
+#ifdef DEBUG
+if (eptr >= md->end_subject)
+ printf("matching subject <null>");
+else
+ {
+ printf("matching subject ");
+ pchars(eptr, length, TRUE, md);
+ }
+printf(" against backref ");
+pchars(p, length, FALSE, md);
+printf("\n");
+#endif
+
+/* Always fail if not enough characters left */
+
+if (length > md->end_subject - eptr) return FALSE;
+
+/* Separate the caselesss case for speed */
+
+if ((ims & PCRE_CASELESS) != 0)
+ {
+ while (length-- > 0)
+ if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
+ }
+else
+ { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
+
+return TRUE;
+}
+
+
+#ifdef SUPPORT_UTF8
+/*************************************************
+* Match character against an XCLASS *
+*************************************************/
+
+/* This function is called from within the XCLASS code below, to match a
+character against an extended class which might match values > 255.
+
+Arguments:
+ c the character
+ data points to the flag byte of the XCLASS data
+
+Returns: TRUE if character matches, else FALSE
+*/
+
+static BOOL
+match_xclass(int c, const uschar *data)
+{
+int t;
+BOOL negated = (*data & XCL_NOT) != 0;
+
+/* Character values < 256 are matched against a bitmap, if one is present. If
+not, we still carry on, because there may be ranges that start below 256 in the
+additional data. */
+
+if (c < 256)
+ {
+ if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
+ return !negated; /* char found */
+ }
+
+/* First skip the bit map if present. Then match against the list of Unicode
+properties or large chars or ranges that end with a large char. We won't ever
+encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
+
+if ((*data++ & XCL_MAP) != 0) data += 32;
+
+while ((t = *data++) != XCL_END)
+ {
+ int x, y;
+ if (t == XCL_SINGLE)
+ {
+ GETCHARINC(x, data);
+ if (c == x) return !negated;
+ }
+ else if (t == XCL_RANGE)
+ {
+ GETCHARINC(x, data);
+ GETCHARINC(y, data);
+ if (c >= x && c <= y) return !negated;
+ }
+
+#ifdef SUPPORT_UCP
+ else /* XCL_PROP & XCL_NOTPROP */
+ {
+ int chartype, othercase;
+ int rqdtype = *data++;
+ int category = ucp_findchar(c, &chartype, &othercase);
+ if (rqdtype >= 128)
+ {
+ if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
+ }
+ else
+ {
+ if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
+ }
+ }
+#endif /* SUPPORT_UCP */
+ }
+
+return negated; /* char did not match */
+}
+#endif
+
+
+/***************************************************************************
+****************************************************************************
+ RECURSION IN THE match() FUNCTION
+
+The match() function is highly recursive. Some regular expressions can cause
+it to recurse thousands of times. I was writing for Unix, so I just let it
+call itself recursively. This uses the stack for saving everything that has
+to be saved for a recursive call. On Unix, the stack can be large, and this
+works fine.
+
+It turns out that on non-Unix systems there are problems with programs that
+use a lot of stack. (This despite the fact that every last chip has oodles
+of memory these days, and techniques for extending the stack have been known
+for decades.) So....
+
+There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
+calls by keeping local variables that need to be preserved in blocks of memory
+obtained from malloc instead instead of on the stack. Macros are used to
+achieve this so that the actual code doesn't look very different to what it
+always used to.
+****************************************************************************
+***************************************************************************/
+
+
+/* These versions of the macros use the stack, as normal */
+
+#ifndef NO_RECURSE
+#define REGISTER register
+#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
+#define RRETURN(ra) return ra
+#else
+
+
+/* These versions of the macros manage a private stack on the heap. Note
+that the rd argument of RMATCH isn't actually used. It's the md argument of
+match(), which never changes. */
+
+#define REGISTER
+
+#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
+ {\
+ heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
+ if (setjmp(frame->Xwhere) == 0)\
+ {\
+ newframe->Xeptr = ra;\
+ newframe->Xecode = rb;\
+ newframe->Xoffset_top = rc;\
+ newframe->Xims = re;\
+ newframe->Xeptrb = rf;\
+ newframe->Xflags = rg;\
+ newframe->Xprevframe = frame;\
+ frame = newframe;\
+ DPRINTF(("restarting from line %d\n", __LINE__));\
+ goto HEAP_RECURSE;\
+ }\
+ else\
+ {\
+ DPRINTF(("longjumped back to line %d\n", __LINE__));\
+ frame = md->thisframe;\
+ rx = frame->Xresult;\
+ }\
+ }
+
+#define RRETURN(ra)\
+ {\
+ heapframe *newframe = frame;\
+ frame = newframe->Xprevframe;\
+ (pcre_stack_free)(newframe);\
+ if (frame != NULL)\
+ {\
+ frame->Xresult = ra;\
+ md->thisframe = frame;\
+ longjmp(frame->Xwhere, 1);\
+ }\
+ return ra;\
+ }
+
+
+/* Structure for remembering the local variables in a private frame */
+
+typedef struct heapframe {
+ struct heapframe *Xprevframe;
+
+ /* Function arguments that may change */
+
+ const uschar *Xeptr;
+ const uschar *Xecode;
+ int Xoffset_top;
+ long int Xims;
+ eptrblock *Xeptrb;
+ int Xflags;
+
+ /* Function local variables */
+
+ const uschar *Xcallpat;
+ const uschar *Xcharptr;
+ const uschar *Xdata;
+ const uschar *Xnext;
+ const uschar *Xpp;
+ const uschar *Xprev;
+ const uschar *Xsaved_eptr;
+
+ recursion_info Xnew_recursive;
+
+ BOOL Xcur_is_word;
+ BOOL Xcondition;
+ BOOL Xminimize;
+ BOOL Xprev_is_word;
+
+ unsigned long int Xoriginal_ims;
+
+#ifdef SUPPORT_UCP
+ int Xprop_type;
+ int Xprop_fail_result;
+ int Xprop_category;
+ int Xprop_chartype;
+ int Xprop_othercase;
+ int Xprop_test_against;
+ int *Xprop_test_variable;
+#endif
+
+ int Xctype;
+ int Xfc;
+ int Xfi;
+ int Xlength;
+ int Xmax;
+ int Xmin;
+ int Xnumber;
+ int Xoffset;
+ int Xop;
+ int Xsave_capture_last;
+ int Xsave_offset1, Xsave_offset2, Xsave_offset3;
+ int Xstacksave[REC_STACK_SAVE_MAX];
+
+ eptrblock Xnewptrb;
+
+ /* Place to pass back result, and where to jump back to */
+
+ int Xresult;
+ jmp_buf Xwhere;
+
+} heapframe;
+
+#endif
+
+
+/***************************************************************************
+***************************************************************************/
+
+
+
+/*************************************************
+* Match from current position *
+*************************************************/
+
+/* On entry ecode points to the first opcode, and eptr to the first character
+in the subject string, while eptrb holds the value of eptr at the start of the
+last bracketed group - used for breaking infinite loops matching zero-length
+strings. This function is called recursively in many circumstances. Whenever it
+returns a negative (error) response, the outer incarnation must also return the
+same response.
+
+Performance note: It might be tempting to extract commonly used fields from the
+md structure (e.g. utf8, end_subject) into individual variables to improve
+performance. Tests using gcc on a SPARC disproved this; in the first case, it
+made performance worse.
+
+Arguments:
+ eptr pointer in subject
+ ecode position in code
+ offset_top current top pointer
+ md pointer to "static" info for the match
+ ims current /i, /m, and /s options
+ eptrb pointer to chain of blocks containing eptr at start of
+ brackets - for testing for empty matches
+ flags can contain
+ match_condassert - this is an assertion condition
+ match_isgroup - this is the start of a bracketed group
+
+Returns: MATCH_MATCH if matched ) these values are >= 0
+ MATCH_NOMATCH if failed to match )
+ a negative PCRE_ERROR_xxx value if aborted by an error condition
+ (e.g. stopped by recursion limit)
+*/
+
+static int
+match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
+ int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
+ int flags)
+{
+/* These variables do not need to be preserved over recursion in this function,
+so they can be ordinary variables in all cases. Mark them with "register"
+because they are used a lot in loops. */
+
+register int rrc; /* Returns from recursive calls */
+register int i; /* Used for loops not involving calls to RMATCH() */
+register int c; /* Character values not kept over RMATCH() calls */
+
+/* When recursion is not being used, all "local" variables that have to be
+preserved over calls to RMATCH() are part of a "frame" which is obtained from
+heap storage. Set up the top-level frame here; others are obtained from the
+heap whenever RMATCH() does a "recursion". See the macro definitions above. */
+
+#ifdef NO_RECURSE
+heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
+frame->Xprevframe = NULL; /* Marks the top level */
+
+/* Copy in the original argument variables */
+
+frame->Xeptr = eptr;
+frame->Xecode = ecode;
+frame->Xoffset_top = offset_top;
+frame->Xims = ims;
+frame->Xeptrb = eptrb;
+frame->Xflags = flags;
+
+/* This is where control jumps back to to effect "recursion" */
+
+HEAP_RECURSE:
+
+/* Macros make the argument variables come from the current frame */
+
+#define eptr frame->Xeptr
+#define ecode frame->Xecode
+#define offset_top frame->Xoffset_top
+#define ims frame->Xims
+#define eptrb frame->Xeptrb
+#define flags frame->Xflags
+
+/* Ditto for the local variables */
+
+#ifdef SUPPORT_UTF8
+#define charptr frame->Xcharptr
+#endif
+#define callpat frame->Xcallpat
+#define data frame->Xdata
+#define next frame->Xnext
+#define pp frame->Xpp
+#define prev frame->Xprev
+#define saved_eptr frame->Xsaved_eptr
+
+#define new_recursive frame->Xnew_recursive
+
+#define cur_is_word frame->Xcur_is_word
+#define condition frame->Xcondition
+#define minimize frame->Xminimize
+#define prev_is_word frame->Xprev_is_word
+
+#define original_ims frame->Xoriginal_ims
+
+#ifdef SUPPORT_UCP
+#define prop_type frame->Xprop_type
+#define prop_fail_result frame->Xprop_fail_result
+#define prop_category frame->Xprop_category
+#define prop_chartype frame->Xprop_chartype
+#define prop_othercase frame->Xprop_othercase
+#define prop_test_against frame->Xprop_test_against
+#define prop_test_variable frame->Xprop_test_variable
+#endif
+
+#define ctype frame->Xctype
+#define fc frame->Xfc
+#define fi frame->Xfi
+#define length frame->Xlength
+#define max frame->Xmax
+#define min frame->Xmin
+#define number frame->Xnumber
+#define offset frame->Xoffset
+#define op frame->Xop
+#define save_capture_last frame->Xsave_capture_last
+#define save_offset1 frame->Xsave_offset1
+#define save_offset2 frame->Xsave_offset2
+#define save_offset3 frame->Xsave_offset3
+#define stacksave frame->Xstacksave
+
+#define newptrb frame->Xnewptrb
+
+/* When recursion is being used, local variables are allocated on the stack and
+get preserved during recursion in the normal way. In this environment, fi and
+i, and fc and c, can be the same variables. */
+
+#else
+#define fi i
+#define fc c
+
+
+#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
+const uschar *charptr; /* small blocks of the code. My normal */
+#endif /* style of coding would have declared */
+const uschar *callpat; /* them within each of those blocks. */
+const uschar *data; /* However, in order to accommodate the */
+const uschar *next; /* version of this code that uses an */
+const uschar *pp; /* external "stack" implemented on the */
+const uschar *prev; /* heap, it is easier to declare them */
+const uschar *saved_eptr; /* all here, so the declarations can */
+ /* be cut out in a block. The only */
+recursion_info new_recursive; /* declarations within blocks below are */
+ /* for variables that do not have to */
+BOOL cur_is_word; /* be preserved over a recursive call */
+BOOL condition; /* to RMATCH(). */
+BOOL minimize;
+BOOL prev_is_word;
+
+unsigned long int original_ims;
+
+#ifdef SUPPORT_UCP
+int prop_type;
+int prop_fail_result;
+int prop_category;
+int prop_chartype;
+int prop_othercase;
+int prop_test_against;
+int *prop_test_variable;
+#endif
+
+int ctype;
+int length;
+int max;
+int min;
+int number;
+int offset;
+int op;
+int save_capture_last;
+int save_offset1, save_offset2, save_offset3;
+int stacksave[REC_STACK_SAVE_MAX];
+
+eptrblock newptrb;
+#endif
+
+/* These statements are here to stop the compiler complaining about unitialized
+variables. */
+
+#ifdef SUPPORT_UCP
+prop_fail_result = 0;
+prop_test_against = 0;
+prop_test_variable = NULL;
+#endif
+
+/* OK, now we can get on with the real code of the function. Recursion is
+specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
+these just turn into a recursive call to match() and a "return", respectively.
+However, RMATCH isn't like a function call because it's quite a complicated
+macro. It has to be used in one particular way. This shouldn't, however, impact
+performance when true recursion is being used. */
+
+if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
+
+original_ims = ims; /* Save for resetting on ')' */
+
+/* At the start of a bracketed group, add the current subject pointer to the
+stack of such pointers, to be re-instated at the end of the group when we hit
+the closing ket. When match() is called in other circumstances, we don't add to
+this stack. */
+
+if ((flags & match_isgroup) != 0)
+ {
+ newptrb.epb_prev = eptrb;
+ newptrb.epb_saved_eptr = eptr;
+ eptrb = &newptrb;
+ }
+
+/* Now start processing the operations. */
+
+for (;;)
+ {
+ op = *ecode;
+ minimize = FALSE;
+
+ /* For partial matching, remember if we ever hit the end of the subject after
+ matching at least one subject character. */
+
+ if (md->partial &&
+ eptr >= md->end_subject &&
+ eptr > md->start_match)
+ md->hitend = TRUE;
+
+ /* Opening capturing bracket. If there is space in the offset vector, save
+ the current subject position in the working slot at the top of the vector. We
+ mustn't change the current values of the data slot, because they may be set
+ from a previous iteration of this group, and be referred to by a reference
+ inside the group.
+
+ If the bracket fails to match, we need to restore this value and also the
+ values of the final offsets, in case they were set by a previous iteration of
+ the same bracket.
+
+ If there isn't enough space in the offset vector, treat this as if it were a
+ non-capturing bracket. Don't worry about setting the flag for the error case
+ here; that is handled in the code for KET. */
+
+ if (op > OP_BRA)
+ {
+ number = op - OP_BRA;
+
+ /* For extended extraction brackets (large number), we have to fish out the
+ number from a dummy opcode at the start. */
+
+ if (number > EXTRACT_BASIC_MAX)
+ number = GET2(ecode, 2+LINK_SIZE);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("start bracket %d subject=", number);
+ pchars(eptr, 16, TRUE, md);
+ printf("\n");
+#endif
+
+ if (offset < md->offset_max)
+ {
+ save_offset1 = md->offset_vector[offset];
+ save_offset2 = md->offset_vector[offset+1];
+ save_offset3 = md->offset_vector[md->offset_end - number];
+ save_capture_last = md->capture_last;
+
+ DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
+ md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+
+ do
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
+ match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ md->capture_last = save_capture_last;
+ ecode += GET(ecode, 1);
+ }
+ while (*ecode == OP_ALT);
+
+ DPRINTF(("bracket %d failed\n", number));
+
+ md->offset_vector[offset] = save_offset1;
+ md->offset_vector[offset+1] = save_offset2;
+ md->offset_vector[md->offset_end - number] = save_offset3;
+
+ RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Insufficient room for saving captured contents */
+
+ else op = OP_BRA;
+ }
+
+ /* Other types of node can be handled by a switch */
+
+ switch(op)
+ {
+ case OP_BRA: /* Non-capturing bracket: optimized */
+ DPRINTF(("start bracket 0\n"));
+ do
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
+ match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += GET(ecode, 1);
+ }
+ while (*ecode == OP_ALT);
+ DPRINTF(("bracket 0 failed\n"));
+ RRETURN(MATCH_NOMATCH);
+
+ /* Conditional group: compilation checked that there are no more than
+ two branches. If the condition is false, skipping the first branch takes us
+ past the end if there is only one branch, but that's OK because that is
+ exactly what going to the ket would do. */
+
+ case OP_COND:
+ if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
+ {
+ offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
+ condition = (offset == CREF_RECURSE * 2)?
+ (md->recursive != NULL) :
+ (offset < offset_top && md->offset_vector[offset] >= 0);
+ RMATCH(rrc, eptr, ecode + (condition?
+ (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
+ offset_top, md, ims, eptrb, match_isgroup);
+ RRETURN(rrc);
+ }
+
+ /* The condition is an assertion. Call match() to evaluate it - setting
+ the final argument TRUE causes it to stop at the end of an assertion. */
+
+ else
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
+ match_condassert | match_isgroup);
+ if (rrc == MATCH_MATCH)
+ {
+ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
+ while (*ecode == OP_ALT) ecode += GET(ecode, 1);
+ }
+ else if (rrc != MATCH_NOMATCH)
+ {
+ RRETURN(rrc); /* Need braces because of following else */
+ }
+ else ecode += GET(ecode, 1);
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
+ match_isgroup);
+ RRETURN(rrc);
+ }
+ /* Control never reaches here */
+
+ /* Skip over conditional reference or large extraction number data if
+ encountered. */
+
+ case OP_CREF:
+ case OP_BRANUMBER:
+ ecode += 3;
+ break;
+
+ /* End of the pattern. If we are in a recursion, we should restore the
+ offsets appropriately and continue from after the call. */
+
+ case OP_END:
+ if (md->recursive != NULL && md->recursive->group_num == 0)
+ {
+ recursion_info *rec = md->recursive;
+ DPRINTF(("Hit the end in a (?0) recursion\n"));
+ md->recursive = rec->prevrec;
+ memmove(md->offset_vector, rec->offset_save,
+ rec->saved_max * sizeof(int));
+ md->start_match = rec->save_start;
+ ims = original_ims;
+ ecode = rec->after_call;
+ break;
+ }
+
+ /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
+ string - backtracking will then try other alternatives, if any. */
+
+ if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
+ md->end_match_ptr = eptr; /* Record where we ended */
+ md->end_offset_top = offset_top; /* and how many extracts were taken */
+ RRETURN(MATCH_MATCH);
+
+ /* Change option settings */
+
+ case OP_OPT:
+ ims = ecode[1];
+ ecode += 2;
+ DPRINTF(("ims set to %02lx\n", ims));
+ break;
+
+ /* Assertion brackets. Check the alternative branches in turn - the
+ matching won't pass the KET for an assertion. If any one branch matches,
+ the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
+ start of each branch to move the current point backwards, so the code at
+ this level is identical to the lookahead case. */
+
+ case OP_ASSERT:
+ case OP_ASSERTBACK:
+ do
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
+ match_isgroup);
+ if (rrc == MATCH_MATCH) break;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += GET(ecode, 1);
+ }
+ while (*ecode == OP_ALT);
+ if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+
+ /* If checking an assertion for a condition, return MATCH_MATCH. */
+
+ if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
+
+ /* Continue from after the assertion, updating the offsets high water
+ mark, since extracts may have been taken during the assertion. */
+
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ ecode += 1 + LINK_SIZE;
+ offset_top = md->end_offset_top;
+ continue;
+
+ /* Negative assertion: all branches must fail to match */
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK_NOT:
+ do
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
+ match_isgroup);
+ if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += GET(ecode,1);
+ }
+ while (*ecode == OP_ALT);
+
+ if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
+
+ ecode += 1 + LINK_SIZE;
+ continue;
+
+ /* Move the subject pointer back. This occurs only at the start of
+ each branch of a lookbehind assertion. If we are too close to the start to
+ move back, this match function fails. When working with UTF-8 we move
+ back a number of characters, not bytes. */
+
+ case OP_REVERSE:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ c = GET(ecode,1);
+ for (i = 0; i < c; i++)
+ {
+ eptr--;
+ if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ BACKCHAR(eptr)
+ }
+ }
+ else
+#endif
+
+ /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
+
+ {
+ eptr -= GET(ecode,1);
+ if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Skip to next op code */
+
+ ecode += 1 + LINK_SIZE;
+ break;
+
+ /* The callout item calls an external function, if one is provided, passing
+ details of the match so far. This is mainly for debugging, though the
+ function is able to force a failure. */
+
+ case OP_CALLOUT:
+ if (pcre_callout != NULL)
+ {
+ pcre_callout_block cb;
+ cb.version = 1; /* Version 1 of the callout block */
+ cb.callout_number = ecode[1];
+ cb.offset_vector = md->offset_vector;
+ cb.subject = (const char *)md->start_subject;
+ cb.subject_length = md->end_subject - md->start_subject;
+ cb.start_match = md->start_match - md->start_subject;
+ cb.current_position = eptr - md->start_subject;
+ cb.pattern_position = GET(ecode, 2);
+ cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
+ cb.capture_top = offset_top/2;
+ cb.capture_last = md->capture_last;
+ cb.callout_data = md->callout_data;
+ if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+ if (rrc < 0) RRETURN(rrc);
+ }
+ ecode += 2 + 2*LINK_SIZE;
+ break;
+
+ /* Recursion either matches the current regex, or some subexpression. The
+ offset data is the offset to the starting bracket from the start of the
+ whole pattern. (This is so that it works from duplicated subpatterns.)
+
+ If there are any capturing brackets started but not finished, we have to
+ save their starting points and reinstate them after the recursion. However,
+ we don't know how many such there are (offset_top records the completed
+ total) so we just have to save all the potential data. There may be up to
+ 65535 such values, which is too large to put on the stack, but using malloc
+ for small numbers seems expensive. As a compromise, the stack is used when
+ there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
+ is used. A problem is what to do if the malloc fails ... there is no way of
+ returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
+ values on the stack, and accept that the rest may be wrong.
+
+ There are also other values that have to be saved. We use a chained
+ sequence of blocks that actually live on the stack. Thanks to Robin Houston
+ for the original version of this logic. */
+
+ case OP_RECURSE:
+ {
+ callpat = md->start_code + GET(ecode, 1);
+ new_recursive.group_num = *callpat - OP_BRA;
+
+ /* For extended extraction brackets (large number), we have to fish out
+ the number from a dummy opcode at the start. */
+
+ if (new_recursive.group_num > EXTRACT_BASIC_MAX)
+ new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
+
+ /* Add to "recursing stack" */
+
+ new_recursive.prevrec = md->recursive;
+ md->recursive = &new_recursive;
+
+ /* Find where to continue from afterwards */
+
+ ecode += 1 + LINK_SIZE;
+ new_recursive.after_call = ecode;
+
+ /* Now save the offset data. */
+
+ new_recursive.saved_max = md->offset_end;
+ if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
+ new_recursive.offset_save = stacksave;
+ else
+ {
+ new_recursive.offset_save =
+ (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
+ if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
+ }
+
+ memcpy(new_recursive.offset_save, md->offset_vector,
+ new_recursive.saved_max * sizeof(int));
+ new_recursive.save_start = md->start_match;
+ md->start_match = eptr;
+
+ /* OK, now we can do the recursion. For each top-level alternative we
+ restore the offset and recursion data. */
+
+ DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
+ do
+ {
+ RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
+ eptrb, match_isgroup);
+ if (rrc == MATCH_MATCH)
+ {
+ md->recursive = new_recursive.prevrec;
+ if (new_recursive.offset_save != stacksave)
+ (pcre_free)(new_recursive.offset_save);
+ RRETURN(MATCH_MATCH);
+ }
+ else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+
+ md->recursive = &new_recursive;
+ memcpy(md->offset_vector, new_recursive.offset_save,
+ new_recursive.saved_max * sizeof(int));
+ callpat += GET(callpat, 1);
+ }
+ while (*callpat == OP_ALT);
+
+ DPRINTF(("Recursion didn't match\n"));
+ md->recursive = new_recursive.prevrec;
+ if (new_recursive.offset_save != stacksave)
+ (pcre_free)(new_recursive.offset_save);
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never reaches here */
+
+ /* "Once" brackets are like assertion brackets except that after a match,
+ the point in the subject string is not moved back. Thus there can never be
+ a move back into the brackets. Friedl calls these "atomic" subpatterns.
+ Check the alternative branches in turn - the matching won't pass the KET
+ for this kind of subpattern. If any one branch matches, we carry on as at
+ the end of a normal bracket, leaving the subject pointer. */
+
+ case OP_ONCE:
+ {
+ prev = ecode;
+ saved_eptr = eptr;
+
+ do
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
+ eptrb, match_isgroup);
+ if (rrc == MATCH_MATCH) break;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode += GET(ecode,1);
+ }
+ while (*ecode == OP_ALT);
+
+ /* If hit the end of the group (which could be repeated), fail */
+
+ if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
+
+ /* Continue as from after the assertion, updating the offsets high water
+ mark, since extracts may have been taken. */
+
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+
+ offset_top = md->end_offset_top;
+ eptr = md->end_match_ptr;
+
+ /* For a non-repeating ket, just continue at this level. This also
+ happens for a repeating ket if no characters were matched in the group.
+ This is the forcible breaking of infinite loops as implemented in Perl
+ 5.005. If there is an options reset, it will get obeyed in the normal
+ course of events. */
+
+ if (*ecode == OP_KET || eptr == saved_eptr)
+ {
+ ecode += 1+LINK_SIZE;
+ break;
+ }
+
+ /* The repeating kets try the rest of the pattern or restart from the
+ preceding bracket, in the appropriate order. We need to reset any options
+ that changed within the bracket before re-running it, so check the next
+ opcode. */
+
+ if (ecode[1+LINK_SIZE] == OP_OPT)
+ {
+ ims = (ims & ~PCRE_IMS) | ecode[4];
+ DPRINTF(("ims set to %02lx at group repeat\n", ims));
+ }
+
+ if (*ecode == OP_KETRMIN)
+ {
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ else /* OP_KETRMAX */
+ {
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ }
+ RRETURN(MATCH_NOMATCH);
+
+ /* An alternation is the end of a branch; scan along to find the end of the
+ bracketed group and go to there. */
+
+ case OP_ALT:
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ break;
+
+ /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
+ that it may occur zero times. It may repeat infinitely, or not at all -
+ i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
+ repeat limits are compiled as a number of copies, with the optional ones
+ preceded by BRAZERO or BRAMINZERO. */
+
+ case OP_BRAZERO:
+ {
+ next = ecode+1;
+ RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ do next += GET(next,1); while (*next == OP_ALT);
+ ecode = next + 1+LINK_SIZE;
+ }
+ break;
+
+ case OP_BRAMINZERO:
+ {
+ next = ecode+1;
+ do next += GET(next,1); while (*next == OP_ALT);
+ RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
+ match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ ecode++;
+ }
+ break;
+
+ /* End of a group, repeated or non-repeating. If we are at the end of
+ an assertion "group", stop matching and return MATCH_MATCH, but record the
+ current high water mark for use by positive assertions. Do this also
+ for the "once" (not-backup up) groups. */
+
+ case OP_KET:
+ case OP_KETRMIN:
+ case OP_KETRMAX:
+ {
+ prev = ecode - GET(ecode, 1);
+ saved_eptr = eptrb->epb_saved_eptr;
+
+ /* Back up the stack of bracket start pointers. */
+
+ eptrb = eptrb->epb_prev;
+
+ if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
+ *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
+ *prev == OP_ONCE)
+ {
+ md->end_match_ptr = eptr; /* For ONCE */
+ md->end_offset_top = offset_top;
+ RRETURN(MATCH_MATCH);
+ }
+
+ /* In all other cases except a conditional group we have to check the
+ group number back at the start and if necessary complete handling an
+ extraction by setting the offsets and bumping the high water mark. */
+
+ if (*prev != OP_COND)
+ {
+ number = *prev - OP_BRA;
+
+ /* For extended extraction brackets (large number), we have to fish out
+ the number from a dummy opcode at the start. */
+
+ if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("end bracket %d", number);
+ printf("\n");
+#endif
+
+ /* Test for a numbered group. This includes groups called as a result
+ of recursion. Note that whole-pattern recursion is coded as a recurse
+ into group 0, so it won't be picked up here. Instead, we catch it when
+ the OP_END is reached. */
+
+ if (number > 0)
+ {
+ md->capture_last = number;
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ {
+ md->offset_vector[offset] =
+ md->offset_vector[md->offset_end - number];
+ md->offset_vector[offset+1] = eptr - md->start_subject;
+ if (offset_top <= offset) offset_top = offset + 2;
+ }
+
+ /* Handle a recursively called group. Restore the offsets
+ appropriately and continue from after the call. */
+
+ if (md->recursive != NULL && md->recursive->group_num == number)
+ {
+ recursion_info *rec = md->recursive;
+ DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
+ md->recursive = rec->prevrec;
+ md->start_match = rec->save_start;
+ memcpy(md->offset_vector, rec->offset_save,
+ rec->saved_max * sizeof(int));
+ ecode = rec->after_call;
+ ims = original_ims;
+ break;
+ }
+ }
+ }
+
+ /* Reset the value of the ims flags, in case they got changed during
+ the group. */
+
+ ims = original_ims;
+ DPRINTF(("ims reset to %02lx\n", ims));
+
+ /* For a non-repeating ket, just continue at this level. This also
+ happens for a repeating ket if no characters were matched in the group.
+ This is the forcible breaking of infinite loops as implemented in Perl
+ 5.005. If there is an options reset, it will get obeyed in the normal
+ course of events. */
+
+ if (*ecode == OP_KET || eptr == saved_eptr)
+ {
+ ecode += 1 + LINK_SIZE;
+ break;
+ }
+
+ /* The repeating kets try the rest of the pattern or restart from the
+ preceding bracket, in the appropriate order. */
+
+ if (*ecode == OP_KETRMIN)
+ {
+ RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ else /* OP_KETRMAX */
+ {
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+
+ /* Start of subject unless notbol, or after internal newline if multiline */
+
+ case OP_CIRC:
+ if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
+ if ((ims & PCRE_MULTILINE) != 0)
+ {
+ if (eptr != md->start_subject && eptr[-1] != NEWLINE)
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+ }
+ /* ... else fall through */
+
+ /* Start of subject assertion */
+
+ case OP_SOD:
+ if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Start of match assertion */
+
+ case OP_SOM:
+ if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Assert before internal newline if multiline, or before a terminating
+ newline unless endonly is set, else end of subject unless noteol is set. */
+
+ case OP_DOLL:
+ if ((ims & PCRE_MULTILINE) != 0)
+ {
+ if (eptr < md->end_subject)
+ { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
+ else
+ { if (md->noteol) RRETURN(MATCH_NOMATCH); }
+ ecode++;
+ break;
+ }
+ else
+ {
+ if (md->noteol) RRETURN(MATCH_NOMATCH);
+ if (!md->endonly)
+ {
+ if (eptr < md->end_subject - 1 ||
+ (eptr == md->end_subject - 1 && *eptr != NEWLINE))
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+ }
+ }
+ /* ... else fall through */
+
+ /* End of subject assertion (\z) */
+
+ case OP_EOD:
+ if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* End of subject or ending \n assertion (\Z) */
+
+ case OP_EODN:
+ if (eptr < md->end_subject - 1 ||
+ (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ /* Word boundary assertions */
+
+ case OP_NOT_WORD_BOUNDARY:
+ case OP_WORD_BOUNDARY:
+ {
+
+ /* Find out if the previous and current characters are "word" characters.
+ It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
+ be "non-word" characters. */
+
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ if (eptr == md->start_subject) prev_is_word = FALSE; else
+ {
+ const uschar *lastptr = eptr - 1;
+ while((*lastptr & 0xc0) == 0x80) lastptr--;
+ GETCHAR(c, lastptr);
+ prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
+ }
+ if (eptr >= md->end_subject) cur_is_word = FALSE; else
+ {
+ GETCHAR(c, eptr);
+ cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
+ }
+ }
+ else
+#endif
+
+ /* More streamlined when not in UTF-8 mode */
+
+ {
+ prev_is_word = (eptr != md->start_subject) &&
+ ((md->ctypes[eptr[-1]] & ctype_word) != 0);
+ cur_is_word = (eptr < md->end_subject) &&
+ ((md->ctypes[*eptr] & ctype_word) != 0);
+ }
+
+ /* Now see if the situation is what we want */
+
+ if ((*ecode++ == OP_WORD_BOUNDARY)?
+ cur_is_word == prev_is_word : cur_is_word != prev_is_word)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ /* Match a single character type; inline for speed */
+
+ case OP_ANY:
+ if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
+ RRETURN(MATCH_NOMATCH);
+ if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#endif
+ ecode++;
+ break;
+
+ /* Match a single byte, even in UTF-8 mode. This opcode really does match
+ any byte, even newline, independent of the setting of PCRE_DOTALL. */
+
+ case OP_ANYBYTE:
+ if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_DIGIT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_digit) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_DIGIT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_digit) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_space) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_WHITESPACE:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_space) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c < 256 &&
+#endif
+ (md->ctypes[c] & ctype_word) != 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+ case OP_WORDCHAR:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ if (
+#ifdef SUPPORT_UTF8
+ c >= 256 ||
+#endif
+ (md->ctypes[c] & ctype_word) == 0
+ )
+ RRETURN(MATCH_NOMATCH);
+ ecode++;
+ break;
+
+#ifdef SUPPORT_UCP
+ /* Check the next character by Unicode property. We will get here only
+ if the support is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype, rqdtype;
+ int othercase;
+ int category = ucp_findchar(c, &chartype, &othercase);
+
+ rqdtype = *(++ecode);
+ ecode++;
+
+ if (rqdtype >= 128)
+ {
+ if ((rqdtype - 128 != category) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((rqdtype != chartype) == (op == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ break;
+
+ /* Match an extended Unicode sequence. We will get here only if the support
+ is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_EXTUNI:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ {
+ int chartype;
+ int othercase;
+ int category = ucp_findchar(c, &chartype, &othercase);
+ if (category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ category = ucp_findchar(c, &chartype, &othercase);
+ if (category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ ecode++;
+ break;
+#endif
+
+
+ /* Match a back reference, possibly repeatedly. Look past the end of the
+ item to see if there is repeat information following. The code is similar
+ to that for character classes, but repeated for efficiency. Then obey
+ similar code to character type repeats - written out again for speed.
+ However, if the referenced string is the empty string, always treat
+ it as matched, any number of times (otherwise there could be infinite
+ loops). */
+
+ case OP_REF:
+ {
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 3; /* Advance past item */
+
+ /* If the reference is unset, set the length to be longer than the amount
+ of subject left; this ensures that every attempt at a match fails. We
+ can't just fail here, because of the possibility of quantifiers with zero
+ minima. */
+
+ length = (offset >= offset_top || md->offset_vector[offset] < 0)?
+ md->end_subject - eptr + 1 :
+ md->offset_vector[offset+1] - md->offset_vector[offset];
+
+ /* Set up for repetition, or handle the non-repeated case */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ continue; /* With the main loop */
+ }
+
+ /* If the length of the reference is zero, just continue with the
+ main loop. */
+
+ if (length == 0) continue;
+
+ /* First, ensure the minimum number of matches are present. We get back
+ the length of the reference string explicitly rather than passing the
+ address of eptr, so that eptr can be a register variable. */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ }
+
+ /* If min = max, continue at the same level without recursion.
+ They are not both allowed to be zero. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep trying and advancing the pointer */
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || !match_ref(offset, eptr, length, md, ims))
+ RRETURN(MATCH_NOMATCH);
+ eptr += length;
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest string and work backwards */
+
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (!match_ref(offset, eptr, length, md, ims)) break;
+ eptr += length;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= length;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+
+
+ /* Match a bit-mapped character class, possibly repeatedly. This op code is
+ used when all the characters in the class have values in the range 0-255,
+ and either the matching is caseful, or the characters are in the range
+ 0-127 when UTF-8 processing is enabled. The only difference between
+ OP_CLASS and OP_NCLASS occurs when a data character outside the range is
+ encountered.
+
+ First, look past the end of the item to see if there is repeat information
+ following. Then obey similar code to character type repeats - written out
+ again for speed. */
+
+ case OP_NCLASS:
+ case OP_CLASS:
+ {
+ data = ecode + 1; /* Save for matching */
+ ecode += 33; /* Advance past the item */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ min = max = 1;
+ break;
+ }
+
+ /* First, ensure the minimum number of matches are present. */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ c = *eptr++;
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* If max == min we can continue with the main loop without the
+ need to recurse. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep testing the rest of the expression and advancing
+ the pointer while it matches the class. */
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ c = *eptr++;
+ if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest possible run, then work backwards. */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c > 255)
+ {
+ if (op == OP_CLASS) break;
+ }
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) break;
+ }
+ eptr += len;
+ }
+ for (;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if ((data[c/8] & (1 << (c&7))) == 0) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+
+ /* Match an extended character class. This opcode is encountered only
+ in UTF-8 mode, because that's the only time it is compiled. */
+
+#ifdef SUPPORT_UTF8
+ case OP_XCLASS:
+ {
+ data = ecode + 1 + LINK_SIZE; /* Save for matching */
+ ecode += GET(ecode, 1); /* Advance past the item */
+
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
+
+ default: /* No repeat follows */
+ min = max = 1;
+ break;
+ }
+
+ /* First, ensure the minimum number of matches are present. */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* If max == min we can continue with the main loop without the
+ need to recurse. */
+
+ if (min == max) continue;
+
+ /* If minimizing, keep testing the rest of the expression and advancing
+ the pointer while it matches the class. */
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing, find the longest possible run, then work backwards. */
+
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (!match_xclass(c, data)) break;
+ eptr += len;
+ }
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr)
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Control never gets here */
+ }
+#endif /* End of XCLASS */
+
+ /* Match a single character, casefully */
+
+ case OP_CHAR:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ }
+ else
+#endif
+
+ /* Non-UTF-8 mode */
+ {
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
+ ecode += 2;
+ }
+ break;
+
+ /* Match a single character, caselessly */
+
+ case OP_CHARNC:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ length = 1;
+ ecode++;
+ GETCHARLEN(fc, ecode, length);
+
+ if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+
+ /* If the pattern character's value is < 128, we have only one byte, and
+ can use the fast lookup table. */
+
+ if (fc < 128)
+ {
+ if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Otherwise we must pick up the subject character */
+
+ else
+ {
+ int dc;
+ GETCHARINC(dc, eptr);
+ ecode += length;
+
+ /* If we have Unicode property support, we can use it to test the other
+ case of the character, if there is one. The result of ucp_findchar() is
+ < 0 if the char isn't found, and othercase is returned as zero if there
+ isn't one. */
+
+ if (fc != dc)
+ {
+#ifdef SUPPORT_UCP
+ int chartype;
+ int othercase;
+ if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
+#endif
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Non-UTF-8 mode */
+ {
+ if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
+ if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ ecode += 2;
+ }
+ break;
+
+ /* Match a single character repeatedly; different opcodes share code. */
+
+ case OP_EXACT:
+ min = max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATCHAR;
+
+ case OP_UPTO:
+ case OP_MINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_MINUPTO;
+ ecode += 3;
+ goto REPEATCHAR;
+
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ c = *ecode++ - OP_STAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single-character matches. We can give
+ up quickly if there are fewer than the minimum number of characters left in
+ the subject. */
+
+ REPEATCHAR:
+#ifdef SUPPORT_UTF8
+ if (md->utf8)
+ {
+ length = 1;
+ charptr = ecode;
+ GETCHARLEN(fc, ecode, length);
+ if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ ecode += length;
+
+ /* Handle multibyte character matching specially here. There is
+ support for caseless matching if UCP support is present. */
+
+ if (length > 1)
+ {
+ int oclength = 0;
+ uschar occhars[8];
+
+#ifdef SUPPORT_UCP
+ int othercase;
+ int chartype;
+ if ((ims & PCRE_CASELESS) != 0 &&
+ ucp_findchar(fc, &chartype, &othercase) >= 0 &&
+ othercase > 0)
+ oclength = ord2utf8(othercase, occhars);
+#endif /* SUPPORT_UCP */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ /* Need braces because of following else */
+ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
+ eptr += oclength;
+ }
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr > md->end_subject - length) break;
+ if (memcmp(eptr, charptr, length) == 0) eptr += length;
+ else if (oclength == 0) break;
+ else
+ {
+ if (memcmp(eptr, occhars, oclength) != 0) break;
+ eptr += oclength;
+ }
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= length;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* If the length of a UTF-8 character is 1, we fall through here, and
+ obey the code as for non-UTF-8 characters below, though in this case the
+ value of fc will always be < 128. */
+ }
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* When not in UTF-8 mode, load a single-byte character. */
+ {
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ fc = *ecode++;
+ }
+
+ /* The value of fc at this point is always less than 256, though we may or
+ may not be in UTF-8 mode. The code is duplicated for the caseless and
+ caseful cases, for speed, since matching characters is likely to be quite
+ common. First, ensure the minimum number of matches are present. If min =
+ max, continue at the same level without recursing. Otherwise, if
+ minimizing, keep trying the rest of the expression and advancing one
+ matching character if failing, up to the maximum. Alternatively, if
+ maximizing, find the maximum number of characters and work backwards. */
+
+ DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
+ max, eptr));
+
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+ fc = md->lcc[fc];
+ for (i = 1; i <= min; i++)
+ if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ if (min == max) continue;
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject ||
+ fc != md->lcc[*eptr++])
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* Caseful comparisons (includes all multi-byte characters) */
+
+ else
+ {
+ for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (min == max) continue;
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc != *eptr) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+ /* Match a negated single one-byte character. The character we are
+ checking can be multibyte. */
+
+ case OP_NOT:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ ecode++;
+ GETCHARINCTEST(c, eptr);
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UTF8
+ if (c < 256)
+#endif
+ c = md->lcc[c];
+ if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ /* Match a negated single one-byte character repeatedly. This is almost a
+ repeat of the code for a repeated single character, but I haven't found a
+ nice way of commoning these up that doesn't require a test of the
+ positive/negative option for each character match. Maybe that wouldn't add
+ very much to the time taken, but character matching *is* what this is all
+ about... */
+
+ case OP_NOTEXACT:
+ min = max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTUPTO:
+ case OP_NOTMINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_NOTMINUPTO;
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTSTAR:
+ case OP_NOTMINSTAR:
+ case OP_NOTPLUS:
+ case OP_NOTMINPLUS:
+ case OP_NOTQUERY:
+ case OP_NOTMINQUERY:
+ c = *ecode++ - OP_NOTSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single-byte matches. We can give up quickly
+ if there are fewer than the minimum number of bytes left in the
+ subject. */
+
+ REPEATNOTCHAR:
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ fc = *ecode++;
+
+ /* The code is duplicated for the caseless and caseful cases, for speed,
+ since matching characters is likely to be quite common. First, ensure the
+ minimum number of matches are present. If min = max, continue at the same
+ level without recursing. Otherwise, if minimizing, keep trying the rest of
+ the expression and advancing one matching character if failing, up to the
+ maximum. Alternatively, if maximizing, find the maximum number of
+ characters and work backwards. */
+
+ DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
+ max, eptr));
+
+ if ((ims & PCRE_CASELESS) != 0)
+ {
+ fc = md->lcc[fc];
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(d, eptr);
+ if (d < 256) d = md->lcc[d];
+ if (fc == d) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ GETCHARINC(d, eptr);
+ if (d < 256) d = md->lcc[d];
+ if (fi >= max || eptr >= md->end_subject || fc == d)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* Maximize case */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(d, eptr, len);
+ if (d < 256) d = md->lcc[d];
+ if (fc == d) break;
+ eptr += len;
+ }
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr--;
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+
+ /* Caseful comparisons */
+
+ else
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(d, eptr);
+ if (fc == d) RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = 1; i <= min; i++)
+ if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ GETCHARINC(d, eptr);
+ if (fi >= max || eptr >= md->end_subject || fc == d)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* Maximize case */
+
+ else
+ {
+ pp = eptr;
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ register int d;
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(d, eptr, len);
+ if (fc == d) break;
+ eptr += len;
+ }
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || fc == *eptr) break;
+ eptr++;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr--;
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+
+ /* Match a single character type repeatedly; several different opcodes
+ share code. This is very similar to the code for single characters, but we
+ repeat it in the interests of efficiency. */
+
+ case OP_TYPEEXACT:
+ min = max = GET2(ecode, 1);
+ minimize = TRUE;
+ ecode += 3;
+ goto REPEATTYPE;
+
+ case OP_TYPEUPTO:
+ case OP_TYPEMINUPTO:
+ min = 0;
+ max = GET2(ecode, 1);
+ minimize = *ecode == OP_TYPEMINUPTO;
+ ecode += 3;
+ goto REPEATTYPE;
+
+ case OP_TYPESTAR:
+ case OP_TYPEMINSTAR:
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ case OP_TYPEQUERY:
+ case OP_TYPEMINQUERY:
+ c = *ecode++ - OP_TYPESTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+
+ /* Common code for all repeated single character type matches. Note that
+ in UTF-8 mode, '.' matches a character of any length, but for the other
+ character types, the valid characters are all one-byte long. */
+
+ REPEATTYPE:
+ ctype = *ecode++; /* Code for the character type */
+
+#ifdef SUPPORT_UCP
+ if (ctype == OP_PROP || ctype == OP_NOTPROP)
+ {
+ prop_fail_result = ctype == OP_NOTPROP;
+ prop_type = *ecode++;
+ if (prop_type >= 128)
+ {
+ prop_test_against = prop_type - 128;
+ prop_test_variable = &prop_category;
+ }
+ else
+ {
+ prop_test_against = prop_type;
+ prop_test_variable = &prop_chartype;
+ }
+ }
+ else prop_type = -1;
+#endif
+
+ /* First, ensure the minimum number of matches are present. Use inline
+ code for maximizing the speed, and do the type test once at the start
+ (i.e. keep it out of the loop). Also we can test that there are at least
+ the minimum number of bytes before we start. This isn't as effective in
+ UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
+ is tidier. Also separate the UCP code, which can be the same for both UTF-8
+ and single-bytes. */
+
+ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ if (min > 0)
+ {
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINC(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = 1; i <= min; i++)
+ {
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+/* Handle all other cases when the coding is UTF-8 */
+
+#ifdef SUPPORT_UTF8
+ if (md->utf8) switch(ctype)
+ {
+ case OP_ANY:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
+ RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
+ case OP_ANYBYTE:
+ eptr += min;
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
+ RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
+ RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject ||
+ *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ /* No need to skip more bytes - we know it's a 1-byte character */
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ } /* End switch(ctype) */
+
+ else
+#endif /* SUPPORT_UTF8 */
+
+ /* Code for the non-UTF-8 case for minimum matching of operators other
+ than OP_PROP and OP_NOTPROP. */
+
+ switch(ctype)
+ {
+ case OP_ANY:
+ if ((ims & PCRE_DOTALL) == 0)
+ {
+ for (i = 1; i <= min; i++)
+ if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
+ }
+ else eptr += min;
+ break;
+
+ case OP_ANYBYTE:
+ eptr += min;
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ for (i = 1; i <= min; i++)
+ if ((md->ctypes[*eptr++] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+
+ /* If min = max, continue at the same level without recursing */
+
+ if (min == max) continue;
+
+ /* If minimizing, we have to test the rest of the pattern before each
+ subsequent match. Again, separate the UTF-8 case for speed, and also
+ separate the UCP cases. */
+
+ if (minimize)
+ {
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+ if (md->utf8)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+
+ GETCHARINC(c, eptr);
+ switch(ctype)
+ {
+ case OP_ANY:
+ if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_ANYBYTE:
+ break;
+
+ case OP_NOT_DIGIT:
+ if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+ }
+ else
+#endif
+ /* Not UTF-8 mode */
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ c = *eptr++;
+ switch(ctype)
+ {
+ case OP_ANY:
+ if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_ANYBYTE:
+ break;
+
+ case OP_NOT_DIGIT:
+ if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_DIGIT:
+ if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WHITESPACE:
+ if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WHITESPACE:
+ if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_NOT_WORDCHAR:
+ if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ case OP_WORDCHAR:
+ if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+ }
+ }
+ /* Control never gets here */
+ }
+
+ /* If maximizing it is worth using inline code for speed, doing the type
+ test once at the start (i.e. keep it out of the loop). Again, keep the
+ UTF-8 and UCP stuff separate. */
+
+ else
+ {
+ pp = eptr; /* Remember where we started */
+
+#ifdef SUPPORT_UCP
+ if (prop_type > 0)
+ {
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if ((*prop_test_variable == prop_test_against) == prop_fail_result)
+ break;
+ eptr+= len;
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+
+ /* Match extended Unicode sequences. We will get here only if the
+ support is in the binary; otherwise a compile-time error occurs. */
+
+ else if (ctype == OP_EXTUNI)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ GETCHARINCTEST(c, eptr);
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category == ucp_M) break;
+ while (eptr < md->end_subject)
+ {
+ int len = 1;
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr += len;
+ }
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ for (;;) /* Move back over one extended */
+ {
+ int len = 1;
+ BACKCHAR(eptr);
+ if (!md->utf8) c = *eptr; else
+ {
+ GETCHARLEN(c, eptr, len);
+ }
+ prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
+ if (prop_category != ucp_M) break;
+ eptr--;
+ }
+ }
+ }
+
+ else
+#endif /* SUPPORT_UCP */
+
+#ifdef SUPPORT_UTF8
+ /* UTF-8 mode */
+
+ if (md->utf8)
+ {
+ switch(ctype)
+ {
+ case OP_ANY:
+
+ /* Special code is required for UTF8, but when the maximum is unlimited
+ we don't need it, so we repeat the non-UTF8 code. This is probably
+ worth it, because .* is quite a common idiom. */
+
+ if (max < INT_MAX)
+ {
+ if ((ims & PCRE_DOTALL) == 0)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || *eptr == NEWLINE) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ }
+ else
+ {
+ for (i = min; i < max; i++)
+ {
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ }
+ }
+
+ /* Handle unlimited UTF-8 repeat */
+
+ else
+ {
+ if ((ims & PCRE_DOTALL) == 0)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || *eptr == NEWLINE) break;
+ eptr++;
+ }
+ break;
+ }
+ else
+ {
+ c = max - min;
+ if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ eptr += c;
+ }
+ }
+ break;
+
+ /* The byte case is the same as non-UTF8 */
+
+ case OP_ANYBYTE:
+ c = max - min;
+ if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ eptr += c;
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
+ eptr+= len;
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
+ eptr+= len;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ for(;;)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
+ }
+ }
+ else
+#endif
+
+ /* Not UTF-8 mode */
+ {
+ switch(ctype)
+ {
+ case OP_ANY:
+ if ((ims & PCRE_DOTALL) == 0)
+ {
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || *eptr == NEWLINE) break;
+ eptr++;
+ }
+ break;
+ }
+ /* For DOTALL case, fall through and treat as \C */
+
+ case OP_ANYBYTE:
+ c = max - min;
+ if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ eptr += c;
+ break;
+
+ case OP_NOT_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_DIGIT:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_WHITESPACE:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ case OP_WORDCHAR:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
+ break;
+ eptr++;
+ }
+ break;
+
+ default:
+ RRETURN(PCRE_ERROR_INTERNAL);
+ }
+
+ /* eptr is now past the end of the maximum run */
+
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ eptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+ }
+
+ /* Get here if we can't make it match with any permitted repetitions */
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ /* There's been some horrible disaster. Since all codes > OP_BRA are
+ for capturing brackets, and there shouldn't be any gaps between 0 and
+ OP_BRA, arrival here can only mean there is something seriously wrong
+ in the code above or the OP_xxx definitions. */
+
+ default:
+ DPRINTF(("Unknown opcode %d\n", *ecode));
+ RRETURN(PCRE_ERROR_UNKNOWN_NODE);
+ }
+
+ /* Do not stick any code in here without much thought; it is assumed
+ that "continue" in the code above comes out to here to repeat the main
+ loop. */
+
+ } /* End of main loop */
+/* Control never reaches here */
+}
+
+
+/***************************************************************************
+****************************************************************************
+ RECURSION IN THE match() FUNCTION
+
+Undefine all the macros that were defined above to handle this. */
+
+#ifdef NO_RECURSE
+#undef eptr
+#undef ecode
+#undef offset_top
+#undef ims
+#undef eptrb
+#undef flags
+
+#undef callpat
+#undef charptr
+#undef data
+#undef next
+#undef pp
+#undef prev
+#undef saved_eptr
+
+#undef new_recursive
+
+#undef cur_is_word
+#undef condition
+#undef minimize
+#undef prev_is_word
+
+#undef original_ims
+
+#undef ctype
+#undef length
+#undef max
+#undef min
+#undef number
+#undef offset
+#undef op
+#undef save_capture_last
+#undef save_offset1
+#undef save_offset2
+#undef save_offset3
+#undef stacksave
+
+#undef newptrb
+
+#endif
+
+/* These two are defined as macros in both cases */
+
+#undef fc
+#undef fi
+
+/***************************************************************************
+***************************************************************************/
+
+
+
+/*************************************************
+* Execute a Regular Expression *
+*************************************************/
+
+/* This function applies a compiled re to a subject string and picks out
+portions of the string if it matches. Two elements in the vector are set for
+each substring: the offsets to the start and end of the substring.
+
+Arguments:
+ argument_re points to the compiled expression
+ extra_data points to extra data or is NULL
+ subject points to the subject string
+ length length of subject string (may contain binary zeros)
+ start_offset where to start in the subject string
+ options option bits
+ offsets points to a vector of ints to be filled in with offsets
+ offsetcount the number of elements in the vector
+
+Returns: > 0 => success; value is the number of elements filled in
+ = 0 => success, but offsets is not big enough
+ -1 => failed to match
+ < -1 => some kind of unexpected problem
+*/
+
+EXPORT int
+pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
+ const char *subject, int length, int start_offset, int options, int *offsets,
+ int offsetcount)
+{
+int rc, resetcount, ocount;
+int first_byte = -1;
+int req_byte = -1;
+int req_byte2 = -1;
+unsigned long int ims = 0;
+BOOL using_temporary_offsets = FALSE;
+BOOL anchored;
+BOOL startline;
+BOOL first_byte_caseless = FALSE;
+BOOL req_byte_caseless = FALSE;
+match_data match_block;
+const uschar *tables;
+const uschar *start_bits = NULL;
+const uschar *start_match = (const uschar *)subject + start_offset;
+const uschar *end_subject;
+const uschar *req_byte_ptr = start_match - 1;
+
+pcre_study_data internal_study;
+const pcre_study_data *study;
+
+real_pcre internal_re;
+const real_pcre *external_re = (const real_pcre *)argument_re;
+const real_pcre *re = external_re;
+
+/* Plausibility checks */
+
+if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
+if (re == NULL || subject == NULL ||
+ (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
+if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
+
+/* Fish out the optional data from the extra_data structure, first setting
+the default values. */
+
+study = NULL;
+match_block.match_limit = MATCH_LIMIT;
+match_block.callout_data = NULL;
+
+/* The table pointer is always in native byte order. */
+
+tables = external_re->tables;
+
+if (extra_data != NULL)
+ {
+ register unsigned int flags = extra_data->flags;
+ if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
+ study = (const pcre_study_data *)extra_data->study_data;
+ if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
+ match_block.match_limit = extra_data->match_limit;
+ if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
+ match_block.callout_data = extra_data->callout_data;
+ if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
+ }
+
+/* If the exec call supplied NULL for tables, use the inbuilt ones. This
+is a feature that makes it possible to save compiled regex and re-use them
+in other programs later. */
+
+if (tables == NULL) tables = pcre_default_tables;
+
+/* Check that the first field in the block is the magic number. If it is not,
+test for a regex that was compiled on a host of opposite endianness. If this is
+the case, flipped values are put in internal_re and internal_study if there was
+study data too. */
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ re = try_flipped(re, &internal_re, study, &internal_study);
+ if (re == NULL) return PCRE_ERROR_BADMAGIC;
+ if (study != NULL) study = &internal_study;
+ }
+
+/* Set up other data */
+
+anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
+startline = (re->options & PCRE_STARTLINE) != 0;
+
+/* The code starts after the real_pcre block and the capture name table. */
+
+match_block.start_code = (const uschar *)external_re + re->name_table_offset +
+ re->name_count * re->name_entry_size;
+
+match_block.start_subject = (const uschar *)subject;
+match_block.start_offset = start_offset;
+match_block.end_subject = match_block.start_subject + length;
+end_subject = match_block.end_subject;
+
+match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
+match_block.utf8 = (re->options & PCRE_UTF8) != 0;
+
+match_block.notbol = (options & PCRE_NOTBOL) != 0;
+match_block.noteol = (options & PCRE_NOTEOL) != 0;
+match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
+match_block.partial = (options & PCRE_PARTIAL) != 0;
+match_block.hitend = FALSE;
+
+match_block.recursive = NULL; /* No recursion at top level */
+
+match_block.lcc = tables + lcc_offset;
+match_block.ctypes = tables + ctypes_offset;
+
+/* Partial matching is supported only for a restricted set of regexes at the
+moment. */
+
+if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
+ return PCRE_ERROR_BADPARTIAL;
+
+/* Check a UTF-8 string if required. Unfortunately there's no way of passing
+back the character offset. */
+
+#ifdef SUPPORT_UTF8
+if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+ {
+ if (valid_utf8((uschar *)subject, length) >= 0)
+ return PCRE_ERROR_BADUTF8;
+ if (start_offset > 0 && start_offset < length)
+ {
+ int tb = ((uschar *)subject)[start_offset];
+ if (tb > 127)
+ {
+ tb &= 0xc0;
+ if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
+ }
+ }
+ }
+#endif
+
+/* The ims options can vary during the matching as a result of the presence
+of (?ims) items in the pattern. They are kept in a local variable so that
+restoring at the exit of a group is easy. */
+
+ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
+
+/* If the expression has got more back references than the offsets supplied can
+hold, we get a temporary chunk of working store to use during the matching.
+Otherwise, we can use the vector supplied, rounding down its size to a multiple
+of 3. */
+
+ocount = offsetcount - (offsetcount % 3);
+
+if (re->top_backref > 0 && re->top_backref >= ocount/3)
+ {
+ ocount = re->top_backref * 3 + 3;
+ match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
+ if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
+ using_temporary_offsets = TRUE;
+ DPRINTF(("Got memory to hold back references\n"));
+ }
+else match_block.offset_vector = offsets;
+
+match_block.offset_end = ocount;
+match_block.offset_max = (2*ocount)/3;
+match_block.offset_overflow = FALSE;
+match_block.capture_last = -1;
+
+/* Compute the minimum number of offsets that we need to reset each time. Doing
+this makes a huge difference to execution time when there aren't many brackets
+in the pattern. */
+
+resetcount = 2 + re->top_bracket * 2;
+if (resetcount > offsetcount) resetcount = ocount;
+
+/* Reset the working variable associated with each extraction. These should
+never be used unless previously set, but they get saved and restored, and so we
+initialize them to avoid reading uninitialized locations. */
+
+if (match_block.offset_vector != NULL)
+ {
+ register int *iptr = match_block.offset_vector + ocount;
+ register int *iend = iptr - resetcount/2 + 1;
+ while (--iptr >= iend) *iptr = -1;
+ }
+
+/* Set up the first character to match, if available. The first_byte value is
+never set for an anchored regular expression, but the anchoring may be forced
+at run time, so we have to test for anchoring. The first char may be unset for
+an unanchored pattern, of course. If there's no first char and the pattern was
+studied, there may be a bitmap of possible first characters. */
+
+if (!anchored)
+ {
+ if ((re->options & PCRE_FIRSTSET) != 0)
+ {
+ first_byte = re->first_byte & 255;
+ if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
+ first_byte = match_block.lcc[first_byte];
+ }
+ else
+ if (!startline && study != NULL &&
+ (study->options & PCRE_STUDY_MAPPED) != 0)
+ start_bits = study->start_bits;
+ }
+
+/* For anchored or unanchored matches, there may be a "last known required
+character" set. */
+
+if ((re->options & PCRE_REQCHSET) != 0)
+ {
+ req_byte = re->req_byte & 255;
+ req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
+ req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
+ }
+
+/* Loop for handling unanchored repeated matching attempts; for anchored regexs
+the loop runs just once. */
+
+do
+ {
+ /* Reset the maximum number of extractions we might see. */
+
+ if (match_block.offset_vector != NULL)
+ {
+ register int *iptr = match_block.offset_vector;
+ register int *iend = iptr + resetcount;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
+ /* Advance to a unique first char if possible */
+
+ if (first_byte >= 0)
+ {
+ if (first_byte_caseless)
+ while (start_match < end_subject &&
+ match_block.lcc[*start_match] != first_byte)
+ start_match++;
+ else
+ while (start_match < end_subject && *start_match != first_byte)
+ start_match++;
+ }
+
+ /* Or to just after \n for a multiline match if possible */
+
+ else if (startline)
+ {
+ if (start_match > match_block.start_subject + start_offset)
+ {
+ while (start_match < end_subject && start_match[-1] != NEWLINE)
+ start_match++;
+ }
+ }
+
+ /* Or to a non-unique first char after study */
+
+ else if (start_bits != NULL)
+ {
+ while (start_match < end_subject)
+ {
+ register unsigned int c = *start_match;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
+ }
+ }
+
+#ifdef DEBUG /* Sigh. Some compilers never learn. */
+ printf(">>>> Match against: ");
+ pchars(start_match, end_subject - start_match, TRUE, &match_block);
+ printf("\n");
+#endif
+
+ /* If req_byte is set, we know that that character must appear in the subject
+ for the match to succeed. If the first character is set, req_byte must be
+ later in the subject; otherwise the test starts at the match point. This
+ optimization can save a huge amount of backtracking in patterns with nested
+ unlimited repeats that aren't going to match. Writing separate code for
+ cased/caseless versions makes it go faster, as does using an autoincrement
+ and backing off on a match.
+
+ HOWEVER: when the subject string is very, very long, searching to its end can
+ take a long time, and give bad performance on quite ordinary patterns. This
+ showed up when somebody was matching /^C/ on a 32-megabyte string... so we
+ don't do this when the string is sufficiently long.
+
+ ALSO: this processing is disabled when partial matching is requested.
+ */
+
+ if (req_byte >= 0 &&
+ end_subject - start_match < REQ_BYTE_MAX &&
+ !match_block.partial)
+ {
+ register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
+
+ /* We don't need to repeat the search if we haven't yet reached the
+ place we found it at last time. */
+
+ if (p > req_byte_ptr)
+ {
+ if (req_byte_caseless)
+ {
+ while (p < end_subject)
+ {
+ register int pp = *p++;
+ if (pp == req_byte || pp == req_byte2) { p--; break; }
+ }
+ }
+ else
+ {
+ while (p < end_subject)
+ {
+ if (*p++ == req_byte) { p--; break; }
+ }
+ }
+
+ /* If we can't find the required character, break the matching loop */
+
+ if (p >= end_subject) break;
+
+ /* If we have found the required character, save the point where we
+ found it, so that we don't search again next time round the loop if
+ the start hasn't passed this character yet. */
+
+ req_byte_ptr = p;
+ }
+ }
+
+ /* When a match occurs, substrings will be set for all internal extractions;
+ we just need to set up the whole thing as substring 0 before returning. If
+ there were too many extractions, set the return code to zero. In the case
+ where we had to get some local store to hold offsets for backreferences, copy
+ those back references that we can. In this case there need not be overflow
+ if certain parts of the pattern were not used. */
+
+ match_block.start_match = start_match;
+ match_block.match_call_count = 0;
+
+ rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
+ match_isgroup);
+
+ if (rc == MATCH_NOMATCH)
+ {
+ start_match++;
+#ifdef SUPPORT_UTF8
+ if (match_block.utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+#endif
+ continue;
+ }
+
+ if (rc != MATCH_MATCH)
+ {
+ DPRINTF((">>>> error: returning %d\n", rc));
+ return rc;
+ }
+
+ /* We have a match! Copy the offset information from temporary store if
+ necessary */
+
+ if (using_temporary_offsets)
+ {
+ if (offsetcount >= 4)
+ {
+ memcpy(offsets + 2, match_block.offset_vector + 2,
+ (offsetcount - 2) * sizeof(int));
+ DPRINTF(("Copied offsets from temporary memory\n"));
+ }
+ if (match_block.end_offset_top > offsetcount)
+ match_block.offset_overflow = TRUE;
+
+ DPRINTF(("Freeing temporary memory\n"));
+ (pcre_free)(match_block.offset_vector);
+ }
+
+ rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
+
+ if (offsetcount < 2) rc = 0; else
+ {
+ offsets[0] = start_match - match_block.start_subject;
+ offsets[1] = match_block.end_match_ptr - match_block.start_subject;
+ }
+
+ DPRINTF((">>>> returning %d\n", rc));
+ return rc;
+ }
+
+/* This "while" is the end of the "do" above */
+
+while (!anchored && start_match <= end_subject);
+
+if (using_temporary_offsets)
+ {
+ DPRINTF(("Freeing temporary memory\n"));
+ (pcre_free)(match_block.offset_vector);
+ }
+
+if (match_block.partial && match_block.hitend)
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
+ return PCRE_ERROR_PARTIAL;
+ }
+else
+ {
+ DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
+ return PCRE_ERROR_NOMATCH;
+ }
+}
+
+/* End of pcre.c */
diff --git a/src/src/pcre/pcre.h b/src/src/pcre/pcre.h
new file mode 100644
index 000000000..ee5bf16e7
--- /dev/null
+++ b/src/src/pcre/pcre.h
@@ -0,0 +1,239 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* In its original form, this is the .in file that is transformed by
+"configure" into pcre.h.
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+#ifndef _PCRE_H
+#define _PCRE_H
+
+/* The file pcre.h is build by "configure". Do not edit it; instead
+make changes to pcre.in. */
+
+#define PCRE_MAJOR 5
+#define PCRE_MINOR 0
+#define PCRE_DATE 13-Sep-2004
+
+/* Win32 uses DLL by default */
+
+#ifdef _WIN32
+# ifdef PCRE_DEFINITION
+# ifdef DLL_EXPORT
+# define PCRE_DATA_SCOPE __declspec(dllexport)
+# endif
+# else
+# ifndef PCRE_STATIC
+# define PCRE_DATA_SCOPE extern __declspec(dllimport)
+# endif
+# endif
+#endif
+#ifndef PCRE_DATA_SCOPE
+# define PCRE_DATA_SCOPE extern
+#endif
+
+/* Have to include stdlib.h in order to ensure that size_t is defined;
+it is needed here for malloc. */
+
+#include <stdlib.h>
+
+/* Allow for C++ users */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Options */
+
+#define PCRE_CASELESS 0x0001
+#define PCRE_MULTILINE 0x0002
+#define PCRE_DOTALL 0x0004
+#define PCRE_EXTENDED 0x0008
+#define PCRE_ANCHORED 0x0010
+#define PCRE_DOLLAR_ENDONLY 0x0020
+#define PCRE_EXTRA 0x0040
+#define PCRE_NOTBOL 0x0080
+#define PCRE_NOTEOL 0x0100
+#define PCRE_UNGREEDY 0x0200
+#define PCRE_NOTEMPTY 0x0400
+#define PCRE_UTF8 0x0800
+#define PCRE_NO_AUTO_CAPTURE 0x1000
+#define PCRE_NO_UTF8_CHECK 0x2000
+#define PCRE_AUTO_CALLOUT 0x4000
+#define PCRE_PARTIAL 0x8000
+
+/* Exec-time and get/set-time error codes */
+
+#define PCRE_ERROR_NOMATCH (-1)
+#define PCRE_ERROR_NULL (-2)
+#define PCRE_ERROR_BADOPTION (-3)
+#define PCRE_ERROR_BADMAGIC (-4)
+#define PCRE_ERROR_UNKNOWN_NODE (-5)
+#define PCRE_ERROR_NOMEMORY (-6)
+#define PCRE_ERROR_NOSUBSTRING (-7)
+#define PCRE_ERROR_MATCHLIMIT (-8)
+#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */
+#define PCRE_ERROR_BADUTF8 (-10)
+#define PCRE_ERROR_BADUTF8_OFFSET (-11)
+#define PCRE_ERROR_PARTIAL (-12)
+#define PCRE_ERROR_BADPARTIAL (-13)
+#define PCRE_ERROR_INTERNAL (-14)
+#define PCRE_ERROR_BADCOUNT (-15)
+
+/* Request types for pcre_fullinfo() */
+
+#define PCRE_INFO_OPTIONS 0
+#define PCRE_INFO_SIZE 1
+#define PCRE_INFO_CAPTURECOUNT 2
+#define PCRE_INFO_BACKREFMAX 3
+#define PCRE_INFO_FIRSTBYTE 4
+#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */
+#define PCRE_INFO_FIRSTTABLE 5
+#define PCRE_INFO_LASTLITERAL 6
+#define PCRE_INFO_NAMEENTRYSIZE 7
+#define PCRE_INFO_NAMECOUNT 8
+#define PCRE_INFO_NAMETABLE 9
+#define PCRE_INFO_STUDYSIZE 10
+#define PCRE_INFO_DEFAULT_TABLES 11
+
+/* Request types for pcre_config() */
+
+#define PCRE_CONFIG_UTF8 0
+#define PCRE_CONFIG_NEWLINE 1
+#define PCRE_CONFIG_LINK_SIZE 2
+#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
+#define PCRE_CONFIG_MATCH_LIMIT 4
+#define PCRE_CONFIG_STACKRECURSE 5
+#define PCRE_CONFIG_UNICODE_PROPERTIES 6
+
+/* Bit flags for the pcre_extra structure */
+
+#define PCRE_EXTRA_STUDY_DATA 0x0001
+#define PCRE_EXTRA_MATCH_LIMIT 0x0002
+#define PCRE_EXTRA_CALLOUT_DATA 0x0004
+#define PCRE_EXTRA_TABLES 0x0008
+
+/* Types */
+
+struct real_pcre; /* declaration; the definition is private */
+typedef struct real_pcre pcre;
+
+/* The structure for passing additional data to pcre_exec(). This is defined in
+such as way as to be extensible. Always add new fields at the end, in order to
+remain compatible. */
+
+typedef struct pcre_extra {
+ unsigned long int flags; /* Bits for which fields are set */
+ void *study_data; /* Opaque data from pcre_study() */
+ unsigned long int match_limit; /* Maximum number of calls to match() */
+ void *callout_data; /* Data passed back in callouts */
+ const unsigned char *tables; /* Pointer to character tables */
+} pcre_extra;
+
+/* The structure for passing out data via the pcre_callout_function. We use a
+structure so that new fields can be added on the end in future versions,
+without changing the API of the function, thereby allowing old clients to work
+without modification. */
+
+typedef struct pcre_callout_block {
+ int version; /* Identifies version of block */
+ /* ------------------------ Version 0 ------------------------------- */
+ int callout_number; /* Number compiled into pattern */
+ int *offset_vector; /* The offset vector */
+ const char *subject; /* The subject being matched */
+ int subject_length; /* The length of the subject */
+ int start_match; /* Offset to start of this match attempt */
+ int current_position; /* Where we currently are in the subject */
+ int capture_top; /* Max current capture */
+ int capture_last; /* Most recently closed capture */
+ void *callout_data; /* Data passed in with the call */
+ /* ------------------- Added for Version 1 -------------------------- */
+ int pattern_position; /* Offset to next item in the pattern */
+ int next_item_length; /* Length of next item in the pattern */
+ /* ------------------------------------------------------------------ */
+} pcre_callout_block;
+
+/* Indirection for store get and free functions. These can be set to
+alternative malloc/free functions if required. Special ones are used in the
+non-recursive case for "frames". There is also an optional callout function
+that is triggered by the (?) regex item. Some magic is required for Win32 DLL;
+it is null on other OS. For Virtual Pascal, these have to be different again.
+*/
+
+#ifndef VPCOMPAT
+PCRE_DATA_SCOPE void *(*pcre_malloc)(size_t);
+PCRE_DATA_SCOPE void (*pcre_free)(void *);
+PCRE_DATA_SCOPE void *(*pcre_stack_malloc)(size_t);
+PCRE_DATA_SCOPE void (*pcre_stack_free)(void *);
+PCRE_DATA_SCOPE int (*pcre_callout)(pcre_callout_block *);
+#else /* VPCOMPAT */
+extern void *pcre_malloc(size_t);
+extern void pcre_free(void *);
+extern void *pcre_stack_malloc(size_t);
+extern void pcre_stack_free(void *);
+extern int pcre_callout(pcre_callout_block *);
+#endif /* VPCOMPAT */
+
+/* Exported PCRE functions */
+
+extern pcre *pcre_compile(const char *, int, const char **,
+ int *, const unsigned char *);
+extern int pcre_config(int, void *);
+extern int pcre_copy_named_substring(const pcre *, const char *,
+ int *, int, const char *, char *, int);
+extern int pcre_copy_substring(const char *, int *, int, int,
+ char *, int);
+extern int pcre_exec(const pcre *, const pcre_extra *,
+ const char *, int, int, int, int *, int);
+extern void pcre_free_substring(const char *);
+extern void pcre_free_substring_list(const char **);
+extern int pcre_fullinfo(const pcre *, const pcre_extra *, int,
+ void *);
+extern int pcre_get_named_substring(const pcre *, const char *,
+ int *, int, const char *, const char **);
+extern int pcre_get_stringnumber(const pcre *, const char *);
+extern int pcre_get_substring(const char *, int *, int, int,
+ const char **);
+extern int pcre_get_substring_list(const char *, int *, int,
+ const char ***);
+extern int pcre_info(const pcre *, int *, int *);
+extern const unsigned char *pcre_maketables(void);
+extern pcre_extra *pcre_study(const pcre *, int, const char **);
+extern const char *pcre_version(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* End of pcre.h */
diff --git a/src/src/pcre/pcretest.c b/src/src/pcre/pcretest.c
new file mode 100644
index 000000000..087ef2c04
--- /dev/null
+++ b/src/src/pcre/pcretest.c
@@ -0,0 +1,1789 @@
+/*************************************************
+* PCRE testing program *
+*************************************************/
+
+/* This program was hacked up as a tester for PCRE. I really should have
+written it more tidily in the first place. Will I ever learn? It has grown and
+been extended and consequently is now rather untidy in places.
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <locale.h>
+#include <errno.h>
+
+/* We need the internal info for displaying the results of pcre_study(). Also
+for getting the opcodes for showing compiled code. */
+
+#define PCRE_SPY /* For Win32 build, import data, not export */
+#include "internal.h"
+
+/* It is possible to compile this test program without including support for
+testing the POSIX interface, though this is not available via the standard
+Makefile. */
+
+#if !defined NOPOSIX
+#include "pcreposix.h"
+#endif
+
+#ifndef CLOCKS_PER_SEC
+#ifdef CLK_TCK
+#define CLOCKS_PER_SEC CLK_TCK
+#else
+#define CLOCKS_PER_SEC 100
+#endif
+#endif
+
+#define LOOPREPEAT 500000
+
+#define BUFFER_SIZE 30000
+#define PBUFFER_SIZE BUFFER_SIZE
+#define DBUFFER_SIZE BUFFER_SIZE
+
+
+static FILE *outfile;
+static int log_store = 0;
+static int callout_count;
+static int callout_extra;
+static int callout_fail_count;
+static int callout_fail_id;
+static int first_callout;
+static int show_malloc;
+static int use_utf8;
+static size_t gotten_store;
+
+static uschar *pbuffer = NULL;
+
+
+static const int utf8_table1[] = {
+ 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+
+static const int utf8_table2[] = {
+ 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+static const int utf8_table3[] = {
+ 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+
+
+
+/*************************************************
+* Print compiled regex *
+*************************************************/
+
+/* The code for doing this is held in a separate file that is also included in
+pcre.c when it is compiled with the debug switch. It defines a function called
+print_internals(), which uses a table of opcode lengths defined by the macro
+OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
+Unicode property names to numbers; this is kept in a separate file. */
+
+static uschar OP_lengths[] = { OP_LENGTHS };
+
+#ifdef SUPPORT_UCP
+#include "ucp.h"
+#include "ucptypetable.c"
+#endif
+
+#include "printint.c"
+
+
+
+/*************************************************
+* Read number from string *
+*************************************************/
+
+/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
+around with conditional compilation, just do the job by hand. It is only used
+for unpicking the -o argument, so just keep it simple.
+
+Arguments:
+ str string to be converted
+ endptr where to put the end pointer
+
+Returns: the unsigned long
+*/
+
+static int
+get_value(unsigned char *str, unsigned char **endptr)
+{
+int result = 0;
+while(*str != 0 && isspace(*str)) str++;
+while (isdigit(*str)) result = result * 10 + (int)(*str++ - '0');
+*endptr = str;
+return(result);
+}
+
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 0 to 6 bytes.
+
+Arguments:
+ cvalue the character value
+ buffer pointer to buffer for result - at least 6 bytes long
+
+Returns: number of characters placed in the buffer
+ -1 if input character is negative
+ 0 if input character is positive but too big (only when
+ int is longer than 32 bits)
+*/
+
+static int
+ord2utf8(int cvalue, unsigned char *buffer)
+{
+register int i, j;
+for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+ if (cvalue <= utf8_table1[i]) break;
+if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
+if (cvalue < 0) return -1;
+
+buffer += i;
+for (j = i; j > 0; j--)
+ {
+ *buffer-- = 0x80 | (cvalue & 0x3f);
+ cvalue >>= 6;
+ }
+*buffer = utf8_table2[i] | cvalue;
+return i + 1;
+}
+
+
+/*************************************************
+* Convert UTF-8 string to value *
+*************************************************/
+
+/* This function takes one or more bytes that represents a UTF-8 character,
+and returns the value of the character.
+
+Argument:
+ buffer a pointer to the byte vector
+ vptr a pointer to an int to receive the value
+
+Returns: > 0 => the number of bytes consumed
+ -6 to 0 => malformed UTF-8 character at offset = (-return)
+*/
+
+static int
+utf82ord(unsigned char *buffer, int *vptr)
+{
+int c = *buffer++;
+int d = c;
+int i, j, s;
+
+for (i = -1; i < 6; i++) /* i is number of additional bytes */
+ {
+ if ((d & 0x80) == 0) break;
+ d <<= 1;
+ }
+
+if (i == -1) { *vptr = c; return 1; } /* ascii character */
+if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
+
+/* i now has a value in the range 1-5 */
+
+s = 6*i;
+d = (c & utf8_table3[i]) << s;
+
+for (j = 0; j < i; j++)
+ {
+ c = *buffer++;
+ if ((c & 0xc0) != 0x80) return -(j+1);
+ s -= 6;
+ d |= (c & 0x3f) << s;
+ }
+
+/* Check that encoding was the correct unique one */
+
+for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
+ if (d <= utf8_table1[j]) break;
+if (j != i) return -(i+1);
+
+/* Valid value */
+
+*vptr = d;
+return i+1;
+}
+
+
+
+/*************************************************
+* Print character string *
+*************************************************/
+
+/* Character string printing function. Must handle UTF-8 strings in utf8
+mode. Yields number of characters printed. If handed a NULL file, just counts
+chars without printing. */
+
+static int pchars(unsigned char *p, int length, FILE *f)
+{
+int c;
+int yield = 0;
+
+while (length-- > 0)
+ {
+ if (use_utf8)
+ {
+ int rc = utf82ord(p, &c);
+
+ if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
+ {
+ length -= rc - 1;
+ p += rc;
+ if (c < 256 && isprint(c))
+ {
+ if (f != NULL) fprintf(f, "%c", c);
+ yield++;
+ }
+ else
+ {
+ int n;
+ if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
+ yield += n;
+ }
+ continue;
+ }
+ }
+
+ /* Not UTF-8, or malformed UTF-8 */
+
+ if (isprint(c = *(p++)))
+ {
+ if (f != NULL) fprintf(f, "%c", c);
+ yield++;
+ }
+ else
+ {
+ if (f != NULL) fprintf(f, "\\x%02x", c);
+ yield += 4;
+ }
+ }
+
+return yield;
+}
+
+
+
+/*************************************************
+* Callout function *
+*************************************************/
+
+/* Called from PCRE as a result of the (?C) item. We print out where we are in
+the match. Yield zero unless more callouts than the fail count, or the callout
+data is not zero. */
+
+static int callout(pcre_callout_block *cb)
+{
+FILE *f = (first_callout | callout_extra)? outfile : NULL;
+int i, pre_start, post_start, subject_length;
+
+if (callout_extra)
+ {
+ fprintf(f, "Callout %d: last capture = %d\n",
+ cb->callout_number, cb->capture_last);
+
+ for (i = 0; i < cb->capture_top * 2; i += 2)
+ {
+ if (cb->offset_vector[i] < 0)
+ fprintf(f, "%2d: <unset>\n", i/2);
+ else
+ {
+ fprintf(f, "%2d: ", i/2);
+ (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
+ cb->offset_vector[i+1] - cb->offset_vector[i], f);
+ fprintf(f, "\n");
+ }
+ }
+ }
+
+/* Re-print the subject in canonical form, the first time or if giving full
+datails. On subsequent calls in the same match, we use pchars just to find the
+printed lengths of the substrings. */
+
+if (f != NULL) fprintf(f, "--->");
+
+pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
+post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
+ cb->current_position - cb->start_match, f);
+
+subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
+
+(void)pchars((unsigned char *)(cb->subject + cb->current_position),
+ cb->subject_length - cb->current_position, f);
+
+if (f != NULL) fprintf(f, "\n");
+
+/* Always print appropriate indicators, with callout number if not already
+shown. For automatic callouts, show the pattern offset. */
+
+if (cb->callout_number == 255)
+ {
+ fprintf(outfile, "%+3d ", cb->pattern_position);
+ if (cb->pattern_position > 99) fprintf(outfile, "\n ");
+ }
+else
+ {
+ if (callout_extra) fprintf(outfile, " ");
+ else fprintf(outfile, "%3d ", cb->callout_number);
+ }
+
+for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
+fprintf(outfile, "^");
+
+if (post_start > 0)
+ {
+ for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
+ fprintf(outfile, "^");
+ }
+
+for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
+ fprintf(outfile, " ");
+
+fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
+ pbuffer + cb->pattern_position);
+
+fprintf(outfile, "\n");
+first_callout = 0;
+
+if (cb->callout_data != NULL)
+ {
+ int callout_data = *((int *)(cb->callout_data));
+ if (callout_data != 0)
+ {
+ fprintf(outfile, "Callout data = %d\n", callout_data);
+ return callout_data;
+ }
+ }
+
+return (cb->callout_number != callout_fail_id)? 0 :
+ (++callout_count >= callout_fail_count)? 1 : 0;
+}
+
+
+/*************************************************
+* Local malloc functions *
+*************************************************/
+
+/* Alternative malloc function, to test functionality and show the size of the
+compiled re. */
+
+static void *new_malloc(size_t size)
+{
+void *block = malloc(size);
+gotten_store = size;
+if (show_malloc)
+ fprintf(outfile, "malloc %3d %p\n", (int)size, block);
+return block;
+}
+
+static void new_free(void *block)
+{
+if (show_malloc)
+ fprintf(outfile, "free %p\n", block);
+free(block);
+}
+
+
+/* For recursion malloc/free, to test stacking calls */
+
+static void *stack_malloc(size_t size)
+{
+void *block = malloc(size);
+if (show_malloc)
+ fprintf(outfile, "stack_malloc %3d %p\n", (int)size, block);
+return block;
+}
+
+static void stack_free(void *block)
+{
+if (show_malloc)
+ fprintf(outfile, "stack_free %p\n", block);
+free(block);
+}
+
+
+/*************************************************
+* Call pcre_fullinfo() *
+*************************************************/
+
+/* Get one piece of information from the pcre_fullinfo() function */
+
+static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
+{
+int rc;
+if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
+ fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
+}
+
+
+
+/*************************************************
+* Byte flipping function *
+*************************************************/
+
+static long int
+byteflip(long int value, int n)
+{
+if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
+return ((value & 0x000000ff) << 24) |
+ ((value & 0x0000ff00) << 8) |
+ ((value & 0x00ff0000) >> 8) |
+ ((value & 0xff000000) >> 24);
+}
+
+
+
+
+/*************************************************
+* Main Program *
+*************************************************/
+
+/* Read lines from named file or stdin and write to named file or stdout; lines
+consist of a regular expression, in delimiters and optionally followed by
+options, followed by a set of test data, terminated by an empty line. */
+
+int main(int argc, char **argv)
+{
+FILE *infile = stdin;
+int options = 0;
+int study_options = 0;
+int op = 1;
+int timeit = 0;
+int showinfo = 0;
+int showstore = 0;
+int size_offsets = 45;
+int size_offsets_max;
+int *offsets;
+#if !defined NOPOSIX
+int posix = 0;
+#endif
+int debug = 0;
+int done = 0;
+
+unsigned char *buffer;
+unsigned char *dbuffer;
+
+/* Get buffers from malloc() so that Electric Fence will check their misuse
+when I am debugging. */
+
+buffer = (unsigned char *)malloc(BUFFER_SIZE);
+dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
+pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
+
+/* The outfile variable is static so that new_malloc can use it. The _setmode()
+stuff is some magic that I don't understand, but which apparently does good
+things in Windows. It's related to line terminations. */
+
+#if defined(_WIN32) || defined(WIN32)
+_setmode( _fileno( stdout ), 0x8000 );
+#endif /* defined(_WIN32) || defined(WIN32) */
+
+outfile = stdout;
+
+/* Scan options */
+
+while (argc > 1 && argv[op][0] == '-')
+ {
+ unsigned char *endptr;
+
+ if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
+ showstore = 1;
+ else if (strcmp(argv[op], "-t") == 0) timeit = 1;
+ else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
+ else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
+ else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
+ ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
+ *endptr == 0))
+ {
+ op++;
+ argc--;
+ }
+#if !defined NOPOSIX
+ else if (strcmp(argv[op], "-p") == 0) posix = 1;
+#endif
+ else if (strcmp(argv[op], "-C") == 0)
+ {
+ int rc;
+ printf("PCRE version %s\n", pcre_version());
+ printf("Compiled with\n");
+ (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
+ printf(" %sUTF-8 support\n", rc? "" : "No ");
+ (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
+ printf(" %sUnicode properties support\n", rc? "" : "No ");
+ (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
+ printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
+ (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
+ printf(" Internal link size = %d\n", rc);
+ (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
+ printf(" POSIX malloc threshold = %d\n", rc);
+ (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
+ printf(" Default match limit = %d\n", rc);
+ (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
+ printf(" Match recursion uses %s\n", rc? "stack" : "heap");
+ exit(0);
+ }
+ else
+ {
+ printf("** Unknown or malformed option %s\n", argv[op]);
+ printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
+ printf(" -C show PCRE compile-time options and exit\n");
+ printf(" -d debug: show compiled code; implies -i\n"
+ " -i show information about compiled pattern\n"
+ " -m output memory used information\n"
+ " -o <n> set size of offsets vector to <n>\n");
+#if !defined NOPOSIX
+ printf(" -p use POSIX interface\n");
+#endif
+ printf(" -s output store (memory) used information\n"
+ " -t time compilation and execution\n");
+ return 1;
+ }
+ op++;
+ argc--;
+ }
+
+/* Get the store for the offsets vector, and remember what it was */
+
+size_offsets_max = size_offsets;
+offsets = (int *)malloc(size_offsets_max * sizeof(int));
+if (offsets == NULL)
+ {
+ printf("** Failed to get %d bytes of memory for offsets vector\n",
+ size_offsets_max * sizeof(int));
+ return 1;
+ }
+
+/* Sort out the input and output files */
+
+if (argc > 1)
+ {
+ infile = fopen(argv[op], "rb");
+ if (infile == NULL)
+ {
+ printf("** Failed to open %s\n", argv[op]);
+ return 1;
+ }
+ }
+
+if (argc > 2)
+ {
+ outfile = fopen(argv[op+1], "wb");
+ if (outfile == NULL)
+ {
+ printf("** Failed to open %s\n", argv[op+1]);
+ return 1;
+ }
+ }
+
+/* Set alternative malloc function */
+
+pcre_malloc = new_malloc;
+pcre_free = new_free;
+pcre_stack_malloc = stack_malloc;
+pcre_stack_free = stack_free;
+
+/* Heading line, then prompt for first regex if stdin */
+
+fprintf(outfile, "PCRE version %s\n\n", pcre_version());
+
+/* Main loop */
+
+while (!done)
+ {
+ pcre *re = NULL;
+ pcre_extra *extra = NULL;
+
+#if !defined NOPOSIX /* There are still compilers that require no indent */
+ regex_t preg;
+ int do_posix = 0;
+#endif
+
+ const char *error;
+ unsigned char *p, *pp, *ppp;
+ unsigned char *to_file = NULL;
+ const unsigned char *tables = NULL;
+ unsigned long int true_size, true_study_size = 0;
+ size_t size, regex_gotten_store;
+ int do_study = 0;
+ int do_debug = debug;
+ int do_G = 0;
+ int do_g = 0;
+ int do_showinfo = showinfo;
+ int do_showrest = 0;
+ int do_flip = 0;
+ int erroroffset, len, delimiter;
+
+ use_utf8 = 0;
+
+ if (infile == stdin) printf(" re> ");
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
+ if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
+ fflush(outfile);
+
+ p = buffer;
+ while (isspace(*p)) p++;
+ if (*p == 0) continue;
+
+ /* See if the pattern is to be loaded pre-compiled from a file. */
+
+ if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
+ {
+ unsigned long int magic;
+ uschar sbuf[8];
+ FILE *f;
+
+ p++;
+ pp = p + (int)strlen((char *)p);
+ while (isspace(pp[-1])) pp--;
+ *pp = 0;
+
+ f = fopen((char *)p, "rb");
+ if (f == NULL)
+ {
+ fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
+ continue;
+ }
+
+ if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
+
+ true_size =
+ (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
+ true_study_size =
+ (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
+
+ re = (real_pcre *)new_malloc(true_size);
+ regex_gotten_store = gotten_store;
+
+ if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
+
+ magic = ((real_pcre *)re)->magic_number;
+ if (magic != MAGIC_NUMBER)
+ {
+ if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
+ {
+ do_flip = 1;
+ }
+ else
+ {
+ fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
+ fclose(f);
+ continue;
+ }
+ }
+
+ fprintf(outfile, "Compiled regex%s loaded from %s\n",
+ do_flip? " (byte-inverted)" : "", p);
+
+ /* Need to know if UTF-8 for printing data strings */
+
+ new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
+ use_utf8 = (options & PCRE_UTF8) != 0;
+
+ /* Now see if there is any following study data */
+
+ if (true_study_size != 0)
+ {
+ pcre_study_data *psd;
+
+ extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
+ extra->flags = PCRE_EXTRA_STUDY_DATA;
+
+ psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
+ extra->study_data = psd;
+
+ if (fread(psd, 1, true_study_size, f) != true_study_size)
+ {
+ FAIL_READ:
+ fprintf(outfile, "Failed to read data from %s\n", p);
+ if (extra != NULL) new_free(extra);
+ if (re != NULL) new_free(re);
+ fclose(f);
+ continue;
+ }
+ fprintf(outfile, "Study data loaded from %s\n", p);
+ do_study = 1; /* To get the data output if requested */
+ }
+ else fprintf(outfile, "No study data\n");
+
+ fclose(f);
+ goto SHOW_INFO;
+ }
+
+ /* In-line pattern (the usual case). Get the delimiter and seek the end of
+ the pattern; if is isn't complete, read more. */
+
+ delimiter = *p++;
+
+ if (isalnum(delimiter) || delimiter == '\\')
+ {
+ fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
+ goto SKIP_DATA;
+ }
+
+ pp = p;
+
+ for(;;)
+ {
+ while (*pp != 0)
+ {
+ if (*pp == '\\' && pp[1] != 0) pp++;
+ else if (*pp == delimiter) break;
+ pp++;
+ }
+ if (*pp != 0) break;
+
+ len = BUFFER_SIZE - (pp - buffer);
+ if (len < 256)
+ {
+ fprintf(outfile, "** Expression too long - missing delimiter?\n");
+ goto SKIP_DATA;
+ }
+
+ if (infile == stdin) printf(" > ");
+ if (fgets((char *)pp, len, infile) == NULL)
+ {
+ fprintf(outfile, "** Unexpected EOF\n");
+ done = 1;
+ goto CONTINUE;
+ }
+ if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
+ }
+
+ /* If the first character after the delimiter is backslash, make
+ the pattern end with backslash. This is purely to provide a way
+ of testing for the error message when a pattern ends with backslash. */
+
+ if (pp[1] == '\\') *pp++ = '\\';
+
+ /* Terminate the pattern at the delimiter, and save a copy of the pattern
+ for callouts. */
+
+ *pp++ = 0;
+ strcpy((char *)pbuffer, (char *)p);
+
+ /* Look for options after final delimiter */
+
+ options = 0;
+ study_options = 0;
+ log_store = showstore; /* default from command line */
+
+ while (*pp != 0)
+ {
+ switch (*pp++)
+ {
+ case 'g': do_g = 1; break;
+ case 'i': options |= PCRE_CASELESS; break;
+ case 'm': options |= PCRE_MULTILINE; break;
+ case 's': options |= PCRE_DOTALL; break;
+ case 'x': options |= PCRE_EXTENDED; break;
+
+ case '+': do_showrest = 1; break;
+ case 'A': options |= PCRE_ANCHORED; break;
+ case 'C': options |= PCRE_AUTO_CALLOUT; break;
+ case 'D': do_debug = do_showinfo = 1; break;
+ case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
+ case 'F': do_flip = 1; break;
+ case 'G': do_G = 1; break;
+ case 'I': do_showinfo = 1; break;
+ case 'M': log_store = 1; break;
+ case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
+
+#if !defined NOPOSIX
+ case 'P': do_posix = 1; break;
+#endif
+
+ case 'S': do_study = 1; break;
+ case 'U': options |= PCRE_UNGREEDY; break;
+ case 'X': options |= PCRE_EXTRA; break;
+ case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
+ case '?': options |= PCRE_NO_UTF8_CHECK; break;
+
+ case 'L':
+ ppp = pp;
+ while (*ppp != '\n' && *ppp != ' ') ppp++;
+ *ppp = 0;
+ if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
+ {
+ fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
+ goto SKIP_DATA;
+ }
+ tables = pcre_maketables();
+ pp = ppp;
+ break;
+
+ case '>':
+ to_file = pp;
+ while (*pp != 0) pp++;
+ while (isspace(pp[-1])) pp--;
+ *pp = 0;
+ break;
+
+ case '\n': case ' ': break;
+
+ default:
+ fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
+ goto SKIP_DATA;
+ }
+ }
+
+ /* Handle compiling via the POSIX interface, which doesn't support the
+ timing, showing, or debugging options, nor the ability to pass over
+ local character tables. */
+
+#if !defined NOPOSIX
+ if (posix || do_posix)
+ {
+ int rc;
+ int cflags = 0;
+
+ if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
+ if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
+ rc = regcomp(&preg, (char *)p, cflags);
+
+ /* Compilation failed; go back for another re, skipping to blank line
+ if non-interactive. */
+
+ if (rc != 0)
+ {
+ (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
+ fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
+ goto SKIP_DATA;
+ }
+ }
+
+ /* Handle compiling via the native interface */
+
+ else
+#endif /* !defined NOPOSIX */
+
+ {
+ if (timeit)
+ {
+ register int i;
+ clock_t time_taken;
+ clock_t start_time = clock();
+ for (i = 0; i < LOOPREPEAT; i++)
+ {
+ re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
+ if (re != NULL) free(re);
+ }
+ time_taken = clock() - start_time;
+ fprintf(outfile, "Compile time %.3f milliseconds\n",
+ (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
+ (double)CLOCKS_PER_SEC);
+ }
+
+ re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
+
+ /* Compilation failed; go back for another re, skipping to blank line
+ if non-interactive. */
+
+ if (re == NULL)
+ {
+ fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
+ SKIP_DATA:
+ if (infile != stdin)
+ {
+ for (;;)
+ {
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
+ {
+ done = 1;
+ goto CONTINUE;
+ }
+ len = (int)strlen((char *)buffer);
+ while (len > 0 && isspace(buffer[len-1])) len--;
+ if (len == 0) break;
+ }
+ fprintf(outfile, "\n");
+ }
+ goto CONTINUE;
+ }
+
+ /* Compilation succeeded; print data if required. There are now two
+ info-returning functions. The old one has a limited interface and
+ returns only limited data. Check that it agrees with the newer one. */
+
+ if (log_store)
+ fprintf(outfile, "Memory allocation (code space): %d\n",
+ (int)(gotten_store -
+ sizeof(real_pcre) -
+ ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
+
+ /* Extract the size for possible writing before possibly flipping it,
+ and remember the store that was got. */
+
+ true_size = ((real_pcre *)re)->size;
+ regex_gotten_store = gotten_store;
+
+ /* If /S was present, study the regexp to generate additional info to
+ help with the matching. */
+
+ if (do_study)
+ {
+ if (timeit)
+ {
+ register int i;
+ clock_t time_taken;
+ clock_t start_time = clock();
+ for (i = 0; i < LOOPREPEAT; i++)
+ extra = pcre_study(re, study_options, &error);
+ time_taken = clock() - start_time;
+ if (extra != NULL) free(extra);
+ fprintf(outfile, " Study time %.3f milliseconds\n",
+ (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
+ (double)CLOCKS_PER_SEC);
+ }
+ extra = pcre_study(re, study_options, &error);
+ if (error != NULL)
+ fprintf(outfile, "Failed to study: %s\n", error);
+ else if (extra != NULL)
+ true_study_size = ((pcre_study_data *)(extra->study_data))->size;
+ }
+
+ /* If the 'F' option was present, we flip the bytes of all the integer
+ fields in the regex data block and the study block. This is to make it
+ possible to test PCRE's handling of byte-flipped patterns, e.g. those
+ compiled on a different architecture. */
+
+ if (do_flip)
+ {
+ real_pcre *rre = (real_pcre *)re;
+ rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
+ rre->size = byteflip(rre->size, sizeof(rre->size));
+ rre->options = byteflip(rre->options, sizeof(rre->options));
+ rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
+ rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
+ rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
+ rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
+ rre->name_table_offset = byteflip(rre->name_table_offset,
+ sizeof(rre->name_table_offset));
+ rre->name_entry_size = byteflip(rre->name_entry_size,
+ sizeof(rre->name_entry_size));
+ rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
+
+ if (extra != NULL)
+ {
+ pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
+ rsd->size = byteflip(rsd->size, sizeof(rsd->size));
+ rsd->options = byteflip(rsd->options, sizeof(rsd->options));
+ }
+ }
+
+ /* Extract information from the compiled data if required */
+
+ SHOW_INFO:
+
+ if (do_showinfo)
+ {
+ unsigned long int get_options, all_options;
+ int old_first_char, old_options, old_count;
+ int count, backrefmax, first_char, need_char;
+ int nameentrysize, namecount;
+ const uschar *nametable;
+
+ if (do_debug)
+ {
+ fprintf(outfile, "------------------------------------------------------------------\n");
+ print_internals(re, outfile);
+ }
+
+ new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
+ new_info(re, NULL, PCRE_INFO_SIZE, &size);
+ new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
+ new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
+ new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
+ new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
+ new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
+ new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
+ new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
+
+ old_count = pcre_info(re, &old_options, &old_first_char);
+ if (count < 0) fprintf(outfile,
+ "Error %d from pcre_info()\n", count);
+ else
+ {
+ if (old_count != count) fprintf(outfile,
+ "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
+ old_count);
+
+ if (old_first_char != first_char) fprintf(outfile,
+ "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
+ first_char, old_first_char);
+
+ if (old_options != (int)get_options) fprintf(outfile,
+ "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
+ get_options, old_options);
+ }
+
+ if (size != regex_gotten_store) fprintf(outfile,
+ "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
+ (int)size, (int)regex_gotten_store);
+
+ fprintf(outfile, "Capturing subpattern count = %d\n", count);
+ if (backrefmax > 0)
+ fprintf(outfile, "Max back reference = %d\n", backrefmax);
+
+ if (namecount > 0)
+ {
+ fprintf(outfile, "Named capturing subpatterns:\n");
+ while (namecount-- > 0)
+ {
+ fprintf(outfile, " %s %*s%3d\n", nametable + 2,
+ nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
+ GET2(nametable, 0));
+ nametable += nameentrysize;
+ }
+ }
+
+ /* The NOPARTIAL bit is a private bit in the options, so we have
+ to fish it out via out back door */
+
+ all_options = ((real_pcre *)re)->options;
+ if (do_flip)
+ {
+ all_options = byteflip(all_options, sizeof(all_options));
+ }
+
+ if ((all_options & PCRE_NOPARTIAL) != 0)
+ fprintf(outfile, "Partial matching not supported\n");
+
+ if (get_options == 0) fprintf(outfile, "No options\n");
+ else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
+ ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
+ ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
+ ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
+ ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
+ ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
+ ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
+ ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
+ ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
+ ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
+ ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
+
+ if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
+ fprintf(outfile, "Case state changes\n");
+
+ if (first_char == -1)
+ {
+ fprintf(outfile, "First char at start or follows \\n\n");
+ }
+ else if (first_char < 0)
+ {
+ fprintf(outfile, "No first char\n");
+ }
+ else
+ {
+ int ch = first_char & 255;
+ const char *caseless = ((first_char & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
+ if (isprint(ch))
+ fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
+ else
+ fprintf(outfile, "First char = %d%s\n", ch, caseless);
+ }
+
+ if (need_char < 0)
+ {
+ fprintf(outfile, "No need char\n");
+ }
+ else
+ {
+ int ch = need_char & 255;
+ const char *caseless = ((need_char & REQ_CASELESS) == 0)?
+ "" : " (caseless)";
+ if (isprint(ch))
+ fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
+ else
+ fprintf(outfile, "Need char = %d%s\n", ch, caseless);
+ }
+
+ /* Don't output study size; at present it is in any case a fixed
+ value, but it varies, depending on the computer architecture, and
+ so messes up the test suite. (And with the /F option, it might be
+ flipped.) */
+
+ if (do_study)
+ {
+ if (extra == NULL)
+ fprintf(outfile, "Study returned NULL\n");
+ else
+ {
+ uschar *start_bits = NULL;
+ new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
+
+ if (start_bits == NULL)
+ fprintf(outfile, "No starting byte set\n");
+ else
+ {
+ int i;
+ int c = 24;
+ fprintf(outfile, "Starting byte set: ");
+ for (i = 0; i < 256; i++)
+ {
+ if ((start_bits[i/8] & (1<<(i&7))) != 0)
+ {
+ if (c > 75)
+ {
+ fprintf(outfile, "\n ");
+ c = 2;
+ }
+ if (isprint(i) && i != ' ')
+ {
+ fprintf(outfile, "%c ", i);
+ c += 2;
+ }
+ else
+ {
+ fprintf(outfile, "\\x%02x ", i);
+ c += 5;
+ }
+ }
+ }
+ fprintf(outfile, "\n");
+ }
+ }
+ }
+ }
+
+ /* If the '>' option was present, we write out the regex to a file, and
+ that is all. The first 8 bytes of the file are the regex length and then
+ the study length, in big-endian order. */
+
+ if (to_file != NULL)
+ {
+ FILE *f = fopen((char *)to_file, "wb");
+ if (f == NULL)
+ {
+ fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
+ }
+ else
+ {
+ uschar sbuf[8];
+ sbuf[0] = (true_size >> 24) & 255;
+ sbuf[1] = (true_size >> 16) & 255;
+ sbuf[2] = (true_size >> 8) & 255;
+ sbuf[3] = (true_size) & 255;
+
+ sbuf[4] = (true_study_size >> 24) & 255;
+ sbuf[5] = (true_study_size >> 16) & 255;
+ sbuf[6] = (true_study_size >> 8) & 255;
+ sbuf[7] = (true_study_size) & 255;
+
+ if (fwrite(sbuf, 1, 8, f) < 8 ||
+ fwrite(re, 1, true_size, f) < true_size)
+ {
+ fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
+ }
+ else
+ {
+ fprintf(outfile, "Compiled regex written to %s\n", to_file);
+ if (extra != NULL)
+ {
+ if (fwrite(extra->study_data, 1, true_study_size, f) <
+ true_study_size)
+ {
+ fprintf(outfile, "Write error on %s: %s\n", to_file,
+ strerror(errno));
+ }
+ else fprintf(outfile, "Study data written to %s\n", to_file);
+ }
+ }
+ fclose(f);
+ }
+ continue; /* With next regex */
+ }
+ } /* End of non-POSIX compile */
+
+ /* Read data lines and test them */
+
+ for (;;)
+ {
+ unsigned char *q;
+ unsigned char *bptr = dbuffer;
+ int *use_offsets = offsets;
+ int use_size_offsets = size_offsets;
+ int callout_data = 0;
+ int callout_data_set = 0;
+ int count, c;
+ int copystrings = 0;
+ int find_match_limit = 0;
+ int getstrings = 0;
+ int getlist = 0;
+ int gmatched = 0;
+ int start_offset = 0;
+ int g_notempty = 0;
+
+ options = 0;
+
+ pcre_callout = callout;
+ first_callout = 1;
+ callout_extra = 0;
+ callout_count = 0;
+ callout_fail_count = 999999;
+ callout_fail_id = -1;
+ show_malloc = 0;
+
+ if (infile == stdin) printf("data> ");
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
+ {
+ done = 1;
+ goto CONTINUE;
+ }
+ if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
+
+ len = (int)strlen((char *)buffer);
+ while (len > 0 && isspace(buffer[len-1])) len--;
+ buffer[len] = 0;
+ if (len == 0) break;
+
+ p = buffer;
+ while (isspace(*p)) p++;
+
+ q = dbuffer;
+ while ((c = *p++) != 0)
+ {
+ int i = 0;
+ int n = 0;
+
+ if (c == '\\') switch ((c = *p++))
+ {
+ case 'a': c = 7; break;
+ case 'b': c = '\b'; break;
+ case 'e': c = 27; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'v': c = '\v'; break;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ c -= '0';
+ while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
+ c = c * 8 + *p++ - '0';
+ break;
+
+ case 'x':
+
+ /* Handle \x{..} specially - new Perl thing for utf8 */
+
+ if (*p == '{')
+ {
+ unsigned char *pt = p;
+ c = 0;
+ while (isxdigit(*(++pt)))
+ c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
+ if (*pt == '}')
+ {
+ unsigned char buff8[8];
+ int ii, utn;
+ utn = ord2utf8(c, buff8);
+ for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
+ c = buff8[ii]; /* Last byte */
+ p = pt + 1;
+ break;
+ }
+ /* Not correct form; fall through */
+ }
+
+ /* Ordinary \x */
+
+ c = 0;
+ while (i++ < 2 && isxdigit(*p))
+ {
+ c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
+ p++;
+ }
+ break;
+
+ case 0: /* \ followed by EOF allows for an empty line */
+ p--;
+ continue;
+
+ case '>':
+ while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
+ continue;
+
+ case 'A': /* Option setting */
+ options |= PCRE_ANCHORED;
+ continue;
+
+ case 'B':
+ options |= PCRE_NOTBOL;
+ continue;
+
+ case 'C':
+ if (isdigit(*p)) /* Set copy string */
+ {
+ while(isdigit(*p)) n = n * 10 + *p++ - '0';
+ copystrings |= 1 << n;
+ }
+ else if (isalnum(*p))
+ {
+ uschar name[256];
+ uschar *npp = name;
+ while (isalnum(*p)) *npp++ = *p++;
+ *npp = 0;
+ n = pcre_get_stringnumber(re, (char *)name);
+ if (n < 0)
+ fprintf(outfile, "no parentheses with name \"%s\"\n", name);
+ else copystrings |= 1 << n;
+ }
+ else if (*p == '+')
+ {
+ callout_extra = 1;
+ p++;
+ }
+ else if (*p == '-')
+ {
+ pcre_callout = NULL;
+ p++;
+ }
+ else if (*p == '!')
+ {
+ callout_fail_id = 0;
+ p++;
+ while(isdigit(*p))
+ callout_fail_id = callout_fail_id * 10 + *p++ - '0';
+ callout_fail_count = 0;
+ if (*p == '!')
+ {
+ p++;
+ while(isdigit(*p))
+ callout_fail_count = callout_fail_count * 10 + *p++ - '0';
+ }
+ }
+ else if (*p == '*')
+ {
+ int sign = 1;
+ callout_data = 0;
+ if (*(++p) == '-') { sign = -1; p++; }
+ while(isdigit(*p))
+ callout_data = callout_data * 10 + *p++ - '0';
+ callout_data *= sign;
+ callout_data_set = 1;
+ }
+ continue;
+
+ case 'G':
+ if (isdigit(*p))
+ {
+ while(isdigit(*p)) n = n * 10 + *p++ - '0';
+ getstrings |= 1 << n;
+ }
+ else if (isalnum(*p))
+ {
+ uschar name[256];
+ uschar *npp = name;
+ while (isalnum(*p)) *npp++ = *p++;
+ *npp = 0;
+ n = pcre_get_stringnumber(re, (char *)name);
+ if (n < 0)
+ fprintf(outfile, "no parentheses with name \"%s\"\n", name);
+ else getstrings |= 1 << n;
+ }
+ continue;
+
+ case 'L':
+ getlist = 1;
+ continue;
+
+ case 'M':
+ find_match_limit = 1;
+ continue;
+
+ case 'N':
+ options |= PCRE_NOTEMPTY;
+ continue;
+
+ case 'O':
+ while(isdigit(*p)) n = n * 10 + *p++ - '0';
+ if (n > size_offsets_max)
+ {
+ size_offsets_max = n;
+ free(offsets);
+ use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
+ if (offsets == NULL)
+ {
+ printf("** Failed to get %d bytes of memory for offsets vector\n",
+ size_offsets_max * sizeof(int));
+ return 1;
+ }
+ }
+ use_size_offsets = n;
+ if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
+ continue;
+
+ case 'P':
+ options |= PCRE_PARTIAL;
+ continue;
+
+ case 'S':
+ show_malloc = 1;
+ continue;
+
+ case 'Z':
+ options |= PCRE_NOTEOL;
+ continue;
+
+ case '?':
+ options |= PCRE_NO_UTF8_CHECK;
+ continue;
+ }
+ *q++ = c;
+ }
+ *q = 0;
+ len = q - dbuffer;
+
+ /* Handle matching via the POSIX interface, which does not
+ support timing or playing with the match limit or callout data. */
+
+#if !defined NOPOSIX
+ if (posix || do_posix)
+ {
+ int rc;
+ int eflags = 0;
+ regmatch_t *pmatch = NULL;
+ if (use_size_offsets > 0)
+ pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
+ if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
+ if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
+
+ rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
+
+ if (rc != 0)
+ {
+ (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
+ fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
+ }
+ else
+ {
+ size_t i;
+ for (i = 0; i < (size_t)use_size_offsets; i++)
+ {
+ if (pmatch[i].rm_so >= 0)
+ {
+ fprintf(outfile, "%2d: ", (int)i);
+ (void)pchars(dbuffer + pmatch[i].rm_so,
+ pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
+ fprintf(outfile, "\n");
+ if (i == 0 && do_showrest)
+ {
+ fprintf(outfile, " 0+ ");
+ (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
+ outfile);
+ fprintf(outfile, "\n");
+ }
+ }
+ }
+ }
+ free(pmatch);
+ }
+
+ /* Handle matching via the native interface - repeats for /g and /G */
+
+ else
+#endif /* !defined NOPOSIX */
+
+ for (;; gmatched++) /* Loop for /g or /G */
+ {
+ if (timeit)
+ {
+ register int i;
+ clock_t time_taken;
+ clock_t start_time = clock();
+ for (i = 0; i < LOOPREPEAT; i++)
+ count = pcre_exec(re, extra, (char *)bptr, len,
+ start_offset, options | g_notempty, use_offsets, use_size_offsets);
+ time_taken = clock() - start_time;
+ fprintf(outfile, "Execute time %.3f milliseconds\n",
+ (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
+ (double)CLOCKS_PER_SEC);
+ }
+
+ /* If find_match_limit is set, we want to do repeated matches with
+ varying limits in order to find the minimum value. */
+
+ if (find_match_limit)
+ {
+ int min = 0;
+ int mid = 64;
+ int max = -1;
+
+ if (extra == NULL)
+ {
+ extra = (pcre_extra *)malloc(sizeof(pcre_extra));
+ extra->flags = 0;
+ }
+ extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
+
+ for (;;)
+ {
+ extra->match_limit = mid;
+ count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
+ options | g_notempty, use_offsets, use_size_offsets);
+ if (count == PCRE_ERROR_MATCHLIMIT)
+ {
+ /* fprintf(outfile, "Testing match limit = %d\n", mid); */
+ min = mid;
+ mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
+ }
+ else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
+ count == PCRE_ERROR_PARTIAL)
+ {
+ if (mid == min + 1)
+ {
+ fprintf(outfile, "Minimum match limit = %d\n", mid);
+ break;
+ }
+ /* fprintf(outfile, "Testing match limit = %d\n", mid); */
+ max = mid;
+ mid = (min + mid)/2;
+ }
+ else break; /* Some other error */
+ }
+
+ extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
+ }
+
+ /* If callout_data is set, use the interface with additional data */
+
+ else if (callout_data_set)
+ {
+ if (extra == NULL)
+ {
+ extra = (pcre_extra *)malloc(sizeof(pcre_extra));
+ extra->flags = 0;
+ }
+ extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
+ extra->callout_data = &callout_data;
+ count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
+ options | g_notempty, use_offsets, use_size_offsets);
+ extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
+ }
+
+ /* The normal case is just to do the match once, with the default
+ value of match_limit. */
+
+ else
+ {
+ count = pcre_exec(re, extra, (char *)bptr, len,
+ start_offset, options | g_notempty, use_offsets, use_size_offsets);
+ }
+
+ if (count == 0)
+ {
+ fprintf(outfile, "Matched, but too many substrings\n");
+ count = use_size_offsets/3;
+ }
+
+ /* Matched */
+
+ if (count >= 0)
+ {
+ int i;
+ for (i = 0; i < count * 2; i += 2)
+ {
+ if (use_offsets[i] < 0)
+ fprintf(outfile, "%2d: <unset>\n", i/2);
+ else
+ {
+ fprintf(outfile, "%2d: ", i/2);
+ (void)pchars(bptr + use_offsets[i],
+ use_offsets[i+1] - use_offsets[i], outfile);
+ fprintf(outfile, "\n");
+ if (i == 0)
+ {
+ if (do_showrest)
+ {
+ fprintf(outfile, " 0+ ");
+ (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
+ outfile);
+ fprintf(outfile, "\n");
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < 32; i++)
+ {
+ if ((copystrings & (1 << i)) != 0)
+ {
+ char copybuffer[16];
+ int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
+ i, copybuffer, sizeof(copybuffer));
+ if (rc < 0)
+ fprintf(outfile, "copy substring %d failed %d\n", i, rc);
+ else
+ fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
+ }
+ }
+
+ for (i = 0; i < 32; i++)
+ {
+ if ((getstrings & (1 << i)) != 0)
+ {
+ const char *substring;
+ int rc = pcre_get_substring((char *)bptr, use_offsets, count,
+ i, &substring);
+ if (rc < 0)
+ fprintf(outfile, "get substring %d failed %d\n", i, rc);
+ else
+ {
+ fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
+ /* free((void *)substring); */
+ pcre_free_substring(substring);
+ }
+ }
+ }
+
+ if (getlist)
+ {
+ const char **stringlist;
+ int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
+ &stringlist);
+ if (rc < 0)
+ fprintf(outfile, "get substring list failed %d\n", rc);
+ else
+ {
+ for (i = 0; i < count; i++)
+ fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
+ if (stringlist[i] != NULL)
+ fprintf(outfile, "string list not terminated by NULL\n");
+ /* free((void *)stringlist); */
+ pcre_free_substring_list(stringlist);
+ }
+ }
+ }
+
+ /* There was a partial match */
+
+ else if (count == PCRE_ERROR_PARTIAL)
+ {
+ fprintf(outfile, "Partial match\n");
+ break; /* Out of the /g loop */
+ }
+
+ /* Failed to match. If this is a /g or /G loop and we previously set
+ g_notempty after a null match, this is not necessarily the end.
+ We want to advance the start offset, and continue. In the case of UTF-8
+ matching, the advance must be one character, not one byte. Fudge the
+ offset values to achieve this. We won't be at the end of the string -
+ that was checked before setting g_notempty. */
+
+ else
+ {
+ if (g_notempty != 0)
+ {
+ int onechar = 1;
+ use_offsets[0] = start_offset;
+ if (use_utf8)
+ {
+ while (start_offset + onechar < len)
+ {
+ int tb = bptr[start_offset+onechar];
+ if (tb <= 127) break;
+ tb &= 0xc0;
+ if (tb != 0 && tb != 0xc0) onechar++;
+ }
+ }
+ use_offsets[1] = start_offset + onechar;
+ }
+ else
+ {
+ if (count == PCRE_ERROR_NOMATCH)
+ {
+ if (gmatched == 0) fprintf(outfile, "No match\n");
+ }
+ else fprintf(outfile, "Error %d\n", count);
+ break; /* Out of the /g loop */
+ }
+ }
+
+ /* If not /g or /G we are done */
+
+ if (!do_g && !do_G) break;
+
+ /* If we have matched an empty string, first check to see if we are at
+ the end of the subject. If so, the /g loop is over. Otherwise, mimic
+ what Perl's /g options does. This turns out to be rather cunning. First
+ we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
+ same point. If this fails (picked up above) we advance to the next
+ character. */
+
+ g_notempty = 0;
+ if (use_offsets[0] == use_offsets[1])
+ {
+ if (use_offsets[0] == len) break;
+ g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
+ }
+
+ /* For /g, update the start offset, leaving the rest alone */
+
+ if (do_g) start_offset = use_offsets[1];
+
+ /* For /G, update the pointer and length */
+
+ else
+ {
+ bptr += use_offsets[1];
+ len -= use_offsets[1];
+ }
+ } /* End of loop for /g and /G */
+ } /* End of loop for data lines */
+
+ CONTINUE:
+
+#if !defined NOPOSIX
+ if (posix || do_posix) regfree(&preg);
+#endif
+
+ if (re != NULL) free(re);
+ if (extra != NULL) free(extra);
+ if (tables != NULL)
+ {
+ free((void *)tables);
+ setlocale(LC_CTYPE, "C");
+ }
+ }
+
+if (infile == stdin) fprintf(outfile, "\n");
+return 0;
+}
+
+/* End */
diff --git a/src/src/pcre/printint.c b/src/src/pcre/printint.c
new file mode 100644
index 000000000..28f63d82f
--- /dev/null
+++ b/src/src/pcre/printint.c
@@ -0,0 +1,465 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains a debugging function for printing out the internal form
+of a compiled regular expression. It is kept in a separate file so that it can
+be #included both in the pcretest program, and in the library itself when
+compiled with the debugging switch. */
+
+
+static const char *OP_names[] = { OP_NAME_LIST };
+
+
+/*************************************************
+* Print single- or multi-byte character *
+*************************************************/
+
+/* These tables are actually copies of ones in pcre.c. If we compile the
+library with debugging, they are included twice, but that isn't really a
+problem - compiling with debugging is pretty rare and these are very small. */
+
+static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+
+static const uschar utf8_t4[] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+
+static int
+print_char(FILE *f, uschar *ptr, BOOL utf8)
+{
+int c = *ptr;
+
+if (!utf8 || (c & 0xc0) != 0xc0)
+ {
+ if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ return 0;
+ }
+else
+ {
+ int i;
+ int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */
+ int s = 6*a;
+ c = (c & utf8_t3[a]) << s;
+ for (i = 1; i <= a; i++)
+ {
+ /* This is a check for malformed UTF-8; it should only occur if the sanity
+ check has been turned off. Rather than swallow random bytes, just stop if
+ we hit a bad one. Print it with \X instead of \x as an indication. */
+
+ if ((ptr[i] & 0xc0) != 0x80)
+ {
+ fprintf(f, "\\X{%x}", c);
+ return i - 1;
+ }
+
+ /* The byte is OK */
+
+ s -= 6;
+ c |= (ptr[i] & 0x3f) << s;
+ }
+ if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
+ return a;
+ }
+}
+
+
+
+
+/*************************************************
+* Find Unicode property name *
+*************************************************/
+
+static const char *
+get_ucpname(int property)
+{
+#ifdef SUPPORT_UCP
+int i;
+for (i = sizeof(utt)/sizeof(ucp_type_table); i >= 0; i--)
+ {
+ if (property == utt[i].value) break;
+ }
+return (i >= 0)? utt[i].name : "??";
+#else
+return "??";
+#endif
+}
+
+
+
+/*************************************************
+* Print compiled regex *
+*************************************************/
+
+/* Make this function work for a regex with integers either byte order.
+However, we assume that what we are passed is a compiled regex. */
+
+static void
+print_internals(pcre *external_re, FILE *f)
+{
+real_pcre *re = (real_pcre *)external_re;
+uschar *codestart, *code;
+BOOL utf8;
+
+unsigned int options = re->options;
+int offset = re->name_table_offset;
+int count = re->name_count;
+int size = re->name_entry_size;
+
+if (re->magic_number != MAGIC_NUMBER)
+ {
+ offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
+ count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
+ size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
+ options = ((options << 24) & 0xff000000) |
+ ((options << 8) & 0x00ff0000) |
+ ((options >> 8) & 0x0000ff00) |
+ ((options >> 24) & 0x000000ff);
+ }
+
+code = codestart = (uschar *)re + offset + count * size;
+utf8 = (options & PCRE_UTF8) != 0;
+
+for(;;)
+ {
+ uschar *ccode;
+ int c;
+ int extra = 0;
+
+ fprintf(f, "%3d ", (int)(code - codestart));
+
+ if (*code >= OP_BRA)
+ {
+ if (*code - OP_BRA > EXTRACT_BASIC_MAX)
+ fprintf(f, "%3d Bra extra\n", GET(code, 1));
+ else
+ fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
+ code += OP_lengths[OP_BRA];
+ continue;
+ }
+
+ switch(*code)
+ {
+ case OP_END:
+ fprintf(f, " %s\n", OP_names[*code]);
+ fprintf(f, "------------------------------------------------------------------\n");
+ return;
+
+ case OP_OPT:
+ fprintf(f, " %.2x %s", code[1], OP_names[*code]);
+ break;
+
+ case OP_CHAR:
+ {
+ fprintf(f, " ");
+ do
+ {
+ code++;
+ code += 1 + print_char(f, code, utf8);
+ }
+ while (*code == OP_CHAR);
+ fprintf(f, "\n");
+ continue;
+ }
+ break;
+
+ case OP_CHARNC:
+ {
+ fprintf(f, " NC ");
+ do
+ {
+ code++;
+ code += 1 + print_char(f, code, utf8);
+ }
+ while (*code == OP_CHARNC);
+ fprintf(f, "\n");
+ continue;
+ }
+ break;
+
+ case OP_KETRMAX:
+ case OP_KETRMIN:
+ case OP_ALT:
+ case OP_KET:
+ case OP_ASSERT:
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ case OP_ONCE:
+ case OP_COND:
+ case OP_REVERSE:
+ fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
+ break;
+
+ case OP_BRANUMBER:
+ printf("%3d %s", GET2(code, 1), OP_names[*code]);
+ break;
+
+ case OP_CREF:
+ if (GET2(code, 1) == CREF_RECURSE)
+ fprintf(f, " Cond recurse");
+ else
+ fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
+ break;
+
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ case OP_TYPESTAR:
+ case OP_TYPEMINSTAR:
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ case OP_TYPEQUERY:
+ case OP_TYPEMINQUERY:
+ fprintf(f, " ");
+ if (*code >= OP_TYPESTAR)
+ {
+ fprintf(f, "%s", OP_names[code[1]]);
+ if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
+ {
+ fprintf(f, " %s ", get_ucpname(code[2]));
+ extra = 1;
+ }
+ }
+ else extra = print_char(f, code+1, utf8);
+ fprintf(f, "%s", OP_names[*code]);
+ break;
+
+ case OP_EXACT:
+ case OP_UPTO:
+ case OP_MINUPTO:
+ fprintf(f, " ");
+ extra = print_char(f, code+3, utf8);
+ fprintf(f, "{");
+ if (*code != OP_EXACT) fprintf(f, ",");
+ fprintf(f, "%d}", GET2(code,1));
+ if (*code == OP_MINUPTO) fprintf(f, "?");
+ break;
+
+ case OP_TYPEEXACT:
+ case OP_TYPEUPTO:
+ case OP_TYPEMINUPTO:
+ fprintf(f, " %s", OP_names[code[3]]);
+ if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
+ {
+ fprintf(f, " %s ", get_ucpname(code[4]));
+ extra = 1;
+ }
+ fprintf(f, "{");
+ if (*code != OP_TYPEEXACT) fprintf(f, "0,");
+ fprintf(f, "%d}", GET2(code,1));
+ if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
+ break;
+
+ case OP_NOT:
+ if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
+ else fprintf(f, " [^\\x%02x]", c);
+ break;
+
+ case OP_NOTSTAR:
+ case OP_NOTMINSTAR:
+ case OP_NOTPLUS:
+ case OP_NOTMINPLUS:
+ case OP_NOTQUERY:
+ case OP_NOTMINQUERY:
+ if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
+ else fprintf(f, " [^\\x%02x]", c);
+ fprintf(f, "%s", OP_names[*code]);
+ break;
+
+ case OP_NOTEXACT:
+ case OP_NOTUPTO:
+ case OP_NOTMINUPTO:
+ if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
+ else fprintf(f, " [^\\x%02x]{", c);
+ if (*code != OP_NOTEXACT) fprintf(f, ",");
+ fprintf(f, "%d}", GET2(code,1));
+ if (*code == OP_NOTMINUPTO) fprintf(f, "?");
+ break;
+
+ case OP_RECURSE:
+ fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
+ break;
+
+ case OP_REF:
+ fprintf(f, " \\%d", GET2(code,1));
+ ccode = code + OP_lengths[*code];
+ goto CLASS_REF_REPEAT;
+
+ case OP_CALLOUT:
+ fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
+ GET(code, 2 + LINK_SIZE));
+ break;
+
+ case OP_PROP:
+ case OP_NOTPROP:
+ fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1]));
+ break;
+
+ /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
+ having this code always here, and it makes it less messy without all those
+ #ifdefs. */
+
+ case OP_CLASS:
+ case OP_NCLASS:
+ case OP_XCLASS:
+ {
+ int i, min, max;
+ BOOL printmap;
+
+ fprintf(f, " [");
+
+ if (*code == OP_XCLASS)
+ {
+ extra = GET(code, 1);
+ ccode = code + LINK_SIZE + 1;
+ printmap = (*ccode & XCL_MAP) != 0;
+ if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
+ }
+ else
+ {
+ printmap = TRUE;
+ ccode = code + 1;
+ }
+
+ /* Print a bit map */
+
+ if (printmap)
+ {
+ for (i = 0; i < 256; i++)
+ {
+ if ((ccode[i/8] & (1 << (i&7))) != 0)
+ {
+ int j;
+ for (j = i+1; j < 256; j++)
+ if ((ccode[j/8] & (1 << (j&7))) == 0) break;
+ if (i == '-' || i == ']') fprintf(f, "\\");
+ if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
+ if (--j > i)
+ {
+ if (j != i + 1) fprintf(f, "-");
+ if (j == '-' || j == ']') fprintf(f, "\\");
+ if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
+ }
+ i = j;
+ }
+ }
+ ccode += 32;
+ }
+
+ /* For an XCLASS there is always some additional data */
+
+ if (*code == OP_XCLASS)
+ {
+ int ch;
+ while ((ch = *ccode++) != XCL_END)
+ {
+ if (ch == XCL_PROP)
+ {
+ fprintf(f, "\\p{%s}", get_ucpname(*ccode++));
+ }
+ else if (ch == XCL_NOTPROP)
+ {
+ fprintf(f, "\\P{%s}", get_ucpname(*ccode++));
+ }
+ else
+ {
+ ccode += 1 + print_char(f, ccode, TRUE);
+ if (ch == XCL_RANGE)
+ {
+ fprintf(f, "-");
+ ccode += 1 + print_char(f, ccode, TRUE);
+ }
+ }
+ }
+ }
+
+ /* Indicate a non-UTF8 class which was created by negation */
+
+ fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
+
+ /* Handle repeats after a class or a back reference */
+
+ CLASS_REF_REPEAT:
+ switch(*ccode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ fprintf(f, "%s", OP_names[*ccode]);
+ extra += OP_lengths[*ccode];
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ min = GET2(ccode,1);
+ max = GET2(ccode,3);
+ if (max == 0) fprintf(f, "{%d,}", min);
+ else fprintf(f, "{%d,%d}", min, max);
+ if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
+ extra += OP_lengths[*ccode];
+ break;
+ }
+ }
+ break;
+
+ /* Anything else is just an item with no data*/
+
+ default:
+ fprintf(f, " %s", OP_names[*code]);
+ break;
+ }
+
+ code += OP_lengths[*code] + extra;
+ fprintf(f, "\n");
+ }
+}
+
+/* End of printint.c */
diff --git a/src/src/pcre/study.c b/src/src/pcre/study.c
new file mode 100644
index 000000000..d99b8a992
--- /dev/null
+++ b/src/src/pcre/study.c
@@ -0,0 +1,484 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/*
+This is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language. See
+the file Tech.Notes for some information on the internals.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+ Copyright (c) 1997-2004 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* Include the internals header, which itself includes Standard C headers plus
+the external pcre header. */
+
+#include "internal.h"
+
+
+
+/*************************************************
+* Set a bit and maybe its alternate case *
+*************************************************/
+
+/* Given a character, set its bit in the table, and also the bit for the other
+version of a letter if we are caseless.
+
+Arguments:
+ start_bits points to the bit map
+ c is the character
+ caseless the caseless flag
+ cd the block with char table pointers
+
+Returns: nothing
+*/
+
+static void
+set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
+{
+start_bits[c/8] |= (1 << (c&7));
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
+ start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+}
+
+
+
+/*************************************************
+* Create bitmap of starting chars *
+*************************************************/
+
+/* This function scans a compiled unanchored expression and attempts to build a
+bitmap of the set of initial characters. If it can't, it returns FALSE. As time
+goes by, we may be able to get more clever at doing this.
+
+Arguments:
+ code points to an expression
+ start_bits points to a 32-byte table, initialized to 0
+ caseless the current state of the caseless flag
+ utf8 TRUE if in UTF-8 mode
+ cd the block with char table pointers
+
+Returns: TRUE if table built, FALSE otherwise
+*/
+
+static BOOL
+set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
+ BOOL utf8, compile_data *cd)
+{
+register int c;
+
+/* This next statement and the later reference to dummy are here in order to
+trick the optimizer of the IBM C compiler for OS/2 into generating correct
+code. Apparently IBM isn't going to fix the problem, and we would rather not
+disable optimization (in this module it actually makes a big difference, and
+the pcre module can use all the optimization it can get). */
+
+volatile int dummy;
+
+do
+ {
+ const uschar *tcode = code + 1 + LINK_SIZE;
+ BOOL try_next = TRUE;
+
+ while (try_next)
+ {
+ /* If a branch starts with a bracket or a positive lookahead assertion,
+ recurse to set bits from within them. That's all for this branch. */
+
+ if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
+ {
+ if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
+ return FALSE;
+ try_next = FALSE;
+ }
+
+ else switch(*tcode)
+ {
+ default:
+ return FALSE;
+
+ /* Skip over callout */
+
+ case OP_CALLOUT:
+ tcode += 2 + 2*LINK_SIZE;
+ break;
+
+ /* Skip over extended extraction bracket number */
+
+ case OP_BRANUMBER:
+ tcode += 3;
+ break;
+
+ /* Skip over lookbehind and negative lookahead assertions */
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
+ tcode += 1+LINK_SIZE;
+ break;
+
+ /* Skip over an option setting, changing the caseless flag */
+
+ case OP_OPT:
+ caseless = (tcode[1] & PCRE_CASELESS) != 0;
+ tcode += 2;
+ break;
+
+ /* BRAZERO does the bracket, but carries on. */
+
+ case OP_BRAZERO:
+ case OP_BRAMINZERO:
+ if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
+ return FALSE;
+ dummy = 1;
+ do tcode += GET(tcode,1); while (*tcode == OP_ALT);
+ tcode += 1+LINK_SIZE;
+ break;
+
+ /* Single-char * or ? sets the bit and tries the next item */
+
+ case OP_STAR:
+ case OP_MINSTAR:
+ case OP_QUERY:
+ case OP_MINQUERY:
+ set_bit(start_bits, tcode[1], caseless, cd);
+ tcode += 2;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+#endif
+ break;
+
+ /* Single-char upto sets the bit and tries the next */
+
+ case OP_UPTO:
+ case OP_MINUPTO:
+ set_bit(start_bits, tcode[3], caseless, cd);
+ tcode += 4;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+#endif
+ break;
+
+ /* At least one single char sets the bit and stops */
+
+ case OP_EXACT: /* Fall through */
+ tcode += 2;
+
+ case OP_CHAR:
+ case OP_CHARNC:
+ case OP_PLUS:
+ case OP_MINPLUS:
+ set_bit(start_bits, tcode[1], caseless, cd);
+ try_next = FALSE;
+ break;
+
+ /* Single character type sets the bits and stops */
+
+ case OP_NOT_DIGIT:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ try_next = FALSE;
+ break;
+
+ case OP_DIGIT:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_digit];
+ try_next = FALSE;
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_space];
+ try_next = FALSE;
+ break;
+
+ case OP_WHITESPACE:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_space];
+ try_next = FALSE;
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_word];
+ try_next = FALSE;
+ break;
+
+ case OP_WORDCHAR:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_word];
+ try_next = FALSE;
+ break;
+
+ /* One or more character type fudges the pointer and restarts, knowing
+ it will hit a single character type and stop there. */
+
+ case OP_TYPEPLUS:
+ case OP_TYPEMINPLUS:
+ tcode++;
+ break;
+
+ case OP_TYPEEXACT:
+ tcode += 3;
+ break;
+
+ /* Zero or more repeats of character types set the bits and then
+ try again. */
+
+ case OP_TYPEUPTO:
+ case OP_TYPEMINUPTO:
+ tcode += 2; /* Fall through */
+
+ case OP_TYPESTAR:
+ case OP_TYPEMINSTAR:
+ case OP_TYPEQUERY:
+ case OP_TYPEMINQUERY:
+ switch(tcode[1])
+ {
+ case OP_ANY:
+ return FALSE;
+
+ case OP_NOT_DIGIT:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ break;
+
+ case OP_DIGIT:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_digit];
+ break;
+
+ case OP_NOT_WHITESPACE:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_space];
+ break;
+
+ case OP_WHITESPACE:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_space];
+ break;
+
+ case OP_NOT_WORDCHAR:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= ~cd->cbits[c+cbit_word];
+ break;
+
+ case OP_WORDCHAR:
+ for (c = 0; c < 32; c++)
+ start_bits[c] |= cd->cbits[c+cbit_word];
+ break;
+ }
+
+ tcode += 2;
+ break;
+
+ /* Character class where all the information is in a bit map: set the
+ bits and either carry on or not, according to the repeat count. If it was
+ a negative class, and we are operating with UTF-8 characters, any byte
+ with a value >= 0xc4 is a potentially valid starter because it starts a
+ character with a value > 255. */
+
+ case OP_NCLASS:
+ if (utf8)
+ {
+ start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
+ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
+ }
+ /* Fall through */
+
+ case OP_CLASS:
+ {
+ tcode++;
+
+ /* In UTF-8 mode, the bits in a bit map correspond to character
+ values, not to byte values. However, the bit map we are constructing is
+ for byte values. So we have to do a conversion for characters whose
+ value is > 127. In fact, there are only two possible starting bytes for
+ characters in the range 128 - 255. */
+
+ if (utf8)
+ {
+ for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+ for (c = 128; c < 256; c++)
+ {
+ if ((tcode[c/8] && (1 << (c&7))) != 0)
+ {
+ int d = (c >> 6) | 0xc0; /* Set bit for this starter */
+ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
+ c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
+ }
+ }
+ }
+
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+
+ else
+ {
+ for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+ }
+
+ /* Advance past the bit map, and act on what follows */
+
+ tcode += 32;
+ switch (*tcode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ tcode++;
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
+ else try_next = FALSE;
+ break;
+
+ default:
+ try_next = FALSE;
+ break;
+ }
+ }
+ break; /* End of bitmap class handling */
+
+ } /* End of switch */
+ } /* End of try_next loop */
+
+ code += GET(code, 1); /* Advance to next branch */
+ }
+while (*code == OP_ALT);
+return TRUE;
+}
+
+
+
+/*************************************************
+* Study a compiled expression *
+*************************************************/
+
+/* This function is handed a compiled expression that it must study to produce
+information that will speed up the matching. It returns a pcre_extra block
+which then gets handed back to pcre_exec().
+
+Arguments:
+ re points to the compiled expression
+ options contains option bits
+ errorptr points to where to place error messages;
+ set NULL unless error
+
+Returns: pointer to a pcre_extra block, with study_data filled in and the
+ appropriate flag set;
+ NULL on error or if no optimization possible
+*/
+
+EXPORT pcre_extra *
+pcre_study(const pcre *external_re, int options, const char **errorptr)
+{
+uschar start_bits[32];
+pcre_extra *extra;
+pcre_study_data *study;
+const uschar *tables;
+const real_pcre *re = (const real_pcre *)external_re;
+uschar *code = (uschar *)re + re->name_table_offset +
+ (re->name_count * re->name_entry_size);
+compile_data compile_block;
+
+*errorptr = NULL;
+
+if (re == NULL || re->magic_number != MAGIC_NUMBER)
+ {
+ *errorptr = "argument is not a compiled regular expression";
+ return NULL;
+ }
+
+if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
+ {
+ *errorptr = "unknown or incorrect option bit(s) set";
+ return NULL;
+ }
+
+/* For an anchored pattern, or an unanchored pattern that has a first char, or
+a multiline pattern that matches only at "line starts", no further processing
+at present. */
+
+if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
+ return NULL;
+
+/* Set the character tables in the block that is passed around */
+
+tables = re->tables;
+if (tables == NULL)
+ (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
+
+compile_block.lcc = tables + lcc_offset;
+compile_block.fcc = tables + fcc_offset;
+compile_block.cbits = tables + cbits_offset;
+compile_block.ctypes = tables + ctypes_offset;
+
+/* See if we can find a fixed set of initial characters for the pattern. */
+
+memset(start_bits, 0, 32 * sizeof(uschar));
+if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
+ (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
+
+/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
+the latter, which is pointed to by the former, which may also get additional
+data set later by the calling program. At the moment, the size of
+pcre_study_data is fixed. We nevertheless save it in a field for returning via
+the pcre_fullinfo() function so that if it becomes variable in the future, we
+don't have to change that code. */
+
+extra = (pcre_extra *)(pcre_malloc)
+ (sizeof(pcre_extra) + sizeof(pcre_study_data));
+
+if (extra == NULL)
+ {
+ *errorptr = "failed to get memory";
+ return NULL;
+ }
+
+study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
+extra->flags = PCRE_EXTRA_STUDY_DATA;
+extra->study_data = study;
+
+study->size = sizeof(pcre_study_data);
+study->options = PCRE_STUDY_MAPPED;
+memcpy(study->start_bits, start_bits, sizeof(start_bits));
+
+return extra;
+}
+
+/* End of study.c */